From dafebc5bb70303f0b5baf0b087cf4d9a64b5c7f0 Mon Sep 17 00:00:00 2001
From: marha <marha@users.sourceforge.net>
Date: Mon, 12 Sep 2011 11:27:51 +0200
Subject: Synchronised line endings with release branch

---
 pixman/Makefile.am                      |  262 +-
 pixman/RELEASING                        |  114 +-
 pixman/configure.ac                     | 1688 ++++----
 pixman/demos/tri-test.c                 |   96 +-
 pixman/pixman/Makefile.win32            |  294 +-
 pixman/pixman/pixman-access.c           | 6172 +++++++++++++--------------
 pixman/pixman/pixman-arm-common.h       |  832 ++--
 pixman/pixman/pixman-arm-neon-asm.S     | 6994 +++++++++++++++----------------
 pixman/pixman/pixman-arm-neon.c         | 1014 ++---
 pixman/pixman/pixman-combine.c.template | 4920 +++++++++++-----------
 pixman/pixman/pixman-fast-path.c        | 3976 +++++++++---------
 pixman/pixman/pixman-image.c            | 1562 +++----
 pixman/pixman/pixman-inlines.h          | 2560 +++++------
 pixman/pixman/pixman-trap.c             | 1336 +++---
 pixman/pixman/pixman.h                  | 1980 ++++-----
 pixman/test/Makefile.am                 |  100 +-
 pixman/test/blitters-test.c             |  856 ++--
 pixman/test/composite-traps-test.c      |  514 +--
 pixman/test/composite.c                 | 1842 ++++----
 pixman/test/fetch-test.c                |  412 +-
 pixman/test/stress-test.c               | 1744 ++++----
 pixman/test/trap-crasher.c              |   54 +-
 pixman/test/utils.c                     | 1408 +++----
 pixman/test/utils.h                     |  302 +-
 24 files changed, 20516 insertions(+), 20516 deletions(-)

(limited to 'pixman')

diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index aa43f6145..ff87e26a3 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -1,131 +1,131 @@
-SUBDIRS = pixman demos test
-
-pkgconfigdir=$(libdir)/pkgconfig
-pkgconfig_DATA=pixman-1.pc
-
-$(pkgconfig_DATA): pixman-1.pc.in
-
-snapshot:
-	distdir="$(distdir)-`date '+%Y%m%d'`"; \
-	test -d "$(srcdir)/.git" && distdir=$$distdir-`cd "$(srcdir)" && git rev-parse HEAD | cut -c 1-6`; \
-	$(MAKE) $(AM_MAKEFLAGS) distdir="$$distdir" dist
-
-GPGKEY=6FF7C1A8
-USERNAME=$$USER
-RELEASE_OR_SNAPSHOT = $$(if test "x$(PIXMAN_VERSION_MINOR)" = "x$$(echo "$(PIXMAN_VERSION_MINOR)/2*2" | bc)" ; then echo release; else echo snapshot; fi)
-RELEASE_CAIRO_HOST =	$(USERNAME)@cairographics.org
-RELEASE_CAIRO_DIR =	/srv/cairo.freedesktop.org/www/$(RELEASE_OR_SNAPSHOT)s
-RELEASE_CAIRO_URL = 	http://cairographics.org/$(RELEASE_OR_SNAPSHOT)s
-RELEASE_XORG_URL =	http://xorg.freedesktop.org/archive/individual/lib
-RELEASE_XORG_HOST =	$(USERNAME)@xorg.freedesktop.org
-RELEASE_XORG_DIR =	/srv/xorg.freedesktop.org/archive/individual/lib
-RELEASE_ANNOUNCE_LIST = cairo-announce@cairographics.org, xorg-announce@lists.freedesktop.org, pixman@lists.freedesktop.org
-
-tar_gz = $(PACKAGE)-$(VERSION).tar.gz
-tar_bz2 = $(PACKAGE)-$(VERSION).tar.bz2
-
-sha1_tgz = $(tar_gz).sha1
-md5_tgz = $(tar_gz).md5
-
-sha1_tbz2 = $(tar_bz2).sha1
-md5_tbz2 = $(tar_bz2).md5
-
-gpg_file = $(sha1_tgz).asc
-
-$(sha1_tgz): $(tar_gz)
-	sha1sum $^ > $@
-
-$(md5_tgz): $(tar_gz)
-	md5sum $^ > $@
-
-$(sha1_tbz2): $(tar_bz2)
-	sha1sum $^ > $@
-
-$(md5_tbz2): $(tar_bz2)
-	md5sum $^ > $@
-
-$(gpg_file): $(sha1_tgz)
-	@echo "Please enter your GPG password to sign the checksum."
-	gpg --armor --sign $^ 
-
-HASHFILES = $(sha1_tgz) $(sha1_tbz2) $(md5_tgz) $(md5_tbz2)
-
-release-verify-newer:
-	@echo -n "Checking that no $(VERSION) release already exists at $(RELEASE_XORG_HOST)..."
-	@ssh $(RELEASE_XORG_HOST) test ! -e $(RELEASE_XORG_DIR)/$(tar_gz) \
-		|| (echo "Ouch." && echo "Found: $(RELEASE_XORG_HOST):$(RELEASE_XORG_DIR)/$(tar_gz)" \
-		&& echo "Refusing to try to generate a new release of the same name." \
-		&& false)
-	@ssh $(RELEASE_CAIRO_HOST) test ! -e $(RELEASE_CAIRO_DIR)/$(tar_gz) \
-		|| (echo "Ouch." && echo "Found: $(RELEASE_CAIRO_HOST):$(RELEASE_CAIRO_DIR)/$(tar_gz)" \
-		&& echo "Refusing to try to generate a new release of the same name." \
-		&& false)
-	@echo "Good."
-
-release-remove-old:
-	$(RM) $(tar_gz) $(tar_bz2) $(HASHFILES) $(gpg_file)
-
-ensure-prev:
-	@if [[ "$(PREV)" == "" ]]; then							\
-		echo ""							          &&	\
-		echo "You must set the PREV variable on the make command line to" &&	\
-		echo "the last version."				  	  &&	\
-		echo ""								  &&	\
-		echo "For example:"						  &&	\
-		echo "      make PREV=0.7.3"				  	  &&	\
-		echo ""								  &&	\
-		false;									\
-	fi
-
-release-check: ensure-prev release-verify-newer release-remove-old distcheck
-
-release-tag:
-	git tag -u $(GPGKEY) -m "$(PACKAGE) $(VERSION) release" $(PACKAGE)-$(VERSION)
-
-release-upload: release-check $(tar_gz) $(tar_bz2) $(sha1_tgz) $(sha1_tbz2) $(md5_tgz) $(gpg_file)
-	scp $(tar_gz) $(sha1_tgz) $(gpg_file) $(RELEASE_CAIRO_HOST):$(RELEASE_CAIRO_DIR)
-	scp $(tar_gz) $(tar_bz2) $(RELEASE_XORG_HOST):$(RELEASE_XORG_DIR)
-	ssh $(RELEASE_CAIRO_HOST) "rm -f $(RELEASE_CAIRO_DIR)/LATEST-$(PACKAGE)-[0-9]* && ln -s $(tar_gz) $(RELEASE_CAIRO_DIR)/LATEST-$(PACKAGE)-$(VERSION)"
-
-release-publish-message: $(HASHFILES) ensure-prev
-	@echo "Please follow the instructions in RELEASING to push stuff out and"
-	@echo "send out the announcement mails.  Here is the excerpt you need:"
-	@echo ""
-	@echo "Lists:  $(RELEASE_ANNOUNCE_LIST)"
-	@echo "Subject: [ANNOUNCE] $(PACKAGE) release $(VERSION) now available"
-	@echo "============================== CUT HERE =============================="
-	@echo "A new $(PACKAGE) release $(VERSION) is now available"
-	@echo ""
-	@echo "tar.gz:"
-	@echo "	$(RELEASE_CAIRO_URL)/$(tar_gz)"
-	@echo "	$(RELEASE_XORG_URL)/$(tar_gz)"
-	@echo ""
-	@echo "tar.bz2:"
-	@echo "	$(RELEASE_XORG_URL)/$(tar_bz2)"
-	@echo ""
-	@echo "Hashes:"
-	@echo -n "	MD5:  "
-	@cat $(md5_tgz)
-	@echo -n "	MD5:  "
-	@cat $(md5_tbz2)
-	@echo -n "	SHA1: "
-	@cat $(sha1_tgz)
-	@echo -n "	SHA1: "
-	@cat $(sha1_tbz2)
-	@echo ""
-	@echo "GPG signature:"
-	@echo "	$(RELEASE_CAIRO_URL)/$(gpg_file)"
-	@echo "	(signed by `git config --get user.name` <`git config --get user.email`>)"
-	@echo ""
-	@echo "Git:"
-	@echo "	git://git.freedesktop.org/git/pixman"
-	@echo "	tag: $(PACKAGE)-$(VERSION)"
-	@echo ""
-	@echo "Log:"
-	@git log --no-merges "$(PACKAGE)-$(PREV)".."$(PACKAGE)-$(VERSION)" | git shortlog | awk '{ printf "\t"; print ; }' | cut -b1-80
-	@echo "============================== CUT HERE =============================="
-	@echo ""
-
-release-publish: release-upload release-tag release-publish-message
-
-.PHONY: release-upload release-publish release-publish-message release-tag
+SUBDIRS = pixman demos test
+
+pkgconfigdir=$(libdir)/pkgconfig
+pkgconfig_DATA=pixman-1.pc
+
+$(pkgconfig_DATA): pixman-1.pc.in
+
+snapshot:
+	distdir="$(distdir)-`date '+%Y%m%d'`"; \
+	test -d "$(srcdir)/.git" && distdir=$$distdir-`cd "$(srcdir)" && git rev-parse HEAD | cut -c 1-6`; \
+	$(MAKE) $(AM_MAKEFLAGS) distdir="$$distdir" dist
+
+GPGKEY=6FF7C1A8
+USERNAME=$$USER
+RELEASE_OR_SNAPSHOT = $$(if test "x$(PIXMAN_VERSION_MINOR)" = "x$$(echo "$(PIXMAN_VERSION_MINOR)/2*2" | bc)" ; then echo release; else echo snapshot; fi)
+RELEASE_CAIRO_HOST =	$(USERNAME)@cairographics.org
+RELEASE_CAIRO_DIR =	/srv/cairo.freedesktop.org/www/$(RELEASE_OR_SNAPSHOT)s
+RELEASE_CAIRO_URL = 	http://cairographics.org/$(RELEASE_OR_SNAPSHOT)s
+RELEASE_XORG_URL =	http://xorg.freedesktop.org/archive/individual/lib
+RELEASE_XORG_HOST =	$(USERNAME)@xorg.freedesktop.org
+RELEASE_XORG_DIR =	/srv/xorg.freedesktop.org/archive/individual/lib
+RELEASE_ANNOUNCE_LIST = cairo-announce@cairographics.org, xorg-announce@lists.freedesktop.org, pixman@lists.freedesktop.org
+
+tar_gz = $(PACKAGE)-$(VERSION).tar.gz
+tar_bz2 = $(PACKAGE)-$(VERSION).tar.bz2
+
+sha1_tgz = $(tar_gz).sha1
+md5_tgz = $(tar_gz).md5
+
+sha1_tbz2 = $(tar_bz2).sha1
+md5_tbz2 = $(tar_bz2).md5
+
+gpg_file = $(sha1_tgz).asc
+
+$(sha1_tgz): $(tar_gz)
+	sha1sum $^ > $@
+
+$(md5_tgz): $(tar_gz)
+	md5sum $^ > $@
+
+$(sha1_tbz2): $(tar_bz2)
+	sha1sum $^ > $@
+
+$(md5_tbz2): $(tar_bz2)
+	md5sum $^ > $@
+
+$(gpg_file): $(sha1_tgz)
+	@echo "Please enter your GPG password to sign the checksum."
+	gpg --armor --sign $^ 
+
+HASHFILES = $(sha1_tgz) $(sha1_tbz2) $(md5_tgz) $(md5_tbz2)
+
+release-verify-newer:
+	@echo -n "Checking that no $(VERSION) release already exists at $(RELEASE_XORG_HOST)..."
+	@ssh $(RELEASE_XORG_HOST) test ! -e $(RELEASE_XORG_DIR)/$(tar_gz) \
+		|| (echo "Ouch." && echo "Found: $(RELEASE_XORG_HOST):$(RELEASE_XORG_DIR)/$(tar_gz)" \
+		&& echo "Refusing to try to generate a new release of the same name." \
+		&& false)
+	@ssh $(RELEASE_CAIRO_HOST) test ! -e $(RELEASE_CAIRO_DIR)/$(tar_gz) \
+		|| (echo "Ouch." && echo "Found: $(RELEASE_CAIRO_HOST):$(RELEASE_CAIRO_DIR)/$(tar_gz)" \
+		&& echo "Refusing to try to generate a new release of the same name." \
+		&& false)
+	@echo "Good."
+
+release-remove-old:
+	$(RM) $(tar_gz) $(tar_bz2) $(HASHFILES) $(gpg_file)
+
+ensure-prev:
+	@if [[ "$(PREV)" == "" ]]; then							\
+		echo ""							          &&	\
+		echo "You must set the PREV variable on the make command line to" &&	\
+		echo "the last version."				  	  &&	\
+		echo ""								  &&	\
+		echo "For example:"						  &&	\
+		echo "      make PREV=0.7.3"				  	  &&	\
+		echo ""								  &&	\
+		false;									\
+	fi
+
+release-check: ensure-prev release-verify-newer release-remove-old distcheck
+
+release-tag:
+	git tag -u $(GPGKEY) -m "$(PACKAGE) $(VERSION) release" $(PACKAGE)-$(VERSION)
+
+release-upload: release-check $(tar_gz) $(tar_bz2) $(sha1_tgz) $(sha1_tbz2) $(md5_tgz) $(gpg_file)
+	scp $(tar_gz) $(sha1_tgz) $(gpg_file) $(RELEASE_CAIRO_HOST):$(RELEASE_CAIRO_DIR)
+	scp $(tar_gz) $(tar_bz2) $(RELEASE_XORG_HOST):$(RELEASE_XORG_DIR)
+	ssh $(RELEASE_CAIRO_HOST) "rm -f $(RELEASE_CAIRO_DIR)/LATEST-$(PACKAGE)-[0-9]* && ln -s $(tar_gz) $(RELEASE_CAIRO_DIR)/LATEST-$(PACKAGE)-$(VERSION)"
+
+release-publish-message: $(HASHFILES) ensure-prev
+	@echo "Please follow the instructions in RELEASING to push stuff out and"
+	@echo "send out the announcement mails.  Here is the excerpt you need:"
+	@echo ""
+	@echo "Lists:  $(RELEASE_ANNOUNCE_LIST)"
+	@echo "Subject: [ANNOUNCE] $(PACKAGE) release $(VERSION) now available"
+	@echo "============================== CUT HERE =============================="
+	@echo "A new $(PACKAGE) release $(VERSION) is now available"
+	@echo ""
+	@echo "tar.gz:"
+	@echo "	$(RELEASE_CAIRO_URL)/$(tar_gz)"
+	@echo "	$(RELEASE_XORG_URL)/$(tar_gz)"
+	@echo ""
+	@echo "tar.bz2:"
+	@echo "	$(RELEASE_XORG_URL)/$(tar_bz2)"
+	@echo ""
+	@echo "Hashes:"
+	@echo -n "	MD5:  "
+	@cat $(md5_tgz)
+	@echo -n "	MD5:  "
+	@cat $(md5_tbz2)
+	@echo -n "	SHA1: "
+	@cat $(sha1_tgz)
+	@echo -n "	SHA1: "
+	@cat $(sha1_tbz2)
+	@echo ""
+	@echo "GPG signature:"
+	@echo "	$(RELEASE_CAIRO_URL)/$(gpg_file)"
+	@echo "	(signed by `git config --get user.name` <`git config --get user.email`>)"
+	@echo ""
+	@echo "Git:"
+	@echo "	git://git.freedesktop.org/git/pixman"
+	@echo "	tag: $(PACKAGE)-$(VERSION)"
+	@echo ""
+	@echo "Log:"
+	@git log --no-merges "$(PACKAGE)-$(PREV)".."$(PACKAGE)-$(VERSION)" | git shortlog | awk '{ printf "\t"; print ; }' | cut -b1-80
+	@echo "============================== CUT HERE =============================="
+	@echo ""
+
+release-publish: release-upload release-tag release-publish-message
+
+.PHONY: release-upload release-publish release-publish-message release-tag
diff --git a/pixman/RELEASING b/pixman/RELEASING
index 8644f2d8d..fbe15813d 100644
--- a/pixman/RELEASING
+++ b/pixman/RELEASING
@@ -1,57 +1,57 @@
-Here are the steps to follow to create a new pixman release:
-
-1) Ensure that there are no uncommitted changes or unpushed commits,
-   and that you are up to date with the latest commits in the central
-   repository. Here are a couple of useful commands:
-
-	git diff			(no output)
-	
-	git status			(should report "nothing to commit")
-
-	git log master...origin		(no output; note: *3* dots)
-
-2) Increment pixman_(major|minor|micro) in configure.ac according to
-   the directions in that file.
-
-3) Make sure that new version works, including
-
-	- make distcheck passes
-
-	- the X server still works with the new pixman version
-	  installed
-
-	- the cairo test suite hasn't gained any new failures compared
-	  to last pixman version.
-
-4) Use "git commit" to record the changes made in step 2 and 3.
-
-5) Generate and publish the tar files by running 
-
-	make PREV=<last version> GPGKEY=<your gpg key id> release-publish
-
-   If your freedesktop user name is different from your local one,
-   then also set the variable USER to your freedesktop user name.
-
-6) Run 
-
-	make release-publish-message
-
-   to generate a draft release announcement. Edit it as appropriate and
-   send it to 
-
-	cairo-announce@cairographics.org
-
-	pixman@lists.freedesktop.org
-
-	xorg-announce@lists.freedesktop.org
-
-7) Increment pixman_micro to the next larger (odd) number in
-   configure.ac. Commit this change, and push all commits created
-   during this process using
-
-	git push
-	git push --tags
-
-   You must use "--tags" here; otherwise the new tag will not
-   be pushed out.
-
+Here are the steps to follow to create a new pixman release:
+
+1) Ensure that there are no uncommitted changes or unpushed commits,
+   and that you are up to date with the latest commits in the central
+   repository. Here are a couple of useful commands:
+
+	git diff			(no output)
+	
+	git status			(should report "nothing to commit")
+
+	git log master...origin		(no output; note: *3* dots)
+
+2) Increment pixman_(major|minor|micro) in configure.ac according to
+   the directions in that file.
+
+3) Make sure that new version works, including
+
+	- make distcheck passes
+
+	- the X server still works with the new pixman version
+	  installed
+
+	- the cairo test suite hasn't gained any new failures compared
+	  to last pixman version.
+
+4) Use "git commit" to record the changes made in step 2 and 3.
+
+5) Generate and publish the tar files by running 
+
+	make PREV=<last version> GPGKEY=<your gpg key id> release-publish
+
+   If your freedesktop user name is different from your local one,
+   then also set the variable USER to your freedesktop user name.
+
+6) Run 
+
+	make release-publish-message
+
+   to generate a draft release announcement. Edit it as appropriate and
+   send it to 
+
+	cairo-announce@cairographics.org
+
+	pixman@lists.freedesktop.org
+
+	xorg-announce@lists.freedesktop.org
+
+7) Increment pixman_micro to the next larger (odd) number in
+   configure.ac. Commit this change, and push all commits created
+   during this process using
+
+	git push
+	git push --tags
+
+   You must use "--tags" here; otherwise the new tag will not
+   be pushed out.
+
diff --git a/pixman/configure.ac b/pixman/configure.ac
index 41a766197..21613e135 100644
--- a/pixman/configure.ac
+++ b/pixman/configure.ac
@@ -1,844 +1,844 @@
-dnl  Copyright 2005 Red Hat, Inc.
-dnl 
-dnl  Permission to use, copy, modify, distribute, and sell this software and its
-dnl  documentation for any purpose is hereby granted without fee, provided that
-dnl  the above copyright notice appear in all copies and that both that
-dnl  copyright notice and this permission notice appear in supporting
-dnl  documentation, and that the name of Red Hat not be used in
-dnl  advertising or publicity pertaining to distribution of the software without
-dnl  specific, written prior permission.  Red Hat makes no
-dnl  representations about the suitability of this software for any purpose.  It
-dnl  is provided "as is" without express or implied warranty.
-dnl 
-dnl  RED HAT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
-dnl  INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
-dnl  EVENT SHALL RED HAT BE LIABLE FOR ANY SPECIAL, INDIRECT OR
-dnl  CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
-dnl  DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-dnl  TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-dnl  PERFORMANCE OF THIS SOFTWARE.
-dnl
-dnl Process this file with autoconf to create configure.
-
-AC_PREREQ([2.57])
-
-#   Pixman versioning scheme
-#
-#   - The version in git has an odd MICRO version number
-#
-#   - Released versions both development and stable have an even MICRO 
-#     version number
-#
-#   - Released development versions have an odd MINOR number
-#
-#   - Released stable versions have an even MINOR number
-#
-#   - Versions that break ABI must have a new MAJOR number
-#
-#   - If you break the ABI, then at least this must be done:
-#
-#        - increment MAJOR
-#
-#        - In the first development release where you break ABI, find
-#          all instances of "pixman-n" and change them to pixman-(n+1)
-#
-#          This needs to be done at least in 
-#                    configure.ac
-#                    all Makefile.am's
-#                    pixman-n.pc.in
-#
-#      This ensures that binary incompatible versions can be installed
-#      in parallel.  See http://www106.pair.com/rhp/parallel.html for
-#      more information
-#
-
-m4_define([pixman_major], 0)
-m4_define([pixman_minor], 23)
-m4_define([pixman_micro], 5)
-
-m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
-
-AC_INIT(pixman, pixman_version, [pixman@lists.freedesktop.org], pixman)
-AM_INIT_AUTOMAKE([foreign dist-bzip2])
-
-# Suppress verbose compile lines
-m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
-
-AM_CONFIG_HEADER(config.h)
-
-AC_CANONICAL_HOST
-
-test_CFLAGS=${CFLAGS+set} # We may override autoconf default CFLAGS.
-
-AC_PROG_CC
-AM_PROG_AS
-AC_PROG_LIBTOOL
-AC_CHECK_FUNCS([getisax])
-AC_C_BIGENDIAN
-AC_C_INLINE
-
-dnl PIXMAN_LINK_WITH_ENV(env-setup, program, true-action, false-action)
-dnl
-dnl Compiles and links the given program in the environment setup by env-setup
-dnl and executes true-action on success and false-action on failure.
-AC_DEFUN([PIXMAN_LINK_WITH_ENV],[dnl
-	save_CFLAGS="$CFLAGS"
-	save_LDFLAGS="$LDFLAGS"
-	save_LIBS="$LIBS"
-	CFLAGS=""
-	LDFLAGS=""
-	LIBS=""
-	$1
-	AC_LINK_IFELSE(
-		[AC_LANG_SOURCE([$2])],
-		[pixman_cc_stderr=`test -f conftest.err && cat conftest.err`
-		 pixman_cc_flag=yes],
-		[pixman_cc_stderr=`test -f conftest.err && cat conftest.err`
-		 pixman_cc_flag=no])
-
-	if test "x$pixman_cc_stderr" != "x"; then
-		pixman_cc_flag=no
-	fi
-
-	if test "x$pixman_cc_flag" = "xyes"; then
-		ifelse([$3], , :, [$3])
-	else
-		ifelse([$4], , :, [$4])
-	fi
-	CFLAGS="$save_CFLAGS"
-	LDFLAGS="$save_LDFLAGS"
-	LIBS="$save_LIBS"
-])
-
-dnl Find a -Werror for catching warnings.
-WERROR=
-for w in -Werror -errwarn; do
-    if test "z$WERROR" = "z"; then
-        AC_MSG_CHECKING([whether the compiler supports $w])
-        PIXMAN_LINK_WITH_ENV(
-		[CFLAGS=$w],
-		[int main(int c, char **v) { (void)c; (void)v; return 0; }],
-		[WERROR=$w; yesno=yes], [yesno=no])
-	AC_MSG_RESULT($yesno)
-    fi
-done
-
-dnl PIXMAN_CHECK_CFLAG(flag, [program])
-dnl  Adds flag to CFLAGS if the given program links without warnings or errors.
-AC_DEFUN([PIXMAN_CHECK_CFLAG], [dnl
-	AC_MSG_CHECKING([whether the compiler supports $1])
-	PIXMAN_LINK_WITH_ENV(
-		[CFLAGS="$WERROR $1"],
-		[$2
-		 int main(int c, char **v) { (void)c; (void)v; return 0; }
-		],
-		[_yesno=yes],
-		[_yesno=no])
-	if test "x$_yesno" = xyes; then
-	   CFLAGS="$CFLAGS $1"
-	fi
-	AC_MSG_RESULT($_yesno)
-])
-
-AC_CHECK_SIZEOF(long)
-
-# Checks for Sun Studio compilers
-AC_CHECK_DECL([__SUNPRO_C], [SUNCC="yes"], [SUNCC="no"])
-AC_CHECK_DECL([__amd64], [AMD64_ABI="yes"], [AMD64_ABI="no"])
-
-# Default CFLAGS to -O -g rather than just the -g from AC_PROG_CC
-# if we're using Sun Studio and neither the user nor a config.site
-# has set CFLAGS.
-if test $SUNCC = yes &&			\
-   test "$test_CFLAGS" == "" &&		\
-   test "$CFLAGS" = "-g"
-then
-  CFLAGS="-O -g"
-fi
-
-# 
-# We ignore pixman_major in the version here because the major version should
-# always be encoded in the actual library name. Ie., the soname is:
-#
-#      pixman-$(pixman_major).0.minor.micro
-#
-m4_define([lt_current], [pixman_minor])
-m4_define([lt_revision], [pixman_micro])
-m4_define([lt_age], [pixman_minor])
-
-LT_VERSION_INFO="lt_current:lt_revision:lt_age"
-
-PIXMAN_VERSION_MAJOR=pixman_major()
-AC_SUBST(PIXMAN_VERSION_MAJOR)
-PIXMAN_VERSION_MINOR=pixman_minor()
-AC_SUBST(PIXMAN_VERSION_MINOR)
-PIXMAN_VERSION_MICRO=pixman_micro()
-AC_SUBST(PIXMAN_VERSION_MICRO)
-
-AC_SUBST(LT_VERSION_INFO)
-
-# Check for dependencies
-
-PIXMAN_CHECK_CFLAG([-Wall])
-PIXMAN_CHECK_CFLAG([-fno-strict-aliasing])
-
-AC_PATH_PROG(PERL, perl, no)
-if test "x$PERL" = xno; then
-    AC_MSG_ERROR([Perl is required to build pixman.])
-fi
-AC_SUBST(PERL)
-
-dnl =========================================================================
-dnl OpenMP for the test suite?
-dnl
-
-# Check for OpenMP support only when autoconf support that (require autoconf >=2.62)
-OPENMP_CFLAGS=
-m4_ifdef([AC_OPENMP], [AC_OPENMP])
-
-if test "x$enable_openmp" = "xyes" && test "x$ac_cv_prog_c_openmp" = "xunsupported" ; then
-  AC_MSG_WARN([OpenMP support requested but found unsupported])
-fi
-
-dnl May not fail to link without -Wall -Werror added
-dnl So try to link only when openmp is supported
-dnl ac_cv_prog_c_openmp is not defined when --disable-openmp is used
-if test "x$ac_cv_prog_c_openmp" != "xunsupported" && test "x$ac_cv_prog_c_openmp" != "x"; then
-  m4_define([openmp_test_program],[dnl
-  #include <stdio.h>
-
-  extern unsigned int lcg_seed;
-  #pragma omp threadprivate(lcg_seed)
-  unsigned int lcg_seed;
-
-  unsigned function(unsigned a, unsigned b)
-  {
-	lcg_seed ^= b;
-	return ((a + b) ^ a ) + lcg_seed;
-  }
-
-  int main(int argc, char **argv)
-  {
-	int i;
-	int n1 = 0, n2 = argc;
-	unsigned checksum = 0;
-	int verbose = argv != NULL;
-	unsigned (*test_function)(unsigned, unsigned);
-	test_function = function;
-	#pragma omp parallel for reduction(+:checksum) default(none) \
-					shared(n1, n2, test_function, verbose)
-	for (i = n1; i < n2; i++)
-	{
-		unsigned crc = test_function (i, 0);
-		if (verbose)
-			printf ("%d: %08X\n", i, crc);
-		checksum += crc;
-	}
-	printf("%u\n", checksum);
-	return 0;
-  }
-  ])
-
-  PIXMAN_LINK_WITH_ENV(
-	[CFLAGS="$OPENMP_CFLAGS" LDFLAGS="$OPENMP_CFLAGS"],
-	[openmp_test_program],
-	[have_openmp=yes],
-	[have_openmp=no])
-  if test "x$have_openmp" = "xyes" ; then
-    AC_DEFINE(USE_OPENMP, 1, [use OpenMP in the test suite])
-  fi
-fi
-AC_SUBST(OPENMP_CFLAGS)
-
-dnl =========================================================================
-dnl -fvisibility stuff
-
-PIXMAN_CHECK_CFLAG([-fvisibility=hidden], [dnl
-#if defined(__GNUC__) && (__GNUC__ >= 4)
-#ifdef _WIN32
-#error Have -fvisibility but it is ignored and generates a warning
-#endif
-#else
-error Need GCC 4.0 for visibility
-#endif
-])
-
-PIXMAN_CHECK_CFLAG([-xldscope=hidden], [dnl
-#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x550)
-#else
-error Need Sun Studio 8 for visibility
-#endif
-])
-
-dnl ===========================================================================
-dnl Check for MMX
-
-if test "x$MMX_CFLAGS" = "x" ; then
-   if test "x$SUNCC" = "xyes"; then
-      # Sun Studio doesn't have an -xarch=mmx flag, so we have to use sse
-      # but if we're building 64-bit, mmx & sse support is on by default and
-      # -xarch=sse throws an error instead
-      if test "$AMD64_ABI" = "no" ; then
-         MMX_CFLAGS="-xarch=sse"
-      fi
-   else
-      MMX_CFLAGS="-mmmx -Winline"
-   fi
-fi
-
-have_mmx_intrinsics=no
-AC_MSG_CHECKING(whether to use MMX intrinsics)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$MMX_CFLAGS $CFLAGS"
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
-error "Need GCC >= 3.4 for MMX intrinsics"
-#endif
-#include <mmintrin.h>
-int main () {
-    __m64 v = _mm_cvtsi32_si64 (1);
-    return _mm_cvtsi64_si32 (v);
-}]])], have_mmx_intrinsics=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(mmx,
-   [AC_HELP_STRING([--disable-mmx],
-                   [disable MMX fast paths])],
-   [enable_mmx=$enableval], [enable_mmx=auto])
-
-if test $enable_mmx = no ; then
-   have_mmx_intrinsics=disabled
-fi
-
-if test $have_mmx_intrinsics = yes ; then
-   AC_DEFINE(USE_MMX, 1, [use MMX compiler intrinsics])
-else
-   MMX_CFLAGS=
-fi
-
-AC_MSG_RESULT($have_mmx_intrinsics)
-if test $enable_mmx = yes && test $have_mmx_intrinsics = no ; then
-   AC_MSG_ERROR([MMX intrinsics not detected])
-fi
-
-AM_CONDITIONAL(USE_MMX, test $have_mmx_intrinsics = yes)
-
-dnl ===========================================================================
-dnl Check for SSE2
-
-if test "x$SSE2_CFLAGS" = "x" ; then
-   if test "x$SUNCC" = "xyes"; then
-      # SSE2 is enabled by default in the Sun Studio 64-bit environment
-      if test "$AMD64_ABI" = "no" ; then
-         SSE2_CFLAGS="-xarch=sse2"
-      fi
-   else
-      SSE2_CFLAGS="-msse2 -Winline"
-   fi
-fi
-
-have_sse2_intrinsics=no
-AC_MSG_CHECKING(whether to use SSE2 intrinsics)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$SSE2_CFLAGS $CFLAGS"
-
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
-#   if !defined(__amd64__) && !defined(__x86_64__)
-#      error "Need GCC >= 4.2 for SSE2 intrinsics on x86"
-#   endif
-#endif
-#include <mmintrin.h>
-#include <xmmintrin.h>
-#include <emmintrin.h>
-int main () {
-    __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c;
-	c = _mm_xor_si128 (a, b);
-    return 0;
-}]])], have_sse2_intrinsics=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(sse2,
-   [AC_HELP_STRING([--disable-sse2],
-                   [disable SSE2 fast paths])],
-   [enable_sse2=$enableval], [enable_sse2=auto])
-
-if test $enable_sse2 = no ; then
-   have_sse2_intrinsics=disabled
-fi
-
-if test $have_sse2_intrinsics = yes ; then
-   AC_DEFINE(USE_SSE2, 1, [use SSE2 compiler intrinsics])
-fi
-
-AC_MSG_RESULT($have_sse2_intrinsics)
-if test $enable_sse2 = yes && test $have_sse2_intrinsics = no ; then
-   AC_MSG_ERROR([SSE2 intrinsics not detected])
-fi
-
-AM_CONDITIONAL(USE_SSE2, test $have_sse2_intrinsics = yes)
-
-dnl ===========================================================================
-dnl Other special flags needed when building code using MMX or SSE instructions
-case $host_os in
-   solaris*)
-      # When building 32-bit binaries, apply a mapfile to ensure that the
-      # binaries aren't flagged as only able to run on MMX+SSE capable CPUs
-      # since they check at runtime before using those instructions.
-      # Not all linkers grok the mapfile format so we check for that first.
-      if test "$AMD64_ABI" = "no" ; then
-	 use_hwcap_mapfile=no
-	 AC_MSG_CHECKING(whether to use a hardware capability map file)
-	 hwcap_save_LDFLAGS="$LDFLAGS"
-	 HWCAP_LDFLAGS='-Wl,-M,$(srcdir)/solaris-hwcap.mapfile'
-	 LDFLAGS="$LDFLAGS -Wl,-M,pixman/solaris-hwcap.mapfile"
-	 AC_LINK_IFELSE([AC_LANG_SOURCE([[int main() { return 0; }]])],
-			use_hwcap_mapfile=yes,
-			HWCAP_LDFLAGS="")
-	 LDFLAGS="$hwcap_save_LDFLAGS"
-	 AC_MSG_RESULT($use_hwcap_mapfile)
-      fi
-      if test "x$MMX_LDFLAGS" = "x" ; then
-         MMX_LDFLAGS="$HWCAP_LDFLAGS"
-      fi
-      if test "x$SSE2_LDFLAGS" = "x" ; then
-	 SSE2_LDFLAGS="$HWCAP_LDFLAGS"
-      fi
-      ;;
-esac
-
-AC_SUBST(MMX_CFLAGS)
-AC_SUBST(MMX_LDFLAGS)
-AC_SUBST(SSE2_CFLAGS)
-AC_SUBST(SSE2_LDFLAGS)
-
-dnl ===========================================================================
-dnl Check for VMX/Altivec
-if test -n "`$CC -v 2>&1 | grep version | grep Apple`"; then
-    VMX_CFLAGS="-faltivec"
-else
-    VMX_CFLAGS="-maltivec -mabi=altivec"
-fi
-
-have_vmx_intrinsics=no
-AC_MSG_CHECKING(whether to use VMX/Altivec intrinsics)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$VMX_CFLAGS $CFLAGS"
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
-error "Need GCC >= 3.4 for sane altivec support"
-#endif
-#include <altivec.h>
-int main () {
-    vector unsigned int v = vec_splat_u32 (1);
-    v = vec_sub (v, v);
-    return 0;
-}]])], have_vmx_intrinsics=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(vmx,
-   [AC_HELP_STRING([--disable-vmx],
-                   [disable VMX fast paths])],
-   [enable_vmx=$enableval], [enable_vmx=auto])
-
-if test $enable_vmx = no ; then
-   have_vmx_intrinsics=disabled
-fi
-
-if test $have_vmx_intrinsics = yes ; then
-   AC_DEFINE(USE_VMX, 1, [use VMX compiler intrinsics])
-else
-   VMX_CFLAGS=
-fi
-
-AC_MSG_RESULT($have_vmx_intrinsics)
-if test $enable_vmx = yes && test $have_vmx_intrinsics = no ; then
-   AC_MSG_ERROR([VMX intrinsics not detected])
-fi
-
-AC_SUBST(VMX_CFLAGS)
-
-AM_CONDITIONAL(USE_VMX, test $have_vmx_intrinsics = yes)
-
-dnl ==========================================================================
-dnl Check if assembler is gas compatible and supports ARM SIMD instructions
-have_arm_simd=no
-AC_MSG_CHECKING(whether to use ARM SIMD assembler)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="-x assembler-with-cpp $CFLAGS"
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-.text
-.arch armv6
-.object_arch armv4
-.arm
-.altmacro
-#ifndef __ARM_EABI__
-#error EABI is required (to be sure that calling conventions are compatible)
-#endif
-pld [r0]
-uqadd8 r0, r0, r0]])], have_arm_simd=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(arm-simd,
-   [AC_HELP_STRING([--disable-arm-simd],
-                   [disable ARM SIMD fast paths])],
-   [enable_arm_simd=$enableval], [enable_arm_simd=auto])
-
-if test $enable_arm_simd = no ; then
-   have_arm_simd=disabled
-fi
-
-if test $have_arm_simd = yes ; then
-   AC_DEFINE(USE_ARM_SIMD, 1, [use ARM SIMD assembly optimizations])
-fi
-
-AM_CONDITIONAL(USE_ARM_SIMD, test $have_arm_simd = yes)
-
-AC_MSG_RESULT($have_arm_simd)
-if test $enable_arm_simd = yes && test $have_arm_simd = no ; then
-   AC_MSG_ERROR([ARM SIMD intrinsics not detected])
-fi
-
-dnl ==========================================================================
-dnl Check if assembler is gas compatible and supports NEON instructions
-have_arm_neon=no
-AC_MSG_CHECKING(whether to use ARM NEON assembler)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="-x assembler-with-cpp $CFLAGS"
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-.text
-.fpu neon
-.arch armv7a
-.object_arch armv4
-.eabi_attribute 10, 0
-.arm
-.altmacro
-#ifndef __ARM_EABI__
-#error EABI is required (to be sure that calling conventions are compatible)
-#endif
-pld [r0]
-vmovn.u16 d0, q0]])], have_arm_neon=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(arm-neon,
-   [AC_HELP_STRING([--disable-arm-neon],
-                   [disable ARM NEON fast paths])],
-   [enable_arm_neon=$enableval], [enable_arm_neon=auto])
-
-if test $enable_arm_neon = no ; then
-   have_arm_neon=disabled
-fi
-
-if test $have_arm_neon = yes ; then
-   AC_DEFINE(USE_ARM_NEON, 1, [use ARM NEON assembly optimizations])
-fi
-
-AM_CONDITIONAL(USE_ARM_NEON, test $have_arm_neon = yes)
-
-AC_MSG_RESULT($have_arm_neon)
-if test $enable_arm_neon = yes && test $have_arm_neon = no ; then
-   AC_MSG_ERROR([ARM NEON intrinsics not detected])
-fi
-
-dnl =========================================================================================
-dnl Check for GNU-style inline assembly support
-
-have_gcc_inline_asm=no
-AC_MSG_CHECKING(whether to use GNU-style inline assembler)
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-int main () {
-    /* Most modern architectures have a NOP instruction, so this is a fairly generic test. */
-	asm volatile ( "\tnop\n" : : : "cc", "memory" );
-    return 0;
-}]])], have_gcc_inline_asm=yes)
-
-AC_ARG_ENABLE(gcc-inline-asm,
-   [AC_HELP_STRING([--disable-gcc-inline-asm],
-                   [disable GNU-style inline assembler])],
-   [enable_gcc_inline_asm=$enableval], [enable_gcc_inline_asm=auto])
-
-if test $enable_gcc_inline_asm = no ; then
-   have_gcc_inline_asm=disabled
-fi
-
-if test $have_gcc_inline_asm = yes ; then
-   AC_DEFINE(USE_GCC_INLINE_ASM, 1, [use GNU-style inline assembler])
-fi
-
-AC_MSG_RESULT($have_gcc_inline_asm)
-if test $enable_gcc_inline_asm = yes && test $have_gcc_inline_asm = no ; then
-   AC_MSG_ERROR([GNU-style inline assembler not detected])
-fi
-
-AM_CONDITIONAL(USE_GCC_INLINE_ASM, test $have_gcc_inline_asm = yes)
-
-dnl ==============================================
-dnl Static test programs
-
-AC_ARG_ENABLE(static-testprogs,
-   [AC_HELP_STRING([--enable-static-testprogs],
-		   [build test programs as static binaries [default=no]])],
-   [enable_static_testprogs=$enableval], [enable_static_testprogs=no])
-
-TESTPROGS_EXTRA_LDFLAGS=
-if test "x$enable_static_testprogs" = "xyes" ; then
-   TESTPROGS_EXTRA_LDFLAGS="-all-static"
-fi
-AC_SUBST(TESTPROGS_EXTRA_LDFLAGS)
-
-dnl ==============================================
-dnl Timers
-
-AC_ARG_ENABLE(timers,
-   [AC_HELP_STRING([--enable-timers],
-		   [enable TIMER_BEGIN and TIMER_END macros [default=no]])],
-   [enable_timers=$enableval], [enable_timers=no])
-
-if test $enable_timers = yes ; then 
-   AC_DEFINE(PIXMAN_TIMERS, 1, [enable TIMER_BEGIN/TIMER_END macros])
-fi
-AC_SUBST(PIXMAN_TIMERS)
-
-dnl ===================================
-dnl GTK+
-
-AC_ARG_ENABLE(gtk,
-   [AC_HELP_STRING([--enable-gtk],
-                   [enable tests using GTK+ [default=auto]])],
-   [enable_gtk=$enableval], [enable_gtk=auto])
-
-PKG_PROG_PKG_CONFIG
-
-if test $enable_gtk = yes ; then
-   AC_CHECK_LIB([pixman-1], [pixman_version_string])
-   PKG_CHECK_MODULES(GTK, [gtk+-2.0 pixman-1])
-fi
-
-if test $enable_gtk = auto ; then
-   AC_CHECK_LIB([pixman-1], [pixman_version_string], [enable_gtk=auto], [enable_gtk=no])
-fi
-
-if test $enable_gtk = auto ; then
-   PKG_CHECK_MODULES(GTK, [gtk+-2.0 pixman-1], [enable_gtk=yes], [enable_gtk=no])
-fi
-
-AM_CONDITIONAL(HAVE_GTK, [test "x$enable_gtk" = xyes])
-
-AC_SUBST(GTK_CFLAGS)
-AC_SUBST(GTK_LIBS)
-AC_SUBST(DEP_CFLAGS)
-AC_SUBST(DEP_LIBS)
-
-dnl =====================================
-dnl posix_memalign, sigaction, alarm, gettimeofday
-
-AC_CHECK_FUNC(posix_memalign, have_posix_memalign=yes, have_posix_memalign=no)
-if test x$have_posix_memalign = xyes; then
-   AC_DEFINE(HAVE_POSIX_MEMALIGN, 1, [Whether we have posix_memalign()])
-fi
-
-AC_CHECK_FUNC(sigaction, have_sigaction=yes, have_sigaction=no)
-if test x$have_sigaction = xyes; then
-   AC_DEFINE(HAVE_SIGACTION, 1, [Whether we have sigaction()])
-fi
-
-AC_CHECK_FUNC(alarm, have_alarm=yes, have_alarm=no)
-if test x$have_alarm = xyes; then
-   AC_DEFINE(HAVE_ALARM, 1, [Whether we have alarm()])
-fi
-
-AC_CHECK_HEADER([sys/mman.h],
-   [AC_DEFINE(HAVE_SYS_MMAN_H, [1], [Define to 1 if we have <sys/mman.h>])])
-
-AC_CHECK_FUNC(mmap, have_mmap=yes, have_mmap=no)
-if test x$have_mmap = xyes; then
-   AC_DEFINE(HAVE_MMAP, 1, [Whether we have mmap()])
-fi
-
-AC_CHECK_FUNC(mprotect, have_mprotect=yes, have_mprotect=no)
-if test x$have_mprotect = xyes; then
-   AC_DEFINE(HAVE_MPROTECT, 1, [Whether we have mprotect()])
-fi
-
-AC_CHECK_FUNC(getpagesize, have_getpagesize=yes, have_getpagesize=no)
-if test x$have_getpagesize = xyes; then
-   AC_DEFINE(HAVE_GETPAGESIZE, 1, [Whether we have getpagesize()])
-fi
-
-AC_CHECK_HEADER([fenv.h],
-   [AC_DEFINE(HAVE_FENV_H, [1], [Define to 1 if we have <fenv.h>])])
-
-AC_CHECK_LIB(m, feenableexcept, have_feenableexcept=yes, have_feenableexcept=no)
-if test x$have_feenableexcept = xyes; then
-   AC_DEFINE(HAVE_FEENABLEEXCEPT, 1, [Whether we have feenableexcept()])
-fi
-
-AC_CHECK_FUNC(gettimeofday, have_gettimeofday=yes, have_gettimeofday=no)
-AC_CHECK_HEADER(sys/time.h, have_sys_time_h=yes, have_sys_time_h=no)
-if test x$have_gettimeofday = xyes && test x$have_sys_time_h = xyes; then
-   AC_DEFINE(HAVE_GETTIMEOFDAY, 1, [Whether we have gettimeofday()])
-fi
-
-dnl =====================================
-dnl Thread local storage
-
-support_for__thread=no
-
-AC_MSG_CHECKING(for __thread)
-AC_LINK_IFELSE([AC_LANG_SOURCE([[
-#if defined(__MINGW32__) && !(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))
-#error This MinGW version has broken __thread support
-#endif
-#ifdef __OpenBSD__
-#error OpenBSD has broken __thread support
-#endif
-static __thread int x ;
-int main () { x = 123; return x; }
-]])], support_for__thread=yes)
-
-if test $support_for__thread = yes; then 
-   AC_DEFINE([TOOLCHAIN_SUPPORTS__THREAD],[],[Whether the tool chain supports __thread])
-fi
-
-AC_MSG_RESULT($support_for__thread)
-
-dnl
-dnl posix tls
-dnl
-
-m4_define([pthread_test_program],AC_LANG_SOURCE([[dnl
-#include <stdlib.h>
-#include <pthread.h>
-
-static pthread_once_t once_control = PTHREAD_ONCE_INIT;
-static pthread_key_t key;
-
-static void
-make_key (void)
-{
-    pthread_key_create (&key, NULL);
-}
-
-int
-main ()
-{
-    void *value = NULL;
-
-    if (pthread_once (&once_control, make_key) != 0)
-    {
-	value = NULL;
-    }
-    else
-    {
-	value = pthread_getspecific (key);
-	if (!value)
-	{
-	    value = malloc (100);
-	    pthread_setspecific (key, value);
-	}
-    }
-    return 0;
-}
-]]))
-
-AC_DEFUN([PIXMAN_CHECK_PTHREAD],[dnl
-    if test "z$support_for_pthread_setspecific" != "zyes"; then
-	PIXMAN_LINK_WITH_ENV(
-		[$1], [pthread_test_program],
-		[PTHREAD_CFLAGS="$CFLAGS"
-		 PTHREAD_LIBS="$LIBS"
-		 PTHREAD_LDFLAGS="$LDFLAGS"
-		 support_for_pthread_setspecific=yes])
-    fi
-])
-
-if test $support_for__thread = no; then
-    support_for_pthread_setspecific=no
-
-    AC_MSG_CHECKING(for pthread_setspecific)
-
-    PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LIBS="-lpthread"])
-    PIXMAN_CHECK_PTHREAD([CFLAGS="-pthread"; LDFLAGS="-pthread"])
-    PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LDFLAGS="-lroot"])
-    
-    if test $support_for_pthread_setspecific = yes; then
-	CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
-	AC_DEFINE([HAVE_PTHREAD_SETSPECIFIC], [], [Whether pthread_setspecific() is supported])
-    fi
-
-    AC_MSG_RESULT($support_for_pthread_setspecific);
-fi
-
-AC_SUBST(TOOLCHAIN_SUPPORTS__THREAD)
-AC_SUBST(HAVE_PTHREAD_SETSPECIFIC)
-AC_SUBST(PTHREAD_LDFLAGS)
-AC_SUBST(PTHREAD_LIBS)
-
-dnl =====================================
-dnl __attribute__((constructor))
-
-support_for_attribute_constructor=no
-
-AC_MSG_CHECKING(for __attribute__((constructor)))
-AC_LINK_IFELSE([AC_LANG_SOURCE([[
-#if defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7))
-/* attribute 'constructor' is supported since gcc 2.7, but some compilers
- * may only pretend to be gcc, so let's try to actually use it
- */
-static int x = 1;
-static void __attribute__((constructor)) constructor_function () { x = 0; }
-int main (void) { return x; }
-#else
-#error not gcc or gcc version is older than 2.7
-#endif
-]])], support_for_attribute_constructor=yes)
-
-if test x$support_for_attribute_constructor = xyes; then
-   AC_DEFINE([TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR],
-             [],[Whether the tool chain supports __attribute__((constructor))])
-fi
-
-AC_MSG_RESULT($support_for_attribute_constructor)
-AC_SUBST(TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR)
-
-dnl ==================
-dnl libpng
-
-AC_CHECK_LIB([png], [png_write_info], [have_libpng=yes], [have_libpng=no])
-
-if test x$have_libpng = xyes; then
-    AC_DEFINE([HAVE_LIBPNG], [1], [Whether we have libpng])
-fi
-
-AC_SUBST(HAVE_LIBPNG)
-
-AC_OUTPUT([pixman-1.pc
-           pixman-1-uninstalled.pc
-           Makefile
-	   pixman/Makefile
-	   pixman/pixman-version.h
-	   demos/Makefile
-	   test/Makefile])
-
-m4_if(m4_eval(pixman_minor % 2), [1], [
-   echo
-   echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
-   echo
-   echo "      Thanks for testing this development snapshot of pixman. Please"
-   echo "      report any problems you find, either by sending email to "
-   echo
-   echo "          pixman@lists.freedesktop.org"
-   echo
-   echo "      or by filing a bug at "
-   echo
-   echo "          https://bugs.freedesktop.org/enter_bug.cgi?product=pixman "
-   echo
-   echo "      If you are looking for a stable release of pixman, please note "
-   echo "      that stable releases have _even_ minor version numbers. Ie., "
-   echo "      pixman-0.]m4_eval(pixman_minor & ~1)[.x are stable releases, whereas pixman-$PIXMAN_VERSION_MAJOR.$PIXMAN_VERSION_MINOR.$PIXMAN_VERSION_MICRO is a "
-   echo "      development snapshot that may contain bugs and experimental "
-   echo "      features. "
-   echo 
-   echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
-   echo
-])
+dnl  Copyright 2005 Red Hat, Inc.
+dnl 
+dnl  Permission to use, copy, modify, distribute, and sell this software and its
+dnl  documentation for any purpose is hereby granted without fee, provided that
+dnl  the above copyright notice appear in all copies and that both that
+dnl  copyright notice and this permission notice appear in supporting
+dnl  documentation, and that the name of Red Hat not be used in
+dnl  advertising or publicity pertaining to distribution of the software without
+dnl  specific, written prior permission.  Red Hat makes no
+dnl  representations about the suitability of this software for any purpose.  It
+dnl  is provided "as is" without express or implied warranty.
+dnl 
+dnl  RED HAT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+dnl  INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+dnl  EVENT SHALL RED HAT BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+dnl  CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+dnl  DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+dnl  TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+dnl  PERFORMANCE OF THIS SOFTWARE.
+dnl
+dnl Process this file with autoconf to create configure.
+
+AC_PREREQ([2.57])
+
+#   Pixman versioning scheme
+#
+#   - The version in git has an odd MICRO version number
+#
+#   - Released versions, both development and stable, have an even MICRO
+#     version number
+#
+#   - Released development versions have an odd MINOR number
+#
+#   - Released stable versions have an even MINOR number
+#
+#   - Versions that break ABI must have a new MAJOR number
+#
+#   - If you break the ABI, then at least this must be done:
+#
+#        - increment MAJOR
+#
+#        - In the first development release where you break ABI, find
+#          all instances of "pixman-n" and change them to pixman-(n+1)
+#
+#          This needs to be done at least in 
+#                    configure.ac
+#                    all Makefile.am's
+#                    pixman-n.pc.in
+#
+#      This ensures that binary incompatible versions can be installed
+#      in parallel.  See http://www106.pair.com/rhp/parallel.html for
+#      more information
+#
+
+m4_define([pixman_major], 0)
+m4_define([pixman_minor], 23)
+m4_define([pixman_micro], 5)
+
+m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
+
+AC_INIT(pixman, pixman_version, [pixman@lists.freedesktop.org], pixman)
+AM_INIT_AUTOMAKE([foreign dist-bzip2])
+
+# Suppress verbose compile lines
+m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
+
+AM_CONFIG_HEADER(config.h)
+
+AC_CANONICAL_HOST
+
+test_CFLAGS=${CFLAGS+set} # We may override autoconf default CFLAGS.
+
+AC_PROG_CC
+AM_PROG_AS
+AC_PROG_LIBTOOL
+AC_CHECK_FUNCS([getisax])
+AC_C_BIGENDIAN
+AC_C_INLINE
+
+dnl PIXMAN_LINK_WITH_ENV(env-setup, program, true-action, false-action)
+dnl
+dnl Compiles and links the given program in the environment setup by env-setup
+dnl and executes true-action on success and false-action on failure.
+AC_DEFUN([PIXMAN_LINK_WITH_ENV],[dnl
+	save_CFLAGS="$CFLAGS"
+	save_LDFLAGS="$LDFLAGS"
+	save_LIBS="$LIBS"
+	CFLAGS=""
+	LDFLAGS=""
+	LIBS=""
+	$1
+	AC_LINK_IFELSE(
+		[AC_LANG_SOURCE([$2])],
+		[pixman_cc_stderr=`test -f conftest.err && cat conftest.err`
+		 pixman_cc_flag=yes],
+		[pixman_cc_stderr=`test -f conftest.err && cat conftest.err`
+		 pixman_cc_flag=no])
+
+	if test "x$pixman_cc_stderr" != "x"; then
+		pixman_cc_flag=no
+	fi
+
+	if test "x$pixman_cc_flag" = "xyes"; then
+		ifelse([$3], , :, [$3])
+	else
+		ifelse([$4], , :, [$4])
+	fi
+	CFLAGS="$save_CFLAGS"
+	LDFLAGS="$save_LDFLAGS"
+	LIBS="$save_LIBS"
+])
+
+dnl Find a -Werror for catching warnings.
+WERROR=
+for w in -Werror -errwarn; do
+    if test "z$WERROR" = "z"; then
+        AC_MSG_CHECKING([whether the compiler supports $w])
+        PIXMAN_LINK_WITH_ENV(
+		[CFLAGS=$w],
+		[int main(int c, char **v) { (void)c; (void)v; return 0; }],
+		[WERROR=$w; yesno=yes], [yesno=no])
+	AC_MSG_RESULT($yesno)
+    fi
+done
+
+dnl PIXMAN_CHECK_CFLAG(flag, [program])
+dnl  Adds flag to CFLAGS if the given program links without warnings or errors.
+AC_DEFUN([PIXMAN_CHECK_CFLAG], [dnl
+	AC_MSG_CHECKING([whether the compiler supports $1])
+	PIXMAN_LINK_WITH_ENV(
+		[CFLAGS="$WERROR $1"],
+		[$2
+		 int main(int c, char **v) { (void)c; (void)v; return 0; }
+		],
+		[_yesno=yes],
+		[_yesno=no])
+	if test "x$_yesno" = xyes; then
+	   CFLAGS="$CFLAGS $1"
+	fi
+	AC_MSG_RESULT($_yesno)
+])
+
+AC_CHECK_SIZEOF(long)
+
+# Checks for Sun Studio compilers
+AC_CHECK_DECL([__SUNPRO_C], [SUNCC="yes"], [SUNCC="no"])
+AC_CHECK_DECL([__amd64], [AMD64_ABI="yes"], [AMD64_ABI="no"])
+
+# Default CFLAGS to -O -g rather than just the -g from AC_PROG_CC
+# if we're using Sun Studio and neither the user nor a config.site
+# has set CFLAGS (test_CFLAGS is empty only if CFLAGS was unset on entry).
+if test $SUNCC = yes &&			\
+   test "$test_CFLAGS" = "" &&		\
+   test "$CFLAGS" = "-g"
+then
+  CFLAGS="-O -g"
+fi
+
+# 
+# We ignore pixman_major in the version here because the major version should
+# always be encoded in the actual library name. I.e., the soname is:
+#
+#      pixman-$(pixman_major).0.minor.micro
+#
+m4_define([lt_current], [pixman_minor])
+m4_define([lt_revision], [pixman_micro])
+m4_define([lt_age], [pixman_minor])
+
+LT_VERSION_INFO="lt_current:lt_revision:lt_age"
+
+PIXMAN_VERSION_MAJOR=pixman_major()
+AC_SUBST(PIXMAN_VERSION_MAJOR)
+PIXMAN_VERSION_MINOR=pixman_minor()
+AC_SUBST(PIXMAN_VERSION_MINOR)
+PIXMAN_VERSION_MICRO=pixman_micro()
+AC_SUBST(PIXMAN_VERSION_MICRO)
+
+AC_SUBST(LT_VERSION_INFO)
+
+# Check for dependencies
+
+PIXMAN_CHECK_CFLAG([-Wall])
+PIXMAN_CHECK_CFLAG([-fno-strict-aliasing])
+
+AC_PATH_PROG(PERL, perl, no)
+if test "x$PERL" = xno; then
+    AC_MSG_ERROR([Perl is required to build pixman.])
+fi
+AC_SUBST(PERL)
+
+dnl =========================================================================
+dnl OpenMP for the test suite?
+dnl
+
+# Check for OpenMP support only when autoconf supports it (requires autoconf >= 2.62)
+OPENMP_CFLAGS=
+m4_ifdef([AC_OPENMP], [AC_OPENMP])
+
+if test "x$enable_openmp" = "xyes" && test "x$ac_cv_prog_c_openmp" = "xunsupported" ; then
+  AC_MSG_WARN([OpenMP support requested but found unsupported])
+fi
+
+dnl May not fail to link without -Wall -Werror added
+dnl So try to link only when openmp is supported
+dnl ac_cv_prog_c_openmp is not defined when --disable-openmp is used
+if test "x$ac_cv_prog_c_openmp" != "xunsupported" && test "x$ac_cv_prog_c_openmp" != "x"; then
+  m4_define([openmp_test_program],[dnl
+  #include <stdio.h>
+
+  extern unsigned int lcg_seed;
+  #pragma omp threadprivate(lcg_seed)
+  unsigned int lcg_seed;
+
+  unsigned function(unsigned a, unsigned b)
+  {
+	lcg_seed ^= b;
+	return ((a + b) ^ a ) + lcg_seed;
+  }
+
+  int main(int argc, char **argv)
+  {
+	int i;
+	int n1 = 0, n2 = argc;
+	unsigned checksum = 0;
+	int verbose = argv != NULL;
+	unsigned (*test_function)(unsigned, unsigned);
+	test_function = function;
+	#pragma omp parallel for reduction(+:checksum) default(none) \
+					shared(n1, n2, test_function, verbose)
+	for (i = n1; i < n2; i++)
+	{
+		unsigned crc = test_function (i, 0);
+		if (verbose)
+			printf ("%d: %08X\n", i, crc);
+		checksum += crc;
+	}
+	printf("%u\n", checksum);
+	return 0;
+  }
+  ])
+
+  PIXMAN_LINK_WITH_ENV(
+	[CFLAGS="$OPENMP_CFLAGS" LDFLAGS="$OPENMP_CFLAGS"],
+	[openmp_test_program],
+	[have_openmp=yes],
+	[have_openmp=no])
+  if test "x$have_openmp" = "xyes" ; then
+    AC_DEFINE(USE_OPENMP, 1, [use OpenMP in the test suite])
+  fi
+fi
+AC_SUBST(OPENMP_CFLAGS)
+
+dnl =========================================================================
+dnl Symbol visibility: -fvisibility=hidden (GCC >= 4, non-Windows) or -xldscope=hidden (Sun Studio >= 0x550)
+
+PIXMAN_CHECK_CFLAG([-fvisibility=hidden], [dnl
+#if defined(__GNUC__) && (__GNUC__ >= 4)
+#ifdef _WIN32
+#error Have -fvisibility but it is ignored and generates a warning
+#endif
+#else
+#error Need GCC 4.0 for visibility
+#endif
+])
+
+PIXMAN_CHECK_CFLAG([-xldscope=hidden], [dnl
+#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x550)
+#else
+#error Need Sun Studio 8 for visibility
+#endif
+])
+
+dnl ===========================================================================
+dnl Check for MMX
+
+if test "x$MMX_CFLAGS" = "x" ; then
+   if test "x$SUNCC" = "xyes"; then
+      # Sun Studio doesn't have an -xarch=mmx flag, so we have to use sse
+      # but if we're building 64-bit, mmx & sse support is on by default and
+      # -xarch=sse throws an error instead
+      if test "$AMD64_ABI" = "no" ; then
+         MMX_CFLAGS="-xarch=sse"
+      fi
+   else
+      MMX_CFLAGS="-mmmx -Winline"
+   fi
+fi
+
+have_mmx_intrinsics=no
+AC_MSG_CHECKING(whether to use MMX intrinsics)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$MMX_CFLAGS $CFLAGS"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
+#error "Need GCC >= 3.4 for MMX intrinsics"
+#endif
+#include <mmintrin.h>
+int main () {
+    __m64 v = _mm_cvtsi32_si64 (1);
+    return _mm_cvtsi64_si32 (v);
+}]])], have_mmx_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(mmx,
+   [AC_HELP_STRING([--disable-mmx],
+                   [disable MMX fast paths])],
+   [enable_mmx=$enableval], [enable_mmx=auto])
+
+if test $enable_mmx = no ; then
+   have_mmx_intrinsics=disabled
+fi
+
+if test $have_mmx_intrinsics = yes ; then
+   AC_DEFINE(USE_MMX, 1, [use MMX compiler intrinsics])
+else
+   MMX_CFLAGS=
+fi
+
+AC_MSG_RESULT($have_mmx_intrinsics)
+if test $enable_mmx = yes && test $have_mmx_intrinsics = no ; then
+   AC_MSG_ERROR([MMX intrinsics not detected])
+fi
+
+AM_CONDITIONAL(USE_MMX, test $have_mmx_intrinsics = yes)
+
+dnl ===========================================================================
+dnl Check for SSE2
+
+if test "x$SSE2_CFLAGS" = "x" ; then
+   if test "x$SUNCC" = "xyes"; then
+      # SSE2 is enabled by default in the Sun Studio 64-bit environment
+      if test "$AMD64_ABI" = "no" ; then
+         SSE2_CFLAGS="-xarch=sse2"
+      fi
+   else
+      SSE2_CFLAGS="-msse2 -Winline"
+   fi
+fi
+
+have_sse2_intrinsics=no
+AC_MSG_CHECKING(whether to use SSE2 intrinsics)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$SSE2_CFLAGS $CFLAGS"
+
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
+#   if !defined(__amd64__) && !defined(__x86_64__)
+#      error "Need GCC >= 4.2 for SSE2 intrinsics on x86"
+#   endif
+#endif
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+int main () {
+    __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c;
+	c = _mm_xor_si128 (a, b);
+    return 0;
+}]])], have_sse2_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(sse2,
+   [AC_HELP_STRING([--disable-sse2],
+                   [disable SSE2 fast paths])],
+   [enable_sse2=$enableval], [enable_sse2=auto])
+
+if test $enable_sse2 = no ; then
+   have_sse2_intrinsics=disabled
+fi
+
+if test $have_sse2_intrinsics = yes ; then
+   AC_DEFINE(USE_SSE2, 1, [use SSE2 compiler intrinsics])
+fi
+
+AC_MSG_RESULT($have_sse2_intrinsics)
+if test $enable_sse2 = yes && test $have_sse2_intrinsics = no ; then
+   AC_MSG_ERROR([SSE2 intrinsics not detected])
+fi
+
+AM_CONDITIONAL(USE_SSE2, test $have_sse2_intrinsics = yes)
+
+dnl ===========================================================================
+dnl Other special flags needed when building code using MMX or SSE instructions
+case $host_os in
+   solaris*)
+      # When building 32-bit binaries, apply a mapfile to ensure that the
+      # binaries aren't flagged as only able to run on MMX+SSE capable CPUs
+      # since they check at runtime before using those instructions.
+      # Not all linkers grok the mapfile format so we check for that first.
+      if test "$AMD64_ABI" = "no" ; then
+	 use_hwcap_mapfile=no
+	 AC_MSG_CHECKING(whether to use a hardware capability map file)
+	 hwcap_save_LDFLAGS="$LDFLAGS"
+	 HWCAP_LDFLAGS='-Wl,-M,$(srcdir)/solaris-hwcap.mapfile'
+	 LDFLAGS="$LDFLAGS -Wl,-M,pixman/solaris-hwcap.mapfile"
+	 AC_LINK_IFELSE([AC_LANG_SOURCE([[int main() { return 0; }]])],
+			use_hwcap_mapfile=yes,
+			HWCAP_LDFLAGS="")
+	 LDFLAGS="$hwcap_save_LDFLAGS"
+	 AC_MSG_RESULT($use_hwcap_mapfile)
+      fi
+      if test "x$MMX_LDFLAGS" = "x" ; then
+         MMX_LDFLAGS="$HWCAP_LDFLAGS"
+      fi
+      if test "x$SSE2_LDFLAGS" = "x" ; then
+	 SSE2_LDFLAGS="$HWCAP_LDFLAGS"
+      fi
+      ;;
+esac
+
+AC_SUBST(MMX_CFLAGS)
+AC_SUBST(MMX_LDFLAGS)
+AC_SUBST(SSE2_CFLAGS)
+AC_SUBST(SSE2_LDFLAGS)
+
+dnl ===========================================================================
+dnl Check for VMX/Altivec
+if test -n "`$CC -v 2>&1 | grep version | grep Apple`"; then
+    VMX_CFLAGS="-faltivec"
+else
+    VMX_CFLAGS="-maltivec -mabi=altivec"
+fi
+
+have_vmx_intrinsics=no
+AC_MSG_CHECKING(whether to use VMX/Altivec intrinsics)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$VMX_CFLAGS $CFLAGS"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
+#error "Need GCC >= 3.4 for sane altivec support"
+#endif
+#include <altivec.h>
+int main () {
+    vector unsigned int v = vec_splat_u32 (1);
+    v = vec_sub (v, v);
+    return 0;
+}]])], have_vmx_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(vmx,
+   [AC_HELP_STRING([--disable-vmx],
+                   [disable VMX fast paths])],
+   [enable_vmx=$enableval], [enable_vmx=auto])
+
+if test $enable_vmx = no ; then
+   have_vmx_intrinsics=disabled
+fi
+
+if test $have_vmx_intrinsics = yes ; then
+   AC_DEFINE(USE_VMX, 1, [use VMX compiler intrinsics])
+else
+   VMX_CFLAGS=
+fi
+
+AC_MSG_RESULT($have_vmx_intrinsics)
+if test $enable_vmx = yes && test $have_vmx_intrinsics = no ; then
+   AC_MSG_ERROR([VMX intrinsics not detected])
+fi
+
+AC_SUBST(VMX_CFLAGS)
+
+AM_CONDITIONAL(USE_VMX, test $have_vmx_intrinsics = yes)
+
+dnl ==========================================================================
+dnl Check if assembler is gas compatible and supports ARM SIMD instructions
+have_arm_simd=no
+AC_MSG_CHECKING(whether to use ARM SIMD assembler)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="-x assembler-with-cpp $CFLAGS"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+.text
+.arch armv6
+.object_arch armv4
+.arm
+.altmacro
+#ifndef __ARM_EABI__
+#error EABI is required (to be sure that calling conventions are compatible)
+#endif
+pld [r0]
+uqadd8 r0, r0, r0]])], have_arm_simd=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(arm-simd,
+   [AC_HELP_STRING([--disable-arm-simd],
+                   [disable ARM SIMD fast paths])],
+   [enable_arm_simd=$enableval], [enable_arm_simd=auto])
+
+if test $enable_arm_simd = no ; then
+   have_arm_simd=disabled
+fi
+
+if test $have_arm_simd = yes ; then
+   AC_DEFINE(USE_ARM_SIMD, 1, [use ARM SIMD assembly optimizations])
+fi
+
+AM_CONDITIONAL(USE_ARM_SIMD, test $have_arm_simd = yes)
+
+AC_MSG_RESULT($have_arm_simd)
+if test $enable_arm_simd = yes && test $have_arm_simd = no ; then
+   AC_MSG_ERROR([ARM SIMD intrinsics not detected])
+fi
+
+dnl ==========================================================================
+dnl Check if assembler is gas compatible and supports NEON instructions
+have_arm_neon=no
+AC_MSG_CHECKING(whether to use ARM NEON assembler)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="-x assembler-with-cpp $CFLAGS"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+.text
+.fpu neon
+.arch armv7a
+.object_arch armv4
+.eabi_attribute 10, 0
+.arm
+.altmacro
+#ifndef __ARM_EABI__
+#error EABI is required (to be sure that calling conventions are compatible)
+#endif
+pld [r0]
+vmovn.u16 d0, q0]])], have_arm_neon=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(arm-neon,
+   [AC_HELP_STRING([--disable-arm-neon],
+                   [disable ARM NEON fast paths])],
+   [enable_arm_neon=$enableval], [enable_arm_neon=auto])
+
+if test $enable_arm_neon = no ; then
+   have_arm_neon=disabled
+fi
+
+if test $have_arm_neon = yes ; then
+   AC_DEFINE(USE_ARM_NEON, 1, [use ARM NEON assembly optimizations])
+fi
+
+AM_CONDITIONAL(USE_ARM_NEON, test $have_arm_neon = yes)
+
+AC_MSG_RESULT($have_arm_neon)
+if test $enable_arm_neon = yes && test $have_arm_neon = no ; then
+   AC_MSG_ERROR([ARM NEON intrinsics not detected])
+fi
+
+dnl =========================================================================================
+dnl Check for GNU-style inline assembly support
+
+have_gcc_inline_asm=no
+AC_MSG_CHECKING(whether to use GNU-style inline assembler)
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+int main () {
+    /* Most modern architectures have a NOP instruction, so this is a fairly generic test. */
+	asm volatile ( "\tnop\n" : : : "cc", "memory" );
+    return 0;
+}]])], have_gcc_inline_asm=yes)
+
+AC_ARG_ENABLE(gcc-inline-asm,
+   [AC_HELP_STRING([--disable-gcc-inline-asm],
+                   [disable GNU-style inline assembler])],
+   [enable_gcc_inline_asm=$enableval], [enable_gcc_inline_asm=auto])
+
+if test $enable_gcc_inline_asm = no ; then
+   have_gcc_inline_asm=disabled
+fi
+
+if test $have_gcc_inline_asm = yes ; then
+   AC_DEFINE(USE_GCC_INLINE_ASM, 1, [use GNU-style inline assembler])
+fi
+
+AC_MSG_RESULT($have_gcc_inline_asm)
+if test $enable_gcc_inline_asm = yes && test $have_gcc_inline_asm = no ; then
+   AC_MSG_ERROR([GNU-style inline assembler not detected])
+fi
+
+AM_CONDITIONAL(USE_GCC_INLINE_ASM, test $have_gcc_inline_asm = yes)
+
+dnl ==============================================
+dnl Static test programs
+
+AC_ARG_ENABLE(static-testprogs,
+   [AC_HELP_STRING([--enable-static-testprogs],
+		   [build test programs as static binaries [default=no]])],
+   [enable_static_testprogs=$enableval], [enable_static_testprogs=no])
+
+TESTPROGS_EXTRA_LDFLAGS=
+if test "x$enable_static_testprogs" = "xyes" ; then
+   TESTPROGS_EXTRA_LDFLAGS="-all-static"
+fi
+AC_SUBST(TESTPROGS_EXTRA_LDFLAGS)
+
+dnl ==============================================
+dnl Timers
+
+AC_ARG_ENABLE(timers,
+   [AC_HELP_STRING([--enable-timers],
+		   [enable TIMER_BEGIN and TIMER_END macros [default=no]])],
+   [enable_timers=$enableval], [enable_timers=no])
+
+if test $enable_timers = yes ; then 
+   AC_DEFINE(PIXMAN_TIMERS, 1, [enable TIMER_BEGIN/TIMER_END macros])
+fi
+AC_SUBST(PIXMAN_TIMERS)
+
+dnl ===================================
+dnl GTK+
+
+AC_ARG_ENABLE(gtk,
+   [AC_HELP_STRING([--enable-gtk],
+                   [enable tests using GTK+ [default=auto]])],
+   [enable_gtk=$enableval], [enable_gtk=auto])
+
+PKG_PROG_PKG_CONFIG
+
+if test $enable_gtk = yes ; then
+   AC_CHECK_LIB([pixman-1], [pixman_version_string])
+   PKG_CHECK_MODULES(GTK, [gtk+-2.0 pixman-1])
+fi
+
+if test $enable_gtk = auto ; then
+   AC_CHECK_LIB([pixman-1], [pixman_version_string], [enable_gtk=auto], [enable_gtk=no])
+fi
+
+if test $enable_gtk = auto ; then
+   PKG_CHECK_MODULES(GTK, [gtk+-2.0 pixman-1], [enable_gtk=yes], [enable_gtk=no])
+fi
+
+AM_CONDITIONAL(HAVE_GTK, [test "x$enable_gtk" = xyes])
+
+AC_SUBST(GTK_CFLAGS)
+AC_SUBST(GTK_LIBS)
+AC_SUBST(DEP_CFLAGS)
+AC_SUBST(DEP_LIBS)
+
+dnl =====================================
+dnl posix_memalign, sigaction, alarm, gettimeofday
+
+AC_CHECK_FUNC(posix_memalign, have_posix_memalign=yes, have_posix_memalign=no)
+if test x$have_posix_memalign = xyes; then
+   AC_DEFINE(HAVE_POSIX_MEMALIGN, 1, [Whether we have posix_memalign()])
+fi
+
+AC_CHECK_FUNC(sigaction, have_sigaction=yes, have_sigaction=no)
+if test x$have_sigaction = xyes; then
+   AC_DEFINE(HAVE_SIGACTION, 1, [Whether we have sigaction()])
+fi
+
+AC_CHECK_FUNC(alarm, have_alarm=yes, have_alarm=no)
+if test x$have_alarm = xyes; then
+   AC_DEFINE(HAVE_ALARM, 1, [Whether we have alarm()])
+fi
+
+AC_CHECK_HEADER([sys/mman.h],
+   [AC_DEFINE(HAVE_SYS_MMAN_H, [1], [Define to 1 if we have <sys/mman.h>])])
+
+AC_CHECK_FUNC(mmap, have_mmap=yes, have_mmap=no)
+if test x$have_mmap = xyes; then
+   AC_DEFINE(HAVE_MMAP, 1, [Whether we have mmap()])
+fi
+
+AC_CHECK_FUNC(mprotect, have_mprotect=yes, have_mprotect=no)
+if test x$have_mprotect = xyes; then
+   AC_DEFINE(HAVE_MPROTECT, 1, [Whether we have mprotect()])
+fi
+
+AC_CHECK_FUNC(getpagesize, have_getpagesize=yes, have_getpagesize=no)
+if test x$have_getpagesize = xyes; then
+   AC_DEFINE(HAVE_GETPAGESIZE, 1, [Whether we have getpagesize()])
+fi
+
+AC_CHECK_HEADER([fenv.h],
+   [AC_DEFINE(HAVE_FENV_H, [1], [Define to 1 if we have <fenv.h>])])
+
+AC_CHECK_LIB(m, feenableexcept, have_feenableexcept=yes, have_feenableexcept=no)
+if test x$have_feenableexcept = xyes; then
+   AC_DEFINE(HAVE_FEENABLEEXCEPT, 1, [Whether we have feenableexcept()])
+fi
+
+AC_CHECK_FUNC(gettimeofday, have_gettimeofday=yes, have_gettimeofday=no)
+AC_CHECK_HEADER(sys/time.h, have_sys_time_h=yes, have_sys_time_h=no)
+if test x$have_gettimeofday = xyes && test x$have_sys_time_h = xyes; then
+   AC_DEFINE(HAVE_GETTIMEOFDAY, 1, [Whether we have gettimeofday()])
+fi
+
+dnl =====================================
+dnl Thread local storage
+
+support_for__thread=no
+
+AC_MSG_CHECKING(for __thread)
+AC_LINK_IFELSE([AC_LANG_SOURCE([[
+#if defined(__MINGW32__) && !(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))
+#error This MinGW version has broken __thread support
+#endif
+#ifdef __OpenBSD__
+#error OpenBSD has broken __thread support
+#endif
+static __thread int x ;
+int main () { x = 123; return x; }
+]])], support_for__thread=yes)
+
+if test $support_for__thread = yes; then 
+   AC_DEFINE([TOOLCHAIN_SUPPORTS__THREAD],[],[Whether the tool chain supports __thread])
+fi
+
+AC_MSG_RESULT($support_for__thread)
+
+dnl
+dnl posix tls
+dnl
+
+m4_define([pthread_test_program],AC_LANG_SOURCE([[dnl
+#include <stdlib.h>
+#include <pthread.h>
+
+static pthread_once_t once_control = PTHREAD_ONCE_INIT;
+static pthread_key_t key;
+
+static void
+make_key (void)
+{
+    pthread_key_create (&key, NULL);
+}
+
+int
+main ()
+{
+    void *value = NULL;
+
+    if (pthread_once (&once_control, make_key) != 0)
+    {
+	value = NULL;
+    }
+    else
+    {
+	value = pthread_getspecific (key);
+	if (!value)
+	{
+	    value = malloc (100);
+	    pthread_setspecific (key, value);
+	}
+    }
+    return 0;
+}
+]]))
+
+AC_DEFUN([PIXMAN_CHECK_PTHREAD],[dnl
+    if test "z$support_for_pthread_setspecific" != "zyes"; then
+	PIXMAN_LINK_WITH_ENV(
+		[$1], [pthread_test_program],
+		[PTHREAD_CFLAGS="$CFLAGS"
+		 PTHREAD_LIBS="$LIBS"
+		 PTHREAD_LDFLAGS="$LDFLAGS"
+		 support_for_pthread_setspecific=yes])
+    fi
+])
+
+if test $support_for__thread = no; then
+    support_for_pthread_setspecific=no
+
+    AC_MSG_CHECKING(for pthread_setspecific)
+
+    PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LIBS="-lpthread"])
+    PIXMAN_CHECK_PTHREAD([CFLAGS="-pthread"; LDFLAGS="-pthread"])
+    PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LDFLAGS="-lroot"])
+    
+    if test $support_for_pthread_setspecific = yes; then
+	CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+	AC_DEFINE([HAVE_PTHREAD_SETSPECIFIC], [], [Whether pthread_setspecific() is supported])
+    fi
+
+    AC_MSG_RESULT($support_for_pthread_setspecific);
+fi
+
+AC_SUBST(TOOLCHAIN_SUPPORTS__THREAD)
+AC_SUBST(HAVE_PTHREAD_SETSPECIFIC)
+AC_SUBST(PTHREAD_LDFLAGS)
+AC_SUBST(PTHREAD_LIBS)
+
+dnl =====================================
+dnl __attribute__((constructor))
+
+support_for_attribute_constructor=no
+
+AC_MSG_CHECKING(for __attribute__((constructor)))
+AC_LINK_IFELSE([AC_LANG_SOURCE([[
+#if defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7))
+/* attribute 'constructor' is supported since gcc 2.7, but some compilers
+ * may only pretend to be gcc, so let's try to actually use it
+ */
+static int x = 1;
+static void __attribute__((constructor)) constructor_function () { x = 0; }
+int main (void) { return x; }
+#else
+#error not gcc or gcc version is older than 2.7
+#endif
+]])], support_for_attribute_constructor=yes)
+
+if test x$support_for_attribute_constructor = xyes; then
+   AC_DEFINE([TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR],
+             [],[Whether the tool chain supports __attribute__((constructor))])
+fi
+
+AC_MSG_RESULT($support_for_attribute_constructor)
+AC_SUBST(TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR)
+
+dnl ==================
+dnl libpng
+
+AC_CHECK_LIB([png], [png_write_info], [have_libpng=yes], [have_libpng=no])
+
+if test x$have_libpng = xyes; then
+    AC_DEFINE([HAVE_LIBPNG], [1], [Whether we have libpng])
+fi
+
+AC_SUBST(HAVE_LIBPNG)
+
+AC_OUTPUT([pixman-1.pc
+           pixman-1-uninstalled.pc
+           Makefile
+	   pixman/Makefile
+	   pixman/pixman-version.h
+	   demos/Makefile
+	   test/Makefile])
+
+m4_if(m4_eval(pixman_minor % 2), [1], [
+   echo
+   echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+   echo
+   echo "      Thanks for testing this development snapshot of pixman. Please"
+   echo "      report any problems you find, either by sending email to "
+   echo
+   echo "          pixman@lists.freedesktop.org"
+   echo
+   echo "      or by filing a bug at "
+   echo
+   echo "          https://bugs.freedesktop.org/enter_bug.cgi?product=pixman "
+   echo
+   echo "      If you are looking for a stable release of pixman, please note "
+   echo "      that stable releases have _even_ minor version numbers. Ie., "
+   echo "      pixman-0.]m4_eval(pixman_minor & ~1)[.x are stable releases, whereas pixman-$PIXMAN_VERSION_MAJOR.$PIXMAN_VERSION_MINOR.$PIXMAN_VERSION_MICRO is a "
+   echo "      development snapshot that may contain bugs and experimental "
+   echo "      features. "
+   echo 
+   echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+   echo
+])
diff --git a/pixman/demos/tri-test.c b/pixman/demos/tri-test.c
index dca84556b..a71869a6a 100644
--- a/pixman/demos/tri-test.c
+++ b/pixman/demos/tri-test.c
@@ -1,48 +1,48 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "../test/utils.h"
-#include "gtk-utils.h"
-
-int
-main (int argc, char **argv)
-{
-#define WIDTH 200
-#define HEIGHT 200
-
-#define POINT(x,y)							\
-    { pixman_double_to_fixed ((x)), pixman_double_to_fixed ((y)) }
-    
-    pixman_image_t *src_img, *dest_img;
-    pixman_triangle_t tris[4] =
-    {
-	{ POINT (100, 100), POINT (10, 50), POINT (110, 10) },
-	{ POINT (100, 100), POINT (150, 10), POINT (200, 50) },
-	{ POINT (100, 100), POINT (10, 170), POINT (90, 175) },
-	{ POINT (100, 100), POINT (170, 150), POINT (120, 190) },
-    };
-    pixman_color_t color = { 0x4444, 0x4444, 0xffff, 0xffff };
-    uint32_t *bits = malloc (WIDTH * HEIGHT * 4);
-    int i;
-
-    for (i = 0; i < WIDTH * HEIGHT; ++i)
-	bits[i] = (i / HEIGHT) * 0x01010000;
-    
-    src_img = pixman_image_create_solid_fill (&color);
-    dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, bits, WIDTH * 4);
-    
-    pixman_composite_triangles (PIXMAN_OP_ATOP_REVERSE,
-				src_img,
-				dest_img,
-				PIXMAN_a8,
-				200, 200,
-				-5, 5,
-				ARRAY_LENGTH (tris), tris);
-    show_image (dest_img);
-    
-    pixman_image_unref (src_img);
-    pixman_image_unref (dest_img);
-    free (bits);
-    
-    return 0;
-}
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "../test/utils.h"
+#include "gtk-utils.h"
+
+int
+main (int argc, char **argv)
+{
+#define WIDTH 200
+#define HEIGHT 200
+    /* Demo: composite four triangles that share the vertex (100, 100). */
+#define POINT(x,y)							\
+    { pixman_double_to_fixed ((x)), pixman_double_to_fixed ((y)) }
+    
+    pixman_image_t *src_img, *dest_img;
+    pixman_triangle_t tris[4] =
+    {
+	{ POINT (100, 100), POINT (10, 50), POINT (110, 10) },
+	{ POINT (100, 100), POINT (150, 10), POINT (200, 50) },
+	{ POINT (100, 100), POINT (10, 170), POINT (90, 175) },
+	{ POINT (100, 100), POINT (170, 150), POINT (120, 190) },
+    };
+    pixman_color_t color = { 0x4444, 0x4444, 0xffff, 0xffff };
+    uint32_t *bits = malloc (WIDTH * HEIGHT * 4);
+    int i;
+
+    if (!bits)
+	return 1;	/* allocation failed; nothing else to release yet */
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+	bits[i] = (i / WIDTH) * 0x01010000u; /* row index; unsigned avoids int overflow */
+    
+    src_img = pixman_image_create_solid_fill (&color);
+    dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, bits, WIDTH * 4);
+    pixman_composite_triangles (PIXMAN_OP_ATOP_REVERSE,
+				src_img,
+				dest_img,
+				PIXMAN_a8,
+				200, 200,
+				-5, 5,
+				ARRAY_LENGTH (tris), tris);
+    show_image (dest_img);
+    pixman_image_unref (src_img);
+    pixman_image_unref (dest_img);
+    free (bits);
+    
+    return 0;
+}
diff --git a/pixman/pixman/Makefile.win32 b/pixman/pixman/Makefile.win32
index f54a16f2a..7c92722c3 100644
--- a/pixman/pixman/Makefile.win32
+++ b/pixman/pixman/Makefile.win32
@@ -1,147 +1,147 @@
-LIBRARY     = pixman-1
-
-CC   = cl
-LINK = link
-
-CFG_VAR = $(CFG)
-ifeq ($(CFG_VAR),)
-CFG_VAR=release
-endif
-
-MMX_VAR = $(MMX)
-ifeq ($(MMX_VAR),)
-MMX_VAR=on
-endif
-
-SSE2_VAR = $(SSE2)
-ifeq ($(SSE2_VAR),)
-SSE2_VAR=on
-endif
-
-CFLAGS     = -MD -nologo -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE -I../pixman/src -I. -DPACKAGE=$(LIBRARY) -DPACKAGE_VERSION="" -DPACKAGE_BUGREPORT=""
-MMX_CFLAGS = -DUSE_MMX -w14710 -w14714
-SSE2_CFLAGS = -DUSE_SSE2
-
-# optimization flags
-ifeq ($(CFG_VAR),debug)
-CFLAGS += -Od -Zi
-else
-CFLAGS += -O2
-endif
-
-SOURCES =				\
-	pixman-image.c			\
-	pixman-access.c			\
-	pixman-access-accessors.c	\
-	pixman-region16.c		\
-	pixman-region32.c		\
-	pixman-combine32.c		\
-	pixman-combine64.c		\
-	pixman-utils.c			\
-	pixman-edge.c			\
-	pixman-edge-accessors.c		\
-	pixman-trap.c			\
-	pixman-timer.c			\
-	pixman-matrix.c			\
-	pixman-gradient-walker.c	\
-	pixman-conical-gradient.c	\
-	pixman-linear-gradient.c	\
-	pixman-radial-gradient.c	\
-	pixman-bits-image.c		\
-	pixman.c			\
-	pixman-noop.c			\
-	pixman-cpu.c			\
-	pixman-fast-path.c		\
-	pixman-implementation.c		\
-	pixman-solid-fill.c		\
-	pixman-general.c		\
-	$(NULL)
-
-BUILT_SOURCES = pixman-combine32.h pixman-combine32.c pixman-combine64.h pixman-combine64.c
-
-# MMX compilation flags
-ifeq ($(MMX_VAR),on)
-CFLAGS += $(MMX_CFLAGS)
-SOURCES += pixman-mmx.c
-endif
-
-# SSE2 compilation flags
-ifeq ($(SSE2_VAR),on)
-CFLAGS += $(SSE2_CFLAGS)
-SOURCES += pixman-sse2.c
-endif
-
-OBJECTS     = $(patsubst %.c, $(CFG_VAR)/%.obj, $(SOURCES))
-
-# targets
-all: inform informMMX informSSE2 $(CFG_VAR)/$(LIBRARY).lib
-	@exit 0
-clean: inform clean_r 
-	@exit 0
-pixman: inform informMMX informSSE2 $(CFG_VAR)/$(LIBRARY).lib 
-	@exit 0
-
-inform:
-ifneq ($(CFG),release)
-ifneq ($(CFG),debug)
-ifneq ($(CFG),)
-	@echo "Invalid specified configuration option : "$(CFG)"."
-	@echo
-	@echo -n "Possible choices for configuration are "
-	@echo "'release' and 'debug'"
-	@echo ""
-	@exit 1
-endif
-	@echo "Using default RELEASE configuration... (use CFG=release or CFG=debug)"
-endif
-endif
-
-informMMX:
-ifneq ($(MMX),off)
-ifneq ($(MMX),on)
-ifneq ($(MMX),)
-	@echo "Invalid specified MMX option : "$(MMX_VAR)"."
-	@echo
-	@echo -n "Possible choices for MMX are 'on' or 'off'"
-	@echo ""
-	@exit 1
-endif
-	@echo "Setting MMX flag to default value 'on'... (use MMX=on or MMX=off)"
-endif
-endif
-
-informSSE2:
-ifneq ($(SSE2),off)
-ifneq ($(SSE2),on)
-ifneq ($(SSE2),)
-	@echo "Invalid specified SSE option : "$(SSE2)"."
-	@echo
-	@echo -n "Possible choices for SSE2 are 'on' or 'off'"
-	@echo ""
-	@exit 1
-endif
-	@echo "Setting SSE2 flag to default value 'on'... (use SSE2=on or SSE2=off)"
-endif
-endif
-
-# pixman compilation and linking
-$(CFG_VAR)/%.obj: %.c $(BUILT_SOURCES)
-	@mkdir -p $(CFG_VAR)
-	@$(CC) -c $(CFLAGS) -Fo"$@" $<
-
-$(CFG_VAR)/$(LIBRARY).lib: $(OBJECTS)
-	lib -NOLOGO -OUT:$@ $(OBJECTS) || exit 0
-
-pixman-combine32.c: pixman-combine.c.template pixman-combine32.h make-combine.pl
-	perl ./make-combine.pl 8 < $< > $@ || ($(RM) $@; exit 1)
-pixman-combine32.h: pixman-combine.h.template make-combine.pl
-	perl ./make-combine.pl 8 < $< > $@ || ($(RM) $@; exit 1)
-
-pixman-combine64.c: pixman-combine.c.template pixman-combine64.h make-combine.pl
-	perl ./make-combine.pl 16 < $< > $@ || ($(RM) $@; exit 1)
-pixman-combine64.h: pixman-combine.h.template make-combine.pl
-	perl ./make-combine.pl 16 < $< > $@ || ($(RM) $@; exit 1)
-
-clean_r:
-	@rm -f $(CFG_VAR)/*.obj $(CFG_VAR)/*.lib $(CFG_VAR)/*.pdb $(CFG)/*.ilk || exit 0
-	@rm -f $(CFG)/*.obj $(CFG)/*.lib $(CFG)/*.pdb $(CFG)/*.ilk $(BUILT_SOURCES) || exit 0
+LIBRARY     = pixman-1
+
+CC   = cl
+LINK = link
+
+CFG_VAR = $(CFG)
+ifeq ($(CFG_VAR),)
+CFG_VAR=release
+endif
+
+MMX_VAR = $(MMX)
+ifeq ($(MMX_VAR),)
+MMX_VAR=on
+endif
+
+SSE2_VAR = $(SSE2)
+ifeq ($(SSE2_VAR),)
+SSE2_VAR=on
+endif
+
+CFLAGS     = -MD -nologo -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE -I../pixman/src -I. -DPACKAGE=$(LIBRARY) -DPACKAGE_VERSION="" -DPACKAGE_BUGREPORT=""
+MMX_CFLAGS = -DUSE_MMX -w14710 -w14714
+SSE2_CFLAGS = -DUSE_SSE2
+
+# optimization flags
+ifeq ($(CFG_VAR),debug)
+CFLAGS += -Od -Zi
+else
+CFLAGS += -O2
+endif
+
+SOURCES =				\
+	pixman-image.c			\
+	pixman-access.c			\
+	pixman-access-accessors.c	\
+	pixman-region16.c		\
+	pixman-region32.c		\
+	pixman-combine32.c		\
+	pixman-combine64.c		\
+	pixman-utils.c			\
+	pixman-edge.c			\
+	pixman-edge-accessors.c		\
+	pixman-trap.c			\
+	pixman-timer.c			\
+	pixman-matrix.c			\
+	pixman-gradient-walker.c	\
+	pixman-conical-gradient.c	\
+	pixman-linear-gradient.c	\
+	pixman-radial-gradient.c	\
+	pixman-bits-image.c		\
+	pixman.c			\
+	pixman-noop.c			\
+	pixman-cpu.c			\
+	pixman-fast-path.c		\
+	pixman-implementation.c		\
+	pixman-solid-fill.c		\
+	pixman-general.c		\
+	$(NULL)
+
+BUILT_SOURCES = pixman-combine32.h pixman-combine32.c pixman-combine64.h pixman-combine64.c
+
+# MMX compilation flags
+ifeq ($(MMX_VAR),on)
+CFLAGS += $(MMX_CFLAGS)
+SOURCES += pixman-mmx.c
+endif
+
+# SSE2 compilation flags
+ifeq ($(SSE2_VAR),on)
+CFLAGS += $(SSE2_CFLAGS)
+SOURCES += pixman-sse2.c
+endif
+
+OBJECTS     = $(patsubst %.c, $(CFG_VAR)/%.obj, $(SOURCES))
+
+# targets
+all: inform informMMX informSSE2 $(CFG_VAR)/$(LIBRARY).lib
+	@exit 0
+clean: inform clean_r 
+	@exit 0
+pixman: inform informMMX informSSE2 $(CFG_VAR)/$(LIBRARY).lib 
+	@exit 0
+
+inform:
+ifneq ($(CFG),release)
+ifneq ($(CFG),debug)
+ifneq ($(CFG),)
+	@echo "Invalid specified configuration option : "$(CFG)"."
+	@echo
+	@echo -n "Possible choices for configuration are "
+	@echo "'release' and 'debug'"
+	@echo ""
+	@exit 1
+endif
+	@echo "Using default RELEASE configuration... (use CFG=release or CFG=debug)"
+endif
+endif
+
+informMMX:
+ifneq ($(MMX),off)
+ifneq ($(MMX),on)
+ifneq ($(MMX),)
+	@echo "Invalid specified MMX option : "$(MMX_VAR)"."
+	@echo
+	@echo -n "Possible choices for MMX are 'on' or 'off'"
+	@echo ""
+	@exit 1
+endif
+	@echo "Setting MMX flag to default value 'on'... (use MMX=on or MMX=off)"
+endif
+endif
+
+informSSE2: # validate the SSE2= command-line option and report the default
+ifneq ($(SSE2),off)
+ifneq ($(SSE2),on)
+ifneq ($(SSE2),)
+	@echo "Invalid specified SSE2 option : "$(SSE2)"."
+	@echo
+	@echo -n "Possible choices for SSE2 are 'on' or 'off'"
+	@echo ""
+	@exit 1
+endif
+	@echo "Setting SSE2 flag to default value 'on'... (use SSE2=on or SSE2=off)"
+endif
+endif
+
+# pixman compilation and linking
+$(CFG_VAR)/%.obj: %.c $(BUILT_SOURCES)
+	@mkdir -p $(CFG_VAR)
+	@$(CC) -c $(CFLAGS) -Fo"$@" $<
+
+$(CFG_VAR)/$(LIBRARY).lib: $(OBJECTS) # a failing archive step must fail the build
+	lib -NOLOGO -OUT:$@ $(OBJECTS)
+
+pixman-combine32.c: pixman-combine.c.template pixman-combine32.h make-combine.pl
+	perl ./make-combine.pl 8 < $< > $@ || ($(RM) $@; exit 1)
+pixman-combine32.h: pixman-combine.h.template make-combine.pl
+	perl ./make-combine.pl 8 < $< > $@ || ($(RM) $@; exit 1)
+
+pixman-combine64.c: pixman-combine.c.template pixman-combine64.h make-combine.pl
+	perl ./make-combine.pl 16 < $< > $@ || ($(RM) $@; exit 1)
+pixman-combine64.h: pixman-combine.h.template make-combine.pl
+	perl ./make-combine.pl 16 < $< > $@ || ($(RM) $@; exit 1)
+
+clean_r: # use CFG_VAR (never empty) so the globs cannot expand to "/*.obj" etc.
+	@rm -f $(CFG_VAR)/*.obj $(CFG_VAR)/*.lib $(CFG_VAR)/*.pdb $(CFG_VAR)/*.ilk || exit 0
+	@rm -f $(BUILT_SOURCES) || exit 0
diff --git a/pixman/pixman/pixman-access.c b/pixman/pixman/pixman-access.c
index d56dab4b2..32c4d8b2c 100644
--- a/pixman/pixman/pixman-access.c
+++ b/pixman/pixman/pixman-access.c
@@ -1,3086 +1,3086 @@
-/*
- *
- * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
- *             2005 Lars Knoll & Zack Rusin, Trolltech
- *             2008 Aaron Plattner, NVIDIA Corporation
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Keith Packard not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission.  Keith Packard makes no
- * representations about the suitability of this software for any purpose.  It
- * is provided "as is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#include <stdlib.h>
-#include <string.h>
-#include <assert.h>
-
-#include "pixman-private.h"
-#include "pixman-accessor.h"
-
-#define CONVERT_RGB24_TO_Y15(s)						\
-    (((((s) >> 16) & 0xff) * 153 +					\
-      (((s) >>  8) & 0xff) * 301 +					\
-      (((s)      ) & 0xff) * 58) >> 2)
-
-#define CONVERT_RGB24_TO_RGB15(s)                                       \
-    ((((s) >> 3) & 0x001f) |                                            \
-     (((s) >> 6) & 0x03e0) |                                            \
-     (((s) >> 9) & 0x7c00))
-
-#define RGB15_TO_ENTRY(mif,rgb15)					\
-    ((mif)->ent[rgb15])
-
-#define RGB24_TO_ENTRY(mif,rgb24)					\
-    RGB15_TO_ENTRY (mif,CONVERT_RGB24_TO_RGB15 (rgb24))
-
-#define RGB24_TO_ENTRY_Y(mif,rgb24)					\
-    ((mif)->ent[CONVERT_RGB24_TO_Y15 (rgb24)])
-
-/*
- * YV12 setup and access macros
- */
-
-#define YV12_SETUP(image)                                               \
-    bits_image_t *__bits_image = (bits_image_t *)image;                 \
-    uint32_t *bits = __bits_image->bits;                                \
-    int stride = __bits_image->rowstride;                               \
-    int offset0 = stride < 0 ?                                          \
-    ((-stride) >> 1) * ((__bits_image->height - 1) >> 1) - stride :	\
-    stride * __bits_image->height;					\
-    int offset1 = stride < 0 ?                                          \
-    offset0 + ((-stride) >> 1) * ((__bits_image->height) >> 1) :	\
-	offset0 + (offset0 >> 2)
-
-/* Note no trailing semicolon on the above macro; if it's there, then
- * the typical usage of YV12_SETUP(image); will have an extra trailing ;
- * that some compilers will interpret as a statement -- and then any further
- * variable declarations will cause an error.
- */
-
-#define YV12_Y(line)                                                    \
-    ((uint8_t *) ((bits) + (stride) * (line)))
-
-#define YV12_U(line)                                                    \
-    ((uint8_t *) ((bits) + offset1 +                                    \
-                  ((stride) >> 1) * ((line) >> 1)))
-
-#define YV12_V(line)                                                    \
-    ((uint8_t *) ((bits) + offset0 +                                    \
-                  ((stride) >> 1) * ((line) >> 1)))
-
-/********************************** Fetch ************************************/
-
-static void
-fetch_scanline_a8r8g8b8 (pixman_image_t *image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         uint32_t *      buffer,
-                         const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    
-    MEMCPY_WRAPPED (image,
-                    buffer, (const uint32_t *)bits + x,
-                    width * sizeof(uint32_t));
-}
-
-static void
-fetch_scanline_x8r8g8b8 (pixman_image_t *image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         uint32_t *      buffer,
-                         const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint32_t *pixel = (const uint32_t *)bits + x;
-    const uint32_t *end = pixel + width;
-    
-    while (pixel < end)
-	*buffer++ = READ (image, pixel++) | 0xff000000;
-}
-
-static void
-fetch_scanline_a8b8g8r8 (pixman_image_t *image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         uint32_t *      buffer,
-                         const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint32_t *pixel = (uint32_t *)bits + x;
-    const uint32_t *end = pixel + width;
-    
-    while (pixel < end)
-    {
-	uint32_t p = READ (image, pixel++);
-	
-	*buffer++ = (p & 0xff00ff00)	|
-	    ((p >> 16) & 0xff)		|
-	    ((p & 0xff) << 16);
-    }
-}
-
-static void
-fetch_scanline_x8b8g8r8 (pixman_image_t *image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         uint32_t *      buffer,
-                         const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint32_t *pixel = (uint32_t *)bits + x;
-    const uint32_t *end = pixel + width;
-    
-    while (pixel < end)
-    {
-	uint32_t p = READ (image, pixel++);
-	
-	*buffer++ = 0xff000000		|
-	    (p & 0x0000ff00)		|
-	    ((p >> 16) & 0xff)		|
-	    ((p & 0xff) << 16);
-    }
-}
-
-static void
-fetch_scanline_b8g8r8a8 (pixman_image_t *image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         uint32_t *      buffer,
-                         const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint32_t *pixel = (uint32_t *)bits + x;
-    const uint32_t *end = pixel + width;
-
-    while (pixel < end)
-    {
-	uint32_t p = READ (image, pixel++);
-
-	*buffer++ = (((p & 0xff000000) >> 24)	|
-	             ((p & 0x00ff0000) >> 8)	|
-	             ((p & 0x0000ff00) << 8)	|
-	             ((p & 0x000000ff) << 24));
-    }
-}
-
-static void
-fetch_scanline_b8g8r8x8 (pixman_image_t *image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         uint32_t *      buffer,
-                         const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint32_t *pixel = (uint32_t *)bits + x;
-    const uint32_t *end = pixel + width;
-    
-    while (pixel < end)
-    {
-	uint32_t p = READ (image, pixel++);
-	
-	*buffer++ = (0xff000000 |
-	             ((p & 0xff000000) >> 24)	|
-	             ((p & 0x00ff0000) >> 8)	|
-	             ((p & 0x0000ff00) << 8));
-    }
-}
-
-static void
-fetch_scanline_r8g8b8a8 (pixman_image_t *image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         uint32_t *      buffer,
-                         const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint32_t *pixel = (uint32_t *)bits + x;
-    const uint32_t *end = pixel + width;
-
-    while (pixel < end)
-    {
-	uint32_t p = READ (image, pixel++);
-
-	*buffer++ = (((p & 0x000000ff) << 24) | (p >> 8));
-    }
-}
-
-static void
-fetch_scanline_r8g8b8x8 (pixman_image_t *image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         uint32_t *      buffer,
-                         const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint32_t *pixel = (uint32_t *)bits + x;
-    const uint32_t *end = pixel + width;
-    
-    while (pixel < end)
-    {
-	uint32_t p = READ (image, pixel++);
-	
-	*buffer++ = (0xff000000 | (p >> 8));
-    }
-}
-
-static void
-fetch_scanline_x14r6g6b6 (pixman_image_t *image,
-                          int             x,
-                          int             y,
-                          int             width,
-                          uint32_t *      buffer,
-                          const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint32_t *pixel = (const uint32_t *)bits + x;
-    const uint32_t *end = pixel + width;
-
-    while (pixel < end)
-    {
-	uint32_t p = READ (image, pixel++);
-	uint32_t r, g, b;
-
-	r = ((p & 0x3f000) << 6) | ((p & 0x30000));
-	g = ((p & 0x00fc0) << 4) | ((p & 0x00c00) >> 2);
-	b = ((p & 0x0003f) << 2) | ((p & 0x00030) >> 4);
-
-	*buffer++ = 0xff000000 | r | g | b;
-    }
-}
-
-/* Expects a uint64_t buffer */
-static void
-fetch_scanline_a2r10g10b10 (pixman_image_t *image,
-                            int             x,
-                            int             y,
-                            int             width,
-                            uint32_t *      b,
-                            const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint32_t *pixel = bits + x;
-    const uint32_t *end = pixel + width;
-    uint64_t *buffer = (uint64_t *)b;
-
-    while (pixel < end)
-    {
-	uint32_t p = READ (image, pixel++);
-	uint64_t a = p >> 30;
-	uint64_t r = (p >> 20) & 0x3ff;
-	uint64_t g = (p >> 10) & 0x3ff;
-	uint64_t b = p & 0x3ff;
-
-	r = r << 6 | r >> 4;
-	g = g << 6 | g >> 4;
-	b = b << 6 | b >> 4;
-
-	a <<= 14;
-	a |= a >> 2;
-	a |= a >> 4;
-	a |= a >> 8;
-
-	*buffer++ = a << 48 | r << 32 | g << 16 | b;
-    }
-}
-
-/* Expects a uint64_t buffer */
-static void
-fetch_scanline_x2r10g10b10 (pixman_image_t *image,
-                            int             x,
-                            int             y,
-                            int             width,
-                            uint32_t *      b,
-                            const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint32_t *pixel = (uint32_t *)bits + x;
-    const uint32_t *end = pixel + width;
-    uint64_t *buffer = (uint64_t *)b;
-    
-    while (pixel < end)
-    {
-	uint32_t p = READ (image, pixel++);
-	uint64_t r = (p >> 20) & 0x3ff;
-	uint64_t g = (p >> 10) & 0x3ff;
-	uint64_t b = p & 0x3ff;
-	
-	r = r << 6 | r >> 4;
-	g = g << 6 | g >> 4;
-	b = b << 6 | b >> 4;
-	
-	*buffer++ = 0xffffULL << 48 | r << 32 | g << 16 | b;
-    }
-}
-
-/* Expects a uint64_t buffer */
-static void
-fetch_scanline_a2b10g10r10 (pixman_image_t *image,
-                            int             x,
-                            int             y,
-                            int             width,
-                            uint32_t *      b,
-                            const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint32_t *pixel = bits + x;
-    const uint32_t *end = pixel + width;
-    uint64_t *buffer = (uint64_t *)b;
-    
-    while (pixel < end)
-    {
-	uint32_t p = READ (image, pixel++);
-	uint64_t a = p >> 30;
-	uint64_t b = (p >> 20) & 0x3ff;
-	uint64_t g = (p >> 10) & 0x3ff;
-	uint64_t r = p & 0x3ff;
-	
-	r = r << 6 | r >> 4;
-	g = g << 6 | g >> 4;
-	b = b << 6 | b >> 4;
-	
-	a <<= 14;
-	a |= a >> 2;
-	a |= a >> 4;
-	a |= a >> 8;
-
-	*buffer++ = a << 48 | r << 32 | g << 16 | b;
-    }
-}
-
-/* Expects a uint64_t buffer */
-static void
-fetch_scanline_x2b10g10r10 (pixman_image_t *image,
-                            int             x,
-                            int             y,
-                            int             width,
-                            uint32_t *      b,
-                            const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint32_t *pixel = (uint32_t *)bits + x;
-    const uint32_t *end = pixel + width;
-    uint64_t *buffer = (uint64_t *)b;
-    
-    while (pixel < end)
-    {
-	uint32_t p = READ (image, pixel++);
-	uint64_t b = (p >> 20) & 0x3ff;
-	uint64_t g = (p >> 10) & 0x3ff;
-	uint64_t r = p & 0x3ff;
-	
-	r = r << 6 | r >> 4;
-	g = g << 6 | g >> 4;
-	b = b << 6 | b >> 4;
-	
-	*buffer++ = 0xffffULL << 48 | r << 32 | g << 16 | b;
-    }
-}
-
-static void
-fetch_scanline_r8g8b8 (pixman_image_t *image,
-                       int             x,
-                       int             y,
-                       int             width,
-                       uint32_t *      buffer,
-                       const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint8_t *pixel = (const uint8_t *)bits + 3 * x;
-    const uint8_t *end = pixel + 3 * width;
-    
-    while (pixel < end)
-    {
-	uint32_t b = 0xff000000;
-	
-#ifdef WORDS_BIGENDIAN
-	b |= (READ (image, pixel++) << 16);
-	b |= (READ (image, pixel++) << 8);
-	b |= (READ (image, pixel++));
-#else
-	b |= (READ (image, pixel++));
-	b |= (READ (image, pixel++) << 8);
-	b |= (READ (image, pixel++) << 16);
-#endif
-	
-	*buffer++ = b;
-    }
-}
-
-static void
-fetch_scanline_b8g8r8 (pixman_image_t *image,
-                       int             x,
-                       int             y,
-                       int             width,
-                       uint32_t *      buffer,
-                       const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint8_t *pixel = (const uint8_t *)bits + 3 * x;
-    const uint8_t *end = pixel + 3 * width;
-    
-    while (pixel < end)
-    {
-	uint32_t b = 0xff000000;
-#ifdef WORDS_BIGENDIAN
-	b |= (READ (image, pixel++));
-	b |= (READ (image, pixel++) << 8);
-	b |= (READ (image, pixel++) << 16);
-#else
-	b |= (READ (image, pixel++) << 16);
-	b |= (READ (image, pixel++) << 8);
-	b |= (READ (image, pixel++));
-#endif
-	*buffer++ = b;
-    }
-}
-
-static void
-fetch_scanline_r5g6b5 (pixman_image_t *image,
-                       int             x,
-                       int             y,
-                       int             width,
-                       uint32_t *      buffer,
-                       const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint16_t *pixel = (const uint16_t *)bits + x;
-    const uint16_t *end = pixel + width;
-    
-    while (pixel < end)
-    {
-	uint32_t p = READ (image, pixel++);
-	uint32_t r = (((p) << 3) & 0xf8) |
-	    (((p) << 5) & 0xfc00) |
-	    (((p) << 8) & 0xf80000);
-	
-	r |= (r >> 5) & 0x70007;
-	r |= (r >> 6) & 0x300;
-	
-	*buffer++ = 0xff000000 | r;
-    }
-}
-
-static void
-fetch_scanline_b5g6r5 (pixman_image_t *image,
-                       int             x,
-                       int             y,
-                       int             width,
-                       uint32_t *      buffer,
-                       const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint16_t *pixel = (const uint16_t *)bits + x;
-    const uint16_t *end = pixel + width;
-    
-    while (pixel < end)
-    {
-	uint32_t p = READ (image, pixel++);
-	uint32_t r, g, b;
-	
-	b = ((p & 0xf800) | ((p & 0xe000) >> 5)) >> 8;
-	g = ((p & 0x07e0) | ((p & 0x0600) >> 6)) << 5;
-	r = ((p & 0x001c) | ((p & 0x001f) << 5)) << 14;
-	
-	*buffer++ = 0xff000000 | r | g | b;
-    }
-}
-
-static void
-fetch_scanline_a1r5g5b5 (pixman_image_t *image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         uint32_t *      buffer,
-                         const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint16_t *pixel = (const uint16_t *)bits + x;
-    const uint16_t *end = pixel + width;
-    
-    while (pixel < end)
-    {
-	uint32_t p = READ (image, pixel++);
-	uint32_t r, g, b, a;
-	
-	a = (uint32_t) ((uint8_t) (0 - ((p & 0x8000) >> 15))) << 24;
-	r = ((p & 0x7c00) | ((p & 0x7000) >> 5)) << 9;
-	g = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6;
-	b = ((p & 0x001c) | ((p & 0x001f) << 5)) >> 2;
-	
-	*buffer++ = a | r | g | b;
-    }
-}
-
-static void
-fetch_scanline_x1r5g5b5 (pixman_image_t *image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         uint32_t *      buffer,
-                         const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint16_t *pixel = (const uint16_t *)bits + x;
-    const uint16_t *end = pixel + width;
-    
-    while (pixel < end)
-    {
-	uint32_t p = READ (image, pixel++);
-	uint32_t r, g, b;
-	
-	r = ((p & 0x7c00) | ((p & 0x7000) >> 5)) << 9;
-	g = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6;
-	b = ((p & 0x001c) | ((p & 0x001f) << 5)) >> 2;
-	
-	*buffer++ = 0xff000000 | r | g | b;
-    }
-}
-
-static void
-fetch_scanline_a1b5g5r5 (pixman_image_t *image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         uint32_t *      buffer,
-                         const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint16_t *pixel = (const uint16_t *)bits + x;
-    const uint16_t *end = pixel + width;
-    uint32_t r, g, b, a;
-    
-    while (pixel < end)
-    {
-	uint32_t p = READ (image, pixel++);
-	
-	a = (uint32_t) ((uint8_t) (0 - ((p & 0x8000) >> 15))) << 24;
-	b = ((p & 0x7c00) | ((p & 0x7000) >> 5)) >> 7;
-	g = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6;
-	r = ((p & 0x001c) | ((p & 0x001f) << 5)) << 14;
-	
-	*buffer++ = a | r | g | b;
-    }
-}
-
-static void
-fetch_scanline_x1b5g5r5 (pixman_image_t *image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         uint32_t *      buffer,
-                         const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint16_t *pixel = (const uint16_t *)bits + x;
-    const uint16_t *end = pixel + width;
-    
-    while (pixel < end)
-    {
-	uint32_t p = READ (image, pixel++);
-	uint32_t r, g, b;
-	
-	b = ((p & 0x7c00) | ((p & 0x7000) >> 5)) >> 7;
-	g = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6;
-	r = ((p & 0x001c) | ((p & 0x001f) << 5)) << 14;
-	
-	*buffer++ = 0xff000000 | r | g | b;
-    }
-}
-
-static void
-fetch_scanline_a4r4g4b4 (pixman_image_t *image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         uint32_t *      buffer,
-                         const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint16_t *pixel = (const uint16_t *)bits + x;
-    const uint16_t *end = pixel + width;
-    
-    while (pixel < end)
-    {
-	uint32_t p = READ (image, pixel++);
-	uint32_t r, g, b, a;
-	
-	a = ((p & 0xf000) | ((p & 0xf000) >> 4)) << 16;
-	r = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) << 12;
-	g = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8;
-	b = ((p & 0x000f) | ((p & 0x000f) << 4));
-	
-	*buffer++ = a | r | g | b;
-    }
-}
-
-static void
-fetch_scanline_x4r4g4b4 (pixman_image_t *image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         uint32_t *      buffer,
-                         const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint16_t *pixel = (const uint16_t *)bits + x;
-    const uint16_t *end = pixel + width;
-    
-    while (pixel < end)
-    {
-	uint32_t p = READ (image, pixel++);
-	uint32_t r, g, b;
-	
-	r = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) << 12;
-	g = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8;
-	b = ((p & 0x000f) | ((p & 0x000f) << 4));
-	
-	*buffer++ = 0xff000000 | r | g | b;
-    }
-}
-
-static void
-fetch_scanline_a4b4g4r4 (pixman_image_t *image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         uint32_t *      buffer,
-                         const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint16_t *pixel = (const uint16_t *)bits + x;
-    const uint16_t *end = pixel + width;
-    
-    while (pixel < end)
-    {
-	uint32_t p = READ (image, pixel++);
-	uint32_t r, g, b, a;
-	
-	a = ((p & 0xf000) | ((p & 0xf000) >> 4)) << 16;
-	b = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) >> 4;
-	g = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8;
-	r = ((p & 0x000f) | ((p & 0x000f) << 4)) << 16;
-	
-	*buffer++ = a | r | g | b;
-    }
-}
-
-static void
-fetch_scanline_x4b4g4r4 (pixman_image_t *image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         uint32_t *      buffer,
-                         const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint16_t *pixel = (const uint16_t *)bits + x;
-    const uint16_t *end = pixel + width;
-    
-    while (pixel < end)
-    {
-	uint32_t p = READ (image, pixel++);
-	uint32_t r, g, b;
-	
-	b = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) >> 4;
-	g = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8;
-	r = ((p & 0x000f) | ((p & 0x000f) << 4)) << 16;
-	
-	*buffer++ = 0xff000000 | r | g | b;
-    }
-}
-
-static void
-fetch_scanline_a8 (pixman_image_t *image,
-                   int             x,
-                   int             y,
-                   int             width,
-                   uint32_t *      buffer,
-                   const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint8_t *pixel = (const uint8_t *)bits + x;
-    const uint8_t *end = pixel + width;
-    
-    while (pixel < end)
-	*buffer++ = READ (image, pixel++) << 24;
-}
-
-static void
-fetch_scanline_r3g3b2 (pixman_image_t *image,
-                       int             x,
-                       int             y,
-                       int             width,
-                       uint32_t *      buffer,
-                       const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint8_t *pixel = (const uint8_t *)bits + x;
-    const uint8_t *end = pixel + width;
-    
-    while (pixel < end)
-    {
-	uint32_t p = READ (image, pixel++);
-	uint32_t r, g, b;
-	
-	r = ((p & 0xe0) | ((p & 0xe0) >> 3) | ((p & 0xc0) >> 6)) << 16;
-	g = ((p & 0x1c) | ((p & 0x18) >> 3) | ((p & 0x1c) << 3)) << 8;
-	b = (((p & 0x03)     ) |
-	     ((p & 0x03) << 2) |
-	     ((p & 0x03) << 4) |
-	     ((p & 0x03) << 6));
-	
-	*buffer++ = 0xff000000 | r | g | b;
-    }
-}
-
-static void
-fetch_scanline_b2g3r3 (pixman_image_t *image,
-                       int             x,
-                       int             y,
-                       int             width,
-                       uint32_t *      buffer,
-                       const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint8_t *pixel = (const uint8_t *)bits + x;
-    const uint8_t *end = pixel + width;
-
-    while (pixel < end)
-    {
-	uint32_t p = READ (image, pixel++);
-	uint32_t r, g, b;
-
-	b  = p & 0xc0;
-	b |= b >> 2;
-	b |= b >> 4;
-	b &= 0xff;
-
-	g  = (p & 0x38) << 10;
-	g |= g >> 3;
-	g |= g >> 6;
-	g &= 0xff00;
-
-	r  = (p & 0x7) << 21;
-	r |= r >> 3;
-	r |= r >> 6;
-	r &= 0xff0000;
-
-	*buffer++ = 0xff000000 | r | g | b;
-    }
-}
-
-static void
-fetch_scanline_a2r2g2b2 (pixman_image_t *image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         uint32_t *      buffer,
-                         const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint8_t *pixel = (const uint8_t *)bits + x;
-    const uint8_t *end = pixel + width;
-    
-    while (pixel < end)
-    {
-	uint32_t p = READ (image, pixel++);
-	uint32_t a, r, g, b;
-	
-	a = ((p & 0xc0) * 0x55) << 18;
-	r = ((p & 0x30) * 0x55) << 12;
-	g = ((p & 0x0c) * 0x55) << 6;
-	b = ((p & 0x03) * 0x55);
-	
-	*buffer++ = a | r | g | b;
-    }
-}
-
-static void
-fetch_scanline_a2b2g2r2 (pixman_image_t *image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         uint32_t *      buffer,
-                         const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint8_t *pixel = (const uint8_t *)bits + x;
-    const uint8_t *end = pixel + width;
-    
-    while (pixel < end)
-    {
-	uint32_t p = READ (image, pixel++);
-	uint32_t a, r, g, b;
-	
-	a = ((p & 0xc0) * 0x55) << 18;
-	b = ((p & 0x30) * 0x55) >> 4;
-	g = ((p & 0x0c) * 0x55) << 6;
-	r = ((p & 0x03) * 0x55) << 16;
-	
-	*buffer++ = a | r | g | b;
-    }
-}
-
-static void
-fetch_scanline_c8 (pixman_image_t *image,
-                   int             x,
-                   int             y,
-                   int             width,
-                   uint32_t *      buffer,
-                   const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const pixman_indexed_t * indexed = image->bits.indexed;
-    const uint8_t *pixel = (const uint8_t *)bits + x;
-    const uint8_t *end = pixel + width;
-    
-    while (pixel < end)
-    {
-	uint32_t p = READ (image, pixel++);
-	
-	*buffer++ = indexed->rgba[p];
-    }
-}
-
-static void
-fetch_scanline_x4a4 (pixman_image_t *image,
-                     int             x,
-                     int             y,
-                     int             width,
-                     uint32_t *      buffer,
-                     const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const uint8_t *pixel = (const uint8_t *)bits + x;
-    const uint8_t *end = pixel + width;
-   
-    while (pixel < end)
-    {
-	uint8_t p = READ (image, pixel++) & 0xf;
-
-	*buffer++ = (p | (p << 4)) << 24;
-    }
-}
-
-#define FETCH_8(img,l,o)    (READ (img, (((uint8_t *)(l)) + ((o) >> 3))))
-#ifdef WORDS_BIGENDIAN
-#define FETCH_4(img,l,o)						\
-    (((4 * (o)) & 4) ? (FETCH_8 (img,l, 4 * (o)) & 0xf) : (FETCH_8 (img,l,(4 * (o))) >> 4))
-#else
-#define FETCH_4(img,l,o)						\
-    (((4 * (o)) & 4) ? (FETCH_8 (img, l, 4 * (o)) >> 4) : (FETCH_8 (img, l, (4 * (o))) & 0xf))
-#endif
-
-static void
-fetch_scanline_a4 (pixman_image_t *image,
-                   int             x,
-                   int             y,
-                   int             width,
-                   uint32_t *      buffer,
-                   const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	uint32_t p = FETCH_4 (image, bits, i + x);
-
-	p |= p << 4;
-
-	*buffer++ = p << 24;
-    }
-}
-
-static void
-fetch_scanline_r1g2b1 (pixman_image_t *image,
-                       int             x,
-                       int             y,
-                       int             width,
-                       uint32_t *      buffer,
-                       const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	uint32_t p = FETCH_4 (image, bits, i + x);
-	uint32_t r, g, b;
-	
-	r = ((p & 0x8) * 0xff) << 13;
-	g = ((p & 0x6) * 0x55) << 7;
-	b = ((p & 0x1) * 0xff);
-	
-	*buffer++ = 0xff000000 | r | g | b;
-    }
-}
-
-static void
-fetch_scanline_b1g2r1 (pixman_image_t *image,
-                       int             x,
-                       int             y,
-                       int             width,
-                       uint32_t *      buffer,
-                       const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	uint32_t p = FETCH_4 (image, bits, i + x);
-	uint32_t r, g, b;
-	
-	b = ((p & 0x8) * 0xff) >> 3;
-	g = ((p & 0x6) * 0x55) << 7;
-	r = ((p & 0x1) * 0xff) << 16;
-
-	*buffer++ = 0xff000000 | r | g | b;
-    }
-}
-
-static void
-fetch_scanline_a1r1g1b1 (pixman_image_t *image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         uint32_t *      buffer,
-                         const uint32_t *mask)
-{
-    uint32_t a, r, g, b;
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	uint32_t p = FETCH_4 (image, bits, i + x);
-
-	a = ((p & 0x8) * 0xff) << 21;
-	r = ((p & 0x4) * 0xff) << 14;
-	g = ((p & 0x2) * 0xff) << 7;
-	b = ((p & 0x1) * 0xff);
-
-	*buffer++ = a | r | g | b;
-    }
-}
-
-static void
-fetch_scanline_a1b1g1r1 (pixman_image_t *image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         uint32_t *      buffer,
-                         const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	uint32_t p = FETCH_4 (image, bits, i + x);
-	uint32_t a, r, g, b;
-
-	a = ((p & 0x8) * 0xff) << 21;
-	b = ((p & 0x4) * 0xff) >> 2;
-	g = ((p & 0x2) * 0xff) << 7;
-	r = ((p & 0x1) * 0xff) << 16;
-
-	*buffer++ = a | r | g | b;
-    }
-}
-
-static void
-fetch_scanline_c4 (pixman_image_t *image,
-                   int             x,
-                   int             y,
-                   int             width,
-                   uint32_t *      buffer,
-                   const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const pixman_indexed_t * indexed = image->bits.indexed;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	uint32_t p = FETCH_4 (image, bits, i + x);
-	
-	*buffer++ = indexed->rgba[p];
-    }
-}
-
-static void
-fetch_scanline_a1 (pixman_image_t *image,
-                   int             x,
-                   int             y,
-                   int             width,
-                   uint32_t *      buffer,
-                   const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	uint32_t p = READ (image, bits + ((i + x) >> 5));
-	uint32_t a;
-	
-#ifdef WORDS_BIGENDIAN
-	a = p >> (0x1f - ((i + x) & 0x1f));
-#else
-	a = p >> ((i + x) & 0x1f);
-#endif
-	a = a & 1;
-	a |= a << 1;
-	a |= a << 2;
-	a |= a << 4;
-	
-	*buffer++ = a << 24;
-    }
-}
-
-static void
-fetch_scanline_g1 (pixman_image_t *image,
-                   int             x,
-                   int             y,
-                   int             width,
-                   uint32_t *      buffer,
-                   const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
-    const pixman_indexed_t * indexed = image->bits.indexed;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	uint32_t p = READ (image, bits + ((i + x) >> 5));
-	uint32_t a;
-	
-#ifdef WORDS_BIGENDIAN
-	a = p >> (0x1f - ((i + x) & 0x1f));
-#else
-	a = p >> ((i + x) & 0x1f);
-#endif
-	a = a & 1;
-	
-	*buffer++ = indexed->rgba[a];
-    }
-}
-
-static void
-fetch_scanline_yuy2 (pixman_image_t *image,
-                     int             x,
-                     int             line,
-                     int             width,
-                     uint32_t *      buffer,
-                     const uint32_t *mask)
-{
-    const uint32_t *bits = image->bits.bits + image->bits.rowstride * line;
-    int i;
-    
-    for (i = 0; i < width; i++)
-    {
-	int16_t y, u, v;
-	int32_t r, g, b;
-	
-	y = ((uint8_t *) bits)[(x + i) << 1] - 16;
-	u = ((uint8_t *) bits)[(((x + i) << 1) & - 4) + 1] - 128;
-	v = ((uint8_t *) bits)[(((x + i) << 1) & - 4) + 3] - 128;
-	
-	/* R = 1.164(Y - 16) + 1.596(V - 128) */
-	r = 0x012b27 * y + 0x019a2e * v;
-	/* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */
-	g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u;
-	/* B = 1.164(Y - 16) + 2.018(U - 128) */
-	b = 0x012b27 * y + 0x0206a2 * u;
-	
-	*buffer++ = 0xff000000 |
-	    (r >= 0 ? r < 0x1000000 ? r         & 0xff0000 : 0xff0000 : 0) |
-	    (g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) |
-	    (b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0);
-    }
-}
-
-static void
-fetch_scanline_yv12 (pixman_image_t *image,
-                     int             x,
-                     int             line,
-                     int             width,
-                     uint32_t *      buffer,
-                     const uint32_t *mask)
-{
-    YV12_SETUP (image);
-    uint8_t *y_line = YV12_Y (line);
-    uint8_t *u_line = YV12_U (line);
-    uint8_t *v_line = YV12_V (line);
-    int i;
-    
-    for (i = 0; i < width; i++)
-    {
-	int16_t y, u, v;
-	int32_t r, g, b;
-
-	y = y_line[x + i] - 16;
-	u = u_line[(x + i) >> 1] - 128;
-	v = v_line[(x + i) >> 1] - 128;
-
-	/* R = 1.164(Y - 16) + 1.596(V - 128) */
-	r = 0x012b27 * y + 0x019a2e * v;
-	/* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */
-	g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u;
-	/* B = 1.164(Y - 16) + 2.018(U - 128) */
-	b = 0x012b27 * y + 0x0206a2 * u;
-
-	*buffer++ = 0xff000000 |
-	    (r >= 0 ? r < 0x1000000 ? r         & 0xff0000 : 0xff0000 : 0) |
-	    (g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) |
-	    (b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0);
-    }
-}
-
-/**************************** Pixel wise fetching *****************************/
-
-/* Despite the type, expects a uint64_t buffer */
-static uint64_t
-fetch_pixel_a2r10g10b10 (bits_image_t *image,
-			 int		  offset,
-			 int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t p = READ (image, bits + offset);
-    uint64_t a = p >> 30;
-    uint64_t r = (p >> 20) & 0x3ff;
-    uint64_t g = (p >> 10) & 0x3ff;
-    uint64_t b = p & 0x3ff;
-
-    r = r << 6 | r >> 4;
-    g = g << 6 | g >> 4;
-    b = b << 6 | b >> 4;
-
-    a <<= 14;
-    a |= a >> 2;
-    a |= a >> 4;
-    a |= a >> 8;
-
-    return a << 48 | r << 32 | g << 16 | b;
-}
-
-/* Despite the type, this function expects a uint64_t buffer */
-static uint64_t
-fetch_pixel_x2r10g10b10 (bits_image_t *image,
-			 int	   offset,
-			 int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t p = READ (image, bits + offset);
-    uint64_t r = (p >> 20) & 0x3ff;
-    uint64_t g = (p >> 10) & 0x3ff;
-    uint64_t b = p & 0x3ff;
-    
-    r = r << 6 | r >> 4;
-    g = g << 6 | g >> 4;
-    b = b << 6 | b >> 4;
-    
-    return 0xffffULL << 48 | r << 32 | g << 16 | b;
-}
-
-/* Despite the type, expects a uint64_t buffer */
-static uint64_t
-fetch_pixel_a2b10g10r10 (bits_image_t *image,
-			 int           offset,
-			 int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t p = READ (image, bits + offset);
-    uint64_t a = p >> 30;
-    uint64_t b = (p >> 20) & 0x3ff;
-    uint64_t g = (p >> 10) & 0x3ff;
-    uint64_t r = p & 0x3ff;
-    
-    r = r << 6 | r >> 4;
-    g = g << 6 | g >> 4;
-    b = b << 6 | b >> 4;
-    
-    a <<= 14;
-    a |= a >> 2;
-    a |= a >> 4;
-    a |= a >> 8;
-    
-    return a << 48 | r << 32 | g << 16 | b;
-}
-
-/* Despite the type, this function expects a uint64_t buffer */
-static uint64_t
-fetch_pixel_x2b10g10r10 (bits_image_t *image,
-			 int           offset,
-			 int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t p = READ (image, bits + offset);
-    uint64_t b = (p >> 20) & 0x3ff;
-    uint64_t g = (p >> 10) & 0x3ff;
-    uint64_t r = p & 0x3ff;
-    
-    r = r << 6 | r >> 4;
-    g = g << 6 | g >> 4;
-    b = b << 6 | b >> 4;
-    
-    return 0xffffULL << 48 | r << 32 | g << 16 | b;
-}
-
-static uint32_t
-fetch_pixel_a8r8g8b8 (bits_image_t *image,
-		      int           offset,
-		      int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    return READ (image, (uint32_t *)bits + offset);
-}
-
-static uint32_t
-fetch_pixel_x8r8g8b8 (bits_image_t *image,
-		      int           offset,
-		      int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-
-    return READ (image, (uint32_t *)bits + offset) | 0xff000000;
-}
-
-static uint32_t
-fetch_pixel_a8b8g8r8 (bits_image_t *image,
-		      int           offset,
-		      int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = READ (image, (uint32_t *)bits + offset);
-    
-    return ((pixel & 0xff000000) |
-	    ((pixel >> 16) & 0xff) |
-	    (pixel & 0x0000ff00) |
-	    ((pixel & 0xff) << 16));
-}
-
-static uint32_t
-fetch_pixel_x8b8g8r8 (bits_image_t *image,
-		      int           offset,
-		      int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = READ (image, (uint32_t *)bits + offset);
-    
-    return ((0xff000000) |
-	    ((pixel >> 16) & 0xff) |
-	    (pixel & 0x0000ff00) |
-	    ((pixel & 0xff) << 16));
-}
-
-static uint32_t
-fetch_pixel_b8g8r8a8 (bits_image_t *image,
-		      int           offset,
-		      int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = READ (image, (uint32_t *)bits + offset);
-    
-    return ((pixel & 0xff000000) >> 24 |
-	    (pixel & 0x00ff0000) >> 8 |
-	    (pixel & 0x0000ff00) << 8 |
-	    (pixel & 0x000000ff) << 24);
-}
-
-static uint32_t
-fetch_pixel_b8g8r8x8 (bits_image_t *image,
-		      int           offset,
-		      int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = READ (image, (uint32_t *)bits + offset);
-    
-    return ((0xff000000) |
-	    (pixel & 0xff000000) >> 24 |
-	    (pixel & 0x00ff0000) >> 8 |
-	    (pixel & 0x0000ff00) << 8);
-}
-
-static uint32_t
-fetch_pixel_r8g8b8a8 (bits_image_t *image,
-		      int           offset,
-		      int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = READ (image, (uint32_t *)bits + offset);
-    
-    return (((pixel & 0x000000ff) << 24) | (pixel >> 8));
-}
-
-static uint32_t
-fetch_pixel_r8g8b8x8 (bits_image_t *image,
-		      int           offset,
-		      int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = READ (image, (uint32_t *)bits + offset);
-    
-    return (0xff000000 | (pixel >> 8));
-}
-
-static uint32_t
-fetch_pixel_x14r6g6b6 (bits_image_t *image,
-                       int           offset,
-                       int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = READ (image, (uint32_t *) bits + offset);
-    uint32_t r, g, b;
-
-    r = ((pixel & 0x3f000) << 6) | ((pixel & 0x30000));
-    g = ((pixel & 0x00fc0) << 4) | ((pixel & 0x00c00) >> 2);
-    b = ((pixel & 0x0003f) << 2) | ((pixel & 0x00030) >> 4);
-
-    return 0xff000000 | r | g | b;
-}
-
-static uint32_t
-fetch_pixel_r8g8b8 (bits_image_t *image,
-		    int           offset,
-		    int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint8_t   *pixel = ((uint8_t *) bits) + (offset * 3);
-    
-#ifdef WORDS_BIGENDIAN
-    return (0xff000000 |
-	    (READ (image, pixel + 0) << 16) |
-	    (READ (image, pixel + 1) << 8) |
-	    (READ (image, pixel + 2)));
-#else
-    return (0xff000000 |
-	    (READ (image, pixel + 2) << 16) |
-	    (READ (image, pixel + 1) << 8) |
-	    (READ (image, pixel + 0)));
-#endif
-}
-
-static uint32_t
-fetch_pixel_b8g8r8 (bits_image_t *image,
-		    int           offset,
-		    int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint8_t   *pixel = ((uint8_t *) bits) + (offset * 3);
-#ifdef WORDS_BIGENDIAN
-    return (0xff000000 |
-	    (READ (image, pixel + 2) << 16) |
-	    (READ (image, pixel + 1) << 8) |
-	    (READ (image, pixel + 0)));
-#else
-    return (0xff000000 |
-	    (READ (image, pixel + 0) << 16) |
-	    (READ (image, pixel + 1) << 8) |
-	    (READ (image, pixel + 2)));
-#endif
-}
-
-static uint32_t
-fetch_pixel_r5g6b5 (bits_image_t *image,
-		    int           offset,
-		    int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = READ (image, (uint16_t *) bits + offset);
-    uint32_t r, g, b;
-    
-    r = ((pixel & 0xf800) | ((pixel & 0xe000) >> 5)) << 8;
-    g = ((pixel & 0x07e0) | ((pixel & 0x0600) >> 6)) << 5;
-    b = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) >> 2;
-    
-    return (0xff000000 | r | g | b);
-}
-
-static uint32_t
-fetch_pixel_b5g6r5 (bits_image_t *image,
-		    int           offset,
-		    int           line)
-{
-    uint32_t r, g, b;
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = READ (image, (uint16_t *) bits + offset);
-    
-    b = ((pixel & 0xf800) | ((pixel & 0xe000) >> 5)) >> 8;
-    g = ((pixel & 0x07e0) | ((pixel & 0x0600) >> 6)) << 5;
-    r = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) << 14;
-    
-    return (0xff000000 | r | g | b);
-}
-
-static uint32_t
-fetch_pixel_a1r5g5b5 (bits_image_t *image,
-		      int           offset,
-		      int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = READ (image, (uint16_t *) bits + offset);
-    uint32_t a, r, g, b;
-    
-    a = (uint32_t) ((uint8_t) (0 - ((pixel & 0x8000) >> 15))) << 24;
-    r = ((pixel & 0x7c00) | ((pixel & 0x7000) >> 5)) << 9;
-    g = ((pixel & 0x03e0) | ((pixel & 0x0380) >> 5)) << 6;
-    b = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) >> 2;
-    
-    return (a | r | g | b);
-}
-
-static uint32_t
-fetch_pixel_x1r5g5b5 (bits_image_t *image,
-		      int           offset,
-		      int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = READ (image, (uint16_t *) bits + offset);
-    uint32_t r, g, b;
-    
-    r = ((pixel & 0x7c00) | ((pixel & 0x7000) >> 5)) << 9;
-    g = ((pixel & 0x03e0) | ((pixel & 0x0380) >> 5)) << 6;
-    b = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) >> 2;
-    
-    return (0xff000000 | r | g | b);
-}
-
-static uint32_t
-fetch_pixel_a1b5g5r5 (bits_image_t *image,
-		      int           offset,
-		      int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = READ (image, (uint16_t *) bits + offset);
-    uint32_t a, r, g, b;
-    
-    a = (uint32_t) ((uint8_t) (0 - ((pixel & 0x8000) >> 15))) << 24;
-    b = ((pixel & 0x7c00) | ((pixel & 0x7000) >> 5)) >> 7;
-    g = ((pixel & 0x03e0) | ((pixel & 0x0380) >> 5)) << 6;
-    r = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) << 14;
-    
-    return (a | r | g | b);
-}
-
-static uint32_t
-fetch_pixel_x1b5g5r5 (bits_image_t *image,
-		      int           offset,
-		      int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = READ (image, (uint16_t *) bits + offset);
-    uint32_t r, g, b;
-    
-    b = ((pixel & 0x7c00) | ((pixel & 0x7000) >> 5)) >> 7;
-    g = ((pixel & 0x03e0) | ((pixel & 0x0380) >> 5)) << 6;
-    r = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) << 14;
-    
-    return (0xff000000 | r | g | b);
-}
-
-static uint32_t
-fetch_pixel_a4r4g4b4 (bits_image_t *image,
-		      int           offset,
-		      int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = READ (image, (uint16_t *) bits + offset);
-    uint32_t a, r, g, b;
-    
-    a = ((pixel & 0xf000) | ((pixel & 0xf000) >> 4)) << 16;
-    r = ((pixel & 0x0f00) | ((pixel & 0x0f00) >> 4)) << 12;
-    g = ((pixel & 0x00f0) | ((pixel & 0x00f0) >> 4)) << 8;
-    b = ((pixel & 0x000f) | ((pixel & 0x000f) << 4));
-    
-    return (a | r | g | b);
-}
-
-static uint32_t
-fetch_pixel_x4r4g4b4 (bits_image_t *image,
-		      int           offset,
-		      int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = READ (image, (uint16_t *) bits + offset);
-    uint32_t r, g, b;
-    
-    r = ((pixel & 0x0f00) | ((pixel & 0x0f00) >> 4)) << 12;
-    g = ((pixel & 0x00f0) | ((pixel & 0x00f0) >> 4)) << 8;
-    b = ((pixel & 0x000f) | ((pixel & 0x000f) << 4));
-    
-    return (0xff000000 | r | g | b);
-}
-
-static uint32_t
-fetch_pixel_a4b4g4r4 (bits_image_t *image,
-		      int           offset,
-		      int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = READ (image, (uint16_t *) bits + offset);
-    uint32_t a, r, g, b;
-    
-    a = ((pixel & 0xf000) | ((pixel & 0xf000) >> 4)) << 16;
-    b = ((pixel & 0x0f00) | ((pixel & 0x0f00) >> 4)) >> 4;
-    g = ((pixel & 0x00f0) | ((pixel & 0x00f0) >> 4)) << 8;
-    r = ((pixel & 0x000f) | ((pixel & 0x000f) << 4)) << 16;
-    
-    return (a | r | g | b);
-}
-
-static uint32_t
-fetch_pixel_x4b4g4r4 (bits_image_t *image,
-		      int           offset,
-		      int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = READ (image, (uint16_t *) bits + offset);
-    uint32_t r, g, b;
-    
-    b = ((pixel & 0x0f00) | ((pixel & 0x0f00) >> 4)) >> 4;
-    g = ((pixel & 0x00f0) | ((pixel & 0x00f0) >> 4)) << 8;
-    r = ((pixel & 0x000f) | ((pixel & 0x000f) << 4)) << 16;
-    
-    return (0xff000000 | r | g | b);
-}
-
-static uint32_t
-fetch_pixel_a8 (bits_image_t *image,
-		int           offset,
-		int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = READ (image, (uint8_t *) bits + offset);
-    
-    return pixel << 24;
-}
-
-static uint32_t
-fetch_pixel_r3g3b2 (bits_image_t *image,
-		    int           offset,
-		    int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = READ (image, (uint8_t *) bits + offset);
-    uint32_t r, g, b;
-    
-    r = ((pixel & 0xe0) |
-	 ((pixel & 0xe0) >> 3) |
-	 ((pixel & 0xc0) >> 6)) << 16;
-    
-    g = ((pixel & 0x1c) |
-	 ((pixel & 0x18) >> 3) |
-	 ((pixel & 0x1c) << 3)) << 8;
-    
-    b = (((pixel & 0x03)     ) |
-	 ((pixel & 0x03) << 2) |
-	 ((pixel & 0x03) << 4) |
-	 ((pixel & 0x03) << 6));
-    
-    return (0xff000000 | r | g | b);
-}
-
-static uint32_t
-fetch_pixel_b2g3r3 (bits_image_t *image,
-		    int           offset,
-		    int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t p = READ (image, (uint8_t *) bits + offset);
-    uint32_t r, g, b;
-
-    b  = p & 0xc0;
-    b |= b >> 2;
-    b |= b >> 4;
-    b &= 0xff;
-
-    g  = (p & 0x38) << 10;
-    g |= g >> 3;
-    g |= g >> 6;
-    g &= 0xff00;
-
-    r  = (p & 0x7) << 21;
-    r |= r >> 3;
-    r |= r >> 6;
-    r &= 0xff0000;
-
-    return 0xff000000 | r | g | b;
-}
-
-static uint32_t
-fetch_pixel_a2r2g2b2 (bits_image_t *image,
-		      int           offset,
-		      int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = READ (image, (uint8_t *) bits + offset);
-    uint32_t a, r, g, b;
-    
-    a = ((pixel & 0xc0) * 0x55) << 18;
-    r = ((pixel & 0x30) * 0x55) << 12;
-    g = ((pixel & 0x0c) * 0x55) << 6;
-    b = ((pixel & 0x03) * 0x55);
-    
-    return a | r | g | b;
-}
-
-static uint32_t
-fetch_pixel_a2b2g2r2 (bits_image_t *image,
-		      int           offset,
-		      int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = READ (image, (uint8_t *) bits + offset);
-    uint32_t a, r, g, b;
-    
-    a = ((pixel & 0xc0) * 0x55) << 18;
-    b = ((pixel & 0x30) * 0x55) >> 4;
-    g = ((pixel & 0x0c) * 0x55) << 6;
-    r = ((pixel & 0x03) * 0x55) << 16;
-    
-    return a | r | g | b;
-}
-
-static uint32_t
-fetch_pixel_c8 (bits_image_t *image,
-		int           offset,
-		int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = READ (image, (uint8_t *) bits + offset);
-    const pixman_indexed_t * indexed = image->indexed;
-    
-    return indexed->rgba[pixel];
-}
-
-static uint32_t
-fetch_pixel_x4a4 (bits_image_t *image,
-		  int           offset,
-		  int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = READ (image, (uint8_t *) bits + offset);
-    
-    return ((pixel & 0xf) | ((pixel & 0xf) << 4)) << 24;
-}
-
-static uint32_t
-fetch_pixel_a4 (bits_image_t *image,
-		int           offset,
-		int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = FETCH_4 (image, bits, offset);
-    
-    pixel |= pixel << 4;
-    return pixel << 24;
-}
-
-static uint32_t
-fetch_pixel_r1g2b1 (bits_image_t *image,
-		    int           offset,
-		    int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = FETCH_4 (image, bits, offset);
-    uint32_t r, g, b;
-    
-    r = ((pixel & 0x8) * 0xff) << 13;
-    g = ((pixel & 0x6) * 0x55) << 7;
-    b = ((pixel & 0x1) * 0xff);
-    
-    return 0xff000000 | r | g | b;
-}
-
-static uint32_t
-fetch_pixel_b1g2r1 (bits_image_t *image,
-		    int           offset,
-		    int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = FETCH_4 (image, bits, offset);
-    uint32_t r, g, b;
-    
-    b = ((pixel & 0x8) * 0xff) >> 3;
-    g = ((pixel & 0x6) * 0x55) << 7;
-    r = ((pixel & 0x1) * 0xff) << 16;
-    
-    return 0xff000000 | r | g | b;
-}
-
-static uint32_t
-fetch_pixel_a1r1g1b1 (bits_image_t *image,
-		      int           offset,
-		      int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = FETCH_4 (image, bits, offset);
-    uint32_t a, r, g, b;
-
-    a = ((pixel & 0x8) * 0xff) << 21;
-    r = ((pixel & 0x4) * 0xff) << 14;
-    g = ((pixel & 0x2) * 0xff) << 7;
-    b = ((pixel & 0x1) * 0xff);
-
-    return a | r | g | b;
-}
-
-static uint32_t
-fetch_pixel_a1b1g1r1 (bits_image_t *image,
-		      int           offset,
-		      int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = FETCH_4 (image, bits, offset);
-    uint32_t a, r, g, b;
-
-    a = ((pixel & 0x8) * 0xff) << 21;
-    b = ((pixel & 0x4) * 0xff) >> 2;
-    g = ((pixel & 0x2) * 0xff) << 7;
-    r = ((pixel & 0x1) * 0xff) << 16;
-
-    return a | r | g | b;
-}
-
-static uint32_t
-fetch_pixel_c4 (bits_image_t *image,
-		int           offset,
-		int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = FETCH_4 (image, bits, offset);
-    const pixman_indexed_t * indexed = image->indexed;
-
-    return indexed->rgba[pixel];
-}
-
-static uint32_t
-fetch_pixel_a1 (bits_image_t *image,
-		int           offset,
-		int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = READ (image, bits + (offset >> 5));
-    uint32_t a;
-    
-#ifdef WORDS_BIGENDIAN
-    a = pixel >> (0x1f - (offset & 0x1f));
-#else
-    a = pixel >> (offset & 0x1f);
-#endif
-    a = a & 1;
-    a |= a << 1;
-    a |= a << 2;
-    a |= a << 4;
-    
-    return a << 24;
-}
-
-static uint32_t
-fetch_pixel_g1 (bits_image_t *image,
-		int           offset,
-		int           line)
-{
-    uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = READ (image, bits + (offset >> 5));
-    const pixman_indexed_t * indexed = image->indexed;
-    uint32_t a;
-    
-#ifdef WORDS_BIGENDIAN
-    a = pixel >> (0x1f - (offset & 0x1f));
-#else
-    a = pixel >> (offset & 0x1f);
-#endif
-    a = a & 1;
-    
-    return indexed->rgba[a];
-}
-
-static uint32_t
-fetch_pixel_yuy2 (bits_image_t *image,
-		  int           offset,
-		  int           line)
-{
-    const uint32_t *bits = image->bits + image->rowstride * line;
-    
-    int16_t y, u, v;
-    int32_t r, g, b;
-    
-    y = ((uint8_t *) bits)[offset << 1] - 16;
-    u = ((uint8_t *) bits)[((offset << 1) & - 4) + 1] - 128;
-    v = ((uint8_t *) bits)[((offset << 1) & - 4) + 3] - 128;
-    
-    /* R = 1.164(Y - 16) + 1.596(V - 128) */
-    r = 0x012b27 * y + 0x019a2e * v;
-    
-    /* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */
-    g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u;
-    
-    /* B = 1.164(Y - 16) + 2.018(U - 128) */
-    b = 0x012b27 * y + 0x0206a2 * u;
-    
-    return 0xff000000 |
-	(r >= 0 ? r < 0x1000000 ? r         & 0xff0000 : 0xff0000 : 0) |
-	(g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) |
-	(b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0);
-}
-
-static uint32_t
-fetch_pixel_yv12 (bits_image_t *image,
-		  int           offset,
-		  int           line)
-{
-    YV12_SETUP (image);
-    int16_t y = YV12_Y (line)[offset] - 16;
-    int16_t u = YV12_U (line)[offset >> 1] - 128;
-    int16_t v = YV12_V (line)[offset >> 1] - 128;
-    int32_t r, g, b;
-    
-    /* R = 1.164(Y - 16) + 1.596(V - 128) */
-    r = 0x012b27 * y + 0x019a2e * v;
-    
-    /* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */
-    g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u;
-    
-    /* B = 1.164(Y - 16) + 2.018(U - 128) */
-    b = 0x012b27 * y + 0x0206a2 * u;
-    
-    return 0xff000000 |
-	(r >= 0 ? r < 0x1000000 ? r         & 0xff0000 : 0xff0000 : 0) |
-	(g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) |
-	(b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0);
-}
-
-/*********************************** Store ************************************/
-
-#define SPLIT_A(v)              \
-    uint32_t a = ((v) >> 24),   \
-	r = ((v) >> 16) & 0xff, \
-	g = ((v) >> 8) & 0xff,  \
-	b = (v) & 0xff
-
-#define SPLIT(v)                     \
-    uint32_t r = ((v) >> 16) & 0xff, \
-	g = ((v) >> 8) & 0xff,       \
-	b = (v) & 0xff
-
-static void
-store_scanline_a2r10g10b10 (bits_image_t *  image,
-                            int             x,
-                            int             y,
-                            int             width,
-                            const uint32_t *v)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint32_t *pixel = bits + x;
-    uint64_t *values = (uint64_t *)v;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	WRITE (image, pixel++,
-	       ((values[i] >> 32) & 0xc0000000) |
-	       ((values[i] >> 18) & 0x3ff00000) |
-	       ((values[i] >> 12) & 0xffc00) | 
-	       ((values[i] >> 6) & 0x3ff));    
-    }
-}
-
-static void
-store_scanline_x2r10g10b10 (bits_image_t *  image,
-                            int             x,
-                            int             y,
-                            int             width,
-                            const uint32_t *v)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint64_t *values = (uint64_t *)v;
-    uint32_t *pixel = bits + x;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	WRITE (image, pixel++,
-	       ((values[i] >> 18) & 0x3ff00000) | 
-	       ((values[i] >> 12) & 0xffc00) |
-	       ((values[i] >> 6) & 0x3ff));
-    }
-}
-
-static void
-store_scanline_a2b10g10r10 (bits_image_t *  image,
-                            int             x,
-                            int             y,
-                            int             width,
-                            const uint32_t *v)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint32_t *pixel = bits + x;
-    uint64_t *values = (uint64_t *)v;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	WRITE (image, pixel++,
-	       ((values[i] >> 32) & 0xc0000000) |
-	       ((values[i] >> 38) & 0x3ff) |
-	       ((values[i] >> 12) & 0xffc00) |
-	       ((values[i] << 14) & 0x3ff00000));
-    }
-}
-
-static void
-store_scanline_x2b10g10r10 (bits_image_t *  image,
-                            int             x,
-                            int             y,
-                            int             width,
-                            const uint32_t *v)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint64_t *values = (uint64_t *)v;
-    uint32_t *pixel = bits + x;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	WRITE (image, pixel++,
-	       ((values[i] >> 38) & 0x3ff) |
-	       ((values[i] >> 12) & 0xffc00) |
-	       ((values[i] << 14) & 0x3ff00000));
-    }
-}
-
-static void
-store_scanline_a8r8g8b8 (bits_image_t *  image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    
-    MEMCPY_WRAPPED (image, ((uint32_t *)bits) + x, values,
-                    width * sizeof(uint32_t));
-}
-
-static void
-store_scanline_x8r8g8b8 (bits_image_t *  image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint32_t *pixel = (uint32_t *)bits + x;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-	WRITE (image, pixel++, values[i] & 0xffffff);
-}
-
-static void
-store_scanline_a8b8g8r8 (bits_image_t *  image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint32_t *pixel = (uint32_t *)bits + x;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	WRITE (image, pixel++,
-	       (values[i] & 0xff00ff00)         |
-	       ((values[i] >> 16) & 0xff)       |
-	       ((values[i] & 0xff) << 16));
-    }
-}
-
-static void
-store_scanline_x8b8g8r8 (bits_image_t *  image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint32_t *pixel = (uint32_t *)bits + x;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	WRITE (image, pixel++,
-	       (values[i] & 0x0000ff00)         |
-	       ((values[i] >> 16) & 0xff)       |
-	       ((values[i] & 0xff) << 16));
-    }
-}
-
-static void
-store_scanline_b8g8r8a8 (bits_image_t *  image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint32_t *pixel = (uint32_t *)bits + x;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	WRITE (image, pixel++,
-	       ((values[i] >> 24) & 0x000000ff) |
-	       ((values[i] >>  8) & 0x0000ff00) |
-	       ((values[i] <<  8) & 0x00ff0000) |
-	       ((values[i] << 24) & 0xff000000));
-    }
-}
-
-static void
-store_scanline_b8g8r8x8 (bits_image_t *  image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint32_t *pixel = (uint32_t *)bits + x;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	WRITE (image, pixel++,
-	       ((values[i] >>  8) & 0x0000ff00) |
-	       ((values[i] <<  8) & 0x00ff0000) |
-	       ((values[i] << 24) & 0xff000000));
-    }
-}
-
-static void
-store_scanline_r8g8b8a8 (bits_image_t *  image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint32_t *pixel = (uint32_t *)bits + x;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	WRITE (image, pixel++,
-	       ((values[i] >> 24) & 0x000000ff) | (values[i] << 8));
-    }
-}
-
-static void
-store_scanline_r8g8b8x8 (bits_image_t *  image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint32_t *pixel = (uint32_t *)bits + x;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-	WRITE (image, pixel++, (values[i] << 8));
-}
-
-static void
-store_scanline_x14r6g6b6 (bits_image_t *  image,
-                          int             x,
-                          int             y,
-                          int             width,
-                          const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint32_t *pixel = ((uint32_t *) bits) + x;
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	uint32_t s = values[i];
-	uint32_t r, g, b;
-
-	r = (s & 0xfc0000) >> 6;
-	g = (s & 0x00fc00) >> 4;
-	b = (s & 0x0000fc) >> 2;
-
-	WRITE (image, pixel++, r | g | b);
-    }
-}
-
-static void
-store_scanline_r8g8b8 (bits_image_t *  image,
-                       int             x,
-                       int             y,
-                       int             width,
-                       const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint8_t *pixel = ((uint8_t *) bits) + 3 * x;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	uint32_t val = values[i];
-	
-#ifdef WORDS_BIGENDIAN
-	WRITE (image, pixel++, (val & 0x00ff0000) >> 16);
-	WRITE (image, pixel++, (val & 0x0000ff00) >>  8);
-	WRITE (image, pixel++, (val & 0x000000ff) >>  0);
-#else
-	WRITE (image, pixel++, (val & 0x000000ff) >>  0);
-	WRITE (image, pixel++, (val & 0x0000ff00) >>  8);
-	WRITE (image, pixel++, (val & 0x00ff0000) >> 16);
-#endif
-    }
-}
-
-static void
-store_scanline_b8g8r8 (bits_image_t *  image,
-                       int             x,
-                       int             y,
-                       int             width,
-                       const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint8_t *pixel = ((uint8_t *) bits) + 3 * x;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	uint32_t val = values[i];
-	
-#ifdef WORDS_BIGENDIAN
-	WRITE (image, pixel++, (val & 0x000000ff) >>  0);
-	WRITE (image, pixel++, (val & 0x0000ff00) >>  8);
-	WRITE (image, pixel++, (val & 0x00ff0000) >> 16);
-#else
-	WRITE (image, pixel++, (val & 0x00ff0000) >> 16);
-	WRITE (image, pixel++, (val & 0x0000ff00) >>  8);
-	WRITE (image, pixel++, (val & 0x000000ff) >>  0);
-#endif
-    }
-}
-
-static void
-store_scanline_r5g6b5 (bits_image_t *  image,
-                       int             x,
-                       int             y,
-                       int             width,
-                       const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint16_t *pixel = ((uint16_t *) bits) + x;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	uint32_t s = values[i];
-	
-	WRITE (image, pixel++,
-	       ((s >> 3) & 0x001f) |
-	       ((s >> 5) & 0x07e0) |
-	       ((s >> 8) & 0xf800));
-    }
-}
-
-static void
-store_scanline_b5g6r5 (bits_image_t *  image,
-                       int             x,
-                       int             y,
-                       int             width,
-                       const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint16_t  *pixel = ((uint16_t *) bits) + x;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	SPLIT (values[i]);
-	
-	WRITE (image, pixel++,
-	       ((b << 8) & 0xf800) |
-	       ((g << 3) & 0x07e0) |
-	       ((r >> 3)         ));
-    }
-}
-
-static void
-store_scanline_a1r5g5b5 (bits_image_t *  image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint16_t  *pixel = ((uint16_t *) bits) + x;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	SPLIT_A (values[i]);
-	
-	WRITE (image, pixel++,
-	       ((a << 8) & 0x8000) |
-	       ((r << 7) & 0x7c00) |
-	       ((g << 2) & 0x03e0) |
-	       ((b >> 3)         ));
-    }
-}
-
-static void
-store_scanline_x1r5g5b5 (bits_image_t *  image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint16_t  *pixel = ((uint16_t *) bits) + x;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	SPLIT (values[i]);
-	
-	WRITE (image, pixel++,
-	       ((r << 7) & 0x7c00) |
-	       ((g << 2) & 0x03e0) |
-	       ((b >> 3)         ));
-    }
-}
-
-static void
-store_scanline_a1b5g5r5 (bits_image_t *  image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint16_t  *pixel = ((uint16_t *) bits) + x;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	SPLIT_A (values[i]);
-	
-	WRITE (image, pixel++,
-	       ((a << 8) & 0x8000) |
-	       ((b << 7) & 0x7c00) |
-	       ((g << 2) & 0x03e0) |
-	       ((r >> 3)         ));
-    }
-}
-
-static void
-store_scanline_x1b5g5r5 (bits_image_t *  image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint16_t  *pixel = ((uint16_t *) bits) + x;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	SPLIT (values[i]);
-	
-	WRITE (image, pixel++, ((b << 7) & 0x7c00) |
-	       ((g << 2) & 0x03e0) |
-	       ((r >> 3)         ));
-    }
-}
-
-static void
-store_scanline_a4r4g4b4 (bits_image_t *  image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint16_t  *pixel = ((uint16_t *) bits) + x;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	SPLIT_A (values[i]);
-	
-	WRITE (image, pixel++,
-	       ((a << 8) & 0xf000) |
-	       ((r << 4) & 0x0f00) |
-	       ((g     ) & 0x00f0) |
-	       ((b >> 4)         ));
-    }
-}
-
-static void
-store_scanline_x4r4g4b4 (bits_image_t *  image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint16_t  *pixel = ((uint16_t *) bits) + x;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	SPLIT (values[i]);
-	
-	WRITE (image, pixel++,
-	       ((r << 4) & 0x0f00) |
-	       ((g     ) & 0x00f0) |
-	       ((b >> 4)         ));
-    }
-}
-
-static void
-store_scanline_a4b4g4r4 (bits_image_t *  image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint16_t  *pixel = ((uint16_t *) bits) + x;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	SPLIT_A (values[i]);
-	WRITE (image, pixel++, ((a << 8) & 0xf000) |
-	       ((b << 4) & 0x0f00) |
-	       ((g     ) & 0x00f0) |
-	       ((r >> 4)         ));
-    }
-}
-
-static void
-store_scanline_x4b4g4r4 (bits_image_t *  image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint16_t  *pixel = ((uint16_t *) bits) + x;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	SPLIT (values[i]);
-	
-	WRITE (image, pixel++,
-	       ((b << 4) & 0x0f00) |
-	       ((g     ) & 0x00f0) |
-	       ((r >> 4)         ));
-    }
-}
-
-static void
-store_scanline_a8 (bits_image_t *  image,
-                   int             x,
-                   int             y,
-                   int             width,
-                   const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint8_t   *pixel = ((uint8_t *) bits) + x;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	WRITE (image, pixel++, values[i] >> 24);
-    }
-}
-
-static void
-store_scanline_r3g3b2 (bits_image_t *  image,
-                       int             x,
-                       int             y,
-                       int             width,
-                       const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint8_t   *pixel = ((uint8_t *) bits) + x;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	SPLIT (values[i]);
-	
-	WRITE (image, pixel++,
-	       ((r     ) & 0xe0) |
-	       ((g >> 3) & 0x1c) |
-	       ((b >> 6)       ));
-    }
-}
-
-static void
-store_scanline_b2g3r3 (bits_image_t *  image,
-                       int             x,
-                       int             y,
-                       int             width,
-                       const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint8_t   *pixel = ((uint8_t *) bits) + x;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	SPLIT (values[i]);
-	
-	WRITE (image, pixel++,
-	       ((b     ) & 0xc0) |
-	       ((g >> 2) & 0x38) |
-	       ((r >> 5)       ));
-    }
-}
-
-static void
-store_scanline_a2r2g2b2 (bits_image_t *  image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint8_t   *pixel = ((uint8_t *) bits) + x;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	SPLIT_A (values[i]);
-	
-	WRITE (image, pixel++,
-	       ((a     ) & 0xc0) |
-	       ((r >> 2) & 0x30) |
-	       ((g >> 4) & 0x0c) |
-	       ((b >> 6)       ));
-    }
-}
-
-static void
-store_scanline_a2b2g2r2 (bits_image_t *  image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint8_t   *pixel = ((uint8_t *) bits) + x;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	SPLIT_A (values[i]);
-	
-	WRITE (image, pixel++,
-	       ((a     ) & 0xc0) |
-	       ((b >> 2) & 0x30) |
-	       ((g >> 4) & 0x0c) |
-	       ((r >> 6)       ));
-    }
-}
-
-static void
-store_scanline_c8 (bits_image_t *  image,
-                   int             x,
-                   int             y,
-                   int             width,
-                   const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint8_t *pixel = ((uint8_t *) bits) + x;
-    const pixman_indexed_t *indexed = image->indexed;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-	WRITE (image, pixel++, RGB24_TO_ENTRY (indexed,values[i]));
-}
-
-static void
-store_scanline_g8 (bits_image_t *  image,
-                   int             x,
-                   int             y,
-                   int             width,
-                   const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint8_t *pixel = ((uint8_t *) bits) + x;
-    const pixman_indexed_t *indexed = image->indexed;
-    int i;
-
-    for (i = 0; i < width; ++i)
-	WRITE (image, pixel++, RGB24_TO_ENTRY_Y (indexed,values[i]));
-}
-
-static void
-store_scanline_x4a4 (bits_image_t *  image,
-                     int             x,
-                     int             y,
-                     int             width,
-                     const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    uint8_t   *pixel = ((uint8_t *) bits) + x;
-    int i;
-
-    for (i = 0; i < width; ++i)
-	WRITE (image, pixel++, values[i] >> 28);
-}
-
-#define STORE_8(img,l,o,v)  (WRITE (img, (uint8_t *)(l) + ((o) >> 3), (v)))
-#ifdef WORDS_BIGENDIAN
-
-#define STORE_4(img,l,o,v)						\
-    do									\
-    {									\
-	int bo = 4 * (o);						\
-	int v4 = (v) & 0x0f;						\
-									\
-	STORE_8 (img, l, bo, (						\
-		     bo & 4 ?						\
-		     (FETCH_8 (img, l, bo) & 0xf0) | (v4) :		\
-		     (FETCH_8 (img, l, bo) & 0x0f) | (v4 << 4)));	\
-    } while (0)
-#else
-
-#define STORE_4(img,l,o,v)						\
-    do									\
-    {									\
-	int bo = 4 * (o);						\
-	int v4 = (v) & 0x0f;						\
-									\
-	STORE_8 (img, l, bo, (						\
-		     bo & 4 ?						\
-		     (FETCH_8 (img, l, bo) & 0x0f) | (v4 << 4) :	\
-		     (FETCH_8 (img, l, bo) & 0xf0) | (v4)));		\
-    } while (0)
-#endif
-
-static void
-store_scanline_a4 (bits_image_t *  image,
-                   int             x,
-                   int             y,
-                   int             width,
-                   const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    int i;
-
-    for (i = 0; i < width; ++i)
-	STORE_4 (image, bits, i + x, values[i] >> 28);
-}
-
-static void
-store_scanline_r1g2b1 (bits_image_t *  image,
-                       int             x,
-                       int             y,
-                       int             width,
-                       const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	uint32_t pixel;
-
-	SPLIT (values[i]);
-	pixel = (((r >> 4) & 0x8) |
-	         ((g >> 5) & 0x6) |
-	         ((b >> 7)      ));
-	STORE_4 (image, bits, i + x, pixel);
-    }
-}
-
-static void
-store_scanline_b1g2r1 (bits_image_t *  image,
-                       int             x,
-                       int             y,
-                       int             width,
-                       const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	uint32_t pixel;
-
-	SPLIT (values[i]);
-	pixel = (((b >> 4) & 0x8) |
-	         ((g >> 5) & 0x6) |
-	         ((r >> 7)      ));
-	STORE_4 (image, bits, i + x, pixel);
-    }
-}
-
-static void
-store_scanline_a1r1g1b1 (bits_image_t *  image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	uint32_t pixel;
-
-	SPLIT_A (values[i]);
-	pixel = (((a >> 4) & 0x8) |
-	         ((r >> 5) & 0x4) |
-	         ((g >> 6) & 0x2) |
-	         ((b >> 7)      ));
-
-	STORE_4 (image, bits, i + x, pixel);
-    }
-}
-
-static void
-store_scanline_a1b1g1r1 (bits_image_t *  image,
-                         int             x,
-                         int             y,
-                         int             width,
-                         const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	uint32_t pixel;
-
-	SPLIT_A (values[i]);
-	pixel = (((a >> 4) & 0x8) |
-	         ((b >> 5) & 0x4) |
-	         ((g >> 6) & 0x2) |
-	         ((r >> 7)      ));
-
-	STORE_4 (image, bits, i + x, pixel);
-    }
-}
-
-static void
-store_scanline_c4 (bits_image_t *  image,
-                   int             x,
-                   int             y,
-                   int             width,
-                   const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    const pixman_indexed_t *indexed = image->indexed;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	uint32_t pixel;
-	
-	pixel = RGB24_TO_ENTRY (indexed, values[i]);
-	STORE_4 (image, bits, i + x, pixel);
-    }
-}
-
-static void
-store_scanline_g4 (bits_image_t *  image,
-                   int             x,
-                   int             y,
-                   int             width,
-                   const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    const pixman_indexed_t *indexed = image->indexed;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	uint32_t pixel;
-	
-	pixel = RGB24_TO_ENTRY_Y (indexed, values[i]);
-	STORE_4 (image, bits, i + x, pixel);
-    }
-}
-
-static void
-store_scanline_a1 (bits_image_t *  image,
-                   int             x,
-                   int             y,
-                   int             width,
-                   const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	uint32_t  *pixel = ((uint32_t *) bits) + ((i + x) >> 5);
-	uint32_t mask, v;
-	
-#ifdef WORDS_BIGENDIAN
-	mask = 1 << (0x1f - ((i + x) & 0x1f));
-#else
-	mask = 1 << ((i + x) & 0x1f);
-#endif
-	v = values[i] & 0x80000000 ? mask : 0;
-	
-	WRITE (image, pixel, (READ (image, pixel) & ~mask) | v);
-    }
-}
-
-static void
-store_scanline_g1 (bits_image_t *  image,
-                   int             x,
-                   int             y,
-                   int             width,
-                   const uint32_t *values)
-{
-    uint32_t *bits = image->bits + image->rowstride * y;
-    const pixman_indexed_t *indexed = image->indexed;
-    int i;
-    
-    for (i = 0; i < width; ++i)
-    {
-	uint32_t  *pixel = ((uint32_t *) bits) + ((i + x) >> 5);
-	uint32_t mask, v;
-	
-#ifdef WORDS_BIGENDIAN
-	mask = 1 << (0x1f - ((i + x) & 0x1f));
-#else
-	mask = 1 << ((i + x) & 0x1f);
-#endif
-	v = RGB24_TO_ENTRY_Y (indexed, values[i]) & 0x1 ? mask : 0;
-	
-	WRITE (image, pixel, (READ (image, pixel) & ~mask) | v);
-    }
-}
-
-/*
- * Contracts a 64bpp image to 32bpp and then stores it using a regular 32-bit
- * store proc. Despite the type, this function expects a uint64_t buffer.
- */
-static void
-store_scanline_generic_64 (bits_image_t *  image,
-                           int             x,
-                           int             y,
-                           int             width,
-                           const uint32_t *values)
-{
-    uint32_t *argb8_pixels;
-    
-    assert (image->common.type == BITS);
-    
-    argb8_pixels = pixman_malloc_ab (width, sizeof(uint32_t));
-    if (!argb8_pixels)
-	return;
-    
-    /* Contract the scanline.  We could do this in place if values weren't
-     * const.
-     */
-    pixman_contract (argb8_pixels, (uint64_t *)values, width);
-    
-    image->store_scanline_32 (image, x, y, width, argb8_pixels);
-    
-    free (argb8_pixels);
-}
-
-/* Despite the type, this function expects both buffer
- * and mask to be uint64_t
- */
-static void
-fetch_scanline_generic_64 (pixman_image_t *image,
-                           int             x,
-                           int             y,
-                           int             width,
-                           uint32_t *      buffer,
-                           const uint32_t *mask)
-{
-    pixman_format_code_t format;
-    
-    /* Fetch the pixels into the first half of buffer and then expand them in
-     * place.
-     */
-    image->bits.fetch_scanline_32 (image, x, y, width, buffer, NULL);
-
-    format = image->bits.format;
-    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_COLOR	||
-	PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_GRAY)
-    {
-	/* Indexed formats are mapped to a8r8g8b8 with full
-	 * precision, so when expanding we shouldn't correct
-	 * for the width of the channels
-	 */
-	
-	format = PIXMAN_a8r8g8b8;
-    }
-    
-    pixman_expand ((uint64_t *)buffer, buffer, format, width);
-}
-
-/* Despite the type, this function expects a uint64_t *buffer */
-static uint64_t
-fetch_pixel_generic_64 (bits_image_t *image,
-			int	      offset,
-			int           line)
-{
-    uint32_t pixel32 = image->fetch_pixel_32 (image, offset, line);
-    uint64_t result;
-    pixman_format_code_t format;
-
-    format = image->format;
-    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_COLOR	||
-	PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_GRAY)
-    {
-	/* Indexed formats are mapped to a8r8g8b8 with full
-	 * precision, so when expanding we shouldn't correct
-	 * for the width of the channels
-	 */
-	
-	format = PIXMAN_a8r8g8b8;
-    }
-    
-    pixman_expand ((uint64_t *)&result, &pixel32, format, 1);
-
-    return result;
-}
-
-/*
- * XXX: The transformed fetch path only works at 32-bpp so far.  When all
- * paths have wide versions, this can be removed.
- *
- * WARNING: This function loses precision!
- */
-static uint32_t
-fetch_pixel_generic_lossy_32 (bits_image_t *image,
-			      int           offset,
-			      int           line)
-{
-    uint64_t pixel64 = image->fetch_pixel_64 (image, offset, line);
-    uint32_t result;
-    
-    pixman_contract (&result, &pixel64, 1);
-
-    return result;
-}
-
-typedef struct
-{
-    pixman_format_code_t	format;
-    fetch_scanline_t		fetch_scanline_32;
-    fetch_scanline_t		fetch_scanline_64;
-    fetch_pixel_32_t		fetch_pixel_32;
-    fetch_pixel_64_t		fetch_pixel_64;
-    store_scanline_t		store_scanline_32;
-    store_scanline_t		store_scanline_64;
-} format_info_t;
-
-#define FORMAT_INFO(format) 						\
-    {									\
-	PIXMAN_ ## format,						\
-	    fetch_scanline_ ## format,					\
-	    fetch_scanline_generic_64,					\
-	    fetch_pixel_ ## format, fetch_pixel_generic_64,		\
-	    store_scanline_ ## format, store_scanline_generic_64	\
-    }
-
-static const format_info_t accessors[] =
-{
-/* 32 bpp formats */
-    FORMAT_INFO (a8r8g8b8),
-    FORMAT_INFO (x8r8g8b8),
-    FORMAT_INFO (a8b8g8r8),
-    FORMAT_INFO (x8b8g8r8),
-    FORMAT_INFO (b8g8r8a8),
-    FORMAT_INFO (b8g8r8x8),
-    FORMAT_INFO (r8g8b8a8),
-    FORMAT_INFO (r8g8b8x8),
-    FORMAT_INFO (x14r6g6b6),
-
-/* 24bpp formats */
-    FORMAT_INFO (r8g8b8),
-    FORMAT_INFO (b8g8r8),
-    
-/* 16bpp formats */
-    FORMAT_INFO (r5g6b5),
-    FORMAT_INFO (b5g6r5),
-    
-    FORMAT_INFO (a1r5g5b5),
-    FORMAT_INFO (x1r5g5b5),
-    FORMAT_INFO (a1b5g5r5),
-    FORMAT_INFO (x1b5g5r5),
-    FORMAT_INFO (a4r4g4b4),
-    FORMAT_INFO (x4r4g4b4),
-    FORMAT_INFO (a4b4g4r4),
-    FORMAT_INFO (x4b4g4r4),
-    
-/* 8bpp formats */
-    FORMAT_INFO (a8),
-    FORMAT_INFO (r3g3b2),
-    FORMAT_INFO (b2g3r3),
-    FORMAT_INFO (a2r2g2b2),
-    FORMAT_INFO (a2b2g2r2),
-    
-    FORMAT_INFO (c8),
-    
-#define fetch_scanline_g8 fetch_scanline_c8
-#define fetch_pixel_g8 fetch_pixel_c8
-    FORMAT_INFO (g8),
-    
-#define fetch_scanline_x4c4 fetch_scanline_c8
-#define fetch_pixel_x4c4 fetch_pixel_c8
-#define store_scanline_x4c4 store_scanline_c8
-    FORMAT_INFO (x4c4),
-    
-#define fetch_scanline_x4g4 fetch_scanline_c8
-#define fetch_pixel_x4g4 fetch_pixel_c8
-#define store_scanline_x4g4 store_scanline_g8
-    FORMAT_INFO (x4g4),
-    
-    FORMAT_INFO (x4a4),
-    
-/* 4bpp formats */
-    FORMAT_INFO (a4),
-    FORMAT_INFO (r1g2b1),
-    FORMAT_INFO (b1g2r1),
-    FORMAT_INFO (a1r1g1b1),
-    FORMAT_INFO (a1b1g1r1),
-    
-    FORMAT_INFO (c4),
-    
-#define fetch_scanline_g4 fetch_scanline_c4
-#define fetch_pixel_g4 fetch_pixel_c4
-    FORMAT_INFO (g4),
-    
-/* 1bpp formats */
-    FORMAT_INFO (a1),
-    FORMAT_INFO (g1),
-    
-/* Wide formats */
-    
-    { PIXMAN_a2r10g10b10,
-      NULL, fetch_scanline_a2r10g10b10,
-      fetch_pixel_generic_lossy_32, fetch_pixel_a2r10g10b10,
-      NULL, store_scanline_a2r10g10b10 },
-    
-    { PIXMAN_x2r10g10b10,
-      NULL, fetch_scanline_x2r10g10b10,
-      fetch_pixel_generic_lossy_32, fetch_pixel_x2r10g10b10,
-      NULL, store_scanline_x2r10g10b10 },
-    
-    { PIXMAN_a2b10g10r10,
-      NULL, fetch_scanline_a2b10g10r10,
-      fetch_pixel_generic_lossy_32, fetch_pixel_a2b10g10r10,
-      NULL, store_scanline_a2b10g10r10 },
-    
-    { PIXMAN_x2b10g10r10,
-      NULL, fetch_scanline_x2b10g10r10,
-      fetch_pixel_generic_lossy_32, fetch_pixel_x2b10g10r10,
-      NULL, store_scanline_x2b10g10r10 },
-    
-/* YUV formats */
-    { PIXMAN_yuy2,
-      fetch_scanline_yuy2, fetch_scanline_generic_64,
-      fetch_pixel_yuy2, fetch_pixel_generic_64,
-      NULL, NULL },
-    
-    { PIXMAN_yv12,
-      fetch_scanline_yv12, fetch_scanline_generic_64,
-      fetch_pixel_yv12, fetch_pixel_generic_64,
-      NULL, NULL },
-    
-    { PIXMAN_null },
-};
-
-static void
-setup_accessors (bits_image_t *image)
-{
-    const format_info_t *info = accessors;
-    
-    while (info->format != PIXMAN_null)
-    {
-	if (info->format == image->format)
-	{
-	    image->fetch_scanline_32 = info->fetch_scanline_32;
-	    image->fetch_scanline_64 = info->fetch_scanline_64;
-	    image->fetch_pixel_32 = info->fetch_pixel_32;
-	    image->fetch_pixel_64 = info->fetch_pixel_64;
-	    image->store_scanline_32 = info->store_scanline_32;
-	    image->store_scanline_64 = info->store_scanline_64;
-	    
-	    return;
-	}
-	
-	info++;
-    }
-}
-
-#ifndef PIXMAN_FB_ACCESSORS
-void
-_pixman_bits_image_setup_accessors_accessors (bits_image_t *image);
-
-void
-_pixman_bits_image_setup_accessors (bits_image_t *image)
-{
-    if (image->read_func || image->write_func)
-	_pixman_bits_image_setup_accessors_accessors (image);
-    else
-	setup_accessors (image);
-}
-
-#else
-
-void
-_pixman_bits_image_setup_accessors_accessors (bits_image_t *image)
-{
-    setup_accessors (image);
-}
-
-#endif
+/*
+ *
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ *             2008 Aaron Plattner, NVIDIA Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "pixman-private.h"
+#include "pixman-accessor.h"
+
+#define CONVERT_RGB24_TO_Y15(s)						\
+    (((((s) >> 16) & 0xff) * 153 +					\
+      (((s) >>  8) & 0xff) * 301 +					\
+      (((s)      ) & 0xff) * 58) >> 2)
+
+#define CONVERT_RGB24_TO_RGB15(s)                                       \
+    ((((s) >> 3) & 0x001f) |                                            \
+     (((s) >> 6) & 0x03e0) |                                            \
+     (((s) >> 9) & 0x7c00))
+
+#define RGB15_TO_ENTRY(mif,rgb15)					\
+    ((mif)->ent[rgb15])
+
+#define RGB24_TO_ENTRY(mif,rgb24)					\
+    RGB15_TO_ENTRY (mif,CONVERT_RGB24_TO_RGB15 (rgb24))
+
+#define RGB24_TO_ENTRY_Y(mif,rgb24)					\
+    ((mif)->ent[CONVERT_RGB24_TO_Y15 (rgb24)])
+
+/*
+ * YV12 setup and access macros
+ */
+
+#define YV12_SETUP(image)                                               \
+    bits_image_t *__bits_image = (bits_image_t *)image;                 \
+    uint32_t *bits = __bits_image->bits;                                \
+    int stride = __bits_image->rowstride;                               \
+    int offset0 = stride < 0 ?                                          \
+    ((-stride) >> 1) * ((__bits_image->height - 1) >> 1) - stride :	\
+    stride * __bits_image->height;					\
+    int offset1 = stride < 0 ?                                          \
+    offset0 + ((-stride) >> 1) * ((__bits_image->height) >> 1) :	\
+	offset0 + (offset0 >> 2)
+
+/* Note no trailing semicolon on the above macro; if it's there, then
+ * the typical usage of YV12_SETUP(image); will have an extra trailing ;
+ * that some compilers will interpret as a statement -- and then any further
+ * variable declarations will cause an error.
+ */
+
+#define YV12_Y(line)                                                    \
+    ((uint8_t *) ((bits) + (stride) * (line)))
+
+#define YV12_U(line)                                                    \
+    ((uint8_t *) ((bits) + offset1 +                                    \
+                  ((stride) >> 1) * ((line) >> 1)))
+
+#define YV12_V(line)                                                    \
+    ((uint8_t *) ((bits) + offset0 +                                    \
+                  ((stride) >> 1) * ((line) >> 1)))
+
+/********************************** Fetch ************************************/
+
+static void
+fetch_scanline_a8r8g8b8 (pixman_image_t *image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         uint32_t *      buffer,
+                         const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    
+    MEMCPY_WRAPPED (image,
+                    buffer, (const uint32_t *)bits + x,
+                    width * sizeof(uint32_t));
+}
+
+static void
+fetch_scanline_x8r8g8b8 (pixman_image_t *image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         uint32_t *      buffer,
+                         const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *pixel = (const uint32_t *)bits + x;
+    const uint32_t *end = pixel + width;
+    
+    while (pixel < end)
+	*buffer++ = READ (image, pixel++) | 0xff000000;
+}
+
+static void
+fetch_scanline_a8b8g8r8 (pixman_image_t *image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         uint32_t *      buffer,
+                         const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *pixel = (uint32_t *)bits + x;
+    const uint32_t *end = pixel + width;
+    
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	
+	*buffer++ = (p & 0xff00ff00)	|
+	    ((p >> 16) & 0xff)		|
+	    ((p & 0xff) << 16);
+    }
+}
+
+static void
+fetch_scanline_x8b8g8r8 (pixman_image_t *image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         uint32_t *      buffer,
+                         const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *pixel = (uint32_t *)bits + x;
+    const uint32_t *end = pixel + width;
+    
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	
+	*buffer++ = 0xff000000		|
+	    (p & 0x0000ff00)		|
+	    ((p >> 16) & 0xff)		|
+	    ((p & 0xff) << 16);
+    }
+}
+
+static void
+fetch_scanline_b8g8r8a8 (pixman_image_t *image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         uint32_t *      buffer,
+                         const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *pixel = (uint32_t *)bits + x;
+    const uint32_t *end = pixel + width;
+
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+
+	*buffer++ = (((p & 0xff000000) >> 24)	|
+	             ((p & 0x00ff0000) >> 8)	|
+	             ((p & 0x0000ff00) << 8)	|
+	             ((p & 0x000000ff) << 24));
+    }
+}
+
+static void
+fetch_scanline_b8g8r8x8 (pixman_image_t *image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         uint32_t *      buffer,
+                         const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *pixel = (uint32_t *)bits + x;
+    const uint32_t *end = pixel + width;
+    
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	
+	*buffer++ = (0xff000000 |
+	             ((p & 0xff000000) >> 24)	|
+	             ((p & 0x00ff0000) >> 8)	|
+	             ((p & 0x0000ff00) << 8));
+    }
+}
+
+static void
+fetch_scanline_r8g8b8a8 (pixman_image_t *image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         uint32_t *      buffer,
+                         const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *pixel = (uint32_t *)bits + x;
+    const uint32_t *end = pixel + width;
+
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+
+	*buffer++ = (((p & 0x000000ff) << 24) | (p >> 8));
+    }
+}
+
+static void
+fetch_scanline_r8g8b8x8 (pixman_image_t *image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         uint32_t *      buffer,
+                         const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *pixel = (uint32_t *)bits + x;
+    const uint32_t *end = pixel + width;
+    
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	
+	*buffer++ = (0xff000000 | (p >> 8));
+    }
+}
+
+static void
+fetch_scanline_x14r6g6b6 (pixman_image_t *image,
+                          int             x,
+                          int             y,
+                          int             width,
+                          uint32_t *      buffer,
+                          const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *pixel = (const uint32_t *)bits + x;
+    const uint32_t *end = pixel + width;
+
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	uint32_t r, g, b;
+
+	r = ((p & 0x3f000) << 6) | ((p & 0x30000));
+	g = ((p & 0x00fc0) << 4) | ((p & 0x00c00) >> 2);
+	b = ((p & 0x0003f) << 2) | ((p & 0x00030) >> 4);
+
+	*buffer++ = 0xff000000 | r | g | b;
+    }
+}
+
+/* Output is 64 bits per pixel: `b` must really point at uint64_t storage */
+static void
+fetch_scanline_a2r10g10b10 (pixman_image_t *image,
+                            int             x,
+                            int             y,
+                            int             width,
+                            uint32_t *      b,
+                            const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *pixel = bits + x;
+    const uint32_t *end = pixel + width;
+    uint64_t *buffer = (uint64_t *)b;
+
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	uint64_t alpha = p >> 30;
+	uint64_t red = (p >> 20) & 0x3ff;
+	uint64_t green = (p >> 10) & 0x3ff;
+	uint64_t blue = p & 0x3ff;
+	/* 10 -> 16 bits: shift up, refill the low 6 bits from the top bits */
+	red = red << 6 | red >> 4;
+	green = green << 6 | green >> 4;
+	blue = blue << 6 | blue >> 4;
+	/* 2 -> 16 bits: replicate the two alpha bits across the channel */
+	alpha <<= 14;
+	alpha |= alpha >> 2;
+	alpha |= alpha >> 4;
+	alpha |= alpha >> 8;
+
+	*buffer++ = alpha << 48 | red << 32 | green << 16 | blue;
+    }
+}
+
+/* Output is 64 bits per pixel: `b` must really point at uint64_t storage */
+static void
+fetch_scanline_x2r10g10b10 (pixman_image_t *image,
+                            int             x,
+                            int             y,
+                            int             width,
+                            uint32_t *      b,
+                            const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *pixel = bits + x;
+    const uint32_t *end = pixel + width;
+    uint64_t *buffer = (uint64_t *)b;
+
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	uint64_t red = (p >> 20) & 0x3ff;
+	uint64_t green = (p >> 10) & 0x3ff;
+	uint64_t blue = p & 0x3ff;
+	/* 10 -> 16 bits: shift up, refill the low 6 bits from the top bits */
+	red = red << 6 | red >> 4;
+	green = green << 6 | green >> 4;
+	blue = blue << 6 | blue >> 4;
+	/* The top two source bits are ignored; alpha is forced to 0xffff */
+	*buffer++ = 0xffffULL << 48 | red << 32 | green << 16 | blue;
+    }
+}
+
+/* Output is 64 bits per pixel: `b` must really point at uint64_t storage */
+static void
+fetch_scanline_a2b10g10r10 (pixman_image_t *image,
+                            int             x,
+                            int             y,
+                            int             width,
+                            uint32_t *      b,
+                            const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *pixel = bits + x;
+    const uint32_t *end = pixel + width;
+    uint64_t *buffer = (uint64_t *)b;
+
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	uint64_t alpha = p >> 30;
+	uint64_t blue = (p >> 20) & 0x3ff;
+	uint64_t green = (p >> 10) & 0x3ff;
+	uint64_t red = p & 0x3ff;
+	/* 10 -> 16 bits: shift up, refill the low 6 bits from the top bits */
+	red = red << 6 | red >> 4;
+	green = green << 6 | green >> 4;
+	blue = blue << 6 | blue >> 4;
+	/* 2 -> 16 bits: replicate the two alpha bits across the channel */
+	alpha <<= 14;
+	alpha |= alpha >> 2;
+	alpha |= alpha >> 4;
+	alpha |= alpha >> 8;
+
+	*buffer++ = alpha << 48 | red << 32 | green << 16 | blue;
+    }
+}
+
+/* Output is 64 bits per pixel: `b` must really point at uint64_t storage */
+static void
+fetch_scanline_x2b10g10r10 (pixman_image_t *image,
+                            int             x,
+                            int             y,
+                            int             width,
+                            uint32_t *      b,
+                            const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *pixel = bits + x;
+    const uint32_t *end = pixel + width;
+    uint64_t *buffer = (uint64_t *)b;
+
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	uint64_t blue = (p >> 20) & 0x3ff;
+	uint64_t green = (p >> 10) & 0x3ff;
+	uint64_t red = p & 0x3ff;
+	/* 10 -> 16 bits: shift up, refill the low 6 bits from the top bits */
+	red = red << 6 | red >> 4;
+	green = green << 6 | green >> 4;
+	blue = blue << 6 | blue >> 4;
+	/* The top two source bits are ignored; alpha is forced to 0xffff */
+	*buffer++ = 0xffffULL << 48 | red << 32 | green << 16 | blue;
+    }
+}
+
+static void
+fetch_scanline_r8g8b8 (pixman_image_t *image,
+                       int             x,
+                       int             y,
+                       int             width,
+                       uint32_t *      buffer,
+                       const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint8_t *pixel = (const uint8_t *)bits + 3 * x;
+    const uint8_t *end = pixel + 3 * width;
+    
+    while (pixel < end)
+    {
+	uint32_t b = 0xff000000;
+	
+#ifdef WORDS_BIGENDIAN
+	b |= (READ (image, pixel++) << 16);
+	b |= (READ (image, pixel++) << 8);
+	b |= (READ (image, pixel++));
+#else
+	b |= (READ (image, pixel++));
+	b |= (READ (image, pixel++) << 8);
+	b |= (READ (image, pixel++) << 16);
+#endif
+	
+	*buffer++ = b;
+    }
+}
+
+static void
+fetch_scanline_b8g8r8 (pixman_image_t *image,
+                       int             x,
+                       int             y,
+                       int             width,
+                       uint32_t *      buffer,
+                       const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint8_t *pixel = (const uint8_t *)bits + 3 * x;
+    const uint8_t *end = pixel + 3 * width;
+    
+    while (pixel < end)
+    {
+	uint32_t b = 0xff000000;
+#ifdef WORDS_BIGENDIAN
+	b |= (READ (image, pixel++));
+	b |= (READ (image, pixel++) << 8);
+	b |= (READ (image, pixel++) << 16);
+#else
+	b |= (READ (image, pixel++) << 16);
+	b |= (READ (image, pixel++) << 8);
+	b |= (READ (image, pixel++));
+#endif
+	*buffer++ = b;
+    }
+}
+
+static void
+fetch_scanline_r5g6b5 (pixman_image_t *image,
+                       int             x,
+                       int             y,
+                       int             width,
+                       uint32_t *      buffer,
+                       const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint16_t *pixel = (const uint16_t *)bits + x;
+    const uint16_t *end = pixel + width;
+    
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	uint32_t r = (((p) << 3) & 0xf8) |
+	    (((p) << 5) & 0xfc00) |
+	    (((p) << 8) & 0xf80000);
+	
+	r |= (r >> 5) & 0x70007;
+	r |= (r >> 6) & 0x300;
+	
+	*buffer++ = 0xff000000 | r;
+    }
+}
+
+static void
+fetch_scanline_b5g6r5 (pixman_image_t *image,
+                       int             x,
+                       int             y,
+                       int             width,
+                       uint32_t *      buffer,
+                       const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint16_t *pixel = (const uint16_t *)bits + x;
+    const uint16_t *end = pixel + width;
+    
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	uint32_t r, g, b;
+	
+	b = ((p & 0xf800) | ((p & 0xe000) >> 5)) >> 8;
+	g = ((p & 0x07e0) | ((p & 0x0600) >> 6)) << 5;
+	r = ((p & 0x001c) | ((p & 0x001f) << 5)) << 14;
+	
+	*buffer++ = 0xff000000 | r | g | b;
+    }
+}
+
+static void
+fetch_scanline_a1r5g5b5 (pixman_image_t *image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         uint32_t *      buffer,
+                         const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint16_t *pixel = (const uint16_t *)bits + x;
+    const uint16_t *end = pixel + width;
+    
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	uint32_t r, g, b, a;
+	
+	a = (uint32_t) ((uint8_t) (0 - ((p & 0x8000) >> 15))) << 24;
+	r = ((p & 0x7c00) | ((p & 0x7000) >> 5)) << 9;
+	g = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6;
+	b = ((p & 0x001c) | ((p & 0x001f) << 5)) >> 2;
+	
+	*buffer++ = a | r | g | b;
+    }
+}
+
+static void
+fetch_scanline_x1r5g5b5 (pixman_image_t *image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         uint32_t *      buffer,
+                         const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint16_t *pixel = (const uint16_t *)bits + x;
+    const uint16_t *end = pixel + width;
+    
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	uint32_t r, g, b;
+	
+	r = ((p & 0x7c00) | ((p & 0x7000) >> 5)) << 9;
+	g = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6;
+	b = ((p & 0x001c) | ((p & 0x001f) << 5)) >> 2;
+	
+	*buffer++ = 0xff000000 | r | g | b;
+    }
+}
+
+static void
+fetch_scanline_a1b5g5r5 (pixman_image_t *image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         uint32_t *      buffer,
+                         const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint16_t *pixel = (const uint16_t *)bits + x;
+    const uint16_t *end = pixel + width;
+    uint32_t r, g, b, a;
+    
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	
+	a = (uint32_t) ((uint8_t) (0 - ((p & 0x8000) >> 15))) << 24;
+	b = ((p & 0x7c00) | ((p & 0x7000) >> 5)) >> 7;
+	g = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6;
+	r = ((p & 0x001c) | ((p & 0x001f) << 5)) << 14;
+	
+	*buffer++ = a | r | g | b;
+    }
+}
+
+static void
+fetch_scanline_x1b5g5r5 (pixman_image_t *image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         uint32_t *      buffer,
+                         const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint16_t *pixel = (const uint16_t *)bits + x;
+    const uint16_t *end = pixel + width;
+    
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	uint32_t r, g, b;
+	
+	b = ((p & 0x7c00) | ((p & 0x7000) >> 5)) >> 7;
+	g = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6;
+	r = ((p & 0x001c) | ((p & 0x001f) << 5)) << 14;
+	
+	*buffer++ = 0xff000000 | r | g | b;
+    }
+}
+
+static void
+fetch_scanline_a4r4g4b4 (pixman_image_t *image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         uint32_t *      buffer,
+                         const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint16_t *pixel = (const uint16_t *)bits + x;
+    const uint16_t *end = pixel + width;
+    
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	uint32_t r, g, b, a;
+	
+	a = ((p & 0xf000) | ((p & 0xf000) >> 4)) << 16;
+	r = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) << 12;
+	g = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8;
+	b = ((p & 0x000f) | ((p & 0x000f) << 4));
+	
+	*buffer++ = a | r | g | b;
+    }
+}
+
+static void
+fetch_scanline_x4r4g4b4 (pixman_image_t *image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         uint32_t *      buffer,
+                         const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint16_t *pixel = (const uint16_t *)bits + x;
+    const uint16_t *end = pixel + width;
+    
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	uint32_t r, g, b;
+	
+	r = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) << 12;
+	g = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8;
+	b = ((p & 0x000f) | ((p & 0x000f) << 4));
+	
+	*buffer++ = 0xff000000 | r | g | b;
+    }
+}
+
+static void
+fetch_scanline_a4b4g4r4 (pixman_image_t *image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         uint32_t *      buffer,
+                         const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint16_t *pixel = (const uint16_t *)bits + x;
+    const uint16_t *end = pixel + width;
+    
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	uint32_t r, g, b, a;
+	
+	a = ((p & 0xf000) | ((p & 0xf000) >> 4)) << 16;
+	b = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) >> 4;
+	g = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8;
+	r = ((p & 0x000f) | ((p & 0x000f) << 4)) << 16;
+	
+	*buffer++ = a | r | g | b;
+    }
+}
+
+static void
+fetch_scanline_x4b4g4r4 (pixman_image_t *image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         uint32_t *      buffer,
+                         const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint16_t *pixel = (const uint16_t *)bits + x;
+    const uint16_t *end = pixel + width;
+    
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	uint32_t r, g, b;
+	
+	b = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) >> 4;
+	g = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8;
+	r = ((p & 0x000f) | ((p & 0x000f) << 4)) << 16;
+	
+	*buffer++ = 0xff000000 | r | g | b;
+    }
+}
+
+static void
+fetch_scanline_a8 (pixman_image_t *image,
+                   int             x,
+                   int             y,
+                   int             width,
+                   uint32_t *      buffer,
+                   const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint8_t *pixel = (const uint8_t *)bits + x;
+    const uint8_t *end = pixel + width;
+    /* Widen first: a byte promoted to int would overflow at << 24 when >= 0x80 */
+    while (pixel < end)
+	*buffer++ = (uint32_t) READ (image, pixel++) << 24;
+}
+
+static void
+fetch_scanline_r3g3b2 (pixman_image_t *image,
+                       int             x,
+                       int             y,
+                       int             width,
+                       uint32_t *      buffer,
+                       const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint8_t *pixel = (const uint8_t *)bits + x;
+    const uint8_t *end = pixel + width;
+    
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	uint32_t r, g, b;
+	
+	r = ((p & 0xe0) | ((p & 0xe0) >> 3) | ((p & 0xc0) >> 6)) << 16;
+	g = ((p & 0x1c) | ((p & 0x18) >> 3) | ((p & 0x1c) << 3)) << 8;
+	b = (((p & 0x03)     ) |
+	     ((p & 0x03) << 2) |
+	     ((p & 0x03) << 4) |
+	     ((p & 0x03) << 6));
+	
+	*buffer++ = 0xff000000 | r | g | b;
+    }
+}
+
+static void
+fetch_scanline_b2g3r3 (pixman_image_t *image,
+                       int             x,
+                       int             y,
+                       int             width,
+                       uint32_t *      buffer,
+                       const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint8_t *pixel = (const uint8_t *)bits + x;
+    const uint8_t *end = pixel + width;
+
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	uint32_t r, g, b;
+
+	b  = p & 0xc0;
+	b |= b >> 2;
+	b |= b >> 4;
+	b &= 0xff;
+
+	g  = (p & 0x38) << 10;
+	g |= g >> 3;
+	g |= g >> 6;
+	g &= 0xff00;
+
+	r  = (p & 0x7) << 21;
+	r |= r >> 3;
+	r |= r >> 6;
+	r &= 0xff0000;
+
+	*buffer++ = 0xff000000 | r | g | b;
+    }
+}
+
+static void
+fetch_scanline_a2r2g2b2 (pixman_image_t *image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         uint32_t *      buffer,
+                         const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint8_t *pixel = (const uint8_t *)bits + x;
+    const uint8_t *end = pixel + width;
+    
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	uint32_t a, r, g, b;
+	
+	a = ((p & 0xc0) * 0x55) << 18;
+	r = ((p & 0x30) * 0x55) << 12;
+	g = ((p & 0x0c) * 0x55) << 6;
+	b = ((p & 0x03) * 0x55);
+	
+	*buffer++ = a | r | g | b;
+    }
+}
+
+static void
+fetch_scanline_a2b2g2r2 (pixman_image_t *image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         uint32_t *      buffer,
+                         const uint32_t *mask) /* mask is not referenced in this function */
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; /* start of row y */
+    const uint8_t *pixel = (const uint8_t *)bits + x; /* one byte per pixel */
+    const uint8_t *end = pixel + width;
+    /* Expand `width` a2b2g2r2 bytes, starting at column x, into 32-bit a8r8g8b8 words. */
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	uint32_t a, r, g, b;
+	/* Multiplying a 2-bit field by 0x55 replicates it to 8 bits. */
+	a = ((p & 0xc0) * 0x55) << 18; /* field at bit 6 -> bit 24 */
+	b = ((p & 0x30) * 0x55) >> 4; /* field at bit 4 -> bit 0 */
+	g = ((p & 0x0c) * 0x55) << 6; /* field at bit 2 -> bit 8 */
+	r = ((p & 0x03) * 0x55) << 16; /* field at bit 0 -> bit 16 */
+	
+	*buffer++ = a | r | g | b;
+    }
+}
+
+static void
+fetch_scanline_c8 (pixman_image_t *image,
+                   int             x,
+                   int             y,
+                   int             width,
+                   uint32_t *      buffer,
+                   const uint32_t *mask) /* mask is not referenced in this function */
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; /* start of row y */
+    const pixman_indexed_t * indexed = image->bits.indexed; /* lookup table attached to the image */
+    const uint8_t *pixel = (const uint8_t *)bits + x; /* one byte per pixel */
+    const uint8_t *end = pixel + width;
+    /* Translate `width` 8-bit indices, starting at column x, through indexed->rgba[]. */
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	/* The byte is used directly as the table index. */
+	*buffer++ = indexed->rgba[p];
+    }
+}
+
+static void
+fetch_scanline_x4a4 (pixman_image_t *image,
+                     int             x,
+                     int             y,
+                     int             width,
+                     uint32_t *      buffer,
+                     const uint32_t *mask) /* mask is not referenced in this function */
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; /* start of row y */
+    const uint8_t *pixel = (const uint8_t *)bits + x; /* one byte per pixel */
+    const uint8_t *end = pixel + width;
+    /* Expand the low nibble of `width` bytes, starting at column x, into the alpha byte of each output word. */
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++) & 0xf; /* unsigned 32-bit: a uint8_t would promote to int below */
+	/* p | (p << 4) repeats the nibble; the << 24 must be unsigned so bit 31 can be set without overflow. */
+	*buffer++ = (p | (p << 4)) << 24;
+    }
+}
+
+#define FETCH_8(img,l,o)    (READ (img, (((uint8_t *)(l)) + ((o) >> 3)))) /* byte of line l that holds bit offset o */
+#ifdef WORDS_BIGENDIAN /* FETCH_4 (img, l, o): 4-bit pixel number o of line l */
+#define FETCH_4(img,l,o) /* odd pixels: low nibble, even pixels: high nibble */	\
+    (((4 * (o)) & 4) ? (FETCH_8 (img,l, 4 * (o)) & 0xf) : (FETCH_8 (img,l,(4 * (o))) >> 4))
+#else /* !WORDS_BIGENDIAN */
+#define FETCH_4(img,l,o) /* odd pixels: high nibble, even pixels: low nibble */	\
+    (((4 * (o)) & 4) ? (FETCH_8 (img, l, 4 * (o)) >> 4) : (FETCH_8 (img, l, (4 * (o))) & 0xf))
+#endif /* WORDS_BIGENDIAN */
+
+static void
+fetch_scanline_a4 (pixman_image_t *image,
+                   int             x,
+                   int             y,
+                   int             width,
+                   uint32_t *      buffer,
+                   const uint32_t *mask) /* mask is not referenced in this function */
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; /* start of row y */
+    int i;
+/* Expand `width` 4-bit alpha pixels, starting at column x, into the alpha byte of each output word. */
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t p = FETCH_4 (image, bits, i + x);
+
+	p |= p << 4; /* repeat the nibble to fill 8 bits */
+
+	*buffer++ = p << 24;
+    }
+}
+
+static void
+fetch_scanline_r1g2b1 (pixman_image_t *image,
+                       int             x,
+                       int             y,
+                       int             width,
+                       uint32_t *      buffer,
+                       const uint32_t *mask) /* mask is not referenced in this function */
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; /* start of row y */
+    int i;
+    /* Expand `width` 4-bit r1g2b1 pixels, starting at column x, into opaque a8r8g8b8 words. */
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t p = FETCH_4 (image, bits, i + x);
+	uint32_t r, g, b;
+	/* * 0xff turns one bit into 8 identical bits; * 0x55 replicates a 2-bit field to 8 bits. */
+	r = ((p & 0x8) * 0xff) << 13; /* bit 3 -> bits 16-23 */
+	g = ((p & 0x6) * 0x55) << 7; /* bits 1-2 -> bits 8-15 */
+	b = ((p & 0x1) * 0xff); /* bit 0 -> bits 0-7 */
+	
+	*buffer++ = 0xff000000 | r | g | b;
+    }
+}
+
+static void
+fetch_scanline_b1g2r1 (pixman_image_t *image,
+                       int             x,
+                       int             y,
+                       int             width,
+                       uint32_t *      buffer,
+                       const uint32_t *mask) /* mask is not referenced in this function */
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; /* start of row y */
+    int i;
+    /* Expand `width` 4-bit b1g2r1 pixels, starting at column x, into opaque a8r8g8b8 words. */
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t p = FETCH_4 (image, bits, i + x);
+	uint32_t r, g, b;
+	/* * 0xff turns one bit into 8 identical bits; * 0x55 replicates a 2-bit field to 8 bits. */
+	b = ((p & 0x8) * 0xff) >> 3; /* bit 3 -> bits 0-7 */
+	g = ((p & 0x6) * 0x55) << 7; /* bits 1-2 -> bits 8-15 */
+	r = ((p & 0x1) * 0xff) << 16; /* bit 0 -> bits 16-23 */
+
+	*buffer++ = 0xff000000 | r | g | b;
+    }
+}
+
+static void
+fetch_scanline_a1r1g1b1 (pixman_image_t *image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         uint32_t *      buffer,
+                         const uint32_t *mask) /* mask is not referenced in this function */
+{
+    uint32_t a, r, g, b;
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; /* start of row y */
+    int i;
+/* Expand `width` 4-bit a1r1g1b1 pixels, starting at column x, into a8r8g8b8 words. */
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t p = FETCH_4 (image, bits, i + x);
+/* Each channel is one bit; * 0xff widens it to 0x00 or 0xff before it is shifted into place. */
+	a = ((p & 0x8) * 0xff) << 21; /* bit 3 -> bits 24-31 */
+	r = ((p & 0x4) * 0xff) << 14; /* bit 2 -> bits 16-23 */
+	g = ((p & 0x2) * 0xff) << 7; /* bit 1 -> bits 8-15 */
+	b = ((p & 0x1) * 0xff); /* bit 0 -> bits 0-7 */
+
+	*buffer++ = a | r | g | b;
+    }
+}
+
+static void
+fetch_scanline_a1b1g1r1 (pixman_image_t *image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         uint32_t *      buffer,
+                         const uint32_t *mask) /* mask is not referenced in this function */
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; /* start of row y */
+    int i;
+/* Expand `width` 4-bit a1b1g1r1 pixels, starting at column x, into a8r8g8b8 words. */
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t p = FETCH_4 (image, bits, i + x);
+	uint32_t a, r, g, b;
+/* Each channel is one bit; * 0xff widens it to 0x00 or 0xff before it is shifted into place. */
+	a = ((p & 0x8) * 0xff) << 21; /* bit 3 -> bits 24-31 */
+	b = ((p & 0x4) * 0xff) >> 2; /* bit 2 -> bits 0-7 */
+	g = ((p & 0x2) * 0xff) << 7; /* bit 1 -> bits 8-15 */
+	r = ((p & 0x1) * 0xff) << 16; /* bit 0 -> bits 16-23 */
+
+	*buffer++ = a | r | g | b;
+    }
+}
+
+static void
+fetch_scanline_c4 (pixman_image_t *image,
+                   int             x,
+                   int             y,
+                   int             width,
+                   uint32_t *      buffer,
+                   const uint32_t *mask) /* mask is not referenced in this function */
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; /* start of row y */
+    const pixman_indexed_t * indexed = image->bits.indexed; /* lookup table attached to the image */
+    int i;
+    /* Translate `width` 4-bit indices, starting at column x, through indexed->rgba[]. */
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t p = FETCH_4 (image, bits, i + x);
+	/* The nibble is used directly as the table index. */
+	*buffer++ = indexed->rgba[p];
+    }
+}
+
+static void
+fetch_scanline_a1 (pixman_image_t *image,
+                   int             x,
+                   int             y,
+                   int             width,
+                   uint32_t *      buffer,
+                   const uint32_t *mask) /* mask is not referenced in this function */
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; /* start of row y */
+    int i;
+    /* Expand `width` 1-bit alpha pixels, starting at column x, into the alpha byte of each output word. */
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t p = READ (image, bits + ((i + x) >> 5)); /* 32-bit word holding pixel i + x */
+	uint32_t a;
+	/* Pick the pixel's bit: counted from the top bit on big-endian, from bit 0 otherwise. */
+#ifdef WORDS_BIGENDIAN
+	a = p >> (0x1f - ((i + x) & 0x1f));
+#else
+	a = p >> ((i + x) & 0x1f);
+#endif
+	a = a & 1;
+	a |= a << 1; /* 1 bit -> 2 bits */
+	a |= a << 2; /* 2 bits -> 4 bits */
+	a |= a << 4; /* 4 bits -> 8 bits: a is now 0x00 or 0xff */
+	
+	*buffer++ = a << 24;
+    }
+}
+
+static void
+fetch_scanline_g1 (pixman_image_t *image,
+                   int             x,
+                   int             y,
+                   int             width,
+                   uint32_t *      buffer,
+                   const uint32_t *mask) /* mask is not referenced in this function */
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; /* start of row y */
+    const pixman_indexed_t * indexed = image->bits.indexed; /* lookup table attached to the image */
+    int i;
+    /* Translate `width` 1-bit pixels, starting at column x, through indexed->rgba[0..1]. */
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t p = READ (image, bits + ((i + x) >> 5)); /* 32-bit word holding pixel i + x */
+	uint32_t a;
+	/* Pick the pixel's bit: counted from the top bit on big-endian, from bit 0 otherwise. */
+#ifdef WORDS_BIGENDIAN
+	a = p >> (0x1f - ((i + x) & 0x1f));
+#else
+	a = p >> ((i + x) & 0x1f);
+#endif
+	a = a & 1; /* despite its name, a is the table index here, not an alpha value */
+	
+	*buffer++ = indexed->rgba[a];
+    }
+}
+
+static void
+fetch_scanline_yuy2 (pixman_image_t *image,
+                     int             x,
+                     int             line,
+                     int             width,
+                     uint32_t *      buffer,
+                     const uint32_t *mask) /* mask is not referenced in this function */
+{
+    const uint32_t *bits = image->bits.bits + image->bits.rowstride * line; /* start of the row */
+    int i;
+    /* Convert `width` packed YUY2 pixels, starting at column x, into opaque a8r8g8b8 words. */
+    for (i = 0; i < width; i++)
+    {
+	int16_t y, u, v;
+	int32_t r, g, b;
+	/* NOTE(review): samples are read by direct dereference here, not through READ. */
+	y = ((uint8_t *) bits)[(x + i) << 1] - 16; /* luma: 2 bytes per pixel */
+	u = ((uint8_t *) bits)[(((x + i) << 1) & - 4) + 1] - 128; /* byte 1 of the 4-byte group shared by a pixel pair */
+	v = ((uint8_t *) bits)[(((x + i) << 1) & - 4) + 3] - 128; /* byte 3 of the same group */
+	
+	/* R = 1.164(Y - 16) + 1.596(V - 128) */
+	r = 0x012b27 * y + 0x019a2e * v;
+	/* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */
+	g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u;
+	/* B = 1.164(Y - 16) + 2.018(U - 128) */
+	b = 0x012b27 * y + 0x0206a2 * u;
+	/* Results carry 16 fractional bits: clamp to [0, 0x1000000) and keep the 8 integer bits. */
+	*buffer++ = 0xff000000 |
+	    (r >= 0 ? r < 0x1000000 ? r         & 0xff0000 : 0xff0000 : 0) |
+	    (g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) |
+	    (b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0);
+    }
+}
+
+static void
+fetch_scanline_yv12 (pixman_image_t *image,
+                     int             x,
+                     int             line,
+                     int             width,
+                     uint32_t *      buffer,
+                     const uint32_t *mask) /* mask is not referenced in this function */
+{
+    YV12_SETUP (image); /* macro defined outside this view; presumably sets up what YV12_Y/U/V need */
+    uint8_t *y_line = YV12_Y (line);
+    uint8_t *u_line = YV12_U (line);
+    uint8_t *v_line = YV12_V (line);
+    int i;
+    /* Convert `width` YV12 pixels, starting at column x, into opaque a8r8g8b8 words. */
+    for (i = 0; i < width; i++)
+    {
+	int16_t y, u, v;
+	int32_t r, g, b;
+/* One luma sample per pixel; each chroma sample is shared by two horizontally adjacent pixels. */
+	y = y_line[x + i] - 16;
+	u = u_line[(x + i) >> 1] - 128;
+	v = v_line[(x + i) >> 1] - 128;
+
+	/* R = 1.164(Y - 16) + 1.596(V - 128) */
+	r = 0x012b27 * y + 0x019a2e * v;
+	/* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */
+	g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u;
+	/* B = 1.164(Y - 16) + 2.018(U - 128) */
+	b = 0x012b27 * y + 0x0206a2 * u;
+/* Results carry 16 fractional bits: clamp to [0, 0x1000000) and keep the 8 integer bits. */
+	*buffer++ = 0xff000000 |
+	    (r >= 0 ? r < 0x1000000 ? r         & 0xff0000 : 0xff0000 : 0) |
+	    (g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) |
+	    (b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0);
+    }
+}
+
+/**************************** Pixel wise fetching *****************************/
+
+/* Returns the pixel widened to 16 bits per channel, packed as a:r:g:b in a uint64_t */
+static uint64_t
+fetch_pixel_a2r10g10b10 (bits_image_t *image,
+			 int		  offset,
+			 int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint32_t p = READ (image, bits + offset); /* one 32-bit word per pixel */
+    uint64_t a = p >> 30; /* 2-bit alpha */
+    uint64_t r = (p >> 20) & 0x3ff; /* 10-bit channels */
+    uint64_t g = (p >> 10) & 0x3ff;
+    uint64_t b = p & 0x3ff;
+/* Widen 10 bits to 16 by appending the top 6 bits below the value. */
+    r = r << 6 | r >> 4;
+    g = g << 6 | g >> 4;
+    b = b << 6 | b >> 4;
+/* Widen the 2-bit alpha to 16 bits by repeated doubling of the bit pattern. */
+    a <<= 14;
+    a |= a >> 2;
+    a |= a >> 4;
+    a |= a >> 8;
+
+    return a << 48 | r << 32 | g << 16 | b;
+}
+
+/* Returns the pixel widened to 16 bits per channel, packed as a:r:g:b in a uint64_t (alpha is always 0xffff) */
+static uint64_t
+fetch_pixel_x2r10g10b10 (bits_image_t *image,
+			 int	   offset,
+			 int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint32_t p = READ (image, bits + offset); /* one 32-bit word per pixel */
+    uint64_t r = (p >> 20) & 0x3ff; /* 10-bit channels; the top 2 bits of p are ignored */
+    uint64_t g = (p >> 10) & 0x3ff;
+    uint64_t b = p & 0x3ff;
+    /* Widen 10 bits to 16 by appending the top 6 bits below the value. */
+    r = r << 6 | r >> 4;
+    g = g << 6 | g >> 4;
+    b = b << 6 | b >> 4;
+    /* Alpha is forced to fully opaque (0xffff). */
+    return 0xffffULL << 48 | r << 32 | g << 16 | b;
+}
+
+/* Returns the pixel widened to 16 bits per channel, packed as a:r:g:b in a uint64_t */
+static uint64_t
+fetch_pixel_a2b10g10r10 (bits_image_t *image,
+			 int           offset,
+			 int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint32_t p = READ (image, bits + offset); /* one 32-bit word per pixel */
+    uint64_t a = p >> 30; /* 2-bit alpha */
+    uint64_t b = (p >> 20) & 0x3ff; /* 10-bit channels, blue stored highest */
+    uint64_t g = (p >> 10) & 0x3ff;
+    uint64_t r = p & 0x3ff;
+    /* Widen 10 bits to 16 by appending the top 6 bits below the value. */
+    r = r << 6 | r >> 4;
+    g = g << 6 | g >> 4;
+    b = b << 6 | b >> 4;
+    /* Widen the 2-bit alpha to 16 bits by repeated doubling of the bit pattern. */
+    a <<= 14;
+    a |= a >> 2;
+    a |= a >> 4;
+    a |= a >> 8;
+    /* The result is always ordered a:r:g:b regardless of the stored channel order. */
+    return a << 48 | r << 32 | g << 16 | b;
+}
+
+/* Returns the pixel widened to 16 bits per channel, packed as a:r:g:b in a uint64_t (alpha is always 0xffff) */
+static uint64_t
+fetch_pixel_x2b10g10r10 (bits_image_t *image,
+			 int           offset,
+			 int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint32_t p = READ (image, bits + offset); /* one 32-bit word per pixel */
+    uint64_t b = (p >> 20) & 0x3ff; /* 10-bit channels, blue stored highest; top 2 bits ignored */
+    uint64_t g = (p >> 10) & 0x3ff;
+    uint64_t r = p & 0x3ff;
+    /* Widen 10 bits to 16 by appending the top 6 bits below the value. */
+    r = r << 6 | r >> 4;
+    g = g << 6 | g >> 4;
+    b = b << 6 | b >> 4;
+    /* Alpha is forced to fully opaque (0xffff). */
+    return 0xffffULL << 48 | r << 32 | g << 16 | b;
+}
+
+static uint32_t
+fetch_pixel_a8r8g8b8 (bits_image_t *image,
+		      int           offset,
+		      int           line)
+{
+    uint32_t *row = image->bits + line * image->rowstride;	/* start of the requested line */
+    return READ (image, row + offset);	/* the stored word is returned unchanged */
+}
+
+static uint32_t
+fetch_pixel_x8r8g8b8 (bits_image_t *image,
+		      int           offset,
+		      int           line)
+{
+    uint32_t *row = image->bits + line * image->rowstride;	/* start of the requested line */
+    uint32_t xrgb = READ (image, row + offset);
+    return 0xff000000 | xrgb;	/* the top byte is forced to opaque alpha */
+}
+
+static uint32_t
+fetch_pixel_a8b8g8r8 (bits_image_t *image,
+		      int           offset,
+		      int           line)
+{
+    uint32_t *row = image->bits + line * image->rowstride;	/* start of the requested line */
+    uint32_t abgr = READ (image, row + offset);
+    uint32_t keep = abgr & 0xff00ff00;		/* alpha and green stay where they are */
+    uint32_t blue = (abgr & 0x00ff0000) >> 16;	/* stored bits 16-23 -> bits 0-7 */
+    uint32_t red = (abgr & 0x000000ff) << 16;	/* stored bits 0-7 -> bits 16-23 */
+
+    return keep | blue | red;
+}
+
+static uint32_t
+fetch_pixel_x8b8g8r8 (bits_image_t *image,
+		      int           offset,
+		      int           line)
+{
+    uint32_t *row = image->bits + line * image->rowstride;	/* start of the requested line */
+    uint32_t xbgr = READ (image, row + offset);
+    uint32_t green = xbgr & 0x0000ff00;		/* green stays where it is */
+    uint32_t blue = (xbgr & 0x00ff0000) >> 16;	/* stored bits 16-23 -> bits 0-7 */
+    uint32_t red = (xbgr & 0x000000ff) << 16;	/* stored bits 0-7 -> bits 16-23 */
+
+    return 0xff000000 | red | green | blue;	/* alpha forced to opaque */
+}
+
+static uint32_t
+fetch_pixel_b8g8r8a8 (bits_image_t *image,
+		      int           offset,
+		      int           line)
+{
+    uint32_t *row = image->bits + line * image->rowstride;	/* start of the requested line */
+    uint32_t bgra = READ (image, row + offset);
+    uint32_t a = bgra << 24;			/* stored bits 0-7 -> bits 24-31 */
+    uint32_t r = (bgra << 8) & 0x00ff0000;	/* stored bits 8-15 -> bits 16-23 */
+    uint32_t g = (bgra >> 8) & 0x0000ff00;	/* stored bits 16-23 -> bits 8-15 */
+    uint32_t b = bgra >> 24;			/* stored bits 24-31 -> bits 0-7 */
+    return a | r | g | b;			/* i.e. the four bytes in reverse order */
+}
+
+static uint32_t
+fetch_pixel_b8g8r8x8 (bits_image_t *image,
+		      int           offset,
+		      int           line)
+{
+    uint32_t *row = image->bits + line * image->rowstride;	/* start of the requested line */
+    uint32_t bgrx = READ (image, row + offset);
+    uint32_t r = (bgrx << 8) & 0x00ff0000;	/* stored bits 8-15 -> bits 16-23 */
+    uint32_t g = (bgrx >> 8) & 0x0000ff00;	/* stored bits 16-23 -> bits 8-15 */
+    uint32_t b = bgrx >> 24;			/* stored bits 24-31 -> bits 0-7 */
+
+    return 0xff000000 | r | g | b;		/* the stored low byte is ignored; alpha is opaque */
+}
+
+static uint32_t
+fetch_pixel_r8g8b8a8 (bits_image_t *image,
+		      int           offset,
+		      int           line)
+{
+    uint32_t *row = image->bits + line * image->rowstride;	/* start of the requested line */
+    uint32_t rgba = READ (image, row + offset);
+    /* Rotate right by one byte: alpha moves from the low byte to the top, r/g/b drop down. */
+    return (rgba >> 8) | (rgba << 24);
+}
+
+static uint32_t
+fetch_pixel_r8g8b8x8 (bits_image_t *image,
+		      int           offset,
+		      int           line)
+{
+    uint32_t *row = image->bits + line * image->rowstride;	/* start of the requested line */
+    uint32_t rgbx = READ (image, row + offset);
+    /* Drop the unused low byte and force the alpha byte to opaque. */
+    return (rgbx >> 8) | 0xff000000;
+}
+
+static uint32_t
+fetch_pixel_x14r6g6b6 (bits_image_t *image,
+                       int           offset,
+                       int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint32_t pixel = READ (image, (uint32_t *) bits + offset); /* one 32-bit word per pixel */
+    uint32_t r, g, b;
+/* Each 6-bit channel becomes 8 bits: the field on top, its two highest bits repeated below. */
+    r = ((pixel & 0x3f000) << 6) | ((pixel & 0x30000)); /* bits 12-17 -> 18-23, bits 16-17 stay */
+    g = ((pixel & 0x00fc0) << 4) | ((pixel & 0x00c00) >> 2); /* bits 6-11 -> 10-15, bits 10-11 -> 8-9 */
+    b = ((pixel & 0x0003f) << 2) | ((pixel & 0x00030) >> 4); /* bits 0-5 -> 2-7, bits 4-5 -> 0-1 */
+
+    return 0xff000000 | r | g | b;
+}
+
+static uint32_t
+fetch_pixel_r8g8b8 (bits_image_t *image,
+		    int           offset,
+		    int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint8_t   *pixel = ((uint8_t *) bits) + (offset * 3); /* three bytes per pixel */
+    /* Returns opaque a8r8g8b8; which byte holds red depends on the host byte order. */
+#ifdef WORDS_BIGENDIAN
+    return (0xff000000 |
+	    (READ (image, pixel + 0) << 16) | /* red is the first byte */
+	    (READ (image, pixel + 1) << 8) |
+	    (READ (image, pixel + 2)));
+#else
+    return (0xff000000 |
+	    (READ (image, pixel + 2) << 16) | /* red is the third byte */
+	    (READ (image, pixel + 1) << 8) |
+	    (READ (image, pixel + 0)));
+#endif
+}
+
+static uint32_t
+fetch_pixel_b8g8r8 (bits_image_t *image,
+		    int           offset,
+		    int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint8_t   *pixel = ((uint8_t *) bits) + (offset * 3); /* three bytes per pixel */
+#ifdef WORDS_BIGENDIAN /* returns opaque a8r8g8b8; the byte holding red depends on host byte order */
+    return (0xff000000 |
+	    (READ (image, pixel + 2) << 16) | /* red is the third byte */
+	    (READ (image, pixel + 1) << 8) |
+	    (READ (image, pixel + 0)));
+#else
+    return (0xff000000 |
+	    (READ (image, pixel + 0) << 16) | /* red is the first byte */
+	    (READ (image, pixel + 1) << 8) |
+	    (READ (image, pixel + 2)));
+#endif
+}
+
+static uint32_t
+fetch_pixel_r5g6b5 (bits_image_t *image,
+		    int           offset,
+		    int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint32_t pixel = READ (image, (uint16_t *) bits + offset); /* one 16-bit word per pixel */
+    uint32_t r, g, b;
+    /* Each channel is widened to 8 bits by repeating its highest bits below the field. */
+    r = ((pixel & 0xf800) | ((pixel & 0xe000) >> 5)) << 8; /* bits 11-15 (+ top 3) -> bits 16-23 */
+    g = ((pixel & 0x07e0) | ((pixel & 0x0600) >> 6)) << 5; /* bits 5-10 (+ top 2) -> bits 8-15 */
+    b = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) >> 2; /* bits 0-4 (+ top 3) -> bits 0-7 */
+    /* Alpha is forced to opaque. */
+    return (0xff000000 | r | g | b);
+}
+
+static uint32_t
+fetch_pixel_b5g6r5 (bits_image_t *image,
+		    int           offset,
+		    int           line)
+{
+    uint32_t r, g, b;
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint32_t pixel = READ (image, (uint16_t *) bits + offset); /* one 16-bit word per pixel */
+    /* Each channel is widened to 8 bits by repeating its highest bits below the field. */
+    b = ((pixel & 0xf800) | ((pixel & 0xe000) >> 5)) >> 8; /* bits 11-15 (+ top 3) -> bits 0-7 */
+    g = ((pixel & 0x07e0) | ((pixel & 0x0600) >> 6)) << 5; /* bits 5-10 (+ top 2) -> bits 8-15 */
+    r = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) << 14; /* bits 0-4 (+ top 3) -> bits 16-23 */
+    /* Alpha is forced to opaque. */
+    return (0xff000000 | r | g | b);
+}
+
+static uint32_t
+fetch_pixel_a1r5g5b5 (bits_image_t *image,
+		      int           offset,
+		      int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint32_t pixel = READ (image, (uint16_t *) bits + offset); /* one 16-bit word per pixel */
+    uint32_t a, r, g, b;
+    /* 0 - bit is 0 or all ones; truncating to uint8_t gives 0x00 or 0xff for alpha. */
+    a = (uint32_t) ((uint8_t) (0 - ((pixel & 0x8000) >> 15))) << 24;
+    r = ((pixel & 0x7c00) | ((pixel & 0x7000) >> 5)) << 9; /* bits 10-14 (+ top 3) -> bits 16-23 */
+    g = ((pixel & 0x03e0) | ((pixel & 0x0380) >> 5)) << 6; /* bits 5-9 (+ top 3) -> bits 8-15 */
+    b = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) >> 2; /* bits 0-4 (+ top 3) -> bits 0-7 */
+    
+    return (a | r | g | b);
+}
+
+static uint32_t
+fetch_pixel_x1r5g5b5 (bits_image_t *image,
+		      int           offset,
+		      int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint32_t pixel = READ (image, (uint16_t *) bits + offset); /* one 16-bit word per pixel */
+    uint32_t r, g, b;
+    /* Each 5-bit channel is widened to 8 bits by repeating its top 3 bits below the field. */
+    r = ((pixel & 0x7c00) | ((pixel & 0x7000) >> 5)) << 9; /* bits 10-14 -> bits 16-23 */
+    g = ((pixel & 0x03e0) | ((pixel & 0x0380) >> 5)) << 6; /* bits 5-9 -> bits 8-15 */
+    b = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) >> 2; /* bits 0-4 -> bits 0-7 */
+    /* Bit 15 is ignored; alpha is forced to opaque. */
+    return (0xff000000 | r | g | b);
+}
+
+static uint32_t
+fetch_pixel_a1b5g5r5 (bits_image_t *image,
+		      int           offset,
+		      int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint32_t pixel = READ (image, (uint16_t *) bits + offset); /* one 16-bit word per pixel */
+    uint32_t a, r, g, b;
+    /* 0 - bit is 0 or all ones; truncating to uint8_t gives 0x00 or 0xff for alpha. */
+    a = (uint32_t) ((uint8_t) (0 - ((pixel & 0x8000) >> 15))) << 24;
+    b = ((pixel & 0x7c00) | ((pixel & 0x7000) >> 5)) >> 7; /* bits 10-14 (+ top 3) -> bits 0-7 */
+    g = ((pixel & 0x03e0) | ((pixel & 0x0380) >> 5)) << 6; /* bits 5-9 (+ top 3) -> bits 8-15 */
+    r = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) << 14; /* bits 0-4 (+ top 3) -> bits 16-23 */
+    
+    return (a | r | g | b);
+}
+
+static uint32_t
+fetch_pixel_x1b5g5r5 (bits_image_t *image,
+		      int           offset,
+		      int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint32_t pixel = READ (image, (uint16_t *) bits + offset); /* one 16-bit word per pixel */
+    uint32_t r, g, b;
+    /* Each 5-bit channel is widened to 8 bits by repeating its top 3 bits below the field. */
+    b = ((pixel & 0x7c00) | ((pixel & 0x7000) >> 5)) >> 7; /* bits 10-14 -> bits 0-7 */
+    g = ((pixel & 0x03e0) | ((pixel & 0x0380) >> 5)) << 6; /* bits 5-9 -> bits 8-15 */
+    r = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) << 14; /* bits 0-4 -> bits 16-23 */
+    /* Bit 15 is ignored; alpha is forced to opaque. */
+    return (0xff000000 | r | g | b);
+}
+
+static uint32_t
+fetch_pixel_a4r4g4b4 (bits_image_t *image,
+		      int           offset,
+		      int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint32_t pixel = READ (image, (uint16_t *) bits + offset); /* one 16-bit word per pixel */
+    uint32_t a, r, g, b;
+    /* Each 4-bit channel is doubled into 8 bits (nibble n becomes n * 0x11). */
+    a = ((pixel & 0xf000) | ((pixel & 0xf000) >> 4)) << 16; /* bits 12-15 -> bits 24-31 */
+    r = ((pixel & 0x0f00) | ((pixel & 0x0f00) >> 4)) << 12; /* bits 8-11 -> bits 16-23 */
+    g = ((pixel & 0x00f0) | ((pixel & 0x00f0) >> 4)) << 8; /* bits 4-7 -> bits 8-15 */
+    b = ((pixel & 0x000f) | ((pixel & 0x000f) << 4)); /* bits 0-3 -> bits 0-7 */
+    
+    return (a | r | g | b);
+}
+
+static uint32_t
+fetch_pixel_x4r4g4b4 (bits_image_t *image,
+		      int           offset,
+		      int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint32_t pixel = READ (image, (uint16_t *) bits + offset); /* one 16-bit word per pixel */
+    uint32_t r, g, b;
+    /* Each 4-bit channel is doubled into 8 bits (nibble n becomes n * 0x11). */
+    r = ((pixel & 0x0f00) | ((pixel & 0x0f00) >> 4)) << 12; /* bits 8-11 -> bits 16-23 */
+    g = ((pixel & 0x00f0) | ((pixel & 0x00f0) >> 4)) << 8; /* bits 4-7 -> bits 8-15 */
+    b = ((pixel & 0x000f) | ((pixel & 0x000f) << 4)); /* bits 0-3 -> bits 0-7 */
+    /* The top nibble is ignored; alpha is forced to opaque. */
+    return (0xff000000 | r | g | b);
+}
+
+static uint32_t
+fetch_pixel_a4b4g4r4 (bits_image_t *image,
+		      int           offset,
+		      int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint32_t pixel = READ (image, (uint16_t *) bits + offset); /* one 16-bit word per pixel */
+    uint32_t a, r, g, b;
+    /* Each 4-bit channel is doubled into 8 bits (nibble n becomes n * 0x11). */
+    a = ((pixel & 0xf000) | ((pixel & 0xf000) >> 4)) << 16; /* bits 12-15 -> bits 24-31 */
+    b = ((pixel & 0x0f00) | ((pixel & 0x0f00) >> 4)) >> 4; /* bits 8-11 -> bits 0-7 */
+    g = ((pixel & 0x00f0) | ((pixel & 0x00f0) >> 4)) << 8; /* bits 4-7 -> bits 8-15 */
+    r = ((pixel & 0x000f) | ((pixel & 0x000f) << 4)) << 16; /* bits 0-3 -> bits 16-23 */
+    
+    return (a | r | g | b);
+}
+
+static uint32_t
+fetch_pixel_x4b4g4r4 (bits_image_t *image,
+		      int           offset,
+		      int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint32_t pixel = READ (image, (uint16_t *) bits + offset); /* one 16-bit word per pixel */
+    uint32_t r, g, b;
+    /* Each 4-bit channel is doubled into 8 bits (nibble n becomes n * 0x11). */
+    b = ((pixel & 0x0f00) | ((pixel & 0x0f00) >> 4)) >> 4; /* bits 8-11 -> bits 0-7 */
+    g = ((pixel & 0x00f0) | ((pixel & 0x00f0) >> 4)) << 8; /* bits 4-7 -> bits 8-15 */
+    r = ((pixel & 0x000f) | ((pixel & 0x000f) << 4)) << 16; /* bits 0-3 -> bits 16-23 */
+    /* The top nibble is ignored; alpha is forced to opaque. */
+    return (0xff000000 | r | g | b);
+}
+
+static uint32_t
+fetch_pixel_a8 (bits_image_t *image,
+		int           offset,
+		int           line)
+{
+    uint8_t *row = (uint8_t *) (image->bits + line * image->rowstride);	/* one byte per pixel */
+    uint32_t alpha = READ (image, row + offset);
+    /* The byte becomes the alpha channel; the colour channels are left at zero. */
+    return alpha << 24;
+}
+
+static uint32_t
+fetch_pixel_r3g3b2 (bits_image_t *image,
+		    int           offset,
+		    int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint32_t pixel = READ (image, (uint8_t *) bits + offset); /* one byte per pixel */
+    uint32_t r, g, b;
+    /* Red: bits 5-7, followed by the same 3 bits, then its top 2 bits -> 8 bits. */
+    r = ((pixel & 0xe0) |
+	 ((pixel & 0xe0) >> 3) |
+	 ((pixel & 0xc0) >> 6)) << 16;
+    /* Green: bits 2-4 moved to the top, repeated below, then its top 2 bits -> 8 bits. */
+    g = ((pixel & 0x1c) |
+	 ((pixel & 0x18) >> 3) |
+	 ((pixel & 0x1c) << 3)) << 8;
+    /* Blue: the 2-bit field repeated four times. */
+    b = (((pixel & 0x03)     ) |
+	 ((pixel & 0x03) << 2) |
+	 ((pixel & 0x03) << 4) |
+	 ((pixel & 0x03) << 6));
+    /* Alpha is forced to opaque. */
+    return (0xff000000 | r | g | b);
+}
+
+static uint32_t
+fetch_pixel_b2g3r3 (bits_image_t *image,
+		    int           offset,
+		    int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint32_t p = READ (image, (uint8_t *) bits + offset); /* one byte per pixel */
+    uint32_t r, g, b;
+/* Blue: 2 bits at the top of the byte, replicated downwards to fill 8 bits. */
+    b  = p & 0xc0;
+    b |= b >> 2;
+    b |= b >> 4;
+    b &= 0xff;
+/* Green: bits 3-5 moved to bits 13-15, replicated downwards, then trimmed to bits 8-15. */
+    g  = (p & 0x38) << 10;
+    g |= g >> 3;
+    g |= g >> 6;
+    g &= 0xff00;
+/* Red: bits 0-2 moved to bits 21-23, replicated downwards, then trimmed to bits 16-23. */
+    r  = (p & 0x7) << 21;
+    r |= r >> 3;
+    r |= r >> 6;
+    r &= 0xff0000;
+/* Alpha is forced to opaque. */
+    return 0xff000000 | r | g | b;
+}
+
+static uint32_t
+fetch_pixel_a2r2g2b2 (bits_image_t *image,
+		      int           offset,
+		      int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint32_t pixel = READ (image, (uint8_t *) bits + offset); /* one byte per pixel */
+    uint32_t a, r, g, b;
+    /* Multiplying a 2-bit field by 0x55 replicates it to 8 bits. */
+    a = ((pixel & 0xc0) * 0x55) << 18; /* field at bit 6 -> bit 24 */
+    r = ((pixel & 0x30) * 0x55) << 12; /* field at bit 4 -> bit 16 */
+    g = ((pixel & 0x0c) * 0x55) << 6; /* field at bit 2 -> bit 8 */
+    b = ((pixel & 0x03) * 0x55); /* field already at bit 0 */
+    
+    return a | r | g | b;
+}
+
+static uint32_t
+fetch_pixel_a2b2g2r2 (bits_image_t *image,
+		      int           offset,
+		      int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint32_t pixel = READ (image, (uint8_t *) bits + offset); /* one byte per pixel */
+    uint32_t a, r, g, b;
+    /* Multiplying a 2-bit field by 0x55 replicates it to 8 bits. */
+    a = ((pixel & 0xc0) * 0x55) << 18; /* field at bit 6 -> bit 24 */
+    b = ((pixel & 0x30) * 0x55) >> 4; /* field at bit 4 -> bit 0 */
+    g = ((pixel & 0x0c) * 0x55) << 6; /* field at bit 2 -> bit 8 */
+    r = ((pixel & 0x03) * 0x55) << 16; /* field at bit 0 -> bit 16 */
+    
+    return a | r | g | b;
+}
+
+static uint32_t
+fetch_pixel_c8 (bits_image_t *image,
+		int           offset,
+		int           line)
+{
+    uint8_t *row = (uint8_t *) (image->bits + line * image->rowstride);	/* one byte per pixel */
+    uint32_t index = READ (image, row + offset);
+    const pixman_indexed_t *palette = image->indexed;
+    /* The stored byte selects an entry of the image's rgba lookup table. */
+    return palette->rgba[index];
+}
+
+static uint32_t
+fetch_pixel_x4a4 (bits_image_t *image,
+		  int           offset,
+		  int           line)
+{
+    uint8_t *row = (uint8_t *) (image->bits + line * image->rowstride);	/* one byte per pixel */
+    uint32_t nibble = READ (image, row + offset) & 0xf;	/* the high nibble is ignored */
+    /* n * 0x11 equals n | (n << 4): the 4-bit alpha repeated to fill 8 bits. */
+    return (nibble * 0x11) << 24;
+}
+
+static uint32_t
+fetch_pixel_a4 (bits_image_t *image,
+		int           offset,
+		int           line)
+{
+    uint32_t *row = image->bits + line * image->rowstride;	/* start of the requested line */
+    uint32_t nibble = FETCH_4 (image, row, offset);
+
+    /* n * 0x11 equals n | (n << 4): the 4-bit alpha repeated to fill 8 bits. */
+    return (nibble * 0x11) << 24;
+}
+
+static uint32_t
+fetch_pixel_r1g2b1 (bits_image_t *image,
+		    int           offset,
+		    int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint32_t pixel = FETCH_4 (image, bits, offset); /* 4 bits per pixel */
+    uint32_t r, g, b;
+    /* * 0xff turns one bit into 8 identical bits; * 0x55 replicates a 2-bit field to 8 bits. */
+    r = ((pixel & 0x8) * 0xff) << 13; /* bit 3 -> bits 16-23 */
+    g = ((pixel & 0x6) * 0x55) << 7; /* bits 1-2 -> bits 8-15 */
+    b = ((pixel & 0x1) * 0xff); /* bit 0 -> bits 0-7 */
+    /* Alpha is forced to opaque. */
+    return 0xff000000 | r | g | b;
+}
+
+static uint32_t
+fetch_pixel_b1g2r1 (bits_image_t *image,
+		    int           offset,
+		    int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint32_t pixel = FETCH_4 (image, bits, offset); /* 4 bits per pixel */
+    uint32_t r, g, b;
+    /* * 0xff turns one bit into 8 identical bits; * 0x55 replicates a 2-bit field to 8 bits. */
+    b = ((pixel & 0x8) * 0xff) >> 3; /* bit 3 -> bits 0-7 */
+    g = ((pixel & 0x6) * 0x55) << 7; /* bits 1-2 -> bits 8-15 */
+    r = ((pixel & 0x1) * 0xff) << 16; /* bit 0 -> bits 16-23 */
+    /* Alpha is forced to opaque. */
+    return 0xff000000 | r | g | b;
+}
+
+static uint32_t
+fetch_pixel_a1r1g1b1 (bits_image_t *image,
+		      int           offset,
+		      int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint32_t pixel = FETCH_4 (image, bits, offset); /* 4 bits per pixel */
+    uint32_t a, r, g, b;
+/* Each channel is one bit; * 0xff widens it to 0x00 or 0xff before it is shifted into place. */
+    a = ((pixel & 0x8) * 0xff) << 21; /* bit 3 -> bits 24-31 */
+    r = ((pixel & 0x4) * 0xff) << 14; /* bit 2 -> bits 16-23 */
+    g = ((pixel & 0x2) * 0xff) << 7; /* bit 1 -> bits 8-15 */
+    b = ((pixel & 0x1) * 0xff); /* bit 0 -> bits 0-7 */
+
+    return a | r | g | b;
+}
+
+static uint32_t
+fetch_pixel_a1b1g1r1 (bits_image_t *image,
+		      int           offset,
+		      int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint32_t pixel = FETCH_4 (image, bits, offset); /* 4 bits per pixel */
+    uint32_t a, r, g, b;
+/* Each channel is one bit; * 0xff widens it to 0x00 or 0xff before it is shifted into place. */
+    a = ((pixel & 0x8) * 0xff) << 21; /* bit 3 -> bits 24-31 */
+    b = ((pixel & 0x4) * 0xff) >> 2; /* bit 2 -> bits 0-7 */
+    g = ((pixel & 0x2) * 0xff) << 7; /* bit 1 -> bits 8-15 */
+    r = ((pixel & 0x1) * 0xff) << 16; /* bit 0 -> bits 16-23 */
+
+    return a | r | g | b;
+}
+
+static uint32_t
+fetch_pixel_c4 (bits_image_t *image,
+		int           offset,
+		int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint32_t pixel = FETCH_4 (image, bits, offset); /* 4-bit index of the pixel */
+    const pixman_indexed_t * indexed = image->indexed; /* lookup table attached to the image */
+/* The nibble is used directly as the table index. */
+    return indexed->rgba[pixel];
+}
+
+static uint32_t
+fetch_pixel_a1 (bits_image_t *image,
+		int           offset,
+		int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint32_t pixel = READ (image, bits + (offset >> 5)); /* 32-bit word holding the pixel's bit */
+    uint32_t a;
+    /* Pick the pixel's bit: counted from the top bit on big-endian, from bit 0 otherwise. */
+#ifdef WORDS_BIGENDIAN
+    a = pixel >> (0x1f - (offset & 0x1f));
+#else
+    a = pixel >> (offset & 0x1f);
+#endif
+    a = a & 1;
+    a |= a << 1; /* 1 bit -> 2 bits */
+    a |= a << 2; /* 2 bits -> 4 bits */
+    a |= a << 4; /* 4 bits -> 8 bits: a is now 0x00 or 0xff */
+    
+    return a << 24;
+}
+
+static uint32_t
+fetch_pixel_g1 (bits_image_t *image,
+		int           offset,
+		int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride; /* start of the requested line */
+    uint32_t pixel = READ (image, bits + (offset >> 5)); /* 32-bit word holding the pixel's bit */
+    const pixman_indexed_t * indexed = image->indexed; /* lookup table attached to the image */
+    uint32_t a;
+    /* Pick the pixel's bit: counted from the top bit on big-endian, from bit 0 otherwise. */
+#ifdef WORDS_BIGENDIAN
+    a = pixel >> (0x1f - (offset & 0x1f));
+#else
+    a = pixel >> (offset & 0x1f);
+#endif
+    a = a & 1; /* despite its name, a is the table index here, not an alpha value */
+    
+    return indexed->rgba[a];
+}
+
+static uint32_t
+fetch_pixel_yuy2 (bits_image_t *image,
+		  int           offset,
+		  int           line)
+{
+    const uint32_t *bits = image->bits + image->rowstride * line; /* start of the requested line */
+    /* Converts one packed YUY2 pixel to opaque a8r8g8b8. */
+    int16_t y, u, v;
+    int32_t r, g, b;
+    /* NOTE(review): samples are read by direct dereference here, not through READ. */
+    y = ((uint8_t *) bits)[offset << 1] - 16; /* luma: 2 bytes per pixel */
+    u = ((uint8_t *) bits)[((offset << 1) & - 4) + 1] - 128; /* byte 1 of the 4-byte group shared by a pixel pair */
+    v = ((uint8_t *) bits)[((offset << 1) & - 4) + 3] - 128; /* byte 3 of the same group */
+    
+    /* R = 1.164(Y - 16) + 1.596(V - 128) */
+    r = 0x012b27 * y + 0x019a2e * v;
+    
+    /* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */
+    g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u;
+    
+    /* B = 1.164(Y - 16) + 2.018(U - 128) */
+    b = 0x012b27 * y + 0x0206a2 * u;
+    /* Results carry 16 fractional bits: clamp to [0, 0x1000000) and keep the 8 integer bits. */
+    return 0xff000000 |
+	(r >= 0 ? r < 0x1000000 ? r         & 0xff0000 : 0xff0000 : 0) |
+	(g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) |
+	(b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0);
+}
+
+static uint32_t
+fetch_pixel_yv12 (bits_image_t *image,
+		  int           offset,
+		  int           line)
+{
+    YV12_SETUP (image); /* macro defined outside this view; presumably sets up what YV12_Y/U/V need */
+    int16_t y = YV12_Y (line)[offset] - 16; /* one luma sample per pixel */
+    int16_t u = YV12_U (line)[offset >> 1] - 128; /* chroma shared by two horizontal pixels */
+    int16_t v = YV12_V (line)[offset >> 1] - 128;
+    int32_t r, g, b;
+    /* Converts one YV12 pixel to opaque a8r8g8b8. */
+    /* R = 1.164(Y - 16) + 1.596(V - 128) */
+    r = 0x012b27 * y + 0x019a2e * v;
+    
+    /* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */
+    g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u;
+    
+    /* B = 1.164(Y - 16) + 2.018(U - 128) */
+    b = 0x012b27 * y + 0x0206a2 * u;
+    /* Results carry 16 fractional bits: clamp to [0, 0x1000000) and keep the 8 integer bits. */
+    return 0xff000000 |
+	(r >= 0 ? r < 0x1000000 ? r         & 0xff0000 : 0xff0000 : 0) |
+	(g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) |
+	(b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0);
+}
+
+/*********************************** Store ************************************/
+
+#define SPLIT_A(v) /* declares locals a, r, g, b holding the four bytes of v */ \
+    uint32_t a = ((v) >> 24),   \
+	r = ((v) >> 16) & 0xff, \
+	g = ((v) >> 8) & 0xff,  \
+	b = (v) & 0xff
+/* Both macros expand to a declaration, so they introduce new names into the enclosing scope. */
+#define SPLIT(v) /* declares locals r, g, b; the top byte of v is ignored */ \
+    uint32_t r = ((v) >> 16) & 0xff, \
+	g = ((v) >> 8) & 0xff,       \
+	b = (v) & 0xff
+
+static void
+store_scanline_a2r10g10b10 (bits_image_t *  image,
+                            int             x,
+                            int             y,
+                            int             width,
+                            const uint32_t *v) /* read as one uint64_t per pixel, despite the declared type */
+{
+    uint32_t *bits = image->bits + image->rowstride * y; /* start of row y */
+    uint32_t *pixel = bits + x; /* one 32-bit word per pixel */
+    const uint64_t *values = (const uint64_t *)v; /* keep the caller's buffer read-only */
+    int i;
+    /* Pack `width` 64-bit pixels (16 bits per channel) into a2r10g10b10 words, keeping each channel's top bits. */
+    for (i = 0; i < width; ++i)
+    {
+	WRITE (image, pixel++,
+	       ((values[i] >> 32) & 0xc0000000) | /* source bits 62-63 -> bits 30-31 (alpha) */
+	       ((values[i] >> 18) & 0x3ff00000) | /* source bits 38-47 -> bits 20-29 (red) */
+	       ((values[i] >> 12) & 0xffc00) | /* source bits 22-31 -> bits 10-19 (green) */
+	       ((values[i] >> 6) & 0x3ff)); /* source bits 6-15 -> bits 0-9 (blue) */
+    }
+}
+
+static void
+store_scanline_x2r10g10b10 (bits_image_t *  image,
+                            int             x,
+                            int             y,
+                            int             width,
+                            const uint32_t *v) /* read as one uint64_t per pixel, despite the declared type */
+{
+    uint32_t *bits = image->bits + image->rowstride * y; /* start of row y */
+    const uint64_t *values = (const uint64_t *)v; /* keep the caller's buffer read-only */
+    uint32_t *pixel = bits + x; /* one 32-bit word per pixel */
+    int i;
+    /* Pack `width` 64-bit pixels into x2r10g10b10 words; the top two bits are written as zero. */
+    for (i = 0; i < width; ++i)
+    {
+	WRITE (image, pixel++,
+	       ((values[i] >> 18) & 0x3ff00000) | /* source bits 38-47 -> bits 20-29 (red) */
+	       ((values[i] >> 12) & 0xffc00) | /* source bits 22-31 -> bits 10-19 (green) */
+	       ((values[i] >> 6) & 0x3ff)); /* source bits 6-15 -> bits 0-9 (blue) */
+    }
+}
+
+static void
+store_scanline_a2b10g10r10 (bits_image_t *  image,
+                            int             x,
+                            int             y,
+                            int             width,
+                            const uint32_t *v) /* read as one uint64_t per pixel, despite the declared type */
+{
+    uint32_t *bits = image->bits + image->rowstride * y; /* start of row y */
+    uint32_t *pixel = bits + x; /* one 32-bit word per pixel */
+    const uint64_t *values = (const uint64_t *)v; /* keep the caller's buffer read-only */
+    int i;
+    /* Pack `width` 64-bit pixels (16 bits per channel) into a2b10g10r10 words, keeping each channel's top bits. */
+    for (i = 0; i < width; ++i)
+    {
+	WRITE (image, pixel++,
+	       ((values[i] >> 32) & 0xc0000000) | /* source bits 62-63 -> bits 30-31 (alpha) */
+	       ((values[i] >> 38) & 0x3ff) | /* source bits 38-47 -> bits 0-9 (red) */
+	       ((values[i] >> 12) & 0xffc00) | /* source bits 22-31 -> bits 10-19 (green) */
+	       ((values[i] << 14) & 0x3ff00000)); /* source bits 6-15 -> bits 20-29 (blue) */
+    }
+}
+
+static void
+store_scanline_x2b10g10r10 (bits_image_t *  image,
+                            int             x,
+                            int             y,
+                            int             width,
+                            const uint32_t *v) /* read as one uint64_t per pixel, despite the declared type */
+{
+    uint32_t *bits = image->bits + image->rowstride * y; /* start of row y */
+    const uint64_t *values = (const uint64_t *)v; /* keep the caller's buffer read-only */
+    uint32_t *pixel = bits + x; /* one 32-bit word per pixel */
+    int i;
+    /* Pack `width` 64-bit pixels into x2b10g10r10 words; the top two bits are written as zero. */
+    for (i = 0; i < width; ++i)
+    {
+	WRITE (image, pixel++,
+	       ((values[i] >> 38) & 0x3ff) | /* source bits 38-47 -> bits 0-9 (red) */
+	       ((values[i] >> 12) & 0xffc00) | /* source bits 22-31 -> bits 10-19 (green) */
+	       ((values[i] << 14) & 0x3ff00000)); /* source bits 6-15 -> bits 20-29 (blue) */
+    }
+}
+
+static void
+store_scanline_a8r8g8b8 (bits_image_t *  image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         const uint32_t *values)
+{
+    uint32_t *bits = image->bits + image->rowstride * y; /* start of row y */
+    /* No conversion: `width` 32-bit words are copied to column x via MEMCPY_WRAPPED (defined outside this view). */
+    MEMCPY_WRAPPED (image, ((uint32_t *)bits) + x, values,
+                    width * sizeof(uint32_t));
+}
+
+/*
+ * Store a scanline as x8r8g8b8: each value is written with its top byte
+ * cleared and its low 24 bits unchanged.
+ */
+static void
+store_scanline_x8r8g8b8 (bits_image_t *  image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         const uint32_t *values)
+{
+    uint32_t *dst = image->bits + image->rowstride * y + x;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	uint32_t rgb = values[n] & 0xffffff;
+
+	WRITE (image, dst + n, rgb);
+    }
+}
+
+/*
+ * Store a scanline as a8b8g8r8: bits 8-15 and 24-31 of each value stay in
+ * place, while the low byte and bits 16-23 trade places.
+ */
+static void
+store_scanline_a8b8g8r8 (bits_image_t *  image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         const uint32_t *values)
+{
+    uint32_t *dst = image->bits + image->rowstride * y + x;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	uint32_t s = values[n];
+	uint32_t kept = s & 0xff00ff00;
+	uint32_t to_low = (s >> 16) & 0xff;
+	uint32_t to_high = (s & 0xff) << 16;
+
+	WRITE (image, dst + n, kept | to_low | to_high);
+    }
+}
+
+/*
+ * Store a scanline as x8b8g8r8: bits 8-15 of each value stay in place, the
+ * low byte and bits 16-23 trade places, and the top byte is written as zero.
+ */
+static void
+store_scanline_x8b8g8r8 (bits_image_t *  image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         const uint32_t *values)
+{
+    uint32_t *dst = image->bits + image->rowstride * y + x;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	uint32_t s = values[n];
+	uint32_t kept = s & 0x0000ff00;
+	uint32_t to_low = (s >> 16) & 0xff;
+	uint32_t to_high = (s & 0xff) << 16;
+
+	WRITE (image, dst + n, kept | to_low | to_high);
+    }
+}
+
+/*
+ * Store a scanline as b8g8r8a8: the four bytes of each value are written in
+ * reversed order (byte 3 -> byte 0, byte 2 -> byte 1, and so on).
+ */
+static void
+store_scanline_b8g8r8a8 (bits_image_t *  image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         const uint32_t *values)
+{
+    uint32_t *dst = image->bits + image->rowstride * y + x;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	uint32_t s = values[n];
+
+	WRITE (image, dst + n,
+	       ((s << 24) & 0xff000000) |
+	       ((s <<  8) & 0x00ff0000) |
+	       ((s >>  8) & 0x0000ff00) |
+	       ((s >> 24) & 0x000000ff));
+    }
+}
+
+/*
+ * Store a scanline as b8g8r8x8: the low three bytes of each value are
+ * written in reversed order into the top three bytes of the stored word;
+ * the low byte of the stored word is zero.
+ */
+static void
+store_scanline_b8g8r8x8 (bits_image_t *  image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         const uint32_t *values)
+{
+    uint32_t *dst = image->bits + image->rowstride * y + x;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	uint32_t s = values[n];
+
+	WRITE (image, dst + n,
+	       ((s << 24) & 0xff000000) |
+	       ((s <<  8) & 0x00ff0000) |
+	       ((s >>  8) & 0x0000ff00));
+    }
+}
+
+/*
+ * Store a scanline as r8g8b8a8: each value is rotated left by one byte, so
+ * its top byte ends up in the low byte of the stored word.
+ */
+static void
+store_scanline_r8g8b8a8 (bits_image_t *  image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         const uint32_t *values)
+{
+    uint32_t *dst = image->bits + image->rowstride * y + x;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	uint32_t s = values[n];
+	uint32_t top_byte = (s >> 24) & 0x000000ff;
+
+	WRITE (image, dst + n, top_byte | (s << 8));
+    }
+}
+
+/*
+ * Store a scanline as r8g8b8x8: each value is shifted left by one byte,
+ * which drops its top byte and leaves the low byte of the stored word zero.
+ */
+static void
+store_scanline_r8g8b8x8 (bits_image_t *  image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         const uint32_t *values)
+{
+    uint32_t *dst = image->bits + image->rowstride * y + x;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	uint32_t shifted = values[n] << 8;
+
+	WRITE (image, dst + n, shifted);
+    }
+}
+
+/*
+ * Store a scanline as x14r6g6b6: the top six bits of each of the three low
+ * bytes of a value are packed into bits 12-17, 6-11 and 0-5 of the stored
+ * 32-bit word; the remaining 14 bits are zero.
+ */
+static void
+store_scanline_x14r6g6b6 (bits_image_t *  image,
+                          int             x,
+                          int             y,
+                          int             width,
+                          const uint32_t *values)
+{
+    uint32_t *dst = image->bits + image->rowstride * y + x;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	uint32_t s = values[n];
+	uint32_t packed;
+
+	packed = ((s & 0x0000fc) >> 2) |
+	         ((s & 0x00fc00) >> 4) |
+	         ((s & 0xfc0000) >> 6);
+
+	WRITE (image, dst + n, packed);
+    }
+}
+
+/*
+ * Store a scanline as 24-bit r8g8b8, three bytes per pixel starting at byte
+ * 3 * x of row y.  The low three bytes of each value are written one at a
+ * time: most significant first when WORDS_BIGENDIAN is defined, least
+ * significant first otherwise.
+ */
+static void
+store_scanline_r8g8b8 (bits_image_t *  image,
+                       int             x,
+                       int             y,
+                       int             width,
+                       const uint32_t *values)
+{
+    uint8_t *dst = (uint8_t *) (image->bits + image->rowstride * y) + 3 * x;
+    int n;
+
+    for (n = 0; n < width; n++, dst += 3)
+    {
+	uint32_t val = values[n];
+	uint32_t hi = (val >> 16) & 0xff;
+	uint32_t mid = (val >> 8) & 0xff;
+	uint32_t lo = val & 0xff;
+
+#ifdef WORDS_BIGENDIAN
+	WRITE (image, dst + 0, hi);
+	WRITE (image, dst + 1, mid);
+	WRITE (image, dst + 2, lo);
+#else
+	WRITE (image, dst + 0, lo);
+	WRITE (image, dst + 1, mid);
+	WRITE (image, dst + 2, hi);
+#endif
+    }
+}
+
+/*
+ * Store a scanline as 24-bit b8g8r8, three bytes per pixel starting at byte
+ * 3 * x of row y.  The low three bytes of each value are written one at a
+ * time: least significant first when WORDS_BIGENDIAN is defined, most
+ * significant first otherwise (the reverse of store_scanline_r8g8b8).
+ */
+static void
+store_scanline_b8g8r8 (bits_image_t *  image,
+                       int             x,
+                       int             y,
+                       int             width,
+                       const uint32_t *values)
+{
+    uint8_t *dst = (uint8_t *) (image->bits + image->rowstride * y) + 3 * x;
+    int n;
+
+    for (n = 0; n < width; n++, dst += 3)
+    {
+	uint32_t val = values[n];
+	uint32_t hi = (val >> 16) & 0xff;
+	uint32_t mid = (val >> 8) & 0xff;
+	uint32_t lo = val & 0xff;
+
+#ifdef WORDS_BIGENDIAN
+	WRITE (image, dst + 0, lo);
+	WRITE (image, dst + 1, mid);
+	WRITE (image, dst + 2, hi);
+#else
+	WRITE (image, dst + 0, hi);
+	WRITE (image, dst + 1, mid);
+	WRITE (image, dst + 2, lo);
+#endif
+    }
+}
+
+/*
+ * Store a scanline as 16-bit r5g6b5: bits 19-23, 10-15 and 3-7 of each value
+ * become the 5-, 6- and 5-bit fields of the stored halfword, from the top
+ * down.
+ */
+static void
+store_scanline_r5g6b5 (bits_image_t *  image,
+                       int             x,
+                       int             y,
+                       int             width,
+                       const uint32_t *values)
+{
+    uint16_t *dst = (uint16_t *) (image->bits + image->rowstride * y) + x;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	uint32_t s = values[n];
+	uint32_t hi = (s >> 8) & 0xf800;
+	uint32_t mid = (s >> 5) & 0x07e0;
+	uint32_t lo = (s >> 3) & 0x001f;
+
+	WRITE (image, dst + n, lo | mid | hi);
+    }
+}
+
+/*
+ * Store a scanline as 16-bit b5g6r5.  SPLIT brings r, g and b into scope for
+ * the current value (presumably its 8-bit channels -- confirm against the
+ * macro); b fills bits 11-15, g bits 5-10 and r the low bits.
+ */
+static void
+store_scanline_b5g6r5 (bits_image_t *  image,
+                       int             x,
+                       int             y,
+                       int             width,
+                       const uint32_t *values)
+{
+    uint16_t *dst = (uint16_t *) (image->bits + image->rowstride * y) + x;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	SPLIT (values[n]);
+
+	WRITE (image, dst + n,
+	       ((r >> 3)         ) |
+	       ((g << 3) & 0x07e0) |
+	       ((b << 8) & 0xf800));
+    }
+}
+
+/*
+ * Store a scanline as 16-bit a1r5g5b5.  SPLIT_A brings a, r, g and b into
+ * scope for the current value (presumably its 8-bit channels -- confirm
+ * against the macro); a supplies bit 15, r bits 10-14, g bits 5-9 and b the
+ * low bits.
+ */
+static void
+store_scanline_a1r5g5b5 (bits_image_t *  image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         const uint32_t *values)
+{
+    uint16_t *dst = (uint16_t *) (image->bits + image->rowstride * y) + x;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	SPLIT_A (values[n]);
+
+	WRITE (image, dst + n,
+	       ((b >> 3)         ) |
+	       ((g << 2) & 0x03e0) |
+	       ((r << 7) & 0x7c00) |
+	       ((a << 8) & 0x8000));
+    }
+}
+
+/*
+ * Store a scanline as 16-bit x1r5g5b5.  SPLIT brings r, g and b into scope
+ * for the current value (presumably its 8-bit channels -- confirm against
+ * the macro); r supplies bits 10-14, g bits 5-9 and b the low bits.  Bit 15
+ * comes out zero provided b fits in 8 bits.
+ */
+static void
+store_scanline_x1r5g5b5 (bits_image_t *  image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         const uint32_t *values)
+{
+    uint16_t *dst = (uint16_t *) (image->bits + image->rowstride * y) + x;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	SPLIT (values[n]);
+
+	WRITE (image, dst + n,
+	       ((b >> 3)         ) |
+	       ((g << 2) & 0x03e0) |
+	       ((r << 7) & 0x7c00));
+    }
+}
+
+/*
+ * Store a scanline as 16-bit a1b5g5r5.  SPLIT_A brings a, r, g and b into
+ * scope for the current value (presumably its 8-bit channels -- confirm
+ * against the macro); a supplies bit 15, b bits 10-14, g bits 5-9 and r the
+ * low bits.
+ */
+static void
+store_scanline_a1b5g5r5 (bits_image_t *  image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         const uint32_t *values)
+{
+    uint16_t *dst = (uint16_t *) (image->bits + image->rowstride * y) + x;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	SPLIT_A (values[n]);
+
+	WRITE (image, dst + n,
+	       ((r >> 3)         ) |
+	       ((g << 2) & 0x03e0) |
+	       ((b << 7) & 0x7c00) |
+	       ((a << 8) & 0x8000));
+    }
+}
+
+/*
+ * Store a scanline as 16-bit x1b5g5r5.  SPLIT brings r, g and b into scope
+ * for the current value (presumably its 8-bit channels -- confirm against
+ * the macro); b supplies bits 10-14, g bits 5-9 and r the low bits.  Bit 15
+ * comes out zero provided r fits in 8 bits.
+ */
+static void
+store_scanline_x1b5g5r5 (bits_image_t *  image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         const uint32_t *values)
+{
+    uint16_t *dst = (uint16_t *) (image->bits + image->rowstride * y) + x;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	SPLIT (values[n]);
+
+	WRITE (image, dst + n,
+	       ((r >> 3)         ) |
+	       ((g << 2) & 0x03e0) |
+	       ((b << 7) & 0x7c00));
+    }
+}
+
+/*
+ * Store a scanline as 16-bit a4r4g4b4.  SPLIT_A brings a, r, g and b into
+ * scope for the current value (presumably its 8-bit channels -- confirm
+ * against the macro); a supplies bits 12-15, r bits 8-11, g bits 4-7 and b
+ * the low bits.
+ */
+static void
+store_scanline_a4r4g4b4 (bits_image_t *  image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         const uint32_t *values)
+{
+    uint16_t *dst = (uint16_t *) (image->bits + image->rowstride * y) + x;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	SPLIT_A (values[n]);
+
+	WRITE (image, dst + n,
+	       ((b >> 4)         ) |
+	       ((g     ) & 0x00f0) |
+	       ((r << 4) & 0x0f00) |
+	       ((a << 8) & 0xf000));
+    }
+}
+
+/*
+ * Store a scanline as 16-bit x4r4g4b4.  SPLIT brings r, g and b into scope
+ * for the current value (presumably its 8-bit channels -- confirm against
+ * the macro); r supplies bits 8-11, g bits 4-7 and b the low bits.
+ */
+static void
+store_scanline_x4r4g4b4 (bits_image_t *  image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         const uint32_t *values)
+{
+    uint16_t *dst = (uint16_t *) (image->bits + image->rowstride * y) + x;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	SPLIT (values[n]);
+
+	WRITE (image, dst + n,
+	       ((b >> 4)         ) |
+	       ((g     ) & 0x00f0) |
+	       ((r << 4) & 0x0f00));
+    }
+}
+
+/*
+ * Store a scanline as 16-bit a4b4g4r4.  SPLIT_A brings a, r, g and b into
+ * scope for the current value (presumably its 8-bit channels -- confirm
+ * against the macro); a supplies bits 12-15, b bits 8-11, g bits 4-7 and r
+ * the low bits.
+ */
+static void
+store_scanline_a4b4g4r4 (bits_image_t *  image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         const uint32_t *values)
+{
+    uint16_t *dst = (uint16_t *) (image->bits + image->rowstride * y) + x;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	SPLIT_A (values[n]);
+
+	WRITE (image, dst + n,
+	       ((r >> 4)         ) |
+	       ((g     ) & 0x00f0) |
+	       ((b << 4) & 0x0f00) |
+	       ((a << 8) & 0xf000));
+    }
+}
+
+/*
+ * Store a scanline as 16-bit x4b4g4r4.  SPLIT brings r, g and b into scope
+ * for the current value (presumably its 8-bit channels -- confirm against
+ * the macro); b supplies bits 8-11, g bits 4-7 and r the low bits.
+ */
+static void
+store_scanline_x4b4g4r4 (bits_image_t *  image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         const uint32_t *values)
+{
+    uint16_t *dst = (uint16_t *) (image->bits + image->rowstride * y) + x;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	SPLIT (values[n]);
+
+	WRITE (image, dst + n,
+	       ((r >> 4)         ) |
+	       ((g     ) & 0x00f0) |
+	       ((b << 4) & 0x0f00));
+    }
+}
+
+/*
+ * Store a scanline as 8-bit a8: only the top byte of each value is written,
+ * one byte per pixel starting at byte x of row y.
+ */
+static void
+store_scanline_a8 (bits_image_t *  image,
+                   int             x,
+                   int             y,
+                   int             width,
+                   const uint32_t *values)
+{
+    uint8_t *dst = (uint8_t *) (image->bits + image->rowstride * y) + x;
+    int n;
+
+    for (n = 0; n < width; n++)
+	WRITE (image, dst + n, values[n] >> 24);
+}
+
+/*
+ * Store a scanline as 8-bit r3g3b2.  SPLIT brings r, g and b into scope for
+ * the current value (presumably its 8-bit channels -- confirm against the
+ * macro); r supplies bits 5-7, g bits 2-4 and b the low bits.
+ */
+static void
+store_scanline_r3g3b2 (bits_image_t *  image,
+                       int             x,
+                       int             y,
+                       int             width,
+                       const uint32_t *values)
+{
+    uint8_t *dst = (uint8_t *) (image->bits + image->rowstride * y) + x;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	SPLIT (values[n]);
+
+	WRITE (image, dst + n,
+	       ((b >> 6)       ) |
+	       ((g >> 3) & 0x1c) |
+	       ((r     ) & 0xe0));
+    }
+}
+
+/*
+ * Store a scanline as 8-bit b2g3r3.  SPLIT brings r, g and b into scope for
+ * the current value (presumably its 8-bit channels -- confirm against the
+ * macro); b supplies bits 6-7, g bits 3-5 and r the low bits.
+ */
+static void
+store_scanline_b2g3r3 (bits_image_t *  image,
+                       int             x,
+                       int             y,
+                       int             width,
+                       const uint32_t *values)
+{
+    uint8_t *dst = (uint8_t *) (image->bits + image->rowstride * y) + x;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	SPLIT (values[n]);
+
+	WRITE (image, dst + n,
+	       ((r >> 5)       ) |
+	       ((g >> 2) & 0x38) |
+	       ((b     ) & 0xc0));
+    }
+}
+
+/*
+ * Store a scanline as 8-bit a2r2g2b2.  SPLIT_A brings a, r, g and b into
+ * scope for the current value (presumably its 8-bit channels -- confirm
+ * against the macro); a supplies bits 6-7, r bits 4-5, g bits 2-3 and b the
+ * low bits.
+ */
+static void
+store_scanline_a2r2g2b2 (bits_image_t *  image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         const uint32_t *values)
+{
+    uint8_t *dst = (uint8_t *) (image->bits + image->rowstride * y) + x;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	SPLIT_A (values[n]);
+
+	WRITE (image, dst + n,
+	       ((b >> 6)       ) |
+	       ((g >> 4) & 0x0c) |
+	       ((r >> 2) & 0x30) |
+	       ((a     ) & 0xc0));
+    }
+}
+
+/*
+ * Store a scanline as 8-bit a2b2g2r2.  SPLIT_A brings a, r, g and b into
+ * scope for the current value (presumably its 8-bit channels -- confirm
+ * against the macro); a supplies bits 6-7, b bits 4-5, g bits 2-3 and r the
+ * low bits.
+ */
+static void
+store_scanline_a2b2g2r2 (bits_image_t *  image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         const uint32_t *values)
+{
+    uint8_t *dst = (uint8_t *) (image->bits + image->rowstride * y) + x;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	SPLIT_A (values[n]);
+
+	WRITE (image, dst + n,
+	       ((r >> 6)       ) |
+	       ((g >> 4) & 0x0c) |
+	       ((b >> 2) & 0x30) |
+	       ((a     ) & 0xc0));
+    }
+}
+
+/*
+ * Store a scanline as 8-bit c8: each value is converted with RGB24_TO_ENTRY
+ * against image->indexed and the result is written as one byte per pixel.
+ */
+static void
+store_scanline_c8 (bits_image_t *  image,
+                   int             x,
+                   int             y,
+                   int             width,
+                   const uint32_t *values)
+{
+    const pixman_indexed_t *indexed = image->indexed;
+    uint8_t *dst = (uint8_t *) (image->bits + image->rowstride * y) + x;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	WRITE (image, dst + n, RGB24_TO_ENTRY (indexed,values[n]));
+    }
+}
+
+/*
+ * Store a scanline as 8-bit g8: each value is converted with
+ * RGB24_TO_ENTRY_Y against image->indexed and the result is written as one
+ * byte per pixel.
+ */
+static void
+store_scanline_g8 (bits_image_t *  image,
+                   int             x,
+                   int             y,
+                   int             width,
+                   const uint32_t *values)
+{
+    const pixman_indexed_t *indexed = image->indexed;
+    uint8_t *dst = (uint8_t *) (image->bits + image->rowstride * y) + x;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	WRITE (image, dst + n, RGB24_TO_ENTRY_Y (indexed,values[n]));
+    }
+}
+
+/*
+ * Store a scanline as 8-bit x4a4: the top four bits of each value are
+ * written into the low nibble of one byte per pixel; the high nibble is
+ * zero.
+ */
+static void
+store_scanline_x4a4 (bits_image_t *  image,
+                     int             x,
+                     int             y,
+                     int             width,
+                     const uint32_t *values)
+{
+    uint8_t *dst = (uint8_t *) (image->bits + image->rowstride * y) + x;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	WRITE (image, dst + n, values[n] >> 28);
+    }
+}
+/*
+ * STORE_8 writes v to the byte found o bits into line l (o >> 3 selects the
+ * byte).  STORE_4 stores the low four bits of v as 4-bit pixel number o of
+ * line l: it reads the containing byte back with FETCH_8, replaces one
+ * nibble and keeps the other.  STORE_4 declares the locals bo and v4 inside
+ * its do/while block, and which nibble an even-numbered pixel occupies
+ * depends on WORDS_BIGENDIAN.
+ */
+#define STORE_8(img,l,o,v)  (WRITE (img, (uint8_t *)(l) + ((o) >> 3), (v)))
+#ifdef WORDS_BIGENDIAN
+/* Odd pixels (bo & 4 set) go in the low nibble, even pixels in the high one. */
+#define STORE_4(img,l,o,v)						\
+    do									\
+    {									\
+	int bo = 4 * (o);						\
+	int v4 = (v) & 0x0f;						\
+									\
+	STORE_8 (img, l, bo, (						\
+		     bo & 4 ?						\
+		     (FETCH_8 (img, l, bo) & 0xf0) | (v4) :		\
+		     (FETCH_8 (img, l, bo) & 0x0f) | (v4 << 4)));	\
+    } while (0)
+#else
+/* Odd pixels (bo & 4 set) go in the high nibble, even pixels in the low one. */
+#define STORE_4(img,l,o,v)						\
+    do									\
+    {									\
+	int bo = 4 * (o);						\
+	int v4 = (v) & 0x0f;						\
+									\
+	STORE_8 (img, l, bo, (						\
+		     bo & 4 ?						\
+		     (FETCH_8 (img, l, bo) & 0x0f) | (v4 << 4) :	\
+		     (FETCH_8 (img, l, bo) & 0xf0) | (v4)));		\
+    } while (0)
+#endif
+
+/*
+ * Store a scanline as 4-bit a4: the top four bits of each value are stored
+ * with STORE_4 as pixel x + n of row y.
+ */
+static void
+store_scanline_a4 (bits_image_t *  image,
+                   int             x,
+                   int             y,
+                   int             width,
+                   const uint32_t *values)
+{
+    uint32_t *row = image->bits + image->rowstride * y;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	STORE_4 (image, row, x + n, values[n] >> 28);
+    }
+}
+
+/*
+ * Store a scanline as 4-bit r1g2b1.  SPLIT brings r, g and b into scope for
+ * the current value (presumably its 8-bit channels -- confirm against the
+ * macro); r supplies bit 3, g bits 1-2 and b the low bit of the nibble
+ * handed to STORE_4.
+ */
+static void
+store_scanline_r1g2b1 (bits_image_t *  image,
+                       int             x,
+                       int             y,
+                       int             width,
+                       const uint32_t *values)
+{
+    uint32_t *row = image->bits + image->rowstride * y;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	uint32_t nibble;
+
+	SPLIT (values[n]);
+	nibble = (((b >> 7)      ) |
+	          ((g >> 5) & 0x6) |
+	          ((r >> 4) & 0x8));
+	STORE_4 (image, row, x + n, nibble);
+    }
+}
+
+/*
+ * Store a scanline as 4-bit b1g2r1.  SPLIT brings r, g and b into scope for
+ * the current value (presumably its 8-bit channels -- confirm against the
+ * macro); b supplies bit 3, g bits 1-2 and r the low bit of the nibble
+ * handed to STORE_4.
+ */
+static void
+store_scanline_b1g2r1 (bits_image_t *  image,
+                       int             x,
+                       int             y,
+                       int             width,
+                       const uint32_t *values)
+{
+    uint32_t *row = image->bits + image->rowstride * y;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	uint32_t nibble;
+
+	SPLIT (values[n]);
+	nibble = (((r >> 7)      ) |
+	          ((g >> 5) & 0x6) |
+	          ((b >> 4) & 0x8));
+	STORE_4 (image, row, x + n, nibble);
+    }
+}
+
+/*
+ * Store a scanline as 4-bit a1r1g1b1.  SPLIT_A brings a, r, g and b into
+ * scope for the current value (presumably its 8-bit channels -- confirm
+ * against the macro); a, r, g and b supply bits 3, 2, 1 and 0 of the nibble
+ * handed to STORE_4.
+ */
+static void
+store_scanline_a1r1g1b1 (bits_image_t *  image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         const uint32_t *values)
+{
+    uint32_t *row = image->bits + image->rowstride * y;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	uint32_t nibble;
+
+	SPLIT_A (values[n]);
+	nibble = (((b >> 7)      ) |
+	          ((g >> 6) & 0x2) |
+	          ((r >> 5) & 0x4) |
+	          ((a >> 4) & 0x8));
+
+	STORE_4 (image, row, x + n, nibble);
+    }
+}
+
+/*
+ * Store a scanline as 4-bit a1b1g1r1.  SPLIT_A brings a, r, g and b into
+ * scope for the current value (presumably its 8-bit channels -- confirm
+ * against the macro); a, b, g and r supply bits 3, 2, 1 and 0 of the nibble
+ * handed to STORE_4.
+ */
+static void
+store_scanline_a1b1g1r1 (bits_image_t *  image,
+                         int             x,
+                         int             y,
+                         int             width,
+                         const uint32_t *values)
+{
+    uint32_t *row = image->bits + image->rowstride * y;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	uint32_t nibble;
+
+	SPLIT_A (values[n]);
+	nibble = (((r >> 7)      ) |
+	          ((g >> 6) & 0x2) |
+	          ((b >> 5) & 0x4) |
+	          ((a >> 4) & 0x8));
+
+	STORE_4 (image, row, x + n, nibble);
+    }
+}
+
+/*
+ * Store a scanline as 4-bit c4: each value is converted with RGB24_TO_ENTRY
+ * against image->indexed and the result is stored with STORE_4, which keeps
+ * only its low four bits.
+ */
+static void
+store_scanline_c4 (bits_image_t *  image,
+                   int             x,
+                   int             y,
+                   int             width,
+                   const uint32_t *values)
+{
+    const pixman_indexed_t *indexed = image->indexed;
+    uint32_t *row = image->bits + image->rowstride * y;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	uint32_t entry = RGB24_TO_ENTRY (indexed, values[n]);
+
+	STORE_4 (image, row, x + n, entry);
+    }
+}
+
+/*
+ * Store a scanline as 4-bit g4: each value is converted with
+ * RGB24_TO_ENTRY_Y against image->indexed and the result is stored with
+ * STORE_4, which keeps only its low four bits.
+ */
+static void
+store_scanline_g4 (bits_image_t *  image,
+                   int             x,
+                   int             y,
+                   int             width,
+                   const uint32_t *values)
+{
+    const pixman_indexed_t *indexed = image->indexed;
+    uint32_t *row = image->bits + image->rowstride * y;
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+	uint32_t entry = RGB24_TO_ENTRY_Y (indexed, values[n]);
+
+	STORE_4 (image, row, x + n, entry);
+    }
+}
+
+/*
+ * Store a scanline as 1-bit a1.  Pixel number i + x lives in 32-bit word
+ * (i + x) >> 5 of row y; within that word it is bit (i + x) & 31 counted
+ * from the least significant end, or from the most significant end when
+ * WORDS_BIGENDIAN is defined.  The bit is set when the top bit of the value
+ * is set and cleared otherwise; the other 31 bits of the word are read back
+ * and preserved.
+ */
+static void
+store_scanline_a1 (bits_image_t *  image,
+                   int             x,
+                   int             y,
+                   int             width,
+                   const uint32_t *values)
+{
+    uint32_t *bits = image->bits + image->rowstride * y;
+    int i;
+    
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t  *pixel = ((uint32_t *) bits) + ((i + x) >> 5);
+	uint32_t mask, v;
+	
+	/* Shift an unsigned one: "1 << 31" on a signed int is undefined. */
+#ifdef WORDS_BIGENDIAN
+	mask = (uint32_t) 1 << (0x1f - ((i + x) & 0x1f));
+#else
+	mask = (uint32_t) 1 << ((i + x) & 0x1f);
+#endif
+	v = values[i] & 0x80000000 ? mask : 0;
+	
+	WRITE (image, pixel, (READ (image, pixel) & ~mask) | v);
+    }
+}
+
+/*
+ * Store a scanline as 1-bit g1.  Pixel number i + x lives in 32-bit word
+ * (i + x) >> 5 of row y; within that word it is bit (i + x) & 31 counted
+ * from the least significant end, or from the most significant end when
+ * WORDS_BIGENDIAN is defined.  The bit is set when the low bit of
+ * RGB24_TO_ENTRY_Y (indexed, value) is set and cleared otherwise; the other
+ * 31 bits of the word are read back and preserved.
+ */
+static void
+store_scanline_g1 (bits_image_t *  image,
+                   int             x,
+                   int             y,
+                   int             width,
+                   const uint32_t *values)
+{
+    uint32_t *bits = image->bits + image->rowstride * y;
+    const pixman_indexed_t *indexed = image->indexed;
+    int i;
+    
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t  *pixel = ((uint32_t *) bits) + ((i + x) >> 5);
+	uint32_t mask, v;
+	
+	/* Shift an unsigned one: "1 << 31" on a signed int is undefined. */
+#ifdef WORDS_BIGENDIAN
+	mask = (uint32_t) 1 << (0x1f - ((i + x) & 0x1f));
+#else
+	mask = (uint32_t) 1 << ((i + x) & 0x1f);
+#endif
+	v = RGB24_TO_ENTRY_Y (indexed, values[i]) & 0x1 ? mask : 0;
+	
+	WRITE (image, pixel, (READ (image, pixel) & ~mask) | v);
+    }
+}
+
+/*
+ * Generic 64-bit store: contracts the scanline to 32 bits per pixel in a
+ * temporary buffer and passes that buffer to the image's 32-bit store proc.
+ * values is declared as uint32_t * but is read as an array of uint64_t.
+ * If the temporary buffer cannot be allocated, nothing is stored.
+ */
+static void
+store_scanline_generic_64 (bits_image_t *  image,
+                           int             x,
+                           int             y,
+                           int             width,
+                           const uint32_t *values)
+{
+    uint32_t *narrow;
+
+    assert (image->common.type == BITS);
+
+    /* values is const, so the contraction cannot be done in place. */
+    narrow = pixman_malloc_ab (width, sizeof(uint32_t));
+    if (narrow)
+    {
+	pixman_contract (narrow, (uint64_t *)values, width);
+
+	image->store_scanline_32 (image, x, y, width, narrow);
+
+	free (narrow);
+    }
+}
+
+/*
+ * Generic 64-bit fetch built on the image's 32-bit fetch proc.  Despite
+ * their declared types, buffer and mask are expected to be uint64_t
+ * arrays.  mask is not consulted here: the 32-bit fetch is called with a
+ * NULL mask.
+ */
+static void
+fetch_scanline_generic_64 (pixman_image_t *image,
+                           int             x,
+                           int             y,
+                           int             width,
+                           uint32_t *      buffer,
+                           const uint32_t *mask)
+{
+    pixman_format_code_t expand_as;
+    int is_indexed;
+
+    /* The 32-bit pixels land in the first half of buffer and are then
+     * widened in place.
+     */
+    image->bits.fetch_scanline_32 (image, x, y, width, buffer, NULL);
+
+    expand_as = image->bits.format;
+    is_indexed = (PIXMAN_FORMAT_TYPE (expand_as) == PIXMAN_TYPE_COLOR ||
+		  PIXMAN_FORMAT_TYPE (expand_as) == PIXMAN_TYPE_GRAY);
+
+    /* Indexed formats were already fetched as full-precision a8r8g8b8, so
+     * the expansion must not adjust for the format's channel widths.
+     */
+    if (is_indexed)
+	expand_as = PIXMAN_a8r8g8b8;
+
+    pixman_expand ((uint64_t *)buffer, buffer, expand_as, width);
+}
+
+/*
+ * Generic 64-bit single-pixel fetch: reads the pixel with the image's
+ * 32-bit pixel fetcher and expands that one pixel to 64 bits.
+ */
+static uint64_t
+fetch_pixel_generic_64 (bits_image_t *image,
+			int	      offset,
+			int           line)
+{
+    uint32_t narrow = image->fetch_pixel_32 (image, offset, line);
+    pixman_format_code_t expand_as = image->format;
+    uint64_t wide;
+    int is_indexed;
+
+    is_indexed = (PIXMAN_FORMAT_TYPE (expand_as) == PIXMAN_TYPE_COLOR ||
+		  PIXMAN_FORMAT_TYPE (expand_as) == PIXMAN_TYPE_GRAY);
+
+    /* Indexed formats were already fetched as full-precision a8r8g8b8, so
+     * the expansion must not adjust for the format's channel widths.
+     */
+    if (is_indexed)
+	expand_as = PIXMAN_a8r8g8b8;
+
+    pixman_expand ((uint64_t *)&wide, &narrow, expand_as, 1);
+
+    return wide;
+}
+
+/*
+ * 32-bit single-pixel fetch for formats that only have a 64-bit fetcher:
+ * fetches the wide pixel and contracts it to 32 bits.
+ *
+ * XXX: Needed because the transformed fetch path only works at 32-bpp so
+ * far; it can be removed once all paths have wide versions.
+ *
+ * WARNING: The contraction loses precision!
+ */
+static uint32_t
+fetch_pixel_generic_lossy_32 (bits_image_t *image,
+			      int           offset,
+			      int           line)
+{
+    uint64_t wide = image->fetch_pixel_64 (image, offset, line);
+    uint32_t narrow;
+
+    pixman_contract (&narrow, &wide, 1);
+
+    return narrow;
+}
+/*
+ * One row of the accessors table: the fetch and store procs that
+ * setup_accessors () copies into a bits_image_t whose format matches.
+ */
+typedef struct
+{
+    pixman_format_code_t	format;			/* key matched against image->format */
+    fetch_scanline_t		fetch_scanline_32;	/* scanline fetch, 32 bits per pixel */
+    fetch_scanline_t		fetch_scanline_64;	/* scanline fetch, 64 bits per pixel */
+    fetch_pixel_32_t		fetch_pixel_32;		/* single-pixel fetch, 32-bit result */
+    fetch_pixel_64_t		fetch_pixel_64;		/* single-pixel fetch, 64-bit result */
+    store_scanline_t		store_scanline_32;	/* scanline store from 32-bit pixels */
+    store_scanline_t		store_scanline_64;	/* scanline store from 64-bit pixels */
+} format_info_t;
+/*
+ * Builds a format_info_t initializer for a format that has its own 32-bit
+ * accessors: PIXMAN_<format>, fetch_scanline_<format>, fetch_pixel_<format>
+ * and store_scanline_<format> are formed by token pasting, and the three
+ * 64-bit slots are filled with the generic_64 wrappers.
+ */
+#define FORMAT_INFO(format) 						\
+    {									\
+	PIXMAN_ ## format,						\
+	    fetch_scanline_ ## format,					\
+	    fetch_scanline_generic_64,					\
+	    fetch_pixel_ ## format, fetch_pixel_generic_64,		\
+	    store_scanline_ ## format, store_scanline_generic_64	\
+    }
+/*
+ * Table of per-format accessors, scanned linearly by setup_accessors () and
+ * terminated by a PIXMAN_null entry.  Each row lists, in format_info_t
+ * order: format, 32-bit scanline fetch, 64-bit scanline fetch, 32-bit pixel
+ * fetch, 64-bit pixel fetch, 32-bit scanline store, 64-bit scanline store.
+ * The #defines between rows alias accessor names so that FORMAT_INFO can
+ * paste them for formats that share another format's implementation.
+ */
+static const format_info_t accessors[] =
+{
+/* 32 bpp formats */
+    FORMAT_INFO (a8r8g8b8),
+    FORMAT_INFO (x8r8g8b8),
+    FORMAT_INFO (a8b8g8r8),
+    FORMAT_INFO (x8b8g8r8),
+    FORMAT_INFO (b8g8r8a8),
+    FORMAT_INFO (b8g8r8x8),
+    FORMAT_INFO (r8g8b8a8),
+    FORMAT_INFO (r8g8b8x8),
+    FORMAT_INFO (x14r6g6b6),
+
+/* 24bpp formats */
+    FORMAT_INFO (r8g8b8),
+    FORMAT_INFO (b8g8r8),
+    
+/* 16bpp formats */
+    FORMAT_INFO (r5g6b5),
+    FORMAT_INFO (b5g6r5),
+    
+    FORMAT_INFO (a1r5g5b5),
+    FORMAT_INFO (x1r5g5b5),
+    FORMAT_INFO (a1b5g5r5),
+    FORMAT_INFO (x1b5g5r5),
+    FORMAT_INFO (a4r4g4b4),
+    FORMAT_INFO (x4r4g4b4),
+    FORMAT_INFO (a4b4g4r4),
+    FORMAT_INFO (x4b4g4r4),
+    
+/* 8bpp formats */
+    FORMAT_INFO (a8),
+    FORMAT_INFO (r3g3b2),
+    FORMAT_INFO (b2g3r3),
+    FORMAT_INFO (a2r2g2b2),
+    FORMAT_INFO (a2b2g2r2),
+    
+    FORMAT_INFO (c8),
+    /* g8 shares the c8 fetchers; it has its own store_scanline_g8. */
+#define fetch_scanline_g8 fetch_scanline_c8
+#define fetch_pixel_g8 fetch_pixel_c8
+    FORMAT_INFO (g8),
+    /* x4c4 uses the c8 accessors for both fetching and storing. */
+#define fetch_scanline_x4c4 fetch_scanline_c8
+#define fetch_pixel_x4c4 fetch_pixel_c8
+#define store_scanline_x4c4 store_scanline_c8
+    FORMAT_INFO (x4c4),
+    /* x4g4 fetches like c8 and stores like g8. */
+#define fetch_scanline_x4g4 fetch_scanline_c8
+#define fetch_pixel_x4g4 fetch_pixel_c8
+#define store_scanline_x4g4 store_scanline_g8
+    FORMAT_INFO (x4g4),
+    
+    FORMAT_INFO (x4a4),
+    
+/* 4bpp formats */
+    FORMAT_INFO (a4),
+    FORMAT_INFO (r1g2b1),
+    FORMAT_INFO (b1g2r1),
+    FORMAT_INFO (a1r1g1b1),
+    FORMAT_INFO (a1b1g1r1),
+    
+    FORMAT_INFO (c4),
+    /* g4 shares the c4 fetchers; it has its own store_scanline_g4. */
+#define fetch_scanline_g4 fetch_scanline_c4
+#define fetch_pixel_g4 fetch_pixel_c4
+    FORMAT_INFO (g4),
+    
+/* 1bpp formats */
+    FORMAT_INFO (a1),
+    FORMAT_INFO (g1),
+    
+/* Wide formats */
+    /* These rows have native 64-bit accessors only: the 32-bit scanline
+     * fetch and store slots are NULL, and the 32-bit pixel fetch is the
+     * lossy wrapper around the 64-bit one.
+     */
+    { PIXMAN_a2r10g10b10,
+      NULL, fetch_scanline_a2r10g10b10,
+      fetch_pixel_generic_lossy_32, fetch_pixel_a2r10g10b10,
+      NULL, store_scanline_a2r10g10b10 },
+    
+    { PIXMAN_x2r10g10b10,
+      NULL, fetch_scanline_x2r10g10b10,
+      fetch_pixel_generic_lossy_32, fetch_pixel_x2r10g10b10,
+      NULL, store_scanline_x2r10g10b10 },
+    
+    { PIXMAN_a2b10g10r10,
+      NULL, fetch_scanline_a2b10g10r10,
+      fetch_pixel_generic_lossy_32, fetch_pixel_a2b10g10r10,
+      NULL, store_scanline_a2b10g10r10 },
+    
+    { PIXMAN_x2b10g10r10,
+      NULL, fetch_scanline_x2b10g10r10,
+      fetch_pixel_generic_lossy_32, fetch_pixel_x2b10g10r10,
+      NULL, store_scanline_x2b10g10r10 },
+    
+/* YUV formats: fetch only, both store slots are NULL */
+    { PIXMAN_yuy2,
+      fetch_scanline_yuy2, fetch_scanline_generic_64,
+      fetch_pixel_yuy2, fetch_pixel_generic_64,
+      NULL, NULL },
+    
+    { PIXMAN_yv12,
+      fetch_scanline_yv12, fetch_scanline_generic_64,
+      fetch_pixel_yv12, fetch_pixel_generic_64,
+      NULL, NULL },
+    
+    { PIXMAN_null },	/* sentinel: ends the scan in setup_accessors () */
+};
+
+/*
+ * Look up image->format in the accessors table and copy the matching row's
+ * six fetch/store procs into the image.  If no row matches before the
+ * PIXMAN_null sentinel, the image is left untouched.
+ */
+static void
+setup_accessors (bits_image_t *image)
+{
+    const format_info_t *entry;
+
+    for (entry = accessors; entry->format != PIXMAN_null; entry++)
+    {
+	if (entry->format != image->format)
+	    continue;
+
+	image->fetch_scanline_32 = entry->fetch_scanline_32;
+	image->fetch_scanline_64 = entry->fetch_scanline_64;
+	image->fetch_pixel_32 = entry->fetch_pixel_32;
+	image->fetch_pixel_64 = entry->fetch_pixel_64;
+	image->store_scanline_32 = entry->store_scanline_32;
+	image->store_scanline_64 = entry->store_scanline_64;
+
+	break;
+    }
+}
+
+/*
+ * Entry points for installing a bits image's accessors.
+ *
+ * Without PIXMAN_FB_ACCESSORS this file defines
+ * _pixman_bits_image_setup_accessors (), which defers to the externally
+ * defined _accessors variant when the image has a read_func or write_func
+ * and uses the local table otherwise.  With PIXMAN_FB_ACCESSORS this file
+ * defines that _accessors variant itself (presumably the file is built once
+ * in each configuration -- confirm against the build files).
+ */
+#ifndef PIXMAN_FB_ACCESSORS
+void
+_pixman_bits_image_setup_accessors_accessors (bits_image_t *image);
+
+void
+_pixman_bits_image_setup_accessors (bits_image_t *image)
+{
+    if (!image->read_func && !image->write_func)
+    {
+	setup_accessors (image);
+	return;
+    }
+
+    _pixman_bits_image_setup_accessors_accessors (image);
+}
+
+#else
+
+void
+_pixman_bits_image_setup_accessors_accessors (bits_image_t *image)
+{
+    setup_accessors (image);
+}
+
+#endif
diff --git a/pixman/pixman/pixman-arm-common.h b/pixman/pixman/pixman-arm-common.h
index c15af540a..f56264e8c 100644
--- a/pixman/pixman/pixman-arm-common.h
+++ b/pixman/pixman/pixman-arm-common.h
@@ -1,416 +1,416 @@
-/*
- * Copyright © 2010 Nokia Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
- */
-
-#ifndef PIXMAN_ARM_COMMON_H
-#define PIXMAN_ARM_COMMON_H
-
-#include "pixman-inlines.h"
-
-/* Define some macros which can expand into proxy functions between
- * ARM assembly optimized functions and the rest of pixman fast path API.
- *
- * All the low level ARM assembly functions have to use ARM EABI
- * calling convention and take up to 8 arguments:
- *    width, height, dst, dst_stride, src, src_stride, mask, mask_stride
- *
- * The arguments are ordered with the most important coming first (the
- * first 4 arguments are passed to function in registers, the rest are
- * on stack). The last arguments are optional, for example if the
- * function is not using mask, then 'mask' and 'mask_stride' can be
- * omitted when doing a function call.
- *
- * Arguments 'src' and 'mask' contain either a pointer to the top left
- * pixel of the composited rectangle or a pixel color value depending
- * on the function type. In the case of just a color value (solid source
- * or mask), the corresponding stride argument is unused.
- */
-
-#define SKIP_ZERO_SRC  1
-#define SKIP_ZERO_MASK 2
-
-#define PIXMAN_ARM_BIND_FAST_PATH_SRC_DST(cputype, name,                \
-                                          src_type, src_cnt,            \
-                                          dst_type, dst_cnt)            \
-void                                                                    \
-pixman_composite_##name##_asm_##cputype (int32_t   w,                   \
-                                         int32_t   h,                   \
-                                         dst_type *dst,                 \
-                                         int32_t   dst_stride,          \
-                                         src_type *src,                 \
-                                         int32_t   src_stride);         \
-                                                                        \
-static void                                                             \
-cputype##_composite_##name (pixman_implementation_t *imp,               \
-                            pixman_composite_info_t *info)              \
-{                                                                       \
-    PIXMAN_COMPOSITE_ARGS (info);                                       \
-    dst_type *dst_line;							\
-    src_type *src_line;                                                 \
-    int32_t dst_stride, src_stride;                                     \
-                                                                        \
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type,           \
-                           src_stride, src_line, src_cnt);              \
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \
-                           dst_stride, dst_line, dst_cnt);              \
-                                                                        \
-    pixman_composite_##name##_asm_##cputype (width, height,             \
-                                             dst_line, dst_stride,      \
-                                             src_line, src_stride);     \
-}
-
-#define PIXMAN_ARM_BIND_FAST_PATH_N_DST(flags, cputype, name,           \
-                                        dst_type, dst_cnt)              \
-void                                                                    \
-pixman_composite_##name##_asm_##cputype (int32_t    w,                  \
-                                         int32_t    h,                  \
-                                         dst_type  *dst,                \
-                                         int32_t    dst_stride,         \
-                                         uint32_t   src);               \
-                                                                        \
-static void                                                             \
-cputype##_composite_##name (pixman_implementation_t *imp,               \
-			    pixman_composite_info_t *info)              \
-{                                                                       \
-    PIXMAN_COMPOSITE_ARGS (info);					\
-    dst_type  *dst_line;                                                \
-    int32_t    dst_stride;                                              \
-    uint32_t   src;                                                     \
-                                                                        \
-    src = _pixman_image_get_solid (					\
-	imp, src_image, dest_image->bits.format);			\
-                                                                        \
-    if ((flags & SKIP_ZERO_SRC) && src == 0)                            \
-	return;                                                         \
-                                                                        \
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \
-                           dst_stride, dst_line, dst_cnt);              \
-                                                                        \
-    pixman_composite_##name##_asm_##cputype (width, height,             \
-                                             dst_line, dst_stride,      \
-                                             src);                      \
-}
-
-#define PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST(flags, cputype, name,      \
-                                             mask_type, mask_cnt,       \
-                                             dst_type, dst_cnt)         \
-void                                                                    \
-pixman_composite_##name##_asm_##cputype (int32_t    w,                  \
-                                         int32_t    h,                  \
-                                         dst_type  *dst,                \
-                                         int32_t    dst_stride,         \
-                                         uint32_t   src,                \
-                                         int32_t    unused,             \
-                                         mask_type *mask,               \
-                                         int32_t    mask_stride);       \
-                                                                        \
-static void                                                             \
-cputype##_composite_##name (pixman_implementation_t *imp,               \
-                            pixman_composite_info_t *info)              \
-{                                                                       \
-    PIXMAN_COMPOSITE_ARGS (info);                                       \
-    dst_type  *dst_line;						\
-    mask_type *mask_line;                                               \
-    int32_t    dst_stride, mask_stride;                                 \
-    uint32_t   src;                                                     \
-                                                                        \
-    src = _pixman_image_get_solid (					\
-	imp, src_image, dest_image->bits.format);			\
-                                                                        \
-    if ((flags & SKIP_ZERO_SRC) && src == 0)                            \
-	return;                                                         \
-                                                                        \
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \
-                           dst_stride, dst_line, dst_cnt);              \
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type,       \
-                           mask_stride, mask_line, mask_cnt);           \
-                                                                        \
-    pixman_composite_##name##_asm_##cputype (width, height,             \
-                                             dst_line, dst_stride,      \
-                                             src, 0,                    \
-                                             mask_line, mask_stride);   \
-}
-
-#define PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST(flags, cputype, name,       \
-                                            src_type, src_cnt,          \
-                                            dst_type, dst_cnt)          \
-void                                                                    \
-pixman_composite_##name##_asm_##cputype (int32_t    w,                  \
-                                         int32_t    h,                  \
-                                         dst_type  *dst,                \
-                                         int32_t    dst_stride,         \
-                                         src_type  *src,                \
-                                         int32_t    src_stride,         \
-                                         uint32_t   mask);              \
-                                                                        \
-static void                                                             \
-cputype##_composite_##name (pixman_implementation_t *imp,               \
-                            pixman_composite_info_t *info)              \
-{                                                                       \
-    PIXMAN_COMPOSITE_ARGS (info);                                       \
-    dst_type  *dst_line;						\
-    src_type  *src_line;                                                \
-    int32_t    dst_stride, src_stride;                                  \
-    uint32_t   mask;                                                    \
-                                                                        \
-    mask = _pixman_image_get_solid (					\
-	imp, mask_image, dest_image->bits.format);			\
-                                                                        \
-    if ((flags & SKIP_ZERO_MASK) && mask == 0)                          \
-	return;                                                         \
-                                                                        \
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \
-                           dst_stride, dst_line, dst_cnt);              \
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type,           \
-                           src_stride, src_line, src_cnt);              \
-                                                                        \
-    pixman_composite_##name##_asm_##cputype (width, height,             \
-                                             dst_line, dst_stride,      \
-                                             src_line, src_stride,      \
-                                             mask);                     \
-}
-
-#define PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST(cputype, name,           \
-                                               src_type, src_cnt,       \
-                                               mask_type, mask_cnt,     \
-                                               dst_type, dst_cnt)       \
-void                                                                    \
-pixman_composite_##name##_asm_##cputype (int32_t    w,                  \
-                                         int32_t    h,                  \
-                                         dst_type  *dst,                \
-                                         int32_t    dst_stride,         \
-                                         src_type  *src,                \
-                                         int32_t    src_stride,         \
-                                         mask_type *mask,               \
-                                         int32_t    mask_stride);       \
-                                                                        \
-static void                                                             \
-cputype##_composite_##name (pixman_implementation_t *imp,               \
-                            pixman_composite_info_t *info)              \
-{                                                                       \
-    PIXMAN_COMPOSITE_ARGS (info);                                       \
-    dst_type  *dst_line;						\
-    src_type  *src_line;                                                \
-    mask_type *mask_line;                                               \
-    int32_t    dst_stride, src_stride, mask_stride;                     \
-                                                                        \
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \
-                           dst_stride, dst_line, dst_cnt);              \
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type,           \
-                           src_stride, src_line, src_cnt);              \
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type,       \
-                           mask_stride, mask_line, mask_cnt);           \
-                                                                        \
-    pixman_composite_##name##_asm_##cputype (width, height,             \
-                                             dst_line, dst_stride,      \
-                                             src_line, src_stride,      \
-                                             mask_line, mask_stride);   \
-}
-
-#define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST(cputype, name, op,             \
-                                               src_type, dst_type)            \
-void                                                                          \
-pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (                \
-                                                   int32_t          w,        \
-                                                   dst_type *       dst,      \
-                                                   const src_type * src,      \
-                                                   pixman_fixed_t   vx,       \
-                                                   pixman_fixed_t   unit_x);  \
-                                                                              \
-static force_inline void                                                      \
-scaled_nearest_scanline_##cputype##_##name##_##op (dst_type *       pd,       \
-                                                   const src_type * ps,       \
-                                                   int32_t          w,        \
-                                                   pixman_fixed_t   vx,       \
-                                                   pixman_fixed_t   unit_x,   \
-                                                   pixman_fixed_t   max_vx,   \
-                                                   pixman_bool_t    zero_src) \
-{                                                                             \
-    pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps,  \
-                                                                  vx, unit_x);\
-}                                                                             \
-                                                                              \
-FAST_NEAREST_MAINLOOP (cputype##_##name##_cover_##op,                         \
-                       scaled_nearest_scanline_##cputype##_##name##_##op,     \
-                       src_type, dst_type, COVER)                             \
-FAST_NEAREST_MAINLOOP (cputype##_##name##_none_##op,                          \
-                       scaled_nearest_scanline_##cputype##_##name##_##op,     \
-                       src_type, dst_type, NONE)                              \
-FAST_NEAREST_MAINLOOP (cputype##_##name##_pad_##op,                           \
-                       scaled_nearest_scanline_##cputype##_##name##_##op,     \
-                       src_type, dst_type, PAD)
-
-/* Provide entries for the fast path table */
-#define PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH(op,s,d,func)                      \
-    SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),                             \
-    SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),                              \
-    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func)
-
-#define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST(flags, cputype, name, op,   \
-                                                  src_type, dst_type)         \
-void                                                                          \
-pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (                \
-                                                   int32_t          w,        \
-                                                   dst_type *       dst,      \
-                                                   const src_type * src,      \
-                                                   pixman_fixed_t   vx,       \
-                                                   pixman_fixed_t   unit_x,   \
-                                                   const uint8_t *  mask);    \
-                                                                              \
-static force_inline void                                                      \
-scaled_nearest_scanline_##cputype##_##name##_##op (const uint8_t *  mask,     \
-                                                   dst_type *       pd,       \
-                                                   const src_type * ps,       \
-                                                   int32_t          w,        \
-                                                   pixman_fixed_t   vx,       \
-                                                   pixman_fixed_t   unit_x,   \
-                                                   pixman_fixed_t   max_vx,   \
-                                                   pixman_bool_t    zero_src) \
-{                                                                             \
-    if ((flags & SKIP_ZERO_SRC) && zero_src)                                  \
-	return;                                                               \
-    pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps,  \
-                                                                  vx, unit_x, \
-                                                                  mask);      \
-}                                                                             \
-                                                                              \
-FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                  \
-                              scaled_nearest_scanline_##cputype##_##name##_##op,\
-                              src_type, uint8_t, dst_type, COVER, TRUE, FALSE)\
-FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_none_##op,                   \
-                              scaled_nearest_scanline_##cputype##_##name##_##op,\
-                              src_type, uint8_t, dst_type, NONE, TRUE, FALSE) \
-FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                    \
-                              scaled_nearest_scanline_##cputype##_##name##_##op,\
-                              src_type, uint8_t, dst_type, PAD, TRUE, FALSE)
-
-/* Provide entries for the fast path table */
-#define PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func)              \
-    SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func),                     \
-    SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func),                      \
-    SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
-
-/*****************************************************************************/
-
-#define PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST(flags, cputype, name, op,     \
-                                                src_type, dst_type)           \
-void                                                                          \
-pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (               \
-                                                dst_type *       dst,         \
-                                                const src_type * top,         \
-                                                const src_type * bottom,      \
-                                                int              wt,          \
-                                                int              wb,          \
-                                                pixman_fixed_t   x,           \
-                                                pixman_fixed_t   ux,          \
-                                                int              width);      \
-                                                                              \
-static force_inline void                                                      \
-scaled_bilinear_scanline_##cputype##_##name##_##op (                          \
-                                                dst_type *       dst,         \
-                                                const uint32_t * mask,        \
-                                                const src_type * src_top,     \
-                                                const src_type * src_bottom,  \
-                                                int32_t          w,           \
-                                                int              wt,          \
-                                                int              wb,          \
-                                                pixman_fixed_t   vx,          \
-                                                pixman_fixed_t   unit_x,      \
-                                                pixman_fixed_t   max_vx,      \
-                                                pixman_bool_t    zero_src)    \
-{                                                                             \
-    if ((flags & SKIP_ZERO_SRC) && zero_src)                                  \
-	return;                                                               \
-    pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (           \
-                            dst, src_top, src_bottom, wt, wb, vx, unit_x, w); \
-}                                                                             \
-                                                                              \
-FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                 \
-                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
-                       src_type, uint32_t, dst_type, COVER, FLAG_NONE)        \
-FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op,                  \
-                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
-                       src_type, uint32_t, dst_type, NONE, FLAG_NONE)         \
-FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                   \
-                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
-                       src_type, uint32_t, dst_type, PAD, FLAG_NONE)          \
-FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_normal_##op,                \
-                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
-                       src_type, uint32_t, dst_type, NORMAL,                  \
-                       FLAG_NONE)
-
-
-#define PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST(flags, cputype, name, op,  \
-                                                src_type, dst_type)           \
-void                                                                          \
-pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (               \
-                                                dst_type *       dst,         \
-                                                const uint8_t *  mask,        \
-                                                const src_type * top,         \
-                                                const src_type * bottom,      \
-                                                int              wt,          \
-                                                int              wb,          \
-                                                pixman_fixed_t   x,           \
-                                                pixman_fixed_t   ux,          \
-                                                int              width);      \
-                                                                              \
-static force_inline void                                                      \
-scaled_bilinear_scanline_##cputype##_##name##_##op (                          \
-                                                dst_type *       dst,         \
-                                                const uint8_t *  mask,        \
-                                                const src_type * src_top,     \
-                                                const src_type * src_bottom,  \
-                                                int32_t          w,           \
-                                                int              wt,          \
-                                                int              wb,          \
-                                                pixman_fixed_t   vx,          \
-                                                pixman_fixed_t   unit_x,      \
-                                                pixman_fixed_t   max_vx,      \
-                                                pixman_bool_t    zero_src)    \
-{                                                                             \
-    if ((flags & SKIP_ZERO_SRC) && zero_src)                                  \
-	return;                                                                   \
-    pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (           \
-                      dst, mask, src_top, src_bottom, wt, wb, vx, unit_x, w); \
-}                                                                             \
-                                                                              \
-FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                 \
-                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
-                       src_type, uint8_t, dst_type, COVER,                    \
-                       FLAG_HAVE_NON_SOLID_MASK)                              \
-FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op,                  \
-                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
-                       src_type, uint8_t, dst_type, NONE,                     \
-                       FLAG_HAVE_NON_SOLID_MASK)                              \
-FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                   \
-                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
-                       src_type, uint8_t, dst_type, PAD,                      \
-                       FLAG_HAVE_NON_SOLID_MASK)                              \
-FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_normal_##op,                \
-                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
-                       src_type, uint8_t, dst_type, NORMAL,                   \
-                       FLAG_HAVE_NON_SOLID_MASK)
-
-
-#endif
+/*
+ * Copyright © 2010 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+#ifndef PIXMAN_ARM_COMMON_H
+#define PIXMAN_ARM_COMMON_H
+
+#include "pixman-inlines.h"
+
+/* Define some macros which can expand into proxy functions between
+ * ARM assembly optimized functions and the rest of pixman fast path API.
+ *
+ * All the low level ARM assembly functions have to use ARM EABI
+ * calling convention and take up to 8 arguments:
+ *    width, height, dst, dst_stride, src, src_stride, mask, mask_stride
+ *
+ * The arguments are ordered with the most important coming first (the
+ * first 4 arguments are passed to function in registers, the rest are
+ * on stack). The last arguments are optional, for example if the
+ * function is not using mask, then 'mask' and 'mask_stride' can be
+ * omitted when doing a function call.
+ *
+ * Arguments 'src' and 'mask' contain either a pointer to the top left
+ * pixel of the composited rectangle or a pixel color value depending
+ * on the function type. In the case of just a color value (solid source
+ * or mask), the corresponding stride argument is unused.
+ */
+
+#define SKIP_ZERO_SRC  1 /* 'flags' bit: wrapper returns early on a zero source */
+#define SKIP_ZERO_MASK 2 /* 'flags' bit: wrapper returns early on a zero mask */
+/* Binds an asm routine that takes only src and dst images (no mask arguments). */
+#define PIXMAN_ARM_BIND_FAST_PATH_SRC_DST(cputype, name,                \
+                                          src_type, src_cnt,            \
+                                          dst_type, dst_cnt)            \
+void                                                                    \
+pixman_composite_##name##_asm_##cputype (int32_t   w,                   \
+                                         int32_t   h,                   \
+                                         dst_type *dst,                 \
+                                         int32_t   dst_stride,          \
+                                         src_type *src,                 \
+                                         int32_t   src_stride);         \
+/* Static wrapper taking (imp, info); forwards to the asm routine. */   \
+static void                                                             \
+cputype##_composite_##name (pixman_implementation_t *imp,               \
+                            pixman_composite_info_t *info)              \
+{                                                                       \
+    PIXMAN_COMPOSITE_ARGS (info);                                       \
+    dst_type *dst_line;							\
+    src_type *src_line;                                                 \
+    int32_t dst_stride, src_stride;                                     \
+    /* NOTE(review): GET_LINE presumably sets *_stride and *_line. */   \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type,           \
+                           src_stride, src_line, src_cnt);              \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \
+                           dst_stride, dst_line, dst_cnt);              \
+    /* Asm argument order: w, h, dst, dst_stride, src, src_stride. */   \
+    pixman_composite_##name##_asm_##cputype (width, height,             \
+                                             dst_line, dst_stride,      \
+                                             src_line, src_stride);     \
+}
+
+#define PIXMAN_ARM_BIND_FAST_PATH_N_DST(flags, cputype, name,           \
+                                        dst_type, dst_cnt)              \
+void                                                                    \
+pixman_composite_##name##_asm_##cputype (int32_t    w,                  \
+                                         int32_t    h,                  \
+                                         dst_type  *dst,                \
+                                         int32_t    dst_stride,         \
+                                         uint32_t   src);               \
+/* Proxy for a solid source: the color reaches the asm code by value. */ \
+static void                                                             \
+cputype##_composite_##name (pixman_implementation_t *imp,               \
+                            pixman_composite_info_t *info)              \
+{                                                                       \
+    PIXMAN_COMPOSITE_ARGS (info);                                       \
+    dst_type  *dst_line;                                                \
+    int32_t    dst_stride;                                              \
+    uint32_t   src;                                                     \
+                                                                        \
+    src = _pixman_image_get_solid (                                     \
+        imp, src_image, dest_image->bits.format);                       \
+    /* (flags) is parenthesized so an expression argument stays intact. */ \
+    if (((flags) & SKIP_ZERO_SRC) && src == 0)                          \
+        return;                                                         \
+                                                                        \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \
+                           dst_stride, dst_line, dst_cnt);              \
+    /* Argument order: width, height, destination pair, solid source. */ \
+    pixman_composite_##name##_asm_##cputype (width, height,             \
+                                             dst_line, dst_stride,      \
+                                             src);                      \
+}
+
+#define PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST(flags, cputype, name,      \
+                                             mask_type, mask_cnt,       \
+                                             dst_type, dst_cnt)         \
+void                                                                    \
+pixman_composite_##name##_asm_##cputype (int32_t    w,                  \
+                                         int32_t    h,                  \
+                                         dst_type  *dst,                \
+                                         int32_t    dst_stride,         \
+                                         uint32_t   src,                \
+                                         int32_t    unused,             \
+                                         mask_type *mask,               \
+                                         int32_t    mask_stride);       \
+/* Proxy for a solid source combined with a per-pixel mask image. */    \
+static void                                                             \
+cputype##_composite_##name (pixman_implementation_t *imp,               \
+                            pixman_composite_info_t *info)              \
+{                                                                       \
+    PIXMAN_COMPOSITE_ARGS (info);                                       \
+    dst_type  *dst_line;                                                \
+    mask_type *mask_line;                                               \
+    int32_t    dst_stride, mask_stride;                                 \
+    uint32_t   src;                                                     \
+                                                                        \
+    src = _pixman_image_get_solid (                                     \
+        imp, src_image, dest_image->bits.format);                       \
+    /* (flags) is parenthesized so an expression argument stays intact. */ \
+    if (((flags) & SKIP_ZERO_SRC) && src == 0)                          \
+        return;                                                         \
+                                                                        \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \
+                           dst_stride, dst_line, dst_cnt);              \
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type,       \
+                           mask_stride, mask_line, mask_cnt);           \
+    /* The 'unused' slot after the solid source is always passed as 0. */ \
+    pixman_composite_##name##_asm_##cputype (width, height,             \
+                                             dst_line, dst_stride,      \
+                                             src, 0,                    \
+                                             mask_line, mask_stride);   \
+}
+
+#define PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST(flags, cputype, name,       \
+                                            src_type, src_cnt,          \
+                                            dst_type, dst_cnt)          \
+void                                                                    \
+pixman_composite_##name##_asm_##cputype (int32_t    w,                  \
+                                         int32_t    h,                  \
+                                         dst_type  *dst,                \
+                                         int32_t    dst_stride,         \
+                                         src_type  *src,                \
+                                         int32_t    src_stride,         \
+                                         uint32_t   mask);              \
+/* Proxy for a per-pixel source image combined with a solid mask. */    \
+static void                                                             \
+cputype##_composite_##name (pixman_implementation_t *imp,               \
+                            pixman_composite_info_t *info)              \
+{                                                                       \
+    PIXMAN_COMPOSITE_ARGS (info);                                       \
+    dst_type  *dst_line;                                                \
+    src_type  *src_line;                                                \
+    int32_t    dst_stride, src_stride;                                  \
+    uint32_t   mask;                                                    \
+                                                                        \
+    mask = _pixman_image_get_solid (                                    \
+        imp, mask_image, dest_image->bits.format);                      \
+    /* (flags) is parenthesized so an expression argument stays intact. */ \
+    if (((flags) & SKIP_ZERO_MASK) && mask == 0)                        \
+        return;                                                         \
+                                                                        \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \
+                           dst_stride, dst_line, dst_cnt);              \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type,           \
+                           src_stride, src_line, src_cnt);              \
+    /* The solid mask value is passed last, after the source pair. */   \
+    pixman_composite_##name##_asm_##cputype (width, height,             \
+                                             dst_line, dst_stride,      \
+                                             src_line, src_stride,      \
+                                             mask);                     \
+}
+
+#define PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST(cputype, name,           \
+                                               src_type, src_cnt,       \
+                                               mask_type, mask_cnt,     \
+                                               dst_type, dst_cnt)       \
+void                                                                    \
+pixman_composite_##name##_asm_##cputype (int32_t    w,                  \
+                                         int32_t    h,                  \
+                                         dst_type  *dst,                \
+                                         int32_t    dst_stride,         \
+                                         src_type  *src,                \
+                                         int32_t    src_stride,         \
+                                         mask_type *mask,               \
+                                         int32_t    mask_stride);       \
+/* Proxy for per-pixel source and mask images; no early-out flags. */   \
+static void                                                             \
+cputype##_composite_##name (pixman_implementation_t *imp,               \
+                            pixman_composite_info_t *info)              \
+{                                                                       \
+    PIXMAN_COMPOSITE_ARGS (info);                                       \
+    dst_type  *dst_line;						\
+    src_type  *src_line;                                                \
+    mask_type *mask_line;                                               \
+    int32_t    dst_stride, src_stride, mask_stride;                     \
+    /* Fill in the line pointer and stride of all three images. */      \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \
+                           dst_stride, dst_line, dst_cnt);              \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type,           \
+                           src_stride, src_line, src_cnt);              \
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type,       \
+                           mask_stride, mask_line, mask_cnt);           \
+    /* Argument order: width, height, then dst, src and mask pairs. */  \
+    pixman_composite_##name##_asm_##cputype (width, height,             \
+                                             dst_line, dst_stride,      \
+                                             src_line, src_stride,      \
+                                             mask_line, mask_stride);   \
+}
+
+#define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST(cputype, name, op,             \
+                                               src_type, dst_type)            \
+void                                                                          \
+pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (                \
+                                                   int32_t          w,        \
+                                                   dst_type *       dst,      \
+                                                   const src_type * src,      \
+                                                   pixman_fixed_t   vx,       \
+                                                   pixman_fixed_t   unit_x);  \
+/* Adapter: reorders the arguments; max_vx and zero_src are not forwarded. */ \
+static force_inline void                                                      \
+scaled_nearest_scanline_##cputype##_##name##_##op (dst_type *       pd,       \
+                                                   const src_type * ps,       \
+                                                   int32_t          w,        \
+                                                   pixman_fixed_t   vx,       \
+                                                   pixman_fixed_t   unit_x,   \
+                                                   pixman_fixed_t   max_vx,   \
+                                                   pixman_bool_t    zero_src) \
+{                                                                             \
+    pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps,  \
+                                                                  vx, unit_x);\
+}                                                                             \
+/* Expand FAST_NEAREST_MAINLOOP once each for COVER, NONE and PAD. */         \
+FAST_NEAREST_MAINLOOP (cputype##_##name##_cover_##op,                         \
+                       scaled_nearest_scanline_##cputype##_##name##_##op,     \
+                       src_type, dst_type, COVER)                             \
+FAST_NEAREST_MAINLOOP (cputype##_##name##_none_##op,                          \
+                       scaled_nearest_scanline_##cputype##_##name##_##op,     \
+                       src_type, dst_type, NONE)                              \
+FAST_NEAREST_MAINLOOP (cputype##_##name##_pad_##op,                           \
+                       scaled_nearest_scanline_##cputype##_##name##_##op,     \
+                       src_type, dst_type, PAD)
+
+/* Fast path table entries for the COVER, NONE and PAD nearest variants */
+#define PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH(op,s,d,func)                      \
+    SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),                             \
+    SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),                              \
+    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func)
+
+#define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST(flags, cputype, name, op,   \
+                                                  src_type, dst_type)         \
+void                                                                          \
+pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (                \
+                                                   int32_t          w,        \
+                                                   dst_type *       dst,      \
+                                                   const src_type * src,      \
+                                                   pixman_fixed_t   vx,       \
+                                                   pixman_fixed_t   unit_x,   \
+                                                   const uint8_t *  mask);    \
+/* Adapter: mask comes first here but last in the asm call; max_vx unused. */ \
+static force_inline void                                                      \
+scaled_nearest_scanline_##cputype##_##name##_##op (const uint8_t *  mask,     \
+                                                   dst_type *       pd,       \
+                                                   const src_type * ps,       \
+                                                   int32_t          w,        \
+                                                   pixman_fixed_t   vx,       \
+                                                   pixman_fixed_t   unit_x,   \
+                                                   pixman_fixed_t   max_vx,   \
+                                                   pixman_bool_t    zero_src) \
+{                                                                             \
+    if (((flags) & SKIP_ZERO_SRC) && zero_src)                                \
+        return;                                                               \
+    pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps,  \
+                                                                  vx, unit_x, \
+                                                                  mask);      \
+}                                                                             \
+/* (flags) above is parenthesized so an expression argument stays intact. */  \
+FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                  \
+                              scaled_nearest_scanline_##cputype##_##name##_##op,\
+                              src_type, uint8_t, dst_type, COVER, TRUE, FALSE)\
+FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_none_##op,                   \
+                              scaled_nearest_scanline_##cputype##_##name##_##op,\
+                              src_type, uint8_t, dst_type, NONE, TRUE, FALSE) \
+FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                    \
+                              scaled_nearest_scanline_##cputype##_##name##_##op,\
+                              src_type, uint8_t, dst_type, PAD, TRUE, FALSE)
+
+/* Fast path table entries for the COVER, NONE and PAD A8-mask variants */
+#define PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func)              \
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func),                     \
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func),                      \
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
+
+/*****************************************************************************/
+
+#define PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST(flags, cputype, name, op,     \
+                                                src_type, dst_type)           \
+void                                                                          \
+pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (               \
+                                                dst_type *       dst,         \
+                                                const src_type * top,         \
+                                                const src_type * bottom,      \
+                                                int              wt,          \
+                                                int              wb,          \
+                                                pixman_fixed_t   x,           \
+                                                pixman_fixed_t   ux,          \
+                                                int              width);      \
+/* Adapter: 'mask' and 'max_vx' are accepted but not forwarded to asm. */     \
+static force_inline void                                                      \
+scaled_bilinear_scanline_##cputype##_##name##_##op (                          \
+                                                dst_type *       dst,         \
+                                                const uint32_t * mask,        \
+                                                const src_type * src_top,     \
+                                                const src_type * src_bottom,  \
+                                                int32_t          w,           \
+                                                int              wt,          \
+                                                int              wb,          \
+                                                pixman_fixed_t   vx,          \
+                                                pixman_fixed_t   unit_x,      \
+                                                pixman_fixed_t   max_vx,      \
+                                                pixman_bool_t    zero_src)    \
+{                                                                             \
+    if (((flags) & SKIP_ZERO_SRC) && zero_src)                                \
+        return;                                                               \
+    pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (           \
+                            dst, src_top, src_bottom, wt, wb, vx, unit_x, w); \
+}                                                                             \
+/* (flags) above is parenthesized so an expression argument stays intact. */  \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                 \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint32_t, dst_type, COVER, FLAG_NONE)        \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op,                  \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint32_t, dst_type, NONE, FLAG_NONE)         \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                   \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint32_t, dst_type, PAD, FLAG_NONE)          \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_normal_##op,                \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint32_t, dst_type, NORMAL,                  \
+                       FLAG_NONE)
+
+
+#define PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST(flags, cputype, name, op,  \
+                                                src_type, dst_type)           \
+void                                                                          \
+pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (               \
+                                                dst_type *       dst,         \
+                                                const uint8_t *  mask,        \
+                                                const src_type * top,         \
+                                                const src_type * bottom,      \
+                                                int              wt,          \
+                                                int              wb,          \
+                                                pixman_fixed_t   x,           \
+                                                pixman_fixed_t   ux,          \
+                                                int              width);      \
+/* Adapter: forwards the 8-bit mask pointer; 'max_vx' is not forwarded. */    \
+static force_inline void                                                      \
+scaled_bilinear_scanline_##cputype##_##name##_##op (                          \
+                                                dst_type *       dst,         \
+                                                const uint8_t *  mask,        \
+                                                const src_type * src_top,     \
+                                                const src_type * src_bottom,  \
+                                                int32_t          w,           \
+                                                int              wt,          \
+                                                int              wb,          \
+                                                pixman_fixed_t   vx,          \
+                                                pixman_fixed_t   unit_x,      \
+                                                pixman_fixed_t   max_vx,      \
+                                                pixman_bool_t    zero_src)    \
+{                                                                             \
+    if (((flags) & SKIP_ZERO_SRC) && zero_src)                                \
+        return;                                                               \
+    pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (           \
+                      dst, mask, src_top, src_bottom, wt, wb, vx, unit_x, w); \
+}                                                                             \
+/* (flags) above is parenthesized so an expression argument stays intact. */  \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                 \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint8_t, dst_type, COVER,                    \
+                       FLAG_HAVE_NON_SOLID_MASK)                              \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op,                  \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint8_t, dst_type, NONE,                     \
+                       FLAG_HAVE_NON_SOLID_MASK)                              \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                   \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint8_t, dst_type, PAD,                      \
+                       FLAG_HAVE_NON_SOLID_MASK)                              \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_normal_##op,                \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint8_t, dst_type, NORMAL,                   \
+                       FLAG_HAVE_NON_SOLID_MASK)
+
+
+#endif
diff --git a/pixman/pixman/pixman-arm-neon-asm.S b/pixman/pixman/pixman-arm-neon-asm.S
index f76053dc3..3fcd07dc3 100644
--- a/pixman/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman/pixman-arm-neon-asm.S
@@ -1,3497 +1,3497 @@
-/*
- * Copyright © 2009 Nokia Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
- */
-
-/*
- * This file contains implementations of NEON optimized pixel processing
- * functions. There is no full and detailed tutorial, but some functions
- * (those which are exposing some new or interesting features) are
- * extensively commented and can be used as examples.
- *
- * You may want to have a look at the comments for following functions:
- *  - pixman_composite_over_8888_0565_asm_neon
- *  - pixman_composite_over_n_8_0565_asm_neon
- */
-
-/* Prevent the stack from becoming executable for no reason... */
-#if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack,"",%progbits
-#endif
-
-    .text
-    .fpu neon
-    .arch armv7a
-    .object_arch armv4
-    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
-    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
-    .arm
-    .altmacro
-    .p2align 2
-
-#include "pixman-arm-neon-asm.h"
-
-/* Global configuration options and preferences */
-
-/*
- * The code can optionally make use of unaligned memory accesses to improve
- * performance of handling leading/trailing pixels for each scanline.
- * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
- * example in linux if unaligned memory accesses are not configured to
- * generate.exceptions.
- */
-.set RESPECT_STRICT_ALIGNMENT, 1
-
-/*
- * Set default prefetch type. There is a choice between the following options:
- *
- * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
- * as NOP to workaround some HW bugs or for whatever other reason)
- *
- * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
- * advanced prefetch intruduces heavy overhead)
- *
- * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
- * which can run ARM and NEON instructions simultaneously so that extra ARM
- * instructions do not add (many) extra cycles, but improve prefetch efficiency)
- *
- * Note: some types of function can't support advanced prefetch and fallback
- *       to simple one (those which handle 24bpp pixels)
- */
-.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
-
-/* Prefetch distance in pixels for simple prefetch */
-.set PREFETCH_DISTANCE_SIMPLE, 64
-
-/*
- * Implementation of pixman_composite_over_8888_0565_asm_neon
- *
- * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
- * performs OVER compositing operation. Function fast_composite_over_8888_0565
- * from pixman-fast-path.c does the same in C and can be used as a reference.
- *
- * First we need to have some NEON assembly code which can do the actual
- * operation on the pixels and provide it to the template macro.
- *
- * Template macro quite conveniently takes care of emitting all the necessary
- * code for memory reading and writing (including quite tricky cases of
- * handling unaligned leading/trailing pixels), so we only need to deal with
- * the data in NEON registers.
- *
- * NEON registers allocation in general is recommented to be the following:
- * d0,  d1,  d2,  d3  - contain loaded source pixel data
- * d4,  d5,  d6,  d7  - contain loaded destination pixels (if they are needed)
- * d24, d25, d26, d27 - contain loading mask pixel data (if mask is used)
- * d28, d29, d30, d31 - place for storing the result (destination pixels)
- *
- * As can be seen above, four 64-bit NEON registers are used for keeping
- * intermediate pixel data and up to 8 pixels can be processed in one step
- * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
- *
- * This particular function uses the following registers allocation:
- * d0,  d1,  d2,  d3  - contain loaded source pixel data
- * d4,  d5            - contain loaded destination pixels (they are needed)
- * d28, d29           - place for storing the result (destination pixels)
- */
-
-/*
- * Step one. We need to have some code to do some arithmetics on pixel data.
- * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
- * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
- * perform all the needed calculations and write the result to {d28, d29}.
- * The rationale for having two macros and not just one will be explained
- * later. In practice, any single monolitic function which does the work can
- * be split into two parts in any arbitrary way without affecting correctness.
- *
- * There is one special trick here too. Common template macro can optionally
- * make our life a bit easier by doing R, G, B, A color components
- * deinterleaving for 32bpp pixel formats (and this feature is used in
- * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
- * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
- * actually use d0 register for blue channel (a vector of eight 8-bit
- * values), d1 register for green, d2 for red and d3 for alpha. This
- * simple conversion can be also done with a few NEON instructions:
- *
- * Packed to planar conversion:
- *  vuzp.8 d0, d1
- *  vuzp.8 d2, d3
- *  vuzp.8 d1, d3
- *  vuzp.8 d0, d2
- *
- * Planar to packed conversion:
- *  vzip.8 d0, d2
- *  vzip.8 d1, d3
- *  vzip.8 d2, d3
- *  vzip.8 d0, d1
- *
- * But pixel can be loaded directly in planar format using VLD4.8 NEON
- * instruction. It is 1 cycle slower than VLD1.32, so this is not always
- * desirable, that's why deinterleaving is optional.
- *
- * But anyway, here is the code:
- */
-.macro pixman_composite_over_8888_0565_process_pixblock_head
-    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
-       and put data into d6 - red, d7 - green, d30 - blue */
-    vshrn.u16   d6, q2, #8
-    vshrn.u16   d7, q2, #3
-    vsli.u16    q2, q2, #5
-    vsri.u8     d6, d6, #5
-    vmvn.8      d3, d3      /* invert source alpha */
-    vsri.u8     d7, d7, #6
-    vshrn.u16   d30, q2, #2
-    /* now do alpha blending, storing results in 8-bit planar format
-       into d16 - red, d19 - green, d18 - blue */
-    vmull.u8    q10, d3, d6
-    vmull.u8    q11, d3, d7
-    vmull.u8    q12, d3, d30
-    vrshr.u16   q13, q10, #8
-    vrshr.u16   q3, q11, #8
-    vrshr.u16   q15, q12, #8
-    vraddhn.u16 d20, q10, q13
-    vraddhn.u16 d23, q11, q3
-    vraddhn.u16 d22, q12, q15
-.endm
-
-.macro pixman_composite_over_8888_0565_process_pixblock_tail
-    /* ... continue alpha blending */
-    vqadd.u8    d16, d2, d20
-    vqadd.u8    q9, q0, q11
-    /* convert the result to r5g6b5 and store it into {d28, d29} */
-    vshll.u8    q14, d16, #8
-    vshll.u8    q8, d19, #8
-    vshll.u8    q9, d18, #8
-    vsri.u16    q14, q8, #5
-    vsri.u16    q14, q9, #11
-.endm
-
-/*
- * OK, now we got almost everything that we need. Using the above two
- * macros, the work can be done right. But now we want to optimize
- * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
- * a lot from good code scheduling and software pipelining.
- *
- * Let's construct some code, which will run in the core main loop.
- * Some pseudo-code of the main loop will look like this:
- *   head
- *   while (...) {
- *     tail
- *     head
- *   }
- *   tail
- *
- * It may look a bit weird, but this setup allows to hide instruction
- * latencies better and also utilize dual-issue capability more
- * efficiently (make pairs of load-store and ALU instructions).
- *
- * So what we need now is a '*_tail_head' macro, which will be used
- * in the core main loop. A trivial straightforward implementation
- * of this macro would look like this:
- *
- *   pixman_composite_over_8888_0565_process_pixblock_tail
- *   vst1.16     {d28, d29}, [DST_W, :128]!
- *   vld1.16     {d4, d5}, [DST_R, :128]!
- *   vld4.32     {d0, d1, d2, d3}, [SRC]!
- *   pixman_composite_over_8888_0565_process_pixblock_head
- *   cache_preload 8, 8
- *
- * Now it also got some VLD/VST instructions. We simply can't move from
- * processing one block of pixels to the other one with just arithmetics.
- * The previously processed data needs to be written to memory and new
- * data needs to be fetched. Fortunately, this main loop does not deal
- * with partial leading/trailing pixels and can load/store a full block
- * of pixels in a bulk. Additionally, destination buffer is already
- * 16 bytes aligned here (which is good for performance).
- *
- * New things here are DST_R, DST_W, SRC and MASK identifiers. These
- * are the aliases for ARM registers which are used as pointers for
- * accessing data. We maintain separate pointers for reading and writing
- * destination buffer (DST_R and DST_W).
- *
- * Another new thing is 'cache_preload' macro. It is used for prefetching
- * data into CPU L2 cache and improve performance when dealing with large
- * images which are far larger than cache size. It uses one argument
- * (actually two, but they need to be the same here) - number of pixels
- * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
- * details about this macro. Moreover, if good performance is needed
- * the code from this macro needs to be copied into '*_tail_head' macro
- * and mixed with the rest of code for optimal instructions scheduling.
- * We are actually doing it below.
- *
- * Now after all the explanations, here is the optimized code.
- * Different instruction streams (originaling from '*_head', '*_tail'
- * and 'cache_preload' macro) use different indentation levels for
- * better readability. Actually taking the code from one of these
- * indentation levels and ignoring a few VLD/VST instructions would
- * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
- * macro!
- */
-
-#if 1
-
-.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
-        vqadd.u8    d16, d2, d20
-    vld1.16     {d4, d5}, [DST_R, :128]!
-        vqadd.u8    q9, q0, q11
-    vshrn.u16   d6, q2, #8
-    fetch_src_pixblock
-    vshrn.u16   d7, q2, #3
-    vsli.u16    q2, q2, #5
-        vshll.u8    q14, d16, #8
-                                    PF add PF_X, PF_X, #8
-        vshll.u8    q8, d19, #8
-                                    PF tst PF_CTL, #0xF
-    vsri.u8     d6, d6, #5
-                                    PF addne PF_X, PF_X, #8
-    vmvn.8      d3, d3
-                                    PF subne PF_CTL, PF_CTL, #1
-    vsri.u8     d7, d7, #6
-    vshrn.u16   d30, q2, #2
-    vmull.u8    q10, d3, d6
-                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
-    vmull.u8    q11, d3, d7
-    vmull.u8    q12, d3, d30
-                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
-        vsri.u16    q14, q8, #5
-                                    PF cmp PF_X, ORIG_W
-        vshll.u8    q9, d18, #8
-    vrshr.u16   q13, q10, #8
-                                    PF subge PF_X, PF_X, ORIG_W
-    vrshr.u16   q3, q11, #8
-    vrshr.u16   q15, q12, #8
-                                    PF subges PF_CTL, PF_CTL, #0x10
-        vsri.u16    q14, q9, #11
-                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
-    vraddhn.u16 d20, q10, q13
-    vraddhn.u16 d23, q11, q3
-                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
-    vraddhn.u16 d22, q12, q15
-        vst1.16     {d28, d29}, [DST_W, :128]!
-.endm
-
-#else
-
-/* If we did not care much about the performance, we would just use this... */
-.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
-    pixman_composite_over_8888_0565_process_pixblock_tail
-    vst1.16     {d28, d29}, [DST_W, :128]!
-    vld1.16     {d4, d5}, [DST_R, :128]!
-    fetch_src_pixblock
-    pixman_composite_over_8888_0565_process_pixblock_head
-    cache_preload 8, 8
-.endm
-
-#endif
-
-/*
- * And now the final part. We are using 'generate_composite_function' macro
- * to put all the stuff together. We are specifying the name of the function
- * which we want to get, number of bits per pixel for the source, mask and
- * destination (0 if unused, like mask in this case). Next come some bit
- * flags:
- *   FLAG_DST_READWRITE      - tells that the destination buffer is both read
- *                             and written, for write-only buffer we would use
- *                             FLAG_DST_WRITEONLY flag instead
- *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
- *                             and separate color channels for 32bpp format.
- * The next things are:
- *  - the number of pixels processed per iteration (8 in this case, because
- *    that's the maximum what can fit into four 64-bit NEON registers).
- *  - prefetch distance, measured in pixel blocks. In this case it is 5 times
- *    by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
- *    prefetch distance can be selected by running some benchmarks.
- *
- * After that we specify some macros, these are 'default_init',
- * 'default_cleanup' here which are empty (but it is possible to have custom
- * init/cleanup macros to be able to save/restore some extra NEON registers
- * like d8-d15 or do anything else) followed by
- * 'pixman_composite_over_8888_0565_process_pixblock_head',
- * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
- * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
- * which we got implemented above.
- *
- * The last part is the NEON registers allocation scheme.
- */
-generate_composite_function \
-    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    default_init, \
-    default_cleanup, \
-    pixman_composite_over_8888_0565_process_pixblock_head, \
-    pixman_composite_over_8888_0565_process_pixblock_tail, \
-    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
-    28, /* dst_w_basereg */ \
-    4,  /* dst_r_basereg */ \
-    0,  /* src_basereg   */ \
-    24  /* mask_basereg  */
-
-/******************************************************************************/
-
-.macro pixman_composite_over_n_0565_process_pixblock_head
-    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
-       and put data into d6 - red, d7 - green, d30 - blue */
-    vshrn.u16   d6, q2, #8
-    vshrn.u16   d7, q2, #3
-    vsli.u16    q2, q2, #5
-    vsri.u8     d6, d6, #5
-    vsri.u8     d7, d7, #6
-    vshrn.u16   d30, q2, #2
-    /* now do alpha blending, storing results in 8-bit planar format
-       into d16 - red, d19 - green, d18 - blue */
-    vmull.u8    q10, d3, d6
-    vmull.u8    q11, d3, d7
-    vmull.u8    q12, d3, d30
-    vrshr.u16   q13, q10, #8
-    vrshr.u16   q3, q11, #8
-    vrshr.u16   q15, q12, #8
-    vraddhn.u16 d20, q10, q13
-    vraddhn.u16 d23, q11, q3
-    vraddhn.u16 d22, q12, q15
-.endm
-
-.macro pixman_composite_over_n_0565_process_pixblock_tail
-    /* ... continue alpha blending */
-    vqadd.u8    d16, d2, d20
-    vqadd.u8    q9, q0, q11
-    /* convert the result to r5g6b5 and store it into {d28, d29} */
-    vshll.u8    q14, d16, #8
-    vshll.u8    q8, d19, #8
-    vshll.u8    q9, d18, #8
-    vsri.u16    q14, q8, #5
-    vsri.u16    q14, q9, #11
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_over_n_0565_process_pixblock_tail_head
-    pixman_composite_over_n_0565_process_pixblock_tail
-    vld1.16     {d4, d5}, [DST_R, :128]!
-    vst1.16     {d28, d29}, [DST_W, :128]!
-    pixman_composite_over_n_0565_process_pixblock_head
-    cache_preload 8, 8
-.endm
-
-.macro pixman_composite_over_n_0565_init
-    add         DUMMY, sp, #ARGS_STACK_OFFSET
-    vld1.32     {d3[0]}, [DUMMY]
-    vdup.8      d0, d3[0]
-    vdup.8      d1, d3[1]
-    vdup.8      d2, d3[2]
-    vdup.8      d3, d3[3]
-    vmvn.8      d3, d3      /* invert source alpha */
-.endm
-
-generate_composite_function \
-    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
-    FLAG_DST_READWRITE, \
-    8, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    pixman_composite_over_n_0565_init, \
-    default_cleanup, \
-    pixman_composite_over_n_0565_process_pixblock_head, \
-    pixman_composite_over_n_0565_process_pixblock_tail, \
-    pixman_composite_over_n_0565_process_pixblock_tail_head, \
-    28, /* dst_w_basereg */ \
-    4,  /* dst_r_basereg */ \
-    0,  /* src_basereg   */ \
-    24  /* mask_basereg  */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_8888_0565_process_pixblock_head
-    vshll.u8    q8, d1, #8
-    vshll.u8    q14, d2, #8
-    vshll.u8    q9, d0, #8
-.endm
-
-.macro pixman_composite_src_8888_0565_process_pixblock_tail
-    vsri.u16    q14, q8, #5
-    vsri.u16    q14, q9, #11
-.endm
-
-.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
-        vsri.u16    q14, q8, #5
-                                    PF add PF_X, PF_X, #8
-                                    PF tst PF_CTL, #0xF
-    fetch_src_pixblock
-                                    PF addne PF_X, PF_X, #8
-                                    PF subne PF_CTL, PF_CTL, #1
-        vsri.u16    q14, q9, #11
-                                    PF cmp PF_X, ORIG_W
-                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
-    vshll.u8    q8, d1, #8
-        vst1.16     {d28, d29}, [DST_W, :128]!
-                                    PF subge PF_X, PF_X, ORIG_W
-                                    PF subges PF_CTL, PF_CTL, #0x10
-    vshll.u8    q14, d2, #8
-                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
-    vshll.u8    q9, d0, #8
-.endm
-
-generate_composite_function \
-    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
-    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    10, /* prefetch distance */ \
-    default_init, \
-    default_cleanup, \
-    pixman_composite_src_8888_0565_process_pixblock_head, \
-    pixman_composite_src_8888_0565_process_pixblock_tail, \
-    pixman_composite_src_8888_0565_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_src_0565_8888_process_pixblock_head
-    vshrn.u16   d30, q0, #8
-    vshrn.u16   d29, q0, #3
-    vsli.u16    q0, q0, #5
-    vmov.u8     d31, #255
-    vsri.u8     d30, d30, #5
-    vsri.u8     d29, d29, #6
-    vshrn.u16   d28, q0, #2
-.endm
-
-.macro pixman_composite_src_0565_8888_process_pixblock_tail
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
-    pixman_composite_src_0565_8888_process_pixblock_tail
-    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
-    fetch_src_pixblock
-    pixman_composite_src_0565_8888_process_pixblock_head
-    cache_preload 8, 8
-.endm
-
-generate_composite_function \
-    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
-    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    10, /* prefetch distance */ \
-    default_init, \
-    default_cleanup, \
-    pixman_composite_src_0565_8888_process_pixblock_head, \
-    pixman_composite_src_0565_8888_process_pixblock_tail, \
-    pixman_composite_src_0565_8888_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_add_8_8_process_pixblock_head
-    vqadd.u8    q14, q0, q2
-    vqadd.u8    q15, q1, q3
-.endm
-
-.macro pixman_composite_add_8_8_process_pixblock_tail
-.endm
-
-.macro pixman_composite_add_8_8_process_pixblock_tail_head
-    fetch_src_pixblock
-                                    PF add PF_X, PF_X, #32
-                                    PF tst PF_CTL, #0xF
-    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
-                                    PF addne PF_X, PF_X, #32
-                                    PF subne PF_CTL, PF_CTL, #1
-        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
-                                    PF cmp PF_X, ORIG_W
-                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
-                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
-                                    PF subge PF_X, PF_X, ORIG_W
-                                    PF subges PF_CTL, PF_CTL, #0x10
-    vqadd.u8    q14, q0, q2
-                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
-                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
-    vqadd.u8    q15, q1, q3
-.endm
-
-generate_composite_function \
-    pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
-    FLAG_DST_READWRITE, \
-    32, /* number of pixels, processed in a single block */ \
-    10, /* prefetch distance */ \
-    default_init, \
-    default_cleanup, \
-    pixman_composite_add_8_8_process_pixblock_head, \
-    pixman_composite_add_8_8_process_pixblock_tail, \
-    pixman_composite_add_8_8_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
-    fetch_src_pixblock
-                                    PF add PF_X, PF_X, #8
-                                    PF tst PF_CTL, #0xF
-    vld1.32     {d4, d5, d6, d7}, [DST_R, :128]!
-                                    PF addne PF_X, PF_X, #8
-                                    PF subne PF_CTL, PF_CTL, #1
-        vst1.32     {d28, d29, d30, d31}, [DST_W, :128]!
-                                    PF cmp PF_X, ORIG_W
-                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
-                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
-                                    PF subge PF_X, PF_X, ORIG_W
-                                    PF subges PF_CTL, PF_CTL, #0x10
-    vqadd.u8    q14, q0, q2
-                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
-                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
-    vqadd.u8    q15, q1, q3
-.endm
-
-generate_composite_function \
-    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
-    FLAG_DST_READWRITE, \
-    8, /* number of pixels, processed in a single block */ \
-    10, /* prefetch distance */ \
-    default_init, \
-    default_cleanup, \
-    pixman_composite_add_8_8_process_pixblock_head, \
-    pixman_composite_add_8_8_process_pixblock_tail, \
-    pixman_composite_add_8888_8888_process_pixblock_tail_head
-
-generate_composite_function_single_scanline \
-    pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
-    FLAG_DST_READWRITE, \
-    8, /* number of pixels, processed in a single block */ \
-    default_init, \
-    default_cleanup, \
-    pixman_composite_add_8_8_process_pixblock_head, \
-    pixman_composite_add_8_8_process_pixblock_tail, \
-    pixman_composite_add_8888_8888_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
-    vmvn.8      d24, d3  /* get inverted alpha */
-    /* do alpha blending */
-    vmull.u8    q8, d24, d4
-    vmull.u8    q9, d24, d5
-    vmull.u8    q10, d24, d6
-    vmull.u8    q11, d24, d7
-.endm
-
-.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
-    vrshr.u16   q14, q8, #8
-    vrshr.u16   q15, q9, #8
-    vrshr.u16   q12, q10, #8
-    vrshr.u16   q13, q11, #8
-    vraddhn.u16 d28, q14, q8
-    vraddhn.u16 d29, q15, q9
-    vraddhn.u16 d30, q12, q10
-    vraddhn.u16 d31, q13, q11
-.endm
-
-.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
-    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
-        vrshr.u16   q14, q8, #8
-                                    PF add PF_X, PF_X, #8
-                                    PF tst PF_CTL, #0xF
-        vrshr.u16   q15, q9, #8
-        vrshr.u16   q12, q10, #8
-        vrshr.u16   q13, q11, #8
-                                    PF addne PF_X, PF_X, #8
-                                    PF subne PF_CTL, PF_CTL, #1
-        vraddhn.u16 d28, q14, q8
-        vraddhn.u16 d29, q15, q9
-                                    PF cmp PF_X, ORIG_W
-        vraddhn.u16 d30, q12, q10
-        vraddhn.u16 d31, q13, q11
-    fetch_src_pixblock
-                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
-    vmvn.8      d22, d3
-                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
-        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
-                                    PF subge PF_X, PF_X, ORIG_W
-    vmull.u8    q8, d22, d4
-                                    PF subges PF_CTL, PF_CTL, #0x10
-    vmull.u8    q9, d22, d5
-                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
-    vmull.u8    q10, d22, d6
-                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
-    vmull.u8    q11, d22, d7
-.endm
-
-generate_composite_function_single_scanline \
-    pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    default_init, \
-    default_cleanup, \
-    pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
-    pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
-    pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_over_8888_8888_process_pixblock_head
-    pixman_composite_out_reverse_8888_8888_process_pixblock_head
-.endm
-
-.macro pixman_composite_over_8888_8888_process_pixblock_tail
-    pixman_composite_out_reverse_8888_8888_process_pixblock_tail
-    vqadd.u8    q14, q0, q14
-    vqadd.u8    q15, q1, q15
-.endm
-
-.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
-    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
-        vrshr.u16   q14, q8, #8
-                                    PF add PF_X, PF_X, #8
-                                    PF tst PF_CTL, #0xF
-        vrshr.u16   q15, q9, #8
-        vrshr.u16   q12, q10, #8
-        vrshr.u16   q13, q11, #8
-                                    PF addne PF_X, PF_X, #8
-                                    PF subne PF_CTL, PF_CTL, #1
-        vraddhn.u16 d28, q14, q8
-        vraddhn.u16 d29, q15, q9
-                                    PF cmp PF_X, ORIG_W
-        vraddhn.u16 d30, q12, q10
-        vraddhn.u16 d31, q13, q11
-        vqadd.u8    q14, q0, q14
-        vqadd.u8    q15, q1, q15
-    fetch_src_pixblock
-                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
-    vmvn.8      d22, d3
-                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
-        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
-                                    PF subge PF_X, PF_X, ORIG_W
-    vmull.u8    q8, d22, d4
-                                    PF subges PF_CTL, PF_CTL, #0x10
-    vmull.u8    q9, d22, d5
-                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
-    vmull.u8    q10, d22, d6
-                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
-    vmull.u8    q11, d22, d7
-.endm
-
-generate_composite_function \
-    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    default_init, \
-    default_cleanup, \
-    pixman_composite_over_8888_8888_process_pixblock_head, \
-    pixman_composite_over_8888_8888_process_pixblock_tail, \
-    pixman_composite_over_8888_8888_process_pixblock_tail_head
-
-generate_composite_function_single_scanline \
-    pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    default_init, \
-    default_cleanup, \
-    pixman_composite_over_8888_8888_process_pixblock_head, \
-    pixman_composite_over_8888_8888_process_pixblock_tail, \
-    pixman_composite_over_8888_8888_process_pixblock_tail_head
-
-/******************************************************************************/
-/*
- * over_n_8888: per its name, OVER of a solid colour onto a 32bpp destination.
- *
- * Head: starts the blend of one 8-pixel block with a widening u8 x u8 -> u16
- * multiply of every destination plane (d4-d7) by the inverted alpha in d24,
- * leaving the products in q8-q11.
- *
- * NOTE(review): in the visible part of this file this macro is not passed
- * to generate_composite_function for over_n_8888; the instantiation further
- * down uses the over_8888_8888 head instead.
- */
-.macro pixman_composite_over_n_8888_process_pixblock_head
-    /* deinterleaved source pixels in {d0, d1, d2, d3} */
-    /* inverted alpha in {d24} */
-    /* destination pixels in {d4, d5, d6, d7} */
-    vmull.u8    q8, d24, d4
-    vmull.u8    q9, d24, d5
-    vmull.u8    q10, d24, d6
-    vmull.u8    q11, d24, d7
-.endm
-/*
- * Tail: finishes the blend started by the head. Every 16-bit product x in
- * q8-q11 is reduced to 8 bits as (x + ((x + 128) >> 8) + 128) >> 8
- * (vrshr followed by vraddhn, i.e. a rounded division by 255) into d28-d31,
- * then the source planes q0/q1 are added with unsigned saturation.
- *
- * NOTE(review): like the head above, this macro is not the one passed to the
- * over_n_8888 instantiation below (it uses the over_8888_8888 tail).
- */
-.macro pixman_composite_over_n_8888_process_pixblock_tail
-    vrshr.u16   q14, q8, #8
-    vrshr.u16   q15, q9, #8
-    vrshr.u16   q2, q10, #8
-    vrshr.u16   q3, q11, #8
-    vraddhn.u16 d28, q14, q8
-    vraddhn.u16 d29, q15, q9
-    vraddhn.u16 d30, q2, q10
-    vraddhn.u16 d31, q3, q11
-    vqadd.u8    q14, q0, q14
-    vqadd.u8    q15, q1, q15
-.endm
-/*
- * Tail_head: one loop iteration with the tail of the previous block and the
- * head of the next block interleaved.
- *   - lines indented by 8 spaces repeat the tail sequence and store the
- *     finished pixels d28-d31 through DST_W;
- *   - lines indented by 4 spaces load the next eight destination pixels
- *     into d4-d7 through DST_R and redo the head multiplies by d24;
- *   - the far-right 'PF ...' lines update PF_X / PF_CTL and issue pld for
- *     the destination. PF is defined outside this excerpt; presumably it
- *     emits its instruction only when prefetching is enabled - TODO confirm.
- */
-.macro pixman_composite_over_n_8888_process_pixblock_tail_head
-        vrshr.u16   q14, q8, #8
-        vrshr.u16   q15, q9, #8
-        vrshr.u16   q2, q10, #8
-        vrshr.u16   q3, q11, #8
-        vraddhn.u16 d28, q14, q8
-        vraddhn.u16 d29, q15, q9
-        vraddhn.u16 d30, q2, q10
-        vraddhn.u16 d31, q3, q11
-    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
-        vqadd.u8    q14, q0, q14
-                                    PF add PF_X, PF_X, #8
-                                    PF tst PF_CTL, #0x0F
-                                    PF addne PF_X, PF_X, #8
-                                    PF subne PF_CTL, PF_CTL, #1
-        vqadd.u8    q15, q1, q15
-                                    PF cmp PF_X, ORIG_W
-    vmull.u8    q8, d24, d4
-                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
-    vmull.u8    q9, d24, d5
-                                    PF subge PF_X, PF_X, ORIG_W
-    vmull.u8    q10, d24, d6
-                                    PF subges PF_CTL, PF_CTL, #0x10
-    vmull.u8    q11, d24, d7
-                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
-        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
-.endm
-/*
- * Init: loads one 32-bit word from the stack at sp + ARGS_STACK_OFFSET
- * (presumably the solid source colour) into lane 0 of d3, replicates its
- * bytes 0..3 across all lanes of d0..d3 respectively, and precomputes
- * d24 = ~d3 for the blend multiplies.
- */
-.macro pixman_composite_over_n_8888_init
-    add         DUMMY, sp, #ARGS_STACK_OFFSET
-    vld1.32     {d3[0]}, [DUMMY]
-    vdup.8      d0, d3[0]
-    vdup.8      d1, d3[1]
-    vdup.8      d2, d3[2]
-    vdup.8      d3, d3[3]
-    vmvn.8      d24, d3  /* get inverted alpha */
-.endm
-/*
- * Instantiation. Head and tail are the over_8888_8888 macros (defined
- * outside this excerpt); only tail_head and init are the over_n_8888
- * versions defined above. No *_basereg arguments are given, so whatever
- * defaults generate_composite_function provides apply.
- */
-generate_composite_function \
-    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    pixman_composite_over_n_8888_init, \
-    default_cleanup, \
-    pixman_composite_over_8888_8888_process_pixblock_head, \
-    pixman_composite_over_8888_8888_process_pixblock_tail, \
-    pixman_composite_over_n_8888_process_pixblock_tail_head
-
-/******************************************************************************/
-/*
- * over_reverse_n_8888 tail_head: one loop iteration with the end of the
- * previous block and the start of the next block interleaved.
- *   - 8-space lines: reduce the products in q8-q11 with the rounded
- *     divide-by-255 sequence (vrshr + vraddhn), saturating-add q0/q1 (the
- *     destination pixels of the previous block) and store d28-d31 via DST_W;
- *   - 4-space lines: load the next eight destination pixels, deinterleaved,
- *     into d0-d3, compute d22 = ~d3 (d3 holds byte 3 of every pixel,
- *     presumably alpha) and multiply d22 by the planes d4-d7 that the init
- *     macro below filled;
- *   - 'PF ...' lines: prefetch bookkeeping and pld on the destination (PF
- *     is defined outside this excerpt).
- */
-.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
-        vrshr.u16   q14, q8, #8
-                                    PF add PF_X, PF_X, #8
-                                    PF tst PF_CTL, #0xF
-        vrshr.u16   q15, q9, #8
-        vrshr.u16   q12, q10, #8
-        vrshr.u16   q13, q11, #8
-                                    PF addne PF_X, PF_X, #8
-                                    PF subne PF_CTL, PF_CTL, #1
-        vraddhn.u16 d28, q14, q8
-        vraddhn.u16 d29, q15, q9
-                                    PF cmp PF_X, ORIG_W
-        vraddhn.u16 d30, q12, q10
-        vraddhn.u16 d31, q13, q11
-        vqadd.u8    q14, q0, q14
-        vqadd.u8    q15, q1, q15
-    vld4.8      {d0, d1, d2, d3}, [DST_R, :128]!
-    vmvn.8      d22, d3
-                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
-        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
-                                    PF subge PF_X, PF_X, ORIG_W
-    vmull.u8    q8, d22, d4
-                                    PF subges PF_CTL, PF_CTL, #0x10
-    vmull.u8    q9, d22, d5
-    vmull.u8    q10, d22, d6
-                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
-    vmull.u8    q11, d22, d7
-.endm
-/*
- * Init: loads one 32-bit word from sp + ARGS_STACK_OFFSET (presumably the
- * solid source colour) into lane 0 of d7 and replicates its bytes 0..3
- * across all lanes of d4..d7 respectively.
- */
-.macro pixman_composite_over_reverse_n_8888_init
-    add         DUMMY, sp, #ARGS_STACK_OFFSET
-    vld1.32     {d7[0]}, [DUMMY]
-    vdup.8      d4, d7[0]
-    vdup.8      d5, d7[1]
-    vdup.8      d6, d7[2]
-    vdup.8      d7, d7[3]
-.endm
-/*
- * Instantiation. Head and tail are the over_8888_8888 macros (defined
- * outside this excerpt). The explicit base registers put the destination
- * read data in d0.. and the solid source in d4.., matching the register
- * use of the tail_head and init macros above.
- */
-generate_composite_function \
-    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    pixman_composite_over_reverse_n_8888_init, \
-    default_cleanup, \
-    pixman_composite_over_8888_8888_process_pixblock_head, \
-    pixman_composite_over_8888_8888_process_pixblock_tail, \
-    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
-    28, /* dst_w_basereg */ \
-    0,  /* dst_r_basereg */ \
-    4,  /* src_basereg   */ \
-    24  /* mask_basereg  */
-
-/******************************************************************************/
-/*
- * over_8888_8_0565 head (8 pixels per block).
- *   input:  source planes in d8-d11, mask in d24, eight r5g6b5 destination
- *           pixels in q2 (d4, d5)
- *   steps:  1. source IN mask: every source plane times d24, divided by 255
- *              with rounding (vrshr + vraddhn), result in d0-d3;
- *           2. destination unpacked to 8-bit planes: d6 from bits 15..11,
- *              d7 from bits 10..5, d30 from bits 4..0 of each pixel (red,
- *              green, blue for r5g6b5), top bits replicated into the low
- *              bits by vsri / vsli;
- *           3. d3 is inverted and multiplied with d6, d7, d30.
- *   output: 16-bit products in q8, q9, q10; masked source in d0-d2.
- */
-.macro pixman_composite_over_8888_8_0565_process_pixblock_head
-    vmull.u8    q0,  d24, d8    /* IN for SRC pixels (part1) */
-    vmull.u8    q1,  d24, d9
-    vmull.u8    q6,  d24, d10
-    vmull.u8    q7,  d24, d11
-        vshrn.u16   d6,  q2, #8 /* convert DST_R data to 32-bpp (part1) */
-        vshrn.u16   d7,  q2, #3
-        vsli.u16    q2,  q2, #5
-    vrshr.u16   q8,  q0,  #8    /* IN for SRC pixels (part2) */
-    vrshr.u16   q9,  q1,  #8
-    vrshr.u16   q10, q6,  #8
-    vrshr.u16   q11, q7,  #8
-    vraddhn.u16 d0,  q0,  q8
-    vraddhn.u16 d1,  q1,  q9
-    vraddhn.u16 d2,  q6,  q10
-    vraddhn.u16 d3,  q7,  q11
-        vsri.u8     d6,  d6, #5 /* convert DST_R data to 32-bpp (part2) */
-        vsri.u8     d7,  d7, #6
-    vmvn.8      d3,  d3
-        vshrn.u16   d30, q2, #2
-    vmull.u8    q8,  d3, d6     /* now do alpha blending */
-    vmull.u8    q9,  d3, d7
-    vmull.u8    q10, d3, d30
-.endm
-/*
- * Tail: divides the products q8, q9, q10 by 255 with rounding into d16,
- * d27, d26, saturating-adds the masked source (d2 to d16, q0 = {d0, d1} to
- * q13 = {d26, d27}) and packs the three 8-bit planes back into eight
- * r5g6b5 pixels in q14 (d28, d29): d16 supplies bits 15..11, d19 bits
- * 10..5 and d18 bits 4..0.
- */
-.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
-    /* 3 cycle bubble (after vmull.u8) */
-    vrshr.u16   q13, q8,  #8
-    vrshr.u16   q11, q9,  #8
-    vrshr.u16   q15, q10, #8
-    vraddhn.u16 d16, q8,  q13
-    vraddhn.u16 d27, q9,  q11
-    vraddhn.u16 d26, q10, q15
-    vqadd.u8    d16, d2,  d16
-    /* 1 cycle bubble */
-    vqadd.u8    q9,  q0,  q13
-    vshll.u8    q14, d16, #8    /* convert to 16bpp */
-    vshll.u8    q8,  d19, #8
-    vshll.u8    q9,  d18, #8
-    vsri.u16    q14, q8,  #5
-    /* 1 cycle bubble */
-    vsri.u16    q14, q9,  #11
-.endm
-/*
- * Tail_head: one loop iteration with the tail of the previous block
- * (8-space lines) interleaved with the loads and the head of the next
- * block (4-space lines). fetch_mask_pixblock, fetch_src_pixblock and
- * cache_preload are helper macros defined outside this excerpt. The packed
- * result of the previous block (d28, d29) is stored through DST_W before
- * the final multiplies of the new block.
- */
-.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
-    vld1.16     {d4, d5}, [DST_R, :128]!
-    vshrn.u16   d6,  q2,  #8
-    fetch_mask_pixblock
-    vshrn.u16   d7,  q2,  #3
-    fetch_src_pixblock
-    vmull.u8    q6,  d24, d10
-        vrshr.u16   q13, q8,  #8
-        vrshr.u16   q11, q9,  #8
-        vrshr.u16   q15, q10, #8
-        vraddhn.u16 d16, q8,  q13
-        vraddhn.u16 d27, q9,  q11
-        vraddhn.u16 d26, q10, q15
-        vqadd.u8    d16, d2,  d16
-    vmull.u8    q1,  d24, d9
-        vqadd.u8    q9,  q0,  q13
-        vshll.u8    q14, d16, #8
-    vmull.u8    q0,  d24, d8
-        vshll.u8    q8,  d19, #8
-        vshll.u8    q9,  d18, #8
-        vsri.u16    q14, q8,  #5
-    vmull.u8    q7,  d24, d11
-        vsri.u16    q14, q9,  #11
-
-    cache_preload 8, 8
-
-    vsli.u16    q2,  q2,  #5
-    vrshr.u16   q8,  q0,  #8
-    vrshr.u16   q9,  q1,  #8
-    vrshr.u16   q10, q6,  #8
-    vrshr.u16   q11, q7,  #8
-    vraddhn.u16 d0,  q0,  q8
-    vraddhn.u16 d1,  q1,  q9
-    vraddhn.u16 d2,  q6,  q10
-    vraddhn.u16 d3,  q7,  q11
-    vsri.u8     d6,  d6,  #5
-    vsri.u8     d7,  d7,  #6
-    vmvn.8      d3,  d3
-    vshrn.u16   d30, q2,  #2
-    vst1.16     {d28, d29}, [DST_W, :128]!
-    vmull.u8    q8,  d3,  d6
-    vmull.u8    q9,  d3,  d7
-    vmull.u8    q10, d3,  d30
-.endm
-/*
- * Instantiation: 32bpp source, 8bpp mask, 16bpp destination. The macros
- * above write q6 and q7 (d12-d15), so the *_need_all_regs init/cleanup
- * variants are used (defined outside this excerpt; presumably they save
- * and restore the callee-saved NEON registers - TODO confirm).
- */
-generate_composite_function \
-    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    default_init_need_all_regs, \
-    default_cleanup_need_all_regs, \
-    pixman_composite_over_8888_8_0565_process_pixblock_head, \
-    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
-    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
-    28, /* dst_w_basereg */ \
-    4,  /* dst_r_basereg */ \
-    8,  /* src_basereg   */ \
-    24  /* mask_basereg  */
-
-/******************************************************************************/
-
-/*
- * This function needs a special initialization of the solid source.
- * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
- * offset, split into color components and replicated in d8-d11
- * registers. Additionally, this function needs all the NEON registers,
- * so it has to save d8-d15 registers which are callee saved according
- * to ABI. These registers are restored from 'cleanup' macro. All the
- * other NEON registers are caller saved, so can be clobbered freely
- * without introducing any problems.
- *
- * Note that DUMMY is computed from sp before vpush moves sp, so the
- * load still addresses the original argument slot.
- */
-.macro pixman_composite_over_n_8_0565_init
-    add         DUMMY, sp, #ARGS_STACK_OFFSET
-    vpush       {d8-d15}
-    vld1.32     {d11[0]}, [DUMMY]
-    vdup.8      d8, d11[0]
-    vdup.8      d9, d11[1]
-    vdup.8      d10, d11[2]
-    vdup.8      d11, d11[3]
-.endm
-/* Cleanup: restores d8-d15 pushed by the init macro above. */
-.macro pixman_composite_over_n_8_0565_cleanup
-    vpop        {d8-d15}
-.endm
-/*
- * Instantiation: no source image (0 bpp), 8bpp mask, 16bpp destination.
- * The per-block work is done by the over_8888_8_0565 macros, which read
- * the source planes from d8-d11 - the registers filled by init above.
- */
-generate_composite_function \
-    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
-    FLAG_DST_READWRITE, \
-    8, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    pixman_composite_over_n_8_0565_init, \
-    pixman_composite_over_n_8_0565_cleanup, \
-    pixman_composite_over_8888_8_0565_process_pixblock_head, \
-    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
-    pixman_composite_over_8888_8_0565_process_pixblock_tail_head
-
-/******************************************************************************/
-/*
- * over_8888_n_0565 init: saves d8-d15, then loads one 32-bit word from
- * sp + ARGS_STACK_OFFSET + 8 (address taken before the vpush; presumably
- * the solid mask argument - TODO confirm) and replicates its byte 3 across
- * d24, the register the pixel-block macros use as the mask.
- */
-.macro pixman_composite_over_8888_n_0565_init
-    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
-    vpush       {d8-d15}
-    vld1.32     {d24[0]}, [DUMMY]
-    vdup.8      d24, d24[3]
-.endm
-/* Cleanup: restores d8-d15 pushed by the init macro above. */
-.macro pixman_composite_over_8888_n_0565_cleanup
-    vpop        {d8-d15}
-.endm
-/*
- * Instantiation: 32bpp source, no mask image (0 bpp), 16bpp destination.
- * Reuses the over_8888_8_0565 pixel-block macros with the constant mask
- * that init placed in d24.
- */
-generate_composite_function \
-    pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    pixman_composite_over_8888_n_0565_init, \
-    pixman_composite_over_8888_n_0565_cleanup, \
-    pixman_composite_over_8888_8_0565_process_pixblock_head, \
-    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
-    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
-    28, /* dst_w_basereg */ \
-    4,  /* dst_r_basereg */ \
-    8,  /* src_basereg   */ \
-    24  /* mask_basereg  */
-
-/******************************************************************************/
-/* src_0565_0565 head: empty, the fetched pixels are stored unmodified. */
-.macro pixman_composite_src_0565_0565_process_pixblock_head
-.endm
-/* src_0565_0565 tail: empty, nothing to finish. */
-.macro pixman_composite_src_0565_0565_process_pixblock_tail
-.endm
-/*
- * Tail_head: stores the current block d0-d3 (32 bytes = 16 pixels of 16
- * bits) through DST_W, then fetches the next source block and issues the
- * cache preload (both helper macros are defined outside this excerpt).
- */
-.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
-    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
-    fetch_src_pixblock
-    cache_preload 16, 16
-.endm
-/* Instantiation: 16bpp source to 16bpp destination, all base registers 0. */
-generate_composite_function \
-    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
-    FLAG_DST_WRITEONLY, \
-    16, /* number of pixels, processed in a single block */ \
-    10, /* prefetch distance */ \
-    default_init, \
-    default_cleanup, \
-    pixman_composite_src_0565_0565_process_pixblock_head, \
-    pixman_composite_src_0565_0565_process_pixblock_tail, \
-    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
-    0, /* dst_w_basereg */ \
-    0, /* dst_r_basereg */ \
-    0, /* src_basereg   */ \
-    0  /* mask_basereg  */
-
-/******************************************************************************/
-/* src_n_8 head: empty, the fill pattern is prepared once by the init macro. */
-.macro pixman_composite_src_n_8_process_pixblock_head
-.endm
-/* src_n_8 tail: empty, nothing to finish. */
-.macro pixman_composite_src_n_8_process_pixblock_tail
-.endm
-/* Tail_head: stores d0-d3 (32 bytes = 32 pixels of 8 bits) through DST_W. */
-.macro pixman_composite_src_n_8_process_pixblock_tail_head
-    vst1.8  {d0, d1, d2, d3}, [DST_W, :128]!
-.endm
-/*
- * Init: loads one 32-bit word from sp + ARGS_STACK_OFFSET into lane 0 of
- * d0. The three vsli.u64 steps (by 8, 16 and 32 bits) copy the lowest byte
- * of that word into all eight bytes of d0; d0 is then copied to d1 and q0
- * to q1, so d0-d3 hold the same byte 32 times.
- */
-.macro pixman_composite_src_n_8_init
-    add         DUMMY, sp, #ARGS_STACK_OFFSET
-    vld1.32     {d0[0]}, [DUMMY]
-    vsli.u64    d0, d0, #8
-    vsli.u64    d0, d0, #16
-    vsli.u64    d0, d0, #32
-    vorr        d1, d0, d0
-    vorr        q1, q0, q0
-.endm
-/* Cleanup: empty, init does not save any registers. */
-.macro pixman_composite_src_n_8_cleanup
-.endm
-/* Instantiation: no source or mask image, 8bpp destination, no prefetch. */
-generate_composite_function \
-    pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
-    FLAG_DST_WRITEONLY, \
-    32, /* number of pixels, processed in a single block */ \
-    0,  /* prefetch distance */ \
-    pixman_composite_src_n_8_init, \
-    pixman_composite_src_n_8_cleanup, \
-    pixman_composite_src_n_8_process_pixblock_head, \
-    pixman_composite_src_n_8_process_pixblock_tail, \
-    pixman_composite_src_n_8_process_pixblock_tail_head, \
-    0, /* dst_w_basereg */ \
-    0, /* dst_r_basereg */ \
-    0, /* src_basereg   */ \
-    0  /* mask_basereg  */
-
-/******************************************************************************/
-/* src_n_0565 head: empty, the fill pattern is prepared once by init. */
-.macro pixman_composite_src_n_0565_process_pixblock_head
-.endm
-/* src_n_0565 tail: empty, nothing to finish. */
-.macro pixman_composite_src_n_0565_process_pixblock_tail
-.endm
-/* Tail_head: stores d0-d3 (32 bytes = 16 pixels of 16 bits) through DST_W. */
-.macro pixman_composite_src_n_0565_process_pixblock_tail_head
-    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
-.endm
-/*
- * Init: loads one 32-bit word from sp + ARGS_STACK_OFFSET into lane 0 of
- * d0. The two vsli.u64 steps (by 16 and 32 bits) copy the low 16 bits of
- * that word into all four halfwords of d0; d0 is then copied to d1 and q0
- * to q1, so d0-d3 hold the same halfword 16 times.
- */
-.macro pixman_composite_src_n_0565_init
-    add         DUMMY, sp, #ARGS_STACK_OFFSET
-    vld1.32     {d0[0]}, [DUMMY]
-    vsli.u64    d0, d0, #16
-    vsli.u64    d0, d0, #32
-    vorr        d1, d0, d0
-    vorr        q1, q0, q0
-.endm
-/* Cleanup: empty, init does not save any registers. */
-.macro pixman_composite_src_n_0565_cleanup
-.endm
-/* Instantiation: no source or mask image, 16bpp destination, no prefetch. */
-generate_composite_function \
-    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
-    FLAG_DST_WRITEONLY, \
-    16, /* number of pixels, processed in a single block */ \
-    0,  /* prefetch distance */ \
-    pixman_composite_src_n_0565_init, \
-    pixman_composite_src_n_0565_cleanup, \
-    pixman_composite_src_n_0565_process_pixblock_head, \
-    pixman_composite_src_n_0565_process_pixblock_tail, \
-    pixman_composite_src_n_0565_process_pixblock_tail_head, \
-    0, /* dst_w_basereg */ \
-    0, /* dst_r_basereg */ \
-    0, /* src_basereg   */ \
-    0  /* mask_basereg  */
-
-/******************************************************************************/
-/* src_n_8888 head: empty, the fill pattern is prepared once by init. */
-.macro pixman_composite_src_n_8888_process_pixblock_head
-.endm
-/* src_n_8888 tail: empty, nothing to finish. */
-.macro pixman_composite_src_n_8888_process_pixblock_tail
-.endm
-/* Tail_head: stores d0-d3 (32 bytes = 8 pixels of 32 bits) through DST_W. */
-.macro pixman_composite_src_n_8888_process_pixblock_tail_head
-    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
-.endm
-/*
- * Init: loads one 32-bit word from sp + ARGS_STACK_OFFSET into lane 0 of
- * d0, duplicates it into the upper half of d0 (vsli.u64 by 32), then
- * copies d0 to d1 and q0 to q1, so d0-d3 hold the same word 8 times.
- */
-.macro pixman_composite_src_n_8888_init
-    add         DUMMY, sp, #ARGS_STACK_OFFSET
-    vld1.32     {d0[0]}, [DUMMY]
-    vsli.u64    d0, d0, #32
-    vorr        d1, d0, d0
-    vorr        q1, q0, q0
-.endm
-/* Cleanup: empty, init does not save any registers. */
-.macro pixman_composite_src_n_8888_cleanup
-.endm
-/* Instantiation: no source or mask image, 32bpp destination, no prefetch. */
-generate_composite_function \
-    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
-    FLAG_DST_WRITEONLY, \
-    8, /* number of pixels, processed in a single block */ \
-    0, /* prefetch distance */ \
-    pixman_composite_src_n_8888_init, \
-    pixman_composite_src_n_8888_cleanup, \
-    pixman_composite_src_n_8888_process_pixblock_head, \
-    pixman_composite_src_n_8888_process_pixblock_tail, \
-    pixman_composite_src_n_8888_process_pixblock_tail_head, \
-    0, /* dst_w_basereg */ \
-    0, /* dst_r_basereg */ \
-    0, /* src_basereg   */ \
-    0  /* mask_basereg  */
-
-/******************************************************************************/
-/* src_8888_8888 head: empty, the fetched pixels are stored unmodified. */
-.macro pixman_composite_src_8888_8888_process_pixblock_head
-.endm
-/* src_8888_8888 tail: empty, nothing to finish. */
-.macro pixman_composite_src_8888_8888_process_pixblock_tail
-.endm
-/*
- * Tail_head: stores the current block d0-d3 (32 bytes = 8 pixels of 32
- * bits) through DST_W, then fetches the next source block and issues the
- * cache preload (both helper macros are defined outside this excerpt).
- */
-.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
-    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
-    fetch_src_pixblock
-    cache_preload 8, 8
-.endm
-/* Instantiation: 32bpp source to 32bpp destination, all base registers 0. */
-generate_composite_function \
-    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
-    FLAG_DST_WRITEONLY, \
-    8, /* number of pixels, processed in a single block */ \
-    10, /* prefetch distance */ \
-    default_init, \
-    default_cleanup, \
-    pixman_composite_src_8888_8888_process_pixblock_head, \
-    pixman_composite_src_8888_8888_process_pixblock_tail, \
-    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
-    0, /* dst_w_basereg */ \
-    0, /* dst_r_basereg */ \
-    0, /* src_basereg   */ \
-    0  /* mask_basereg  */
-
-/******************************************************************************/
-/*
- * src_x888_8888 head: ORs the constant in q2 (0xFF000000 in every 32-bit
- * lane, see init) into the eight fetched pixels in q0/q1, forcing the top
- * byte of every pixel to 0xFF.
- */
-.macro pixman_composite_src_x888_8888_process_pixblock_head
-    vorr     q0, q0, q2
-    vorr     q1, q1, q2
-.endm
-/* src_x888_8888 tail: empty, the head already produced the final pixels. */
-.macro pixman_composite_src_x888_8888_process_pixblock_tail
-.endm
-/*
- * Tail_head: stores the finished block d0-d3 through DST_W, fetches the
- * next source block, applies the same OR as the head and issues the cache
- * preload (fetch_src_pixblock and cache_preload are defined outside this
- * excerpt).
- */
-.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
-    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
-    fetch_src_pixblock
-    vorr     q0, q0, q2
-    vorr     q1, q1, q2
-    cache_preload 8, 8
-.endm
-/*
- * Init: builds the OR mask. q2 is first set to 0xFF in every byte, then
- * each 32-bit lane is shifted left by 24, leaving 0xFF000000 per lane.
- */
-.macro pixman_composite_src_x888_8888_init
-    vmov.u8  q2, #0xFF
-    vshl.u32 q2, q2, #24
-.endm
-/* Instantiation: 32bpp source to 32bpp destination, all base registers 0. */
-generate_composite_function \
-    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
-    FLAG_DST_WRITEONLY, \
-    8, /* number of pixels, processed in a single block */ \
-    10, /* prefetch distance */ \
-    pixman_composite_src_x888_8888_init, \
-    default_cleanup, \
-    pixman_composite_src_x888_8888_process_pixblock_head, \
-    pixman_composite_src_x888_8888_process_pixblock_tail, \
-    pixman_composite_src_x888_8888_process_pixblock_tail_head, \
-    0, /* dst_w_basereg */ \
-    0, /* dst_r_basereg */ \
-    0, /* src_basereg   */ \
-    0  /* mask_basereg  */
-
-/******************************************************************************/
-/*
- * over_n_8_8888 head (8 pixels per block): computes source IN mask into
- * d0-d3 (each product divided by 255 with rounding via vrshr + vraddhn),
- * then starts the OVER blend by multiplying the destination planes with
- * the inverted alpha of that result. Products are left in q8-q11.
- */
-.macro pixman_composite_over_n_8_8888_process_pixblock_head
-    /* expecting deinterleaved source data in {d8, d9, d10, d11} */
-    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
-    /* and destination data in {d4, d5, d6, d7} */
-    /* mask is in d24 (d25, d26, d27 are unused) */
-
-    /* in */
-    vmull.u8    q6, d24, d8
-    vmull.u8    q7, d24, d9
-    vmull.u8    q8, d24, d10
-    vmull.u8    q9, d24, d11
-    vrshr.u16   q10, q6, #8
-    vrshr.u16   q11, q7, #8
-    vrshr.u16   q12, q8, #8
-    vrshr.u16   q13, q9, #8
-    vraddhn.u16 d0, q6, q10
-    vraddhn.u16 d1, q7, q11
-    vraddhn.u16 d2, q8, q12
-    vraddhn.u16 d3, q9, q13
-    vmvn.8      d25, d3  /* get inverted alpha */
-    /* source:      d0 - blue, d1 - green, d2 - red, d3 - alpha */
-    /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
-    /* now do alpha blending */
-    vmull.u8    q8, d25, d4
-    vmull.u8    q9, d25, d5
-    vmull.u8    q10, d25, d6
-    vmull.u8    q11, d25, d7
-.endm
-/*
- * Tail: divides the products q8-q11 by 255 with rounding into d28-d31 and
- * adds the masked source q0/q1 with unsigned saturation, leaving the
- * result pixels in d28-d31.
- */
-.macro pixman_composite_over_n_8_8888_process_pixblock_tail
-    vrshr.u16   q14, q8, #8
-    vrshr.u16   q15, q9, #8
-    vrshr.u16   q6, q10, #8
-    vrshr.u16   q7, q11, #8
-    vraddhn.u16 d28, q14, q8
-    vraddhn.u16 d29, q15, q9
-    vraddhn.u16 d30, q6, q10
-    vraddhn.u16 d31, q7, q11
-    vqadd.u8    q14, q0, q14
-    vqadd.u8    q15, q1, q15
-.endm
-/*
- * Tail_head: one loop iteration with the tail of the previous block
- * (8-space lines, ending with the vst4.8 through DST_W) interleaved with
- * the destination load, mask fetch and head of the next block (4-space
- * lines). The 'PF ...' lines update PF_X / PF_CTL and prefetch both the
- * destination and the mask; PF and fetch_mask_pixblock are defined outside
- * this excerpt.
- */
-.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
-        vrshr.u16   q14, q8, #8
-    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
-        vrshr.u16   q15, q9, #8
-    fetch_mask_pixblock
-        vrshr.u16   q6, q10, #8
-                                    PF add PF_X, PF_X, #8
-        vrshr.u16   q7, q11, #8
-                                    PF tst PF_CTL, #0x0F
-        vraddhn.u16 d28, q14, q8
-                                    PF addne PF_X, PF_X, #8
-        vraddhn.u16 d29, q15, q9
-                                    PF subne PF_CTL, PF_CTL, #1
-        vraddhn.u16 d30, q6, q10
-                                    PF cmp PF_X, ORIG_W
-        vraddhn.u16 d31, q7, q11
-                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
-    vmull.u8    q6, d24, d8
-                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
-    vmull.u8    q7, d24, d9
-                                    PF subge PF_X, PF_X, ORIG_W
-    vmull.u8    q8, d24, d10
-                                    PF subges PF_CTL, PF_CTL, #0x10
-    vmull.u8    q9, d24, d11
-                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
-        vqadd.u8    q14, q0, q14
-                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
-        vqadd.u8    q15, q1, q15
-    vrshr.u16   q10, q6, #8
-    vrshr.u16   q11, q7, #8
-    vrshr.u16   q12, q8, #8
-    vrshr.u16   q13, q9, #8
-    vraddhn.u16 d0, q6, q10
-    vraddhn.u16 d1, q7, q11
-    vraddhn.u16 d2, q8, q12
-    vraddhn.u16 d3, q9, q13
-        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
-    vmvn.8      d25, d3
-    vmull.u8    q8, d25, d4
-    vmull.u8    q9, d25, d5
-    vmull.u8    q10, d25, d6
-    vmull.u8    q11, d25, d7
-.endm
-/*
- * Init: saves d8-d15, loads one 32-bit word from sp + ARGS_STACK_OFFSET
- * (address taken before the vpush; presumably the solid source colour)
- * into lane 0 of d11 and replicates its bytes 0..3 across d8..d11.
- */
-.macro pixman_composite_over_n_8_8888_init
-    add         DUMMY, sp, #ARGS_STACK_OFFSET
-    vpush       {d8-d15}
-    vld1.32     {d11[0]}, [DUMMY]
-    vdup.8      d8, d11[0]
-    vdup.8      d9, d11[1]
-    vdup.8      d10, d11[2]
-    vdup.8      d11, d11[3]
-.endm
-/* Cleanup: restores d8-d15 pushed by the init macro above. */
-.macro pixman_composite_over_n_8_8888_cleanup
-    vpop        {d8-d15}
-.endm
-/*
- * Instantiation: no source image (0 bpp), 8bpp mask, 32bpp destination.
- * No *_basereg arguments are given, so whatever defaults
- * generate_composite_function provides apply.
- */
-generate_composite_function \
-    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    pixman_composite_over_n_8_8888_init, \
-    pixman_composite_over_n_8_8888_cleanup, \
-    pixman_composite_over_n_8_8888_process_pixblock_head, \
-    pixman_composite_over_n_8_8888_process_pixblock_tail, \
-    pixman_composite_over_n_8_8888_process_pixblock_tail_head
-
-/******************************************************************************/
-/*
- * over_n_8_8 head (32 pixels per block, one byte per pixel).
- *   input:  32 mask bytes in d24-d27, 32 destination bytes in d4-d7 and
- *           the constant d8 prepared by init
- *   steps:  mask times d8, divided by 255 with rounding, into d0-d3; that
- *           result is inverted into q12/q13 (d24-d27) and multiplied with
- *           the destination bytes
- *   output: 16-bit products in q8-q11, scaled source in d0-d3
- */
-.macro pixman_composite_over_n_8_8_process_pixblock_head
-    vmull.u8    q0,  d24, d8
-    vmull.u8    q1,  d25, d8
-    vmull.u8    q6,  d26, d8
-    vmull.u8    q7,  d27, d8
-    vrshr.u16   q10, q0,  #8
-    vrshr.u16   q11, q1,  #8
-    vrshr.u16   q12, q6,  #8
-    vrshr.u16   q13, q7,  #8
-    vraddhn.u16 d0,  q0,  q10
-    vraddhn.u16 d1,  q1,  q11
-    vraddhn.u16 d2,  q6,  q12
-    vraddhn.u16 d3,  q7,  q13
-    vmvn.8      q12, q0
-    vmvn.8      q13, q1
-    vmull.u8    q8,  d24, d4
-    vmull.u8    q9,  d25, d5
-    vmull.u8    q10, d26, d6
-    vmull.u8    q11, d27, d7
-.endm
-/*
- * Tail: divides the products q8-q11 by 255 with rounding into d28-d31 and
- * adds the scaled source q0/q1 with unsigned saturation.
- */
-.macro pixman_composite_over_n_8_8_process_pixblock_tail
-    vrshr.u16   q14, q8,  #8
-    vrshr.u16   q15, q9,  #8
-    vrshr.u16   q12, q10, #8
-    vrshr.u16   q13, q11, #8
-    vraddhn.u16 d28, q14, q8
-    vraddhn.u16 d29, q15, q9
-    vraddhn.u16 d30, q12, q10
-    vraddhn.u16 d31, q13, q11
-    vqadd.u8    q14, q0,  q14
-    vqadd.u8    q15, q1,  q15
-.endm
-/*
- * Tail_head: not hand-interleaved. It loads the next destination block,
- * runs the complete tail for the previous block, fetches the next mask
- * block, stores the previous result and runs the complete head.
- */
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
-    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
-    pixman_composite_over_n_8_8_process_pixblock_tail
-    fetch_mask_pixblock
-    cache_preload 32, 32
-    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
-    pixman_composite_over_n_8_8_process_pixblock_head
-.endm
-/*
- * Init: saves d8-d15, loads one 32-bit word from sp + ARGS_STACK_OFFSET
- * (address taken before the vpush) into lane 0 of d8 and replicates its
- * byte 3 (presumably the alpha of the solid source) across d8.
- */
-.macro pixman_composite_over_n_8_8_init
-    add         DUMMY, sp, #ARGS_STACK_OFFSET
-    vpush       {d8-d15}
-    vld1.32     {d8[0]}, [DUMMY]
-    vdup.8      d8, d8[3]
-.endm
-/* Cleanup: restores d8-d15 pushed by the init macro above. */
-.macro pixman_composite_over_n_8_8_cleanup
-    vpop        {d8-d15}
-.endm
-/* Instantiation: no source image (0 bpp), 8bpp mask, 8bpp destination. */
-generate_composite_function \
-    pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
-    FLAG_DST_READWRITE, \
-    32, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    pixman_composite_over_n_8_8_init, \
-    pixman_composite_over_n_8_8_cleanup, \
-    pixman_composite_over_n_8_8_process_pixblock_head, \
-    pixman_composite_over_n_8_8_process_pixblock_tail, \
-    pixman_composite_over_n_8_8_process_pixblock_tail_head
-
-/******************************************************************************/
-/*
- * over_n_8888_8888_ca head (8 pixels per block, per-plane mask).
- * First part: every source plane is multiplied by the matching mask plane
- * and every mask plane by d11, each product divided by 255 with rounding
- * (vrshr + vraddhn). Second part: the updated mask planes are inverted
- * (d24-d26 in place, d27 = ~d3) and multiplied with the destination
- * planes, leaving 16-bit products in q8-q11 for the tail.
- */
-.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
-    /*
-     * 'combine_mask_ca' replacement
-     *
-     * input:  solid src (n) in {d8,  d9,  d10, d11}
-     *         dest in          {d4,  d5,  d6,  d7 }
-     *         mask in          {d24, d25, d26, d27}
-     * output: updated src in   {d0,  d1,  d2,  d3 }
-     *         updated mask in  {d24, d25, d26, d3 }
-     */
-    vmull.u8    q0,  d24, d8
-    vmull.u8    q1,  d25, d9
-    vmull.u8    q6,  d26, d10
-    vmull.u8    q7,  d27, d11
-    vmull.u8    q9,  d11, d25
-    vmull.u8    q12, d11, d24
-    vmull.u8    q13, d11, d26
-    vrshr.u16   q8,  q0,  #8
-    vrshr.u16   q10, q1,  #8
-    vrshr.u16   q11, q6,  #8
-    vraddhn.u16 d0,  q0,  q8
-    vraddhn.u16 d1,  q1,  q10
-    vraddhn.u16 d2,  q6,  q11
-    vrshr.u16   q11, q12, #8
-    vrshr.u16   q8,  q9,  #8
-    vrshr.u16   q6,  q13, #8
-    vrshr.u16   q10, q7,  #8
-    vraddhn.u16 d24, q12, q11
-    vraddhn.u16 d25, q9,  q8
-    vraddhn.u16 d26, q13, q6
-    vraddhn.u16 d3,  q7,  q10
-    /*
-     * 'combine_over_ca' replacement
-     *
-     * output: updated dest in {d28, d29, d30, d31}
-     */
-    vmvn.8      q12, q12
-    vmvn.8      d26, d26
-    vmull.u8    q8,  d24, d4
-    vmull.u8    q9,  d25, d5
-    vmvn.8      d27, d3
-    vmull.u8    q10, d26, d6
-    vmull.u8    q11, d27, d7
-.endm
-/*
- * Tail: divides the products q8-q11 by 255 with rounding into d28-d31 and
- * adds the updated source q0/q1 with unsigned saturation.
- */
-.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
-    /* ... continue 'combine_over_ca' replacement */
-    vrshr.u16   q14, q8,  #8
-    vrshr.u16   q15, q9,  #8
-    vrshr.u16   q6,  q10, #8
-    vrshr.u16   q7,  q11, #8
-    vraddhn.u16 d28, q14, q8
-    vraddhn.u16 d29, q15, q9
-    vraddhn.u16 d30, q6,  q10
-    vraddhn.u16 d31, q7,  q11
-    vqadd.u8    q14, q0,  q14
-    vqadd.u8    q15, q1,  q15
-.endm
-/*
- * Tail_head: the tail of the previous block (8-space lines) is interleaved
- * with the destination load and mask fetch for the next block; the whole
- * head macro is then expanded, and the previous result d28-d31 is stored
- * through DST_W afterwards (the head does not write d28-d31).
- */
-.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
-        vrshr.u16   q14, q8, #8
-        vrshr.u16   q15, q9, #8
-    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
-        vrshr.u16   q6, q10, #8
-        vrshr.u16   q7, q11, #8
-        vraddhn.u16 d28, q14, q8
-        vraddhn.u16 d29, q15, q9
-        vraddhn.u16 d30, q6, q10
-        vraddhn.u16 d31, q7, q11
-    fetch_mask_pixblock
-        vqadd.u8    q14, q0, q14
-        vqadd.u8    q15, q1, q15
-    cache_preload 8, 8
-    pixman_composite_over_n_8888_8888_ca_process_pixblock_head
-    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
-.endm
-/*
- * Init: saves d8-d15, loads one 32-bit word from sp + ARGS_STACK_OFFSET
- * (address taken before the vpush; the solid source according to the head
- * macro's comment) into lane 0 of d11 and replicates its bytes 0..3 across
- * d8..d11.
- */
-.macro pixman_composite_over_n_8888_8888_ca_init
-    add         DUMMY, sp, #ARGS_STACK_OFFSET
-    vpush       {d8-d15}
-    vld1.32     {d11[0]}, [DUMMY]
-    vdup.8      d8, d11[0]
-    vdup.8      d9, d11[1]
-    vdup.8      d10, d11[2]
-    vdup.8      d11, d11[3]
-.endm
-/* Cleanup: restores d8-d15 pushed by the init macro above. */
-.macro pixman_composite_over_n_8888_8888_ca_cleanup
-    vpop        {d8-d15}
-.endm
-/* Instantiation: no source image (0 bpp), 32bpp mask, 32bpp destination. */
-generate_composite_function \
-    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    pixman_composite_over_n_8888_8888_ca_init, \
-    pixman_composite_over_n_8888_8888_ca_cleanup, \
-    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
-    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
-    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
-
-/******************************************************************************/
-/*
- * over_n_8888_0565_ca head (8 pixels per block, per-plane mask, r5g6b5
- * destination). Scales the solid source planes by the mask planes and the
- * mask planes by d11 (rounded division by 255), unpacks the destination
- * pixels from q2 into 8-bit planes d16-d18, inverts the updated mask and
- * multiplies it with those planes. Products are left in q6, q7 and q11.
- */
-.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
-    /*
-     * 'combine_mask_ca' replacement
-     *
-     * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
-     *         mask in          {d24, d25, d26}       [B, G, R]
-     * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
-     *         updated mask in  {d24, d25, d26}       [B, G, R]
-     */
-    vmull.u8    q0,  d24, d8
-    vmull.u8    q1,  d25, d9
-    vmull.u8    q6,  d26, d10
-    vmull.u8    q9,  d11, d25
-    vmull.u8    q12, d11, d24
-    vmull.u8    q13, d11, d26
-    vrshr.u16   q8,  q0,  #8
-    vrshr.u16   q10, q1,  #8
-    vrshr.u16   q11, q6,  #8
-    vraddhn.u16 d0,  q0,  q8
-    vraddhn.u16 d1,  q1,  q10
-    vraddhn.u16 d2,  q6,  q11
-    vrshr.u16   q11, q12, #8
-    vrshr.u16   q8,  q9,  #8
-    vrshr.u16   q6,  q13, #8
-    vraddhn.u16 d24, q12, q11
-    vraddhn.u16 d25, q9,  q8
-    /*
-     * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
-     * and put data into d16 - blue, d17 - green, d18 - red
-     */
-       vshrn.u16   d17, q2,  #3
-       vshrn.u16   d18, q2,  #8
-    vraddhn.u16 d26, q13, q6
-       vsli.u16    q2,  q2,  #5
-       vsri.u8     d18, d18, #5
-       vsri.u8     d17, d17, #6
-    /*
-     * 'combine_over_ca' replacement
-     *
-     * output: updated dest in d16 - blue, d17 - green, d18 - red
-     */
-    vmvn.8      q12, q12
-       vshrn.u16   d16, q2,  #2
-    vmvn.8      d26, d26
-    vmull.u8    q6,  d16, d24
-    vmull.u8    q7,  d17, d25
-    vmull.u8    q11, d18, d26
-.endm
-/*
- * Tail: divides the products q6, q7, q11 by 255 with rounding into
- * d16-d18, adds the updated source (q0 and d2) with unsigned saturation
- * and packs the three planes into eight r5g6b5 pixels in q14 (d28, d29).
- */
-.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
-    /* ... continue 'combine_over_ca' replacement */
-    vrshr.u16   q10, q6,  #8
-    vrshr.u16   q14, q7,  #8
-    vrshr.u16   q15, q11, #8
-    vraddhn.u16 d16, q10, q6
-    vraddhn.u16 d17, q14, q7
-    vraddhn.u16 d18, q15, q11
-    vqadd.u8    q8,  q0,  q8
-    vqadd.u8    d18, d2,  d18
-    /*
-     * convert the results in d16, d17, d18 to r5g6b5 and store
-     * them into {d28, d29}
-     */
-    vshll.u8    q14, d18, #8
-    vshll.u8    q10, d17, #8
-    vshll.u8    q15, d16, #8
-    vsri.u16    q14, q10, #5
-    vsri.u16    q14, q15, #11
-.endm
-/*
- * Tail_head: hand-interleaved tail of the previous block (8-space lines)
- * and head of the next block (12- and 16-space lines). Unlike the
- * stand-alone tail, the third result plane of the previous block is kept
- * in d22 instead of d18. The packed pixels d28, d29 are stored through
- * DST_W at the very end. fetch_mask_pixblock and cache_preload are defined
- * outside this excerpt.
- */
-.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
-    fetch_mask_pixblock
-        vrshr.u16   q10, q6, #8
-        vrshr.u16   q14, q7, #8
-    vld1.16     {d4, d5}, [DST_R, :128]!
-        vrshr.u16   q15, q11, #8
-        vraddhn.u16 d16, q10, q6
-        vraddhn.u16 d17, q14, q7
-        vraddhn.u16 d22, q15, q11
-            /* process_pixblock_head */
-            /*
-             * 'combine_mask_ca' replacement
-             *
-             * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
-             *         mask in          {d24, d25, d26}       [B, G, R]
-             * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
-             *         updated mask in  {d24, d25, d26}       [B, G, R]
-             */
-            vmull.u8    q6,  d26, d10
-        vqadd.u8    q8,  q0, q8
-            vmull.u8    q0,  d24, d8
-        vqadd.u8    d22, d2, d22
-            vmull.u8    q1,  d25, d9
-        /*
-         * convert the result in d16, d17, d22 to r5g6b5 and store
-         * it into {d28, d29}
-         */
-        vshll.u8    q14, d22, #8
-        vshll.u8    q10, d17, #8
-        vshll.u8    q15, d16, #8
-            vmull.u8    q9,  d11, d25
-        vsri.u16    q14, q10, #5
-            vmull.u8    q12, d11, d24
-            vmull.u8    q13, d11, d26
-        vsri.u16    q14, q15, #11
-    cache_preload 8, 8
-            vrshr.u16   q8,  q0,  #8
-            vrshr.u16   q10, q1,  #8
-            vrshr.u16   q11, q6,  #8
-            vraddhn.u16 d0,  q0,  q8
-            vraddhn.u16 d1,  q1,  q10
-            vraddhn.u16 d2,  q6,  q11
-            vrshr.u16   q11, q12, #8
-            vrshr.u16   q8,  q9,  #8
-            vrshr.u16   q6,  q13, #8
-            vraddhn.u16 d24, q12, q11
-            vraddhn.u16 d25, q9,  q8
-                /*
-                 * convert 8 r5g6b5 pixel data from {d4, d5} to planar
-                 * 8-bit format and put data into d16 - blue, d17 - green,
-                 * d18 - red
-                 */
-                vshrn.u16   d17, q2,  #3
-                vshrn.u16   d18, q2,  #8
-            vraddhn.u16 d26, q13, q6
-                vsli.u16    q2,  q2,  #5
-                vsri.u8     d17, d17, #6
-                vsri.u8     d18, d18, #5
-            /*
-             * 'combine_over_ca' replacement
-             *
-             * output: updated dest in d16 - blue, d17 - green, d18 - red
-             */
-            vmvn.8      q12, q12
-                vshrn.u16   d16, q2,  #2
-            vmvn.8      d26, d26
-            vmull.u8    q7,  d17, d25
-            vmull.u8    q6,  d16, d24
-            vmull.u8    q11, d18, d26
-    vst1.16     {d28, d29}, [DST_W, :128]!
-.endm
-/*
- * Init: saves d8-d15, loads one 32-bit word from sp + ARGS_STACK_OFFSET
- * (address taken before the vpush; the solid source according to the head
- * macro's comment) into lane 0 of d11 and replicates its bytes 0..3 across
- * d8..d11.
- */
-.macro pixman_composite_over_n_8888_0565_ca_init
-    add         DUMMY, sp, #ARGS_STACK_OFFSET
-    vpush       {d8-d15}
-    vld1.32     {d11[0]}, [DUMMY]
-    vdup.8      d8, d11[0]
-    vdup.8      d9, d11[1]
-    vdup.8      d10, d11[2]
-    vdup.8      d11, d11[3]
-.endm
-/* Cleanup: restores d8-d15 pushed by the init macro above. */
-.macro pixman_composite_over_n_8888_0565_ca_cleanup
-    vpop        {d8-d15}
-.endm
-/* Instantiation: no source image (0 bpp), 32bpp mask, 16bpp destination. */
-generate_composite_function \
-    pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    pixman_composite_over_n_8888_0565_ca_init, \
-    pixman_composite_over_n_8888_0565_ca_cleanup, \
-    pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
-    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
-    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_in_n_8_process_pixblock_head
-    /* expecting source data in {d0, d1, d2, d3} */
-    /* and destination data in {d4, d5, d6, d7} */
-    vmull.u8    q8,  d4,  d3
-    vmull.u8    q9,  d5,  d3
-    vmull.u8    q10, d6,  d3
-    vmull.u8    q11, d7,  d3
-.endm
-
-.macro pixman_composite_in_n_8_process_pixblock_tail
-    vrshr.u16   q14, q8,  #8
-    vrshr.u16   q15, q9,  #8
-    vrshr.u16   q12, q10, #8
-    vrshr.u16   q13, q11, #8
-    vraddhn.u16 d28, q8,  q14
-    vraddhn.u16 d29, q9,  q15
-    vraddhn.u16 d30, q10, q12
-    vraddhn.u16 d31, q11, q13
-.endm
-
-.macro pixman_composite_in_n_8_process_pixblock_tail_head
-    pixman_composite_in_n_8_process_pixblock_tail
-    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
-    cache_preload 32, 32
-    pixman_composite_in_n_8_process_pixblock_head
-    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
-.endm
-
-.macro pixman_composite_in_n_8_init
-    add         DUMMY, sp, #ARGS_STACK_OFFSET
-    vld1.32     {d3[0]}, [DUMMY]
-    vdup.8      d3, d3[3]
-.endm
-
-.macro pixman_composite_in_n_8_cleanup
-.endm
-
-generate_composite_function \
-    pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
-    FLAG_DST_READWRITE, \
-    32, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    pixman_composite_in_n_8_init, \
-    pixman_composite_in_n_8_cleanup, \
-    pixman_composite_in_n_8_process_pixblock_head, \
-    pixman_composite_in_n_8_process_pixblock_tail, \
-    pixman_composite_in_n_8_process_pixblock_tail_head, \
-    28, /* dst_w_basereg */ \
-    4,  /* dst_r_basereg */ \
-    0,  /* src_basereg   */ \
-    24  /* mask_basereg  */
-
-.macro pixman_composite_add_n_8_8_process_pixblock_head
-    /* expecting source data in {d8, d9, d10, d11} */
-    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
-    /* and destination data in {d4, d5, d6, d7} */
-    /* mask is in d24, d25, d26, d27 */
-    vmull.u8    q0, d24, d11
-    vmull.u8    q1, d25, d11
-    vmull.u8    q6, d26, d11
-    vmull.u8    q7, d27, d11
-    vrshr.u16   q10, q0, #8
-    vrshr.u16   q11, q1, #8
-    vrshr.u16   q12, q6, #8
-    vrshr.u16   q13, q7, #8
-    vraddhn.u16 d0, q0, q10
-    vraddhn.u16 d1, q1, q11
-    vraddhn.u16 d2, q6, q12
-    vraddhn.u16 d3, q7, q13
-    vqadd.u8    q14, q0, q2
-    vqadd.u8    q15, q1, q3
-.endm
-
-.macro pixman_composite_add_n_8_8_process_pixblock_tail
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
-    pixman_composite_add_n_8_8_process_pixblock_tail
-    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
-    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
-    fetch_mask_pixblock
-    cache_preload 32, 32
-    pixman_composite_add_n_8_8_process_pixblock_head
-.endm
-
-.macro pixman_composite_add_n_8_8_init
-    add         DUMMY, sp, #ARGS_STACK_OFFSET
-    vpush       {d8-d15}
-    vld1.32     {d11[0]}, [DUMMY]
-    vdup.8      d11, d11[3]
-.endm
-
-.macro pixman_composite_add_n_8_8_cleanup
-    vpop        {d8-d15}
-.endm
-
-generate_composite_function \
-    pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
-    FLAG_DST_READWRITE, \
-    32, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    pixman_composite_add_n_8_8_init, \
-    pixman_composite_add_n_8_8_cleanup, \
-    pixman_composite_add_n_8_8_process_pixblock_head, \
-    pixman_composite_add_n_8_8_process_pixblock_tail, \
-    pixman_composite_add_n_8_8_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_add_8_8_8_process_pixblock_head
-    /* expecting source data in {d0, d1, d2, d3} */
-    /* destination data in {d4, d5, d6, d7} */
-    /* mask in {d24, d25, d26, d27} */
-    vmull.u8    q8, d24, d0
-    vmull.u8    q9, d25, d1
-    vmull.u8    q10, d26, d2
-    vmull.u8    q11, d27, d3
-    vrshr.u16   q0, q8, #8
-    vrshr.u16   q1, q9, #8
-    vrshr.u16   q12, q10, #8
-    vrshr.u16   q13, q11, #8
-    vraddhn.u16 d0, q0, q8
-    vraddhn.u16 d1, q1, q9
-    vraddhn.u16 d2, q12, q10
-    vraddhn.u16 d3, q13, q11
-    vqadd.u8    q14, q0, q2
-    vqadd.u8    q15, q1, q3
-.endm
-
-.macro pixman_composite_add_8_8_8_process_pixblock_tail
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
-    pixman_composite_add_8_8_8_process_pixblock_tail
-    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
-    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
-    fetch_mask_pixblock
-    fetch_src_pixblock
-    cache_preload 32, 32
-    pixman_composite_add_8_8_8_process_pixblock_head
-.endm
-
-.macro pixman_composite_add_8_8_8_init
-.endm
-
-.macro pixman_composite_add_8_8_8_cleanup
-.endm
-
-generate_composite_function \
-    pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
-    FLAG_DST_READWRITE, \
-    32, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    pixman_composite_add_8_8_8_init, \
-    pixman_composite_add_8_8_8_cleanup, \
-    pixman_composite_add_8_8_8_process_pixblock_head, \
-    pixman_composite_add_8_8_8_process_pixblock_tail, \
-    pixman_composite_add_8_8_8_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
-    /* expecting source data in {d0, d1, d2, d3} */
-    /* destination data in {d4, d5, d6, d7} */
-    /* mask in {d24, d25, d26, d27} */
-    vmull.u8    q8,  d27, d0
-    vmull.u8    q9,  d27, d1
-    vmull.u8    q10, d27, d2
-    vmull.u8    q11, d27, d3
-    /* 1 cycle bubble */
-    vrsra.u16   q8,  q8,  #8
-    vrsra.u16   q9,  q9,  #8
-    vrsra.u16   q10, q10, #8
-    vrsra.u16   q11, q11, #8
-.endm
-
-.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
-    /* 2 cycle bubble */
-    vrshrn.u16  d28, q8,  #8
-    vrshrn.u16  d29, q9,  #8
-    vrshrn.u16  d30, q10, #8
-    vrshrn.u16  d31, q11, #8
-    vqadd.u8    q14, q2,  q14
-    /* 1 cycle bubble */
-    vqadd.u8    q15, q3,  q15
-.endm
-
-.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
-    fetch_src_pixblock
-        vrshrn.u16  d28, q8,  #8
-    fetch_mask_pixblock
-        vrshrn.u16  d29, q9,  #8
-    vmull.u8    q8,  d27, d0
-        vrshrn.u16  d30, q10, #8
-    vmull.u8    q9,  d27, d1
-        vrshrn.u16  d31, q11, #8
-    vmull.u8    q10, d27, d2
-        vqadd.u8    q14, q2,  q14
-    vmull.u8    q11, d27, d3
-        vqadd.u8    q15, q3,  q15
-    vrsra.u16   q8,  q8,  #8
-    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
-    vrsra.u16   q9,  q9,  #8
-        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
-    vrsra.u16   q10, q10, #8
-
-    cache_preload 8, 8
-
-    vrsra.u16   q11, q11, #8
-.endm
-
-generate_composite_function \
-    pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    10, /* prefetch distance */ \
-    default_init, \
-    default_cleanup, \
-    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
-    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
-    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
-
-generate_composite_function_single_scanline \
-    pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    default_init, \
-    default_cleanup, \
-    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
-    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
-    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
-
-/******************************************************************************/
-
-generate_composite_function \
-    pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    default_init, \
-    default_cleanup, \
-    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
-    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
-    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
-    28, /* dst_w_basereg */ \
-    4,  /* dst_r_basereg */ \
-    0,  /* src_basereg   */ \
-    27  /* mask_basereg  */
-
-/******************************************************************************/
-
-.macro pixman_composite_add_n_8_8888_init
-    add         DUMMY, sp, #ARGS_STACK_OFFSET
-    vld1.32     {d3[0]}, [DUMMY]
-    vdup.8      d0, d3[0]
-    vdup.8      d1, d3[1]
-    vdup.8      d2, d3[2]
-    vdup.8      d3, d3[3]
-.endm
-
-.macro pixman_composite_add_n_8_8888_cleanup
-.endm
-
-generate_composite_function \
-    pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    pixman_composite_add_n_8_8888_init, \
-    pixman_composite_add_n_8_8888_cleanup, \
-    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
-    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
-    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
-    28, /* dst_w_basereg */ \
-    4,  /* dst_r_basereg */ \
-    0,  /* src_basereg   */ \
-    27  /* mask_basereg  */
-
-/******************************************************************************/
-
-.macro pixman_composite_add_8888_n_8888_init
-    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
-    vld1.32     {d27[0]}, [DUMMY]
-    vdup.8      d27, d27[3]
-.endm
-
-.macro pixman_composite_add_8888_n_8888_cleanup
-.endm
-
-generate_composite_function \
-    pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    pixman_composite_add_8888_n_8888_init, \
-    pixman_composite_add_8888_n_8888_cleanup, \
-    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
-    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
-    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
-    28, /* dst_w_basereg */ \
-    4,  /* dst_r_basereg */ \
-    0,  /* src_basereg   */ \
-    27  /* mask_basereg  */
-
-/******************************************************************************/
-
-.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
-    /* expecting source data in {d0, d1, d2, d3} */
-    /* destination data in {d4, d5, d6, d7} */
-    /* solid mask is in d15 */
-
-    /* 'in' */
-    vmull.u8    q8, d15, d3
-    vmull.u8    q6, d15, d2
-    vmull.u8    q5, d15, d1
-    vmull.u8    q4, d15, d0
-    vrshr.u16   q13, q8, #8
-    vrshr.u16   q12, q6, #8
-    vrshr.u16   q11, q5, #8
-    vrshr.u16   q10, q4, #8
-    vraddhn.u16 d3, q8, q13
-    vraddhn.u16 d2, q6, q12
-    vraddhn.u16 d1, q5, q11
-    vraddhn.u16 d0, q4, q10
-    vmvn.8      d24, d3  /* get inverted alpha */
-    /* now do alpha blending */
-    vmull.u8    q8, d24, d4
-    vmull.u8    q9, d24, d5
-    vmull.u8    q10, d24, d6
-    vmull.u8    q11, d24, d7
-.endm
-
-.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
-    vrshr.u16   q14, q8, #8
-    vrshr.u16   q15, q9, #8
-    vrshr.u16   q12, q10, #8
-    vrshr.u16   q13, q11, #8
-    vraddhn.u16 d28, q14, q8
-    vraddhn.u16 d29, q15, q9
-    vraddhn.u16 d30, q12, q10
-    vraddhn.u16 d31, q13, q11
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
-    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
-    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
-    fetch_src_pixblock
-    cache_preload 8, 8
-    fetch_mask_pixblock
-    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
-    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
-.endm
-
-generate_composite_function_single_scanline \
-    pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    default_init_need_all_regs, \
-    default_cleanup_need_all_regs, \
-    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
-    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
-    pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \
-    28, /* dst_w_basereg */ \
-    4,  /* dst_r_basereg */ \
-    0,  /* src_basereg   */ \
-    12  /* mask_basereg  */
-
-/******************************************************************************/
-
-.macro pixman_composite_over_8888_n_8888_process_pixblock_head
-    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
-.endm
-
-.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
-    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
-    vqadd.u8    q14, q0, q14
-    vqadd.u8    q15, q1, q15
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
-    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
-    pixman_composite_over_8888_n_8888_process_pixblock_tail
-    fetch_src_pixblock
-    cache_preload 8, 8
-    pixman_composite_over_8888_n_8888_process_pixblock_head
-    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
-.endm
-
-.macro pixman_composite_over_8888_n_8888_init
-    add         DUMMY, sp, #48
-    vpush       {d8-d15}
-    vld1.32     {d15[0]}, [DUMMY]
-    vdup.8      d15, d15[3]
-.endm
-
-.macro pixman_composite_over_8888_n_8888_cleanup
-    vpop        {d8-d15}
-.endm
-
-generate_composite_function \
-    pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    pixman_composite_over_8888_n_8888_init, \
-    pixman_composite_over_8888_n_8888_cleanup, \
-    pixman_composite_over_8888_n_8888_process_pixblock_head, \
-    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
-    pixman_composite_over_8888_n_8888_process_pixblock_tail_head
-
-/******************************************************************************/
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
-    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
-    pixman_composite_over_8888_n_8888_process_pixblock_tail
-    fetch_src_pixblock
-    cache_preload 8, 8
-    fetch_mask_pixblock
-    pixman_composite_over_8888_n_8888_process_pixblock_head
-    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
-.endm
-
-generate_composite_function \
-    pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    default_init_need_all_regs, \
-    default_cleanup_need_all_regs, \
-    pixman_composite_over_8888_n_8888_process_pixblock_head, \
-    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
-    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
-    28, /* dst_w_basereg */ \
-    4,  /* dst_r_basereg */ \
-    0,  /* src_basereg   */ \
-    12  /* mask_basereg  */
-
-generate_composite_function_single_scanline \
-    pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    default_init_need_all_regs, \
-    default_cleanup_need_all_regs, \
-    pixman_composite_over_8888_n_8888_process_pixblock_head, \
-    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
-    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
-    28, /* dst_w_basereg */ \
-    4,  /* dst_r_basereg */ \
-    0,  /* src_basereg   */ \
-    12  /* mask_basereg  */
-
-/******************************************************************************/
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
-    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
-    pixman_composite_over_8888_n_8888_process_pixblock_tail
-    fetch_src_pixblock
-    cache_preload 8, 8
-    fetch_mask_pixblock
-    pixman_composite_over_8888_n_8888_process_pixblock_head
-    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
-.endm
-
-generate_composite_function \
-    pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    default_init_need_all_regs, \
-    default_cleanup_need_all_regs, \
-    pixman_composite_over_8888_n_8888_process_pixblock_head, \
-    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
-    pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
-    28, /* dst_w_basereg */ \
-    4,  /* dst_r_basereg */ \
-    0,  /* src_basereg   */ \
-    15  /* mask_basereg  */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_0888_0888_process_pixblock_head
-.endm
-
-.macro pixman_composite_src_0888_0888_process_pixblock_tail
-.endm
-
-.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
-    vst3.8 {d0, d1, d2}, [DST_W]!
-    fetch_src_pixblock
-    cache_preload 8, 8
-.endm
-
-generate_composite_function \
-    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
-    FLAG_DST_WRITEONLY, \
-    8, /* number of pixels, processed in a single block */ \
-    10, /* prefetch distance */ \
-    default_init, \
-    default_cleanup, \
-    pixman_composite_src_0888_0888_process_pixblock_head, \
-    pixman_composite_src_0888_0888_process_pixblock_tail, \
-    pixman_composite_src_0888_0888_process_pixblock_tail_head, \
-    0, /* dst_w_basereg */ \
-    0, /* dst_r_basereg */ \
-    0, /* src_basereg   */ \
-    0  /* mask_basereg  */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
-    vswp   d0, d2
-.endm
-
-.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
-.endm
-
-.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
-    vst4.8 {d0, d1, d2, d3}, [DST_W]!
-    fetch_src_pixblock
-    vswp   d0, d2
-    cache_preload 8, 8
-.endm
-
-.macro pixman_composite_src_0888_8888_rev_init
-    veor   d3, d3, d3
-.endm
-
-generate_composite_function \
-    pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
-    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    10, /* prefetch distance */ \
-    pixman_composite_src_0888_8888_rev_init, \
-    default_cleanup, \
-    pixman_composite_src_0888_8888_rev_process_pixblock_head, \
-    pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
-    pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
-    0, /* dst_w_basereg */ \
-    0, /* dst_r_basereg */ \
-    0, /* src_basereg   */ \
-    0  /* mask_basereg  */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
-    vshll.u8    q8, d1, #8
-    vshll.u8    q9, d2, #8
-.endm
-
-.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
-    vshll.u8    q14, d0, #8
-    vsri.u16    q14, q8, #5
-    vsri.u16    q14, q9, #11
-.endm
-
-.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
-        vshll.u8    q14, d0, #8
-    fetch_src_pixblock
-        vsri.u16    q14, q8, #5
-        vsri.u16    q14, q9, #11
-    vshll.u8    q8, d1, #8
-        vst1.16 {d28, d29}, [DST_W, :128]!
-    vshll.u8    q9, d2, #8
-.endm
-
-generate_composite_function \
-    pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
-    FLAG_DST_WRITEONLY, \
-    8, /* number of pixels, processed in a single block */ \
-    10, /* prefetch distance */ \
-    default_init, \
-    default_cleanup, \
-    pixman_composite_src_0888_0565_rev_process_pixblock_head, \
-    pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
-    pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
-    28, /* dst_w_basereg */ \
-    0, /* dst_r_basereg */ \
-    0, /* src_basereg   */ \
-    0  /* mask_basereg  */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
-    vmull.u8    q8, d3, d0
-    vmull.u8    q9, d3, d1
-    vmull.u8    q10, d3, d2
-.endm
-
-.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
-    vrshr.u16   q11, q8, #8
-    vswp        d3, d31
-    vrshr.u16   q12, q9, #8
-    vrshr.u16   q13, q10, #8
-    vraddhn.u16 d30, q11, q8
-    vraddhn.u16 d29, q12, q9
-    vraddhn.u16 d28, q13, q10
-.endm
-
-.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
-        vrshr.u16   q11, q8, #8
-        vswp        d3, d31
-        vrshr.u16   q12, q9, #8
-        vrshr.u16   q13, q10, #8
-    fetch_src_pixblock
-        vraddhn.u16 d30, q11, q8
-                                    PF add PF_X, PF_X, #8
-                                    PF tst PF_CTL, #0xF
-                                    PF addne PF_X, PF_X, #8
-                                    PF subne PF_CTL, PF_CTL, #1
-        vraddhn.u16 d29, q12, q9
-        vraddhn.u16 d28, q13, q10
-    vmull.u8    q8, d3, d0
-    vmull.u8    q9, d3, d1
-    vmull.u8    q10, d3, d2
-        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
-                                    PF cmp PF_X, ORIG_W
-                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
-                                    PF subge PF_X, PF_X, ORIG_W
-                                    PF subges PF_CTL, PF_CTL, #0x10
-                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
-.endm
-
-generate_composite_function \
-    pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
-    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    10, /* prefetch distance */ \
-    default_init, \
-    default_cleanup, \
-    pixman_composite_src_pixbuf_8888_process_pixblock_head, \
-    pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
-    pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
-    28, /* dst_w_basereg */ \
-    0, /* dst_r_basereg */ \
-    0, /* src_basereg   */ \
-    0  /* mask_basereg  */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
-    vmull.u8    q8, d3, d0
-    vmull.u8    q9, d3, d1
-    vmull.u8    q10, d3, d2
-.endm
-
-.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
-    vrshr.u16   q11, q8, #8
-    vswp        d3, d31
-    vrshr.u16   q12, q9, #8
-    vrshr.u16   q13, q10, #8
-    vraddhn.u16 d28, q11, q8
-    vraddhn.u16 d29, q12, q9
-    vraddhn.u16 d30, q13, q10
-.endm
-
-.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
-        vrshr.u16   q11, q8, #8
-        vswp        d3, d31
-        vrshr.u16   q12, q9, #8
-        vrshr.u16   q13, q10, #8
-    fetch_src_pixblock
-        vraddhn.u16 d28, q11, q8
-                                    PF add PF_X, PF_X, #8
-                                    PF tst PF_CTL, #0xF
-                                    PF addne PF_X, PF_X, #8
-                                    PF subne PF_CTL, PF_CTL, #1
-        vraddhn.u16 d29, q12, q9
-        vraddhn.u16 d30, q13, q10
-    vmull.u8    q8, d3, d0
-    vmull.u8    q9, d3, d1
-    vmull.u8    q10, d3, d2
-        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
-                                    PF cmp PF_X, ORIG_W
-                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
-                                    PF subge PF_X, PF_X, ORIG_W
-                                    PF subges PF_CTL, PF_CTL, #0x10
-                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
-.endm
-
-generate_composite_function \
-    pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
-    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    10, /* prefetch distance */ \
-    default_init, \
-    default_cleanup, \
-    pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
-    pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
-    pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
-    28, /* dst_w_basereg */ \
-    0, /* dst_r_basereg */ \
-    0, /* src_basereg   */ \
-    0  /* mask_basereg  */
-
-/******************************************************************************/
-
-.macro pixman_composite_over_0565_8_0565_process_pixblock_head
-    /* mask is in d15 */
-    convert_0565_to_x888 q4, d2, d1, d0
-    convert_0565_to_x888 q5, d6, d5, d4
-    /* source pixel data is in      {d0, d1, d2, XX} */
-    /* destination pixel data is in {d4, d5, d6, XX} */
-    vmvn.8      d7,  d15
-    vmull.u8    q6,  d15, d2
-    vmull.u8    q5,  d15, d1
-    vmull.u8    q4,  d15, d0
-    vmull.u8    q8,  d7,  d4
-    vmull.u8    q9,  d7,  d5
-    vmull.u8    q13, d7,  d6
-    vrshr.u16   q12, q6,  #8
-    vrshr.u16   q11, q5,  #8
-    vrshr.u16   q10, q4,  #8
-    vraddhn.u16 d2,  q6,  q12
-    vraddhn.u16 d1,  q5,  q11
-    vraddhn.u16 d0,  q4,  q10
-.endm
-
-.macro pixman_composite_over_0565_8_0565_process_pixblock_tail
-    vrshr.u16   q14, q8,  #8
-    vrshr.u16   q15, q9,  #8
-    vrshr.u16   q12, q13, #8
-    vraddhn.u16 d28, q14, q8
-    vraddhn.u16 d29, q15, q9
-    vraddhn.u16 d30, q12, q13
-    vqadd.u8    q0,  q0,  q14
-    vqadd.u8    q1,  q1,  q15
-    /* 32bpp result is in {d0, d1, d2, XX} */
-    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
-    fetch_mask_pixblock
-    pixman_composite_over_0565_8_0565_process_pixblock_tail
-    fetch_src_pixblock
-    vld1.16    {d10, d11}, [DST_R, :128]!
-    cache_preload 8, 8
-    pixman_composite_over_0565_8_0565_process_pixblock_head
-    vst1.16    {d28, d29}, [DST_W, :128]!
-.endm
-
-generate_composite_function \
-    pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
-    FLAG_DST_READWRITE, \
-    8, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    default_init_need_all_regs, \
-    default_cleanup_need_all_regs, \
-    pixman_composite_over_0565_8_0565_process_pixblock_head, \
-    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
-    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
-    28, /* dst_w_basereg */ \
-    10,  /* dst_r_basereg */ \
-    8,  /* src_basereg   */ \
-    15  /* mask_basereg  */
-
-/******************************************************************************/
-
-.macro pixman_composite_over_0565_n_0565_init
-    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
-    vpush       {d8-d15}
-    vld1.32     {d15[0]}, [DUMMY]
-    vdup.8      d15, d15[3]
-.endm
-
-.macro pixman_composite_over_0565_n_0565_cleanup
-    vpop        {d8-d15}
-.endm
-
-generate_composite_function \
-    pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
-    FLAG_DST_READWRITE, \
-    8, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    pixman_composite_over_0565_n_0565_init, \
-    pixman_composite_over_0565_n_0565_cleanup, \
-    pixman_composite_over_0565_8_0565_process_pixblock_head, \
-    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
-    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
-    28, /* dst_w_basereg */ \
-    10, /* dst_r_basereg */ \
-    8,  /* src_basereg   */ \
-    15  /* mask_basereg  */
-
-/******************************************************************************/
-
-.macro pixman_composite_add_0565_8_0565_process_pixblock_head
-    /* mask is in d15 */
-    convert_0565_to_x888 q4, d2, d1, d0
-    convert_0565_to_x888 q5, d6, d5, d4
-    /* source pixel data is in      {d0, d1, d2, XX} */
-    /* destination pixel data is in {d4, d5, d6, XX} */
-    vmull.u8    q6,  d15, d2
-    vmull.u8    q5,  d15, d1
-    vmull.u8    q4,  d15, d0
-    vrshr.u16   q12, q6,  #8
-    vrshr.u16   q11, q5,  #8
-    vrshr.u16   q10, q4,  #8
-    vraddhn.u16 d2,  q6,  q12
-    vraddhn.u16 d1,  q5,  q11
-    vraddhn.u16 d0,  q4,  q10
-.endm
-
-.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
-    vqadd.u8    q0,  q0,  q2
-    vqadd.u8    q1,  q1,  q3
-    /* 32bpp result is in {d0, d1, d2, XX} */
-    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
-    fetch_mask_pixblock
-    pixman_composite_add_0565_8_0565_process_pixblock_tail
-    fetch_src_pixblock
-    vld1.16    {d10, d11}, [DST_R, :128]!
-    cache_preload 8, 8
-    pixman_composite_add_0565_8_0565_process_pixblock_head
-    vst1.16    {d28, d29}, [DST_W, :128]!
-.endm
-
-/*
- * Instantiate pixman_composite_add_0565_8_0565_asm_neon from the three
- * stage macros above. generate_composite_function is defined outside this
- * part of the file; the numbers following the function name are presumably
- * the source, mask and destination bits per pixel - TODO confirm.
- * The base registers match the stage macros: source in q4 (d8), mask in
- * d15, destination read into q5 (d10), result written from q14 (d28).
- */
-generate_composite_function \
-    pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
-    FLAG_DST_READWRITE, \
-    8, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    default_init_need_all_regs, \
-    default_cleanup_need_all_regs, \
-    pixman_composite_add_0565_8_0565_process_pixblock_head, \
-    pixman_composite_add_0565_8_0565_process_pixblock_tail, \
-    pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
-    28, /* dst_w_basereg */ \
-    10, /* dst_r_basereg */ \
-    8,  /* src_basereg   */ \
-    15  /* mask_basereg  */
-
-/******************************************************************************/
-
-/*
- * Head stage of "out reverse, 8-bit source, 0565 destination": unpacks
- * the destination and starts multiplying each of its channels by the
- * inverted source alpha. The 16-bit products are left in q8, q9, q10.
- */
-.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
-    /* the 8-bit source is in d15 (passed as src_basereg; no mask is used) */
-    convert_0565_to_x888 q5, d6, d5, d4
-    /* destination pixel data is in {d4, d5, d6, xx} */
-    vmvn.8      d24, d15 /* get inverted alpha */
-    /* now do alpha blending */
-    vmull.u8    q8, d24, d4
-    vmull.u8    q9, d24, d5
-    vmull.u8    q10, d24, d6
-.endm
-
-/*
- * Tail stage: narrows the products from the head stage back to 8 bits
- * with rounding, (x + ((x + 128) >> 8) + 128) >> 8, and repacks the
- * three channels to 0565. The packed pixels land in q14 (d28, d29).
- */
-.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
-    vrshr.u16   q14, q8, #8
-    vrshr.u16   q15, q9, #8
-    vrshr.u16   q12, q10, #8
-    vraddhn.u16 d0, q14, q8
-    vraddhn.u16 d1, q15, q9
-    vraddhn.u16 d2, q12, q10
-    /* 32bpp result is in {d0, d1, d2, XX} */
-    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-/*
- * Steady-state loop body: fetch the next source block, finish the
- * previous block (tail), load the next destination block, start
- * processing it (head), then store the previous result from q14.
- */
-.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
-    fetch_src_pixblock
-    pixman_composite_out_reverse_8_0565_process_pixblock_tail
-    /* 16 bytes = eight 16-bit destination pixels into q5 */
-    vld1.16    {d10, d11}, [DST_R, :128]!
-    cache_preload 8, 8
-    pixman_composite_out_reverse_8_0565_process_pixblock_head
-    vst1.16    {d28, d29}, [DST_W, :128]!
-.endm
-
-/*
- * Instantiate pixman_composite_out_reverse_8_0565_asm_neon from the stage
- * macros above. The 8-bit source block is placed in d15 (src_basereg),
- * the destination is read into q5 (d10) and written from q14 (d28).
- * The "8, 0, 16" triple is presumably source/mask/destination bits per
- * pixel, with 0 meaning no mask - TODO confirm against the template.
- */
-generate_composite_function \
-    pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
-    FLAG_DST_READWRITE, \
-    8, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    default_init_need_all_regs, \
-    default_cleanup_need_all_regs, \
-    pixman_composite_out_reverse_8_0565_process_pixblock_head, \
-    pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
-    pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
-    28, /* dst_w_basereg */ \
-    10, /* dst_r_basereg */ \
-    15, /* src_basereg   */ \
-    0   /* mask_basereg  */
-
-/******************************************************************************/
-
-/*
- * Head stage of "out reverse, 8-bit source, 8888 destination": multiplies
- * all four destination channels by the inverted source alpha, leaving
- * the 16-bit products in q8..q11 for the tail stage.
- */
-.macro pixman_composite_out_reverse_8_8888_process_pixblock_head
-    /* src is in d0 */
-    /* destination pixel data is in {d4, d5, d6, d7} */
-    vmvn.8      d1, d0 /* get inverted alpha */
-    /* now do alpha blending */
-    vmull.u8    q8, d1, d4
-    vmull.u8    q9, d1, d5
-    vmull.u8    q10, d1, d6
-    vmull.u8    q11, d1, d7
-.endm
-
-/*
- * Tail stage: narrows the four 16-bit products from the head stage back
- * to 8 bits with rounding, (x + ((x + 128) >> 8) + 128) >> 8.
- */
-.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
-    vrshr.u16   q14, q8, #8
-    vrshr.u16   q15, q9, #8
-    vrshr.u16   q12, q10, #8
-    vrshr.u16   q13, q11, #8
-    /* d28..d31 alias q14/q15; each is written after its last use as input */
-    vraddhn.u16 d28, q14, q8
-    vraddhn.u16 d29, q15, q9
-    vraddhn.u16 d30, q12, q10
-    vraddhn.u16 d31, q13, q11
-    /* 32bpp result is in {d28, d29, d30, d31} */
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-/*
- * Steady-state loop body: fetch the next source block, finish the
- * previous block (tail), load the next destination block split into
- * four channel registers, start processing it (head), then store the
- * previous result.
- */
-.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
-    fetch_src_pixblock
-    pixman_composite_out_reverse_8_8888_process_pixblock_tail
-    /* vld4.8 de-interleaves: one byte of every pixel per d register */
-    vld4.8    {d4, d5, d6, d7}, [DST_R, :128]!
-    cache_preload 8, 8
-    pixman_composite_out_reverse_8_8888_process_pixblock_head
-    /* vst4.8 re-interleaves the four channel registers on store */
-    vst4.8    {d28, d29, d30, d31}, [DST_W, :128]!
-.endm
-
-/*
- * Instantiate pixman_composite_out_reverse_8_8888_asm_neon from the stage
- * macros above: source in d0, destination read into d4..d7 and written
- * from d28..d31. Unlike the 0565 variants this one uses default_init and
- * default_cleanup rather than the *_need_all_regs versions.
- */
-generate_composite_function \
-    pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    default_init, \
-    default_cleanup, \
-    pixman_composite_out_reverse_8_8888_process_pixblock_head, \
-    pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
-    pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
-    28, /* dst_w_basereg */ \
-    4, /* dst_r_basereg */ \
-    0, /* src_basereg   */ \
-    0   /* mask_basereg  */
-
-/******************************************************************************/
-
-/*
- * Nearest-neighbour scaled scanline functions. Each instantiation below
- * reuses the head/tail/tail_head stage macros of the corresponding
- * non-scaled composite function; only the template differs. Note that,
- * unlike generate_composite_function, no prefetch distance is passed.
- * Where the four base register arguments are omitted the template's
- * defaults apply (the template is defined outside this part of the file).
- */
-
-/* 8888 source over 8888 destination */
-generate_composite_function_nearest_scanline \
-    pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    default_init, \
-    default_cleanup, \
-    pixman_composite_over_8888_8888_process_pixblock_head, \
-    pixman_composite_over_8888_8888_process_pixblock_tail, \
-    pixman_composite_over_8888_8888_process_pixblock_tail_head
-
-/* 8888 source over 0565 destination */
-generate_composite_function_nearest_scanline \
-    pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    default_init, \
-    default_cleanup, \
-    pixman_composite_over_8888_0565_process_pixblock_head, \
-    pixman_composite_over_8888_0565_process_pixblock_tail, \
-    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
-    28, /* dst_w_basereg */ \
-    4,  /* dst_r_basereg */ \
-    0,  /* src_basereg   */ \
-    24  /* mask_basereg  */
-
-/* 8888 source copied to 0565 destination (destination is write-only) */
-generate_composite_function_nearest_scanline \
-    pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
-    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    default_init, \
-    default_cleanup, \
-    pixman_composite_src_8888_0565_process_pixblock_head, \
-    pixman_composite_src_8888_0565_process_pixblock_tail, \
-    pixman_composite_src_8888_0565_process_pixblock_tail_head
-
-/* 0565 source copied to 8888 destination (destination is write-only) */
-generate_composite_function_nearest_scanline \
-    pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
-    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    default_init, \
-    default_cleanup, \
-    pixman_composite_src_0565_8888_process_pixblock_head, \
-    pixman_composite_src_0565_8888_process_pixblock_tail, \
-    pixman_composite_src_0565_8888_process_pixblock_tail_head
-
-/* 8888 source with 8-bit mask over 0565 destination */
-generate_composite_function_nearest_scanline \
-    pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    default_init_need_all_regs, \
-    default_cleanup_need_all_regs, \
-    pixman_composite_over_8888_8_0565_process_pixblock_head, \
-    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
-    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
-    28, /* dst_w_basereg */ \
-    4,  /* dst_r_basereg */ \
-    8,  /* src_basereg   */ \
-    24  /* mask_basereg  */
-
-/* 0565 source with 8-bit mask over 0565 destination */
-generate_composite_function_nearest_scanline \
-    pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
-    FLAG_DST_READWRITE, \
-    8, /* number of pixels, processed in a single block */ \
-    default_init_need_all_regs, \
-    default_cleanup_need_all_regs, \
-    pixman_composite_over_0565_8_0565_process_pixblock_head, \
-    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
-    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
-    28, /* dst_w_basereg */ \
-    10,  /* dst_r_basereg */ \
-    8,  /* src_basereg   */ \
-    15  /* mask_basereg  */
-
-/******************************************************************************/
-
-/*
- * Supplementary macro for setting function attributes.
- *
- * Opens a function called fname: starts a .func debug scope (the user is
- * expected to close it with .endfunc), exports the symbol with .global
- * and emits the entry label. On ELF targets the symbol is additionally
- * given hidden visibility and marked as a function.
- */
-.macro pixman_asm_function fname
-    .func fname
-    .global fname
-#ifdef __ELF__
-    .hidden fname
-    .type fname, %function
-#endif
-fname:
-.endm
-
-/*
- * Bilinear scaling support code which tries to provide pixel fetching, color
- * format conversion, and interpolation as separate macros which can be used
- * as the basic building blocks for constructing bilinear scanline functions.
- */
-
-/*
- * Fetch the 2x2 neighbourhood of one 32bpp source sample and advance X.
- *  reg1 - d register receiving 8 bytes (two adjacent pixels) from the
- *         TOP scanline at pixel index (X >> 16)
- *  reg2 - d register receiving the 8 bytes at the same offset one STRIDE
- *         further on (the bottom scanline)
- *  tmp  - unused here; kept so that the signature matches
- *         bilinear_load_0565
- * Clobbers TMP1; X is incremented by UX.
- */
-.macro bilinear_load_8888 reg1, reg2, tmp
-    mov       TMP1, X, asr #16
-    add       X, X, UX
-    add       TMP1, TOP, TMP1, asl #2
-    vld1.32   {reg1}, [TMP1], STRIDE
-    vld1.32   {reg2}, [TMP1]
-.endm
-
-/*
- * Fetch the 2x2 neighbourhood of one 16bpp source sample and advance X.
- * Four bytes (two adjacent 0565 pixels) are read from the TOP scanline
- * into lane 0 of reg2 and four bytes one STRIDE further on into lane 1.
- * The four packed pixels are then expanded by
- * convert_four_0565_to_x888_packed, with tmp handed to it as well.
- * NOTE(review): that helper is defined outside this part of the file;
- * presumably it leaves the top pair in reg1 and the bottom pair in reg2
- * so that the result matches bilinear_load_8888 - confirm.
- * Clobbers TMP1; X is incremented by UX.
- */
-.macro bilinear_load_0565 reg1, reg2, tmp
-    mov       TMP1, X, asr #16
-    add       X, X, UX
-    add       TMP1, TOP, TMP1, asl #1
-    vld1.32   {reg2[0]}, [TMP1], STRIDE
-    vld1.32   {reg2[1]}, [TMP1]
-    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
-.endm
-
-/*
- * Load two consecutive 32bpp samples and blend each one vertically:
- *   acc1 = reg1 * d28 + reg2 * d29   (first sample)
- *   acc2 = reg3 * d28 + reg4 * d29   (second sample)
- * d28 and d29 hold the top and bottom row weights (they are set up from
- * WT and WB by generate_bilinear_scanline_func). The accumulators are
- * q registers with 16-bit lanes. tmp1 and tmp2 are only forwarded to
- * bilinear_load_8888, which ignores them.
- */
-.macro bilinear_load_and_vertical_interpolate_two_8888 \
-                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
-
-    bilinear_load_8888 reg1, reg2, tmp1
-    vmull.u8  acc1, reg1, d28
-    vmlal.u8  acc1, reg2, d29
-    bilinear_load_8888 reg3, reg4, tmp2
-    vmull.u8  acc2, reg3, d28
-    vmlal.u8  acc2, reg4, d29
-.endm
-
-/*
- * Four-sample variant for 32bpp sources: simply two invocations of
- * bilinear_load_and_vertical_interpolate_two_8888, the first with the
- * x* register set and the second with the y* register set.
- */
-.macro bilinear_load_and_vertical_interpolate_four_8888 \
-                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
-                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
-
-    bilinear_load_and_vertical_interpolate_two_8888 \
-                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
-    bilinear_load_and_vertical_interpolate_two_8888 \
-                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
-.endm
-
-/*
- * Load two consecutive 16bpp samples and blend each one vertically,
- * producing the same outputs as the 8888 variant (acc1 and acc2).
- *
- * acc2lo and acc2hi must be the two d halves of acc2: the raw 0565 data
- * is first gathered there (top pairs in lane 0, bottom pairs in lane 1),
- * converted out of acc2, and only afterwards is acc2 overwritten with
- * the second accumulator.
- * Clobbers TMP1 and TMP2; X is advanced by 2 * UX.
- */
-.macro bilinear_load_and_vertical_interpolate_two_0565 \
-                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
-
-    mov       TMP1, X, asr #16
-    add       X, X, UX
-    add       TMP1, TOP, TMP1, asl #1
-    mov       TMP2, X, asr #16
-    add       X, X, UX
-    add       TMP2, TOP, TMP2, asl #1
-    vld1.32   {acc2lo[0]}, [TMP1], STRIDE
-    vld1.32   {acc2hi[0]}, [TMP2], STRIDE
-    vld1.32   {acc2lo[1]}, [TMP1]
-    vld1.32   {acc2hi[1]}, [TMP2]
-    convert_0565_to_x888 acc2, reg3, reg2, reg1
-    /*
-     * NOTE(review): the vzip sequence presumably turns the per-channel
-     * output of the conversion back into packed pixels, top pair in
-     * reg1/reg3 and bottom pair in reg2/reg4 - confirm.
-     */
-    vzip.u8   reg1, reg3
-    vzip.u8   reg2, reg4
-    vzip.u8   reg3, reg4
-    vzip.u8   reg1, reg2
-    /* vertical blend with the row weights in d28 / d29 */
-    vmull.u8  acc1, reg1, d28
-    vmlal.u8  acc1, reg2, d29
-    vmull.u8  acc2, reg3, d28
-    vmlal.u8  acc2, reg4, d29
-.endm
-
-/*
- * Four-sample variant for 16bpp sources. It performs the same steps as
- * two invocations of bilinear_load_and_vertical_interpolate_two_0565
- * (first with the x* registers, then with the y* registers), but the
- * instructions of the two halves are interleaved by hand.
- * As in the two-sample macro, xacc2lo/xacc2hi must be the halves of
- * xacc2 and yacc2lo/yacc2hi the halves of yacc2.
- * Clobbers TMP1 and TMP2; X is advanced by 4 * UX.
- */
-.macro bilinear_load_and_vertical_interpolate_four_0565 \
-                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
-                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
-
-    /* addresses and loads for samples 0 and 1 */
-    mov       TMP1, X, asr #16
-    add       X, X, UX
-    add       TMP1, TOP, TMP1, asl #1
-    mov       TMP2, X, asr #16
-    add       X, X, UX
-    add       TMP2, TOP, TMP2, asl #1
-    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE
-    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
-    vld1.32   {xacc2lo[1]}, [TMP1]
-    vld1.32   {xacc2hi[1]}, [TMP2]
-    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
-    /* addresses for samples 2 and 3 */
-    mov       TMP1, X, asr #16
-    add       X, X, UX
-    add       TMP1, TOP, TMP1, asl #1
-    mov       TMP2, X, asr #16
-    add       X, X, UX
-    add       TMP2, TOP, TMP2, asl #1
-    /* loads for samples 2/3 interleaved with the vzip of samples 0/1 */
-    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE
-    vzip.u8   xreg1, xreg3
-    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
-    vzip.u8   xreg2, xreg4
-    vld1.32   {yacc2lo[1]}, [TMP1]
-    vzip.u8   xreg3, xreg4
-    vld1.32   {yacc2hi[1]}, [TMP2]
-    vzip.u8   xreg1, xreg2
-    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
-    /* vertical blend of samples 0/1 interleaved with the vzip of 2/3 */
-    vmull.u8  xacc1, xreg1, d28
-    vzip.u8   yreg1, yreg3
-    vmlal.u8  xacc1, xreg2, d29
-    vzip.u8   yreg2, yreg4
-    vmull.u8  xacc2, xreg3, d28
-    vzip.u8   yreg3, yreg4
-    vmlal.u8  xacc2, xreg4, d29
-    vzip.u8   yreg1, yreg2
-    /* vertical blend of samples 2/3 */
-    vmull.u8  yacc1, yreg1, d28
-    vmlal.u8  yacc1, yreg2, d29
-    vmull.u8  yacc2, yreg3, d28
-    vmlal.u8  yacc2, yreg4, d29
-.endm
-
-/*
- * Store numpix (4, 2 or 1) finished 32bpp pixels, taken from d0/d1, to
- * OUT and advance OUT past them. Any other numpix value is rejected at
- * assembly time. tmp1 and tmp2 are unused; they exist so that the
- * signature matches bilinear_store_0565.
- * The address alignment qualifiers require OUT to be aligned to the
- * size of the store.
- */
-.macro bilinear_store_8888 numpix, tmp1, tmp2
-.if numpix == 4
-    vst1.32   {d0, d1}, [OUT, :128]!
-.elseif numpix == 2
-    vst1.32   {d0}, [OUT, :64]!
-.elseif numpix == 1
-    vst1.32   {d0[0]}, [OUT, :32]!
-.else
-    .error bilinear_store_8888 numpix is unsupported
-.endif
-.endm
-
-/*
- * Convert the finished 32bpp pixels in d0/d1 to 0565 and store numpix
- * (4, 2 or 1) of them to OUT, advancing OUT. Any other numpix value is
- * rejected at assembly time.
- * The conversion is always carried out on the full register set, no
- * matter how many pixels are stored; d2 and d3 take part in the vuzp
- * shuffle and q1 (d2, d3) receives the packed result, so d0-d3 as well
- * as tmp1 and tmp2 are clobbered.
- * NOTE(review): the vuzp sequence presumably splits the packed pixels
- * into one register per channel as expected by convert_8888_to_0565
- * (defined outside this part of the file) - confirm.
- */
-.macro bilinear_store_0565 numpix, tmp1, tmp2
-    vuzp.u8 d0, d1
-    vuzp.u8 d2, d3
-    vuzp.u8 d1, d3
-    vuzp.u8 d0, d2
-    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
-.if numpix == 4
-    vst1.16   {d2}, [OUT, :64]!
-.elseif numpix == 2
-    vst1.32   {d2[0]}, [OUT, :32]!
-.elseif numpix == 1
-    vst1.16   {d2[0]}, [OUT, :16]!
-.else
-    .error bilinear_store_0565 numpix is unsupported
-.endif
-.endm
-
-/*
- * Produce and store exactly one destination pixel.
- * The load and store macros are selected by pasting src_fmt / dst_fmt
- * into their names. The caller must have placed the horizontal weight
- * for this pixel in d30 (16-bit lanes).
- */
-.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
-    bilinear_load_&src_fmt d0, d1, d2
-    /* vertical pass: q1 = top * d28 + bottom * d29 */
-    vmull.u8  q1, d0, d28
-    vmlal.u8  q1, d1, d29
-    /* 5 cycles bubble */
-    /* horizontal pass: q0 = d2 * 256 - d2 * d30 + d3 * d30 */
-    vshll.u16 q0, d2, #8
-    vmlsl.u16 q0, d2, d30
-    vmlal.u16 q0, d3, d30
-    /* 5 cycles bubble */
-    /* drop the two 8-bit weight scales: 32 -> 16 -> 8 bits per channel */
-    vshrn.u32 d0, q0, #16
-    /* 3 cycles bubble */
-    vmovn.u16 d0, q0
-    /* 1 cycle bubble */
-    bilinear_store_&dst_fmt 1, q2, q3
-.endm
-
-/*
- * Produce and store two destination pixels.
- * On entry d30 and d31 must hold the horizontal weights of the first
- * and second pixel. Before returning, the weights for the following
- * pair are prepared (q15 = q12 >> 8) and the position accumulator q12
- * is advanced by q13.
- */
-.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
-    bilinear_load_and_vertical_interpolate_two_&src_fmt \
-                q1, q11, d0, d1, d20, d21, d22, d23
-    /* horizontal pass, first pixel (q1 = d2:d3, weight d30) */
-    vshll.u16 q0, d2, #8
-    vmlsl.u16 q0, d2, d30
-    vmlal.u16 q0, d3, d30
-    /* horizontal pass, second pixel (q11 = d22:d23, weight d31) */
-    vshll.u16 q10, d22, #8
-    vmlsl.u16 q10, d22, d31
-    vmlal.u16 q10, d23, d31
-    vshrn.u32 d0, q0, #16
-    vshrn.u32 d1, q10, #16
-    /* weights and position for the next pixels */
-    vshr.u16  q15, q12, #8
-    vadd.u16  q12, q12, q13
-    vmovn.u16 d0, q0
-    bilinear_store_&dst_fmt 2, q2, q3
-.endm
-
-/*
- * Generic (non-pipelined) code producing and storing four destination
- * pixels. The weight registers d30/d31 are consumed for pixels 0/1,
- * refreshed from q12 for pixels 2/3 and refreshed once more at the end
- * for whatever code runs next; q12 is advanced by q13 twice.
- */
-.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
-    bilinear_load_and_vertical_interpolate_four_&src_fmt \
-                q1, q11, d0, d1, d20, d21, d22, d23 \
-                q3, q9,  d4, d5, d16, d17, d18, d19
-    /* prefetch ahead of the last address used, then step TMP1 back a row */
-    pld       [TMP1, PF_OFFS]
-    sub       TMP1, TMP1, STRIDE
-    /* horizontal pass, pixel 0 (q1, weight d30) */
-    vshll.u16 q0, d2, #8
-    vmlsl.u16 q0, d2, d30
-    vmlal.u16 q0, d3, d30
-    /* horizontal pass, pixel 1 (q11, weight d31) */
-    vshll.u16 q10, d22, #8
-    vmlsl.u16 q10, d22, d31
-    vmlal.u16 q10, d23, d31
-    /* weights for pixels 2 and 3 */
-    vshr.u16  q15, q12, #8
-    /* horizontal pass, pixel 2 (q3, weight d30) */
-    vshll.u16 q2, d6, #8
-    vmlsl.u16 q2, d6, d30
-    vmlal.u16 q2, d7, d30
-    /* horizontal pass, pixel 3 (q9, weight d31) */
-    vshll.u16 q8, d18, #8
-    /*
-     * NOTE(review): with src_fmt 8888 the load macros above only write
-     * TMP1, so TMP2 does not hold a source address at this point.
-     * pld is only a hint, so this looks harmless - confirm.
-     */
-    pld       [TMP2, PF_OFFS]
-    vmlsl.u16 q8, d18, d31
-    vmlal.u16 q8, d19, d31
-    vadd.u16  q12, q12, q13
-    /* narrow all four pixels: 32 -> 16 -> 8 bits per channel */
-    vshrn.u32 d0, q0, #16
-    vshrn.u32 d1, q10, #16
-    vshrn.u32 d4, q2, #16
-    vshrn.u32 d5, q8, #16
-    vshr.u16  q15, q12, #8
-    vmovn.u16 d0, q0
-    vmovn.u16 d1, q2
-    vadd.u16  q12, q12, q13
-    bilinear_store_&dst_fmt 4, q2, q3
-.endm
-
-/*
- * Pipeline "head" for four pixels. Uses the hand-scheduled
- * bilinear_interpolate_four_pixels_<src_fmt>_<dst_fmt>_head macro when the
- * symbol have_bilinear_interpolate_four_pixels_<src_fmt>_<dst_fmt> is
- * defined; otherwise the generic code does all the work here.
- */
-.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
-    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
-.else
-    bilinear_interpolate_four_pixels src_fmt, dst_fmt
-.endif
-.endm
-
-/*
- * Pipeline "tail" for four pixels. Expands to the specialised tail macro
- * when one is available and to nothing otherwise, because the generic
- * fallback already completes its four pixels in the head stage.
- */
-.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
-    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
-.endif
-.endm
-
-/*
- * Pipeline "tail_head" for four pixels: the specialised macro when it is
- * available, otherwise one more run of the generic four pixel code.
- */
-.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
-    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
-.else
-    bilinear_interpolate_four_pixels src_fmt, dst_fmt
-.endif
-.endm
-
-/*
- * Pipeline "head" for eight pixels. Uses the specialised eight pixel
- * macro when have_bilinear_interpolate_eight_pixels_<src_fmt>_<dst_fmt>
- * is defined; otherwise it is composed from the four pixel stages
- * (head followed by tail_head).
- */
-.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
-    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
-.else
-    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
-    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
-.endif
-.endm
-
-/*
- * Pipeline "tail" for eight pixels: the specialised macro when it is
- * available, otherwise the four pixel tail stage.
- */
-.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
-    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
-.else
-    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
-.endif
-.endm
-
-/*
- * Pipeline "tail_head" for eight pixels: the specialised macro when it
- * is available, otherwise two consecutive four pixel tail_head stages.
- */
-.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
-    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
-.else
-    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
-    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
-.endif
-.endm
-
-/*
- * Values for the "flags" argument of generate_bilinear_scanline_func.
- * They are tested as bits, so they can be combined with "|".
- * BILINEAR_FLAG_UNROLL_4 is zero, i.e. it is what you get when
- * BILINEAR_FLAG_UNROLL_8 is not requested.
- */
-.set BILINEAR_FLAG_UNROLL_4,          0
-/* main loop handles eight pixels per iteration instead of four */
-.set BILINEAR_FLAG_UNROLL_8,          1
-/* save and restore d8-d15 so that the pixel code may use them */
-.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
-
-/*
- * Main template macro for generating NEON optimized bilinear scanline
- * functions.
- *
- * Bilinear scanline scaler macro template uses the following arguments:
- *  fname             - name of the function to generate
- *  src_fmt           - source color format (8888 or 0565)
- *  dst_fmt           - destination color format (8888 or 0565)
- *  src_bpp_shift     - (1 << src_bpp_shift) is the size of source pixel
- *                      in bytes
- *  dst_bpp_shift     - (1 << dst_bpp_shift) is the size of destination
- *                      pixel in bytes
- *  prefetch_distance - prefetch in the source image by that many
- *                      pixels ahead
- *  flags             - combination of BILINEAR_FLAG_* values
- */
-
-.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
-                                       src_bpp_shift, dst_bpp_shift, \
-                                       prefetch_distance, flags
-
-pixman_asm_function fname
-    /*
-     * Register aliases. OUT, TOP, BOTTOM and WT arrive in r0-r3; WB, X,
-     * UX and WIDTH are loaded from the stack below. Some registers have
-     * two names because their first role ends before the second starts:
-     * TMP1/TMP2 reuse the registers of WT/WB once the weights have been
-     * copied to NEON registers, and STRIDE replaces BOTTOM.
-     */
-    OUT       .req      r0
-    TOP       .req      r1
-    BOTTOM    .req      r2
-    WT        .req      r3
-    WB        .req      r4
-    X         .req      r5
-    UX        .req      r6
-    WIDTH     .req      ip
-    TMP1      .req      r3
-    TMP2      .req      r4
-    PF_OFFS   .req      r7
-    TMP3      .req      r8
-    TMP4      .req      r9
-    STRIDE    .req      r2
-
-    /* remember the incoming sp: the stack arguments are read through ip */
-    mov       ip, sp
-    push      {r4, r5, r6, r7, r8, r9}
-    mov       PF_OFFS, #prefetch_distance
-    /* WIDTH is ip itself, so ip stops being the argument pointer here */
-    ldmia     ip, {WB, X, UX, WIDTH}
-    /* prefetch offset in X units; converted to bytes before the main loop */
-    mul       PF_OFFS, PF_OFFS, UX
-
-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
-    vpush     {d8-d15}
-.endif
-
-    /* only the distance between the two scanlines is needed from now on */
-    sub       STRIDE, BOTTOM, TOP
-    .unreq    BOTTOM
-
-    /* nothing to do for WIDTH <= 0 */
-    cmp       WIDTH, #0
-    ble       3f
-
-    /*
-     * q12 = low 16 bits of X in every lane, q13 = low 16 bits of UX,
-     * d28/d29 = vertical weights. Adding d26 to d25 makes the upper half
-     * of q12 run one step (UX) ahead of the lower half.
-     */
-    vdup.u16  q12, X
-    vdup.u16  q13, UX
-    vdup.u8   d28, WT
-    vdup.u8   d29, WB
-    vadd.u16  d25, d25, d26
-
-    /* ensure good destination alignment  */
-    /* one pixel if OUT is not aligned to two destination pixels */
-    cmp       WIDTH, #1
-    blt       0f
-    tst       OUT, #(1 << dst_bpp_shift)
-    beq       0f
-    vshr.u16  q15, q12, #8
-    vadd.u16  q12, q12, q13
-    bilinear_interpolate_last_pixel src_fmt, dst_fmt
-    sub       WIDTH, WIDTH, #1
-0:
-    /* from here on q12 is advanced in steps of 2 * UX */
-    vadd.u16  q13, q13, q13
-    vshr.u16  q15, q12, #8
-    vadd.u16  q12, q12, q13
-
-    /* two pixels if OUT is not aligned to four destination pixels */
-    cmp       WIDTH, #2
-    blt       0f
-    tst       OUT, #(1 << (dst_bpp_shift + 1))
-    beq       0f
-    bilinear_interpolate_two_pixels src_fmt, dst_fmt
-    sub       WIDTH, WIDTH, #2
-0:
-.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
-/*********** 8 pixels per iteration *****************/
-    /* four pixels if OUT is not aligned to eight destination pixels */
-    cmp       WIDTH, #4
-    blt       0f
-    tst       OUT, #(1 << (dst_bpp_shift + 2))
-    beq       0f
-    bilinear_interpolate_four_pixels src_fmt, dst_fmt
-    sub       WIDTH, WIDTH, #4
-0:
-    /* software pipelined main loop: head, N x tail_head, tail */
-    subs      WIDTH, WIDTH, #8
-    blt       1f
-    /* turn the 16.16 fixed point prefetch distance into a byte offset */
-    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
-    bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
-    subs      WIDTH, WIDTH, #8
-    blt       5f
-0:
-    bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
-    subs      WIDTH, WIDTH, #8
-    bge       0b
-5:
-    bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
-1:
-    /* WIDTH is negative here but its low bits still give the remainder */
-    tst       WIDTH, #4
-    beq       2f
-    bilinear_interpolate_four_pixels src_fmt, dst_fmt
-2:
-.else
-/*********** 4 pixels per iteration *****************/
-    /* software pipelined main loop: head, N x tail_head, tail */
-    subs      WIDTH, WIDTH, #4
-    blt       1f
-    /* turn the 16.16 fixed point prefetch distance into a byte offset */
-    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
-    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
-    subs      WIDTH, WIDTH, #4
-    blt       5f
-0:
-    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
-    subs      WIDTH, WIDTH, #4
-    bge       0b
-5:
-    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
-1:
-/****************************************************/
-.endif
-    /* handle the remaining trailing pixels */
-    tst       WIDTH, #2
-    beq       2f
-    bilinear_interpolate_two_pixels src_fmt, dst_fmt
-2:
-    tst       WIDTH, #1
-    beq       3f
-    bilinear_interpolate_last_pixel src_fmt, dst_fmt
-3:
-    /* common exit: undo the register saves in reverse order */
-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
-    vpop      {d8-d15}
-.endif
-    pop       {r4, r5, r6, r7, r8, r9}
-    bx        lr
-
-    /* drop the aliases so that the macro can be instantiated again */
-    .unreq    OUT
-    .unreq    TOP
-    .unreq    WT
-    .unreq    WB
-    .unreq    X
-    .unreq    UX
-    .unreq    WIDTH
-    .unreq    TMP1
-    .unreq    TMP2
-    .unreq    PF_OFFS
-    .unreq    TMP3
-    .unreq    TMP4
-    .unreq    STRIDE
-.endfunc
-
-.endm
-
-/*****************************************************************************/
-
-/*
- * Defining this symbol makes the bilinear_interpolate_four_pixels_head,
- * _tail and _tail_head dispatch macros pick the hand-scheduled 8888 to
- * 8888 implementation below instead of the generic code.
- */
-.set have_bilinear_interpolate_four_pixels_8888_8888, 1
-
-/*
- * Head stage: loads four samples and does the vertical pass for all of
- * them (results in q8, q9, q10, q11). The horizontal pass is finished
- * for pixel 0 (q0) and started for pixel 1 (q1); the tail stage does
- * the rest.
- */
-.macro bilinear_interpolate_four_pixels_8888_8888_head
-    mov       TMP1, X, asr #16
-    add       X, X, UX
-    add       TMP1, TOP, TMP1, asl #2
-    mov       TMP2, X, asr #16
-    add       X, X, UX
-    add       TMP2, TOP, TMP2, asl #2
-
-    /* pixel 0: vertical pass into q8 */
-    vld1.32   {d22}, [TMP1], STRIDE
-    vld1.32   {d23}, [TMP1]
-    mov       TMP3, X, asr #16
-    add       X, X, UX
-    add       TMP3, TOP, TMP3, asl #2
-    vmull.u8  q8, d22, d28
-    vmlal.u8  q8, d23, d29
-
-    /* pixel 1: vertical pass into q9 */
-    vld1.32   {d22}, [TMP2], STRIDE
-    vld1.32   {d23}, [TMP2]
-    mov       TMP4, X, asr #16
-    add       X, X, UX
-    add       TMP4, TOP, TMP4, asl #2
-    vmull.u8  q9, d22, d28
-    vmlal.u8  q9, d23, d29
-
-    /* pixel 2: vertical pass into q10 */
-    vld1.32   {d22}, [TMP3], STRIDE
-    vld1.32   {d23}, [TMP3]
-    vmull.u8  q10, d22, d28
-    vmlal.u8  q10, d23, d29
-
-    /* pixel 0: horizontal pass; frees d16/d17 for the next load */
-    vshll.u16 q0, d16, #8
-    vmlsl.u16 q0, d16, d30
-    vmlal.u16 q0, d17, d30
-
-    /* pixel 3: prefetch both rows, load, vertical pass into q11 */
-    pld       [TMP4, PF_OFFS]
-    vld1.32   {d16}, [TMP4], STRIDE
-    vld1.32   {d17}, [TMP4]
-    pld       [TMP4, PF_OFFS]
-    vmull.u8  q11, d16, d28
-    vmlal.u8  q11, d17, d29
-
-    /* pixel 1: first two steps of the horizontal pass */
-    vshll.u16 q1, d18, #8
-    vmlsl.u16 q1, d18, d31
-.endm
-
-/*
- * Tail stage: finishes the horizontal pass for pixels 1-3 using the
- * intermediate values left by the head stage, narrows all four pixels
- * to 8 bits per channel and stores them. The horizontal weights (q15)
- * and the position accumulator (q12) are advanced twice on the way.
- */
-.macro bilinear_interpolate_four_pixels_8888_8888_tail
-    /* pixel 1: last step of the horizontal pass */
-    vmlal.u16 q1, d19, d31
-    /* weights for pixels 2 and 3 */
-    vshr.u16  q15, q12, #8
-    vshll.u16 q2, d20, #8
-    vmlsl.u16 q2, d20, d30
-    vmlal.u16 q2, d21, d30
-    vshll.u16 q3, d22, #8
-    vmlsl.u16 q3, d22, d31
-    vmlal.u16 q3, d23, d31
-    vadd.u16  q12, q12, q13
-    vshrn.u32 d0, q0, #16
-    vshrn.u32 d1, q1, #16
-    vshrn.u32 d4, q2, #16
-    /* weights for the pixels that follow this block */
-    vshr.u16  q15, q12, #8
-    vshrn.u32 d5, q3, #16
-    vmovn.u16 d6, q0
-    vmovn.u16 d7, q2
-    vadd.u16  q12, q12, q13
-    vst1.32   {d6, d7}, [OUT, :128]!
-.endm
-
-/*
- * Steady-state loop body: the tail stage of the previous four pixels
- * interleaved with the head stage of the next four.
- * Indentation convention: instructions indented by four spaces belong
- * to the head of the new block, instructions indented by eight spaces
- * are the tail of the previous block. The order of the instructions is
- * significant because both halves share registers.
- */
-.macro bilinear_interpolate_four_pixels_8888_8888_tail_head
-    mov       TMP1, X, asr #16
-    add       X, X, UX
-    add       TMP1, TOP, TMP1, asl #2
-    mov       TMP2, X, asr #16
-    add       X, X, UX
-    add       TMP2, TOP, TMP2, asl #2
-        vmlal.u16 q1, d19, d31
-        vshr.u16  q15, q12, #8
-        vshll.u16 q2, d20, #8
-        vmlsl.u16 q2, d20, d30
-        vmlal.u16 q2, d21, d30
-        vshll.u16 q3, d22, #8
-    vld1.32   {d20}, [TMP1], STRIDE
-        vmlsl.u16 q3, d22, d31
-        vmlal.u16 q3, d23, d31
-    vld1.32   {d21}, [TMP1]
-    vmull.u8  q8, d20, d28
-    vmlal.u8  q8, d21, d29
-        vshrn.u32 d0, q0, #16
-        vshrn.u32 d1, q1, #16
-        vshrn.u32 d4, q2, #16
-    vld1.32   {d22}, [TMP2], STRIDE
-        vshrn.u32 d5, q3, #16
-        vadd.u16  q12, q12, q13
-    vld1.32   {d23}, [TMP2]
-    vmull.u8  q9, d22, d28
-    mov       TMP3, X, asr #16
-    add       X, X, UX
-    add       TMP3, TOP, TMP3, asl #2
-    mov       TMP4, X, asr #16
-    add       X, X, UX
-    add       TMP4, TOP, TMP4, asl #2
-    vmlal.u8  q9, d23, d29
-    vld1.32   {d22}, [TMP3], STRIDE
-        vshr.u16  q15, q12, #8
-    vld1.32   {d23}, [TMP3]
-    vmull.u8  q10, d22, d28
-    vmlal.u8  q10, d23, d29
-        vmovn.u16 d6, q0
-    vshll.u16 q0, d16, #8
-        vmovn.u16 d7, q2
-    vmlsl.u16 q0, d16, d30
-    vmlal.u16 q0, d17, d30
-    pld       [TMP4, PF_OFFS]
-    vld1.32   {d16}, [TMP4], STRIDE
-        vadd.u16  q12, q12, q13
-    vld1.32   {d17}, [TMP4]
-    pld       [TMP4, PF_OFFS]
-    vmull.u8  q11, d16, d28
-    vmlal.u8  q11, d17, d29
-        vst1.32   {d6, d7}, [OUT, :128]!
-    vshll.u16 q1, d18, #8
-    vmlsl.u16 q1, d18, d31
-.endm
-
-/*****************************************************************************/
-
-/*
- * Defining this symbol makes the bilinear_interpolate_eight_pixels_head,
- * _tail and _tail_head dispatch macros pick the hand-scheduled 8888 to
- * 0565 implementation below instead of composing four pixel stages.
- */
-.set have_bilinear_interpolate_eight_pixels_8888_0565, 1
-
-/*
- * Head stage for eight pixels. The first half is a four pixel head; the
- * second half finishes those four pixels (eight-space indentation),
- * parking them in d8/d9 instead of storing them, while starting the
- * next four (four-space indentation).
- * Since d8/d9 are written, the function has to be generated with
- * BILINEAR_FLAG_USE_ALL_NEON_REGS.
- */
-.macro bilinear_interpolate_eight_pixels_8888_0565_head
-    mov       TMP1, X, asr #16
-    add       X, X, UX
-    add       TMP1, TOP, TMP1, asl #2
-    mov       TMP2, X, asr #16
-    add       X, X, UX
-    add       TMP2, TOP, TMP2, asl #2
-    vld1.32   {d20}, [TMP1], STRIDE
-    vld1.32   {d21}, [TMP1]
-    vmull.u8  q8, d20, d28
-    vmlal.u8  q8, d21, d29
-    vld1.32   {d22}, [TMP2], STRIDE
-    vld1.32   {d23}, [TMP2]
-    vmull.u8  q9, d22, d28
-    mov       TMP3, X, asr #16
-    add       X, X, UX
-    add       TMP3, TOP, TMP3, asl #2
-    mov       TMP4, X, asr #16
-    add       X, X, UX
-    add       TMP4, TOP, TMP4, asl #2
-    vmlal.u8  q9, d23, d29
-    vld1.32   {d22}, [TMP3], STRIDE
-    vld1.32   {d23}, [TMP3]
-    vmull.u8  q10, d22, d28
-    vmlal.u8  q10, d23, d29
-    vshll.u16 q0, d16, #8
-    vmlsl.u16 q0, d16, d30
-    vmlal.u16 q0, d17, d30
-    pld       [TMP4, PF_OFFS]
-    vld1.32   {d16}, [TMP4], STRIDE
-    vld1.32   {d17}, [TMP4]
-    pld       [TMP4, PF_OFFS]
-    vmull.u8  q11, d16, d28
-    vmlal.u8  q11, d17, d29
-    vshll.u16 q1, d18, #8
-    vmlsl.u16 q1, d18, d31
-
-    /* second half: finish pixels 0-3 into d8/d9, start pixels 4-7 */
-    mov       TMP1, X, asr #16
-    add       X, X, UX
-    add       TMP1, TOP, TMP1, asl #2
-    mov       TMP2, X, asr #16
-    add       X, X, UX
-    add       TMP2, TOP, TMP2, asl #2
-        vmlal.u16 q1, d19, d31
-        vshr.u16  q15, q12, #8
-        vshll.u16 q2, d20, #8
-        vmlsl.u16 q2, d20, d30
-        vmlal.u16 q2, d21, d30
-        vshll.u16 q3, d22, #8
-    vld1.32   {d20}, [TMP1], STRIDE
-        vmlsl.u16 q3, d22, d31
-        vmlal.u16 q3, d23, d31
-    vld1.32   {d21}, [TMP1]
-    vmull.u8  q8, d20, d28
-    vmlal.u8  q8, d21, d29
-        vshrn.u32 d0, q0, #16
-        vshrn.u32 d1, q1, #16
-        vshrn.u32 d4, q2, #16
-    vld1.32   {d22}, [TMP2], STRIDE
-        vshrn.u32 d5, q3, #16
-        vadd.u16  q12, q12, q13
-    vld1.32   {d23}, [TMP2]
-    vmull.u8  q9, d22, d28
-    mov       TMP3, X, asr #16
-    add       X, X, UX
-    add       TMP3, TOP, TMP3, asl #2
-    mov       TMP4, X, asr #16
-    add       X, X, UX
-    add       TMP4, TOP, TMP4, asl #2
-    vmlal.u8  q9, d23, d29
-    vld1.32   {d22}, [TMP3], STRIDE
-        vshr.u16  q15, q12, #8
-    vld1.32   {d23}, [TMP3]
-    vmull.u8  q10, d22, d28
-    vmlal.u8  q10, d23, d29
-        vmovn.u16 d8, q0
-    vshll.u16 q0, d16, #8
-        vmovn.u16 d9, q2
-    vmlsl.u16 q0, d16, d30
-    vmlal.u16 q0, d17, d30
-    pld       [TMP4, PF_OFFS]
-    vld1.32   {d16}, [TMP4], STRIDE
-        vadd.u16  q12, q12, q13
-    vld1.32   {d17}, [TMP4]
-    pld       [TMP4, PF_OFFS]
-    vmull.u8  q11, d16, d28
-    vmlal.u8  q11, d17, d29
-    vshll.u16 q1, d18, #8
-    vmlsl.u16 q1, d18, d31
-.endm
-
-/*
- * Tail stage for eight pixels: finishes pixels 4-7 into d10/d11, then
- * packs all eight pixels (d8-d11) to 0565 and stores the resulting
- * 16 bytes from q5. Clobbers q4-q7.
- */
-.macro bilinear_interpolate_eight_pixels_8888_0565_tail
-    vmlal.u16 q1, d19, d31
-    vshr.u16  q15, q12, #8
-    vshll.u16 q2, d20, #8
-    vmlsl.u16 q2, d20, d30
-    vmlal.u16 q2, d21, d30
-    vshll.u16 q3, d22, #8
-    vmlsl.u16 q3, d22, d31
-    vmlal.u16 q3, d23, d31
-    vadd.u16  q12, q12, q13
-    vshrn.u32 d0, q0, #16
-    vshrn.u32 d1, q1, #16
-    vshrn.u32 d4, q2, #16
-    vshr.u16  q15, q12, #8
-    vshrn.u32 d5, q3, #16
-    vmovn.u16 d10, q0
-    vmovn.u16 d11, q2
-    vadd.u16  q12, q12, q13
-
-    /*
-     * NOTE(review): the vuzp sequence presumably separates the eight
-     * packed pixels into one d register per channel - confirm.
-     */
-    vuzp.u8   d8, d9
-    vuzp.u8   d10, d11
-    vuzp.u8   d9, d11
-    vuzp.u8   d8, d10
-    /*
-     * Pack to 16 bits: each channel is moved to the top of a 16-bit lane,
-     * then vsri inserts d9 below the top 5 bits and d8 below the top 11
-     * bits of the lane that started out holding d10.
-     */
-    vshll.u8  q6, d9, #8
-    vshll.u8  q5, d10, #8
-    vshll.u8  q7, d8, #8
-    vsri.u16  q5, q6, #5
-    vsri.u16  q5, q7, #11
-    vst1.32   {d10, d11}, [OUT, :128]!
-.endm
-
-/*
- * Steady-state loop body for eight pixels: the tail stage of the
- * previous eight pixels interleaved with the head stage of the next.
- * Indentation convention:
- *   4 spaces  - loads, vertical pass and start of the horizontal pass
- *               for the new pixels
- *   8 spaces  - completion of the horizontal pass and narrowing of the
- *               pixels that are already in flight
- *   12 spaces - 0565 packing and store of the previous eight pixels
- * The order of the instructions is significant because the three
- * streams share registers.
- */
-.macro bilinear_interpolate_eight_pixels_8888_0565_tail_head
-    mov       TMP1, X, asr #16
-    add       X, X, UX
-    add       TMP1, TOP, TMP1, asl #2
-    mov       TMP2, X, asr #16
-    add       X, X, UX
-    add       TMP2, TOP, TMP2, asl #2
-        vmlal.u16 q1, d19, d31
-        vshr.u16  q15, q12, #8
-            vuzp.u8 d8, d9
-        vshll.u16 q2, d20, #8
-        vmlsl.u16 q2, d20, d30
-        vmlal.u16 q2, d21, d30
-        vshll.u16 q3, d22, #8
-    vld1.32   {d20}, [TMP1], STRIDE
-        vmlsl.u16 q3, d22, d31
-        vmlal.u16 q3, d23, d31
-    vld1.32   {d21}, [TMP1]
-    vmull.u8  q8, d20, d28
-    vmlal.u8  q8, d21, d29
-        vshrn.u32 d0, q0, #16
-        vshrn.u32 d1, q1, #16
-        vshrn.u32 d4, q2, #16
-    vld1.32   {d22}, [TMP2], STRIDE
-        vshrn.u32 d5, q3, #16
-        vadd.u16  q12, q12, q13
-    vld1.32   {d23}, [TMP2]
-    vmull.u8  q9, d22, d28
-    mov       TMP3, X, asr #16
-    add       X, X, UX
-    add       TMP3, TOP, TMP3, asl #2
-    mov       TMP4, X, asr #16
-    add       X, X, UX
-    add       TMP4, TOP, TMP4, asl #2
-    vmlal.u8  q9, d23, d29
-    vld1.32   {d22}, [TMP3], STRIDE
-        vshr.u16  q15, q12, #8
-    vld1.32   {d23}, [TMP3]
-    vmull.u8  q10, d22, d28
-    vmlal.u8  q10, d23, d29
-        vmovn.u16 d10, q0
-    vshll.u16 q0, d16, #8
-        vmovn.u16 d11, q2
-    vmlsl.u16 q0, d16, d30
-    vmlal.u16 q0, d17, d30
-    pld       [TMP4, PF_OFFS]
-    vld1.32   {d16}, [TMP4], STRIDE
-        vadd.u16  q12, q12, q13
-    vld1.32   {d17}, [TMP4]
-    pld       [TMP4, PF_OFFS]
-    vmull.u8  q11, d16, d28
-    vmlal.u8  q11, d17, d29
-            vuzp.u8 d10, d11
-    vshll.u16 q1, d18, #8
-    vmlsl.u16 q1, d18, d31
-
-    /* second half: the next four pixels enter the pipeline */
-    mov       TMP1, X, asr #16
-    add       X, X, UX
-    add       TMP1, TOP, TMP1, asl #2
-    mov       TMP2, X, asr #16
-    add       X, X, UX
-    add       TMP2, TOP, TMP2, asl #2
-        vmlal.u16 q1, d19, d31
-            vuzp.u8 d9, d11
-        vshr.u16  q15, q12, #8
-        vshll.u16 q2, d20, #8
-            vuzp.u8 d8, d10
-        vmlsl.u16 q2, d20, d30
-        vmlal.u16 q2, d21, d30
-        vshll.u16 q3, d22, #8
-    vld1.32   {d20}, [TMP1], STRIDE
-        vmlsl.u16 q3, d22, d31
-        vmlal.u16 q3, d23, d31
-    vld1.32   {d21}, [TMP1]
-    vmull.u8  q8, d20, d28
-    vmlal.u8  q8, d21, d29
-            vshll.u8  q6, d9, #8
-            vshll.u8  q5, d10, #8
-            vshll.u8  q7, d8, #8
-        vshrn.u32 d0, q0, #16
-            vsri.u16  q5, q6, #5
-        vshrn.u32 d1, q1, #16
-            vsri.u16  q5, q7, #11
-        vshrn.u32 d4, q2, #16
-    vld1.32   {d22}, [TMP2], STRIDE
-        vshrn.u32 d5, q3, #16
-        vadd.u16  q12, q12, q13
-    vld1.32   {d23}, [TMP2]
-    vmull.u8  q9, d22, d28
-    mov       TMP3, X, asr #16
-    add       X, X, UX
-    add       TMP3, TOP, TMP3, asl #2
-    mov       TMP4, X, asr #16
-    add       X, X, UX
-    add       TMP4, TOP, TMP4, asl #2
-    vmlal.u8  q9, d23, d29
-    vld1.32   {d22}, [TMP3], STRIDE
-        vshr.u16  q15, q12, #8
-    vld1.32   {d23}, [TMP3]
-    vmull.u8  q10, d22, d28
-    vmlal.u8  q10, d23, d29
-        /* d8/d9 are free again: the packed pixels now live in q5 */
-        vmovn.u16 d8, q0
-    vshll.u16 q0, d16, #8
-        vmovn.u16 d9, q2
-    vmlsl.u16 q0, d16, d30
-    vmlal.u16 q0, d17, d30
-    pld       [TMP4, PF_OFFS]
-    vld1.32   {d16}, [TMP4], STRIDE
-        vadd.u16  q12, q12, q13
-    vld1.32   {d17}, [TMP4]
-    pld       [TMP4, PF_OFFS]
-    vmull.u8  q11, d16, d28
-    vmlal.u8  q11, d17, d29
-    vshll.u16 q1, d18, #8
-            vst1.32   {d10, d11}, [OUT, :128]!
-    vmlsl.u16 q1, d18, d31
-.endm
-/*****************************************************************************/
-
-generate_bilinear_scanline_func \
-    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
-    2, 2, 28, BILINEAR_FLAG_UNROLL_4
-
-generate_bilinear_scanline_func \
-    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
-    2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS
-
-generate_bilinear_scanline_func \
-    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
-    1, 2, 28, BILINEAR_FLAG_UNROLL_4
-
-generate_bilinear_scanline_func \
-    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
-    1, 1, 28, BILINEAR_FLAG_UNROLL_4
+/*
+ * Copyright © 2009 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+/*
+ * This file contains implementations of NEON optimized pixel processing
+ * functions. There is no full and detailed tutorial, but some functions
+ * (those which are exposing some new or interesting features) are
+ * extensively commented and can be used as examples.
+ *
+ * You may want to have a look at the comments for following functions:
+ *  - pixman_composite_over_8888_0565_asm_neon
+ *  - pixman_composite_over_n_8_0565_asm_neon
+ */
+
+/* Prevent the stack from becoming executable for no reason... */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+    .text
+    .fpu neon
+    .arch armv7a
+    .object_arch armv4
+    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
+    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
+    .arm
+    .altmacro
+    .p2align 2
+
+#include "pixman-arm-neon-asm.h"
+
+/* Global configuration options and preferences */
+
+/*
+ * The code can optionally make use of unaligned memory accesses to improve
+ * performance of handling leading/trailing pixels for each scanline.
+ * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
+ * example in linux if unaligned memory accesses are not configured to
+ * generate exceptions.
+ */
+.set RESPECT_STRICT_ALIGNMENT, 1
+
+/*
+ * Set default prefetch type. There is a choice between the following options:
+ *
+ * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
+ * as NOP to workaround some HW bugs or for whatever other reason)
+ *
+ * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
+ * advanced prefetch introduces heavy overhead)
+ *
+ * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
+ * which can run ARM and NEON instructions simultaneously so that extra ARM
+ * instructions do not add (many) extra cycles, but improve prefetch efficiency)
+ *
+ * Note: some types of function can't support advanced prefetch and fall back
+ *       to the simple one (those which handle 24bpp pixels)
+ */
+.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
+
+/* Prefetch distance in pixels for simple prefetch */
+.set PREFETCH_DISTANCE_SIMPLE, 64
+
+/*
+ * Implementation of pixman_composite_over_8888_0565_asm_neon
+ *
+ * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
+ * performs OVER compositing operation. Function fast_composite_over_8888_0565
+ * from pixman-fast-path.c does the same in C and can be used as a reference.
+ *
+ * First we need to have some NEON assembly code which can do the actual
+ * operation on the pixels and provide it to the template macro.
+ *
+ * Template macro quite conveniently takes care of emitting all the necessary
+ * code for memory reading and writing (including quite tricky cases of
+ * handling unaligned leading/trailing pixels), so we only need to deal with
+ * the data in NEON registers.
+ *
+ * NEON registers allocation in general is recommended to be the following:
+ * d0,  d1,  d2,  d3  - contain loaded source pixel data
+ * d4,  d5,  d6,  d7  - contain loaded destination pixels (if they are needed)
+ * d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used)
+ * d28, d29, d30, d31 - place for storing the result (destination pixels)
+ *
+ * As can be seen above, four 64-bit NEON registers are used for keeping
+ * intermediate pixel data and up to 8 pixels can be processed in one step
+ * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
+ *
+ * This particular function uses the following registers allocation:
+ * d0,  d1,  d2,  d3  - contain loaded source pixel data
+ * d4,  d5            - contain loaded destination pixels (they are needed)
+ * d28, d29           - place for storing the result (destination pixels)
+ */
+
+/*
+ * Step one. We need to have some code to do some arithmetic on pixel data.
+ * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
+ * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
+ * perform all the needed calculations and write the result to {d28, d29}.
+ * The rationale for having two macros and not just one will be explained
+ * later. In practice, any single monolithic function which does the work can
+ * be split into two parts in any arbitrary way without affecting correctness.
+ *
+ * There is one special trick here too. Common template macro can optionally
+ * make our life a bit easier by doing R, G, B, A color components
+ * deinterleaving for 32bpp pixel formats (and this feature is used in
+ * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
+ * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
+ * actually use d0 register for blue channel (a vector of eight 8-bit
+ * values), d1 register for green, d2 for red and d3 for alpha. This
+ * simple conversion can be also done with a few NEON instructions:
+ *
+ * Packed to planar conversion:
+ *  vuzp.8 d0, d1
+ *  vuzp.8 d2, d3
+ *  vuzp.8 d1, d3
+ *  vuzp.8 d0, d2
+ *
+ * Planar to packed conversion:
+ *  vzip.8 d0, d2
+ *  vzip.8 d1, d3
+ *  vzip.8 d2, d3
+ *  vzip.8 d0, d1
+ *
+ * But pixels can be loaded directly in planar format using VLD4.8 NEON
+ * instruction. It is 1 cycle slower than VLD1.32, so this is not always
+ * desirable, that's why deinterleaving is optional.
+ *
+ * But anyway, here is the code:
+ */
+.macro pixman_composite_over_8888_0565_process_pixblock_head
+    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+       and put data into d6 - red, d7 - green, d30 - blue */
+    vshrn.u16   d6, q2, #8  /* d6 = bits 15:8 of each pixel: RRRRRGGG */
+    vshrn.u16   d7, q2, #3  /* d7 = bits 10:3 of each pixel: GGGGGGBB */
+    vsli.u16    q2, q2, #5  /* copy the 5 blue bits up into bits 9:5 (bits 4:0 kept) */
+    vsri.u8     d6, d6, #5  /* replace the 3 low bits with the top 3 red bits */
+    vmvn.8      d3, d3      /* invert source alpha */
+    vsri.u8     d7, d7, #6  /* replace the 2 low bits with the top 2 green bits */
+    vshrn.u16   d30, q2, #2 /* d30 = bits 9:2: 5 blue bits followed by their top 3 bits */
+    /* now do alpha blending, storing results in 8-bit planar format
+       into d16 - red, d19 - green, d18 - blue */
+    vmull.u8    q10, d3, d6   /* q10 = (255 - alpha) * dst red, 16-bit products */
+    vmull.u8    q11, d3, d7   /* q11 = (255 - alpha) * dst green */
+    vmull.u8    q12, d3, d30  /* q12 = (255 - alpha) * dst blue */
+    vrshr.u16   q13, q10, #8  /* rounded (x >> 8); first half of the divide-by-255 step */
+    vrshr.u16   q3, q11, #8
+    vrshr.u16   q15, q12, #8
+    vraddhn.u16 d20, q10, q13 /* d20 = rounded high byte of x + (x >> 8), i.e. x / 255 (red) */
+    vraddhn.u16 d23, q11, q3  /* d23 = same for green */
+    vraddhn.u16 d22, q12, q15 /* d22 = same for blue; the tail macro adds the source */
+.endm
+
+.macro pixman_composite_over_8888_0565_process_pixblock_tail
+    /* ... continue alpha blending */
+    vqadd.u8    d16, d2, d20  /* d16 = src red + scaled dst red (saturating) */
+    vqadd.u8    q9, q0, q11   /* d18 = d0 + d22 (blue), d19 = d1 + d23 (green) */
+    /* convert the result to r5g6b5 and store it into {d28, d29} */
+    vshll.u8    q14, d16, #8  /* red moved to bits 15:8 of each 16-bit lane */
+    vshll.u8    q8, d19, #8   /* green moved to bits 15:8 */
+    vshll.u8    q9, d18, #8   /* blue moved to bits 15:8 */
+    vsri.u16    q14, q8, #5   /* insert green below the top 5 (red) bits */
+    vsri.u16    q14, q9, #11  /* insert blue below the top 11 (red+green) bits */
+.endm
+
+/*
+ * OK, now we got almost everything that we need. Using the above two
+ * macros, the work can be done right. But now we want to optimize
+ * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
+ * a lot from good code scheduling and software pipelining.
+ *
+ * Let's construct some code, which will run in the core main loop.
+ * Some pseudo-code of the main loop will look like this:
+ *   head
+ *   while (...) {
+ *     tail
+ *     head
+ *   }
+ *   tail
+ *
+ * It may look a bit weird, but this setup allows us to hide instruction
+ * latencies better and also utilize dual-issue capability more
+ * efficiently (make pairs of load-store and ALU instructions).
+ *
+ * So what we need now is a '*_tail_head' macro, which will be used
+ * in the core main loop. A trivial straightforward implementation
+ * of this macro would look like this:
+ *
+ *   pixman_composite_over_8888_0565_process_pixblock_tail
+ *   vst1.16     {d28, d29}, [DST_W, :128]!
+ *   vld1.16     {d4, d5}, [DST_R, :128]!
+ *   vld4.8      {d0, d1, d2, d3}, [SRC]!
+ *   pixman_composite_over_8888_0565_process_pixblock_head
+ *   cache_preload 8, 8
+ *
+ * Now it also got some VLD/VST instructions. We simply can't move from
+ * processing one block of pixels to the other one with just arithmetic.
+ * The previously processed data needs to be written to memory and new
+ * data needs to be fetched. Fortunately, this main loop does not deal
+ * with partial leading/trailing pixels and can load/store a full block
+ * of pixels in a bulk. Additionally, destination buffer is already
+ * 16 bytes aligned here (which is good for performance).
+ *
+ * New things here are DST_R, DST_W, SRC and MASK identifiers. These
+ * are the aliases for ARM registers which are used as pointers for
+ * accessing data. We maintain separate pointers for reading and writing
+ * destination buffer (DST_R and DST_W).
+ *
+ * Another new thing is 'cache_preload' macro. It is used for prefetching
+ * data into CPU L2 cache and improves performance when dealing with large
+ * images which are far larger than cache size. It uses one argument
+ * (actually two, but they need to be the same here) - number of pixels
+ * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
+ * details about this macro. Moreover, if good performance is needed
+ * the code from this macro needs to be copied into '*_tail_head' macro
+ * and mixed with the rest of code for optimal instructions scheduling.
+ * We are actually doing it below.
+ *
+ * Now after all the explanations, here is the optimized code.
+ * Different instruction streams (originating from '*_head', '*_tail'
+ * and 'cache_preload' macro) use different indentation levels for
+ * better readability. Actually taking the code from one of these
+ * indentation levels and ignoring a few VLD/VST instructions would
+ * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
+ * macro!
+ */
+
+#if 1
+
+.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
+        vqadd.u8    d16, d2, d20    /* (tail) uses the old source in d2 before it is reloaded */
+    vld1.16     {d4, d5}, [DST_R, :128]!    /* load the next 8 destination pixels */
+        vqadd.u8    q9, q0, q11     /* (tail) last use of the old d0/d1 source data */
+    vshrn.u16   d6, q2, #8
+    fetch_src_pixblock  /* takes the place of the source vld4 in the trivial version; defined outside this section */
+    vshrn.u16   d7, q2, #3
+    vsli.u16    q2, q2, #5
+        vshll.u8    q14, d16, #8
+                                    PF add PF_X, PF_X, #8
+        vshll.u8    q8, d19, #8
+                                    PF tst PF_CTL, #0xF
+    vsri.u8     d6, d6, #5
+                                    PF addne PF_X, PF_X, #8
+    vmvn.8      d3, d3  /* (head) invert the freshly loaded source alpha */
+                                    PF subne PF_CTL, PF_CTL, #1
+    vsri.u8     d7, d7, #6
+    vshrn.u16   d30, q2, #2
+    vmull.u8    q10, d3, d6
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+    vmull.u8    q11, d3, d7
+    vmull.u8    q12, d3, d30
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+        vsri.u16    q14, q8, #5
+                                    PF cmp PF_X, ORIG_W
+        vshll.u8    q9, d18, #8
+    vrshr.u16   q13, q10, #8
+                                    PF subge PF_X, PF_X, ORIG_W
+    vrshr.u16   q3, q11, #8
+    vrshr.u16   q15, q12, #8
+                                    PF subges PF_CTL, PF_CTL, #0x10
+        vsri.u16    q14, q9, #11    /* (tail) q14 now holds the finished r5g6b5 pixels */
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+    vraddhn.u16 d20, q10, q13
+    vraddhn.u16 d23, q11, q3
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vraddhn.u16 d22, q12, q15
+        vst1.16     {d28, d29}, [DST_W, :128]!  /* store the previous block's 8 result pixels */
+.endm
+
+#else
+
+/* If we did not care much about the performance, we would just use this... */
+.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
+    pixman_composite_over_8888_0565_process_pixblock_tail   /* finish the previous block */
+    vst1.16     {d28, d29}, [DST_W, :128]!  /* store its 8 r5g6b5 result pixels */
+    vld1.16     {d4, d5}, [DST_R, :128]!    /* load the next 8 destination pixels */
+    fetch_src_pixblock  /* presumably loads the next source block; defined outside this section */
+    pixman_composite_over_8888_0565_process_pixblock_head   /* start on the next block */
+    cache_preload 8, 8
+.endm
+
+#endif
+
+/*
+ * And now the final part. We are using 'generate_composite_function' macro
+ * to put all the stuff together. We are specifying the name of the function
+ * which we want to get, number of bits per pixel for the source, mask and
+ * destination (0 if unused, like mask in this case). Next come some bit
+ * flags:
+ *   FLAG_DST_READWRITE      - tells that the destination buffer is both read
+ *                             and written, for write-only buffer we would use
+ *                             FLAG_DST_WRITEONLY flag instead
+ *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
+ *                             and separate color channels for 32bpp format.
+ * The next things are:
+ *  - the number of pixels processed per iteration (8 in this case, because
+ *    that's the maximum that can fit into four 64-bit NEON registers).
+ *  - prefetch distance, measured in pixel blocks. In this case it is 5 blocks
+ *    of 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
+ *    prefetch distance can be selected by running some benchmarks.
+ *
+ * After that we specify some macros, these are 'default_init',
+ * 'default_cleanup' here which are empty (but it is possible to have custom
+ * init/cleanup macros to be able to save/restore some extra NEON registers
+ * like d8-d15 or do anything else) followed by
+ * 'pixman_composite_over_8888_0565_process_pixblock_head',
+ * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
+ * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
+ * which we got implemented above.
+ *
+ * The last part is the NEON registers allocation scheme.
+ */
+generate_composite_function \
+    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, /* name; src, mask, dst bpp */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_0565_process_pixblock_head, \
+    pixman_composite_over_8888_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_0565_process_pixblock_head
+    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+       and put data into d6 - red, d7 - green, d30 - blue */
+    vshrn.u16   d6, q2, #8  /* d6 = bits 15:8 of each pixel: RRRRRGGG */
+    vshrn.u16   d7, q2, #3  /* d7 = bits 10:3 of each pixel: GGGGGGBB */
+    vsli.u16    q2, q2, #5  /* copy the 5 blue bits up into bits 9:5 (bits 4:0 kept) */
+    vsri.u8     d6, d6, #5  /* replace the 3 low bits with the top 3 red bits */
+    vsri.u8     d7, d7, #6  /* replace the 2 low bits with the top 2 green bits */
+    vshrn.u16   d30, q2, #2 /* d30 = bits 9:2: 5 blue bits followed by their top 3 bits */
+    /* now do alpha blending, storing results in 8-bit planar format
+       into d16 - red, d19 - green, d18 - blue */
+    vmull.u8    q10, d3, d6   /* d3 is already inverted alpha (pixman_composite_over_n_0565_init) */
+    vmull.u8    q11, d3, d7
+    vmull.u8    q12, d3, d30
+    vrshr.u16   q13, q10, #8  /* rounded (x >> 8); first half of the divide-by-255 step */
+    vrshr.u16   q3, q11, #8
+    vrshr.u16   q15, q12, #8
+    vraddhn.u16 d20, q10, q13 /* d20 = rounded high byte of x + (x >> 8), i.e. x / 255 (red) */
+    vraddhn.u16 d23, q11, q3  /* d23 = same for green */
+    vraddhn.u16 d22, q12, q15 /* d22 = same for blue; the tail macro adds the source */
+.endm
+
+.macro pixman_composite_over_n_0565_process_pixblock_tail
+    /* ... continue alpha blending */
+    vqadd.u8    d16, d2, d20  /* d16 = d2 (byte 2 of the solid colour) + scaled dst red, saturating */
+    vqadd.u8    q9, q0, q11   /* d18 = d0 + d22 (blue), d19 = d1 + d23 (green) */
+    /* convert the result to r5g6b5 and store it into {d28, d29} */
+    vshll.u8    q14, d16, #8  /* red moved to bits 15:8 of each 16-bit lane */
+    vshll.u8    q8, d19, #8   /* green moved to bits 15:8 */
+    vshll.u8    q9, d18, #8   /* blue moved to bits 15:8 */
+    vsri.u16    q14, q8, #5   /* insert green below the top 5 (red) bits */
+    vsri.u16    q14, q9, #11  /* insert blue below the top 11 (red+green) bits */
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_n_0565_process_pixblock_tail_head
+    pixman_composite_over_n_0565_process_pixblock_tail  /* finish the previous block */
+    vld1.16     {d4, d5}, [DST_R, :128]!    /* load the next 8 destination pixels */
+    vst1.16     {d28, d29}, [DST_W, :128]!  /* store the previous block's result */
+    pixman_composite_over_n_0565_process_pixblock_head  /* no source fetch: d0-d3 were set up once by the init macro */
+    cache_preload 8, 8
+.endm
+
+.macro pixman_composite_over_n_0565_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET   /* DUMMY = sp + ARGS_STACK_OFFSET (constant defined outside this section) */
+    vld1.32     {d3[0]}, [DUMMY]    /* load one 32-bit word from there; presumably the solid source colour */
+    vdup.8      d0, d3[0]   /* broadcast byte 0 to all 8 lanes of d0 */
+    vdup.8      d1, d3[1]   /* byte 1 -> d1 */
+    vdup.8      d2, d3[2]   /* byte 2 -> d2 */
+    vdup.8      d3, d3[3]   /* byte 3 -> d3 (the alpha register, inverted next) */
+    vmvn.8      d3, d3      /* invert source alpha */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, /* name; src, mask, dst bpp (0 = unused) */ \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_0565_init, \
+    default_cleanup, \
+    pixman_composite_over_n_0565_process_pixblock_head, \
+    pixman_composite_over_n_0565_process_pixblock_tail, \
+    pixman_composite_over_n_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_8888_0565_process_pixblock_head
+    vshll.u8    q8, d1, #8   /* q8  = d1 widened to 16 bits, value in bits 15:8 */
+    vshll.u8    q14, d2, #8  /* q14 = d2 in bits 15:8; becomes the top field of the result */
+    vshll.u8    q9, d0, #8   /* q9  = d0 in bits 15:8 */
+.endm
+
+.macro pixman_composite_src_8888_0565_process_pixblock_tail
+    vsri.u16    q14, q8, #5   /* insert q8 below the top 5 bits of q14 */
+    vsri.u16    q14, q9, #11  /* insert q9 below the top 11 bits; q14 = {d28, d29} is the packed result */
+.endm
+
+.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
+        vsri.u16    q14, q8, #5     /* (tail) last use of the previous block's q8 */
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+    fetch_src_pixblock  /* presumably loads the next source block; defined outside this section */
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vsri.u16    q14, q9, #11    /* (tail) q14 now holds the finished pixels */
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+    vshll.u8    q8, d1, #8  /* (head) q8 can be reused: both tail vsri reads are done */
+        vst1.16     {d28, d29}, [DST_W, :128]!  /* store the previous result before q14 is overwritten */
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vshll.u8    q14, d2, #8
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+    vshll.u8    q9, d0, #8
+.endm
+
+generate_composite_function \
+    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, /* name; src, mask, dst bpp */ \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_8888_0565_process_pixblock_head, \
+    pixman_composite_src_8888_0565_process_pixblock_tail, \
+    pixman_composite_src_8888_0565_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0565_8888_process_pixblock_head
+    vshrn.u16   d30, q0, #8  /* d30 = bits 15:8 of each 16-bit source pixel */
+    vshrn.u16   d29, q0, #3  /* d29 = bits 10:3 */
+    vsli.u16    q0, q0, #5   /* copy the low 5 bits up into bits 9:5 (bits 4:0 kept) */
+    vmov.u8     d31, #255    /* d31 = constant 255 in every lane */
+    vsri.u8     d30, d30, #5 /* replace the 3 low bits with the top 3 bits of the field */
+    vsri.u8     d29, d29, #6 /* replace the 2 low bits with the top 2 bits of the field */
+    vshrn.u16   d28, q0, #2  /* d28 = bits 9:2: the 5-bit field followed by its top 3 bits */
+.endm
+
+.macro pixman_composite_src_0565_8888_process_pixblock_tail
+.endm /* empty: the head macro already leaves the result in d28-d31 */
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
+    pixman_composite_src_0565_8888_process_pixblock_tail    /* expands to nothing (empty macro) */
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!  /* interleaving store of the previous block's four planes */
+    fetch_src_pixblock  /* presumably loads the next source block; defined outside this section */
+    pixman_composite_src_0565_8888_process_pixblock_head    /* convert the next block */
+    cache_preload 8, 8
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, /* name; src, mask, dst bpp */ \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0565_8888_process_pixblock_head, \
+    pixman_composite_src_0565_8888_process_pixblock_tail, \
+    pixman_composite_src_0565_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8_8_process_pixblock_head
+    vqadd.u8    q14, q0, q2  /* {d28, d29} = {d0, d1} + {d4, d5}, per-byte saturating add */
+    vqadd.u8    q15, q1, q3  /* {d30, d31} = {d2, d3} + {d6, d7} */
+.endm
+
+.macro pixman_composite_add_8_8_process_pixblock_tail
+.endm /* empty: the head macro already produces the result in q14/q15 */
+
+.macro pixman_composite_add_8_8_process_pixblock_tail_head
+    fetch_src_pixblock  /* presumably loads the next source block; defined outside this section */
+                                    PF add PF_X, PF_X, #32
+                                    PF tst PF_CTL, #0xF
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!    /* load the next 32 destination bytes */
+                                    PF addne PF_X, PF_X, #32
+                                    PF subne PF_CTL, PF_CTL, #1
+        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!    /* store the previous block's sums */
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vqadd.u8    q14, q0, q2     /* (head) saturating add for the new block */
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vqadd.u8    q15, q1, q3
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8_8_asm_neon, 8, 0, 8, /* name; src, mask, dst bpp */ \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8_8_process_pixblock_head, \
+    pixman_composite_add_8_8_process_pixblock_tail, \
+    pixman_composite_add_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
+    fetch_src_pixblock  /* presumably loads the next source block; defined outside this section */
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+    vld1.32     {d4, d5, d6, d7}, [DST_R, :128]!    /* load the next 8 destination pixels (packed) */
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vst1.32     {d28, d29, d30, d31}, [DST_W, :128]!    /* store the previous block's sums */
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vqadd.u8    q14, q0, q2     /* (head) per-byte saturating add, same as the add_8_8 head */
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vqadd.u8    q15, q1, q3
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, /* name; src, mask, dst bpp */ \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8_8_process_pixblock_head, /* add_8_8 head/tail reused: the add is per byte */ \
+    pixman_composite_add_8_8_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_add_asm_neon, 32, 0, 32, /* presumably src, mask, dst bpp as for generate_composite_function - confirm */ \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8_8_process_pixblock_head, \
+    pixman_composite_add_8_8_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
+    vmvn.8      d24, d3  /* get inverted alpha */
+    /* do alpha blending */
+    vmull.u8    q8, d24, d4   /* q8  = (255 - alpha) * d4, 16-bit products */
+    vmull.u8    q9, d24, d5   /* q9  = (255 - alpha) * d5 */
+    vmull.u8    q10, d24, d6  /* q10 = (255 - alpha) * d6 */
+    vmull.u8    q11, d24, d7  /* q11 = (255 - alpha) * d7 */
+.endm
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
+    vrshr.u16   q14, q8, #8   /* rounded (x >> 8) for each product from the head */
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q12, q10, #8  /* q12 = {d24, d25}: overwrites the inverted alpha kept in d24 */
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d28, q14, q8  /* d28 = rounded high byte of x + (x >> 8), i.e. x / 255 */
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q12, q10
+    vraddhn.u16 d31, q13, q11 /* result planes are now in d28-d31 */
+.endm
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!    /* next destination block, deinterleaved; q8-q11 still hold the old products */
+        vrshr.u16   q14, q8, #8
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+        vrshr.u16   q15, q9, #8
+        vrshr.u16   q12, q10, #8
+        vrshr.u16   q13, q11, #8
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d28, q14, q8
+        vraddhn.u16 d29, q15, q9
+                                    PF cmp PF_X, ORIG_W
+        vraddhn.u16 d30, q12, q10
+        vraddhn.u16 d31, q13, q11
+    fetch_src_pixblock  /* presumably loads the next source block; defined outside this section */
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+    vmvn.8      d22, d3     /* inverted alpha goes to d22 here, whereas the head macro uses d24 */
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!    /* store the previous block's result */
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q8, d22, d4
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q9, d22, d5
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+    vmull.u8    q10, d22, d6
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vmull.u8    q11, d22, d7    /* last read of d22; q11 = {d22, d23} is overwritten by this result */
+.endm
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, /* presumably src, mask, dst bpp as for generate_composite_function - confirm */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
+    pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
+    pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_8888_process_pixblock_head
+    pixman_composite_out_reverse_8888_8888_process_pixblock_head    /* same first half: d24 = ~d3, then multiply d4-d7 by it */
+.endm
+
+.macro pixman_composite_over_8888_8888_process_pixblock_tail
+    pixman_composite_out_reverse_8888_8888_process_pixblock_tail    /* d28-d31 = products / 255 */
+    vqadd.u8    q14, q0, q14  /* add source {d0, d1} to {d28, d29}, per-byte saturating */
+    vqadd.u8    q15, q1, q15  /* add source {d2, d3} to {d30, d31} */
+.endm
+
+.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!    /* next destination block, deinterleaved; q8-q11 still hold the old products */
+        vrshr.u16   q14, q8, #8
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+        vrshr.u16   q15, q9, #8
+        vrshr.u16   q12, q10, #8
+        vrshr.u16   q13, q11, #8
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d28, q14, q8
+        vraddhn.u16 d29, q15, q9
+                                    PF cmp PF_X, ORIG_W
+        vraddhn.u16 d30, q12, q10
+        vraddhn.u16 d31, q13, q11
+        vqadd.u8    q14, q0, q14    /* (tail) add the old source before it is reloaded below */
+        vqadd.u8    q15, q1, q15
+    fetch_src_pixblock  /* presumably loads the next source block; defined outside this section */
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+    vmvn.8      d22, d3     /* inverted alpha goes to d22 here, whereas the head macro uses d24 */
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!    /* store the previous block's result */
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q8, d22, d4
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q9, d22, d5
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+    vmull.u8    q10, d22, d6
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vmull.u8    q11, d22, d7    /* last read of d22; q11 = {d22, d23} is overwritten by this result */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, /* name; src, mask, dst bpp */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_over_asm_neon, 32, 0, 32, /* presumably src, mask, dst bpp as for generate_composite_function - confirm */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8888_process_pixblock_head
+    /* deinterleaved source pixels in {d0, d1, d2, d3} */
+    /* inverted alpha in {d24} */
+    /* destination pixels in {d4, d5, d6, d7} */ /* NOTE(review): the over_n_8888 generate_composite_function call passes the over_8888_8888 head/tail, not this macro - confirm whether it is used elsewhere */
+    vmull.u8    q8, d24, d4   /* q8  = inverted alpha * d4, 16-bit products */
+    vmull.u8    q9, d24, d5   /* q9  = inverted alpha * d5 */
+    vmull.u8    q10, d24, d6  /* q10 = inverted alpha * d6 */
+    vmull.u8    q11, d24, d7  /* q11 = inverted alpha * d7 */
+.endm
+
+.macro pixman_composite_over_n_8888_process_pixblock_tail
+    vrshr.u16   q14, q8, #8   /* rounded (x >> 8) for each product */
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q2, q10, #8   /* q2/q3 (the destination registers d4-d7) serve as scratch, so d24 is not touched */
+    vrshr.u16   q3, q11, #8
+    vraddhn.u16 d28, q14, q8  /* d28 = rounded high byte of x + (x >> 8), i.e. x / 255 */
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q2, q10
+    vraddhn.u16 d31, q3, q11
+    vqadd.u8    q14, q0, q14  /* add source {d0, d1}, per-byte saturating */
+    vqadd.u8    q15, q1, q15  /* add source {d2, d3} */
+.endm
+
+.macro pixman_composite_over_n_8888_process_pixblock_tail_head
+        vrshr.u16   q14, q8, #8
+        vrshr.u16   q15, q9, #8
+        vrshr.u16   q2, q10, #8     /* q2/q3 are free as scratch: d4-d7 are reloaded by the vld4.8 below */
+        vrshr.u16   q3, q11, #8
+        vraddhn.u16 d28, q14, q8
+        vraddhn.u16 d29, q15, q9
+        vraddhn.u16 d30, q2, q10
+        vraddhn.u16 d31, q3, q11
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!    /* next destination block, deinterleaved */
+        vqadd.u8    q14, q0, q14
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0x0F
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vqadd.u8    q15, q1, q15
+                                    PF cmp PF_X, ORIG_W
+    vmull.u8    q8, d24, d4     /* (head) d24 is read here but never written inside this macro */
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+    vmull.u8    q9, d24, d5
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q10, d24, d6
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q11, d24, d7
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!    /* store the previous block's result */
+.endm
+
+.macro pixman_composite_over_n_8888_init /* load the solid colour from the stack and pre-splat its channels plus inverted alpha */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET /* address of the 32-bit solid colour argument */
+    vld1.32     {d3[0]}, [DUMMY]
+    vdup.8      d0, d3[0] /* replicate each byte of the colour across its own register, d0..d3 */
+    vdup.8      d1, d3[1]
+    vdup.8      d2, d3[2]
+    vdup.8      d3, d3[3] /* done last because it overwrites the register holding the loaded word */
+    vmvn.8      d24, d3  /* get inverted alpha */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, /* presumably src, mask, dst bpp (0 = not fetched) - confirm in the macro definition */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8888_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, /* NOTE(review): head and tail are the over_8888_8888 macros, not the over_n_8888 ones */ \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_n_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head /* pipelined step: finish previous block, load next destination into d0-d3, start its multiplies */
+        vrshr.u16   q14, q8, #8 /* deeper-indented NEON lines finish the previous block */
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+        vrshr.u16   q15, q9, #8
+        vrshr.u16   q12, q10, #8
+        vrshr.u16   q13, q11, #8
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d28, q14, q8
+        vraddhn.u16 d29, q15, q9
+                                    PF cmp PF_X, ORIG_W
+        vraddhn.u16 d30, q12, q10
+        vraddhn.u16 d31, q13, q11
+        vqadd.u8    q14, q0, q14 /* saturating add of the previous destination block still held in q0/q1 */
+        vqadd.u8    q15, q1, q15
+    vld4.8      {d0, d1, d2, d3}, [DST_R, :128]! /* next destination block; issued after the vqadd pair has consumed q0/q1 */
+    vmvn.8      d22, d3 /* inverted fourth channel of the destination (presumably alpha) */
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q8, d22, d4 /* d4-d7: splatted solid colour set up by the over_reverse_n_8888 init macro */
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q9, d22, d5
+    vmull.u8    q10, d22, d6
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vmull.u8    q11, d22, d7
+.endm /* PF-prefixed lines are prefetch bookkeeping; the PF macro is defined outside this chunk */
+
+.macro pixman_composite_over_reverse_n_8888_init /* load the solid colour from the stack and splat its four bytes into d4..d7 */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET /* address of the 32-bit solid colour argument */
+    vld1.32     {d7[0]}, [DUMMY]
+    vdup.8      d4, d7[0]
+    vdup.8      d5, d7[1]
+    vdup.8      d6, d7[2]
+    vdup.8      d7, d7[3] /* done last because it overwrites the register holding the loaded word */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, /* presumably src, mask, dst bpp (0 = not fetched) - confirm in the macro definition */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_reverse_n_8888_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, /* head and tail are shared with over_8888_8888; operand roles are swapped through the base registers below */ \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0,  /* dst_r_basereg */ \
+    4,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_head /* source IN mask, unpack r5g6b5 destination from q2, start the OVER multiplies */
+    vmull.u8    q0,  d24, d8    /* IN for SRC pixels (part1) */
+    vmull.u8    q1,  d24, d9
+    vmull.u8    q6,  d24, d10
+    vmull.u8    q7,  d24, d11
+        vshrn.u16   d6,  q2, #8 /* convert DST_R data to 32-bpp (part1) */
+        vshrn.u16   d7,  q2, #3
+        vsli.u16    q2,  q2, #5
+    vrshr.u16   q8,  q0,  #8    /* IN for SRC pixels (part2) */
+    vrshr.u16   q9,  q1,  #8
+    vrshr.u16   q10, q6,  #8
+    vrshr.u16   q11, q7,  #8
+    vraddhn.u16 d0,  q0,  q8 /* masked source channels end up in d0..d3 */
+    vraddhn.u16 d1,  q1,  q9
+    vraddhn.u16 d2,  q6,  q10
+    vraddhn.u16 d3,  q7,  q11
+        vsri.u8     d6,  d6, #5 /* convert DST_R data to 32-bpp (part2) */
+        vsri.u8     d7,  d7, #6
+    vmvn.8      d3,  d3 /* d3 becomes the inverted masked value of the fourth source channel */
+        vshrn.u16   d30, q2, #2 /* third unpacked destination channel goes to d30 */
+    vmull.u8    q8,  d3, d6     /* now do alpha blending */
+    vmull.u8    q9,  d3, d7
+    vmull.u8    q10, d3, d30
+.endm
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail /* finish the blend and repack three 8-bit channels into 16-bit pixels in q14 */
+    /* 3 cycle bubble (after vmull.u8) */
+    vrshr.u16   q13, q8,  #8
+    vrshr.u16   q11, q9,  #8
+    vrshr.u16   q15, q10, #8
+    vraddhn.u16 d16, q8,  q13 /* scaled destination channels go to d16, d27, d26 */
+    vraddhn.u16 d27, q9,  q11
+    vraddhn.u16 d26, q10, q15
+    vqadd.u8    d16, d2,  d16 /* saturating add of the masked source channel in d2 */
+    /* 1 cycle bubble */
+    vqadd.u8    q9,  q0,  q13 /* adds d0/d1 to d26/d27 in one instruction, results in d18/d19 */
+    vshll.u8    q14, d16, #8    /* convert to 16bpp */
+    vshll.u8    q8,  d19, #8
+    vshll.u8    q9,  d18, #8
+    vsri.u16    q14, q8,  #5 /* shift-right-insert packs the second and third fields below the first */
+    /* 1 cycle bubble */
+    vsri.u16    q14, q9,  #11
+.endm
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head /* pipelined step: tail of the previous block interleaved with head of the next */
+    vld1.16     {d4, d5}, [DST_R, :128]! /* next 8 destination pixels (16 bits each) into q2 */
+    vshrn.u16   d6,  q2,  #8
+    fetch_mask_pixblock /* fetch_* and cache_preload are helper macros defined outside this chunk */
+    vshrn.u16   d7,  q2,  #3
+    fetch_src_pixblock
+    vmull.u8    q6,  d24, d10
+        vrshr.u16   q13, q8,  #8 /* deeper-indented lines repeat the tail sequence for the previous block */
+        vrshr.u16   q11, q9,  #8
+        vrshr.u16   q15, q10, #8
+        vraddhn.u16 d16, q8,  q13
+        vraddhn.u16 d27, q9,  q11
+        vraddhn.u16 d26, q10, q15
+        vqadd.u8    d16, d2,  d16
+    vmull.u8    q1,  d24, d9 /* q1 may be overwritten here: d2 was consumed by the vqadd just above */
+        vqadd.u8    q9,  q0,  q13
+        vshll.u8    q14, d16, #8
+    vmull.u8    q0,  d24, d8 /* q0 may be overwritten here: it was consumed by the vqadd two lines above */
+        vshll.u8    q8,  d19, #8
+        vshll.u8    q9,  d18, #8
+        vsri.u16    q14, q8,  #5
+    vmull.u8    q7,  d24, d11
+        vsri.u16    q14, q9,  #11
+
+    cache_preload 8, 8
+
+    vsli.u16    q2,  q2,  #5 /* from here on: remainder of the head sequence for the next block */
+    vrshr.u16   q8,  q0,  #8
+    vrshr.u16   q9,  q1,  #8
+    vrshr.u16   q10, q6,  #8
+    vrshr.u16   q11, q7,  #8
+    vraddhn.u16 d0,  q0,  q8
+    vraddhn.u16 d1,  q1,  q9
+    vraddhn.u16 d2,  q6,  q10
+    vraddhn.u16 d3,  q7,  q11
+    vsri.u8     d6,  d6,  #5
+    vsri.u8     d7,  d7,  #6
+    vmvn.8      d3,  d3
+    vshrn.u16   d30, q2,  #2
+    vst1.16     {d28, d29}, [DST_W, :128]! /* store the previous block's packed result before d30 and q8-q10 are reused */
+    vmull.u8    q8,  d3,  d6
+    vmull.u8    q9,  d3,  d7
+    vmull.u8    q10, d3,  d30
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, /* presumably src, mask, dst bpp - confirm in the macro definition */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, /* init/cleanup pair defined outside this chunk */ \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+/*
+ * This function needs a special initialization of solid source.
+ * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
+ * offset, split into color components and replicated in d8-d11
+ * registers. Additionally, this function needs all the NEON registers,
+ * so it has to save d8-d15 registers which are callee saved according
+ * to ABI. These registers are restored from 'cleanup' macro. All the
+ * other NEON registers are caller saved, so can be clobbered freely
+ * without introducing any problems.
+ */
+.macro pixman_composite_over_n_8_0565_init /* save d8-d15, then splat the four bytes of the solid colour into d8..d11 */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET /* address is taken before vpush moves sp */
+    vpush       {d8-d15}
+    vld1.32     {d11[0]}, [DUMMY]
+    vdup.8      d8, d11[0]
+    vdup.8      d9, d11[1]
+    vdup.8      d10, d11[2]
+    vdup.8      d11, d11[3] /* done last because it overwrites the register holding the loaded word */
+.endm
+
+.macro pixman_composite_over_n_8_0565_cleanup /* undo the vpush done by pixman_composite_over_n_8_0565_init */
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, /* presumably src, mask, dst bpp (0 = not fetched) - confirm in the macro definition */ \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8_0565_init, \
+    pixman_composite_over_n_8_0565_cleanup, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, /* pixel-block macros are reused from over_8888_8_0565; the init macro pre-fills d8-d11 */ \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_n_0565_init /* save d8-d15 and splat byte 3 of a 32-bit stack argument across d24 */
+    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8) /* presumably the solid mask argument - TODO confirm the stack layout; taken before vpush moves sp */
+    vpush       {d8-d15}
+    vld1.32     {d24[0]}, [DUMMY]
+    vdup.8      d24, d24[3] /* top byte of the loaded word replicated into all 8 lanes */
+.endm
+
+.macro pixman_composite_over_8888_n_0565_cleanup /* undo the vpush done by pixman_composite_over_8888_n_0565_init */
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, /* presumably src, mask, dst bpp (0 = not fetched) - confirm in the macro definition */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_8888_n_0565_init, \
+    pixman_composite_over_8888_n_0565_cleanup, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, /* pixel-block macros are reused from over_8888_8_0565; the init macro pre-fills d24 */ \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0565_0565_process_pixblock_head /* empty: a plain copy needs no per-block arithmetic */
+.endm
+
+.macro pixman_composite_src_0565_0565_process_pixblock_tail /* empty: a plain copy needs no per-block arithmetic */
+.endm
+
+.macro pixman_composite_src_0565_0565_process_pixblock_tail_head /* store the block held in d0-d3, then fetch the next source block */
+    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! /* 32 bytes = 16 pixels of 16 bits */
+    fetch_src_pixblock /* helper macro defined outside this chunk */
+    cache_preload 16, 16
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, /* presumably src, mask, dst bpp (0 = not fetched) - confirm in the macro definition */ \
+    FLAG_DST_WRITEONLY, \
+    16, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0565_0565_process_pixblock_head, \
+    pixman_composite_src_0565_0565_process_pixblock_tail, \
+    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8_process_pixblock_head /* empty: the fill value is prepared once by the init macro */
+.endm
+
+.macro pixman_composite_src_n_8_process_pixblock_tail /* empty: the fill value is prepared once by the init macro */
+.endm
+
+.macro pixman_composite_src_n_8_process_pixblock_tail_head /* write one block of the replicated fill value */
+    vst1.8  {d0, d1, d2, d3}, [DST_W, :128]! /* 32 bytes = 32 pixels of 8 bits */
+.endm
+
+.macro pixman_composite_src_n_8_init /* replicate the low byte of the stack argument across all of q0 and q1 */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d0[0]}, [DUMMY]
+    vsli.u64    d0, d0, #8 /* shift-left-insert keeps the low bits, so each step doubles the replicated width: 8 -> 16 bits */
+    vsli.u64    d0, d0, #16 /* 16 -> 32 bits */
+    vsli.u64    d0, d0, #32 /* 32 -> 64 bits; the unloaded upper lane of d0 has been shifted out by now */
+    vorr        d1, d0, d0 /* vorr with identical operands is a register copy */
+    vorr        q1, q0, q0
+.endm
+
+.macro pixman_composite_src_n_8_cleanup /* empty: pixman_composite_src_n_8_init pushes nothing, so there is nothing to restore */
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_8_asm_neon, 0, 0, 8, /* presumably src, mask, dst bpp (0 = not fetched) - confirm in the macro definition */ \
+    FLAG_DST_WRITEONLY, \
+    32, /* number of pixels, processed in a single block */ \
+    0,  /* prefetch distance */ \
+    pixman_composite_src_n_8_init, \
+    pixman_composite_src_n_8_cleanup, \
+    pixman_composite_src_n_8_process_pixblock_head, \
+    pixman_composite_src_n_8_process_pixblock_tail, \
+    pixman_composite_src_n_8_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_0565_process_pixblock_head /* empty: the fill value is prepared once by the init macro */
+.endm
+
+.macro pixman_composite_src_n_0565_process_pixblock_tail /* empty: the fill value is prepared once by the init macro */
+.endm
+
+.macro pixman_composite_src_n_0565_process_pixblock_tail_head /* write one block of the replicated fill value */
+    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! /* 32 bytes = 16 pixels of 16 bits */
+.endm
+
+.macro pixman_composite_src_n_0565_init /* replicate the low 16 bits of the stack argument across all of q0 and q1 */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d0[0]}, [DUMMY]
+    vsli.u64    d0, d0, #16 /* shift-left-insert keeps the low bits: 16 -> 32 bits replicated */
+    vsli.u64    d0, d0, #32 /* 32 -> 64 bits; the unloaded upper lane of d0 has been shifted out by now */
+    vorr        d1, d0, d0 /* vorr with identical operands is a register copy */
+    vorr        q1, q0, q0
+.endm
+
+.macro pixman_composite_src_n_0565_cleanup /* empty: pixman_composite_src_n_0565_init pushes nothing, so there is nothing to restore */
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, /* presumably src, mask, dst bpp (0 = not fetched) - confirm in the macro definition */ \
+    FLAG_DST_WRITEONLY, \
+    16, /* number of pixels, processed in a single block */ \
+    0,  /* prefetch distance */ \
+    pixman_composite_src_n_0565_init, \
+    pixman_composite_src_n_0565_cleanup, \
+    pixman_composite_src_n_0565_process_pixblock_head, \
+    pixman_composite_src_n_0565_process_pixblock_tail, \
+    pixman_composite_src_n_0565_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8888_process_pixblock_head /* empty: the fill value is prepared once by the init macro */
+.endm
+
+.macro pixman_composite_src_n_8888_process_pixblock_tail /* empty: the fill value is prepared once by the init macro */
+.endm
+
+.macro pixman_composite_src_n_8888_process_pixblock_tail_head /* write one block of the replicated fill value */
+    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! /* 32 bytes = 8 pixels of 32 bits */
+.endm
+
+.macro pixman_composite_src_n_8888_init /* replicate the 32-bit stack argument across all of q0 and q1 */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d0[0]}, [DUMMY]
+    vsli.u64    d0, d0, #32 /* shift-left-insert keeps the low 32 bits and copies them into the high 32 bits */
+    vorr        d1, d0, d0 /* vorr with identical operands is a register copy */
+    vorr        q1, q0, q0
+.endm
+
+.macro pixman_composite_src_n_8888_cleanup /* empty: pixman_composite_src_n_8888_init pushes nothing, so there is nothing to restore */
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, /* presumably src, mask, dst bpp (0 = not fetched) - confirm in the macro definition */ \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    0, /* prefetch distance */ \
+    pixman_composite_src_n_8888_init, \
+    pixman_composite_src_n_8888_cleanup, \
+    pixman_composite_src_n_8888_process_pixblock_head, \
+    pixman_composite_src_n_8888_process_pixblock_tail, \
+    pixman_composite_src_n_8888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_8888_8888_process_pixblock_head /* empty: a plain copy needs no per-block arithmetic */
+.endm
+
+.macro pixman_composite_src_8888_8888_process_pixblock_tail /* empty: a plain copy needs no per-block arithmetic */
+.endm
+
+.macro pixman_composite_src_8888_8888_process_pixblock_tail_head /* store the block held in d0-d3, then fetch the next source block */
+    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! /* 32 bytes = 8 pixels of 32 bits */
+    fetch_src_pixblock /* helper macro defined outside this chunk */
+    cache_preload 8, 8
+.endm
+
+generate_composite_function \
+    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, /* presumably src, mask, dst bpp (0 = not fetched) - confirm in the macro definition */ \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_8888_8888_process_pixblock_head, \
+    pixman_composite_src_8888_8888_process_pixblock_tail, \
+    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_x888_8888_process_pixblock_head /* force the top byte of every 32-bit pixel in q0/q1 to 0xFF */
+    vorr     q0, q0, q2 /* q2 holds 0xFF000000 per lane, built by pixman_composite_src_x888_8888_init */
+    vorr     q1, q1, q2
+.endm
+
+.macro pixman_composite_src_x888_8888_process_pixblock_tail /* empty: all per-block work is done in the head macro */
+.endm
+
+.macro pixman_composite_src_x888_8888_process_pixblock_tail_head /* store the finished block, fetch the next one and set its top bytes */
+    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
+    fetch_src_pixblock /* helper macro defined outside this chunk */
+    vorr     q0, q0, q2 /* same two ORs as the head macro, applied to the freshly fetched block */
+    vorr     q1, q1, q2
+    cache_preload 8, 8
+.endm
+
+.macro pixman_composite_src_x888_8888_init /* build the constant 0xFF000000 in every 32-bit lane of q2 */
+    vmov.u8  q2, #0xFF /* all 16 bytes set to 0xFF */
+    vshl.u32 q2, q2, #24 /* keep only the top byte of each 32-bit lane */
+.endm
+
+generate_composite_function \
+    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, /* presumably src, mask, dst bpp (0 = not fetched) - confirm in the macro definition */ \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    pixman_composite_src_x888_8888_init, \
+    default_cleanup, \
+    pixman_composite_src_x888_8888_process_pixblock_head, \
+    pixman_composite_src_x888_8888_process_pixblock_tail, \
+    pixman_composite_src_x888_8888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_head /* solid source IN 8-bit mask, then start OVER against the destination */
+    /* expecting deinterleaved source data in {d8, d9, d10, d11} */
+    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
+    /* and destination data in {d4, d5, d6, d7} */
+    /* mask is in d24 (d25, d26, d27 are unused) */
+
+    /* in */
+    vmull.u8    q6, d24, d8
+    vmull.u8    q7, d24, d9
+    vmull.u8    q8, d24, d10
+    vmull.u8    q9, d24, d11
+    vrshr.u16   q10, q6, #8 /* vrshr + vraddhn: rounded division of the 16-bit products by 255 */
+    vrshr.u16   q11, q7, #8
+    vrshr.u16   q12, q8, #8 /* q12 (d24/d25) is scratch here: the mask was already consumed by the vmulls above */
+    vrshr.u16   q13, q9, #8
+    vraddhn.u16 d0, q6, q10
+    vraddhn.u16 d1, q7, q11
+    vraddhn.u16 d2, q8, q12
+    vraddhn.u16 d3, q9, q13
+    vmvn.8      d25, d3  /* get inverted alpha */
+    /* source:      d0 - blue, d1 - green, d2 - red, d3 - alpha */
+    /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
+    /* now do alpha blending */
+    vmull.u8    q8, d25, d4
+    vmull.u8    q9, d25, d5
+    vmull.u8    q10, d25, d6
+    vmull.u8    q11, d25, d7
+.endm
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_tail /* scale q8..q11 back to 8 bits and add the masked source; result in d28-d31 */
+    vrshr.u16   q14, q8, #8
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q6, q10, #8 /* q6/q7 are scratch */
+    vrshr.u16   q7, q11, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q6, q10
+    vraddhn.u16 d31, q7, q11
+    vqadd.u8    q14, q0, q14 /* saturating add of the masked source in d0-d3 */
+    vqadd.u8    q15, q1, q15
+.endm
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head /* pipelined step: tail of the previous block interleaved with loads, prefetch and head of the next */
+        vrshr.u16   q14, q8, #8 /* deeper-indented NEON lines repeat the tail sequence for the previous block */
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]! /* next destination block, deinterleaved per channel */
+        vrshr.u16   q15, q9, #8
+    fetch_mask_pixblock /* helper macro defined outside this chunk */
+        vrshr.u16   q6, q10, #8
+                                    PF add PF_X, PF_X, #8
+        vrshr.u16   q7, q11, #8
+                                    PF tst PF_CTL, #0x0F
+        vraddhn.u16 d28, q14, q8
+                                    PF addne PF_X, PF_X, #8
+        vraddhn.u16 d29, q15, q9
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d30, q6, q10
+                                    PF cmp PF_X, ORIG_W
+        vraddhn.u16 d31, q7, q11
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+    vmull.u8    q6, d24, d8 /* head work for the next block starts once q6-q9 are free */
+                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+    vmull.u8    q7, d24, d9
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q8, d24, d10
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q9, d24, d11
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+        vqadd.u8    q14, q0, q14 /* must precede the vraddhn writes to d0-d3 below */
+                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+        vqadd.u8    q15, q1, q15
+    vrshr.u16   q10, q6, #8
+    vrshr.u16   q11, q7, #8
+    vrshr.u16   q12, q8, #8
+    vrshr.u16   q13, q9, #8
+    vraddhn.u16 d0, q6, q10
+    vraddhn.u16 d1, q7, q11
+    vraddhn.u16 d2, q8, q12
+    vraddhn.u16 d3, q9, q13
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]! /* store the finished previous block, re-interleaved */
+    vmvn.8      d25, d3
+    vmull.u8    q8, d25, d4
+    vmull.u8    q9, d25, d5
+    vmull.u8    q10, d25, d6
+    vmull.u8    q11, d25, d7
+.endm /* PF-prefixed lines are prefetch bookkeeping for destination and mask; the PF macro is defined outside this chunk */
+
+.macro pixman_composite_over_n_8_8888_init /* save d8-d15, then splat the four bytes of the solid colour into d8..d11 */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET /* address is taken before vpush moves sp */
+    vpush       {d8-d15}
+    vld1.32     {d11[0]}, [DUMMY]
+    vdup.8      d8, d11[0]
+    vdup.8      d9, d11[1]
+    vdup.8      d10, d11[2]
+    vdup.8      d11, d11[3] /* done last because it overwrites the register holding the loaded word */
+.endm
+
+.macro pixman_composite_over_n_8_8888_cleanup /* undo the vpush done by pixman_composite_over_n_8_8888_init */
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, /* presumably src, mask, dst bpp (0 = not fetched) - confirm in the macro definition */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8_8888_init, \
+    pixman_composite_over_n_8_8888_cleanup, \
+    pixman_composite_over_n_8_8888_process_pixblock_head, \
+    pixman_composite_over_n_8_8888_process_pixblock_tail, \
+    pixman_composite_over_n_8_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8_8_process_pixblock_head /* d0-d3 = mask (d24-d27) scaled by d8; then start the multiply of destination d4-d7 by the inverted result */
+    vmull.u8    q0,  d24, d8 /* d8 holds the splatted byte set up by pixman_composite_over_n_8_8_init */
+    vmull.u8    q1,  d25, d8
+    vmull.u8    q6,  d26, d8
+    vmull.u8    q7,  d27, d8
+    vrshr.u16   q10, q0,  #8 /* vrshr + vraddhn: rounded division of the 16-bit products by 255 */
+    vrshr.u16   q11, q1,  #8
+    vrshr.u16   q12, q6,  #8 /* q12/q13 (d24-d27) are scratch here: the mask was consumed by the vmulls above */
+    vrshr.u16   q13, q7,  #8
+    vraddhn.u16 d0,  q0,  q10
+    vraddhn.u16 d1,  q1,  q11
+    vraddhn.u16 d2,  q6,  q12
+    vraddhn.u16 d3,  q7,  q13
+    vmvn.8      q12, q0 /* d24-d27 = bitwise inverse of the scaled values in d0-d3 */
+    vmvn.8      q13, q1
+    vmull.u8    q8,  d24, d4
+    vmull.u8    q9,  d25, d5
+    vmull.u8    q10, d26, d6
+    vmull.u8    q11, d27, d7
+.endm
+
+.macro pixman_composite_over_n_8_8_process_pixblock_tail /* scale q8..q11 back to 8 bits and add d0-d3; result in d28-d31 */
+    vrshr.u16   q14, q8,  #8
+    vrshr.u16   q15, q9,  #8
+    vrshr.u16   q12, q10, #8 /* q12/q13 are scratch */
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q12, q10
+    vraddhn.u16 d31, q13, q11
+    vqadd.u8    q14, q0,  q14 /* saturating add */
+    vqadd.u8    q15, q1,  q15
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_n_8_8_process_pixblock_tail_head /* unscheduled form: tail and head macros are simply invoked back to back */
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]! /* next 32 destination bytes; the tail macro does not read or write d4-d7 */
+    pixman_composite_over_n_8_8_process_pixblock_tail
+    fetch_mask_pixblock /* fetch_mask_pixblock and cache_preload are helper macros defined outside this chunk */
+    cache_preload 32, 32
+    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]! /* store the previous block's result produced by the tail macro */
+    pixman_composite_over_n_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_n_8_8_init /* save d8-d15 and splat byte 3 of the 32-bit stack argument across d8 */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET /* address is taken before vpush moves sp */
+    vpush       {d8-d15}
+    vld1.32     {d8[0]}, [DUMMY]
+    vdup.8      d8, d8[3] /* top byte of the loaded word (presumably alpha) replicated into all 8 lanes */
+.endm
+
+.macro pixman_composite_over_n_8_8_cleanup /* undo the vpush done by pixman_composite_over_n_8_8_init */
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, /* presumably src, mask, dst bpp (0 = not fetched) - confirm in the macro definition */ \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8_8_init, \
+    pixman_composite_over_n_8_8_cleanup, \
+    pixman_composite_over_n_8_8_process_pixblock_head, \
+    pixman_composite_over_n_8_8_process_pixblock_tail, \
+    pixman_composite_over_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head /* component-alpha OVER, first half: combine source with mask, then start the destination multiplies */
+    /*
+     * 'combine_mask_ca' replacement
+     *
+     * input:  solid src (n) in {d8,  d9,  d10, d11}
+     *         dest in          {d4,  d5,  d6,  d7 }
+     *         mask in          {d24, d25, d26, d27}
+     * output: updated src in   {d0,  d1,  d2,  d3 }
+     *         updated mask in  {d24, d25, d26, d3 }
+     */
+    vmull.u8    q0,  d24, d8
+    vmull.u8    q1,  d25, d9
+    vmull.u8    q6,  d26, d10
+    vmull.u8    q7,  d27, d11
+    vmull.u8    q9,  d11, d25 /* three mask channels times d11, the fourth (alpha) byte of the solid source */
+    vmull.u8    q12, d11, d24
+    vmull.u8    q13, d11, d26
+    vrshr.u16   q8,  q0,  #8 /* vrshr + vraddhn: rounded division of the 16-bit products by 255 */
+    vrshr.u16   q10, q1,  #8
+    vrshr.u16   q11, q6,  #8
+    vraddhn.u16 d0,  q0,  q8
+    vraddhn.u16 d1,  q1,  q10
+    vraddhn.u16 d2,  q6,  q11
+    vrshr.u16   q11, q12, #8
+    vrshr.u16   q8,  q9,  #8
+    vrshr.u16   q6,  q13, #8
+    vrshr.u16   q10, q7,  #8
+    vraddhn.u16 d24, q12, q11
+    vraddhn.u16 d25, q9,  q8
+    vraddhn.u16 d26, q13, q6
+    vraddhn.u16 d3,  q7,  q10
+    /*
+     * 'combine_over_ca' replacement
+     *
+     * output: updated dest in {d28, d29, d30, d31}
+     */
+    vmvn.8      q12, q12 /* inverts d24 and d25 in one instruction */
+    vmvn.8      d26, d26
+    vmull.u8    q8,  d24, d4
+    vmull.u8    q9,  d25, d5
+    vmvn.8      d27, d3 /* fourth multiplier is the inverse of the updated d3 */
+    vmull.u8    q10, d26, d6
+    vmull.u8    q11, d27, d7
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail /* scale q8..q11 back to 8 bits and add the updated source; result in d28-d31 */
+    /* ... continue 'combine_over_ca' replacement */
+    vrshr.u16   q14, q8,  #8
+    vrshr.u16   q15, q9,  #8
+    vrshr.u16   q6,  q10, #8 /* q6/q7 are scratch */
+    vrshr.u16   q7,  q11, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q6,  q10
+    vraddhn.u16 d31, q7,  q11
+    vqadd.u8    q14, q0,  q14 /* saturating add of the updated source in d0-d3 */
+    vqadd.u8    q15, q1,  q15
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head /* tail of the previous block with loads mixed in, then the head macro invoked whole */
+        vrshr.u16   q14, q8, #8 /* deeper-indented lines repeat the tail sequence for the previous block */
+        vrshr.u16   q15, q9, #8
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]! /* next destination block, deinterleaved per channel */
+        vrshr.u16   q6, q10, #8
+        vrshr.u16   q7, q11, #8
+        vraddhn.u16 d28, q14, q8
+        vraddhn.u16 d29, q15, q9
+        vraddhn.u16 d30, q6, q10
+        vraddhn.u16 d31, q7, q11
+    fetch_mask_pixblock /* fetch_mask_pixblock and cache_preload are helper macros defined outside this chunk */
+        vqadd.u8    q14, q0, q14
+        vqadd.u8    q15, q1, q15
+    cache_preload 8, 8
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_head /* the head macro does not write d28-d31, so the store can follow it */
+    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_init /* save d8-d15, then splat the four bytes of the solid colour into d8..d11 */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET /* address is taken before vpush moves sp */
+    vpush       {d8-d15}
+    vld1.32     {d11[0]}, [DUMMY]
+    vdup.8      d8, d11[0]
+    vdup.8      d9, d11[1]
+    vdup.8      d10, d11[2]
+    vdup.8      d11, d11[3] /* done last because it overwrites the register holding the loaded word */
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_cleanup /* undo the vpush done by pixman_composite_over_n_8888_8888_ca_init */
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, /* presumably src, mask, dst bpp (0 = not fetched) - confirm in the macro definition */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8888_8888_ca_init, \
+    pixman_composite_over_n_8888_8888_ca_cleanup, \
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head /* component-alpha OVER onto r5g6b5, first half: combine source with mask, unpack destination, start multiplies */
+    /*
+     * 'combine_mask_ca' replacement
+     *
+     * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
+     *         mask in          {d24, d25, d26}       [B, G, R]
+     * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
+     *         updated mask in  {d24, d25, d26}       [B, G, R]
+     */
+    vmull.u8    q0,  d24, d8
+    vmull.u8    q1,  d25, d9
+    vmull.u8    q6,  d26, d10
+    vmull.u8    q9,  d11, d25 /* mask channels times source alpha (d11) */
+    vmull.u8    q12, d11, d24
+    vmull.u8    q13, d11, d26
+    vrshr.u16   q8,  q0,  #8 /* vrshr + vraddhn: rounded division of the 16-bit products by 255 */
+    vrshr.u16   q10, q1,  #8
+    vrshr.u16   q11, q6,  #8
+    vraddhn.u16 d0,  q0,  q8
+    vraddhn.u16 d1,  q1,  q10
+    vraddhn.u16 d2,  q6,  q11
+    vrshr.u16   q11, q12, #8
+    vrshr.u16   q8,  q9,  #8
+    vrshr.u16   q6,  q13, #8
+    vraddhn.u16 d24, q12, q11
+    vraddhn.u16 d25, q9,  q8
+    /*
+     * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+     * and put data into d16 - blue, d17 - green, d18 - red
+     */
+       vshrn.u16   d17, q2,  #3
+       vshrn.u16   d18, q2,  #8
+    vraddhn.u16 d26, q13, q6 /* last step of the mask update, interleaved with the destination unpack */
+       vsli.u16    q2,  q2,  #5
+       vsri.u8     d18, d18, #5
+       vsri.u8     d17, d17, #6
+    /*
+     * 'combine_over_ca' replacement
+     *
+     * output: updated dest in d16 - blue, d17 - green, d18 - red
+     */
+    vmvn.8      q12, q12 /* inverts d24 and d25 in one instruction */
+       vshrn.u16   d16, q2,  #2
+    vmvn.8      d26, d26
+    vmull.u8    q6,  d16, d24
+    vmull.u8    q7,  d17, d25
+    vmull.u8    q11, d18, d26
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail /* finish the blend and repack the three channels into r5g6b5 in q14 */
+    /* ... continue 'combine_over_ca' replacement */
+    vrshr.u16   q10, q6,  #8
+    vrshr.u16   q14, q7,  #8
+    vrshr.u16   q15, q11, #8
+    vraddhn.u16 d16, q10, q6
+    vraddhn.u16 d17, q14, q7
+    vraddhn.u16 d18, q15, q11
+    vqadd.u8    q8,  q0,  q8 /* blue and green in one q-register saturating add */
+    vqadd.u8    d18, d2,  d18 /* red */
+    /*
+     * convert the results in d16, d17, d18 to r5g6b5 and store
+     * them into {d28, d29}
+     */
+    vshll.u8    q14, d18, #8
+    vshll.u8    q10, d17, #8
+    vshll.u8    q15, d16, #8
+    vsri.u16    q14, q10, #5 /* shift-right-insert packs green, then blue, below red */
+    vsri.u16    q14, q15, #11
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head /* pipelined step: tail of the previous block interleaved with head of the next; indentation depth marks the stage */
+    fetch_mask_pixblock /* fetch_mask_pixblock and cache_preload are helper macros defined outside this chunk */
+        vrshr.u16   q10, q6, #8
+        vrshr.u16   q14, q7, #8
+    vld1.16     {d4, d5}, [DST_R, :128]! /* next 8 destination pixels (16 bits each) into q2 */
+        vrshr.u16   q15, q11, #8
+        vraddhn.u16 d16, q10, q6
+        vraddhn.u16 d17, q14, q7
+        vraddhn.u16 d22, q15, q11 /* red goes to d22 here, where the standalone tail macro uses d18 */
+            /* process_pixblock_head */
+            /*
+             * 'combine_mask_ca' replacement
+             *
+             * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
+             *         mask in          {d24, d25, d26}       [B, G, R]
+             * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
+             *         updated mask in  {d24, d25, d26}       [B, G, R]
+             */
+            vmull.u8    q6,  d26, d10
+        vqadd.u8    q8,  q0, q8
+            vmull.u8    q0,  d24, d8 /* q0 may be overwritten: it was consumed by the vqadd just above */
+        vqadd.u8    d22, d2, d22
+            vmull.u8    q1,  d25, d9 /* q1 may be overwritten: d2 was consumed by the vqadd just above */
+        /*
+         * convert the result in d16, d17, d22 to r5g6b5 and store
+         * it into {d28, d29}
+         */
+        vshll.u8    q14, d22, #8
+        vshll.u8    q10, d17, #8
+        vshll.u8    q15, d16, #8
+            vmull.u8    q9,  d11, d25
+        vsri.u16    q14, q10, #5
+            vmull.u8    q12, d11, d24
+            vmull.u8    q13, d11, d26
+        vsri.u16    q14, q15, #11
+    cache_preload 8, 8
+            vrshr.u16   q8,  q0,  #8
+            vrshr.u16   q10, q1,  #8
+            vrshr.u16   q11, q6,  #8
+            vraddhn.u16 d0,  q0,  q8
+            vraddhn.u16 d1,  q1,  q10
+            vraddhn.u16 d2,  q6,  q11
+            vrshr.u16   q11, q12, #8
+            vrshr.u16   q8,  q9,  #8
+            vrshr.u16   q6,  q13, #8
+            vraddhn.u16 d24, q12, q11
+            vraddhn.u16 d25, q9,  q8
+                /*
+                 * convert 8 r5g6b5 pixel data from {d4, d5} to planar
+	         * 8-bit format and put data into d16 - blue, d17 - green,
+	         * d18 - red
+                 */
+                vshrn.u16   d17, q2,  #3
+                vshrn.u16   d18, q2,  #8
+            vraddhn.u16 d26, q13, q6
+                vsli.u16    q2,  q2,  #5
+                vsri.u8     d17, d17, #6
+                vsri.u8     d18, d18, #5
+            /*
+             * 'combine_over_ca' replacement
+             *
+             * output: updated dest in d16 - blue, d17 - green, d18 - red
+             */
+            vmvn.8      q12, q12
+                vshrn.u16   d16, q2,  #2
+            vmvn.8      d26, d26
+            vmull.u8    q7,  d17, d25
+            vmull.u8    q6,  d16, d24
+            vmull.u8    q11, d18, d26
+    vst1.16     {d28, d29}, [DST_W, :128]! /* store the previous block's packed result; q14 is not written after the vsri above */
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_init /* save d8-d15, then splat the four bytes of the solid colour into d8..d11 */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET /* address is taken before vpush moves sp */
+    vpush       {d8-d15}
+    vld1.32     {d11[0]}, [DUMMY]
+    vdup.8      d8, d11[0]
+    vdup.8      d9, d11[1]
+    vdup.8      d10, d11[2]
+    vdup.8      d11, d11[3] /* done last because it overwrites the register holding the loaded word */
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_cleanup /* undo the vpush done by pixman_composite_over_n_8888_0565_ca_init */
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, /* presumably src, mask, dst bpp (0 = not fetched) - confirm in the macro definition */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8888_0565_ca_init, \
+    pixman_composite_over_n_8888_0565_ca_cleanup, \
+    pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
+    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
+    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_in_n_8_process_pixblock_head
+    /* of the source registers only d3 is used: byte 3 of the word loaded by the init macro, replicated */
+    /* and destination data in {d4, d5, d6, d7} */
+    vmull.u8    q8,  d4,  d3 /* 16-bit products dst * d3, eight pixels per q register */
+    vmull.u8    q9,  d5,  d3
+    vmull.u8    q10, d6,  d3
+    vmull.u8    q11, d7,  d3
+.endm
+/* Tail: narrow each product t to 8 bits as (t + ((t + 128) >> 8) + 128) >> 8, i.e. t / 255 rounded. */
+.macro pixman_composite_in_n_8_process_pixblock_tail
+    vrshr.u16   q14, q8,  #8 /* (t + 128) >> 8 */
+    vrshr.u16   q15, q9,  #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d28, q8,  q14 /* rounded high half of the sum; overwrites half of q14, which is no longer needed */
+    vraddhn.u16 d29, q9,  q15
+    vraddhn.u16 d30, q10, q12
+    vraddhn.u16 d31, q11, q13
+.endm
+/* Tail_head: finish the previous block, load the next 32 destination bytes, start them, store the finished block. */
+.macro pixman_composite_in_n_8_process_pixblock_tail_head
+    pixman_composite_in_n_8_process_pixblock_tail
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
+    cache_preload 32, 32
+    pixman_composite_in_n_8_process_pixblock_head /* writes q8-q11 only, so d28-d31 still hold the finished block */
+    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+/* Init: load one 32-bit word from the stack and replicate its byte 3 across d3. */
+.macro pixman_composite_in_n_8_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d3[0]}, [DUMMY] /* presumably the solid source colour - confirm against the caller */
+    vdup.8      d3, d3[3]        /* presumably the alpha byte of an a8r8g8b8 value - confirm */
+.endm
+/* Cleanup: nothing to undo, init pushes no registers. */
+.macro pixman_composite_in_n_8_cleanup
+.endm
+/* Instantiate the function; 32 one-byte pixels per block. */
+generate_composite_function \
+    pixman_composite_in_n_8_asm_neon, 0, 0, 8, /* presumably src, mask, dst bits per pixel (0 = solid/none) - confirm */ \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_in_n_8_init, \
+    pixman_composite_in_n_8_cleanup, \
+    pixman_composite_in_n_8_process_pixblock_head, \
+    pixman_composite_in_n_8_process_pixblock_tail, \
+    pixman_composite_in_n_8_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+.macro pixman_composite_add_n_8_8_process_pixblock_head
+    /* solid source: only d11 is used here; the init macro fills d11 and leaves d8-d10 unset */
+    /* d11 - byte 3 of the loaded source word, replicated (presumably alpha - confirm) */
+    /* and destination data in {d4, d5, d6, d7} */
+    /* mask is in d24, d25, d26, d27 */
+    vmull.u8    q0, d24, d11 /* 16-bit products mask * d11 */
+    vmull.u8    q1, d25, d11
+    vmull.u8    q6, d26, d11
+    vmull.u8    q7, d27, d11
+    vrshr.u16   q10, q0, #8 /* (t + 128) >> 8 for each product t */
+    vrshr.u16   q11, q1, #8
+    vrshr.u16   q12, q6, #8 /* q12/q13 alias the mask registers d24-d27; all four vmull have already read them */
+    vrshr.u16   q13, q7, #8
+    vraddhn.u16 d0, q0, q10 /* (t + ((t + 128) >> 8) + 128) >> 8, i.e. t / 255 rounded */
+    vraddhn.u16 d1, q1, q11
+    vraddhn.u16 d2, q6, q12
+    vraddhn.u16 d3, q7, q13
+    vqadd.u8    q14, q0, q2 /* saturating add of the scaled mask (d0-d3) to the destination (d4-d7) */
+    vqadd.u8    q15, q1, q3
+.endm
+/* Tail is empty: the head macro already leaves the finished block in q14/q15. */
+.macro pixman_composite_add_n_8_8_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
+    pixman_composite_add_n_8_8_process_pixblock_tail
+    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]! /* store the finished block before head overwrites q14/q15 */
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
+    fetch_mask_pixblock
+    cache_preload 32, 32
+    pixman_composite_add_n_8_8_process_pixblock_head
+.endm
+/* Init: save d8-d15, then load one 32-bit word from the stack and replicate its byte 3 across d11. */
+.macro pixman_composite_add_n_8_8_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET /* taken before vpush moves sp */
+    vpush       {d8-d15}
+    vld1.32     {d11[0]}, [DUMMY] /* presumably the solid source colour - confirm against the caller */
+    vdup.8      d11, d11[3]
+.endm
+/* Cleanup: restore the registers saved by init. */
+.macro pixman_composite_add_n_8_8_cleanup
+    vpop        {d8-d15}
+.endm
+/* Instantiate the function; 32 one-byte pixels per block. */
+generate_composite_function \
+    pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, /* presumably src, mask, dst bits per pixel (0 = solid) - confirm */ \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_add_n_8_8_init, \
+    pixman_composite_add_n_8_8_cleanup, \
+    pixman_composite_add_n_8_8_process_pixblock_head, \
+    pixman_composite_add_n_8_8_process_pixblock_tail, \
+    pixman_composite_add_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8_8_8_process_pixblock_head
+    /* expecting source data in {d0, d1, d2, d3} */
+    /* destination data in {d4, d5, d6, d7} */
+    /* mask in {d24, d25, d26, d27} */
+    vmull.u8    q8, d24, d0 /* 16-bit products mask * source, per pixel */
+    vmull.u8    q9, d25, d1
+    vmull.u8    q10, d26, d2
+    vmull.u8    q11, d27, d3
+    vrshr.u16   q0, q8, #8 /* (t + 128) >> 8; q0/q1 alias d0-d3, whose source bytes were consumed above */
+    vrshr.u16   q1, q9, #8
+    vrshr.u16   q12, q10, #8 /* q12/q13 alias the mask registers, also already consumed */
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d0, q0, q8 /* (t + ((t + 128) >> 8) + 128) >> 8, i.e. t / 255 rounded */
+    vraddhn.u16 d1, q1, q9
+    vraddhn.u16 d2, q12, q10
+    vraddhn.u16 d3, q13, q11
+    vqadd.u8    q14, q0, q2 /* saturating add of the masked source (d0-d3) to the destination (d4-d7) */
+    vqadd.u8    q15, q1, q3
+.endm
+/* Tail is empty: the head macro already leaves the finished block in q14/q15. */
+.macro pixman_composite_add_8_8_8_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
+    pixman_composite_add_8_8_8_process_pixblock_tail
+    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]! /* store the finished block before head overwrites q14/q15 */
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
+    fetch_mask_pixblock
+    fetch_src_pixblock
+    cache_preload 32, 32
+    pixman_composite_add_8_8_8_process_pixblock_head
+.endm
+/* Init: empty, no constants to set up. */
+.macro pixman_composite_add_8_8_8_init
+.endm
+/* Cleanup: empty, init saves nothing. */
+.macro pixman_composite_add_8_8_8_cleanup
+.endm
+/* Instantiate the function; 32 one-byte pixels per block. */
+generate_composite_function \
+    pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, /* presumably src, mask, dst bits per pixel - confirm */ \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_add_8_8_8_init, \
+    pixman_composite_add_8_8_8_cleanup, \
+    pixman_composite_add_8_8_8_process_pixblock_head, \
+    pixman_composite_add_8_8_8_process_pixblock_tail, \
+    pixman_composite_add_8_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
+    /* expecting source data in {d0, d1, d2, d3} */
+    /* destination data in {d4, d5, d6, d7} */
+    /* mask in {d24, d25, d26, d27}; only d27 is used as the multiplier */
+    vmull.u8    q8,  d27, d0 /* 16-bit products d27 * source plane */
+    vmull.u8    q9,  d27, d1
+    vmull.u8    q10, d27, d2
+    vmull.u8    q11, d27, d3
+    /* 1 cycle bubble */
+    vrsra.u16   q8,  q8,  #8 /* t += (t + 128) >> 8; the tail's rounding narrow then gives t / 255 rounded */
+    vrsra.u16   q9,  q9,  #8
+    vrsra.u16   q10, q10, #8
+    vrsra.u16   q11, q11, #8
+.endm
+/* Tail: narrow the products to 8 bits and add them to the destination with saturation. */
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
+    /* 2 cycle bubble */
+    vrshrn.u16  d28, q8,  #8 /* (t + 128) >> 8, narrowed to 8 bits */
+    vrshrn.u16  d29, q9,  #8
+    vrshrn.u16  d30, q10, #8
+    vrshrn.u16  d31, q11, #8
+    vqadd.u8    q14, q2,  q14 /* destination d4-d7 + masked source, saturating */
+    /* 1 cycle bubble */
+    vqadd.u8    q15, q3,  q15
+.endm
+/* Tail_head: tail of the previous block (extra-indented lines) interleaved by hand with the head of the next one. */
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+    fetch_src_pixblock
+        vrshrn.u16  d28, q8,  #8
+    fetch_mask_pixblock
+        vrshrn.u16  d29, q9,  #8
+    vmull.u8    q8,  d27, d0 /* q8 may be reused: its previous value was narrowed into d28 above */
+        vrshrn.u16  d30, q10, #8
+    vmull.u8    q9,  d27, d1
+        vrshrn.u16  d31, q11, #8
+    vmull.u8    q10, d27, d2
+        vqadd.u8    q14, q2,  q14
+    vmull.u8    q11, d27, d3
+        vqadd.u8    q15, q3,  q15
+    vrsra.u16   q8,  q8,  #8
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]! /* next destination block, loaded after both vqadd consumed the old one */
+    vrsra.u16   q9,  q9,  #8
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+    vrsra.u16   q10, q10, #8
+
+    cache_preload 8, 8
+
+    vrsra.u16   q11, q11, #8
+.endm
+/* Full 2D variant with prefetch. */
+generate_composite_function \
+    pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, /* presumably src, mask, dst bits per pixel - confirm */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+/* Single-scanline variant built from the same pixblock macros; this call takes no prefetch distance. */
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+generate_composite_function \
+    pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, /* reuses the add_8888_8888_8888 pixblock macros; presumably src, mask, dst bpp - confirm */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    27  /* mask_basereg; d27 is the register the shared head macro multiplies by */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_n_8_8888_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d3[0]}, [DUMMY] /* load one 32-bit word (presumably the solid source colour - confirm) */
+    vdup.8      d0, d3[0]        /* replicate byte 0 into d0, so d0-d3 look like a planar source block */
+    vdup.8      d1, d3[1]
+    vdup.8      d2, d3[2]
+    vdup.8      d3, d3[3]        /* last, because d3 held the loaded word */
+.endm
+/* Cleanup: empty, init saves nothing. */
+.macro pixman_composite_add_n_8_8888_cleanup
+.endm
+/* Instantiate the function on top of the add_8888_8888_8888 pixblock macros, with the constant source in d0-d3. */
+generate_composite_function \
+    pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, /* presumably src, mask, dst bits per pixel (0 = solid) - confirm */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_add_n_8_8888_init, \
+    pixman_composite_add_n_8_8888_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    27  /* mask_basereg; d27 is the register the shared head macro multiplies by */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_n_8888_init
+    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8) /* stack slot 8 bytes past ARGS_STACK_OFFSET (presumably the solid mask - confirm) */
+    vld1.32     {d27[0]}, [DUMMY]
+    vdup.8      d27, d27[3] /* replicate byte 3 of the loaded word; d27 is the multiplier used by the shared head macro */
+.endm
+/* Cleanup: empty, init saves nothing. */
+.macro pixman_composite_add_8888_n_8888_cleanup
+.endm
+/* Instantiate the function on top of the add_8888_8888_8888 pixblock macros, with the constant mask in d27. */
+generate_composite_function \
+    pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, /* presumably src, mask, dst bits per pixel (0 = solid) - confirm */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_add_8888_n_8888_init, \
+    pixman_composite_add_8888_n_8888_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    27  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+    /* expecting source data in {d0, d1, d2, d3} */
+    /* destination data in {d4, d5, d6, d7} */
+    /* solid mask is in d15 */
+    /* note: q4-q6 (d8-d13) are used as scratch below */
+    /* 'in' */
+    vmull.u8    q8, d15, d3 /* 16-bit products mask * source plane */
+    vmull.u8    q6, d15, d2
+    vmull.u8    q5, d15, d1
+    vmull.u8    q4, d15, d0
+    vrshr.u16   q13, q8, #8 /* (t + 128) >> 8 */
+    vrshr.u16   q12, q6, #8
+    vrshr.u16   q11, q5, #8
+    vrshr.u16   q10, q4, #8
+    vraddhn.u16 d3, q8, q13 /* (t + ((t + 128) >> 8) + 128) >> 8, i.e. t / 255 rounded; source now masked in place */
+    vraddhn.u16 d2, q6, q12
+    vraddhn.u16 d1, q5, q11
+    vraddhn.u16 d0, q4, q10
+    vmvn.8      d24, d3  /* get inverted alpha */
+    /* now do alpha blending */
+    vmull.u8    q8, d24, d4 /* 16-bit products (255 - masked alpha) * destination plane */
+    vmull.u8    q9, d24, d5
+    vmull.u8    q10, d24, d6
+    vmull.u8    q11, d24, d7
+.endm
+/* Tail: narrow the destination products to 8 bits (t / 255 rounded) into d28-d31. */
+.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+    vrshr.u16   q14, q8, #8
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d28, q14, q8 /* writes d28, half of q14; the instruction reads q14 before writing */
+    vraddhn.u16 d29, q15, q9 /* q14 is no longer needed, so d29 may be overwritten */
+    vraddhn.u16 d30, q12, q10
+    vraddhn.u16 d31, q13, q11
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
+    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]! /* next destination block; the pending products are already in q8-q11 */
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+    fetch_src_pixblock
+    cache_preload 8, 8
+    fetch_mask_pixblock /* per-pixel mask variant: reload the mask for every block */
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]! /* head does not write q14/q15, so the finished block is intact */
+.endm
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, /* presumably src, mask, dst bits per pixel - confirm */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, /* explicit comma: do not rely on whitespace to separate macro arguments */ \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    12  /* mask_basereg; the mask block ends in d15, which the head macro uses as the multiplier */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_n_8888_process_pixblock_head
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head /* masks the source in d0-d3 and starts dst * (255 - alpha) */
+.endm
+/* Tail: the out_reverse tail leaves the scaled destination in d28-d31; adding the masked source completes OVER. */
+.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+    vqadd.u8    q14, q0, q14 /* saturating add of the masked source (d0-d3) */
+    vqadd.u8    q15, q1, q15
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
+    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]! /* next destination block; the pending products are already in q8-q11 */
+    pixman_composite_over_8888_n_8888_process_pixblock_tail
+    fetch_src_pixblock /* after the tail, which still needs the previous masked source in q0/q1 */
+    cache_preload 8, 8
+    pixman_composite_over_8888_n_8888_process_pixblock_head
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]! /* head does not write q14/q15, so the finished block is intact */
+.endm
+
+.macro pixman_composite_over_8888_n_8888_init /* load the solid mask and replicate its byte 3 across d15 */
+    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8) /* was the literal 48; same expression the other *_n_* init macros use (assumes ARGS_STACK_OFFSET is 40 - confirm) */
+    vpush       {d8-d15}          /* save d8-d15 after the address is taken; the cleanup macro restores them */
+    vld1.32     {d15[0]}, [DUMMY] /* one 32-bit word (presumably the solid mask colour - confirm) */
+    vdup.8      d15, d15[3]       /* byte 3 replicated; d15 is the multiplier used by the head macro */
+.endm
+
+.macro pixman_composite_over_8888_n_8888_cleanup
+    vpop        {d8-d15} /* restore the registers saved by the init macro */
+.endm
+/* Instantiate the function; the mask is constant and kept in d15 by the init macro. */
+generate_composite_function \
+    pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, /* presumably src, mask, dst bits per pixel (0 = solid) - confirm */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_8888_n_8888_init, \
+    pixman_composite_over_8888_n_8888_cleanup, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+/* Same as the over_8888_n_8888 tail_head plus a per-block mask fetch. TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
+    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]! /* next destination block */
+    pixman_composite_over_8888_n_8888_process_pixblock_tail
+    fetch_src_pixblock
+    cache_preload 8, 8
+    fetch_mask_pixblock /* reload the mask for every block */
+    pixman_composite_over_8888_n_8888_process_pixblock_head
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]! /* store the block finished by the tail above */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, /* presumably src, mask, dst bits per pixel - confirm */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, /* explicit comma: do not rely on whitespace to separate macro arguments */ \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    12  /* mask_basereg; the mask block ends in d15, which the head macro uses as the multiplier */
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, /* single-scanline variant; presumably src, mask, dst bits per pixel - confirm */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, /* explicit comma: do not rely on whitespace to separate macro arguments */ \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    12  /* mask_basereg; the mask block ends in d15, which the head macro uses as the multiplier */
+
+/******************************************************************************/
+
+/* Same as the over_8888_n_8888 tail_head plus a per-block mask fetch. TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
+    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]! /* next destination block */
+    pixman_composite_over_8888_n_8888_process_pixblock_tail
+    fetch_src_pixblock
+    cache_preload 8, 8
+    fetch_mask_pixblock /* reload the mask for every block */
+    pixman_composite_over_8888_n_8888_process_pixblock_head
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]! /* store the block finished by the tail above */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, /* presumably src, mask, dst bits per pixel - confirm */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8_8888_process_pixblock_tail_head, /* explicit comma: do not rely on whitespace to separate macro arguments */ \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    15  /* mask_basereg; d15 is the register the head macro uses as the multiplier */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_0888_process_pixblock_head /* plain copy: nothing to compute */
+.endm
+
+.macro pixman_composite_src_0888_0888_process_pixblock_tail /* plain copy: nothing to compute */
+.endm
+/* Tail_head: store the block fetched earlier, then fetch the next one. */
+.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
+    vst3.8 {d0, d1, d2}, [DST_W]! /* three interleaved byte planes, no alignment hint */
+    fetch_src_pixblock
+    cache_preload 8, 8
+.endm
+/* Instantiate the function; every base register is 0, so pixels stay in d0-d2 from load to store. */
+generate_composite_function \
+    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, /* presumably src, mask, dst bits per pixel (0 = no mask) - confirm */ \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0888_0888_process_pixblock_head, \
+    pixman_composite_src_0888_0888_process_pixblock_tail, \
+    pixman_composite_src_0888_0888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
+    vswp   d0, d2 /* exchange the first and third byte planes of the source block */
+.endm
+/* Tail: empty, the swap in head is the whole computation. */
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
+.endm
+/* Tail_head: store the swapped block together with the zero plane d3, then fetch and swap the next block. */
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
+    vst4.8 {d0, d1, d2, d3}, [DST_W]!
+    fetch_src_pixblock
+    vswp   d0, d2
+    cache_preload 8, 8
+.endm
+/* Init: clear d3 once; no macro in this group writes it afterwards, so the fourth stored byte of each pixel is 0. */
+.macro pixman_composite_src_0888_8888_rev_init
+    veor   d3, d3, d3
+.endm
+/* Instantiate the function; there is no dedicated cleanup macro, default_cleanup is used. */
+generate_composite_function \
+    pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, /* presumably src, mask, dst bits per pixel (0 = no mask) - confirm */ \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    pixman_composite_src_0888_8888_rev_init, \
+    default_cleanup, \
+    pixman_composite_src_0888_8888_rev_process_pixblock_head, \
+    pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
+    pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
+    vshll.u8    q8, d1, #8 /* widen plane d1 to 16 bits with the byte in the top half */
+    vshll.u8    q9, d2, #8 /* same for plane d2 */
+.endm
+/* Tail: pack to 16 bits per pixel: top 5 bits of d0, then top 6 bits of d1, then top 5 bits of d2. */
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
+    vshll.u8    q14, d0, #8
+    vsri.u16    q14, q8, #5  /* insert q8 >> 5 below the top 5 bits */
+    vsri.u16    q14, q9, #11 /* insert q9 >> 11 below the top 11 bits */
+.endm
+/* Tail_head: tail of the previous block (extra-indented lines) interleaved with the fetch and head of the next one. */
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
+        vshll.u8    q14, d0, #8 /* must read d0 before fetch_src_pixblock replaces it */
+    fetch_src_pixblock
+        vsri.u16    q14, q8, #5
+        vsri.u16    q14, q9, #11
+    vshll.u8    q8, d1, #8 /* q8 is free again: the vsri above has consumed it */
+        vst1.16 {d28, d29}, [DST_W, :128]!
+    vshll.u8    q9, d2, #8
+.endm
+/* Instantiate the function; the packed result is written from q14 (d28, d29). */
+generate_composite_function \
+    pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, /* presumably src, mask, dst bits per pixel (0 = no mask) - confirm */ \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0888_0565_rev_process_pixblock_head, \
+    pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
+    pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
+    vmull.u8    q8, d3, d0 /* 16-bit products of plane d3 (presumably alpha - confirm) with the other three planes */
+    vmull.u8    q9, d3, d1
+    vmull.u8    q10, d3, d2
+.endm
+/* Tail: narrow the products (t / 255 rounded) and write them in reversed plane order: d0 -> d30, d1 -> d29, d2 -> d28. */
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
+    vrshr.u16   q11, q8, #8
+    vswp        d3, d31 /* the unmodified d3 plane becomes output plane d31; the old d31 value lands in d3 */
+    vrshr.u16   q12, q9, #8
+    vrshr.u16   q13, q10, #8
+    vraddhn.u16 d30, q11, q8
+    vraddhn.u16 d29, q12, q9
+    vraddhn.u16 d28, q13, q10
+.endm
+/* Tail_head: the tail above (extra-indented), the next head, and PF-prefixed prefetch bookkeeping written out by hand. */
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
+        vrshr.u16   q11, q8, #8
+        vswp        d3, d31
+        vrshr.u16   q12, q9, #8
+        vrshr.u16   q13, q10, #8
+    fetch_src_pixblock /* refills d0-d3, including the d3 clobbered by the vswp above */
+        vraddhn.u16 d30, q11, q8
+                                    PF add PF_X, PF_X, #8 /* PF lines: presumably emitted only when the advanced prefetcher is enabled - confirm */
+                                    PF tst PF_CTL, #0xF
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d29, q12, q9
+        vraddhn.u16 d28, q13, q10
+    vmull.u8    q8, d3, d0
+    vmull.u8    q9, d3, d1
+    vmull.u8    q10, d3, d2
+        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+.endm
+/* Instantiate the function. */
+generate_composite_function \
+    pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, /* presumably src, mask, dst bits per pixel (0 = no mask) - confirm */ \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_pixbuf_8888_process_pixblock_head, \
+    pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
+    pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
+    vmull.u8    q8, d3, d0 /* 16-bit products of plane d3 (presumably alpha - confirm) with the other three planes */
+    vmull.u8    q9, d3, d1
+    vmull.u8    q10, d3, d2
+.endm
+/* Tail: narrow the products (t / 255 rounded) keeping plane order: d0 -> d28, d1 -> d29, d2 -> d30. */
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
+    vrshr.u16   q11, q8, #8
+    vswp        d3, d31 /* the unmodified d3 plane becomes output plane d31; the old d31 value lands in d3 */
+    vrshr.u16   q12, q9, #8
+    vrshr.u16   q13, q10, #8
+    vraddhn.u16 d28, q11, q8
+    vraddhn.u16 d29, q12, q9
+    vraddhn.u16 d30, q13, q10
+.endm
+/* Tail_head: the tail above (extra-indented), the next head, and PF-prefixed prefetch bookkeeping written out by hand. */
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
+        vrshr.u16   q11, q8, #8
+        vswp        d3, d31
+        vrshr.u16   q12, q9, #8
+        vrshr.u16   q13, q10, #8
+    fetch_src_pixblock /* refills d0-d3, including the d3 clobbered by the vswp above */
+        vraddhn.u16 d28, q11, q8
+                                    PF add PF_X, PF_X, #8 /* PF lines: presumably emitted only when the advanced prefetcher is enabled - confirm */
+                                    PF tst PF_CTL, #0xF
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d29, q12, q9
+        vraddhn.u16 d30, q13, q10
+    vmull.u8    q8, d3, d0
+    vmull.u8    q9, d3, d1
+    vmull.u8    q10, d3, d2
+        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+.endm
+/* Instantiate the function. */
+generate_composite_function \
+    pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, /* presumably src, mask, dst bits per pixel (0 = no mask) - confirm */ \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
+    pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
+    pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_0565_8_0565_process_pixblock_head
+    /* mask is in d15 */
+    convert_0565_to_x888 q4, d2, d1, d0 /* helper defined elsewhere; presumably unpacks the 16-bit source pixels in q4 - confirm */
+    convert_0565_to_x888 q5, d6, d5, d4 /* same for the destination pixels in q5 */
+    /* source pixel data is in      {d0, d1, d2, XX} */
+    /* destination pixel data is in {d4, d5, d6, XX} */
+    vmvn.8      d7,  d15      /* d7 = 255 - mask */
+    vmull.u8    q6,  d15, d2  /* source * mask; q4-q6 are free once the conversions are done */
+    vmull.u8    q5,  d15, d1
+    vmull.u8    q4,  d15, d0
+    vmull.u8    q8,  d7,  d4  /* destination * (255 - mask); narrowed later by the tail */
+    vmull.u8    q9,  d7,  d5
+    vmull.u8    q13, d7,  d6
+    vrshr.u16   q12, q6,  #8
+    vrshr.u16   q11, q5,  #8
+    vrshr.u16   q10, q4,  #8
+    vraddhn.u16 d2,  q6,  q12 /* (t + ((t + 128) >> 8) + 128) >> 8, i.e. t / 255 rounded */
+    vraddhn.u16 d1,  q5,  q11
+    vraddhn.u16 d0,  q4,  q10
+.endm
+/* Tail: narrow dst * (255 - mask), add the masked source with saturation, convert back to 16 bits per pixel. */
+.macro pixman_composite_over_0565_8_0565_process_pixblock_tail
+    vrshr.u16   q14, q8,  #8
+    vrshr.u16   q15, q9,  #8
+    vrshr.u16   q12, q13, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q12, q13
+    vqadd.u8    q0,  q0,  q14
+    vqadd.u8    q1,  q1,  q15 /* also adds d3 + d31, which are not part of the result (the XX slot) */
+    /* 32bpp result is in {d0, d1, d2, XX} */
+    convert_8888_to_0565 d2, d1, d0, q14, q15, q3 /* helper defined elsewhere; presumably leaves the packed pixels in q14 - confirm */
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
+    fetch_mask_pixblock /* may run before the tail: the tail does not read d15 */
+    pixman_composite_over_0565_8_0565_process_pixblock_tail
+    fetch_src_pixblock
+    vld1.16    {d10, d11}, [DST_R, :128]! /* next destination block into q5, where the head macro expects it */
+    cache_preload 8, 8
+    pixman_composite_over_0565_8_0565_process_pixblock_head
+    vst1.16    {d28, d29}, [DST_W, :128]! /* head does not write q14, so the finished block is intact */
+.endm
+/* Instantiate the function; q4-q7 are used as scratch, hence the need_all_regs init/cleanup. */
+generate_composite_function \
+    pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, /* presumably src, mask, dst bits per pixel - confirm */ \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_0565_8_0565_process_pixblock_head, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_0565_n_0565_init
+    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8) /* stack slot 8 bytes past ARGS_STACK_OFFSET (presumably the solid mask - confirm); taken before vpush moves sp */
+    vpush       {d8-d15}          /* save d8-d15; the cleanup macro restores them */
+    vld1.32     {d15[0]}, [DUMMY]
+    vdup.8      d15, d15[3]       /* replicate byte 3 of the loaded word across d15, the mask register of the pixblock macros */
+.endm
+/* Cleanup: restore the registers saved by init. */
+.macro pixman_composite_over_0565_n_0565_cleanup
+    vpop        {d8-d15}
+.endm
+/* Instantiate the function on top of the over_0565_8_0565 pixblock macros, with the constant mask in d15. */
+generate_composite_function \
+    pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, /* presumably src, mask, dst bits per pixel (0 = solid) - confirm */ \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_0565_n_0565_init, \
+    pixman_composite_over_0565_n_0565_cleanup, \
+    pixman_composite_over_0565_8_0565_process_pixblock_head, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10, /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_0565_8_0565_process_pixblock_head
+    /* mask is in d15 */
+    convert_0565_to_x888 q4, d2, d1, d0 /* helper defined elsewhere; presumably unpacks the 16-bit source pixels in q4 - confirm */
+    convert_0565_to_x888 q5, d6, d5, d4 /* same for the destination pixels in q5 */
+    /* source pixel data is in      {d0, d1, d2, XX} */
+    /* destination pixel data is in {d4, d5, d6, XX} */
+    vmull.u8    q6,  d15, d2  /* source * mask; q4-q6 are free once the conversions are done */
+    vmull.u8    q5,  d15, d1
+    vmull.u8    q4,  d15, d0
+    vrshr.u16   q12, q6,  #8
+    vrshr.u16   q11, q5,  #8
+    vrshr.u16   q10, q4,  #8
+    vraddhn.u16 d2,  q6,  q12 /* (t + ((t + 128) >> 8) + 128) >> 8, i.e. t / 255 rounded */
+    vraddhn.u16 d1,  q5,  q11
+    vraddhn.u16 d0,  q4,  q10
+.endm
+/* Tail: saturating add of the masked source to the unpacked destination, then convert back to 16 bits per pixel. */
+.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
+    vqadd.u8    q0,  q0,  q2
+    vqadd.u8    q1,  q1,  q3 /* also adds d3 + d7, which are not part of the result (the XX slot) */
+    /* 32bpp result is in {d0, d1, d2, XX} */
+    convert_8888_to_0565 d2, d1, d0, q14, q15, q3 /* helper defined elsewhere; presumably leaves the packed pixels in q14 - confirm */
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
+    fetch_mask_pixblock /* may run before the tail: the tail does not read d15 */
+    pixman_composite_add_0565_8_0565_process_pixblock_tail
+    fetch_src_pixblock
+    vld1.16    {d10, d11}, [DST_R, :128]! /* next destination block into q5, where the head macro expects it */
+    cache_preload 8, 8
+    pixman_composite_add_0565_8_0565_process_pixblock_head
+    vst1.16    {d28, d29}, [DST_W, :128]! /* head does not write q14, so the finished block is intact */
+.endm
+/* Instantiate the function; q4-q6 are used as scratch, hence the need_all_regs init/cleanup. */
+generate_composite_function \
+    pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, /* presumably src, mask, dst bits per pixel - confirm */ \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_add_0565_8_0565_process_pixblock_head, \
+    pixman_composite_add_0565_8_0565_process_pixblock_tail, \
+    pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10, /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
+    /* the 8-bit source is in d15 (src_basereg is 15 in the invocation below); there is no mask */
+    convert_0565_to_x888 q5, d6, d5, d4 /* helper defined elsewhere; presumably unpacks the 16-bit destination pixels in q5 - confirm */
+    /* destination pixel data is in {d4, d5, d6, xx} */
+    vmvn.8      d24, d15 /* get inverted alpha */
+    /* now do alpha blending */
+    vmull.u8    q8, d24, d4 /* 16-bit products (255 - source) * destination plane */
+    vmull.u8    q9, d24, d5
+    vmull.u8    q10, d24, d6
+.endm
+/* Tail: narrow the products (t / 255 rounded) into d0-d2 and convert back to 16 bits per pixel. */
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
+    vrshr.u16   q14, q8, #8
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q12, q10, #8
+    vraddhn.u16 d0, q14, q8
+    vraddhn.u16 d1, q15, q9
+    vraddhn.u16 d2, q12, q10
+    /* 32bpp result is in {d0, d1, d2, XX} */
+    convert_8888_to_0565 d2, d1, d0, q14, q15, q3 /* helper defined elsewhere; presumably leaves the packed pixels in q14 - confirm */
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
+    fetch_src_pixblock /* may run before the tail: the tail does not read d15 */
+    pixman_composite_out_reverse_8_0565_process_pixblock_tail
+    vld1.16    {d10, d11}, [DST_R, :128]! /* next destination block into q5, where the head macro expects it */
+    cache_preload 8, 8
+    pixman_composite_out_reverse_8_0565_process_pixblock_head
+    vst1.16    {d28, d29}, [DST_W, :128]! /* head does not write q14, so the finished block is intact */
+.endm
+/* Instantiate the function. */
+generate_composite_function \
+    pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, /* presumably src, mask, dst bits per pixel (0 = no mask) - confirm */ \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_out_reverse_8_0565_process_pixblock_head, \
+    pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
+    pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10, /* dst_r_basereg */ \
+    15, /* src_basereg   */ \
+    0   /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8_8888_process_pixblock_head
+    /* 8-bit source (alpha) is in d0 */
+    /* destination pixel data is in {d4, d5, d6, d7}, one channel per register */
+    vmvn.8      d1, d0 /* d1 = ~alpha (bitwise NOT, i.e. 255 - alpha) */
+    /* multiply every destination channel by the inverted alpha (8x8 -> 16 bit) */
+    vmull.u8    q8, d1, d4
+    vmull.u8    q9, d1, d5
+    vmull.u8    q10, d1, d6
+    vmull.u8    q11, d1, d7
+.endm
+
+.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
+    vrshr.u16   q14, q8, #8    /* t = (x + 128) >> 8 for each 16-bit product x */
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d28, q14, q8   /* (x + t + 128) >> 8: rounded approximation of x / 255 */
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q12, q10
+    vraddhn.u16 d31, q13, q11
+    /* 32bpp result is in {d28, d29, d30, d31}, still one channel per register */
+.endm
+
+/* TODO: expand the macros and improve the instruction scheduling */
+.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
+    fetch_src_pixblock
+    pixman_composite_out_reverse_8_8888_process_pixblock_tail
+    vld4.8    {d4, d5, d6, d7}, [DST_R, :128]!   /* load 8 destination pixels, de-interleaved by byte */
+    cache_preload 8, 8
+    pixman_composite_out_reverse_8_8888_process_pixblock_head
+    vst4.8    {d28, d29, d30, d31}, [DST_W, :128]!   /* store the 8 pixels finished by the tail, re-interleaved */
+.endm
+
+generate_composite_function \
+    pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, /* src, mask, dst bpp */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_out_reverse_8_8888_process_pixblock_head, \
+    pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
+    pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg: the result is stored from d28-d31 */ \
+    4, /* dst_r_basereg: the destination is loaded into d4-d7 */ \
+    0, /* src_basereg: the source alpha is read from d0 */ \
+    0   /* mask_basereg  */
+
+/******************************************************************************/
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, /* src, mask, dst bpp */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, /* src, mask, dst bpp */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_0565_process_pixblock_head, \
+    pixman_composite_over_8888_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, /* src, mask, dst bpp */ \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_8888_0565_process_pixblock_head, \
+    pixman_composite_src_8888_0565_process_pixblock_tail, \
+    pixman_composite_src_8888_0565_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, /* src, mask, dst bpp */ \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0565_8888_process_pixblock_head, \
+    pixman_composite_src_0565_8888_process_pixblock_tail, \
+    pixman_composite_src_0565_8888_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, /* src, mask, dst bpp */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels processed in a single block */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, /* src, mask, dst bpp */ \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels processed in a single block */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_0565_8_0565_process_pixblock_head, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+/* Helper macro: opens function `fname` as a global symbol (hidden and typed as a function on ELF) */
+.macro pixman_asm_function fname
+    .func fname      /* the user of this macro has to close it with .endfunc */
+    .global fname
+#ifdef __ELF__
+    .hidden fname
+    .type fname, %function
+#endif
+fname:
+.endm
+
+/*
+ * Bilinear scaling support code which tries to provide pixel fetching, color
+ * format conversion, and interpolation as separate macros which can be used
+ * as the basic building blocks for constructing bilinear scanline functions.
+ */
+
+.macro bilinear_load_8888 reg1, reg2, tmp
+    mov       TMP1, X, asr #16         /* TMP1 = X >> 16: source pixel index (tmp is not used here) */
+    add       X, X, UX                 /* advance X to the next destination pixel */
+    add       TMP1, TOP, TMP1, asl #2  /* TMP1 = TOP + index * 4 bytes */
+    vld1.32   {reg1}, [TMP1], STRIDE   /* reg1 = 8 bytes (two pixels) from TOP's row; TMP1 += STRIDE */
+    vld1.32   {reg2}, [TMP1]           /* reg2 = the two pixels STRIDE bytes further on */
+.endm
+
+.macro bilinear_load_0565 reg1, reg2, tmp
+    mov       TMP1, X, asr #16             /* TMP1 = X >> 16: source pixel index */
+    add       X, X, UX                     /* advance X to the next destination pixel */
+    add       TMP1, TOP, TMP1, asl #1      /* TMP1 = TOP + index * 2 bytes */
+    vld1.32   {reg2[0]}, [TMP1], STRIDE    /* lane 0 = 4 bytes (two pixels) from TOP's row; TMP1 += STRIDE */
+    vld1.32   {reg2[1]}, [TMP1]            /* lane 1 = the two pixels STRIDE bytes further on */
+    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp  /* NOTE(review): presumably leaves x888 pairs in reg1/reg2 like bilinear_load_8888 - confirm */
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_8888 \
+                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
+    /* For two output pixels: accN = first load * d28 + second load * d29 (16 bits per channel) */
+    bilinear_load_8888 reg1, reg2, tmp1
+    vmull.u8  acc1, reg1, d28
+    vmlal.u8  acc1, reg2, d29
+    bilinear_load_8888 reg3, reg4, tmp2
+    vmull.u8  acc2, reg3, d28
+    vmlal.u8  acc2, reg4, d29
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_8888 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+    /* The two-pixel variant run twice: x* arguments for pixels 0-1, y* arguments for pixels 2-3 */
+    bilinear_load_and_vertical_interpolate_two_8888 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
+    bilinear_load_and_vertical_interpolate_two_8888 \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_0565 \
+                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
+    /* acc2lo/acc2hi have to be the two halves of acc2: they are loaded below and then read as acc2 */
+    mov       TMP1, X, asr #16          /* TMP1 -> source pixels for output pixel 0 */
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #1
+    mov       TMP2, X, asr #16          /* TMP2 -> source pixels for output pixel 1 */
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #1
+    vld1.32   {acc2lo[0]}, [TMP1], STRIDE   /* lanes [0]: two 16-bit pixels from TOP's row */
+    vld1.32   {acc2hi[0]}, [TMP2], STRIDE
+    vld1.32   {acc2lo[1]}, [TMP1]           /* lanes [1]: two pixels from the row STRIDE bytes further on */
+    vld1.32   {acc2hi[1]}, [TMP2]
+    convert_0565_to_x888 acc2, reg3, reg2, reg1
+    vzip.u8   reg1, reg3   /* NOTE(review): the four vzip steps presumably re-interleave planar channels into packed pixels - confirm */
+    vzip.u8   reg2, reg4
+    vzip.u8   reg3, reg4
+    vzip.u8   reg1, reg2
+    vmull.u8  acc1, reg1, d28   /* acc1 = reg1 * d28 + reg2 * d29 */
+    vmlal.u8  acc1, reg2, d29
+    vmull.u8  acc2, reg3, d28   /* acc2 = reg3 * d28 + reg4 * d29 */
+    vmlal.u8  acc2, reg4, d29
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_0565 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+    /* The two-pixel 0565 sequence done twice (x* then y* arguments) with both halves interleaved */
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #1
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #1
+    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE   /* source data for output pixels 0 and 1 */
+    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
+    vld1.32   {xacc2lo[1]}, [TMP1]
+    vld1.32   {xacc2hi[1]}, [TMP2]
+    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #1
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #1
+    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE   /* source data for output pixels 2 and 3 */
+    vzip.u8   xreg1, xreg3
+    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
+    vzip.u8   xreg2, xreg4
+    vld1.32   {yacc2lo[1]}, [TMP1]
+    vzip.u8   xreg3, xreg4
+    vld1.32   {yacc2hi[1]}, [TMP2]
+    vzip.u8   xreg1, xreg2
+    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
+    vmull.u8  xacc1, xreg1, d28   /* xacc1 = xreg1 * d28 + xreg2 * d29 */
+    vzip.u8   yreg1, yreg3
+    vmlal.u8  xacc1, xreg2, d29
+    vzip.u8   yreg2, yreg4
+    vmull.u8  xacc2, xreg3, d28   /* xacc2 = xreg3 * d28 + xreg4 * d29 */
+    vzip.u8   yreg3, yreg4
+    vmlal.u8  xacc2, xreg4, d29
+    vzip.u8   yreg1, yreg2
+    vmull.u8  yacc1, yreg1, d28   /* yacc1 = yreg1 * d28 + yreg2 * d29 */
+    vmlal.u8  yacc1, yreg2, d29
+    vmull.u8  yacc2, yreg3, d28   /* yacc2 = yreg3 * d28 + yreg4 * d29 */
+    vmlal.u8  yacc2, yreg4, d29
+.endm
+
+.macro bilinear_store_8888 numpix, tmp1, tmp2
+.if numpix == 4
+    vst1.32   {d0, d1}, [OUT, :128]!   /* 16 bytes from d0, d1; OUT is declared 128-bit aligned and advanced */
+.elseif numpix == 2
+    vst1.32   {d0}, [OUT, :64]!        /* 8 bytes from d0; OUT is declared 64-bit aligned and advanced */
+.elseif numpix == 1
+    vst1.32   {d0[0]}, [OUT, :32]!     /* 4 bytes from lane 0 of d0; tmp1 and tmp2 are not used by this macro */
+.else
+    .error bilinear_store_8888 numpix is unsupported
+.endif
+.endm
+
+.macro bilinear_store_0565 numpix, tmp1, tmp2
+    vuzp.u8 d0, d1   /* the four vuzp steps split packed 32-bit pixels into one byte plane per register */
+    vuzp.u8 d2, d3
+    vuzp.u8 d1, d3
+    vuzp.u8 d0, d2
+    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2   /* the packed 16-bit pixels are stored from d2 below */
+.if numpix == 4
+    vst1.16   {d2}, [OUT, :64]!        /* 8 bytes: four 16-bit pixels */
+.elseif numpix == 2
+    vst1.32   {d2[0]}, [OUT, :32]!     /* 4 bytes: two 16-bit pixels */
+.elseif numpix == 1
+    vst1.16   {d2[0]}, [OUT, :16]!     /* 2 bytes: one 16-bit pixel */
+.else
+    .error bilinear_store_0565 numpix is unsupported
+.endif
+.endm
+
+.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
+    bilinear_load_&src_fmt d0, d1, d2
+    vmull.u8  q1, d0, d28      /* vertical pass: q1 = d0 * d28 + d1 * d29, so d2 = left pixel, d3 = right pixel */
+    vmlal.u8  q1, d1, d29
+    /* 5 cycles bubble */
+    vshll.u16 q0, d2, #8       /* horizontal pass: q0 = left * 256 ... */
+    vmlsl.u16 q0, d2, d30      /* ... - left * d30 ... */
+    vmlal.u16 q0, d3, d30      /* ... + right * d30 */
+    /* 5 cycles bubble */
+    vshrn.u32 d0, q0, #16      /* drop the low 16 bits of each 32-bit sum */
+    /* 3 cycles bubble */
+    vmovn.u16 d0, q0           /* narrow to 8 bits per channel */
+    /* 1 cycle bubble */
+    bilinear_store_&dst_fmt 1, q2, q3
+.endm
+
+.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
+    bilinear_load_and_vertical_interpolate_two_&src_fmt \
+                q1, q11, d0, d1, d20, d21, d22, d23
+    vshll.u16 q0, d2, #8       /* pixel 0: q0 = left * (256 - d30) + right * d30 */
+    vmlsl.u16 q0, d2, d30
+    vmlal.u16 q0, d3, d30
+    vshll.u16 q10, d22, #8     /* pixel 1: the same with d22/d23 and weight d31 */
+    vmlsl.u16 q10, d22, d31
+    vmlal.u16 q10, d23, d31
+    vshrn.u32 d0, q0, #16
+    vshrn.u32 d1, q10, #16
+    vshr.u16  q15, q12, #8     /* reload the horizontal weights d30, d31 from q12 for the next pixels */
+    vadd.u16  q12, q12, q13    /* advance q12 by the step held in q13 */
+    vmovn.u16 d0, q0           /* narrow both pixels to 8 bits per channel */
+    bilinear_store_&dst_fmt 2, q2, q3
+.endm
+
+.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
+    bilinear_load_and_vertical_interpolate_four_&src_fmt \
+                q1, q11, d0, d1, d20, d21, d22, d23 \
+                q3, q9,  d4, d5, d16, d17, d18, d19
+    pld       [TMP1, PF_OFFS]
+    sub       TMP1, TMP1, STRIDE   /* NOTE(review): TMP1 is not read again in this macro after this sub - confirm intent */
+    vshll.u16 q0, d2, #8           /* pixel 0: q0 = left * (256 - d30) + right * d30 */
+    vmlsl.u16 q0, d2, d30
+    vmlal.u16 q0, d3, d30
+    vshll.u16 q10, d22, #8         /* pixel 1: weight d31 */
+    vmlsl.u16 q10, d22, d31
+    vmlal.u16 q10, d23, d31
+    vshr.u16  q15, q12, #8         /* switch d30, d31 to the weights of pixels 2 and 3 */
+    vshll.u16 q2, d6, #8           /* pixel 2: weight d30 */
+    vmlsl.u16 q2, d6, d30
+    vmlal.u16 q2, d7, d30
+    vshll.u16 q8, d18, #8          /* pixel 3: weight d31 */
+    pld       [TMP2, PF_OFFS]      /* NOTE(review): bilinear_load_8888 never sets TMP2, only the 0565 loaders do - confirm */
+    vmlsl.u16 q8, d18, d31
+    vmlal.u16 q8, d19, d31
+    vadd.u16  q12, q12, q13
+    vshrn.u32 d0, q0, #16
+    vshrn.u32 d1, q10, #16
+    vshrn.u32 d4, q2, #16
+    vshrn.u32 d5, q8, #16
+    vshr.u16  q15, q12, #8         /* weights for the pixels that follow this block */
+    vmovn.u16 d0, q0               /* narrow the four pixels to 8 bits per channel in d0, d1 */
+    vmovn.u16 d1, q2
+    vadd.u16  q12, q12, q13
+    bilinear_store_&dst_fmt 4, q2, q3
+.endm
+
+.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head   /* format-specific pipelined head */
+.else
+    bilinear_interpolate_four_pixels src_fmt, dst_fmt   /* generic fallback: complete 4-pixel block, store included */
+.endif
+.endm
+
+.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail   /* expands to nothing otherwise: the generic head is already complete */
+.endif
+.endm
+
+.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head   /* format-specific pipelined loop body */
+.else
+    bilinear_interpolate_four_pixels src_fmt, dst_fmt   /* generic fallback: one complete 4-pixel block */
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head   /* format-specific pipelined head */
+.else
+    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt        /* fallback: built from the 4-pixel macros */
+    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail   /* format-specific pipelined tail */
+.else
+    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt        /* fallback: finish the last 4-pixel block */
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head   /* format-specific pipelined loop body */
+.else
+    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt        /* fallback: two 4-pixel loop bodies */
+    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+.endif
+.endm
+
+.set BILINEAR_FLAG_UNROLL_4,          0   /* main loop handles 4 pixels per iteration (no bit set) */
+.set BILINEAR_FLAG_UNROLL_8,          1   /* main loop handles 8 pixels per iteration */
+.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2   /* save and restore d8-d15 around the function body */
+
+/*
+ * Main template macro for generating NEON optimized bilinear scanline
+ * functions. It takes the following arguments:
+ *  fname             - name of the function to generate
+ *  src_fmt           - source color format (8888 or 0565)
+ *  dst_fmt           - destination color format (8888 or 0565)
+ *  src_bpp_shift     - (1 << src_bpp_shift) is the source pixel size in bytes
+ *  dst_bpp_shift     - (1 << dst_bpp_shift) is the destination pixel size
+ *                      in bytes
+ *  prefetch_distance - prefetch in the source image that many pixels ahead
+ *  flags             - bitwise OR of the BILINEAR_FLAG_* constants
+ */
+
+.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
+                                       src_bpp_shift, dst_bpp_shift, \
+                                       prefetch_distance, flags
+
+pixman_asm_function fname
+    OUT       .req      r0   /* destination pointer, advanced by every store */
+    TOP       .req      r1   /* base address of the first source row */
+    BOTTOM    .req      r2   /* base address of the second source row */
+    WT        .req      r3   /* weight applied to the TOP row (copied to d28) */
+    WB        .req      r4   /* weight applied to the BOTTOM row (copied to d29) */
+    X         .req      r5   /* source x position; X >> 16 is the pixel index */
+    UX        .req      r6   /* increment added to X for every destination pixel */
+    WIDTH     .req      ip   /* count of destination pixels */
+    TMP1      .req      r3   /* shares r3 with WT */
+    TMP2      .req      r4   /* shares r4 with WB */
+    PF_OFFS   .req      r7   /* offset used by the pld prefetches */
+    TMP3      .req      r8
+    TMP4      .req      r9
+    STRIDE    .req      r2   /* shares r2 with BOTTOM, set to BOTTOM - TOP below */
+
+    mov       ip, sp
+    push      {r4, r5, r6, r7, r8, r9}
+    mov       PF_OFFS, #prefetch_distance
+    ldmia     ip, {WB, X, UX, WIDTH}   /* load four words from the stack as it was on entry */
+    mul       PF_OFFS, PF_OFFS, UX     /* PF_OFFS = prefetch_distance * UX */
+
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+    vpush     {d8-d15}
+.endif
+
+    sub       STRIDE, BOTTOM, TOP      /* byte distance from the TOP row to the BOTTOM row */
+    .unreq    BOTTOM
+
+    cmp       WIDTH, #0
+    ble       3f                       /* nothing to do when WIDTH <= 0 */
+
+    vdup.u16  q12, X                   /* low 16 bits of X in every lane */
+    vdup.u16  q13, UX                  /* low 16 bits of UX in every lane */
+    vdup.u8   d28, WT
+    vdup.u8   d29, WB
+    vadd.u16  d25, d25, d26            /* d24 = X, d25 = X + UX (low 16 bits each) */
+
+    /* ensure good destination alignment: emit 1, then 2 (and 4 if unrolling by 8) pixels as needed */
+    cmp       WIDTH, #1
+    blt       0f
+    tst       OUT, #(1 << dst_bpp_shift)   /* bit clear: OUT is already aligned to two pixels */
+    beq       0f
+    vshr.u16  q15, q12, #8             /* d30, d31 = top 8 bits of each 16-bit lane of q12 */
+    vadd.u16  q12, q12, q13
+    bilinear_interpolate_last_pixel src_fmt, dst_fmt
+    sub       WIDTH, WIDTH, #1
+0:
+    vadd.u16  q13, q13, q13            /* q13 = 2 * UX: q12 holds two pixel positions per update */
+    vshr.u16  q15, q12, #8
+    vadd.u16  q12, q12, q13
+
+    cmp       WIDTH, #2
+    blt       0f
+    tst       OUT, #(1 << (dst_bpp_shift + 1))   /* bit clear: OUT is already aligned to four pixels */
+    beq       0f
+    bilinear_interpolate_two_pixels src_fmt, dst_fmt
+    sub       WIDTH, WIDTH, #2
+0:
+.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
+/*********** 8 pixels per iteration *****************/
+    cmp       WIDTH, #4
+    blt       0f
+    tst       OUT, #(1 << (dst_bpp_shift + 2))   /* bit clear: OUT is already aligned to eight pixels */
+    beq       0f
+    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+    sub       WIDTH, WIDTH, #4
+0:
+    subs      WIDTH, WIDTH, #8
+    blt       1f                       /* fewer than 8 pixels left: skip the main loop */
+    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)   /* (PF_OFFS >> 16) source pixels, in bytes */
+    bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
+    subs      WIDTH, WIDTH, #8
+    blt       5f
+0:
+    bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
+    subs      WIDTH, WIDTH, #8
+    bge       0b
+5:
+    bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
+1:
+    tst       WIDTH, #4                /* WIDTH is negative here; its low bits still hold the remainder */
+    beq       2f
+    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+2:
+.else
+/*********** 4 pixels per iteration *****************/
+    subs      WIDTH, WIDTH, #4
+    blt       1f                       /* fewer than 4 pixels left: skip the main loop */
+    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)   /* (PF_OFFS >> 16) source pixels, in bytes */
+    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
+    subs      WIDTH, WIDTH, #4
+    blt       5f
+0:
+    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+    subs      WIDTH, WIDTH, #4
+    bge       0b
+5:
+    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+1:
+/****************************************************/
+.endif
+    /* handle the remaining trailing pixels (bits 1 and 0 of WIDTH) */
+    tst       WIDTH, #2
+    beq       2f
+    bilinear_interpolate_two_pixels src_fmt, dst_fmt
+2:
+    tst       WIDTH, #1
+    beq       3f
+    bilinear_interpolate_last_pixel src_fmt, dst_fmt
+3:
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+    vpop      {d8-d15}
+.endif
+    pop       {r4, r5, r6, r7, r8, r9}
+    bx        lr
+
+    .unreq    OUT
+    .unreq    TOP
+    .unreq    WT
+    .unreq    WB
+    .unreq    X
+    .unreq    UX
+    .unreq    WIDTH
+    .unreq    TMP1
+    .unreq    TMP2
+    .unreq    PF_OFFS
+    .unreq    TMP3
+    .unreq    TMP4
+    .unreq    STRIDE
+.endfunc
+
+.endm
+
+/*****************************************************************************/
+
+.set have_bilinear_interpolate_four_pixels_8888_8888, 1   /* makes the four_pixels_{head,tail,tail_head} dispatchers use the macros below */
+
+.macro bilinear_interpolate_four_pixels_8888_8888_head
+    mov       TMP1, X, asr #16          /* TMP1 -> source pixels for output pixel 0 */
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    mov       TMP2, X, asr #16          /* TMP2 -> source pixels for output pixel 1 */
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #2
+
+    vld1.32   {d22}, [TMP1], STRIDE
+    vld1.32   {d23}, [TMP1]
+    mov       TMP3, X, asr #16          /* TMP3 -> source pixels for output pixel 2 */
+    add       X, X, UX
+    add       TMP3, TOP, TMP3, asl #2
+    vmull.u8  q8, d22, d28              /* q8 = vertical interpolation for pixel 0 */
+    vmlal.u8  q8, d23, d29
+
+    vld1.32   {d22}, [TMP2], STRIDE
+    vld1.32   {d23}, [TMP2]
+    mov       TMP4, X, asr #16          /* TMP4 -> source pixels for output pixel 3 */
+    add       X, X, UX
+    add       TMP4, TOP, TMP4, asl #2
+    vmull.u8  q9, d22, d28              /* q9 = vertical interpolation for pixel 1 */
+    vmlal.u8  q9, d23, d29
+
+    vld1.32   {d22}, [TMP3], STRIDE
+    vld1.32   {d23}, [TMP3]
+    vmull.u8  q10, d22, d28             /* q10 = vertical interpolation for pixel 2 */
+    vmlal.u8  q10, d23, d29
+
+    vshll.u16 q0, d16, #8               /* q0 = horizontal interpolation for pixel 0 (complete) */
+    vmlsl.u16 q0, d16, d30
+    vmlal.u16 q0, d17, d30
+
+    pld       [TMP4, PF_OFFS]           /* prefetch ahead in the first row */
+    vld1.32   {d16}, [TMP4], STRIDE
+    vld1.32   {d17}, [TMP4]
+    pld       [TMP4, PF_OFFS]           /* TMP4 was advanced by STRIDE: prefetch ahead in the second row */
+    vmull.u8  q11, d16, d28             /* q11 = vertical interpolation for pixel 3 */
+    vmlal.u8  q11, d17, d29
+
+    vshll.u16 q1, d18, #8               /* pixel 1 horizontal pass is started here and finished by the tail */
+    vmlsl.u16 q1, d18, d31
+.endm
+
+.macro bilinear_interpolate_four_pixels_8888_8888_tail
+    vmlal.u16 q1, d19, d31     /* finish the horizontal pass for pixel 1 started by the head */
+    vshr.u16  q15, q12, #8     /* switch d30, d31 to the weights of pixels 2 and 3 */
+    vshll.u16 q2, d20, #8      /* q2 = horizontal interpolation for pixel 2 */
+    vmlsl.u16 q2, d20, d30
+    vmlal.u16 q2, d21, d30
+    vshll.u16 q3, d22, #8      /* q3 = horizontal interpolation for pixel 3 */
+    vmlsl.u16 q3, d22, d31
+    vmlal.u16 q3, d23, d31
+    vadd.u16  q12, q12, q13
+    vshrn.u32 d0, q0, #16      /* drop the low 16 bits of each 32-bit sum */
+    vshrn.u32 d1, q1, #16
+    vshrn.u32 d4, q2, #16
+    vshr.u16  q15, q12, #8     /* weights for the first two pixels of the next block */
+    vshrn.u32 d5, q3, #16
+    vmovn.u16 d6, q0           /* narrow to 8 bits per channel: four pixels in d6, d7 */
+    vmovn.u16 d7, q2
+    vadd.u16  q12, q12, q13
+    vst1.32   {d6, d7}, [OUT, :128]!   /* store 16 bytes; OUT is declared 128-bit aligned and advanced */
+.endm
+
+.macro bilinear_interpolate_four_pixels_8888_8888_tail_head
+    mov       TMP1, X, asr #16   /* lines indented by 4: head work for the next four pixels */
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #2
+        vmlal.u16 q1, d19, d31   /* lines indented by 8: the tail sequence for the previous four pixels */
+        vshr.u16  q15, q12, #8
+        vshll.u16 q2, d20, #8
+        vmlsl.u16 q2, d20, d30
+        vmlal.u16 q2, d21, d30
+        vshll.u16 q3, d22, #8
+    vld1.32   {d20}, [TMP1], STRIDE
+        vmlsl.u16 q3, d22, d31
+        vmlal.u16 q3, d23, d31
+    vld1.32   {d21}, [TMP1]
+    vmull.u8  q8, d20, d28       /* q8 = vertical interpolation for the next pixel 0 */
+    vmlal.u8  q8, d21, d29
+        vshrn.u32 d0, q0, #16
+        vshrn.u32 d1, q1, #16
+        vshrn.u32 d4, q2, #16
+    vld1.32   {d22}, [TMP2], STRIDE
+        vshrn.u32 d5, q3, #16
+        vadd.u16  q12, q12, q13
+    vld1.32   {d23}, [TMP2]
+    vmull.u8  q9, d22, d28       /* q9 = vertical interpolation for the next pixel 1 */
+    mov       TMP3, X, asr #16
+    add       X, X, UX
+    add       TMP3, TOP, TMP3, asl #2
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP4, TOP, TMP4, asl #2
+    vmlal.u8  q9, d23, d29
+    vld1.32   {d22}, [TMP3], STRIDE
+        vshr.u16  q15, q12, #8
+    vld1.32   {d23}, [TMP3]
+    vmull.u8  q10, d22, d28      /* q10 = vertical interpolation for the next pixel 2 */
+    vmlal.u8  q10, d23, d29
+        vmovn.u16 d6, q0         /* q0 is consumed here, before the next block overwrites it */
+    vshll.u16 q0, d16, #8
+        vmovn.u16 d7, q2
+    vmlsl.u16 q0, d16, d30
+    vmlal.u16 q0, d17, d30
+    pld       [TMP4, PF_OFFS]
+    vld1.32   {d16}, [TMP4], STRIDE
+        vadd.u16  q12, q12, q13
+    vld1.32   {d17}, [TMP4]
+    pld       [TMP4, PF_OFFS]
+    vmull.u8  q11, d16, d28      /* q11 = vertical interpolation for the next pixel 3 */
+    vmlal.u8  q11, d17, d29
+        vst1.32   {d6, d7}, [OUT, :128]!   /* store the previous four pixels */
+    vshll.u16 q1, d18, #8
+    vmlsl.u16 q1, d18, d31
+.endm
+
+/*****************************************************************************/
+
+.set have_bilinear_interpolate_eight_pixels_8888_0565, 1   /* makes the eight_pixels_{head,tail,tail_head} dispatchers use the macros below */
+
+.macro bilinear_interpolate_eight_pixels_8888_0565_head
+    mov       TMP1, X, asr #16   /* first group of four pixels: same steps as the 8888_8888 four-pixel head */
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #2
+    vld1.32   {d20}, [TMP1], STRIDE
+    vld1.32   {d21}, [TMP1]
+    vmull.u8  q8, d20, d28
+    vmlal.u8  q8, d21, d29
+    vld1.32   {d22}, [TMP2], STRIDE
+    vld1.32   {d23}, [TMP2]
+    vmull.u8  q9, d22, d28
+    mov       TMP3, X, asr #16
+    add       X, X, UX
+    add       TMP3, TOP, TMP3, asl #2
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP4, TOP, TMP4, asl #2
+    vmlal.u8  q9, d23, d29
+    vld1.32   {d22}, [TMP3], STRIDE
+    vld1.32   {d23}, [TMP3]
+    vmull.u8  q10, d22, d28
+    vmlal.u8  q10, d23, d29
+    vshll.u16 q0, d16, #8
+    vmlsl.u16 q0, d16, d30
+    vmlal.u16 q0, d17, d30
+    pld       [TMP4, PF_OFFS]
+    vld1.32   {d16}, [TMP4], STRIDE
+    vld1.32   {d17}, [TMP4]
+    pld       [TMP4, PF_OFFS]
+    vmull.u8  q11, d16, d28
+    vmlal.u8  q11, d17, d29
+    vshll.u16 q1, d18, #8
+    vmlsl.u16 q1, d18, d31
+
+    mov       TMP1, X, asr #16   /* second group: lines indented by 4 start it, lines indented by 8 finish the first group */
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #2
+        vmlal.u16 q1, d19, d31
+        vshr.u16  q15, q12, #8
+        vshll.u16 q2, d20, #8
+        vmlsl.u16 q2, d20, d30
+        vmlal.u16 q2, d21, d30
+        vshll.u16 q3, d22, #8
+    vld1.32   {d20}, [TMP1], STRIDE
+        vmlsl.u16 q3, d22, d31
+        vmlal.u16 q3, d23, d31
+    vld1.32   {d21}, [TMP1]
+    vmull.u8  q8, d20, d28
+    vmlal.u8  q8, d21, d29
+        vshrn.u32 d0, q0, #16
+        vshrn.u32 d1, q1, #16
+        vshrn.u32 d4, q2, #16
+    vld1.32   {d22}, [TMP2], STRIDE
+        vshrn.u32 d5, q3, #16
+        vadd.u16  q12, q12, q13
+    vld1.32   {d23}, [TMP2]
+    vmull.u8  q9, d22, d28
+    mov       TMP3, X, asr #16
+    add       X, X, UX
+    add       TMP3, TOP, TMP3, asl #2
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP4, TOP, TMP4, asl #2
+    vmlal.u8  q9, d23, d29
+    vld1.32   {d22}, [TMP3], STRIDE
+        vshr.u16  q15, q12, #8
+    vld1.32   {d23}, [TMP3]
+    vmull.u8  q10, d22, d28
+    vmlal.u8  q10, d23, d29
+        vmovn.u16 d8, q0         /* the first four 8888 results are kept in d8, d9 until all eight are ready */
+    vshll.u16 q0, d16, #8
+        vmovn.u16 d9, q2
+    vmlsl.u16 q0, d16, d30
+    vmlal.u16 q0, d17, d30
+    pld       [TMP4, PF_OFFS]
+    vld1.32   {d16}, [TMP4], STRIDE
+        vadd.u16  q12, q12, q13
+    vld1.32   {d17}, [TMP4]
+    pld       [TMP4, PF_OFFS]
+    vmull.u8  q11, d16, d28
+    vmlal.u8  q11, d17, d29
+    vshll.u16 q1, d18, #8
+    vmlsl.u16 q1, d18, d31
+.endm
+
+.macro bilinear_interpolate_eight_pixels_8888_0565_tail
+    vmlal.u16 q1, d19, d31     /* finish the horizontal pass of the second group of four pixels */
+    vshr.u16  q15, q12, #8
+    vshll.u16 q2, d20, #8
+    vmlsl.u16 q2, d20, d30
+    vmlal.u16 q2, d21, d30
+    vshll.u16 q3, d22, #8
+    vmlsl.u16 q3, d22, d31
+    vmlal.u16 q3, d23, d31
+    vadd.u16  q12, q12, q13
+    vshrn.u32 d0, q0, #16
+    vshrn.u32 d1, q1, #16
+    vshrn.u32 d4, q2, #16
+    vshr.u16  q15, q12, #8
+    vshrn.u32 d5, q3, #16
+    vmovn.u16 d10, q0          /* second four 8888 results go to d10, d11 (the first four are in d8, d9) */
+    vmovn.u16 d11, q2
+    vadd.u16  q12, q12, q13
+
+    vuzp.u8   d8, d9           /* the four vuzp steps leave one byte plane of all eight pixels per register */
+    vuzp.u8   d10, d11
+    vuzp.u8   d9, d11
+    vuzp.u8   d8, d10
+    vshll.u8  q6, d9, #8       /* widen each plane to 16 bits, value in the high byte */
+    vshll.u8  q5, d10, #8
+    vshll.u8  q7, d8, #8
+    vsri.u16  q5, q6, #5       /* q5 = d10 top 5 bits : d9 top 6 bits : ... */
+    vsri.u16  q5, q7, #11      /* ... : d8 top 5 bits */
+    vst1.32   {d10, d11}, [OUT, :128]!   /* store eight 16-bit pixels (16 bytes) */
+.endm
+
+.macro bilinear_interpolate_eight_pixels_8888_0565_tail_head
+    mov       TMP1, X, asr #16   /* lines indented by 4: loads and interpolation setup for upcoming pixels */
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #2
+        vmlal.u16 q1, d19, d31   /* lines indented by 8: finish the horizontal pass of the preceding four pixels */
+        vshr.u16  q15, q12, #8
+            vuzp.u8 d8, d9       /* lines indented by 12: 0565 packing and store of the previous eight pixels */
+        vshll.u16 q2, d20, #8
+        vmlsl.u16 q2, d20, d30
+        vmlal.u16 q2, d21, d30
+        vshll.u16 q3, d22, #8
+    vld1.32   {d20}, [TMP1], STRIDE
+        vmlsl.u16 q3, d22, d31
+        vmlal.u16 q3, d23, d31
+    vld1.32   {d21}, [TMP1]
+    vmull.u8  q8, d20, d28
+    vmlal.u8  q8, d21, d29
+        vshrn.u32 d0, q0, #16
+        vshrn.u32 d1, q1, #16
+        vshrn.u32 d4, q2, #16
+    vld1.32   {d22}, [TMP2], STRIDE
+        vshrn.u32 d5, q3, #16
+        vadd.u16  q12, q12, q13
+    vld1.32   {d23}, [TMP2]
+    vmull.u8  q9, d22, d28
+    mov       TMP3, X, asr #16
+    add       X, X, UX
+    add       TMP3, TOP, TMP3, asl #2
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP4, TOP, TMP4, asl #2
+    vmlal.u8  q9, d23, d29
+    vld1.32   {d22}, [TMP3], STRIDE
+        vshr.u16  q15, q12, #8
+    vld1.32   {d23}, [TMP3]
+    vmull.u8  q10, d22, d28
+    vmlal.u8  q10, d23, d29
+        vmovn.u16 d10, q0        /* second half of the previous eight pixels, joins d8, d9 for packing */
+    vshll.u16 q0, d16, #8
+        vmovn.u16 d11, q2
+    vmlsl.u16 q0, d16, d30
+    vmlal.u16 q0, d17, d30
+    pld       [TMP4, PF_OFFS]
+    vld1.32   {d16}, [TMP4], STRIDE
+        vadd.u16  q12, q12, q13
+    vld1.32   {d17}, [TMP4]
+    pld       [TMP4, PF_OFFS]
+    vmull.u8  q11, d16, d28
+    vmlal.u8  q11, d17, d29
+            vuzp.u8 d10, d11
+    vshll.u16 q1, d18, #8
+    vmlsl.u16 q1, d18, d31
+
+    mov       TMP1, X, asr #16   /* second group of four pixels of the new block */
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #2
+        vmlal.u16 q1, d19, d31
+            vuzp.u8 d9, d11
+        vshr.u16  q15, q12, #8
+        vshll.u16 q2, d20, #8
+            vuzp.u8 d8, d10
+        vmlsl.u16 q2, d20, d30
+        vmlal.u16 q2, d21, d30
+        vshll.u16 q3, d22, #8
+    vld1.32   {d20}, [TMP1], STRIDE
+        vmlsl.u16 q3, d22, d31
+        vmlal.u16 q3, d23, d31
+    vld1.32   {d21}, [TMP1]
+    vmull.u8  q8, d20, d28
+    vmlal.u8  q8, d21, d29
+            vshll.u8  q6, d9, #8
+            vshll.u8  q5, d10, #8
+            vshll.u8  q7, d8, #8
+        vshrn.u32 d0, q0, #16
+            vsri.u16  q5, q6, #5
+        vshrn.u32 d1, q1, #16
+            vsri.u16  q5, q7, #11   /* q5 now holds the previous eight pixels packed as 16-bit values */
+        vshrn.u32 d4, q2, #16
+    vld1.32   {d22}, [TMP2], STRIDE
+        vshrn.u32 d5, q3, #16
+        vadd.u16  q12, q12, q13
+    vld1.32   {d23}, [TMP2]
+    vmull.u8  q9, d22, d28
+    mov       TMP3, X, asr #16
+    add       X, X, UX
+    add       TMP3, TOP, TMP3, asl #2
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP4, TOP, TMP4, asl #2
+    vmlal.u8  q9, d23, d29
+    vld1.32   {d22}, [TMP3], STRIDE
+        vshr.u16  q15, q12, #8
+    vld1.32   {d23}, [TMP3]
+    vmull.u8  q10, d22, d28
+    vmlal.u8  q10, d23, d29
+        vmovn.u16 d8, q0         /* d8, d9 are reused for the first four results of the new block */
+    vshll.u16 q0, d16, #8
+        vmovn.u16 d9, q2
+    vmlsl.u16 q0, d16, d30
+    vmlal.u16 q0, d17, d30
+    pld       [TMP4, PF_OFFS]
+    vld1.32   {d16}, [TMP4], STRIDE
+        vadd.u16  q12, q12, q13
+    vld1.32   {d17}, [TMP4]
+    pld       [TMP4, PF_OFFS]
+    vmull.u8  q11, d16, d28
+    vmlal.u8  q11, d17, d29
+    vshll.u16 q1, d18, #8
+            vst1.32   {d10, d11}, [OUT, :128]!   /* store the previous eight pixels (16 bytes) */
+    vmlsl.u16 q1, d18, d31
+.endm
+/*****************************************************************************/
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
+    2, 2, 28, BILINEAR_FLAG_UNROLL_4   /* src_bpp_shift, dst_bpp_shift, prefetch_distance, flags */
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
+    2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS   /* the 8888_0565 macros use d8-d15 */
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
+    1, 2, 28, BILINEAR_FLAG_UNROLL_4   /* src_bpp_shift, dst_bpp_shift, prefetch_distance, flags */
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
+    1, 1, 28, BILINEAR_FLAG_UNROLL_4   /* src_bpp_shift, dst_bpp_shift, prefetch_distance, flags */
diff --git a/pixman/pixman/pixman-arm-neon.c b/pixman/pixman/pixman-arm-neon.c
index 1ec61c202..effb50b4f 100644
--- a/pixman/pixman/pixman-arm-neon.c
+++ b/pixman/pixman/pixman-arm-neon.c
@@ -1,507 +1,507 @@
-/*
- * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of ARM Ltd not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission.  ARM Ltd makes no
- * representations about the suitability of this software for any purpose.  It
- * is provided "as is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- *
- * Author:  Ian Rickards (ian.rickards@arm.com)
- * Author:  Jonathan Morton (jonathan.morton@movial.com)
- * Author:  Markku Vire (markku.vire@movial.com)
- *
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#include <string.h>
-#include "pixman-private.h"
-#include "pixman-arm-common.h"
-
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_8888_8888,
-                                   uint32_t, 1, uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_x888_8888,
-                                   uint32_t, 1, uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0565_0565,
-                                   uint16_t, 1, uint16_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_0888,
-                                   uint8_t, 3, uint8_t, 3)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_8888_0565,
-                                   uint32_t, 1, uint16_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0565_8888,
-                                   uint16_t, 1, uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_8888_rev,
-                                   uint8_t, 3, uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_0565_rev,
-                                   uint8_t, 3, uint16_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_pixbuf_8888,
-                                   uint32_t, 1, uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_rpixbuf_8888,
-                                   uint32_t, 1, uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8_8,
-                                   uint8_t, 1, uint8_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8888_8888,
-                                   uint32_t, 1, uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, over_8888_0565,
-                                   uint32_t, 1, uint16_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, over_8888_8888,
-                                   uint32_t, 1, uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, out_reverse_8_0565,
-                                   uint8_t, 1, uint16_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, out_reverse_8_8888,
-                                   uint8_t, 1, uint32_t, 1)
-
-PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_n_0565,
-                                 uint16_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_n_8888,
-                                 uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_reverse_n_8888,
-                                 uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_N_DST (0, neon, in_n_8,
-                                 uint8_t, 1)
-
-PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_0565,
-                                      uint8_t, 1, uint16_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_8888,
-                                      uint8_t, 1, uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8888_8888_ca,
-                                      uint32_t, 1, uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8888_0565_ca,
-				      uint32_t, 1, uint16_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_8,
-                                      uint8_t, 1, uint8_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8,
-                                      uint8_t, 1, uint8_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888,
-                                      uint8_t, 1, uint32_t, 1)
-
-PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888,
-                                     uint32_t, 1, uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_0565,
-                                     uint32_t, 1, uint16_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_0565_n_0565,
-                                     uint16_t, 1, uint16_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, add_8888_n_8888,
-                                     uint32_t, 1, uint32_t, 1)
-
-PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8_8_8,
-                                        uint8_t, 1, uint8_t, 1, uint8_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_0565_8_0565,
-                                        uint16_t, 1, uint8_t, 1, uint16_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8888_8_8888,
-                                        uint32_t, 1, uint8_t, 1, uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8888_8888_8888,
-                                        uint32_t, 1, uint32_t, 1, uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8_8888,
-                                        uint32_t, 1, uint8_t, 1, uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8888_8888,
-                                        uint32_t, 1, uint32_t, 1, uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8_0565,
-                                        uint32_t, 1, uint8_t, 1, uint16_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_0565_8_0565,
-                                        uint16_t, 1, uint8_t, 1, uint16_t, 1)
-
-PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_8888, OVER,
-                                        uint32_t, uint32_t)
-PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, OVER,
-                                        uint32_t, uint16_t)
-PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, SRC,
-                                        uint32_t, uint16_t)
-PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 0565_8888, SRC,
-                                        uint16_t, uint32_t)
-
-PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_0565,
-                                           OVER, uint32_t, uint16_t)
-PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 0565_8_0565,
-                                           OVER, uint16_t, uint16_t)
-
-PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_8888, SRC,
-                                         uint32_t, uint32_t)
-PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_0565, SRC,
-                                         uint32_t, uint16_t)
-PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC,
-                                         uint16_t, uint32_t)
-PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_0565, SRC,
-                                         uint16_t, uint16_t)
-PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, OVER,
-                                         uint32_t, uint32_t)
-PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, ADD,
-                                         uint32_t, uint32_t)
-
-PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_8888, SRC,
-                                            uint32_t, uint32_t)
-PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_0565, SRC,
-                                            uint32_t, uint16_t)
-PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 0565_8_x888, SRC,
-                                            uint16_t, uint32_t)
-PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 0565_8_0565, SRC,
-                                            uint16_t, uint16_t)
-PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_8888, OVER,
-                                            uint32_t, uint32_t)
-PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_8888, ADD,
-                                            uint32_t, uint32_t)
-
-void
-pixman_composite_src_n_8_asm_neon (int32_t   w,
-                                   int32_t   h,
-                                   uint8_t  *dst,
-                                   int32_t   dst_stride,
-                                   uint8_t   src);
-
-void
-pixman_composite_src_n_0565_asm_neon (int32_t   w,
-                                      int32_t   h,
-                                      uint16_t *dst,
-                                      int32_t   dst_stride,
-                                      uint16_t  src);
-
-void
-pixman_composite_src_n_8888_asm_neon (int32_t   w,
-                                      int32_t   h,
-                                      uint32_t *dst,
-                                      int32_t   dst_stride,
-                                      uint32_t  src);
-
-static pixman_bool_t
-pixman_fill_neon (uint32_t *bits,
-                  int       stride,
-                  int       bpp,
-                  int       x,
-                  int       y,
-                  int       width,
-                  int       height,
-                  uint32_t  _xor)
-{
-    /* stride is always multiple of 32bit units in pixman */
-    uint32_t byte_stride = stride * sizeof(uint32_t);
-
-    switch (bpp)
-    {
-    case 8:
-	pixman_composite_src_n_8_asm_neon (
-		width,
-		height,
-		(uint8_t *)(((char *) bits) + y * byte_stride + x),
-		byte_stride,
-		_xor & 0xff);
-	return TRUE;
-    case 16:
-	pixman_composite_src_n_0565_asm_neon (
-		width,
-		height,
-		(uint16_t *)(((char *) bits) + y * byte_stride + x * 2),
-		byte_stride / 2,
-		_xor & 0xffff);
-	return TRUE;
-    case 32:
-	pixman_composite_src_n_8888_asm_neon (
-		width,
-		height,
-		(uint32_t *)(((char *) bits) + y * byte_stride + x * 4),
-		byte_stride / 4,
-		_xor);
-	return TRUE;
-    default:
-	return FALSE;
-    }
-}
-
-static pixman_bool_t
-pixman_blt_neon (uint32_t *src_bits,
-                 uint32_t *dst_bits,
-                 int       src_stride,
-                 int       dst_stride,
-                 int       src_bpp,
-                 int       dst_bpp,
-                 int       src_x,
-                 int       src_y,
-                 int       dest_x,
-                 int       dest_y,
-                 int       width,
-                 int       height)
-{
-    if (src_bpp != dst_bpp)
-	return FALSE;
-
-    switch (src_bpp)
-    {
-    case 16:
-	pixman_composite_src_0565_0565_asm_neon (
-		width, height,
-		(uint16_t *)(((char *) dst_bits) +
-		dest_y * dst_stride * 4 + dest_x * 2), dst_stride * 2,
-		(uint16_t *)(((char *) src_bits) +
-		src_y * src_stride * 4 + src_x * 2), src_stride * 2);
-	return TRUE;
-    case 32:
-	pixman_composite_src_8888_8888_asm_neon (
-		width, height,
-		(uint32_t *)(((char *) dst_bits) +
-		dest_y * dst_stride * 4 + dest_x * 4), dst_stride,
-		(uint32_t *)(((char *) src_bits) +
-		src_y * src_stride * 4 + src_x * 4), src_stride);
-	return TRUE;
-    default:
-	return FALSE;
-    }
-}
-
-static const pixman_fast_path_t arm_neon_fast_paths[] =
-{
-    PIXMAN_STD_FAST_PATH (SRC,  r5g6b5,   null,     r5g6b5,   neon_composite_src_0565_0565),
-    PIXMAN_STD_FAST_PATH (SRC,  b5g6r5,   null,     b5g6r5,   neon_composite_src_0565_0565),
-    PIXMAN_STD_FAST_PATH (SRC,  a8r8g8b8, null,     r5g6b5,   neon_composite_src_8888_0565),
-    PIXMAN_STD_FAST_PATH (SRC,  x8r8g8b8, null,     r5g6b5,   neon_composite_src_8888_0565),
-    PIXMAN_STD_FAST_PATH (SRC,  a8b8g8r8, null,     b5g6r5,   neon_composite_src_8888_0565),
-    PIXMAN_STD_FAST_PATH (SRC,  x8b8g8r8, null,     b5g6r5,   neon_composite_src_8888_0565),
-    PIXMAN_STD_FAST_PATH (SRC,  r5g6b5,   null,     a8r8g8b8, neon_composite_src_0565_8888),
-    PIXMAN_STD_FAST_PATH (SRC,  r5g6b5,   null,     x8r8g8b8, neon_composite_src_0565_8888),
-    PIXMAN_STD_FAST_PATH (SRC,  b5g6r5,   null,     a8b8g8r8, neon_composite_src_0565_8888),
-    PIXMAN_STD_FAST_PATH (SRC,  b5g6r5,   null,     x8b8g8r8, neon_composite_src_0565_8888),
-    PIXMAN_STD_FAST_PATH (SRC,  a8r8g8b8, null,     x8r8g8b8, neon_composite_src_8888_8888),
-    PIXMAN_STD_FAST_PATH (SRC,  x8r8g8b8, null,     x8r8g8b8, neon_composite_src_8888_8888),
-    PIXMAN_STD_FAST_PATH (SRC,  a8b8g8r8, null,     x8b8g8r8, neon_composite_src_8888_8888),
-    PIXMAN_STD_FAST_PATH (SRC,  x8b8g8r8, null,     x8b8g8r8, neon_composite_src_8888_8888),
-    PIXMAN_STD_FAST_PATH (SRC,  a8r8g8b8, null,     a8r8g8b8, neon_composite_src_8888_8888),
-    PIXMAN_STD_FAST_PATH (SRC,  a8b8g8r8, null,     a8b8g8r8, neon_composite_src_8888_8888),
-    PIXMAN_STD_FAST_PATH (SRC,  x8r8g8b8, null,     a8r8g8b8, neon_composite_src_x888_8888),
-    PIXMAN_STD_FAST_PATH (SRC,  x8b8g8r8, null,     a8b8g8r8, neon_composite_src_x888_8888),
-    PIXMAN_STD_FAST_PATH (SRC,  r8g8b8,   null,     r8g8b8,   neon_composite_src_0888_0888),
-    PIXMAN_STD_FAST_PATH (SRC,  b8g8r8,   null,     x8r8g8b8, neon_composite_src_0888_8888_rev),
-    PIXMAN_STD_FAST_PATH (SRC,  b8g8r8,   null,     r5g6b5,   neon_composite_src_0888_0565_rev),
-    PIXMAN_STD_FAST_PATH (SRC,  pixbuf,   pixbuf,   a8r8g8b8, neon_composite_src_pixbuf_8888),
-    PIXMAN_STD_FAST_PATH (SRC,  pixbuf,   pixbuf,   a8b8g8r8, neon_composite_src_rpixbuf_8888),
-    PIXMAN_STD_FAST_PATH (SRC,  rpixbuf,  rpixbuf,  a8r8g8b8, neon_composite_src_rpixbuf_8888),
-    PIXMAN_STD_FAST_PATH (SRC,  rpixbuf,  rpixbuf,  a8b8g8r8, neon_composite_src_pixbuf_8888),
-    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8,       neon_composite_over_n_8_8),
-    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       r5g6b5,   neon_composite_over_n_8_0565),
-    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       b5g6r5,   neon_composite_over_n_8_0565),
-    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8r8g8b8, neon_composite_over_n_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       x8r8g8b8, neon_composite_over_n_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8b8g8r8, neon_composite_over_n_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       x8b8g8r8, neon_composite_over_n_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     r5g6b5,   neon_composite_over_n_0565),
-    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     a8r8g8b8, neon_composite_over_n_8888),
-    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     x8r8g8b8, neon_composite_over_n_8888),
-    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, neon_composite_over_n_8888_8888_ca),
-    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, neon_composite_over_n_8888_8888_ca),
-    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, neon_composite_over_n_8888_8888_ca),
-    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, neon_composite_over_n_8888_8888_ca),
-    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5,   neon_composite_over_n_8888_0565_ca),
-    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5,   neon_composite_over_n_8888_0565_ca),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    a8r8g8b8, neon_composite_over_8888_n_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    x8r8g8b8, neon_composite_over_8888_n_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    r5g6b5,   neon_composite_over_8888_n_0565),
-    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid,    b5g6r5,   neon_composite_over_8888_n_0565),
-    PIXMAN_STD_FAST_PATH (OVER, r5g6b5,   solid,    r5g6b5,   neon_composite_over_0565_n_0565),
-    PIXMAN_STD_FAST_PATH (OVER, b5g6r5,   solid,    b5g6r5,   neon_composite_over_0565_n_0565),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       a8r8g8b8, neon_composite_over_8888_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       x8r8g8b8, neon_composite_over_8888_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       a8b8g8r8, neon_composite_over_8888_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       x8b8g8r8, neon_composite_over_8888_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       r5g6b5,   neon_composite_over_8888_8_0565),
-    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       b5g6r5,   neon_composite_over_8888_8_0565),
-    PIXMAN_STD_FAST_PATH (OVER, r5g6b5,   a8,       r5g6b5,   neon_composite_over_0565_8_0565),
-    PIXMAN_STD_FAST_PATH (OVER, b5g6r5,   a8,       b5g6r5,   neon_composite_over_0565_8_0565),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_over_8888_8888_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     r5g6b5,   neon_composite_over_8888_0565),
-    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     b5g6r5,   neon_composite_over_8888_0565),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     a8r8g8b8, neon_composite_over_8888_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     x8r8g8b8, neon_composite_over_8888_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     a8b8g8r8, neon_composite_over_8888_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     x8b8g8r8, neon_composite_over_8888_8888),
-    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null,     a8r8g8b8, neon_composite_src_x888_8888),
-    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null,     a8b8g8r8, neon_composite_src_x888_8888),
-    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8,       neon_composite_add_n_8_8),
-    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8r8g8b8, neon_composite_add_n_8_8888),
-    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8b8g8r8, neon_composite_add_n_8_8888),
-    PIXMAN_STD_FAST_PATH (ADD,  a8,       a8,       a8,       neon_composite_add_8_8_8),
-    PIXMAN_STD_FAST_PATH (ADD,  r5g6b5,   a8,       r5g6b5,   neon_composite_add_0565_8_0565),
-    PIXMAN_STD_FAST_PATH (ADD,  b5g6r5,   a8,       b5g6r5,   neon_composite_add_0565_8_0565),
-    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8,       a8r8g8b8, neon_composite_add_8888_8_8888),
-    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, a8,       a8b8g8r8, neon_composite_add_8888_8_8888),
-    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_add_8888_8888_8888),
-    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, solid,    a8r8g8b8, neon_composite_add_8888_n_8888),
-    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, solid,    a8b8g8r8, neon_composite_add_8888_n_8888),
-    PIXMAN_STD_FAST_PATH (ADD,  a8,       null,     a8,       neon_composite_add_8_8),
-    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, null,     a8r8g8b8, neon_composite_add_8888_8888),
-    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, null,     a8b8g8r8, neon_composite_add_8888_8888),
-    PIXMAN_STD_FAST_PATH (IN,   solid,    null,     a8,       neon_composite_in_n_8),
-    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, neon_composite_over_reverse_n_8888),
-    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, neon_composite_over_reverse_n_8888),
-    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, r5g6b5,   neon_composite_out_reverse_8_0565),
-    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, b5g6r5,   neon_composite_out_reverse_8_0565),
-    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, a8r8g8b8, neon_composite_out_reverse_8_8888),
-    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, a8b8g8r8, neon_composite_out_reverse_8_8888),
-
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, neon_8888_8888),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, neon_8888_8888),
-
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_0565),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_0565),
-
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, b5g6r5, neon_8888_0565),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, b5g6r5, neon_8888_0565),
-
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, x8b8g8r8, neon_0565_8888),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_8888),
-    /* Note: NONE repeat is not supported yet */
-    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, r5g6b5, a8r8g8b8, neon_0565_8888),
-    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, b5g6r5, a8b8g8r8, neon_0565_8888),
-    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, r5g6b5, a8r8g8b8, neon_0565_8888),
-    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, b5g6r5, a8b8g8r8, neon_0565_8888),
-
-    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_8_0565),
-    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_8_0565),
-
-    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, neon_0565_8_0565),
-    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, neon_0565_8_0565),
-
-    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8888),
-    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8888),
-    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8888),
-
-    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565),
-    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565),
-
-    SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_x888),
-    SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_0565),
-
-    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),
-    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888),
-
-    SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8888),
-    SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8888),
-
-    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
-    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
-    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8_8888),
-
-    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_8_0565),
-    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_8_0565),
-
-    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_8_x888),
-    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_8_0565),
-
-    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
-    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
-
-    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
-    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
-
-    { PIXMAN_OP_NONE },
-};
-
-static pixman_bool_t
-arm_neon_blt (pixman_implementation_t *imp,
-              uint32_t *               src_bits,
-              uint32_t *               dst_bits,
-              int                      src_stride,
-              int                      dst_stride,
-              int                      src_bpp,
-              int                      dst_bpp,
-              int                      src_x,
-              int                      src_y,
-              int                      dest_x,
-              int                      dest_y,
-              int                      width,
-              int                      height)
-{
-    if (!pixman_blt_neon (
-            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
-            src_x, src_y, dest_x, dest_y, width, height))
-
-    {
-	return _pixman_implementation_blt (
-	    imp->delegate,
-	    src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
-	    src_x, src_y, dest_x, dest_y, width, height);
-    }
-
-    return TRUE;
-}
-
-static pixman_bool_t
-arm_neon_fill (pixman_implementation_t *imp,
-               uint32_t *               bits,
-               int                      stride,
-               int                      bpp,
-               int                      x,
-               int                      y,
-               int                      width,
-               int                      height,
-               uint32_t xor)
-{
-    if (pixman_fill_neon (bits, stride, bpp, x, y, width, height, xor))
-	return TRUE;
-
-    return _pixman_implementation_fill (
-	imp->delegate, bits, stride, bpp, x, y, width, height, xor);
-}
-
-#define BIND_COMBINE_U(name)                                             \
-void                                                                     \
-pixman_composite_scanline_##name##_mask_asm_neon (int32_t         w,     \
-                                                  const uint32_t *dst,   \
-                                                  const uint32_t *src,   \
-                                                  const uint32_t *mask); \
-                                                                         \
-void                                                                     \
-pixman_composite_scanline_##name##_asm_neon (int32_t         w,          \
-                                             const uint32_t *dst,        \
-                                             const uint32_t *src);       \
-                                                                         \
-static void                                                              \
-neon_combine_##name##_u (pixman_implementation_t *imp,                   \
-                         pixman_op_t              op,                    \
-                         uint32_t *               dest,                  \
-                         const uint32_t *         src,                   \
-                         const uint32_t *         mask,                  \
-                         int                      width)                 \
-{                                                                        \
-    if (mask)                                                            \
-	pixman_composite_scanline_##name##_mask_asm_neon (width, dest,   \
-	                                                  src, mask);    \
-    else                                                                 \
-	pixman_composite_scanline_##name##_asm_neon (width, dest, src);  \
-}
-
-BIND_COMBINE_U (over)
-BIND_COMBINE_U (add)
-BIND_COMBINE_U (out_reverse)
-
-pixman_implementation_t *
-_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback)
-{
-    pixman_implementation_t *imp =
-	_pixman_implementation_create (fallback, arm_neon_fast_paths);
-
-    imp->combine_32[PIXMAN_OP_OVER] = neon_combine_over_u;
-    imp->combine_32[PIXMAN_OP_ADD] = neon_combine_add_u;
-    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = neon_combine_out_reverse_u;
-
-    imp->blt = arm_neon_blt;
-    imp->fill = arm_neon_fill;
-
-    return imp;
-}
+/*
+ * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of ARM Ltd not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  ARM Ltd makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Ian Rickards (ian.rickards@arm.com)
+ * Author:  Jonathan Morton (jonathan.morton@movial.com)
+ * Author:  Markku Vire (markku.vire@movial.com)
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <string.h>
+#include "pixman-private.h"
+#include "pixman-arm-common.h"
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_8888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_x888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0565_0565,
+                                   uint16_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_0888,
+                                   uint8_t, 3, uint8_t, 3)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_8888_0565,
+                                   uint32_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0565_8888,
+                                   uint16_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_8888_rev,
+                                   uint8_t, 3, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_0565_rev,
+                                   uint8_t, 3, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_pixbuf_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_rpixbuf_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8_8,
+                                   uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, over_8888_0565,
+                                   uint32_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, over_8888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, out_reverse_8_0565,
+                                   uint8_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, out_reverse_8_8888,
+                                   uint8_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_n_0565,
+                                 uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_n_8888,
+                                 uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_reverse_n_8888,
+                                 uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (0, neon, in_n_8,
+                                 uint8_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_0565,
+                                      uint8_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_8888,
+                                      uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8888_8888_ca,
+                                      uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8888_0565_ca,
+				      uint32_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_8,
+                                      uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8,
+                                      uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888,
+                                      uint8_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888,
+                                     uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_0565,
+                                     uint32_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_0565_n_0565,
+                                     uint16_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, add_8888_n_8888,
+                                     uint32_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8_8_8,
+                                        uint8_t, 1, uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_0565_8_0565,
+                                        uint16_t, 1, uint8_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8888_8_8888,
+                                        uint32_t, 1, uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8888_8888_8888,
+                                        uint32_t, 1, uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8_8888,
+                                        uint32_t, 1, uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8888_8888,
+                                        uint32_t, 1, uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8_0565,
+                                        uint32_t, 1, uint8_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_0565_8_0565,
+                                        uint16_t, 1, uint8_t, 1, uint16_t, 1)
+
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_8888, OVER,
+                                        uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, OVER,
+                                        uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, SRC,
+                                        uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 0565_8888, SRC,
+                                        uint16_t, uint32_t)
+
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_0565,
+                                           OVER, uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 0565_8_0565,
+                                           OVER, uint16_t, uint16_t)
+
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_8888, SRC,
+                                         uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_0565, SRC,
+                                         uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC,
+                                         uint16_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_0565, SRC,
+                                         uint16_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, OVER,
+                                         uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, ADD,
+                                         uint32_t, uint32_t)
+
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_8888, SRC,
+                                            uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_0565, SRC,
+                                            uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 0565_8_x888, SRC,
+                                            uint16_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 0565_8_0565, SRC,
+                                            uint16_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_8888, OVER,
+                                            uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_8888, ADD,
+                                            uint32_t, uint32_t)
+
+void /* 8 bpp solid-fill routine; only declared here, called by pixman_fill_neon */
+pixman_composite_src_n_8_asm_neon (int32_t   w,
+                                   int32_t   h,
+                                   uint8_t  *dst,
+                                   int32_t   dst_stride, /* pixman_fill_neon passes the row pitch in bytes */
+                                   uint8_t   src);
+
+void /* 16 bpp solid-fill routine; only declared here, called by pixman_fill_neon */
+pixman_composite_src_n_0565_asm_neon (int32_t   w,
+                                      int32_t   h,
+                                      uint16_t *dst,
+                                      int32_t   dst_stride, /* pixman_fill_neon passes the byte pitch divided by 2 */
+                                      uint16_t  src);
+
+void /* 32 bpp solid-fill routine; only declared here, called by pixman_fill_neon */
+pixman_composite_src_n_8888_asm_neon (int32_t   w,
+                                      int32_t   h,
+                                      uint32_t *dst,
+                                      int32_t   dst_stride, /* pixman_fill_neon passes the byte pitch divided by 4 */
+                                      uint32_t  src);
+
+static pixman_bool_t /* TRUE when the fill was dispatched here, FALSE for an unsupported bpp */
+pixman_fill_neon (uint32_t *bits,   /* base address of the destination buffer */
+                  int       stride, /* row pitch in uint32_t units (converted to bytes below) */
+                  int       bpp,    /* bits per pixel; only 8, 16 and 32 are handled */
+                  int       x,      /* left edge in pixels (scaled by the pixel size below) */
+                  int       y,      /* top edge in rows (multiplied by the byte pitch below) */
+                  int       width,  /* forwarded unchanged as the routine's w argument */
+                  int       height, /* forwarded unchanged as the routine's h argument */
+                  uint32_t  _xor)   /* fill value; masked down to the pixel size for 8/16 bpp */
+{
+    /* stride is always multiple of 32bit units in pixman */
+    uint32_t byte_stride = stride * sizeof(uint32_t);
+
+    switch (bpp)
+    {
+    case 8: /* 1 byte per pixel: x is used directly as a byte offset */
+	pixman_composite_src_n_8_asm_neon (
+		width,
+		height,
+		(uint8_t *)(((char *) bits) + y * byte_stride + x),
+		byte_stride,
+		_xor & 0xff); /* keep only the low 8 bits of the fill value */
+	return TRUE;
+    case 16: /* 2 bytes per pixel */
+	pixman_composite_src_n_0565_asm_neon (
+		width,
+		height,
+		(uint16_t *)(((char *) bits) + y * byte_stride + x * 2),
+		byte_stride / 2, /* byte pitch divided by the 2-byte pixel size */
+		_xor & 0xffff); /* keep only the low 16 bits of the fill value */
+	return TRUE;
+    case 32: /* 4 bytes per pixel */
+	pixman_composite_src_n_8888_asm_neon (
+		width,
+		height,
+		(uint32_t *)(((char *) bits) + y * byte_stride + x * 4),
+		byte_stride / 4, /* byte pitch divided by the 4-byte pixel size */
+		_xor);
+	return TRUE;
+    default: /* any other depth is reported as not handled */
+	return FALSE;
+    }
+}
+
+static pixman_bool_t /* TRUE when the copy was dispatched here, FALSE if the depth combination is not handled */
+pixman_blt_neon (uint32_t *src_bits,
+                 uint32_t *dst_bits,
+                 int       src_stride, /* source row pitch; multiplied by 4 below to get a byte offset */
+                 int       dst_stride, /* destination row pitch; multiplied by 4 below to get a byte offset */
+                 int       src_bpp,
+                 int       dst_bpp,
+                 int       src_x,      /* pixel coordinates; scaled by the pixel size below */
+                 int       src_y,
+                 int       dest_x,
+                 int       dest_y,
+                 int       width,
+                 int       height)
+{
+    if (src_bpp != dst_bpp) /* copies between different depths are not handled here */
+	return FALSE;
+
+    switch (src_bpp)
+    {
+    case 16: /* 2 bytes per pixel */
+	pixman_composite_src_0565_0565_asm_neon (
+		width, height,
+		(uint16_t *)(((char *) dst_bits) +
+		dest_y * dst_stride * 4 + dest_x * 2), dst_stride * 2, /* pitch rescaled from 4-byte to 2-byte units */
+		(uint16_t *)(((char *) src_bits) +
+		src_y * src_stride * 4 + src_x * 2), src_stride * 2); /* same rescaling for the source pitch */
+	return TRUE;
+    case 32: /* 4 bytes per pixel: pitches are passed through unchanged */
+	pixman_composite_src_8888_8888_asm_neon (
+		width, height,
+		(uint32_t *)(((char *) dst_bits) +
+		dest_y * dst_stride * 4 + dest_x * 4), dst_stride,
+		(uint32_t *)(((char *) src_bits) +
+		src_y * src_stride * 4 + src_x * 4), src_stride);
+	return TRUE;
+    default: /* other depths (including 8 bpp) are reported as not handled */
+	return FALSE;
+    }
+}
+
+static const pixman_fast_path_t arm_neon_fast_paths[] =
+{
+    PIXMAN_STD_FAST_PATH (SRC,  r5g6b5,   null,     r5g6b5,   neon_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  b5g6r5,   null,     b5g6r5,   neon_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  a8r8g8b8, null,     r5g6b5,   neon_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  x8r8g8b8, null,     r5g6b5,   neon_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  a8b8g8r8, null,     b5g6r5,   neon_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  x8b8g8r8, null,     b5g6r5,   neon_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  r5g6b5,   null,     a8r8g8b8, neon_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  r5g6b5,   null,     x8r8g8b8, neon_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  b5g6r5,   null,     a8b8g8r8, neon_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  b5g6r5,   null,     x8b8g8r8, neon_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  a8r8g8b8, null,     x8r8g8b8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  x8r8g8b8, null,     x8r8g8b8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  a8b8g8r8, null,     x8b8g8r8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  x8b8g8r8, null,     x8b8g8r8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  a8r8g8b8, null,     a8r8g8b8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  a8b8g8r8, null,     a8b8g8r8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  x8r8g8b8, null,     a8r8g8b8, neon_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  x8b8g8r8, null,     a8b8g8r8, neon_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  r8g8b8,   null,     r8g8b8,   neon_composite_src_0888_0888),
+    PIXMAN_STD_FAST_PATH (SRC,  b8g8r8,   null,     x8r8g8b8, neon_composite_src_0888_8888_rev),
+    PIXMAN_STD_FAST_PATH (SRC,  b8g8r8,   null,     r5g6b5,   neon_composite_src_0888_0565_rev),
+    PIXMAN_STD_FAST_PATH (SRC,  pixbuf,   pixbuf,   a8r8g8b8, neon_composite_src_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  pixbuf,   pixbuf,   a8b8g8r8, neon_composite_src_rpixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  rpixbuf,  rpixbuf,  a8r8g8b8, neon_composite_src_rpixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  rpixbuf,  rpixbuf,  a8b8g8r8, neon_composite_src_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8,       neon_composite_over_n_8_8),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       r5g6b5,   neon_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       b5g6r5,   neon_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8r8g8b8, neon_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       x8r8g8b8, neon_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8b8g8r8, neon_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       x8b8g8r8, neon_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     r5g6b5,   neon_composite_over_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     a8r8g8b8, neon_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     x8r8g8b8, neon_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, neon_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, neon_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, neon_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, neon_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5,   neon_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5,   neon_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    a8r8g8b8, neon_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    x8r8g8b8, neon_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    r5g6b5,   neon_composite_over_8888_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid,    b5g6r5,   neon_composite_over_8888_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, r5g6b5,   solid,    r5g6b5,   neon_composite_over_0565_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, b5g6r5,   solid,    b5g6r5,   neon_composite_over_0565_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       a8r8g8b8, neon_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       x8r8g8b8, neon_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       a8b8g8r8, neon_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       x8b8g8r8, neon_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       r5g6b5,   neon_composite_over_8888_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       b5g6r5,   neon_composite_over_8888_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, r5g6b5,   a8,       r5g6b5,   neon_composite_over_0565_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, b5g6r5,   a8,       b5g6r5,   neon_composite_over_0565_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_over_8888_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     r5g6b5,   neon_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     b5g6r5,   neon_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     a8r8g8b8, neon_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     x8r8g8b8, neon_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     a8b8g8r8, neon_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     x8b8g8r8, neon_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null,     a8r8g8b8, neon_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null,     a8b8g8r8, neon_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8,       neon_composite_add_n_8_8),
+    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8r8g8b8, neon_composite_add_n_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8b8g8r8, neon_composite_add_n_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8,       a8,       a8,       neon_composite_add_8_8_8),
+    PIXMAN_STD_FAST_PATH (ADD,  r5g6b5,   a8,       r5g6b5,   neon_composite_add_0565_8_0565),
+    PIXMAN_STD_FAST_PATH (ADD,  b5g6r5,   a8,       b5g6r5,   neon_composite_add_0565_8_0565),
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8,       a8r8g8b8, neon_composite_add_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, a8,       a8b8g8r8, neon_composite_add_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_add_8888_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, solid,    a8r8g8b8, neon_composite_add_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, solid,    a8b8g8r8, neon_composite_add_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8,       null,     a8,       neon_composite_add_8_8),
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, null,     a8r8g8b8, neon_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, null,     a8b8g8r8, neon_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (IN,   solid,    null,     a8,       neon_composite_in_n_8),
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, neon_composite_over_reverse_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, neon_composite_over_reverse_n_8888),
+    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, r5g6b5,   neon_composite_out_reverse_8_0565),
+    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, b5g6r5,   neon_composite_out_reverse_8_0565),
+    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, a8r8g8b8, neon_composite_out_reverse_8_8888),
+    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, a8b8g8r8, neon_composite_out_reverse_8_8888),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, neon_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, neon_8888_8888),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_0565),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, b5g6r5, neon_8888_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, b5g6r5, neon_8888_0565),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, x8b8g8r8, neon_0565_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_8888),
+    /* Note: NONE repeat is not supported yet */
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, r5g6b5, a8r8g8b8, neon_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, b5g6r5, a8b8g8r8, neon_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, r5g6b5, a8r8g8b8, neon_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, b5g6r5, a8b8g8r8, neon_0565_8888),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_8_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_8_0565),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, neon_0565_8_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, neon_0565_8_0565),
+
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8888),
+
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565),
+
+    SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_x888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_0565),
+
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+
+    SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_8_0565),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_8_0565),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_8_x888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_8_0565),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+
+    { PIXMAN_OP_NONE },
+};
+
+static pixman_bool_t /* blt hook: tries pixman_blt_neon first, otherwise returns the delegate's result */
+arm_neon_blt (pixman_implementation_t *imp,
+              uint32_t *               src_bits,
+              uint32_t *               dst_bits,
+              int                      src_stride,
+              int                      dst_stride,
+              int                      src_bpp,
+              int                      dst_bpp,
+              int                      src_x,
+              int                      src_y,
+              int                      dest_x,
+              int                      dest_y,
+              int                      width,
+              int                      height)
+{
+    if (!pixman_blt_neon ( /* FALSE means the depth combination was not handled */
+            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+            src_x, src_y, dest_x, dest_y, width, height))
+
+    {
+	return _pixman_implementation_blt ( /* forward the identical request to imp->delegate */
+	    imp->delegate,
+	    src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+	    src_x, src_y, dest_x, dest_y, width, height);
+    }
+
+    return TRUE; /* pixman_blt_neon performed the copy */
+}
+
+static pixman_bool_t /* fill hook: tries pixman_fill_neon first, otherwise returns the delegate's result */
+arm_neon_fill (pixman_implementation_t *imp,
+               uint32_t *               bits,
+               int                      stride,
+               int                      bpp,
+               int                      x,
+               int                      y,
+               int                      width,
+               int                      height,
+               uint32_t xor)
+{
+    if (pixman_fill_neon (bits, stride, bpp, x, y, width, height, xor)) /* TRUE for bpp 8, 16 and 32 */
+	return TRUE;
+
+    return _pixman_implementation_fill ( /* unsupported bpp: forward the identical request to imp->delegate */
+	imp->delegate, bits, stride, bpp, x, y, width, height, xor);
+}
+
+#define BIND_COMBINE_U(name) /* prototypes + mask/no-mask dispatcher */  \
+void                                                                     \
+pixman_composite_scanline_##name##_mask_asm_neon (int32_t         w,     \
+                                                  const uint32_t *dst,   \
+                                                  const uint32_t *src,   \
+                                                  const uint32_t *mask); \
+                                                                         \
+void                                                                     \
+pixman_composite_scanline_##name##_asm_neon (int32_t         w,          \
+                                             const uint32_t *dst,        \
+                                             const uint32_t *src);       \
+                                                                         \
+static void                                                              \
+neon_combine_##name##_u (pixman_implementation_t *imp,                   \
+                         pixman_op_t              op,                    \
+                         uint32_t *               dest,                  \
+                         const uint32_t *         src,                   \
+                         const uint32_t *         mask,                  \
+                         int                      width)                 \
+{                                                                        \
+    if (mask) /* mask row given: call the masked asm scanline */         \
+	pixman_composite_scanline_##name##_mask_asm_neon (width, dest,   \
+	                                                  src, mask);    \
+    else                                                                 \
+	pixman_composite_scanline_##name##_asm_neon (width, dest, src);  \
+}
+
+BIND_COMBINE_U (over) /* expands to neon_combine_over_u and its two scanline prototypes */
+BIND_COMBINE_U (add) /* expands to neon_combine_add_u and its two scanline prototypes */
+BIND_COMBINE_U (out_reverse) /* expands to neon_combine_out_reverse_u and its two scanline prototypes */
+
+pixman_implementation_t * /* returns the implementation object built below */
+_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp =
+	_pixman_implementation_create (fallback, arm_neon_fast_paths); /* fallback and the fast-path table go to the generic constructor */
+
+    imp->combine_32[PIXMAN_OP_OVER] = neon_combine_over_u; /* wrappers generated by BIND_COMBINE_U */
+    imp->combine_32[PIXMAN_OP_ADD] = neon_combine_add_u;
+    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = neon_combine_out_reverse_u;
+
+    imp->blt = arm_neon_blt; /* both hooks fall back to imp->delegate when the request is not handled */
+    imp->fill = arm_neon_fill;
+
+    return imp;
+}
diff --git a/pixman/pixman/pixman-combine.c.template b/pixman/pixman/pixman-combine.c.template
index 8c1dd647d..806a18498 100644
--- a/pixman/pixman/pixman-combine.c.template
+++ b/pixman/pixman/pixman-combine.c.template
@@ -1,2460 +1,2460 @@
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#include <math.h>
-#include <string.h>
-
-#include "pixman-private.h"
-
-#include "pixman-combine.h"
-
-/*** per channel helper functions ***/
-
-static void
-combine_mask_ca (comp4_t *src, comp4_t *mask)
-{
-    comp4_t a = *mask;
-
-    comp4_t x;
-    comp2_t xa;
-
-    if (!a)
-    {
-	*(src) = 0;
-	return;
-    }
-
-    x = *(src);
-    if (a == ~0)
-    {
-	x = x >> A_SHIFT;
-	x |= x << G_SHIFT;
-	x |= x << R_SHIFT;
-	*(mask) = x;
-	return;
-    }
-
-    xa = x >> A_SHIFT;
-    UNcx4_MUL_UNcx4 (x, a);
-    *(src) = x;
-    
-    UNcx4_MUL_UNc (a, xa);
-    *(mask) = a;
-}
-
-static void
-combine_mask_value_ca (comp4_t *src, const comp4_t *mask)
-{
-    comp4_t a = *mask;
-    comp4_t x;
-
-    if (!a)
-    {
-	*(src) = 0;
-	return;
-    }
-
-    if (a == ~0)
-	return;
-
-    x = *(src);
-    UNcx4_MUL_UNcx4 (x, a);
-    *(src) = x;
-}
-
-static void
-combine_mask_alpha_ca (const comp4_t *src, comp4_t *mask)
-{
-    comp4_t a = *(mask);
-    comp4_t x;
-
-    if (!a)
-	return;
-
-    x = *(src) >> A_SHIFT;
-    if (x == MASK)
-	return;
-
-    if (a == ~0)
-    {
-	x |= x << G_SHIFT;
-	x |= x << R_SHIFT;
-	*(mask) = x;
-	return;
-    }
-
-    UNcx4_MUL_UNc (a, x);
-    *(mask) = a;
-}
-
-/*
- * There are two ways of handling alpha -- either as a single unified value or
- * a separate value for each component, hence each macro must have two
- * versions.  The unified alpha version has a 'U' at the end of the name,
- * the component version has a 'C'.  Similarly, functions which deal with
- * this difference will have two versions using the same convention.
- */
-
-/*
- * All of the composing functions
- */
-
-static force_inline comp4_t
-combine_mask (const comp4_t *src, const comp4_t *mask, int i)
-{
-    comp4_t s, m;
-
-    if (mask)
-    {
-	m = *(mask + i) >> A_SHIFT;
-
-	if (!m)
-	    return 0;
-    }
-
-    s = *(src + i);
-
-    if (mask)
-	UNcx4_MUL_UNc (s, m);
-
-    return s;
-}
-
-static void
-combine_clear (pixman_implementation_t *imp,
-               pixman_op_t              op,
-               comp4_t *                dest,
-               const comp4_t *          src,
-               const comp4_t *          mask,
-               int                      width)
-{
-    memset (dest, 0, width * sizeof(comp4_t));
-}
-
-static void
-combine_dst (pixman_implementation_t *imp,
-	     pixman_op_t	      op,
-	     comp4_t *		      dest,
-	     const comp4_t *	      src,
-	     const comp4_t *          mask,
-	     int		      width)
-{
-    return;
-}
-
-static void
-combine_src_u (pixman_implementation_t *imp,
-               pixman_op_t              op,
-               comp4_t *                dest,
-               const comp4_t *          src,
-               const comp4_t *          mask,
-               int                      width)
-{
-    int i;
-
-    if (!mask)
-	memcpy (dest, src, width * sizeof (comp4_t));
-    else
-    {
-	for (i = 0; i < width; ++i)
-	{
-	    comp4_t s = combine_mask (src, mask, i);
-
-	    *(dest + i) = s;
-	}
-    }
-}
-
-/* if the Src is opaque, call combine_src_u */
-static void
-combine_over_u (pixman_implementation_t *imp,
-                pixman_op_t              op,
-                comp4_t *                dest,
-                const comp4_t *          src,
-                const comp4_t *          mask,
-                int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t d = *(dest + i);
-	comp4_t ia = ALPHA_c (~s);
-
-	UNcx4_MUL_UNc_ADD_UNcx4 (d, ia, s);
-	*(dest + i) = d;
-    }
-}
-
-/* if the Dst is opaque, this is a noop */
-static void
-combine_over_reverse_u (pixman_implementation_t *imp,
-                        pixman_op_t              op,
-                        comp4_t *                dest,
-                        const comp4_t *          src,
-                        const comp4_t *          mask,
-                        int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t d = *(dest + i);
-	comp4_t ia = ALPHA_c (~*(dest + i));
-	UNcx4_MUL_UNc_ADD_UNcx4 (s, ia, d);
-	*(dest + i) = s;
-    }
-}
-
-/* if the Dst is opaque, call combine_src_u */
-static void
-combine_in_u (pixman_implementation_t *imp,
-              pixman_op_t              op,
-              comp4_t *                dest,
-              const comp4_t *          src,
-              const comp4_t *          mask,
-              int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t a = ALPHA_c (*(dest + i));
-	UNcx4_MUL_UNc (s, a);
-	*(dest + i) = s;
-    }
-}
-
-/* if the Src is opaque, this is a noop */
-static void
-combine_in_reverse_u (pixman_implementation_t *imp,
-                      pixman_op_t              op,
-                      comp4_t *                dest,
-                      const comp4_t *          src,
-                      const comp4_t *          mask,
-                      int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t d = *(dest + i);
-	comp4_t a = ALPHA_c (s);
-	UNcx4_MUL_UNc (d, a);
-	*(dest + i) = d;
-    }
-}
-
-/* if the Dst is opaque, call combine_clear */
-static void
-combine_out_u (pixman_implementation_t *imp,
-               pixman_op_t              op,
-               comp4_t *                dest,
-               const comp4_t *          src,
-               const comp4_t *          mask,
-               int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t a = ALPHA_c (~*(dest + i));
-	UNcx4_MUL_UNc (s, a);
-	*(dest + i) = s;
-    }
-}
-
-/* if the Src is opaque, call combine_clear */
-static void
-combine_out_reverse_u (pixman_implementation_t *imp,
-                       pixman_op_t              op,
-                       comp4_t *                dest,
-                       const comp4_t *          src,
-                       const comp4_t *          mask,
-                       int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t d = *(dest + i);
-	comp4_t a = ALPHA_c (~s);
-	UNcx4_MUL_UNc (d, a);
-	*(dest + i) = d;
-    }
-}
-
-/* if the Src is opaque, call combine_in_u */
-/* if the Dst is opaque, call combine_over_u */
-/* if both the Src and Dst are opaque, call combine_src_u */
-static void
-combine_atop_u (pixman_implementation_t *imp,
-                pixman_op_t              op,
-                comp4_t *                dest,
-                const comp4_t *          src,
-                const comp4_t *          mask,
-                int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t d = *(dest + i);
-	comp4_t dest_a = ALPHA_c (d);
-	comp4_t src_ia = ALPHA_c (~s);
-
-	UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (s, dest_a, d, src_ia);
-	*(dest + i) = s;
-    }
-}
-
-/* if the Src is opaque, call combine_over_reverse_u */
-/* if the Dst is opaque, call combine_in_reverse_u */
-/* if both the Src and Dst are opaque, call combine_dst_u */
-static void
-combine_atop_reverse_u (pixman_implementation_t *imp,
-                        pixman_op_t              op,
-                        comp4_t *                dest,
-                        const comp4_t *          src,
-                        const comp4_t *          mask,
-                        int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t d = *(dest + i);
-	comp4_t src_a = ALPHA_c (s);
-	comp4_t dest_ia = ALPHA_c (~d);
-
-	UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (s, dest_ia, d, src_a);
-	*(dest + i) = s;
-    }
-}
-
-/* if the Src is opaque, call combine_over_u */
-/* if the Dst is opaque, call combine_over_reverse_u */
-/* if both the Src and Dst are opaque, call combine_clear */
-static void
-combine_xor_u (pixman_implementation_t *imp,
-               pixman_op_t              op,
-               comp4_t *                dest,
-               const comp4_t *          src,
-               const comp4_t *          mask,
-               int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t d = *(dest + i);
-	comp4_t src_ia = ALPHA_c (~s);
-	comp4_t dest_ia = ALPHA_c (~d);
-
-	UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (s, dest_ia, d, src_ia);
-	*(dest + i) = s;
-    }
-}
-
-static void
-combine_add_u (pixman_implementation_t *imp,
-               pixman_op_t              op,
-               comp4_t *                dest,
-               const comp4_t *          src,
-               const comp4_t *          mask,
-               int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t d = *(dest + i);
-	UNcx4_ADD_UNcx4 (d, s);
-	*(dest + i) = d;
-    }
-}
-
-/* if the Src is opaque, call combine_add_u */
-/* if the Dst is opaque, call combine_add_u */
-/* if both the Src and Dst are opaque, call combine_add_u */
-static void
-combine_saturate_u (pixman_implementation_t *imp,
-                    pixman_op_t              op,
-                    comp4_t *                dest,
-                    const comp4_t *          src,
-                    const comp4_t *          mask,
-                    int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t d = *(dest + i);
-	comp2_t sa, da;
-
-	sa = s >> A_SHIFT;
-	da = ~d >> A_SHIFT;
-	if (sa > da)
-	{
-	    sa = DIV_UNc (da, sa);
-	    UNcx4_MUL_UNc (s, sa);
-	}
-	;
-	UNcx4_ADD_UNcx4 (d, s);
-	*(dest + i) = d;
-    }
-}
-
-/*
- * PDF blend modes:
- * The following blend modes have been taken from the PDF ISO 32000
- * specification, which at this point in time is available from
- * http://www.adobe.com/devnet/acrobat/pdfs/PDF32000_2008.pdf
- * The relevant chapters are 11.3.5 and 11.3.6.
- * The formula for computing the final pixel color given in 11.3.6 is:
- * αr × Cr = (1 – αs) × αb × Cb + (1 – αb) × αs × Cs + αb × αs × B(Cb, Cs)
- * with B() being the blend function.
- * Note that OVER is a special case of this operation, using B(Cb, Cs) = Cs
- *
- * These blend modes should match the SVG filter draft specification, as
- * it has been designed to mirror ISO 32000. Note that at the current point
- * no released draft exists that shows this, as the formulas have not been
- * updated yet after the release of ISO 32000.
- *
- * The default implementation here uses the PDF_SEPARABLE_BLEND_MODE and
- * PDF_NON_SEPARABLE_BLEND_MODE macros, which take the blend function as an
- * argument. Note that this implementation operates on premultiplied colors,
- * while the PDF specification does not. Therefore the code uses the formula
- * Cra = (1 – as) . Dca + (1 – ad) . Sca + B(Dca, ad, Sca, as)
- */
-
-/*
- * Multiply
- * B(Dca, ad, Sca, as) = Dca.Sca
- */
-
-static void
-combine_multiply_u (pixman_implementation_t *imp,
-                    pixman_op_t              op,
-                    comp4_t *                dest,
-                    const comp4_t *          src,
-                    const comp4_t *          mask,
-                    int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t d = *(dest + i);
-	comp4_t ss = s;
-	comp4_t src_ia = ALPHA_c (~s);
-	comp4_t dest_ia = ALPHA_c (~d);
-
-	UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (ss, dest_ia, d, src_ia);
-	UNcx4_MUL_UNcx4 (d, s);
-	UNcx4_ADD_UNcx4 (d, ss);
-
-	*(dest + i) = d;
-    }
-}
-
-static void
-combine_multiply_ca (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     comp4_t *                dest,
-                     const comp4_t *          src,
-                     const comp4_t *          mask,
-                     int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t m = *(mask + i);
-	comp4_t s = *(src + i);
-	comp4_t d = *(dest + i);
-	comp4_t r = d;
-	comp4_t dest_ia = ALPHA_c (~d);
-
-	combine_mask_value_ca (&s, &m);
-
-	UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (r, ~m, s, dest_ia);
-	UNcx4_MUL_UNcx4 (d, s);
-	UNcx4_ADD_UNcx4 (r, d);
-
-	*(dest + i) = r;
-    }
-}
-
-#define PDF_SEPARABLE_BLEND_MODE(name)					\
-    static void								\
-    combine_ ## name ## _u (pixman_implementation_t *imp,		\
-			    pixman_op_t              op,		\
-                            comp4_t *                dest,		\
-			    const comp4_t *          src,		\
-			    const comp4_t *          mask,		\
-			    int                      width)		\
-    {									\
-	int i;								\
-	for (i = 0; i < width; ++i) {					\
-	    comp4_t s = combine_mask (src, mask, i);			\
-	    comp4_t d = *(dest + i);					\
-	    comp1_t sa = ALPHA_c (s);					\
-	    comp1_t isa = ~sa;						\
-	    comp1_t da = ALPHA_c (d);					\
-	    comp1_t ida = ~da;						\
-	    comp4_t result;						\
-									\
-	    result = d;							\
-	    UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (result, isa, s, ida);	\
-	    								\
-	    *(dest + i) = result +					\
-		(DIV_ONE_UNc (sa * da) << A_SHIFT) +			\
-		(blend_ ## name (RED_c (d), da, RED_c (s), sa) << R_SHIFT) + \
-		(blend_ ## name (GREEN_c (d), da, GREEN_c (s), sa) << G_SHIFT) + \
-		(blend_ ## name (BLUE_c (d), da, BLUE_c (s), sa));	\
-	}								\
-    }									\
-    									\
-    static void								\
-    combine_ ## name ## _ca (pixman_implementation_t *imp,		\
-			     pixman_op_t              op,		\
-                             comp4_t *                dest,		\
-			     const comp4_t *          src,		\
-			     const comp4_t *          mask,		\
-			     int                     width)		\
-    {									\
-	int i;								\
-	for (i = 0; i < width; ++i) {					\
-	    comp4_t m = *(mask + i);					\
-	    comp4_t s = *(src + i);					\
-	    comp4_t d = *(dest + i);					\
-	    comp1_t da = ALPHA_c (d);					\
-	    comp1_t ida = ~da;						\
-	    comp4_t result;						\
-            								\
-	    combine_mask_value_ca (&s, &m);				\
-            								\
-	    result = d;							\
-	    UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (result, ~m, s, ida);     \
-            								\
-	    result +=							\
-	        (DIV_ONE_UNc (ALPHA_c (m) * da) << A_SHIFT) +		\
-	        (blend_ ## name (RED_c (d), da, RED_c (s), RED_c (m)) << R_SHIFT) + \
-	        (blend_ ## name (GREEN_c (d), da, GREEN_c (s), GREEN_c (m)) << G_SHIFT) + \
-	        (blend_ ## name (BLUE_c (d), da, BLUE_c (s), BLUE_c (m))); \
-	    								\
-	    *(dest + i) = result;					\
-	}								\
-    }
-
-/*
- * Screen
- * B(Dca, ad, Sca, as) = Dca.sa + Sca.da - Dca.Sca
- */
-static inline comp4_t
-blend_screen (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
-{
-    return DIV_ONE_UNc (sca * da + dca * sa - sca * dca);
-}
-
-PDF_SEPARABLE_BLEND_MODE (screen)
-
-/*
- * Overlay
- * B(Dca, Da, Sca, Sa) =
- *   if 2.Dca < Da
- *     2.Sca.Dca
- *   otherwise
- *     Sa.Da - 2.(Da - Dca).(Sa - Sca)
- */
-static inline comp4_t
-blend_overlay (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
-{
-    comp4_t rca;
-
-    if (2 * dca < da)
-	rca = 2 * sca * dca;
-    else
-	rca = sa * da - 2 * (da - dca) * (sa - sca);
-    return DIV_ONE_UNc (rca);
-}
-
-PDF_SEPARABLE_BLEND_MODE (overlay)
-
-/*
- * Darken
- * B(Dca, Da, Sca, Sa) = min (Sca.Da, Dca.Sa)
- */
-static inline comp4_t
-blend_darken (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
-{
-    comp4_t s, d;
-
-    s = sca * da;
-    d = dca * sa;
-    return DIV_ONE_UNc (s > d ? d : s);
-}
-
-PDF_SEPARABLE_BLEND_MODE (darken)
-
-/*
- * Lighten
- * B(Dca, Da, Sca, Sa) = max (Sca.Da, Dca.Sa)
- */
-static inline comp4_t
-blend_lighten (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
-{
-    comp4_t s, d;
-
-    s = sca * da;
-    d = dca * sa;
-    return DIV_ONE_UNc (s > d ? s : d);
-}
-
-PDF_SEPARABLE_BLEND_MODE (lighten)
-
-/*
- * Color dodge
- * B(Dca, Da, Sca, Sa) =
- *   if Dca == 0
- *     0
- *   if Sca == Sa
- *     Sa.Da
- *   otherwise
- *     Sa.Da. min (1, Dca / Da / (1 - Sca/Sa))
- */
-static inline comp4_t
-blend_color_dodge (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
-{
-    if (sca >= sa)
-    {
-	return dca == 0 ? 0 : DIV_ONE_UNc (sa * da);
-    }
-    else
-    {
-	comp4_t rca = dca * sa / (sa - sca);
-	return DIV_ONE_UNc (sa * MIN (rca, da));
-    }
-}
-
-PDF_SEPARABLE_BLEND_MODE (color_dodge)
-
-/*
- * Color burn
- * B(Dca, Da, Sca, Sa) =
- *   if Dca == Da
- *     Sa.Da
- *   if Sca == 0
- *     0
- *   otherwise
- *     Sa.Da.(1 - min (1, (1 - Dca/Da).Sa / Sca))
- */
-static inline comp4_t
-blend_color_burn (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
-{
-    if (sca == 0)
-    {
-	return dca < da ? 0 : DIV_ONE_UNc (sa * da);
-    }
-    else
-    {
-	comp4_t rca = (da - dca) * sa / sca;
-	return DIV_ONE_UNc (sa * (MAX (rca, da) - rca));
-    }
-}
-
-PDF_SEPARABLE_BLEND_MODE (color_burn)
-
-/*
- * Hard light
- * B(Dca, Da, Sca, Sa) =
- *   if 2.Sca < Sa
- *     2.Sca.Dca
- *   otherwise
- *     Sa.Da - 2.(Da - Dca).(Sa - Sca)
- */
-static inline comp4_t
-blend_hard_light (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
-{
-    if (2 * sca < sa)
-	return DIV_ONE_UNc (2 * sca * dca);
-    else
-	return DIV_ONE_UNc (sa * da - 2 * (da - dca) * (sa - sca));
-}
-
-PDF_SEPARABLE_BLEND_MODE (hard_light)
-
-/*
- * Soft light
- * B(Dca, Da, Sca, Sa) =
- *   if (2.Sca <= Sa)
- *     Dca.(Sa - (1 - Dca/Da).(2.Sca - Sa))
- *   otherwise if Dca.4 <= Da
- *     Dca.(Sa + (2.Sca - Sa).((16.Dca/Da - 12).Dca/Da + 3)
- *   otherwise
- *     (Dca.Sa + (SQRT (Dca/Da).Da - Dca).(2.Sca - Sa))
- */
-static inline comp4_t
-blend_soft_light (comp4_t dca_org,
-		  comp4_t da_org,
-		  comp4_t sca_org,
-		  comp4_t sa_org)
-{
-    double dca = dca_org * (1.0 / MASK);
-    double da = da_org * (1.0 / MASK);
-    double sca = sca_org * (1.0 / MASK);
-    double sa = sa_org * (1.0 / MASK);
-    double rca;
-
-    if (2 * sca < sa)
-    {
-	if (da == 0)
-	    rca = dca * sa;
-	else
-	    rca = dca * sa - dca * (da - dca) * (sa - 2 * sca) / da;
-    }
-    else if (da == 0)
-    {
-	rca = 0;
-    }
-    else if (4 * dca <= da)
-    {
-	rca = dca * sa +
-	    (2 * sca - sa) * dca * ((16 * dca / da - 12) * dca / da + 3);
-    }
-    else
-    {
-	rca = dca * sa + (sqrt (dca * da) - dca) * (2 * sca - sa);
-    }
-    return rca * MASK + 0.5;
-}
-
-PDF_SEPARABLE_BLEND_MODE (soft_light)
-
-/*
- * Difference
- * B(Dca, Da, Sca, Sa) = abs (Dca.Sa - Sca.Da)
- */
-static inline comp4_t
-blend_difference (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
-{
-    comp4_t dcasa = dca * sa;
-    comp4_t scada = sca * da;
-
-    if (scada < dcasa)
-	return DIV_ONE_UNc (dcasa - scada);
-    else
-	return DIV_ONE_UNc (scada - dcasa);
-}
-
-PDF_SEPARABLE_BLEND_MODE (difference)
-
-/*
- * Exclusion
- * B(Dca, Da, Sca, Sa) = (Sca.Da + Dca.Sa - 2.Sca.Dca)
- */
-
-/* This can be made faster by writing it directly and not using
- * PDF_SEPARABLE_BLEND_MODE, but that's a performance optimization */
-
-static inline comp4_t
-blend_exclusion (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
-{
-    return DIV_ONE_UNc (sca * da + dca * sa - 2 * dca * sca);
-}
-
-PDF_SEPARABLE_BLEND_MODE (exclusion)
-
-#undef PDF_SEPARABLE_BLEND_MODE
-
-/*
- * PDF nonseperable blend modes are implemented using the following functions
- * to operate in Hsl space, with Cmax, Cmid, Cmin referring to the max, mid
- * and min value of the red, green and blue components.
- *
- * LUM (C) = 0.3 × Cred + 0.59 × Cgreen + 0.11 × Cblue
- *
- * clip_color (C):
- *   l = LUM (C)
- *   min = Cmin
- *   max = Cmax
- *   if n < 0.0
- *     C = l + ( ( ( C – l ) × l ) ⁄ ( l – min ) )
- *   if x > 1.0
- *     C = l + ( ( ( C – l ) × ( 1 – l ) ) ⁄ ( max – l ) )
- *   return C
- *
- * set_lum (C, l):
- *   d = l – LUM (C)
- *   C += d
- *   return clip_color (C)
- *
- * SAT (C) = CH_MAX (C) - CH_MIN (C)
- *
- * set_sat (C, s):
- *  if Cmax > Cmin
- *    Cmid = ( ( ( Cmid – Cmin ) × s ) ⁄ ( Cmax – Cmin ) )
- *    Cmax = s
- *  else
- *    Cmid = Cmax = 0.0
- *  Cmin = 0.0
- *  return C
- */
-
-/* For premultiplied colors, we need to know what happens when C is
- * multiplied by a real number. LUM and SAT are linear:
- *
- *    LUM (r × C) = r × LUM (C)		SAT (r * C) = r * SAT (C)
- *
- * If we extend clip_color with an extra argument a and change
- *
- *        if x >= 1.0
- *
- * into
- *
- *        if x >= a
- *
- * then clip_color is also linear:
- *
- *    r * clip_color (C, a) = clip_color (r_c, ra);
- *
- * for positive r.
- *
- * Similarly, we can extend set_lum with an extra argument that is just passed
- * on to clip_color:
- *
- *   r * set_lum ( C, l, a)
- *
- *   = r × clip_color ( C + l - LUM (C), a)
- *
- *   = clip_color ( r * C + r × l - r * LUM (C), r * a)
- *
- *   = set_lum ( r * C, r * l, r * a)
- *
- * Finally, set_sat:
- *
- *    r * set_sat (C, s) = set_sat (x * C, r * s)
- *
- * The above holds for all non-zero x, because the x'es in the fraction for
- * C_mid cancel out. Specifically, it holds for x = r:
- *
- *    r * set_sat (C, s) = set_sat (r_c, rs)
- *
- */
-
-/* So, for the non-separable PDF blend modes, we have (using s, d for
- * non-premultiplied colors, and S, D for premultiplied:
- *
- *   Color:
- *
- *     a_s * a_d * B(s, d)
- *   = a_s * a_d * set_lum (S/a_s, LUM (D/a_d), 1)
- *   = set_lum (S * a_d, a_s * LUM (D), a_s * a_d)
- *
- *
- *   Luminosity:
- *
- *     a_s * a_d * B(s, d)
- *   = a_s * a_d * set_lum (D/a_d, LUM(S/a_s), 1)
- *   = set_lum (a_s * D, a_d * LUM(S), a_s * a_d)
- *
- *
- *   Saturation:
- *
- *     a_s * a_d * B(s, d)
- *   = a_s * a_d * set_lum (set_sat (D/a_d, SAT (S/a_s)), LUM (D/a_d), 1)
- *   = set_lum (a_s * a_d * set_sat (D/a_d, SAT (S/a_s)),
- *                                        a_s * LUM (D), a_s * a_d)
- *   = set_lum (set_sat (a_s * D, a_d * SAT (S), a_s * LUM (D), a_s * a_d))
- *
- *   Hue:
- *
- *     a_s * a_d * B(s, d)
- *   = a_s * a_d * set_lum (set_sat (S/a_s, SAT (D/a_d)), LUM (D/a_d), 1)
- *   = set_lum (set_sat (a_d * S, a_s * SAT (D)), a_s * LUM (D), a_s * a_d)
- *
- */
-
-#define CH_MIN(c) (c[0] < c[1] ? (c[0] < c[2] ? c[0] : c[2]) : (c[1] < c[2] ? c[1] : c[2]))
-#define CH_MAX(c) (c[0] > c[1] ? (c[0] > c[2] ? c[0] : c[2]) : (c[1] > c[2] ? c[1] : c[2]))
-#define LUM(c) ((c[0] * 30 + c[1] * 59 + c[2] * 11) / 100)
-#define SAT(c) (CH_MAX (c) - CH_MIN (c))
-
-#define PDF_NON_SEPARABLE_BLEND_MODE(name)				\
-    static void								\
-    combine_ ## name ## _u (pixman_implementation_t *imp,		\
-			    pixman_op_t op,				\
-                            comp4_t *dest,				\
-			    const comp4_t *src,				\
-			    const comp4_t *mask,			\
-			    int width)					\
-    {									\
-	int i;								\
-	for (i = 0; i < width; ++i)					\
-	{								\
-	    comp4_t s = combine_mask (src, mask, i);			\
-	    comp4_t d = *(dest + i);					\
-	    comp1_t sa = ALPHA_c (s);					\
-	    comp1_t isa = ~sa;						\
-	    comp1_t da = ALPHA_c (d);					\
-	    comp1_t ida = ~da;						\
-	    comp4_t result;						\
-	    comp4_t sc[3], dc[3], c[3];					\
-            								\
-	    result = d;							\
-	    UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (result, isa, s, ida);	\
-	    dc[0] = RED_c (d);						\
-	    sc[0] = RED_c (s);						\
-	    dc[1] = GREEN_c (d);					\
-	    sc[1] = GREEN_c (s);					\
-	    dc[2] = BLUE_c (d);						\
-	    sc[2] = BLUE_c (s);						\
-	    blend_ ## name (c, dc, da, sc, sa);				\
-            								\
-	    *(dest + i) = result +					\
-		(DIV_ONE_UNc (sa * da) << A_SHIFT) +			\
-		(DIV_ONE_UNc (c[0]) << R_SHIFT) +			\
-		(DIV_ONE_UNc (c[1]) << G_SHIFT) +			\
-		(DIV_ONE_UNc (c[2]));					\
-	}								\
-    }
-
-static void
-set_lum (comp4_t dest[3], comp4_t src[3], comp4_t sa, comp4_t lum)
-{
-    double a, l, min, max;
-    double tmp[3];
-
-    a = sa * (1.0 / MASK);
-
-    l = lum * (1.0 / MASK);
-    tmp[0] = src[0] * (1.0 / MASK);
-    tmp[1] = src[1] * (1.0 / MASK);
-    tmp[2] = src[2] * (1.0 / MASK);
-
-    l = l - LUM (tmp);
-    tmp[0] += l;
-    tmp[1] += l;
-    tmp[2] += l;
-
-    /* clip_color */
-    l = LUM (tmp);
-    min = CH_MIN (tmp);
-    max = CH_MAX (tmp);
-
-    if (min < 0)
-    {
-	if (l - min == 0.0)
-	{
-	    tmp[0] = 0;
-	    tmp[1] = 0;
-	    tmp[2] = 0;
-	}
-	else
-	{
-	    tmp[0] = l + (tmp[0] - l) * l / (l - min);
-	    tmp[1] = l + (tmp[1] - l) * l / (l - min);
-	    tmp[2] = l + (tmp[2] - l) * l / (l - min);
-	}
-    }
-    if (max > a)
-    {
-	if (max - l == 0.0)
-	{
-	    tmp[0] = a;
-	    tmp[1] = a;
-	    tmp[2] = a;
-	}
-	else
-	{
-	    tmp[0] = l + (tmp[0] - l) * (a - l) / (max - l);
-	    tmp[1] = l + (tmp[1] - l) * (a - l) / (max - l);
-	    tmp[2] = l + (tmp[2] - l) * (a - l) / (max - l);
-	}
-    }
-
-    dest[0] = tmp[0] * MASK + 0.5;
-    dest[1] = tmp[1] * MASK + 0.5;
-    dest[2] = tmp[2] * MASK + 0.5;
-}
-
-static void
-set_sat (comp4_t dest[3], comp4_t src[3], comp4_t sat)
-{
-    int id[3];
-    comp4_t min, max;
-
-    if (src[0] > src[1])
-    {
-	if (src[0] > src[2])
-	{
-	    id[0] = 0;
-	    if (src[1] > src[2])
-	    {
-		id[1] = 1;
-		id[2] = 2;
-	    }
-	    else
-	    {
-		id[1] = 2;
-		id[2] = 1;
-	    }
-	}
-	else
-	{
-	    id[0] = 2;
-	    id[1] = 0;
-	    id[2] = 1;
-	}
-    }
-    else
-    {
-	if (src[0] > src[2])
-	{
-	    id[0] = 1;
-	    id[1] = 0;
-	    id[2] = 2;
-	}
-	else
-	{
-	    id[2] = 0;
-	    if (src[1] > src[2])
-	    {
-		id[0] = 1;
-		id[1] = 2;
-	    }
-	    else
-	    {
-		id[0] = 2;
-		id[1] = 1;
-	    }
-	}
-    }
-
-    max = dest[id[0]];
-    min = dest[id[2]];
-    if (max > min)
-    {
-	dest[id[1]] = (dest[id[1]] - min) * sat / (max - min);
-	dest[id[0]] = sat;
-	dest[id[2]] = 0;
-    }
-    else
-    {
-	dest[0] = dest[1] = dest[2] = 0;
-    }
-}
-
-/*
- * Hue:
- * B(Cb, Cs) = set_lum (set_sat (Cs, SAT (Cb)), LUM (Cb))
- */
-static inline void
-blend_hsl_hue (comp4_t c[3],
-               comp4_t dc[3],
-               comp4_t da,
-               comp4_t sc[3],
-               comp4_t sa)
-{
-    c[0] = sc[0] * da;
-    c[1] = sc[1] * da;
-    c[2] = sc[2] * da;
-    set_sat (c, c, SAT (dc) * sa);
-    set_lum (c, c, sa * da, LUM (dc) * sa);
-}
-
-PDF_NON_SEPARABLE_BLEND_MODE (hsl_hue)
-
-/*
- * Saturation:
- * B(Cb, Cs) = set_lum (set_sat (Cb, SAT (Cs)), LUM (Cb))
- */
-static inline void
-blend_hsl_saturation (comp4_t c[3],
-                      comp4_t dc[3],
-                      comp4_t da,
-                      comp4_t sc[3],
-                      comp4_t sa)
-{
-    c[0] = dc[0] * sa;
-    c[1] = dc[1] * sa;
-    c[2] = dc[2] * sa;
-    set_sat (c, c, SAT (sc) * da);
-    set_lum (c, c, sa * da, LUM (dc) * sa);
-}
-
-PDF_NON_SEPARABLE_BLEND_MODE (hsl_saturation)
-
-/*
- * Color:
- * B(Cb, Cs) = set_lum (Cs, LUM (Cb))
- */
-static inline void
-blend_hsl_color (comp4_t c[3],
-                 comp4_t dc[3],
-                 comp4_t da,
-                 comp4_t sc[3],
-                 comp4_t sa)
-{
-    c[0] = sc[0] * da;
-    c[1] = sc[1] * da;
-    c[2] = sc[2] * da;
-    set_lum (c, c, sa * da, LUM (dc) * sa);
-}
-
-PDF_NON_SEPARABLE_BLEND_MODE (hsl_color)
-
-/*
- * Luminosity:
- * B(Cb, Cs) = set_lum (Cb, LUM (Cs))
- */
-static inline void
-blend_hsl_luminosity (comp4_t c[3],
-                      comp4_t dc[3],
-                      comp4_t da,
-                      comp4_t sc[3],
-                      comp4_t sa)
-{
-    c[0] = dc[0] * sa;
-    c[1] = dc[1] * sa;
-    c[2] = dc[2] * sa;
-    set_lum (c, c, sa * da, LUM (sc) * da);
-}
-
-PDF_NON_SEPARABLE_BLEND_MODE (hsl_luminosity)
-
-#undef SAT
-#undef LUM
-#undef CH_MAX
-#undef CH_MIN
-#undef PDF_NON_SEPARABLE_BLEND_MODE
-
-/* Overlay
- *
- * All of the disjoint composing functions
- *
- * The four entries in the first column indicate what source contributions
- * come from each of the four areas of the picture -- areas covered by neither
- * A nor B, areas covered only by A, areas covered only by B and finally
- * areas covered by both A and B.
- * 
- * Disjoint			Conjoint
- * Fa		Fb		Fa		Fb
- * (0,0,0,0)	0		0		0		0
- * (0,A,0,A)	1		0		1		0
- * (0,0,B,B)	0		1		0		1
- * (0,A,B,A)	1		min((1-a)/b,1)	1		max(1-a/b,0)
- * (0,A,B,B)	min((1-b)/a,1)	1		max(1-b/a,0)	1
- * (0,0,0,A)	max(1-(1-b)/a,0) 0		min(1,b/a)	0
- * (0,0,0,B)	0		max(1-(1-a)/b,0) 0		min(a/b,1)
- * (0,A,0,0)	min(1,(1-b)/a)	0		max(1-b/a,0)	0
- * (0,0,B,0)	0		min(1,(1-a)/b)	0		max(1-a/b,0)
- * (0,0,B,A)	max(1-(1-b)/a,0) min(1,(1-a)/b)	 min(1,b/a)	max(1-a/b,0)
- * (0,A,0,B)	min(1,(1-b)/a)	max(1-(1-a)/b,0) max(1-b/a,0)	min(1,a/b)
- * (0,A,B,0)	min(1,(1-b)/a)	min(1,(1-a)/b)	max(1-b/a,0)	max(1-a/b,0)
- */
-
-#define COMBINE_A_OUT 1
-#define COMBINE_A_IN  2
-#define COMBINE_B_OUT 4
-#define COMBINE_B_IN  8
-
-#define COMBINE_CLEAR   0
-#define COMBINE_A       (COMBINE_A_OUT | COMBINE_A_IN)
-#define COMBINE_B       (COMBINE_B_OUT | COMBINE_B_IN)
-#define COMBINE_A_OVER  (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_A_IN)
-#define COMBINE_B_OVER  (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_B_IN)
-#define COMBINE_A_ATOP  (COMBINE_B_OUT | COMBINE_A_IN)
-#define COMBINE_B_ATOP  (COMBINE_A_OUT | COMBINE_B_IN)
-#define COMBINE_XOR     (COMBINE_A_OUT | COMBINE_B_OUT)
-
-/* portion covered by a but not b */
-static comp1_t
-combine_disjoint_out_part (comp1_t a, comp1_t b)
-{
-    /* min (1, (1-b) / a) */
-
-    b = ~b;                 /* 1 - b */
-    if (b >= a)             /* 1 - b >= a -> (1-b)/a >= 1 */
-	return MASK;        /* 1 */
-    return DIV_UNc (b, a);     /* (1-b) / a */
-}
-
-/* portion covered by both a and b */
-static comp1_t
-combine_disjoint_in_part (comp1_t a, comp1_t b)
-{
-    /* max (1-(1-b)/a,0) */
-    /*  = - min ((1-b)/a - 1, 0) */
-    /*  = 1 - min (1, (1-b)/a) */
-
-    b = ~b;                 /* 1 - b */
-    if (b >= a)             /* 1 - b >= a -> (1-b)/a >= 1 */
-	return 0;           /* 1 - 1 */
-    return ~DIV_UNc(b, a);    /* 1 - (1-b) / a */
-}
-
-/* portion covered by a but not b */
-static comp1_t
-combine_conjoint_out_part (comp1_t a, comp1_t b)
-{
-    /* max (1-b/a,0) */
-    /* = 1-min(b/a,1) */
-
-    /* min (1, (1-b) / a) */
-
-    if (b >= a)             /* b >= a -> b/a >= 1 */
-	return 0x00;        /* 0 */
-    return ~DIV_UNc(b, a);    /* 1 - b/a */
-}
-
-/* portion covered by both a and b */
-static comp1_t
-combine_conjoint_in_part (comp1_t a, comp1_t b)
-{
-    /* min (1,b/a) */
-
-    if (b >= a)             /* b >= a -> b/a >= 1 */
-	return MASK;        /* 1 */
-    return DIV_UNc (b, a);     /* b/a */
-}
-
-#define GET_COMP(v, i)   ((comp2_t) (comp1_t) ((v) >> i))
-
-#define ADD(x, y, i, t)							\
-    ((t) = GET_COMP (x, i) + GET_COMP (y, i),				\
-     (comp4_t) ((comp1_t) ((t) | (0 - ((t) >> G_SHIFT)))) << (i))
-
-#define GENERIC(x, y, i, ax, ay, t, u, v)				\
-    ((t) = (MUL_UNc (GET_COMP (y, i), ay, (u)) +			\
-            MUL_UNc (GET_COMP (x, i), ax, (v))),			\
-     (comp4_t) ((comp1_t) ((t) |					\
-                           (0 - ((t) >> G_SHIFT)))) << (i))
-
-static void
-combine_disjoint_general_u (comp4_t *      dest,
-                            const comp4_t *src,
-                            const comp4_t *mask,
-                            int            width,
-                            comp1_t        combine)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t d = *(dest + i);
-	comp4_t m, n, o, p;
-	comp2_t Fa, Fb, t, u, v;
-	comp1_t sa = s >> A_SHIFT;
-	comp1_t da = d >> A_SHIFT;
-
-	switch (combine & COMBINE_A)
-	{
-	default:
-	    Fa = 0;
-	    break;
-
-	case COMBINE_A_OUT:
-	    Fa = combine_disjoint_out_part (sa, da);
-	    break;
-
-	case COMBINE_A_IN:
-	    Fa = combine_disjoint_in_part (sa, da);
-	    break;
-
-	case COMBINE_A:
-	    Fa = MASK;
-	    break;
-	}
-
-	switch (combine & COMBINE_B)
-	{
-	default:
-	    Fb = 0;
-	    break;
-
-	case COMBINE_B_OUT:
-	    Fb = combine_disjoint_out_part (da, sa);
-	    break;
-
-	case COMBINE_B_IN:
-	    Fb = combine_disjoint_in_part (da, sa);
-	    break;
-
-	case COMBINE_B:
-	    Fb = MASK;
-	    break;
-	}
-	m = GENERIC (s, d, 0, Fa, Fb, t, u, v);
-	n = GENERIC (s, d, G_SHIFT, Fa, Fb, t, u, v);
-	o = GENERIC (s, d, R_SHIFT, Fa, Fb, t, u, v);
-	p = GENERIC (s, d, A_SHIFT, Fa, Fb, t, u, v);
-	s = m | n | o | p;
-	*(dest + i) = s;
-    }
-}
-
-static void
-combine_disjoint_over_u (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         comp4_t *                dest,
-                         const comp4_t *          src,
-                         const comp4_t *          mask,
-                         int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp2_t a = s >> A_SHIFT;
-
-	if (s != 0x00)
-	{
-	    comp4_t d = *(dest + i);
-	    a = combine_disjoint_out_part (d >> A_SHIFT, a);
-	    UNcx4_MUL_UNc_ADD_UNcx4 (d, a, s);
-
-	    *(dest + i) = d;
-	}
-    }
-}
-
-static void
-combine_disjoint_in_u (pixman_implementation_t *imp,
-                       pixman_op_t              op,
-                       comp4_t *                dest,
-                       const comp4_t *          src,
-                       const comp4_t *          mask,
-                       int                      width)
-{
-    combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_IN);
-}
-
-static void
-combine_disjoint_in_reverse_u (pixman_implementation_t *imp,
-                               pixman_op_t              op,
-                               comp4_t *                dest,
-                               const comp4_t *          src,
-                               const comp4_t *          mask,
-                               int                      width)
-{
-    combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_IN);
-}
-
-static void
-combine_disjoint_out_u (pixman_implementation_t *imp,
-                        pixman_op_t              op,
-                        comp4_t *                dest,
-                        const comp4_t *          src,
-                        const comp4_t *          mask,
-                        int                      width)
-{
-    combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_OUT);
-}
-
-static void
-combine_disjoint_out_reverse_u (pixman_implementation_t *imp,
-                                pixman_op_t              op,
-                                comp4_t *                dest,
-                                const comp4_t *          src,
-                                const comp4_t *          mask,
-                                int                      width)
-{
-    combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_OUT);
-}
-
-static void
-combine_disjoint_atop_u (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         comp4_t *                dest,
-                         const comp4_t *          src,
-                         const comp4_t *          mask,
-                         int                      width)
-{
-    combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP);
-}
-
-static void
-combine_disjoint_atop_reverse_u (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 comp4_t *                dest,
-                                 const comp4_t *          src,
-                                 const comp4_t *          mask,
-                                 int                      width)
-{
-    combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP);
-}
-
-static void
-combine_disjoint_xor_u (pixman_implementation_t *imp,
-                        pixman_op_t              op,
-                        comp4_t *                dest,
-                        const comp4_t *          src,
-                        const comp4_t *          mask,
-                        int                      width)
-{
-    combine_disjoint_general_u (dest, src, mask, width, COMBINE_XOR);
-}
-
-static void
-combine_conjoint_general_u (comp4_t *      dest,
-                            const comp4_t *src,
-                            const comp4_t *mask,
-                            int            width,
-                            comp1_t        combine)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = combine_mask (src, mask, i);
-	comp4_t d = *(dest + i);
-	comp4_t m, n, o, p;
-	comp2_t Fa, Fb, t, u, v;
-	comp1_t sa = s >> A_SHIFT;
-	comp1_t da = d >> A_SHIFT;
-
-	switch (combine & COMBINE_A)
-	{
-	default:
-	    Fa = 0;
-	    break;
-
-	case COMBINE_A_OUT:
-	    Fa = combine_conjoint_out_part (sa, da);
-	    break;
-
-	case COMBINE_A_IN:
-	    Fa = combine_conjoint_in_part (sa, da);
-	    break;
-
-	case COMBINE_A:
-	    Fa = MASK;
-	    break;
-	}
-
-	switch (combine & COMBINE_B)
-	{
-	default:
-	    Fb = 0;
-	    break;
-
-	case COMBINE_B_OUT:
-	    Fb = combine_conjoint_out_part (da, sa);
-	    break;
-
-	case COMBINE_B_IN:
-	    Fb = combine_conjoint_in_part (da, sa);
-	    break;
-
-	case COMBINE_B:
-	    Fb = MASK;
-	    break;
-	}
-
-	m = GENERIC (s, d, 0, Fa, Fb, t, u, v);
-	n = GENERIC (s, d, G_SHIFT, Fa, Fb, t, u, v);
-	o = GENERIC (s, d, R_SHIFT, Fa, Fb, t, u, v);
-	p = GENERIC (s, d, A_SHIFT, Fa, Fb, t, u, v);
-
-	s = m | n | o | p;
-
-	*(dest + i) = s;
-    }
-}
-
-static void
-combine_conjoint_over_u (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         comp4_t *                dest,
-                         const comp4_t *          src,
-                         const comp4_t *          mask,
-                         int                      width)
-{
-    combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OVER);
-}
-
-static void
-combine_conjoint_over_reverse_u (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 comp4_t *                dest,
-                                 const comp4_t *          src,
-                                 const comp4_t *          mask,
-                                 int                      width)
-{
-    combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OVER);
-}
-
-static void
-combine_conjoint_in_u (pixman_implementation_t *imp,
-                       pixman_op_t              op,
-                       comp4_t *                dest,
-                       const comp4_t *          src,
-                       const comp4_t *          mask,
-                       int                      width)
-{
-    combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_IN);
-}
-
-static void
-combine_conjoint_in_reverse_u (pixman_implementation_t *imp,
-                               pixman_op_t              op,
-                               comp4_t *                dest,
-                               const comp4_t *          src,
-                               const comp4_t *          mask,
-                               int                      width)
-{
-    combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_IN);
-}
-
-static void
-combine_conjoint_out_u (pixman_implementation_t *imp,
-                        pixman_op_t              op,
-                        comp4_t *                dest,
-                        const comp4_t *          src,
-                        const comp4_t *          mask,
-                        int                      width)
-{
-    combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OUT);
-}
-
-static void
-combine_conjoint_out_reverse_u (pixman_implementation_t *imp,
-                                pixman_op_t              op,
-                                comp4_t *                dest,
-                                const comp4_t *          src,
-                                const comp4_t *          mask,
-                                int                      width)
-{
-    combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OUT);
-}
-
-static void
-combine_conjoint_atop_u (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         comp4_t *                dest,
-                         const comp4_t *          src,
-                         const comp4_t *          mask,
-                         int                      width)
-{
-    combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP);
-}
-
-static void
-combine_conjoint_atop_reverse_u (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 comp4_t *                dest,
-                                 const comp4_t *          src,
-                                 const comp4_t *          mask,
-                                 int                      width)
-{
-    combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP);
-}
-
-static void
-combine_conjoint_xor_u (pixman_implementation_t *imp,
-                        pixman_op_t              op,
-                        comp4_t *                dest,
-                        const comp4_t *          src,
-                        const comp4_t *          mask,
-                        int                      width)
-{
-    combine_conjoint_general_u (dest, src, mask, width, COMBINE_XOR);
-}
-
-/************************************************************************/
-/*********************** Per Channel functions **************************/
-/************************************************************************/
-
-static void
-combine_clear_ca (pixman_implementation_t *imp,
-                  pixman_op_t              op,
-                  comp4_t *                dest,
-                  const comp4_t *          src,
-                  const comp4_t *          mask,
-                  int                      width)
-{
-    memset (dest, 0, width * sizeof(comp4_t));
-}
-
-static void
-combine_src_ca (pixman_implementation_t *imp,
-                pixman_op_t              op,
-                comp4_t *                dest,
-                const comp4_t *          src,
-                const comp4_t *          mask,
-                int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = *(src + i);
-	comp4_t m = *(mask + i);
-
-	combine_mask_value_ca (&s, &m);
-
-	*(dest + i) = s;
-    }
-}
-
-static void
-combine_over_ca (pixman_implementation_t *imp,
-                 pixman_op_t              op,
-                 comp4_t *                dest,
-                 const comp4_t *          src,
-                 const comp4_t *          mask,
-                 int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = *(src + i);
-	comp4_t m = *(mask + i);
-	comp4_t a;
-
-	combine_mask_ca (&s, &m);
-
-	a = ~m;
-	if (a)
-	{
-	    comp4_t d = *(dest + i);
-	    UNcx4_MUL_UNcx4_ADD_UNcx4 (d, a, s);
-	    s = d;
-	}
-
-	*(dest + i) = s;
-    }
-}
-
-static void
-combine_over_reverse_ca (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         comp4_t *                dest,
-                         const comp4_t *          src,
-                         const comp4_t *          mask,
-                         int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t d = *(dest + i);
-	comp4_t a = ~d >> A_SHIFT;
-
-	if (a)
-	{
-	    comp4_t s = *(src + i);
-	    comp4_t m = *(mask + i);
-
-	    UNcx4_MUL_UNcx4 (s, m);
-	    UNcx4_MUL_UNc_ADD_UNcx4 (s, a, d);
-
-	    *(dest + i) = s;
-	}
-    }
-}
-
-static void
-combine_in_ca (pixman_implementation_t *imp,
-               pixman_op_t              op,
-               comp4_t *                dest,
-               const comp4_t *          src,
-               const comp4_t *          mask,
-               int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t d = *(dest + i);
-	comp2_t a = d >> A_SHIFT;
-	comp4_t s = 0;
-
-	if (a)
-	{
-	    comp4_t m = *(mask + i);
-
-	    s = *(src + i);
-	    combine_mask_value_ca (&s, &m);
-
-	    if (a != MASK)
-		UNcx4_MUL_UNc (s, a);
-	}
-
-	*(dest + i) = s;
-    }
-}
-
-static void
-combine_in_reverse_ca (pixman_implementation_t *imp,
-                       pixman_op_t              op,
-                       comp4_t *                dest,
-                       const comp4_t *          src,
-                       const comp4_t *          mask,
-                       int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = *(src + i);
-	comp4_t m = *(mask + i);
-	comp4_t a;
-
-	combine_mask_alpha_ca (&s, &m);
-
-	a = m;
-	if (a != ~0)
-	{
-	    comp4_t d = 0;
-
-	    if (a)
-	    {
-		d = *(dest + i);
-		UNcx4_MUL_UNcx4 (d, a);
-	    }
-
-	    *(dest + i) = d;
-	}
-    }
-}
-
-static void
-combine_out_ca (pixman_implementation_t *imp,
-                pixman_op_t              op,
-                comp4_t *                dest,
-                const comp4_t *          src,
-                const comp4_t *          mask,
-                int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t d = *(dest + i);
-	comp2_t a = ~d >> A_SHIFT;
-	comp4_t s = 0;
-
-	if (a)
-	{
-	    comp4_t m = *(mask + i);
-
-	    s = *(src + i);
-	    combine_mask_value_ca (&s, &m);
-
-	    if (a != MASK)
-		UNcx4_MUL_UNc (s, a);
-	}
-
-	*(dest + i) = s;
-    }
-}
-
-static void
-combine_out_reverse_ca (pixman_implementation_t *imp,
-                        pixman_op_t              op,
-                        comp4_t *                dest,
-                        const comp4_t *          src,
-                        const comp4_t *          mask,
-                        int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = *(src + i);
-	comp4_t m = *(mask + i);
-	comp4_t a;
-
-	combine_mask_alpha_ca (&s, &m);
-
-	a = ~m;
-	if (a != ~0)
-	{
-	    comp4_t d = 0;
-
-	    if (a)
-	    {
-		d = *(dest + i);
-		UNcx4_MUL_UNcx4 (d, a);
-	    }
-
-	    *(dest + i) = d;
-	}
-    }
-}
-
-static void
-combine_atop_ca (pixman_implementation_t *imp,
-                 pixman_op_t              op,
-                 comp4_t *                dest,
-                 const comp4_t *          src,
-                 const comp4_t *          mask,
-                 int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t d = *(dest + i);
-	comp4_t s = *(src + i);
-	comp4_t m = *(mask + i);
-	comp4_t ad;
-	comp2_t as = d >> A_SHIFT;
-
-	combine_mask_ca (&s, &m);
-
-	ad = ~m;
-
-	UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (d, ad, s, as);
-
-	*(dest + i) = d;
-    }
-}
-
-static void
-combine_atop_reverse_ca (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         comp4_t *                dest,
-                         const comp4_t *          src,
-                         const comp4_t *          mask,
-                         int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t d = *(dest + i);
-	comp4_t s = *(src + i);
-	comp4_t m = *(mask + i);
-	comp4_t ad;
-	comp2_t as = ~d >> A_SHIFT;
-
-	combine_mask_ca (&s, &m);
-
-	ad = m;
-
-	UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (d, ad, s, as);
-
-	*(dest + i) = d;
-    }
-}
-
-static void
-combine_xor_ca (pixman_implementation_t *imp,
-                pixman_op_t              op,
-                comp4_t *                dest,
-                const comp4_t *          src,
-                const comp4_t *          mask,
-                int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t d = *(dest + i);
-	comp4_t s = *(src + i);
-	comp4_t m = *(mask + i);
-	comp4_t ad;
-	comp2_t as = ~d >> A_SHIFT;
-
-	combine_mask_ca (&s, &m);
-
-	ad = ~m;
-
-	UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (d, ad, s, as);
-
-	*(dest + i) = d;
-    }
-}
-
-static void
-combine_add_ca (pixman_implementation_t *imp,
-                pixman_op_t              op,
-                comp4_t *                dest,
-                const comp4_t *          src,
-                const comp4_t *          mask,
-                int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s = *(src + i);
-	comp4_t m = *(mask + i);
-	comp4_t d = *(dest + i);
-
-	combine_mask_value_ca (&s, &m);
-
-	UNcx4_ADD_UNcx4 (d, s);
-
-	*(dest + i) = d;
-    }
-}
-
-static void
-combine_saturate_ca (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     comp4_t *                dest,
-                     const comp4_t *          src,
-                     const comp4_t *          mask,
-                     int                      width)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s, d;
-	comp2_t sa, sr, sg, sb, da;
-	comp2_t t, u, v;
-	comp4_t m, n, o, p;
-
-	d = *(dest + i);
-	s = *(src + i);
-	m = *(mask + i);
-
-	combine_mask_ca (&s, &m);
-
-	sa = (m >> A_SHIFT);
-	sr = (m >> R_SHIFT) & MASK;
-	sg = (m >> G_SHIFT) & MASK;
-	sb =  m             & MASK;
-	da = ~d >> A_SHIFT;
-
-	if (sb <= da)
-	    m = ADD (s, d, 0, t);
-	else
-	    m = GENERIC (s, d, 0, (da << G_SHIFT) / sb, MASK, t, u, v);
-
-	if (sg <= da)
-	    n = ADD (s, d, G_SHIFT, t);
-	else
-	    n = GENERIC (s, d, G_SHIFT, (da << G_SHIFT) / sg, MASK, t, u, v);
-
-	if (sr <= da)
-	    o = ADD (s, d, R_SHIFT, t);
-	else
-	    o = GENERIC (s, d, R_SHIFT, (da << G_SHIFT) / sr, MASK, t, u, v);
-
-	if (sa <= da)
-	    p = ADD (s, d, A_SHIFT, t);
-	else
-	    p = GENERIC (s, d, A_SHIFT, (da << G_SHIFT) / sa, MASK, t, u, v);
-
-	*(dest + i) = m | n | o | p;
-    }
-}
-
-static void
-combine_disjoint_general_ca (comp4_t *      dest,
-                             const comp4_t *src,
-                             const comp4_t *mask,
-                             int            width,
-                             comp1_t        combine)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s, d;
-	comp4_t m, n, o, p;
-	comp4_t Fa, Fb;
-	comp2_t t, u, v;
-	comp4_t sa;
-	comp1_t da;
-
-	s = *(src + i);
-	m = *(mask + i);
-	d = *(dest + i);
-	da = d >> A_SHIFT;
-
-	combine_mask_ca (&s, &m);
-
-	sa = m;
-
-	switch (combine & COMBINE_A)
-	{
-	default:
-	    Fa = 0;
-	    break;
-
-	case COMBINE_A_OUT:
-	    m = (comp4_t)combine_disjoint_out_part ((comp1_t) (sa >> 0), da);
-	    n = (comp4_t)combine_disjoint_out_part ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT;
-	    o = (comp4_t)combine_disjoint_out_part ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT;
-	    p = (comp4_t)combine_disjoint_out_part ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT;
-	    Fa = m | n | o | p;
-	    break;
-
-	case COMBINE_A_IN:
-	    m = (comp4_t)combine_disjoint_in_part ((comp1_t) (sa >> 0), da);
-	    n = (comp4_t)combine_disjoint_in_part ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT;
-	    o = (comp4_t)combine_disjoint_in_part ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT;
-	    p = (comp4_t)combine_disjoint_in_part ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT;
-	    Fa = m | n | o | p;
-	    break;
-
-	case COMBINE_A:
-	    Fa = ~0;
-	    break;
-	}
-
-	switch (combine & COMBINE_B)
-	{
-	default:
-	    Fb = 0;
-	    break;
-
-	case COMBINE_B_OUT:
-	    m = (comp4_t)combine_disjoint_out_part (da, (comp1_t) (sa >> 0));
-	    n = (comp4_t)combine_disjoint_out_part (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT;
-	    o = (comp4_t)combine_disjoint_out_part (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT;
-	    p = (comp4_t)combine_disjoint_out_part (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT;
-	    Fb = m | n | o | p;
-	    break;
-
-	case COMBINE_B_IN:
-	    m = (comp4_t)combine_disjoint_in_part (da, (comp1_t) (sa >> 0));
-	    n = (comp4_t)combine_disjoint_in_part (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT;
-	    o = (comp4_t)combine_disjoint_in_part (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT;
-	    p = (comp4_t)combine_disjoint_in_part (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT;
-	    Fb = m | n | o | p;
-	    break;
-
-	case COMBINE_B:
-	    Fb = ~0;
-	    break;
-	}
-	m = GENERIC (s, d, 0, GET_COMP (Fa, 0), GET_COMP (Fb, 0), t, u, v);
-	n = GENERIC (s, d, G_SHIFT, GET_COMP (Fa, G_SHIFT), GET_COMP (Fb, G_SHIFT), t, u, v);
-	o = GENERIC (s, d, R_SHIFT, GET_COMP (Fa, R_SHIFT), GET_COMP (Fb, R_SHIFT), t, u, v);
-	p = GENERIC (s, d, A_SHIFT, GET_COMP (Fa, A_SHIFT), GET_COMP (Fb, A_SHIFT), t, u, v);
-
-	s = m | n | o | p;
-
-	*(dest + i) = s;
-    }
-}
-
-static void
-combine_disjoint_over_ca (pixman_implementation_t *imp,
-                          pixman_op_t              op,
-                          comp4_t *                dest,
-                          const comp4_t *          src,
-                          const comp4_t *          mask,
-                          int                      width)
-{
-    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OVER);
-}
-
-static void
-combine_disjoint_in_ca (pixman_implementation_t *imp,
-                        pixman_op_t              op,
-                        comp4_t *                dest,
-                        const comp4_t *          src,
-                        const comp4_t *          mask,
-                        int                      width)
-{
-    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_IN);
-}
-
-static void
-combine_disjoint_in_reverse_ca (pixman_implementation_t *imp,
-                                pixman_op_t              op,
-                                comp4_t *                dest,
-                                const comp4_t *          src,
-                                const comp4_t *          mask,
-                                int                      width)
-{
-    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_IN);
-}
-
-static void
-combine_disjoint_out_ca (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         comp4_t *                dest,
-                         const comp4_t *          src,
-                         const comp4_t *          mask,
-                         int                      width)
-{
-    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OUT);
-}
-
-static void
-combine_disjoint_out_reverse_ca (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 comp4_t *                dest,
-                                 const comp4_t *          src,
-                                 const comp4_t *          mask,
-                                 int                      width)
-{
-    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_OUT);
-}
-
-static void
-combine_disjoint_atop_ca (pixman_implementation_t *imp,
-                          pixman_op_t              op,
-                          comp4_t *                dest,
-                          const comp4_t *          src,
-                          const comp4_t *          mask,
-                          int                      width)
-{
-    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_ATOP);
-}
-
-static void
-combine_disjoint_atop_reverse_ca (pixman_implementation_t *imp,
-                                  pixman_op_t              op,
-                                  comp4_t *                dest,
-                                  const comp4_t *          src,
-                                  const comp4_t *          mask,
-                                  int                      width)
-{
-    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_ATOP);
-}
-
-static void
-combine_disjoint_xor_ca (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         comp4_t *                dest,
-                         const comp4_t *          src,
-                         const comp4_t *          mask,
-                         int                      width)
-{
-    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_XOR);
-}
-
-static void
-combine_conjoint_general_ca (comp4_t *      dest,
-                             const comp4_t *src,
-                             const comp4_t *mask,
-                             int            width,
-                             comp1_t        combine)
-{
-    int i;
-
-    for (i = 0; i < width; ++i)
-    {
-	comp4_t s, d;
-	comp4_t m, n, o, p;
-	comp4_t Fa, Fb;
-	comp2_t t, u, v;
-	comp4_t sa;
-	comp1_t da;
-
-	s = *(src + i);
-	m = *(mask + i);
-	d = *(dest + i);
-	da = d >> A_SHIFT;
-
-	combine_mask_ca (&s, &m);
-
-	sa = m;
-
-	switch (combine & COMBINE_A)
-	{
-	default:
-	    Fa = 0;
-	    break;
-
-	case COMBINE_A_OUT:
-	    m = (comp4_t)combine_conjoint_out_part ((comp1_t) (sa >> 0), da);
-	    n = (comp4_t)combine_conjoint_out_part ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT;
-	    o = (comp4_t)combine_conjoint_out_part ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT;
-	    p = (comp4_t)combine_conjoint_out_part ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT;
-	    Fa = m | n | o | p;
-	    break;
-
-	case COMBINE_A_IN:
-	    m = (comp4_t)combine_conjoint_in_part ((comp1_t) (sa >> 0), da);
-	    n = (comp4_t)combine_conjoint_in_part ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT;
-	    o = (comp4_t)combine_conjoint_in_part ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT;
-	    p = (comp4_t)combine_conjoint_in_part ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT;
-	    Fa = m | n | o | p;
-	    break;
-
-	case COMBINE_A:
-	    Fa = ~0;
-	    break;
-	}
-
-	switch (combine & COMBINE_B)
-	{
-	default:
-	    Fb = 0;
-	    break;
-
-	case COMBINE_B_OUT:
-	    m = (comp4_t)combine_conjoint_out_part (da, (comp1_t) (sa >> 0));
-	    n = (comp4_t)combine_conjoint_out_part (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT;
-	    o = (comp4_t)combine_conjoint_out_part (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT;
-	    p = (comp4_t)combine_conjoint_out_part (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT;
-	    Fb = m | n | o | p;
-	    break;
-
-	case COMBINE_B_IN:
-	    m = (comp4_t)combine_conjoint_in_part (da, (comp1_t) (sa >> 0));
-	    n = (comp4_t)combine_conjoint_in_part (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT;
-	    o = (comp4_t)combine_conjoint_in_part (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT;
-	    p = (comp4_t)combine_conjoint_in_part (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT;
-	    Fb = m | n | o | p;
-	    break;
-
-	case COMBINE_B:
-	    Fb = ~0;
-	    break;
-	}
-	m = GENERIC (s, d, 0, GET_COMP (Fa, 0), GET_COMP (Fb, 0), t, u, v);
-	n = GENERIC (s, d, G_SHIFT, GET_COMP (Fa, G_SHIFT), GET_COMP (Fb, G_SHIFT), t, u, v);
-	o = GENERIC (s, d, R_SHIFT, GET_COMP (Fa, R_SHIFT), GET_COMP (Fb, R_SHIFT), t, u, v);
-	p = GENERIC (s, d, A_SHIFT, GET_COMP (Fa, A_SHIFT), GET_COMP (Fb, A_SHIFT), t, u, v);
-
-	s = m | n | o | p;
-
-	*(dest + i) = s;
-    }
-}
-
-static void
-combine_conjoint_over_ca (pixman_implementation_t *imp,
-                          pixman_op_t              op,
-                          comp4_t *                dest,
-                          const comp4_t *          src,
-                          const comp4_t *          mask,
-                          int                      width)
-{
-    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_OVER);
-}
-
-static void
-combine_conjoint_over_reverse_ca (pixman_implementation_t *imp,
-                                  pixman_op_t              op,
-                                  comp4_t *                dest,
-                                  const comp4_t *          src,
-                                  const comp4_t *          mask,
-                                  int                      width)
-{
-    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_OVER);
-}
-
-static void
-combine_conjoint_in_ca (pixman_implementation_t *imp,
-                        pixman_op_t              op,
-                        comp4_t *                dest,
-                        const comp4_t *          src,
-                        const comp4_t *          mask,
-                        int                      width)
-{
-    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_IN);
-}
-
-static void
-combine_conjoint_in_reverse_ca (pixman_implementation_t *imp,
-                                pixman_op_t              op,
-                                comp4_t *                dest,
-                                const comp4_t *          src,
-                                const comp4_t *          mask,
-                                int                      width)
-{
-    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_IN);
-}
-
-static void
-combine_conjoint_out_ca (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         comp4_t *                dest,
-                         const comp4_t *          src,
-                         const comp4_t *          mask,
-                         int                      width)
-{
-    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_OUT);
-}
-
-static void
-combine_conjoint_out_reverse_ca (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 comp4_t *                dest,
-                                 const comp4_t *          src,
-                                 const comp4_t *          mask,
-                                 int                      width)
-{
-    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_OUT);
-}
-
-static void
-combine_conjoint_atop_ca (pixman_implementation_t *imp,
-                          pixman_op_t              op,
-                          comp4_t *                dest,
-                          const comp4_t *          src,
-                          const comp4_t *          mask,
-                          int                      width)
-{
-    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_ATOP);
-}
-
-static void
-combine_conjoint_atop_reverse_ca (pixman_implementation_t *imp,
-                                  pixman_op_t              op,
-                                  comp4_t *                dest,
-                                  const comp4_t *          src,
-                                  const comp4_t *          mask,
-                                  int                      width)
-{
-    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_ATOP);
-}
-
-static void
-combine_conjoint_xor_ca (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         comp4_t *                dest,
-                         const comp4_t *          src,
-                         const comp4_t *          mask,
-                         int                      width)
-{
-    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_XOR);
-}
-
-void
-_pixman_setup_combiner_functions_width (pixman_implementation_t *imp)
-{
-    /* Unified alpha */
-    imp->combine_width[PIXMAN_OP_CLEAR] = combine_clear;
-    imp->combine_width[PIXMAN_OP_SRC] = combine_src_u;
-    imp->combine_width[PIXMAN_OP_DST] = combine_dst;
-    imp->combine_width[PIXMAN_OP_OVER] = combine_over_u;
-    imp->combine_width[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_u;
-    imp->combine_width[PIXMAN_OP_IN] = combine_in_u;
-    imp->combine_width[PIXMAN_OP_IN_REVERSE] = combine_in_reverse_u;
-    imp->combine_width[PIXMAN_OP_OUT] = combine_out_u;
-    imp->combine_width[PIXMAN_OP_OUT_REVERSE] = combine_out_reverse_u;
-    imp->combine_width[PIXMAN_OP_ATOP] = combine_atop_u;
-    imp->combine_width[PIXMAN_OP_ATOP_REVERSE] = combine_atop_reverse_u;
-    imp->combine_width[PIXMAN_OP_XOR] = combine_xor_u;
-    imp->combine_width[PIXMAN_OP_ADD] = combine_add_u;
-    imp->combine_width[PIXMAN_OP_SATURATE] = combine_saturate_u;
-
-    /* Disjoint, unified */
-    imp->combine_width[PIXMAN_OP_DISJOINT_CLEAR] = combine_clear;
-    imp->combine_width[PIXMAN_OP_DISJOINT_SRC] = combine_src_u;
-    imp->combine_width[PIXMAN_OP_DISJOINT_DST] = combine_dst;
-    imp->combine_width[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_u;
-    imp->combine_width[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_saturate_u;
-    imp->combine_width[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_u;
-    imp->combine_width[PIXMAN_OP_DISJOINT_IN_REVERSE] = combine_disjoint_in_reverse_u;
-    imp->combine_width[PIXMAN_OP_DISJOINT_OUT] = combine_disjoint_out_u;
-    imp->combine_width[PIXMAN_OP_DISJOINT_OUT_REVERSE] = combine_disjoint_out_reverse_u;
-    imp->combine_width[PIXMAN_OP_DISJOINT_ATOP] = combine_disjoint_atop_u;
-    imp->combine_width[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = combine_disjoint_atop_reverse_u;
-    imp->combine_width[PIXMAN_OP_DISJOINT_XOR] = combine_disjoint_xor_u;
-
-    /* Conjoint, unified */
-    imp->combine_width[PIXMAN_OP_CONJOINT_CLEAR] = combine_clear;
-    imp->combine_width[PIXMAN_OP_CONJOINT_SRC] = combine_src_u;
-    imp->combine_width[PIXMAN_OP_CONJOINT_DST] = combine_dst;
-    imp->combine_width[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_u;
-    imp->combine_width[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_u;
-    imp->combine_width[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_u;
-    imp->combine_width[PIXMAN_OP_CONJOINT_IN_REVERSE] = combine_conjoint_in_reverse_u;
-    imp->combine_width[PIXMAN_OP_CONJOINT_OUT] = combine_conjoint_out_u;
-    imp->combine_width[PIXMAN_OP_CONJOINT_OUT_REVERSE] = combine_conjoint_out_reverse_u;
-    imp->combine_width[PIXMAN_OP_CONJOINT_ATOP] = combine_conjoint_atop_u;
-    imp->combine_width[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = combine_conjoint_atop_reverse_u;
-    imp->combine_width[PIXMAN_OP_CONJOINT_XOR] = combine_conjoint_xor_u;
-
-    imp->combine_width[PIXMAN_OP_MULTIPLY] = combine_multiply_u;
-    imp->combine_width[PIXMAN_OP_SCREEN] = combine_screen_u;
-    imp->combine_width[PIXMAN_OP_OVERLAY] = combine_overlay_u;
-    imp->combine_width[PIXMAN_OP_DARKEN] = combine_darken_u;
-    imp->combine_width[PIXMAN_OP_LIGHTEN] = combine_lighten_u;
-    imp->combine_width[PIXMAN_OP_COLOR_DODGE] = combine_color_dodge_u;
-    imp->combine_width[PIXMAN_OP_COLOR_BURN] = combine_color_burn_u;
-    imp->combine_width[PIXMAN_OP_HARD_LIGHT] = combine_hard_light_u;
-    imp->combine_width[PIXMAN_OP_SOFT_LIGHT] = combine_soft_light_u;
-    imp->combine_width[PIXMAN_OP_DIFFERENCE] = combine_difference_u;
-    imp->combine_width[PIXMAN_OP_EXCLUSION] = combine_exclusion_u;
-    imp->combine_width[PIXMAN_OP_HSL_HUE] = combine_hsl_hue_u;
-    imp->combine_width[PIXMAN_OP_HSL_SATURATION] = combine_hsl_saturation_u;
-    imp->combine_width[PIXMAN_OP_HSL_COLOR] = combine_hsl_color_u;
-    imp->combine_width[PIXMAN_OP_HSL_LUMINOSITY] = combine_hsl_luminosity_u;
-
-    /* Component alpha combiners */
-    imp->combine_width_ca[PIXMAN_OP_CLEAR] = combine_clear_ca;
-    imp->combine_width_ca[PIXMAN_OP_SRC] = combine_src_ca;
-    /* dest */
-    imp->combine_width_ca[PIXMAN_OP_OVER] = combine_over_ca;
-    imp->combine_width_ca[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_ca;
-    imp->combine_width_ca[PIXMAN_OP_IN] = combine_in_ca;
-    imp->combine_width_ca[PIXMAN_OP_IN_REVERSE] = combine_in_reverse_ca;
-    imp->combine_width_ca[PIXMAN_OP_OUT] = combine_out_ca;
-    imp->combine_width_ca[PIXMAN_OP_OUT_REVERSE] = combine_out_reverse_ca;
-    imp->combine_width_ca[PIXMAN_OP_ATOP] = combine_atop_ca;
-    imp->combine_width_ca[PIXMAN_OP_ATOP_REVERSE] = combine_atop_reverse_ca;
-    imp->combine_width_ca[PIXMAN_OP_XOR] = combine_xor_ca;
-    imp->combine_width_ca[PIXMAN_OP_ADD] = combine_add_ca;
-    imp->combine_width_ca[PIXMAN_OP_SATURATE] = combine_saturate_ca;
-
-    /* Disjoint CA */
-    imp->combine_width_ca[PIXMAN_OP_DISJOINT_CLEAR] = combine_clear_ca;
-    imp->combine_width_ca[PIXMAN_OP_DISJOINT_SRC] = combine_src_ca;
-    imp->combine_width_ca[PIXMAN_OP_DISJOINT_DST] = combine_dst;
-    imp->combine_width_ca[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_ca;
-    imp->combine_width_ca[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_saturate_ca;
-    imp->combine_width_ca[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_ca;
-    imp->combine_width_ca[PIXMAN_OP_DISJOINT_IN_REVERSE] = combine_disjoint_in_reverse_ca;
-    imp->combine_width_ca[PIXMAN_OP_DISJOINT_OUT] = combine_disjoint_out_ca;
-    imp->combine_width_ca[PIXMAN_OP_DISJOINT_OUT_REVERSE] = combine_disjoint_out_reverse_ca;
-    imp->combine_width_ca[PIXMAN_OP_DISJOINT_ATOP] = combine_disjoint_atop_ca;
-    imp->combine_width_ca[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = combine_disjoint_atop_reverse_ca;
-    imp->combine_width_ca[PIXMAN_OP_DISJOINT_XOR] = combine_disjoint_xor_ca;
-
-    /* Conjoint CA */
-    imp->combine_width_ca[PIXMAN_OP_CONJOINT_CLEAR] = combine_clear_ca;
-    imp->combine_width_ca[PIXMAN_OP_CONJOINT_SRC] = combine_src_ca;
-    imp->combine_width_ca[PIXMAN_OP_CONJOINT_DST] = combine_dst;
-    imp->combine_width_ca[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_ca;
-    imp->combine_width_ca[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_ca;
-    imp->combine_width_ca[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_ca;
-    imp->combine_width_ca[PIXMAN_OP_CONJOINT_IN_REVERSE] = combine_conjoint_in_reverse_ca;
-    imp->combine_width_ca[PIXMAN_OP_CONJOINT_OUT] = combine_conjoint_out_ca;
-    imp->combine_width_ca[PIXMAN_OP_CONJOINT_OUT_REVERSE] = combine_conjoint_out_reverse_ca;
-    imp->combine_width_ca[PIXMAN_OP_CONJOINT_ATOP] = combine_conjoint_atop_ca;
-    imp->combine_width_ca[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = combine_conjoint_atop_reverse_ca;
-    imp->combine_width_ca[PIXMAN_OP_CONJOINT_XOR] = combine_conjoint_xor_ca;
-
-    imp->combine_width_ca[PIXMAN_OP_MULTIPLY] = combine_multiply_ca;
-    imp->combine_width_ca[PIXMAN_OP_SCREEN] = combine_screen_ca;
-    imp->combine_width_ca[PIXMAN_OP_OVERLAY] = combine_overlay_ca;
-    imp->combine_width_ca[PIXMAN_OP_DARKEN] = combine_darken_ca;
-    imp->combine_width_ca[PIXMAN_OP_LIGHTEN] = combine_lighten_ca;
-    imp->combine_width_ca[PIXMAN_OP_COLOR_DODGE] = combine_color_dodge_ca;
-    imp->combine_width_ca[PIXMAN_OP_COLOR_BURN] = combine_color_burn_ca;
-    imp->combine_width_ca[PIXMAN_OP_HARD_LIGHT] = combine_hard_light_ca;
-    imp->combine_width_ca[PIXMAN_OP_SOFT_LIGHT] = combine_soft_light_ca;
-    imp->combine_width_ca[PIXMAN_OP_DIFFERENCE] = combine_difference_ca;
-    imp->combine_width_ca[PIXMAN_OP_EXCLUSION] = combine_exclusion_ca;
-
-    /* It is not clear that these make sense, so make them noops for now */
-    imp->combine_width_ca[PIXMAN_OP_HSL_HUE] = combine_dst;
-    imp->combine_width_ca[PIXMAN_OP_HSL_SATURATION] = combine_dst;
-    imp->combine_width_ca[PIXMAN_OP_HSL_COLOR] = combine_dst;
-    imp->combine_width_ca[PIXMAN_OP_HSL_LUMINOSITY] = combine_dst;
-}
-
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include <string.h>
+
+#include "pixman-private.h"
+
+#include "pixman-combine.h"
+
+/*** per channel helper functions ***/
+
+static void
+combine_mask_ca (comp4_t *src, comp4_t *mask)
+{
+    comp4_t a = *mask;
+
+    comp4_t x;
+    comp2_t xa;
+
+    if (!a)
+    {
+	*(src) = 0;
+	return;
+    }
+
+    x = *(src);
+    if (a == ~0)
+    {
+	x = x >> A_SHIFT;
+	x |= x << G_SHIFT;
+	x |= x << R_SHIFT;
+	*(mask) = x;
+	return;
+    }
+
+    xa = x >> A_SHIFT;
+    UNcx4_MUL_UNcx4 (x, a);
+    *(src) = x;
+    
+    UNcx4_MUL_UNc (a, xa);
+    *(mask) = a;
+}
+
+static void
+combine_mask_value_ca (comp4_t *src, const comp4_t *mask)
+{
+    comp4_t a = *mask;
+    comp4_t x;
+
+    if (!a)
+    {
+	*(src) = 0;
+	return;
+    }
+
+    if (a == ~0)
+	return;
+
+    x = *(src);
+    UNcx4_MUL_UNcx4 (x, a);
+    *(src) = x;
+}
+
+static void
+combine_mask_alpha_ca (const comp4_t *src, comp4_t *mask)
+{
+    comp4_t a = *(mask);
+    comp4_t x;
+
+    if (!a)
+	return;
+
+    x = *(src) >> A_SHIFT;
+    if (x == MASK)
+	return;
+
+    if (a == ~0)
+    {
+	x |= x << G_SHIFT;
+	x |= x << R_SHIFT;
+	*(mask) = x;
+	return;
+    }
+
+    UNcx4_MUL_UNc (a, x);
+    *(mask) = a;
+}
+
+/*
+ * There are two ways of handling alpha -- either as a single unified value or
+ * a separate value for each component, hence most combiners come in two
+ * versions.  The unified alpha version has a '_u' suffix on its name, the
+ * component alpha version has a '_ca' suffix (for example combine_multiply_u
+ * and combine_multiply_ca).
+ */
+
+/*
+ * All of the composing functions
+ */
+
+static force_inline comp4_t
+combine_mask (const comp4_t *src, const comp4_t *mask, int i)
+{
+    comp4_t s, m;
+
+    if (mask)
+    {
+	m = *(mask + i) >> A_SHIFT;
+
+	if (!m)
+	    return 0;
+    }
+
+    s = *(src + i);
+
+    if (mask)
+	UNcx4_MUL_UNc (s, m);
+
+    return s;
+}
+
+static void
+combine_clear (pixman_implementation_t *imp,
+               pixman_op_t              op,
+               comp4_t *                dest,
+               const comp4_t *          src,
+               const comp4_t *          mask,
+               int                      width)
+{
+    memset (dest, 0, width * sizeof(comp4_t));
+}
+
+static void
+combine_dst (pixman_implementation_t *imp,
+	     pixman_op_t	      op,
+	     comp4_t *		      dest,
+	     const comp4_t *	      src,
+	     const comp4_t *          mask,
+	     int		      width)
+{
+    return;
+}
+
+static void
+combine_src_u (pixman_implementation_t *imp,
+               pixman_op_t              op,
+               comp4_t *                dest,
+               const comp4_t *          src,
+               const comp4_t *          mask,
+               int                      width)
+{
+    int i;
+
+    if (!mask)
+	memcpy (dest, src, width * sizeof (comp4_t));
+    else
+    {
+	for (i = 0; i < width; ++i)
+	{
+	    comp4_t s = combine_mask (src, mask, i);
+
+	    *(dest + i) = s;
+	}
+    }
+}
+
+/* if the Src is opaque, call combine_src_u */
+static void
+combine_over_u (pixman_implementation_t *imp,
+                pixman_op_t              op,
+                comp4_t *                dest,
+                const comp4_t *          src,
+                const comp4_t *          mask,
+                int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	comp4_t s = combine_mask (src, mask, i);
+	comp4_t d = *(dest + i);
+	comp4_t ia = ALPHA_c (~s);
+
+	UNcx4_MUL_UNc_ADD_UNcx4 (d, ia, s);
+	*(dest + i) = d;
+    }
+}
+
+/* OVER_REVERSE, unified alpha; if the Dst is opaque, this is a noop */
+static void
+combine_over_reverse_u (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        comp4_t *                dest,
+                        const comp4_t *          src,
+                        const comp4_t *          mask,
+                        int                      width)
+{
+    int x;
+
+    for (x = 0; x < width; ++x)
+    {
+	comp4_t result = combine_mask (src, mask, x);
+	comp4_t dst_px = dest[x];
+	comp4_t dst_ia = ALPHA_c (~dst_px);  /* inverse alpha of the Dst pixel */
+	UNcx4_MUL_UNc_ADD_UNcx4 (result, dst_ia, dst_px);
+	dest[x] = result;
+    }
+}
+
+/* if the Dst is opaque, call combine_src_u */
+static void
+combine_in_u (pixman_implementation_t *imp,
+              pixman_op_t              op,
+              comp4_t *                dest,
+              const comp4_t *          src,
+              const comp4_t *          mask,
+              int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	comp4_t s = combine_mask (src, mask, i);
+	comp4_t a = ALPHA_c (*(dest + i));
+	UNcx4_MUL_UNc (s, a);
+	*(dest + i) = s;
+    }
+}
+
+/* if the Src is opaque, this is a noop */
+static void
+combine_in_reverse_u (pixman_implementation_t *imp,
+                      pixman_op_t              op,
+                      comp4_t *                dest,
+                      const comp4_t *          src,
+                      const comp4_t *          mask,
+                      int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	comp4_t s = combine_mask (src, mask, i);
+	comp4_t d = *(dest + i);
+	comp4_t a = ALPHA_c (s);
+	UNcx4_MUL_UNc (d, a);
+	*(dest + i) = d;
+    }
+}
+
+/* if the Dst is opaque, call combine_clear */
+static void
+combine_out_u (pixman_implementation_t *imp,
+               pixman_op_t              op,
+               comp4_t *                dest,
+               const comp4_t *          src,
+               const comp4_t *          mask,
+               int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	comp4_t s = combine_mask (src, mask, i);
+	comp4_t a = ALPHA_c (~*(dest + i));
+	UNcx4_MUL_UNc (s, a);
+	*(dest + i) = s;
+    }
+}
+
+/* if the Src is opaque, call combine_clear */
+static void
+combine_out_reverse_u (pixman_implementation_t *imp,
+                       pixman_op_t              op,
+                       comp4_t *                dest,
+                       const comp4_t *          src,
+                       const comp4_t *          mask,
+                       int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	comp4_t s = combine_mask (src, mask, i);
+	comp4_t d = *(dest + i);
+	comp4_t a = ALPHA_c (~s);
+	UNcx4_MUL_UNc (d, a);
+	*(dest + i) = d;
+    }
+}
+
+/* if the Src is opaque, call combine_in_u */
+/* if the Dst is opaque, call combine_over_u */
+/* if both the Src and Dst are opaque, call combine_src_u */
+static void
+combine_atop_u (pixman_implementation_t *imp,
+                pixman_op_t              op,
+                comp4_t *                dest,
+                const comp4_t *          src,
+                const comp4_t *          mask,
+                int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	comp4_t s = combine_mask (src, mask, i);
+	comp4_t d = *(dest + i);
+	comp4_t dest_a = ALPHA_c (d);
+	comp4_t src_ia = ALPHA_c (~s);
+
+	UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (s, dest_a, d, src_ia);
+	*(dest + i) = s;
+    }
+}
+
+/* if the Src is opaque, call combine_over_reverse_u */
+/* if the Dst is opaque, call combine_in_reverse_u */
+/* if both the Src and Dst are opaque, call combine_dst */
+static void
+combine_atop_reverse_u (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        comp4_t *                dest,
+                        const comp4_t *          src,
+                        const comp4_t *          mask,
+                        int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	comp4_t s = combine_mask (src, mask, i);
+	comp4_t d = *(dest + i);
+	comp4_t src_a = ALPHA_c (s);
+	comp4_t dest_ia = ALPHA_c (~d);
+
+	UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (s, dest_ia, d, src_a);
+	*(dest + i) = s;
+    }
+}
+
+/* if the Src is opaque, call combine_over_u */
+/* if the Dst is opaque, call combine_over_reverse_u */
+/* if both the Src and Dst are opaque, call combine_clear */
+static void
+combine_xor_u (pixman_implementation_t *imp,
+               pixman_op_t              op,
+               comp4_t *                dest,
+               const comp4_t *          src,
+               const comp4_t *          mask,
+               int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	comp4_t s = combine_mask (src, mask, i);
+	comp4_t d = *(dest + i);
+	comp4_t src_ia = ALPHA_c (~s);
+	comp4_t dest_ia = ALPHA_c (~d);
+
+	UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (s, dest_ia, d, src_ia);
+	*(dest + i) = s;
+    }
+}
+
+static void
+combine_add_u (pixman_implementation_t *imp,
+               pixman_op_t              op,
+               comp4_t *                dest,
+               const comp4_t *          src,
+               const comp4_t *          mask,
+               int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	comp4_t s = combine_mask (src, mask, i);
+	comp4_t d = *(dest + i);
+	UNcx4_ADD_UNcx4 (d, s);
+	*(dest + i) = d;
+    }
+}
+
+/* if the Src is opaque, call combine_add_u */
+/* if the Dst is opaque, call combine_add_u */
+/* if both the Src and Dst are opaque, call combine_add_u */
+static void
+combine_saturate_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    comp4_t *                dest,
+                    const comp4_t *          src,
+                    const comp4_t *          mask,
+                    int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	comp4_t s = combine_mask (src, mask, i);
+	comp4_t d = *(dest + i);
+	comp2_t sa, da;
+
+	sa = s >> A_SHIFT;
+	da = ~d >> A_SHIFT;
+	if (sa > da)
+	{
+	    sa = DIV_UNc (da, sa);
+	    UNcx4_MUL_UNc (s, sa);
+	}
+	/* add the (possibly scaled-down) masked Src onto the Dst pixel */
+	UNcx4_ADD_UNcx4 (d, s);
+	*(dest + i) = d;
+    }
+}
+
+/*
+ * PDF blend modes:
+ * The following blend modes have been taken from the PDF ISO 32000
+ * specification, which at this point in time is available from
+ * http://www.adobe.com/devnet/acrobat/pdfs/PDF32000_2008.pdf
+ * The relevant chapters are 11.3.5 and 11.3.6.
+ * The formula for computing the final pixel color given in 11.3.6 is:
+ * αr × Cr = (1 – αs) × αb × Cb + (1 – αb) × αs × Cs + αb × αs × B(Cb, Cs)
+ * with B() being the blend function.
+ * Note that OVER is a special case of this operation, using B(Cb, Cs) = Cs
+ *
+ * These blend modes should match the SVG filter draft specification, as
+ * it has been designed to mirror ISO 32000. Note that at the current point
+ * no released draft exists that shows this, as the formulas have not been
+ * updated yet after the release of ISO 32000.
+ *
+ * The default implementation here uses the PDF_SEPARABLE_BLEND_MODE and
+ * PDF_NON_SEPARABLE_BLEND_MODE macros, which take the blend function as an
+ * argument. Note that this implementation operates on premultiplied colors,
+ * while the PDF specification does not. Therefore the code uses the formula
+ * Cra = (1 – as) . Dca + (1 – ad) . Sca + B(Dca, ad, Sca, as)
+ */
+
+/*
+ * Multiply
+ * B(Dca, ad, Sca, as) = Dca.Sca
+ */
+
+static void
+combine_multiply_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    comp4_t *                dest,
+                    const comp4_t *          src,
+                    const comp4_t *          mask,
+                    int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	comp4_t s = combine_mask (src, mask, i);
+	comp4_t d = *(dest + i);
+	comp4_t ss = s;
+	comp4_t src_ia = ALPHA_c (~s);
+	comp4_t dest_ia = ALPHA_c (~d);
+
+	UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (ss, dest_ia, d, src_ia);
+	UNcx4_MUL_UNcx4 (d, s);
+	UNcx4_ADD_UNcx4 (d, ss);
+
+	*(dest + i) = d;
+    }
+}
+
+static void
+combine_multiply_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     comp4_t *                dest,
+                     const comp4_t *          src,
+                     const comp4_t *          mask,
+                     int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	comp4_t m = *(mask + i);
+	comp4_t s = *(src + i);
+	comp4_t d = *(dest + i);
+	comp4_t r = d;
+	comp4_t dest_ia = ALPHA_c (~d);
+
+	combine_mask_value_ca (&s, &m);
+
+	UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (r, ~m, s, dest_ia);
+	UNcx4_MUL_UNcx4 (d, s);
+	UNcx4_ADD_UNcx4 (r, d);
+
+	*(dest + i) = r;
+    }
+}
+
+#define PDF_SEPARABLE_BLEND_MODE(name)					\
+    static void								\
+    combine_ ## name ## _u (pixman_implementation_t *imp,		\
+			    pixman_op_t              op,		\
+                            comp4_t *                dest,		\
+			    const comp4_t *          src,		\
+			    const comp4_t *          mask,		\
+			    int                      width)		\
+    {									\
+	int i;								\
+	for (i = 0; i < width; ++i) {					\
+	    comp4_t s = combine_mask (src, mask, i);			\
+	    comp4_t d = *(dest + i);					\
+	    comp1_t sa = ALPHA_c (s);					\
+	    comp1_t isa = ~sa;						\
+	    comp1_t da = ALPHA_c (d);					\
+	    comp1_t ida = ~da;						\
+	    comp4_t result;						\
+									\
+	    result = d;							\
+	    UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (result, isa, s, ida);	\
+	    								\
+	    *(dest + i) = result +					\
+		(DIV_ONE_UNc (sa * da) << A_SHIFT) +			\
+		(blend_ ## name (RED_c (d), da, RED_c (s), sa) << R_SHIFT) + \
+		(blend_ ## name (GREEN_c (d), da, GREEN_c (s), sa) << G_SHIFT) + \
+		(blend_ ## name (BLUE_c (d), da, BLUE_c (s), sa));	\
+	}								\
+    }									\
+    									\
+    static void								\
+    combine_ ## name ## _ca (pixman_implementation_t *imp,		\
+			     pixman_op_t              op,		\
+                             comp4_t *                dest,		\
+			     const comp4_t *          src,		\
+			     const comp4_t *          mask,		\
+			     int                     width)		\
+    {									\
+	int i;								\
+	for (i = 0; i < width; ++i) {					\
+	    comp4_t m = *(mask + i);					\
+	    comp4_t s = *(src + i);					\
+	    comp4_t d = *(dest + i);					\
+	    comp1_t da = ALPHA_c (d);					\
+	    comp1_t ida = ~da;						\
+	    comp4_t result;						\
+            								\
+	    combine_mask_value_ca (&s, &m);				\
+            								\
+	    result = d;							\
+	    UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (result, ~m, s, ida);     \
+            								\
+	    result +=							\
+	        (DIV_ONE_UNc (ALPHA_c (m) * da) << A_SHIFT) +		\
+	        (blend_ ## name (RED_c (d), da, RED_c (s), RED_c (m)) << R_SHIFT) + \
+	        (blend_ ## name (GREEN_c (d), da, GREEN_c (s), GREEN_c (m)) << G_SHIFT) + \
+	        (blend_ ## name (BLUE_c (d), da, BLUE_c (s), BLUE_c (m))); \
+	    								\
+	    *(dest + i) = result;					\
+	}								\
+    }
+
+/*
+ * Screen
+ * B(Dca, ad, Sca, as) = Dca.sa + Sca.da - Dca.Sca
+ */
+static inline comp4_t
+blend_screen (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    return DIV_ONE_UNc (sca * da + dca * sa - sca * dca);
+}
+
+PDF_SEPARABLE_BLEND_MODE (screen)
+
+/*
+ * Overlay
+ * B(Dca, Da, Sca, Sa) =
+ *   if 2.Dca < Da
+ *     2.Sca.Dca
+ *   otherwise
+ *     Sa.Da - 2.(Da - Dca).(Sa - Sca)
+ */
+static inline comp4_t
+blend_overlay (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    comp4_t rca;
+
+    if (2 * dca < da)
+	rca = 2 * sca * dca;
+    else
+	rca = sa * da - 2 * (da - dca) * (sa - sca);
+    return DIV_ONE_UNc (rca);
+}
+
+PDF_SEPARABLE_BLEND_MODE (overlay)
+
+/*
+ * Darken
+ * B(Dca, Da, Sca, Sa) = min (Sca.Da, Dca.Sa)
+ */
+static inline comp4_t
+blend_darken (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    comp4_t src_term = sca * da;
+    comp4_t dst_term = dca * sa;
+    /* min (Sca.Da, Dca.Sa): take the Dst term only when strictly smaller */
+    if (src_term > dst_term)
+	return DIV_ONE_UNc (dst_term);
+    return DIV_ONE_UNc (src_term);
+}
+
+PDF_SEPARABLE_BLEND_MODE (darken)
+
+/*
+ * Lighten
+ * B(Dca, Da, Sca, Sa) = max (Sca.Da, Dca.Sa)
+ */
+static inline comp4_t
+blend_lighten (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    comp4_t src_term = sca * da;
+    comp4_t dst_term = dca * sa;
+    /* max (Sca.Da, Dca.Sa): take the Src term only when strictly larger */
+    if (src_term > dst_term)
+	return DIV_ONE_UNc (src_term);
+    return DIV_ONE_UNc (dst_term);
+}
+
+PDF_SEPARABLE_BLEND_MODE (lighten)
+
+/*
+ * Color dodge
+ * B(Dca, Da, Sca, Sa) =
+ *   if Dca == 0
+ *     0
+ *   if Sca == Sa
+ *     Sa.Da
+ *   otherwise
+ *     Sa.Da. min (1, Dca / Da / (1 - Sca/Sa))
+ */
+static inline comp4_t
+blend_color_dodge (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    if (sca >= sa)
+    {
+	return dca == 0 ? 0 : DIV_ONE_UNc (sa * da);
+    }
+    else
+    {
+	comp4_t rca = dca * sa / (sa - sca);
+	return DIV_ONE_UNc (sa * MIN (rca, da));
+    }
+}
+
+PDF_SEPARABLE_BLEND_MODE (color_dodge)
+
+/*
+ * Color burn
+ * B(Dca, Da, Sca, Sa) =
+ *   if Dca == Da
+ *     Sa.Da
+ *   if Sca == 0
+ *     0
+ *   otherwise
+ *     Sa.Da.(1 - min (1, (1 - Dca/Da).Sa / Sca))
+ */
+static inline comp4_t
+blend_color_burn (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    if (sca == 0)
+    {
+	return dca < da ? 0 : DIV_ONE_UNc (sa * da);
+    }
+    else
+    {
+	comp4_t rca = (da - dca) * sa / sca;
+	return DIV_ONE_UNc (sa * (MAX (rca, da) - rca));
+    }
+}
+
+PDF_SEPARABLE_BLEND_MODE (color_burn)
+
+/*
+ * Hard light
+ * B(Dca, Da, Sca, Sa) =
+ *   if 2.Sca < Sa
+ *     2.Sca.Dca
+ *   otherwise
+ *     Sa.Da - 2.(Da - Dca).(Sa - Sca)
+ */
+static inline comp4_t
+blend_hard_light (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    if (2 * sca < sa)
+	return DIV_ONE_UNc (2 * sca * dca);
+    else
+	return DIV_ONE_UNc (sa * da - 2 * (da - dca) * (sa - sca));
+}
+
+PDF_SEPARABLE_BLEND_MODE (hard_light)
+
+/*
+ * Soft light
+ * B(Dca, Da, Sca, Sa) =
+ *   if (2.Sca < Sa)
+ *     Dca.(Sa - (1 - Dca/Da).(Sa - 2.Sca))
+ *   otherwise if Dca.4 <= Da
+ *     Dca.(Sa + (2.Sca - Sa).((16.Dca/Da - 12).Dca/Da + 3))
+ *   otherwise
+ *     (Dca.Sa + (SQRT (Dca/Da).Da - Dca).(2.Sca - Sa))
+ */
+static inline comp4_t
+blend_soft_light (comp4_t dca_org,
+		  comp4_t da_org,
+		  comp4_t sca_org,
+		  comp4_t sa_org)
+{
+    double dca = dca_org * (1.0 / MASK);
+    double da = da_org * (1.0 / MASK);
+    double sca = sca_org * (1.0 / MASK);
+    double sa = sa_org * (1.0 / MASK);
+    double rca;
+
+    if (2 * sca < sa)
+    {
+	if (da == 0)
+	    rca = dca * sa;
+	else
+	    rca = dca * sa - dca * (da - dca) * (sa - 2 * sca) / da;
+    }
+    else if (da == 0)
+    {
+	rca = 0;
+    }
+    else if (4 * dca <= da)
+    {
+	rca = dca * sa +
+	    (2 * sca - sa) * dca * ((16 * dca / da - 12) * dca / da + 3);
+    }
+    else
+    {
+	rca = dca * sa + (sqrt (dca * da) - dca) * (2 * sca - sa);
+    }
+    return rca * MASK + 0.5;
+}
+
+PDF_SEPARABLE_BLEND_MODE (soft_light)
+
+/*
+ * Difference
+ * B(Dca, Da, Sca, Sa) = abs (Dca.Sa - Sca.Da)
+ */
+static inline comp4_t
+blend_difference (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    comp4_t dcasa = dca * sa;
+    comp4_t scada = sca * da;
+
+    if (scada < dcasa)
+	return DIV_ONE_UNc (dcasa - scada);
+    else
+	return DIV_ONE_UNc (scada - dcasa);
+}
+
+PDF_SEPARABLE_BLEND_MODE (difference)
+
+/*
+ * Exclusion
+ * B(Dca, Da, Sca, Sa) = (Sca.Da + Dca.Sa - 2.Sca.Dca)
+ */
+
+/* This can be made faster by writing it directly and not using
+ * PDF_SEPARABLE_BLEND_MODE, but that's a performance optimization */
+
+static inline comp4_t
+blend_exclusion (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    return DIV_ONE_UNc (sca * da + dca * sa - 2 * dca * sca);
+}
+
+PDF_SEPARABLE_BLEND_MODE (exclusion)
+
+#undef PDF_SEPARABLE_BLEND_MODE
+
+/*
+ * PDF non-separable blend modes are implemented using the following functions
+ * to operate in HSL space, with Cmax, Cmid, Cmin referring to the max, mid
+ * and min value of the red, green and blue components.
+ *
+ * LUM (C) = 0.3 × Cred + 0.59 × Cgreen + 0.11 × Cblue
+ *
+ * clip_color (C):
+ *   l = LUM (C)
+ *   min = Cmin
+ *   max = Cmax
+ *   if min < 0.0
+ *     C = l + ( ( ( C – l ) × l ) ⁄ ( l – min ) )
+ *   if max > 1.0
+ *     C = l + ( ( ( C – l ) × ( 1 – l ) ) ⁄ ( max – l ) )
+ *   return C
+ *
+ * set_lum (C, l):
+ *   d = l – LUM (C)
+ *   C += d
+ *   return clip_color (C)
+ *
+ * SAT (C) = CH_MAX (C) - CH_MIN (C)
+ *
+ * set_sat (C, s):
+ *  if Cmax > Cmin
+ *    Cmid = ( ( ( Cmid – Cmin ) × s ) ⁄ ( Cmax – Cmin ) )
+ *    Cmax = s
+ *  else
+ *    Cmid = Cmax = 0.0
+ *  Cmin = 0.0
+ *  return C
+ */
+
+/* For premultiplied colors, we need to know what happens when C is
+ * multiplied by a real number. LUM and SAT are linear:
+ *
+ *    LUM (r × C) = r × LUM (C)		SAT (r * C) = r * SAT (C)
+ *
+ * If we extend clip_color with an extra argument a and change
+ *
+ *        if max > 1.0
+ *
+ * into
+ *
+ *        if max > a
+ *
+ * then clip_color is also linear:
+ *
+ *    r * clip_color (C, a) = clip_color (r * C, r * a);
+ *
+ * for positive r.
+ *
+ * Similarly, we can extend set_lum with an extra argument that is just passed
+ * on to clip_color:
+ *
+ *   r * set_lum ( C, l, a)
+ *
+ *   = r × clip_color ( C + l - LUM (C), a)
+ *
+ *   = clip_color ( r * C + r × l - r * LUM (C), r * a)
+ *
+ *   = set_lum ( r * C, r * l, r * a)
+ *
+ * Finally, set_sat:
+ *
+ *    r * set_sat (C, s) = set_sat (x * C, r * s)
+ *
+ * The above holds for all non-zero x, because the x'es in the fraction for
+ * C_mid cancel out. Specifically, it holds for x = r:
+ *
+ *    r * set_sat (C, s) = set_sat (r * C, r * s)
+ *
+ */
+
+/* So, for the non-separable PDF blend modes, we have (using s, d for
+ * non-premultiplied colors, and S, D for premultiplied):
+ *
+ *   Color:
+ *
+ *     a_s * a_d * B(s, d)
+ *   = a_s * a_d * set_lum (S/a_s, LUM (D/a_d), 1)
+ *   = set_lum (S * a_d, a_s * LUM (D), a_s * a_d)
+ *
+ *
+ *   Luminosity:
+ *
+ *     a_s * a_d * B(s, d)
+ *   = a_s * a_d * set_lum (D/a_d, LUM(S/a_s), 1)
+ *   = set_lum (a_s * D, a_d * LUM(S), a_s * a_d)
+ *
+ *
+ *   Saturation:
+ *
+ *     a_s * a_d * B(s, d)
+ *   = a_s * a_d * set_lum (set_sat (D/a_d, SAT (S/a_s)), LUM (D/a_d), 1)
+ *   = set_lum (a_s * a_d * set_sat (D/a_d, SAT (S/a_s)),
+ *                                        a_s * LUM (D), a_s * a_d)
+ *   = set_lum (set_sat (a_s * D, a_d * SAT (S)), a_s * LUM (D), a_s * a_d)
+ *
+ *   Hue:
+ *
+ *     a_s * a_d * B(s, d)
+ *   = a_s * a_d * set_lum (set_sat (S/a_s, SAT (D/a_d)), LUM (D/a_d), 1)
+ *   = set_lum (set_sat (a_d * S, a_s * SAT (D)), a_s * LUM (D), a_s * a_d)
+ *
+ */
+
+#define CH_MIN(c) (c[0] < c[1] ? (c[0] < c[2] ? c[0] : c[2]) : (c[1] < c[2] ? c[1] : c[2]))
+#define CH_MAX(c) (c[0] > c[1] ? (c[0] > c[2] ? c[0] : c[2]) : (c[1] > c[2] ? c[1] : c[2]))
+#define LUM(c) ((c[0] * 30 + c[1] * 59 + c[2] * 11) / 100)
+#define SAT(c) (CH_MAX (c) - CH_MIN (c))
+
+#define PDF_NON_SEPARABLE_BLEND_MODE(name)				\
+    static void								\
+    combine_ ## name ## _u (pixman_implementation_t *imp,		\
+			    pixman_op_t op,				\
+                            comp4_t *dest,				\
+			    const comp4_t *src,				\
+			    const comp4_t *mask,			\
+			    int width)					\
+    {									\
+	int i;								\
+	for (i = 0; i < width; ++i)					\
+	{								\
+	    comp4_t s = combine_mask (src, mask, i);			\
+	    comp4_t d = *(dest + i);					\
+	    comp1_t sa = ALPHA_c (s);					\
+	    comp1_t isa = ~sa;						\
+	    comp1_t da = ALPHA_c (d);					\
+	    comp1_t ida = ~da;						\
+	    comp4_t result;						\
+	    comp4_t sc[3], dc[3], c[3];					\
+            								\
+	    result = d;							\
+	    UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (result, isa, s, ida);	\
+	    dc[0] = RED_c (d);						\
+	    sc[0] = RED_c (s);						\
+	    dc[1] = GREEN_c (d);					\
+	    sc[1] = GREEN_c (s);					\
+	    dc[2] = BLUE_c (d);						\
+	    sc[2] = BLUE_c (s);						\
+	    blend_ ## name (c, dc, da, sc, sa);				\
+            								\
+	    *(dest + i) = result +					\
+		(DIV_ONE_UNc (sa * da) << A_SHIFT) +			\
+		(DIV_ONE_UNc (c[0]) << R_SHIFT) +			\
+		(DIV_ONE_UNc (c[1]) << G_SHIFT) +			\
+		(DIV_ONE_UNc (c[2]));					\
+	}								\
+    }
+
+static void
+set_lum (comp4_t dest[3], comp4_t src[3], comp4_t sa, comp4_t lum)
+{
+    double a, l, min, max;
+    double tmp[3];
+
+    a = sa * (1.0 / MASK);
+
+    l = lum * (1.0 / MASK);
+    tmp[0] = src[0] * (1.0 / MASK);
+    tmp[1] = src[1] * (1.0 / MASK);
+    tmp[2] = src[2] * (1.0 / MASK);
+
+    l = l - LUM (tmp);
+    tmp[0] += l;
+    tmp[1] += l;
+    tmp[2] += l;
+
+    /* clip_color */
+    l = LUM (tmp);
+    min = CH_MIN (tmp);
+    max = CH_MAX (tmp);
+
+    if (min < 0)
+    {
+	if (l - min == 0.0)
+	{
+	    tmp[0] = 0;
+	    tmp[1] = 0;
+	    tmp[2] = 0;
+	}
+	else
+	{
+	    tmp[0] = l + (tmp[0] - l) * l / (l - min);
+	    tmp[1] = l + (tmp[1] - l) * l / (l - min);
+	    tmp[2] = l + (tmp[2] - l) * l / (l - min);
+	}
+    }
+    if (max > a)
+    {
+	if (max - l == 0.0)
+	{
+	    tmp[0] = a;
+	    tmp[1] = a;
+	    tmp[2] = a;
+	}
+	else
+	{
+	    tmp[0] = l + (tmp[0] - l) * (a - l) / (max - l);
+	    tmp[1] = l + (tmp[1] - l) * (a - l) / (max - l);
+	    tmp[2] = l + (tmp[2] - l) * (a - l) / (max - l);
+	}
+    }
+
+    dest[0] = tmp[0] * MASK + 0.5;
+    dest[1] = tmp[1] * MASK + 0.5;
+    dest[2] = tmp[2] * MASK + 0.5;
+}
+
+static void
+set_sat (comp4_t dest[3], comp4_t src[3], comp4_t sat)
+{
+    int id[3];
+    comp4_t min, max;
+    /* Rank channels by src: id[0] = largest, id[1] = middle, id[2] = smallest */
+    if (src[0] > src[1])
+    {
+	if (src[0] > src[2])
+	{
+	    id[0] = 0;
+	    if (src[1] > src[2])
+	    {
+		id[1] = 1;
+		id[2] = 2;
+	    }
+	    else
+	    {
+		id[1] = 2;
+		id[2] = 1;
+	    }
+	}
+	else
+	{
+	    id[0] = 2;
+	    id[1] = 0;
+	    id[2] = 1;
+	}
+    }
+    else
+    {
+	if (src[0] > src[2])
+	{
+	    id[0] = 1;
+	    id[1] = 0;
+	    id[2] = 2;
+	}
+	else
+	{
+	    id[2] = 0;
+	    if (src[1] > src[2])
+	    {
+		id[0] = 1;
+		id[1] = 2;
+	    }
+	    else
+	    {
+		id[0] = 2;
+		id[1] = 1;
+	    }
+	}
+    }
+    /* Read from src (not dest) so the result is right even if they differ */
+    max = src[id[0]];
+    min = src[id[2]];
+    if (max > min)
+    {
+	dest[id[1]] = (src[id[1]] - min) * sat / (max - min);
+	dest[id[0]] = sat;
+	dest[id[2]] = 0;
+    }
+    else
+    {
+	dest[0] = dest[1] = dest[2] = 0;
+    }
+}
+
+/*
+ * Hue:
+ * B(Cb, Cs) = set_lum (set_sat (Cs, SAT (Cb)), LUM (Cb))
+ */
+static inline void
+blend_hsl_hue (comp4_t c[3],
+               comp4_t dc[3],
+               comp4_t da,
+               comp4_t sc[3],
+               comp4_t sa)
+{
+    c[0] = sc[0] * da;
+    c[1] = sc[1] * da;
+    c[2] = sc[2] * da;
+    set_sat (c, c, SAT (dc) * sa);
+    set_lum (c, c, sa * da, LUM (dc) * sa);
+}
+
+PDF_NON_SEPARABLE_BLEND_MODE (hsl_hue)
+
+/*
+ * Saturation:
+ * B(Cb, Cs) = set_lum (set_sat (Cb, SAT (Cs)), LUM (Cb))
+ */
+static inline void
+blend_hsl_saturation (comp4_t c[3],
+                      comp4_t dc[3],
+                      comp4_t da,
+                      comp4_t sc[3],
+                      comp4_t sa)
+{
+    c[0] = dc[0] * sa;
+    c[1] = dc[1] * sa;
+    c[2] = dc[2] * sa;
+    set_sat (c, c, SAT (sc) * da);
+    set_lum (c, c, sa * da, LUM (dc) * sa);
+}
+
+PDF_NON_SEPARABLE_BLEND_MODE (hsl_saturation)
+
+/*
+ * Color:
+ * B(Cb, Cs) = set_lum (Cs, LUM (Cb))
+ */
+static inline void
+blend_hsl_color (comp4_t c[3],
+                 comp4_t dc[3],
+                 comp4_t da,
+                 comp4_t sc[3],
+                 comp4_t sa)
+{
+    c[0] = sc[0] * da;
+    c[1] = sc[1] * da;
+    c[2] = sc[2] * da;
+    set_lum (c, c, sa * da, LUM (dc) * sa);
+}
+
+PDF_NON_SEPARABLE_BLEND_MODE (hsl_color)
+
+/*
+ * Luminosity:
+ * B(Cb, Cs) = set_lum (Cb, LUM (Cs))
+ */
+static inline void
+blend_hsl_luminosity (comp4_t c[3],
+                      comp4_t dc[3],
+                      comp4_t da,
+                      comp4_t sc[3],
+                      comp4_t sa)
+{
+    c[0] = dc[0] * sa;
+    c[1] = dc[1] * sa;
+    c[2] = dc[2] * sa;
+    set_lum (c, c, sa * da, LUM (sc) * da);
+}
+
+PDF_NON_SEPARABLE_BLEND_MODE (hsl_luminosity)
+
+#undef SAT
+#undef LUM
+#undef CH_MAX
+#undef CH_MIN
+#undef PDF_NON_SEPARABLE_BLEND_MODE
+
+/* Overlay
+ *
+ * All of the disjoint composing functions
+ *
+ * The four entries in the first column indicate what source contributions
+ * come from each of the four areas of the picture -- areas covered by neither
+ * A nor B, areas covered only by A, areas covered only by B and finally
+ * areas covered by both A and B.
+ * 
+ * Disjoint			Conjoint
+ * Fa		Fb		Fa		Fb
+ * (0,0,0,0)	0		0		0		0
+ * (0,A,0,A)	1		0		1		0
+ * (0,0,B,B)	0		1		0		1
+ * (0,A,B,A)	1		min((1-a)/b,1)	1		max(1-a/b,0)
+ * (0,A,B,B)	min((1-b)/a,1)	1		max(1-b/a,0)	1
+ * (0,0,0,A)	max(1-(1-b)/a,0) 0		min(1,b/a)	0
+ * (0,0,0,B)	0		max(1-(1-a)/b,0) 0		min(a/b,1)
+ * (0,A,0,0)	min(1,(1-b)/a)	0		max(1-b/a,0)	0
+ * (0,0,B,0)	0		min(1,(1-a)/b)	0		max(1-a/b,0)
+ * (0,0,B,A)	max(1-(1-b)/a,0) min(1,(1-a)/b)	 min(1,b/a)	max(1-a/b,0)
+ * (0,A,0,B)	min(1,(1-b)/a)	max(1-(1-a)/b,0) max(1-b/a,0)	min(1,a/b)
+ * (0,A,B,0)	min(1,(1-b)/a)	min(1,(1-a)/b)	max(1-b/a,0)	max(1-a/b,0)
+ */
+
+#define COMBINE_A_OUT 1
+#define COMBINE_A_IN  2
+#define COMBINE_B_OUT 4
+#define COMBINE_B_IN  8
+
+#define COMBINE_CLEAR   0
+#define COMBINE_A       (COMBINE_A_OUT | COMBINE_A_IN)
+#define COMBINE_B       (COMBINE_B_OUT | COMBINE_B_IN)
+#define COMBINE_A_OVER  (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_A_IN)
+#define COMBINE_B_OVER  (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_B_IN)
+#define COMBINE_A_ATOP  (COMBINE_B_OUT | COMBINE_A_IN)
+#define COMBINE_B_ATOP  (COMBINE_A_OUT | COMBINE_B_IN)
+#define COMBINE_XOR     (COMBINE_A_OUT | COMBINE_B_OUT)
+
+/* Disjoint "out" factor: min (1, (1 - b) / a), i.e. the portion of a that
+ * is not covered by b. */
+static comp1_t
+combine_disjoint_out_part (comp1_t a, comp1_t b)
+{
+    const comp1_t inv_b = ~b;           /* 1 - b */
+
+    /* (1 - b) >= a saturates at 1; this also catches a == 0, so the
+     * division below never sees a zero divisor. */
+    return (inv_b >= a) ? MASK : DIV_UNc (inv_b, a);
+}
+
+/* Disjoint "in" factor: max (1 - (1 - b) / a, 0), i.e. the portion covered
+ * by both a and b.  Because
+ *     max (1 - x, 0) = 1 - min (1, x)
+ * the result is the complement of min (1, (1 - b) / a). */
+static comp1_t
+combine_disjoint_in_part (comp1_t a, comp1_t b)
+{
+    const comp1_t inv_b = ~b;           /* 1 - b */
+
+    /* (1 - b) >= a means (1 - b) / a >= 1, so the result clamps to 0;
+     * this also keeps a == 0 away from the division. */
+    return (inv_b >= a) ? 0 : ~DIV_UNc (inv_b, a);
+}
+
+/* Conjoint "out" factor: max (1 - b / a, 0) = 1 - min (b / a, 1), i.e. the
+ * portion of a that is not covered by b. */
+static comp1_t
+combine_conjoint_out_part (comp1_t a, comp1_t b)
+{
+    /* b >= a means b / a >= 1, so nothing of a is left; this also keeps
+     * a == 0 away from the division. */
+    if (a <= b)
+	return 0x00;
+
+    /* 1 - b / a */
+    return ~DIV_UNc (b, a);
+}
+
+/* Conjoint "in" factor: min (1, b / a), i.e. the portion covered by both
+ * a and b. */
+static comp1_t
+combine_conjoint_in_part (comp1_t a, comp1_t b)
+{
+    /* b >= a saturates at 1 and also keeps a == 0 out of the division. */
+    if (a > b)
+	return DIV_UNc (b, a);
+    return MASK;
+}
+
+#define GET_COMP(v, i)   ((comp2_t) (comp1_t) ((v) >> (i)))	/* channel of v at bit offset i */
+/* Sum of channel i of x and y, forced to all ones when it carries into bit G_SHIFT; t is scratch. */
+#define ADD(x, y, i, t)							\
+    ((t) = GET_COMP (x, i) + GET_COMP (y, i),				\
+     (comp4_t) ((comp1_t) ((t) | (0 - ((t) >> G_SHIFT)))) << (i))
+/* y * ay + x * ax for channel i with the same clamping as ADD; t, u and v are scratch. */
+#define GENERIC(x, y, i, ax, ay, t, u, v)				\
+    ((t) = (MUL_UNc (GET_COMP (y, i), ay, (u)) +			\
+            MUL_UNc (GET_COMP (x, i), ax, (v))),			\
+     (comp4_t) ((comp1_t) ((t) |					\
+                           (0 - ((t) >> G_SHIFT)))) << (i))
+
+static void
+combine_disjoint_general_u (comp4_t *      dest,
+                            const comp4_t *src,
+                            const comp4_t *mask,
+                            int            width,
+                            comp1_t        combine)
+{
+    int i;
+    /* For each of the width pixels: dest = src * Fa + dest * Fb, clamped per channel by GENERIC. */
+    for (i = 0; i < width; ++i)
+    {
+	comp4_t s = combine_mask (src, mask, i);
+	comp4_t d = *(dest + i);
+	comp4_t m, n, o, p;
+	comp2_t Fa, Fb, t, u, v;
+	comp1_t sa = s >> A_SHIFT;
+	comp1_t da = d >> A_SHIFT;
+	/* Fa: source weight, selected by the COMBINE_A_OUT / COMBINE_A_IN bits of combine. */
+	switch (combine & COMBINE_A)
+	{
+	default:
+	    Fa = 0;	/* neither A bit set */
+	    break;
+
+	case COMBINE_A_OUT:
+	    Fa = combine_disjoint_out_part (sa, da);
+	    break;
+
+	case COMBINE_A_IN:
+	    Fa = combine_disjoint_in_part (sa, da);
+	    break;
+
+	case COMBINE_A:
+	    Fa = MASK;	/* both A bits set: full weight */
+	    break;
+	}
+	/* Fb: destination weight, selected by the COMBINE_B_OUT / COMBINE_B_IN bits of combine. */
+	switch (combine & COMBINE_B)
+	{
+	default:
+	    Fb = 0;	/* neither B bit set */
+	    break;
+
+	case COMBINE_B_OUT:
+	    Fb = combine_disjoint_out_part (da, sa);
+	    break;
+
+	case COMBINE_B_IN:
+	    Fb = combine_disjoint_in_part (da, sa);
+	    break;
+
+	case COMBINE_B:
+	    Fb = MASK;	/* both B bits set: full weight */
+	    break;
+	}
+	m = GENERIC (s, d, 0, Fa, Fb, t, u, v);	/* one GENERIC per channel; t, u, v are scratch */
+	n = GENERIC (s, d, G_SHIFT, Fa, Fb, t, u, v);
+	o = GENERIC (s, d, R_SHIFT, Fa, Fb, t, u, v);
+	p = GENERIC (s, d, A_SHIFT, Fa, Fb, t, u, v);
+	s = m | n | o | p;	/* reassemble the four channels */
+	*(dest + i) = s;
+    }
+}
+
+/* Disjoint OVER, per-pixel alpha: dest is scaled by the disjoint "out" part
+ * of the destination alpha against the source alpha, then the masked source
+ * is added.  Pixels whose masked source is zero are left untouched. */
+static void
+combine_disjoint_over_u (pixman_implementation_t *imp, pixman_op_t op,
+                         comp4_t *dest, const comp4_t *src,
+                         const comp4_t *mask, int width)
+{
+    int px;
+
+    for (px = 0; px < width; ++px)
+    {
+	comp4_t pixel = combine_mask (src, mask, px);
+	comp4_t under;
+	comp2_t factor;
+
+	if (pixel == 0x00)
+	    continue;
+
+	under = dest[px];
+	factor = combine_disjoint_out_part (under >> A_SHIFT, pixel >> A_SHIFT);
+	UNcx4_MUL_UNc_ADD_UNcx4 (under, factor, pixel);
+	dest[px] = under;
+    }
+}
+
+/* Disjoint IN, per-pixel alpha: src weight = "in" part, dest weight = 0. */
+static void
+combine_disjoint_in_u (pixman_implementation_t *imp, pixman_op_t op,
+                       comp4_t *dest, const comp4_t *src,
+                       const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_A_IN;
+
+    combine_disjoint_general_u (dest, src, mask, width, parts);
+}
+
+/* Disjoint IN_REVERSE, per-pixel alpha: src weight = 0, dest weight = "in" part. */
+static void
+combine_disjoint_in_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
+                               comp4_t *dest, const comp4_t *src,
+                               const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_B_IN;
+
+    combine_disjoint_general_u (dest, src, mask, width, parts);
+}
+
+/* Disjoint OUT, per-pixel alpha: src weight = "out" part, dest weight = 0. */
+static void
+combine_disjoint_out_u (pixman_implementation_t *imp, pixman_op_t op,
+                        comp4_t *dest, const comp4_t *src,
+                        const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_A_OUT;
+
+    combine_disjoint_general_u (dest, src, mask, width, parts);
+}
+
+/* Disjoint OUT_REVERSE, per-pixel alpha: src weight = 0, dest weight = "out" part. */
+static void
+combine_disjoint_out_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
+                                comp4_t *dest, const comp4_t *src,
+                                const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_B_OUT;
+
+    combine_disjoint_general_u (dest, src, mask, width, parts);
+}
+
+/* Disjoint ATOP, per-pixel alpha: src weight = "in" part, dest weight = "out" part. */
+static void
+combine_disjoint_atop_u (pixman_implementation_t *imp, pixman_op_t op,
+                         comp4_t *dest, const comp4_t *src,
+                         const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_A_ATOP;
+
+    combine_disjoint_general_u (dest, src, mask, width, parts);
+}
+
+/* Disjoint ATOP_REVERSE, per-pixel alpha: src weight = "out" part, dest weight = "in" part. */
+static void
+combine_disjoint_atop_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
+                                 comp4_t *dest, const comp4_t *src,
+                                 const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_B_ATOP;
+
+    combine_disjoint_general_u (dest, src, mask, width, parts);
+}
+
+/* Disjoint XOR, per-pixel alpha: src and dest are both weighted by their "out" part. */
+static void
+combine_disjoint_xor_u (pixman_implementation_t *imp, pixman_op_t op,
+                        comp4_t *dest, const comp4_t *src,
+                        const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_XOR;
+
+    combine_disjoint_general_u (dest, src, mask, width, parts);
+}
+
+static void
+combine_conjoint_general_u (comp4_t *      dest,
+                            const comp4_t *src,
+                            const comp4_t *mask,
+                            int            width,
+                            comp1_t        combine)
+{
+    int i;
+    /* For each of the width pixels: dest = src * Fa + dest * Fb, clamped per channel by GENERIC. */
+    for (i = 0; i < width; ++i)
+    {
+	comp4_t s = combine_mask (src, mask, i);
+	comp4_t d = *(dest + i);
+	comp4_t m, n, o, p;
+	comp2_t Fa, Fb, t, u, v;
+	comp1_t sa = s >> A_SHIFT;
+	comp1_t da = d >> A_SHIFT;
+	/* Fa: source weight, selected by the COMBINE_A_OUT / COMBINE_A_IN bits of combine. */
+	switch (combine & COMBINE_A)
+	{
+	default:
+	    Fa = 0;	/* neither A bit set */
+	    break;
+
+	case COMBINE_A_OUT:
+	    Fa = combine_conjoint_out_part (sa, da);
+	    break;
+
+	case COMBINE_A_IN:
+	    Fa = combine_conjoint_in_part (sa, da);
+	    break;
+
+	case COMBINE_A:
+	    Fa = MASK;	/* both A bits set: full weight */
+	    break;
+	}
+	/* Fb: destination weight, selected by the COMBINE_B_OUT / COMBINE_B_IN bits of combine. */
+	switch (combine & COMBINE_B)
+	{
+	default:
+	    Fb = 0;	/* neither B bit set */
+	    break;
+
+	case COMBINE_B_OUT:
+	    Fb = combine_conjoint_out_part (da, sa);
+	    break;
+
+	case COMBINE_B_IN:
+	    Fb = combine_conjoint_in_part (da, sa);
+	    break;
+
+	case COMBINE_B:
+	    Fb = MASK;	/* both B bits set: full weight */
+	    break;
+	}
+	/* One GENERIC per channel; t, u and v are scratch temporaries. */
+	m = GENERIC (s, d, 0, Fa, Fb, t, u, v);
+	n = GENERIC (s, d, G_SHIFT, Fa, Fb, t, u, v);
+	o = GENERIC (s, d, R_SHIFT, Fa, Fb, t, u, v);
+	p = GENERIC (s, d, A_SHIFT, Fa, Fb, t, u, v);
+	/* Reassemble the four channels into one pixel. */
+	s = m | n | o | p;
+
+	*(dest + i) = s;
+    }
+}
+
+/* Conjoint OVER, per-pixel alpha: src weight = 1, dest weight = "out" part. */
+static void
+combine_conjoint_over_u (pixman_implementation_t *imp, pixman_op_t op,
+                         comp4_t *dest, const comp4_t *src,
+                         const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_A_OVER;
+
+    combine_conjoint_general_u (dest, src, mask, width, parts);
+}
+
+/* Conjoint OVER_REVERSE, per-pixel alpha: src weight = "out" part, dest weight = 1. */
+static void
+combine_conjoint_over_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
+                                 comp4_t *dest, const comp4_t *src,
+                                 const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_B_OVER;
+
+    combine_conjoint_general_u (dest, src, mask, width, parts);
+}
+
+/* Conjoint IN, per-pixel alpha: src weight = "in" part, dest weight = 0. */
+static void
+combine_conjoint_in_u (pixman_implementation_t *imp, pixman_op_t op,
+                       comp4_t *dest, const comp4_t *src,
+                       const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_A_IN;
+
+    combine_conjoint_general_u (dest, src, mask, width, parts);
+}
+
+/* Conjoint IN_REVERSE, per-pixel alpha: src weight = 0, dest weight = "in" part. */
+static void
+combine_conjoint_in_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
+                               comp4_t *dest, const comp4_t *src,
+                               const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_B_IN;
+
+    combine_conjoint_general_u (dest, src, mask, width, parts);
+}
+
+/* Conjoint OUT, per-pixel alpha: src weight = "out" part, dest weight = 0. */
+static void
+combine_conjoint_out_u (pixman_implementation_t *imp, pixman_op_t op,
+                        comp4_t *dest, const comp4_t *src,
+                        const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_A_OUT;
+
+    combine_conjoint_general_u (dest, src, mask, width, parts);
+}
+
+/* Conjoint OUT_REVERSE, per-pixel alpha: src weight = 0, dest weight = "out" part. */
+static void
+combine_conjoint_out_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
+                                comp4_t *dest, const comp4_t *src,
+                                const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_B_OUT;
+
+    combine_conjoint_general_u (dest, src, mask, width, parts);
+}
+
+/* Conjoint ATOP, per-pixel alpha: src weight = "in" part, dest weight = "out" part. */
+static void
+combine_conjoint_atop_u (pixman_implementation_t *imp, pixman_op_t op,
+                         comp4_t *dest, const comp4_t *src,
+                         const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_A_ATOP;
+
+    combine_conjoint_general_u (dest, src, mask, width, parts);
+}
+
+/* Conjoint ATOP_REVERSE, per-pixel alpha: src weight = "out" part, dest weight = "in" part. */
+static void
+combine_conjoint_atop_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
+                                 comp4_t *dest, const comp4_t *src,
+                                 const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_B_ATOP;
+
+    combine_conjoint_general_u (dest, src, mask, width, parts);
+}
+
+/* Conjoint XOR, per-pixel alpha: src and dest are both weighted by their "out" part. */
+static void
+combine_conjoint_xor_u (pixman_implementation_t *imp, pixman_op_t op,
+                        comp4_t *dest, const comp4_t *src,
+                        const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_XOR;
+
+    combine_conjoint_general_u (dest, src, mask, width, parts);
+}
+
+/************************************************************************/
+/*********************** Per Channel functions **************************/
+/************************************************************************/
+
+/* Per-channel CLEAR: zero all width destination pixels; src and mask are not read. */
+static void
+combine_clear_ca (pixman_implementation_t *imp, pixman_op_t op,
+                  comp4_t *dest, const comp4_t *src,
+                  const comp4_t *mask, int width)
+{
+    const size_t n_bytes = width * sizeof (comp4_t);
+
+    memset (dest, 0, n_bytes);
+}
+
+/* Per-channel SRC: each dest pixel becomes the source pixel as rewritten in
+ * place by combine_mask_value_ca(); the previous dest value is not read. */
+static void
+combine_src_ca (pixman_implementation_t *imp, pixman_op_t op,
+                comp4_t *dest, const comp4_t *src,
+                const comp4_t *mask, int width)
+{
+    int px;
+
+    for (px = 0; px < width; ++px)
+    {
+	comp4_t pixel = src[px];
+	comp4_t weight = mask[px];
+
+	/* weight is only a scratch copy; the mask array is never written. */
+	combine_mask_value_ca (&pixel, &weight);
+
+	dest[px] = pixel;
+    }
+}
+
+/* Per-channel OVER: combine_mask_ca() rewrites the source and mask words in
+ * place; dest is then weighted by the complement of the mask word. */
+static void
+combine_over_ca (pixman_implementation_t *imp, pixman_op_t op,
+                 comp4_t *dest, const comp4_t *src,
+                 const comp4_t *mask, int width)
+{
+    int px;
+
+    for (px = 0; px < width; ++px)
+    {
+	comp4_t pixel = src[px];
+	comp4_t weight = mask[px];
+	comp4_t keep, under;
+
+	combine_mask_ca (&pixel, &weight);
+	keep = ~weight;
+
+	/* keep == 0: dest contributes nothing, so it is not even read. */
+	if (keep == 0)
+	{
+	    dest[px] = pixel;
+	    continue;
+	}
+	under = dest[px];
+	UNcx4_MUL_UNcx4_ADD_UNcx4 (under, keep, pixel);
+	dest[px] = under;
+    }
+}
+
+/* Per-channel OVER_REVERSE: src * mask is scaled by the inverted destination
+ * alpha and combined with the existing dest pixel. */
+static void
+combine_over_reverse_ca (pixman_implementation_t *imp, pixman_op_t op,
+                         comp4_t *dest, const comp4_t *src,
+                         const comp4_t *mask, int width)
+{
+    int px;
+
+    for (px = 0; px < width; ++px)
+    {
+	comp4_t under = dest[px];
+	comp4_t room = ~under >> A_SHIFT;   /* 1 - dest alpha */
+	comp4_t pixel, weight;
+
+	/* All alpha bits set in dest: the pixel is left untouched. */
+	if (room == 0)
+	    continue;
+
+	pixel = src[px];
+	weight = mask[px];
+	UNcx4_MUL_UNcx4 (pixel, weight);
+	UNcx4_MUL_UNc_ADD_UNcx4 (pixel, room, under);
+
+	dest[px] = pixel;
+    }
+}
+
+/* Per-channel IN: dest = masked src scaled by the destination alpha; a
+ * destination alpha of zero yields a zero pixel. */
+static void
+combine_in_ca (pixman_implementation_t *imp, pixman_op_t op,
+               comp4_t *dest, const comp4_t *src,
+               const comp4_t *mask, int width)
+{
+    int px;
+
+    for (px = 0; px < width; ++px)
+    {
+	comp2_t dest_alpha = dest[px] >> A_SHIFT;
+	comp4_t pixel, weight;
+
+	if (dest_alpha == 0)
+	{
+	    dest[px] = 0;
+	    continue;
+	}
+	pixel = src[px];
+	weight = mask[px];
+	combine_mask_value_ca (&pixel, &weight);
+
+	/* The scaling step is skipped when the alpha equals MASK. */
+	if (dest_alpha != MASK)
+	    UNcx4_MUL_UNc (pixel, dest_alpha);
+
+	dest[px] = pixel;
+    }
+}
+
+/* Per-channel IN_REVERSE: scale dest by the word that combine_mask_alpha_ca()
+ * leaves in the mask copy.  An all-ones word leaves dest untouched, a zero
+ * word clears it. */
+static void
+combine_in_reverse_ca (pixman_implementation_t *imp, pixman_op_t op,
+                       comp4_t *dest, const comp4_t *src,
+                       const comp4_t *mask, int width)
+{
+    int px;
+
+    for (px = 0; px < width; ++px)
+    {
+	comp4_t pixel = src[px];
+	comp4_t factor = mask[px];
+	comp4_t under;
+
+	combine_mask_alpha_ca (&pixel, &factor);
+
+	/* All ones: dest already holds the result. */
+	if (factor == ~0)
+	    continue;
+
+	if (factor == 0)
+	{
+	    dest[px] = 0;
+	    continue;
+	}
+
+	under = dest[px];
+	UNcx4_MUL_UNcx4 (under, factor);
+	dest[px] = under;
+    }
+}
+
+/* Per-channel OUT: dest = masked src scaled by the inverted destination
+ * alpha; when that inverted alpha is zero the result is a zero pixel. */
+static void
+combine_out_ca (pixman_implementation_t *imp, pixman_op_t op,
+                comp4_t *dest, const comp4_t *src,
+                const comp4_t *mask, int width)
+{
+    int px;
+
+    for (px = 0; px < width; ++px)
+    {
+	comp2_t room = ~dest[px] >> A_SHIFT;   /* 1 - dest alpha */
+	comp4_t pixel, weight;
+
+	if (room == 0)
+	{
+	    dest[px] = 0;
+	    continue;
+	}
+	pixel = src[px];
+	weight = mask[px];
+	combine_mask_value_ca (&pixel, &weight);
+
+	/* The scaling step is skipped when the factor equals MASK. */
+	if (room != MASK)
+	    UNcx4_MUL_UNc (pixel, room);
+
+	dest[px] = pixel;
+    }
+}
+
+/* Per-channel OUT_REVERSE: scale dest by the complement of the word that
+ * combine_mask_alpha_ca() leaves in the mask copy.  An all-ones factor
+ * leaves dest untouched, a zero factor clears it. */
+static void
+combine_out_reverse_ca (pixman_implementation_t *imp, pixman_op_t op,
+                        comp4_t *dest, const comp4_t *src,
+                        const comp4_t *mask, int width)
+{
+    int px;
+
+    for (px = 0; px < width; ++px)
+    {
+	comp4_t pixel = src[px];
+	comp4_t weight = mask[px];
+	comp4_t factor, under;
+
+	combine_mask_alpha_ca (&pixel, &weight);
+	factor = ~weight;
+
+	/* All ones: dest already holds the result. */
+	if (factor == ~0)
+	    continue;
+
+	if (factor == 0)
+	{
+	    dest[px] = 0;
+	    continue;
+	}
+	under = dest[px];
+	UNcx4_MUL_UNcx4 (under, factor);
+	dest[px] = under;
+    }
+}
+
+/* Per-channel ATOP: dest is weighted by the complement of the mask word left
+ * by combine_mask_ca(), the masked source by the destination alpha (sampled
+ * before dest is modified). */
+static void
+combine_atop_ca (pixman_implementation_t *imp, pixman_op_t op,
+                 comp4_t *dest, const comp4_t *src,
+                 const comp4_t *mask, int width)
+{
+    int px;
+
+    for (px = 0; px < width; ++px)
+    {
+	comp4_t under = dest[px];
+	comp4_t pixel = src[px];
+	comp4_t weight = mask[px];
+	comp2_t src_scale = under >> A_SHIFT;    /* dest alpha */
+	comp4_t dest_scale;
+
+	combine_mask_ca (&pixel, &weight);
+	dest_scale = ~weight;
+
+	/* Going by the macro's name: under * dest_scale + pixel * src_scale. */
+	UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (under, dest_scale, pixel, src_scale);
+
+	dest[px] = under;
+    }
+}
+
+/* Per-channel ATOP_REVERSE: dest is weighted by the mask word left by
+ * combine_mask_ca(), the masked source by the inverted destination alpha
+ * (sampled before dest is modified). */
+static void
+combine_atop_reverse_ca (pixman_implementation_t *imp, pixman_op_t op,
+                         comp4_t *dest, const comp4_t *src,
+                         const comp4_t *mask, int width)
+{
+    int px;
+
+    for (px = 0; px < width; ++px)
+    {
+	comp4_t under = dest[px];
+	comp4_t pixel = src[px];
+	comp4_t weight = mask[px];
+	comp2_t src_scale = ~under >> A_SHIFT;   /* 1 - dest alpha */
+	comp4_t dest_scale;
+
+	combine_mask_ca (&pixel, &weight);
+	dest_scale = weight;
+
+	/* Going by the macro's name: under * dest_scale + pixel * src_scale. */
+	UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (under, dest_scale, pixel, src_scale);
+
+	dest[px] = under;
+    }
+}
+
+/* Per-channel XOR: dest is weighted by the complement of the mask word left
+ * by combine_mask_ca(), the masked source by the inverted destination alpha
+ * (sampled before dest is modified). */
+static void
+combine_xor_ca (pixman_implementation_t *imp, pixman_op_t op,
+                comp4_t *dest, const comp4_t *src,
+                const comp4_t *mask, int width)
+{
+    int px;
+
+    for (px = 0; px < width; ++px)
+    {
+	comp4_t under = dest[px];
+	comp4_t pixel = src[px];
+	comp4_t weight = mask[px];
+	comp2_t src_scale = ~under >> A_SHIFT;   /* 1 - dest alpha */
+	comp4_t dest_scale;
+
+	combine_mask_ca (&pixel, &weight);
+	dest_scale = ~weight;
+
+	/* Going by the macro's name: under * dest_scale + pixel * src_scale. */
+	UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (under, dest_scale, pixel, src_scale);
+
+	dest[px] = under;
+    }
+}
+
+/* Per-channel ADD: the source pixel, as rewritten by combine_mask_value_ca(),
+ * is added to dest with UNcx4_ADD_UNcx4. */
+static void
+combine_add_ca (pixman_implementation_t *imp, pixman_op_t op,
+                comp4_t *dest, const comp4_t *src,
+                const comp4_t *mask, int width)
+{
+    int px;
+
+    for (px = 0; px < width; ++px)
+    {
+	comp4_t pixel = src[px];
+	comp4_t weight = mask[px];
+	comp4_t sum = dest[px];
+
+	combine_mask_value_ca (&pixel, &weight);
+
+	/* sum accumulates in place: it starts as dest and receives pixel. */
+	UNcx4_ADD_UNcx4 (sum, pixel);
+
+	dest[px] = sum;
+    }
+}
+
+static void
+combine_saturate_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     comp4_t *                dest,
+                     const comp4_t *          src,
+                     const comp4_t *          mask,
+                     int                      width)
+{
+    int i;
+    /* Per channel: add src to dest, scaling src by da / s<ch> when s<ch> exceeds da. */
+    for (i = 0; i < width; ++i)
+    {
+	comp4_t s, d;
+	comp2_t sa, sr, sg, sb, da;
+	comp2_t t, u, v;
+	comp4_t m, n, o, p;
+
+	d = *(dest + i);
+	s = *(src + i);
+	m = *(mask + i);
+
+	combine_mask_ca (&s, &m);
+	/* The four channels of the updated mask word act as per-channel source alphas. */
+	sa = (m >> A_SHIFT);
+	sr = (m >> R_SHIFT) & MASK;
+	sg = (m >> G_SHIFT) & MASK;
+	sb =  m             & MASK;
+	da = ~d >> A_SHIFT;	/* inverted destination alpha */
+	/* Each else branch runs only when s<ch> > da, so its divisor is never zero. */
+	if (sb <= da)
+	    m = ADD (s, d, 0, t);
+	else
+	    m = GENERIC (s, d, 0, (da << G_SHIFT) / sb, MASK, t, u, v);
+
+	if (sg <= da)
+	    n = ADD (s, d, G_SHIFT, t);
+	else
+	    n = GENERIC (s, d, G_SHIFT, (da << G_SHIFT) / sg, MASK, t, u, v);
+
+	if (sr <= da)
+	    o = ADD (s, d, R_SHIFT, t);
+	else
+	    o = GENERIC (s, d, R_SHIFT, (da << G_SHIFT) / sr, MASK, t, u, v);
+
+	if (sa <= da)
+	    p = ADD (s, d, A_SHIFT, t);
+	else
+	    p = GENERIC (s, d, A_SHIFT, (da << G_SHIFT) / sa, MASK, t, u, v);
+	/* Reassemble the four channels into one pixel. */
+	*(dest + i) = m | n | o | p;
+    }
+}
+
+static void
+combine_disjoint_general_ca (comp4_t *      dest,
+                             const comp4_t *src,
+                             const comp4_t *mask,
+                             int            width,
+                             comp1_t        combine)
+{
+    int i;
+    /* For each pixel: dest = src * Fa + dest * Fb, with Fa and Fb holding one factor per channel. */
+    for (i = 0; i < width; ++i)
+    {
+	comp4_t s, d;
+	comp4_t m, n, o, p;
+	comp4_t Fa, Fb;
+	comp2_t t, u, v;
+	comp4_t sa;
+	comp1_t da;
+
+	s = *(src + i);
+	m = *(mask + i);
+	d = *(dest + i);
+	da = d >> A_SHIFT;
+
+	combine_mask_ca (&s, &m);
+	/* The updated mask word supplies the per-channel source alpha; m, n, o, p are scratch from here on. */
+	sa = m;
+	/* Fa: per-channel source weight, selected by the COMBINE_A_OUT / COMBINE_A_IN bits. */
+	switch (combine & COMBINE_A)
+	{
+	default:
+	    Fa = 0;	/* neither A bit set */
+	    break;
+
+	case COMBINE_A_OUT:
+	    m = (comp4_t)combine_disjoint_out_part ((comp1_t) (sa >> 0), da);
+	    n = (comp4_t)combine_disjoint_out_part ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT;
+	    o = (comp4_t)combine_disjoint_out_part ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT;
+	    p = (comp4_t)combine_disjoint_out_part ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT;
+	    Fa = m | n | o | p;
+	    break;
+
+	case COMBINE_A_IN:
+	    m = (comp4_t)combine_disjoint_in_part ((comp1_t) (sa >> 0), da);
+	    n = (comp4_t)combine_disjoint_in_part ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT;
+	    o = (comp4_t)combine_disjoint_in_part ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT;
+	    p = (comp4_t)combine_disjoint_in_part ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT;
+	    Fa = m | n | o | p;
+	    break;
+
+	case COMBINE_A:
+	    Fa = ~0;	/* both A bits set: full weight in every channel */
+	    break;
+	}
+	/* Fb: per-channel destination weight, selected by the COMBINE_B_OUT / COMBINE_B_IN bits. */
+	switch (combine & COMBINE_B)
+	{
+	default:
+	    Fb = 0;	/* neither B bit set */
+	    break;
+
+	case COMBINE_B_OUT:
+	    m = (comp4_t)combine_disjoint_out_part (da, (comp1_t) (sa >> 0));
+	    n = (comp4_t)combine_disjoint_out_part (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT;
+	    o = (comp4_t)combine_disjoint_out_part (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT;
+	    p = (comp4_t)combine_disjoint_out_part (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT;
+	    Fb = m | n | o | p;
+	    break;
+
+	case COMBINE_B_IN:
+	    m = (comp4_t)combine_disjoint_in_part (da, (comp1_t) (sa >> 0));
+	    n = (comp4_t)combine_disjoint_in_part (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT;
+	    o = (comp4_t)combine_disjoint_in_part (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT;
+	    p = (comp4_t)combine_disjoint_in_part (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT;
+	    Fb = m | n | o | p;
+	    break;
+
+	case COMBINE_B:
+	    Fb = ~0;	/* both B bits set: full weight in every channel */
+	    break;
+	}
+	m = GENERIC (s, d, 0, GET_COMP (Fa, 0), GET_COMP (Fb, 0), t, u, v);
+	n = GENERIC (s, d, G_SHIFT, GET_COMP (Fa, G_SHIFT), GET_COMP (Fb, G_SHIFT), t, u, v);
+	o = GENERIC (s, d, R_SHIFT, GET_COMP (Fa, R_SHIFT), GET_COMP (Fb, R_SHIFT), t, u, v);
+	p = GENERIC (s, d, A_SHIFT, GET_COMP (Fa, A_SHIFT), GET_COMP (Fb, A_SHIFT), t, u, v);
+	/* Reassemble the four channels into one pixel. */
+	s = m | n | o | p;
+
+	*(dest + i) = s;
+    }
+}
+
+/* Disjoint OVER, per-channel alpha: src weight = 1, dest weight = "out" part. */
+static void
+combine_disjoint_over_ca (pixman_implementation_t *imp, pixman_op_t op,
+                          comp4_t *dest, const comp4_t *src,
+                          const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_A_OVER;
+
+    combine_disjoint_general_ca (dest, src, mask, width, parts);
+}
+
+/* Disjoint IN, per-channel alpha: src weight = "in" part, dest weight = 0. */
+static void
+combine_disjoint_in_ca (pixman_implementation_t *imp, pixman_op_t op,
+                        comp4_t *dest, const comp4_t *src,
+                        const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_A_IN;
+
+    combine_disjoint_general_ca (dest, src, mask, width, parts);
+}
+
+/* Disjoint IN_REVERSE, per-channel alpha: src weight = 0, dest weight = "in" part. */
+static void
+combine_disjoint_in_reverse_ca (pixman_implementation_t *imp, pixman_op_t op,
+                                comp4_t *dest, const comp4_t *src,
+                                const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_B_IN;
+
+    combine_disjoint_general_ca (dest, src, mask, width, parts);
+}
+
+/* Disjoint OUT, per-channel alpha: src weight = "out" part, dest weight = 0. */
+static void
+combine_disjoint_out_ca (pixman_implementation_t *imp, pixman_op_t op,
+                         comp4_t *dest, const comp4_t *src,
+                         const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_A_OUT;
+
+    combine_disjoint_general_ca (dest, src, mask, width, parts);
+}
+
+/* Disjoint OUT_REVERSE, per-channel alpha: src weight = 0, dest weight = "out" part. */
+static void
+combine_disjoint_out_reverse_ca (pixman_implementation_t *imp, pixman_op_t op,
+                                 comp4_t *dest, const comp4_t *src,
+                                 const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_B_OUT;
+
+    combine_disjoint_general_ca (dest, src, mask, width, parts);
+}
+
+/* Disjoint ATOP, per-channel alpha: src weight = "in" part, dest weight = "out" part. */
+static void
+combine_disjoint_atop_ca (pixman_implementation_t *imp, pixman_op_t op,
+                          comp4_t *dest, const comp4_t *src,
+                          const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_A_ATOP;
+
+    combine_disjoint_general_ca (dest, src, mask, width, parts);
+}
+
+/* Disjoint ATOP_REVERSE, per-channel alpha: src weight = "out" part, dest weight = "in" part. */
+static void
+combine_disjoint_atop_reverse_ca (pixman_implementation_t *imp, pixman_op_t op,
+                                  comp4_t *dest, const comp4_t *src,
+                                  const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_B_ATOP;
+
+    combine_disjoint_general_ca (dest, src, mask, width, parts);
+}
+
+/* Disjoint XOR, per-channel alpha: src and dest are both weighted by their "out" part. */
+static void
+combine_disjoint_xor_ca (pixman_implementation_t *imp, pixman_op_t op,
+                         comp4_t *dest, const comp4_t *src,
+                         const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_XOR;
+
+    combine_disjoint_general_ca (dest, src, mask, width, parts);
+}
+
+static void
+combine_conjoint_general_ca (comp4_t *      dest,
+                             const comp4_t *src,
+                             const comp4_t *mask,
+                             int            width,
+                             comp1_t        combine)
+{
+    int i;
+    /* For each pixel: dest = src * Fa + dest * Fb, with Fa and Fb holding one factor per channel. */
+    for (i = 0; i < width; ++i)
+    {
+	comp4_t s, d;
+	comp4_t m, n, o, p;
+	comp4_t Fa, Fb;
+	comp2_t t, u, v;
+	comp4_t sa;
+	comp1_t da;
+
+	s = *(src + i);
+	m = *(mask + i);
+	d = *(dest + i);
+	da = d >> A_SHIFT;
+
+	combine_mask_ca (&s, &m);
+	/* The updated mask word supplies the per-channel source alpha; m, n, o, p are scratch from here on. */
+	sa = m;
+	/* Fa: per-channel source weight, selected by the COMBINE_A_OUT / COMBINE_A_IN bits. */
+	switch (combine & COMBINE_A)
+	{
+	default:
+	    Fa = 0;	/* neither A bit set */
+	    break;
+
+	case COMBINE_A_OUT:
+	    m = (comp4_t)combine_conjoint_out_part ((comp1_t) (sa >> 0), da);
+	    n = (comp4_t)combine_conjoint_out_part ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT;
+	    o = (comp4_t)combine_conjoint_out_part ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT;
+	    p = (comp4_t)combine_conjoint_out_part ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT;
+	    Fa = m | n | o | p;
+	    break;
+
+	case COMBINE_A_IN:
+	    m = (comp4_t)combine_conjoint_in_part ((comp1_t) (sa >> 0), da);
+	    n = (comp4_t)combine_conjoint_in_part ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT;
+	    o = (comp4_t)combine_conjoint_in_part ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT;
+	    p = (comp4_t)combine_conjoint_in_part ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT;
+	    Fa = m | n | o | p;
+	    break;
+
+	case COMBINE_A:
+	    Fa = ~0;	/* both A bits set: full weight in every channel */
+	    break;
+	}
+	/* Fb: per-channel destination weight, selected by the COMBINE_B_OUT / COMBINE_B_IN bits. */
+	switch (combine & COMBINE_B)
+	{
+	default:
+	    Fb = 0;	/* neither B bit set */
+	    break;
+
+	case COMBINE_B_OUT:
+	    m = (comp4_t)combine_conjoint_out_part (da, (comp1_t) (sa >> 0));
+	    n = (comp4_t)combine_conjoint_out_part (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT;
+	    o = (comp4_t)combine_conjoint_out_part (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT;
+	    p = (comp4_t)combine_conjoint_out_part (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT;
+	    Fb = m | n | o | p;
+	    break;
+
+	case COMBINE_B_IN:
+	    m = (comp4_t)combine_conjoint_in_part (da, (comp1_t) (sa >> 0));
+	    n = (comp4_t)combine_conjoint_in_part (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT;
+	    o = (comp4_t)combine_conjoint_in_part (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT;
+	    p = (comp4_t)combine_conjoint_in_part (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT;
+	    Fb = m | n | o | p;
+	    break;
+
+	case COMBINE_B:
+	    Fb = ~0;	/* both B bits set: full weight in every channel */
+	    break;
+	}
+	m = GENERIC (s, d, 0, GET_COMP (Fa, 0), GET_COMP (Fb, 0), t, u, v);
+	n = GENERIC (s, d, G_SHIFT, GET_COMP (Fa, G_SHIFT), GET_COMP (Fb, G_SHIFT), t, u, v);
+	o = GENERIC (s, d, R_SHIFT, GET_COMP (Fa, R_SHIFT), GET_COMP (Fb, R_SHIFT), t, u, v);
+	p = GENERIC (s, d, A_SHIFT, GET_COMP (Fa, A_SHIFT), GET_COMP (Fb, A_SHIFT), t, u, v);
+	/* Reassemble the four channels into one pixel. */
+	s = m | n | o | p;
+
+	*(dest + i) = s;
+    }
+}
+
+/* Conjoint OVER, per-channel alpha: src weight = 1, dest weight = "out" part. */
+static void
+combine_conjoint_over_ca (pixman_implementation_t *imp, pixman_op_t op,
+                          comp4_t *dest, const comp4_t *src,
+                          const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_A_OVER;
+
+    combine_conjoint_general_ca (dest, src, mask, width, parts);
+}
+
+/* Conjoint OVER_REVERSE, per-channel alpha: src weight = "out" part, dest weight = 1. */
+static void
+combine_conjoint_over_reverse_ca (pixman_implementation_t *imp, pixman_op_t op,
+                                  comp4_t *dest, const comp4_t *src,
+                                  const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_B_OVER;
+
+    combine_conjoint_general_ca (dest, src, mask, width, parts);
+}
+
+/* Conjoint IN, per-channel alpha: src weight = "in" part, dest weight = 0. */
+static void
+combine_conjoint_in_ca (pixman_implementation_t *imp, pixman_op_t op,
+                        comp4_t *dest, const comp4_t *src,
+                        const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_A_IN;
+
+    combine_conjoint_general_ca (dest, src, mask, width, parts);
+}
+
+/* Conjoint IN_REVERSE, per-channel alpha: src weight = 0, dest weight = "in" part. */
+static void
+combine_conjoint_in_reverse_ca (pixman_implementation_t *imp, pixman_op_t op,
+                                comp4_t *dest, const comp4_t *src,
+                                const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_B_IN;
+
+    combine_conjoint_general_ca (dest, src, mask, width, parts);
+}
+
+/* Conjoint OUT, per-channel alpha: src weight = "out" part, dest weight = 0. */
+static void
+combine_conjoint_out_ca (pixman_implementation_t *imp, pixman_op_t op,
+                         comp4_t *dest, const comp4_t *src,
+                         const comp4_t *mask, int width)
+{
+    const comp1_t parts = COMBINE_A_OUT;
+
+    combine_conjoint_general_ca (dest, src, mask, width, parts);
+}
+
+static void
+combine_conjoint_out_reverse_ca (pixman_implementation_t *imp,
+                                 pixman_op_t              op,
+                                 comp4_t *                dest,
+                                 const comp4_t *          src,
+                                 const comp4_t *          mask,
+                                 int                      width)
+{
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+combine_conjoint_atop_ca (pixman_implementation_t *imp,
+                          pixman_op_t              op,
+                          comp4_t *                dest,
+                          const comp4_t *          src,
+                          const comp4_t *          mask,
+                          int                      width)
+{
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+combine_conjoint_atop_reverse_ca (pixman_implementation_t *imp,
+                                  pixman_op_t              op,
+                                  comp4_t *                dest,
+                                  const comp4_t *          src,
+                                  const comp4_t *          mask,
+                                  int                      width)
+{
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+combine_conjoint_xor_ca (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         comp4_t *                dest,
+                         const comp4_t *          src,
+                         const comp4_t *          mask,
+                         int                      width)
+{
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_XOR);
+}
+
+void
+_pixman_setup_combiner_functions_width (pixman_implementation_t *imp)
+{
+    /* Unified alpha */
+    imp->combine_width[PIXMAN_OP_CLEAR] = combine_clear;
+    imp->combine_width[PIXMAN_OP_SRC] = combine_src_u;
+    imp->combine_width[PIXMAN_OP_DST] = combine_dst;
+    imp->combine_width[PIXMAN_OP_OVER] = combine_over_u;
+    imp->combine_width[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_u;
+    imp->combine_width[PIXMAN_OP_IN] = combine_in_u;
+    imp->combine_width[PIXMAN_OP_IN_REVERSE] = combine_in_reverse_u;
+    imp->combine_width[PIXMAN_OP_OUT] = combine_out_u;
+    imp->combine_width[PIXMAN_OP_OUT_REVERSE] = combine_out_reverse_u;
+    imp->combine_width[PIXMAN_OP_ATOP] = combine_atop_u;
+    imp->combine_width[PIXMAN_OP_ATOP_REVERSE] = combine_atop_reverse_u;
+    imp->combine_width[PIXMAN_OP_XOR] = combine_xor_u;
+    imp->combine_width[PIXMAN_OP_ADD] = combine_add_u;
+    imp->combine_width[PIXMAN_OP_SATURATE] = combine_saturate_u;
+
+    /* Disjoint, unified */
+    imp->combine_width[PIXMAN_OP_DISJOINT_CLEAR] = combine_clear;
+    imp->combine_width[PIXMAN_OP_DISJOINT_SRC] = combine_src_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_DST] = combine_dst;
+    imp->combine_width[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_saturate_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_IN_REVERSE] = combine_disjoint_in_reverse_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_OUT] = combine_disjoint_out_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_OUT_REVERSE] = combine_disjoint_out_reverse_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_ATOP] = combine_disjoint_atop_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = combine_disjoint_atop_reverse_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_XOR] = combine_disjoint_xor_u;
+
+    /* Conjoint, unified */
+    imp->combine_width[PIXMAN_OP_CONJOINT_CLEAR] = combine_clear;
+    imp->combine_width[PIXMAN_OP_CONJOINT_SRC] = combine_src_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_DST] = combine_dst;
+    imp->combine_width[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_IN_REVERSE] = combine_conjoint_in_reverse_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_OUT] = combine_conjoint_out_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_OUT_REVERSE] = combine_conjoint_out_reverse_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_ATOP] = combine_conjoint_atop_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = combine_conjoint_atop_reverse_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_XOR] = combine_conjoint_xor_u;
+
+    imp->combine_width[PIXMAN_OP_MULTIPLY] = combine_multiply_u;
+    imp->combine_width[PIXMAN_OP_SCREEN] = combine_screen_u;
+    imp->combine_width[PIXMAN_OP_OVERLAY] = combine_overlay_u;
+    imp->combine_width[PIXMAN_OP_DARKEN] = combine_darken_u;
+    imp->combine_width[PIXMAN_OP_LIGHTEN] = combine_lighten_u;
+    imp->combine_width[PIXMAN_OP_COLOR_DODGE] = combine_color_dodge_u;
+    imp->combine_width[PIXMAN_OP_COLOR_BURN] = combine_color_burn_u;
+    imp->combine_width[PIXMAN_OP_HARD_LIGHT] = combine_hard_light_u;
+    imp->combine_width[PIXMAN_OP_SOFT_LIGHT] = combine_soft_light_u;
+    imp->combine_width[PIXMAN_OP_DIFFERENCE] = combine_difference_u;
+    imp->combine_width[PIXMAN_OP_EXCLUSION] = combine_exclusion_u;
+    imp->combine_width[PIXMAN_OP_HSL_HUE] = combine_hsl_hue_u;
+    imp->combine_width[PIXMAN_OP_HSL_SATURATION] = combine_hsl_saturation_u;
+    imp->combine_width[PIXMAN_OP_HSL_COLOR] = combine_hsl_color_u;
+    imp->combine_width[PIXMAN_OP_HSL_LUMINOSITY] = combine_hsl_luminosity_u;
+
+    /* Component alpha combiners */
+    imp->combine_width_ca[PIXMAN_OP_CLEAR] = combine_clear_ca;
+    imp->combine_width_ca[PIXMAN_OP_SRC] = combine_src_ca;
+    /* dest */
+    imp->combine_width_ca[PIXMAN_OP_OVER] = combine_over_ca;
+    imp->combine_width_ca[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_IN] = combine_in_ca;
+    imp->combine_width_ca[PIXMAN_OP_IN_REVERSE] = combine_in_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_OUT] = combine_out_ca;
+    imp->combine_width_ca[PIXMAN_OP_OUT_REVERSE] = combine_out_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_ATOP] = combine_atop_ca;
+    imp->combine_width_ca[PIXMAN_OP_ATOP_REVERSE] = combine_atop_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_XOR] = combine_xor_ca;
+    imp->combine_width_ca[PIXMAN_OP_ADD] = combine_add_ca;
+    imp->combine_width_ca[PIXMAN_OP_SATURATE] = combine_saturate_ca;
+
+    /* Disjoint CA */
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_CLEAR] = combine_clear_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_SRC] = combine_src_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_DST] = combine_dst;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_saturate_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_IN_REVERSE] = combine_disjoint_in_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_OUT] = combine_disjoint_out_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_OUT_REVERSE] = combine_disjoint_out_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_ATOP] = combine_disjoint_atop_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = combine_disjoint_atop_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_XOR] = combine_disjoint_xor_ca;
+
+    /* Conjoint CA */
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_CLEAR] = combine_clear_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_SRC] = combine_src_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_DST] = combine_dst;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_IN_REVERSE] = combine_conjoint_in_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_OUT] = combine_conjoint_out_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_OUT_REVERSE] = combine_conjoint_out_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_ATOP] = combine_conjoint_atop_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = combine_conjoint_atop_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_XOR] = combine_conjoint_xor_ca;
+
+    imp->combine_width_ca[PIXMAN_OP_MULTIPLY] = combine_multiply_ca;
+    imp->combine_width_ca[PIXMAN_OP_SCREEN] = combine_screen_ca;
+    imp->combine_width_ca[PIXMAN_OP_OVERLAY] = combine_overlay_ca;
+    imp->combine_width_ca[PIXMAN_OP_DARKEN] = combine_darken_ca;
+    imp->combine_width_ca[PIXMAN_OP_LIGHTEN] = combine_lighten_ca;
+    imp->combine_width_ca[PIXMAN_OP_COLOR_DODGE] = combine_color_dodge_ca;
+    imp->combine_width_ca[PIXMAN_OP_COLOR_BURN] = combine_color_burn_ca;
+    imp->combine_width_ca[PIXMAN_OP_HARD_LIGHT] = combine_hard_light_ca;
+    imp->combine_width_ca[PIXMAN_OP_SOFT_LIGHT] = combine_soft_light_ca;
+    imp->combine_width_ca[PIXMAN_OP_DIFFERENCE] = combine_difference_ca;
+    imp->combine_width_ca[PIXMAN_OP_EXCLUSION] = combine_exclusion_ca;
+
+    /* It is not clear that these make sense, so make them noops for now */
+    imp->combine_width_ca[PIXMAN_OP_HSL_HUE] = combine_dst;
+    imp->combine_width_ca[PIXMAN_OP_HSL_SATURATION] = combine_dst;
+    imp->combine_width_ca[PIXMAN_OP_HSL_COLOR] = combine_dst;
+    imp->combine_width_ca[PIXMAN_OP_HSL_LUMINOSITY] = combine_dst;
+}
+
diff --git a/pixman/pixman/pixman-fast-path.c b/pixman/pixman/pixman-fast-path.c
index ad05493ca..bbdc8e8b0 100644
--- a/pixman/pixman/pixman-fast-path.c
+++ b/pixman/pixman/pixman-fast-path.c
@@ -1,1988 +1,1988 @@
-/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
-/*
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007 Red Hat, Inc.
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of SuSE not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission.  SuSE makes no representations about the
- * suitability of this software for any purpose.  It is provided "as is"
- * without express or implied warranty.
- *
- * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
- * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
- * Author:  Keith Packard, SuSE, Inc.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include <string.h>
-#include <stdlib.h>
-#include "pixman-private.h"
-#include "pixman-combine32.h"
-#include "pixman-inlines.h"
-
-static force_inline uint32_t
-fetch_24 (uint8_t *a)
-{
-    if (((unsigned long)a) & 1)
-    {
-#ifdef WORDS_BIGENDIAN
-	return (*a << 16) | (*(uint16_t *)(a + 1));
-#else
-	return *a | (*(uint16_t *)(a + 1) << 8);
-#endif
-    }
-    else
-    {
-#ifdef WORDS_BIGENDIAN
-	return (*(uint16_t *)a << 8) | *(a + 2);
-#else
-	return *(uint16_t *)a | (*(a + 2) << 16);
-#endif
-    }
-}
-
-static force_inline void
-store_24 (uint8_t *a,
-          uint32_t v)
-{
-    if (((unsigned long)a) & 1)
-    {
-#ifdef WORDS_BIGENDIAN
-	*a = (uint8_t) (v >> 16);
-	*(uint16_t *)(a + 1) = (uint16_t) (v);
-#else
-	*a = (uint8_t) (v);
-	*(uint16_t *)(a + 1) = (uint16_t) (v >> 8);
-#endif
-    }
-    else
-    {
-#ifdef WORDS_BIGENDIAN
-	*(uint16_t *)a = (uint16_t)(v >> 8);
-	*(a + 2) = (uint8_t)v;
-#else
-	*(uint16_t *)a = (uint16_t)v;
-	*(a + 2) = (uint8_t)(v >> 16);
-#endif
-    }
-}
-
-static force_inline uint32_t
-over (uint32_t src,
-      uint32_t dest)
-{
-    uint32_t a = ~src >> 24;
-
-    UN8x4_MUL_UN8_ADD_UN8x4 (dest, a, src);
-
-    return dest;
-}
-
-static uint32_t
-in (uint32_t x,
-    uint8_t  y)
-{
-    uint16_t a = y;
-
-    UN8x4_MUL_UN8 (x, a);
-
-    return x;
-}
-
-/*
- * Naming convention:
- *
- *  op_src_mask_dest
- */
-static void
-fast_composite_over_x888_8_8888 (pixman_implementation_t *imp,
-                                 pixman_composite_info_t *info)
-{
-    PIXMAN_COMPOSITE_ARGS (info);
-    uint32_t    *src, *src_line;
-    uint32_t    *dst, *dst_line;
-    uint8_t     *mask, *mask_line;
-    int src_stride, mask_stride, dst_stride;
-    uint8_t m;
-    uint32_t s, d;
-    int32_t w;
-
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-    while (height--)
-    {
-	src = src_line;
-	src_line += src_stride;
-	dst = dst_line;
-	dst_line += dst_stride;
-	mask = mask_line;
-	mask_line += mask_stride;
-
-	w = width;
-	while (w--)
-	{
-	    m = *mask++;
-	    if (m)
-	    {
-		s = *src | 0xff000000;
-
-		if (m == 0xff)
-		{
-		    *dst = s;
-		}
-		else
-		{
-		    d = in (s, m);
-		    *dst = over (d, *dst);
-		}
-	    }
-	    src++;
-	    dst++;
-	}
-    }
-}
-
-static void
-fast_composite_in_n_8_8 (pixman_implementation_t *imp,
-                         pixman_composite_info_t *info)
-{
-    PIXMAN_COMPOSITE_ARGS (info);
-    uint32_t src, srca;
-    uint8_t     *dst_line, *dst;
-    uint8_t     *mask_line, *mask, m;
-    int dst_stride, mask_stride;
-    int32_t w;
-    uint16_t t;
-
-    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
-
-    srca = src >> 24;
-
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
-    if (srca == 0xff)
-    {
-	while (height--)
-	{
-	    dst = dst_line;
-	    dst_line += dst_stride;
-	    mask = mask_line;
-	    mask_line += mask_stride;
-	    w = width;
-
-	    while (w--)
-	    {
-		m = *mask++;
-
-		if (m == 0)
-		    *dst = 0;
-		else if (m != 0xff)
-		    *dst = MUL_UN8 (m, *dst, t);
-
-		dst++;
-	    }
-	}
-    }
-    else
-    {
-	while (height--)
-	{
-	    dst = dst_line;
-	    dst_line += dst_stride;
-	    mask = mask_line;
-	    mask_line += mask_stride;
-	    w = width;
-
-	    while (w--)
-	    {
-		m = *mask++;
-		m = MUL_UN8 (m, srca, t);
-
-		if (m == 0)
-		    *dst = 0;
-		else if (m != 0xff)
-		    *dst = MUL_UN8 (m, *dst, t);
-
-		dst++;
-	    }
-	}
-    }
-}
-
-static void
-fast_composite_in_8_8 (pixman_implementation_t *imp,
-                       pixman_composite_info_t *info)
-{
-    PIXMAN_COMPOSITE_ARGS (info);
-    uint8_t     *dst_line, *dst;
-    uint8_t     *src_line, *src;
-    int dst_stride, src_stride;
-    int32_t w;
-    uint8_t s;
-    uint16_t t;
-
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
-
-	while (w--)
-	{
-	    s = *src++;
-
-	    if (s == 0)
-		*dst = 0;
-	    else if (s != 0xff)
-		*dst = MUL_UN8 (s, *dst, t);
-
-	    dst++;
-	}
-    }
-}
-
-static void
-fast_composite_over_n_8_8888 (pixman_implementation_t *imp,
-                              pixman_composite_info_t *info)
-{
-    PIXMAN_COMPOSITE_ARGS (info);
-    uint32_t src, srca;
-    uint32_t    *dst_line, *dst, d;
-    uint8_t     *mask_line, *mask, m;
-    int dst_stride, mask_stride;
-    int32_t w;
-
-    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
-
-    srca = src >> 24;
-    if (src == 0)
-	return;
-
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	mask = mask_line;
-	mask_line += mask_stride;
-	w = width;
-
-	while (w--)
-	{
-	    m = *mask++;
-	    if (m == 0xff)
-	    {
-		if (srca == 0xff)
-		    *dst = src;
-		else
-		    *dst = over (src, *dst);
-	    }
-	    else if (m)
-	    {
-		d = in (src, m);
-		*dst = over (d, *dst);
-	    }
-	    dst++;
-	}
-    }
-}
-
-static void
-fast_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
-				   pixman_composite_info_t *info)
-{
-    PIXMAN_COMPOSITE_ARGS (info);
-    uint32_t src, s;
-    uint32_t    *dst_line, *dst, d;
-    uint32_t    *mask_line, *mask, ma;
-    int dst_stride, mask_stride;
-    int32_t w;
-
-    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
-
-    if (src == 0)
-	return;
-
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	mask = mask_line;
-	mask_line += mask_stride;
-	w = width;
-
-	while (w--)
-	{
-	    ma = *mask++;
-
-	    if (ma)
-	    {
-		d = *dst;
-		s = src;
-
-		UN8x4_MUL_UN8x4_ADD_UN8x4 (s, ma, d);
-
-		*dst = s;
-	    }
-
-	    dst++;
-	}
-    }
-}
-
-static void
-fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
-                                    pixman_composite_info_t *info)
-{
-    PIXMAN_COMPOSITE_ARGS (info);
-    uint32_t src, srca, s;
-    uint32_t    *dst_line, *dst, d;
-    uint32_t    *mask_line, *mask, ma;
-    int dst_stride, mask_stride;
-    int32_t w;
-
-    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
-
-    srca = src >> 24;
-    if (src == 0)
-	return;
-
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	mask = mask_line;
-	mask_line += mask_stride;
-	w = width;
-
-	while (w--)
-	{
-	    ma = *mask++;
-	    if (ma == 0xffffffff)
-	    {
-		if (srca == 0xff)
-		    *dst = src;
-		else
-		    *dst = over (src, *dst);
-	    }
-	    else if (ma)
-	    {
-		d = *dst;
-		s = src;
-
-		UN8x4_MUL_UN8x4 (s, ma);
-		UN8x4_MUL_UN8 (ma, srca);
-		ma = ~ma;
-		UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
-
-		*dst = d;
-	    }
-
-	    dst++;
-	}
-    }
-}
-
-static void
-fast_composite_over_n_8_0888 (pixman_implementation_t *imp,
-                              pixman_composite_info_t *info)
-{
-    PIXMAN_COMPOSITE_ARGS (info);
-    uint32_t src, srca;
-    uint8_t     *dst_line, *dst;
-    uint32_t d;
-    uint8_t     *mask_line, *mask, m;
-    int dst_stride, mask_stride;
-    int32_t w;
-
-    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
-
-    srca = src >> 24;
-    if (src == 0)
-	return;
-
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	mask = mask_line;
-	mask_line += mask_stride;
-	w = width;
-
-	while (w--)
-	{
-	    m = *mask++;
-	    if (m == 0xff)
-	    {
-		if (srca == 0xff)
-		{
-		    d = src;
-		}
-		else
-		{
-		    d = fetch_24 (dst);
-		    d = over (src, d);
-		}
-		store_24 (dst, d);
-	    }
-	    else if (m)
-	    {
-		d = over (in (src, m), fetch_24 (dst));
-		store_24 (dst, d);
-	    }
-	    dst += 3;
-	}
-    }
-}
-
-static void
-fast_composite_over_n_8_0565 (pixman_implementation_t *imp,
-                              pixman_composite_info_t *info)
-{
-    PIXMAN_COMPOSITE_ARGS (info);
-    uint32_t src, srca;
-    uint16_t    *dst_line, *dst;
-    uint32_t d;
-    uint8_t     *mask_line, *mask, m;
-    int dst_stride, mask_stride;
-    int32_t w;
-
-    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
-
-    srca = src >> 24;
-    if (src == 0)
-	return;
-
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	mask = mask_line;
-	mask_line += mask_stride;
-	w = width;
-
-	while (w--)
-	{
-	    m = *mask++;
-	    if (m == 0xff)
-	    {
-		if (srca == 0xff)
-		{
-		    d = src;
-		}
-		else
-		{
-		    d = *dst;
-		    d = over (src, CONVERT_0565_TO_0888 (d));
-		}
-		*dst = CONVERT_8888_TO_0565 (d);
-	    }
-	    else if (m)
-	    {
-		d = *dst;
-		d = over (in (src, m), CONVERT_0565_TO_0888 (d));
-		*dst = CONVERT_8888_TO_0565 (d);
-	    }
-	    dst++;
-	}
-    }
-}
-
-static void
-fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
-                                    pixman_composite_info_t *info)
-{
-    PIXMAN_COMPOSITE_ARGS (info);
-    uint32_t  src, srca, s;
-    uint16_t  src16;
-    uint16_t *dst_line, *dst;
-    uint32_t  d;
-    uint32_t *mask_line, *mask, ma;
-    int dst_stride, mask_stride;
-    int32_t w;
-
-    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
-
-    srca = src >> 24;
-    if (src == 0)
-	return;
-
-    src16 = CONVERT_8888_TO_0565 (src);
-
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	mask = mask_line;
-	mask_line += mask_stride;
-	w = width;
-
-	while (w--)
-	{
-	    ma = *mask++;
-	    if (ma == 0xffffffff)
-	    {
-		if (srca == 0xff)
-		{
-		    *dst = src16;
-		}
-		else
-		{
-		    d = *dst;
-		    d = over (src, CONVERT_0565_TO_0888 (d));
-		    *dst = CONVERT_8888_TO_0565 (d);
-		}
-	    }
-	    else if (ma)
-	    {
-		d = *dst;
-		d = CONVERT_0565_TO_0888 (d);
-
-		s = src;
-
-		UN8x4_MUL_UN8x4 (s, ma);
-		UN8x4_MUL_UN8 (ma, srca);
-		ma = ~ma;
-		UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
-
-		*dst = CONVERT_8888_TO_0565 (d);
-	    }
-	    dst++;
-	}
-    }
-}
-
-static void
-fast_composite_over_8888_8888 (pixman_implementation_t *imp,
-                               pixman_composite_info_t *info)
-{
-    PIXMAN_COMPOSITE_ARGS (info);
-    uint32_t    *dst_line, *dst;
-    uint32_t    *src_line, *src, s;
-    int dst_stride, src_stride;
-    uint8_t a;
-    int32_t w;
-
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
-
-	while (w--)
-	{
-	    s = *src++;
-	    a = s >> 24;
-	    if (a == 0xff)
-		*dst = s;
-	    else if (s)
-		*dst = over (s, *dst);
-	    dst++;
-	}
-    }
-}
-
-static void
-fast_composite_src_x888_8888 (pixman_implementation_t *imp,
-			      pixman_composite_info_t *info)
-{
-    PIXMAN_COMPOSITE_ARGS (info);
-    uint32_t    *dst_line, *dst;
-    uint32_t    *src_line, *src;
-    int dst_stride, src_stride;
-    int32_t w;
-
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
-
-	while (w--)
-	    *dst++ = (*src++) | 0xff000000;
-    }
-}
-
-#if 0
-static void
-fast_composite_over_8888_0888 (pixman_implementation_t *imp,
-			       pixman_composite_info_t *info)
-{
-    PIXMAN_COMPOSITE_ARGS (info);
-    uint8_t     *dst_line, *dst;
-    uint32_t d;
-    uint32_t    *src_line, *src, s;
-    uint8_t a;
-    int dst_stride, src_stride;
-    int32_t w;
-
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
-
-	while (w--)
-	{
-	    s = *src++;
-	    a = s >> 24;
-	    if (a)
-	    {
-		if (a == 0xff)
-		    d = s;
-		else
-		    d = over (s, fetch_24 (dst));
-
-		store_24 (dst, d);
-	    }
-	    dst += 3;
-	}
-    }
-}
-#endif
-
-static void
-fast_composite_over_8888_0565 (pixman_implementation_t *imp,
-                               pixman_composite_info_t *info)
-{
-    PIXMAN_COMPOSITE_ARGS (info);
-    uint16_t    *dst_line, *dst;
-    uint32_t d;
-    uint32_t    *src_line, *src, s;
-    uint8_t a;
-    int dst_stride, src_stride;
-    int32_t w;
-
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
-
-	while (w--)
-	{
-	    s = *src++;
-	    a = s >> 24;
-	    if (s)
-	    {
-		if (a == 0xff)
-		{
-		    d = s;
-		}
-		else
-		{
-		    d = *dst;
-		    d = over (s, CONVERT_0565_TO_0888 (d));
-		}
-		*dst = CONVERT_8888_TO_0565 (d);
-	    }
-	    dst++;
-	}
-    }
-}
-
-static void
-fast_composite_src_x888_0565 (pixman_implementation_t *imp,
-                              pixman_composite_info_t *info)
-{
-    PIXMAN_COMPOSITE_ARGS (info);
-    uint16_t    *dst_line, *dst;
-    uint32_t    *src_line, *src, s;
-    int dst_stride, src_stride;
-    int32_t w;
-
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
-
-	while (w--)
-	{
-	    s = *src++;
-	    *dst = CONVERT_8888_TO_0565 (s);
-	    dst++;
-	}
-    }
-}
-
-static void
-fast_composite_add_8_8 (pixman_implementation_t *imp,
-			pixman_composite_info_t *info)
-{
-    PIXMAN_COMPOSITE_ARGS (info);
-    uint8_t     *dst_line, *dst;
-    uint8_t     *src_line, *src;
-    int dst_stride, src_stride;
-    int32_t w;
-    uint8_t s, d;
-    uint16_t t;
-
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
-
-	while (w--)
-	{
-	    s = *src++;
-	    if (s)
-	    {
-		if (s != 0xff)
-		{
-		    d = *dst;
-		    t = d + s;
-		    s = t | (0 - (t >> 8));
-		}
-		*dst = s;
-	    }
-	    dst++;
-	}
-    }
-}
-
-static void
-fast_composite_add_8888_8888 (pixman_implementation_t *imp,
-                              pixman_composite_info_t *info)
-{
-    PIXMAN_COMPOSITE_ARGS (info);
-    uint32_t    *dst_line, *dst;
-    uint32_t    *src_line, *src;
-    int dst_stride, src_stride;
-    int32_t w;
-    uint32_t s, d;
-
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
-
-	while (w--)
-	{
-	    s = *src++;
-	    if (s)
-	    {
-		if (s != 0xffffffff)
-		{
-		    d = *dst;
-		    if (d)
-			UN8x4_ADD_UN8x4 (s, d);
-		}
-		*dst = s;
-	    }
-	    dst++;
-	}
-    }
-}
-
-static void
-fast_composite_add_n_8_8 (pixman_implementation_t *imp,
-			  pixman_composite_info_t *info)
-{
-    PIXMAN_COMPOSITE_ARGS (info);
-    uint8_t     *dst_line, *dst;
-    uint8_t     *mask_line, *mask;
-    int dst_stride, mask_stride;
-    int32_t w;
-    uint32_t src;
-    uint8_t sa;
-
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
-    sa = (src >> 24);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	mask = mask_line;
-	mask_line += mask_stride;
-	w = width;
-
-	while (w--)
-	{
-	    uint16_t tmp;
-	    uint16_t a;
-	    uint32_t m, d;
-	    uint32_t r;
-
-	    a = *mask++;
-	    d = *dst;
-
-	    m = MUL_UN8 (sa, a, tmp);
-	    r = ADD_UN8 (m, d, tmp);
-
-	    *dst++ = r;
-	}
-    }
-}
-
-#ifdef WORDS_BIGENDIAN
-#define CREATE_BITMASK(n) (0x80000000 >> (n))
-#define UPDATE_BITMASK(n) ((n) >> 1)
-#else
-#define CREATE_BITMASK(n) (1 << (n))
-#define UPDATE_BITMASK(n) ((n) << 1)
-#endif
-
-#define TEST_BIT(p, n)					\
-    (*((p) + ((n) >> 5)) & CREATE_BITMASK ((n) & 31))
-#define SET_BIT(p, n)							\
-    do { *((p) + ((n) >> 5)) |= CREATE_BITMASK ((n) & 31); } while (0);
-
-static void
-fast_composite_add_1000_1000 (pixman_implementation_t *imp,
-                              pixman_composite_info_t *info)
-{
-    PIXMAN_COMPOSITE_ARGS (info);
-    uint32_t     *dst_line, *dst;
-    uint32_t     *src_line, *src;
-    int           dst_stride, src_stride;
-    int32_t       w;
-
-    PIXMAN_IMAGE_GET_LINE (src_image, 0, src_y, uint32_t,
-                           src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (dest_image, 0, dest_y, uint32_t,
-                           dst_stride, dst_line, 1);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
-
-	while (w--)
-	{
-	    /*
-	     * TODO: improve performance by processing uint32_t data instead
-	     *       of individual bits
-	     */
-	    if (TEST_BIT (src, src_x + w))
-		SET_BIT (dst, dest_x + w);
-	}
-    }
-}
-
-static void
-fast_composite_over_n_1_8888 (pixman_implementation_t *imp,
-                              pixman_composite_info_t *info)
-{
-    PIXMAN_COMPOSITE_ARGS (info);
-    uint32_t     src, srca;
-    uint32_t    *dst, *dst_line;
-    uint32_t    *mask, *mask_line;
-    int          mask_stride, dst_stride;
-    uint32_t     bitcache, bitmask;
-    int32_t      w;
-
-    if (width <= 0)
-	return;
-
-    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
-    srca = src >> 24;
-    if (src == 0)
-	return;
-
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t,
-                           dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
-                           mask_stride, mask_line, 1);
-    mask_line += mask_x >> 5;
-
-    if (srca == 0xff)
-    {
-	while (height--)
-	{
-	    dst = dst_line;
-	    dst_line += dst_stride;
-	    mask = mask_line;
-	    mask_line += mask_stride;
-	    w = width;
-
-	    bitcache = *mask++;
-	    bitmask = CREATE_BITMASK (mask_x & 31);
-
-	    while (w--)
-	    {
-		if (bitmask == 0)
-		{
-		    bitcache = *mask++;
-		    bitmask = CREATE_BITMASK (0);
-		}
-		if (bitcache & bitmask)
-		    *dst = src;
-		bitmask = UPDATE_BITMASK (bitmask);
-		dst++;
-	    }
-	}
-    }
-    else
-    {
-	while (height--)
-	{
-	    dst = dst_line;
-	    dst_line += dst_stride;
-	    mask = mask_line;
-	    mask_line += mask_stride;
-	    w = width;
-
-	    bitcache = *mask++;
-	    bitmask = CREATE_BITMASK (mask_x & 31);
-
-	    while (w--)
-	    {
-		if (bitmask == 0)
-		{
-		    bitcache = *mask++;
-		    bitmask = CREATE_BITMASK (0);
-		}
-		if (bitcache & bitmask)
-		    *dst = over (src, *dst);
-		bitmask = UPDATE_BITMASK (bitmask);
-		dst++;
-	    }
-	}
-    }
-}
-
-static void
-fast_composite_over_n_1_0565 (pixman_implementation_t *imp,
-                              pixman_composite_info_t *info)
-{
-    PIXMAN_COMPOSITE_ARGS (info);
-    uint32_t     src, srca;
-    uint16_t    *dst, *dst_line;
-    uint32_t    *mask, *mask_line;
-    int          mask_stride, dst_stride;
-    uint32_t     bitcache, bitmask;
-    int32_t      w;
-    uint32_t     d;
-    uint16_t     src565;
-
-    if (width <= 0)
-	return;
-
-    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
-    srca = src >> 24;
-    if (src == 0)
-	return;
-
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t,
-                           dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
-                           mask_stride, mask_line, 1);
-    mask_line += mask_x >> 5;
-
-    if (srca == 0xff)
-    {
-	src565 = CONVERT_8888_TO_0565 (src);
-	while (height--)
-	{
-	    dst = dst_line;
-	    dst_line += dst_stride;
-	    mask = mask_line;
-	    mask_line += mask_stride;
-	    w = width;
-
-	    bitcache = *mask++;
-	    bitmask = CREATE_BITMASK (mask_x & 31);
-
-	    while (w--)
-	    {
-		if (bitmask == 0)
-		{
-		    bitcache = *mask++;
-		    bitmask = CREATE_BITMASK (0);
-		}
-		if (bitcache & bitmask)
-		    *dst = src565;
-		bitmask = UPDATE_BITMASK (bitmask);
-		dst++;
-	    }
-	}
-    }
-    else
-    {
-	while (height--)
-	{
-	    dst = dst_line;
-	    dst_line += dst_stride;
-	    mask = mask_line;
-	    mask_line += mask_stride;
-	    w = width;
-
-	    bitcache = *mask++;
-	    bitmask = CREATE_BITMASK (mask_x & 31);
-
-	    while (w--)
-	    {
-		if (bitmask == 0)
-		{
-		    bitcache = *mask++;
-		    bitmask = CREATE_BITMASK (0);
-		}
-		if (bitcache & bitmask)
-		{
-		    d = over (src, CONVERT_0565_TO_0888 (*dst));
-		    *dst = CONVERT_8888_TO_0565 (d);
-		}
-		bitmask = UPDATE_BITMASK (bitmask);
-		dst++;
-	    }
-	}
-    }
-}
-
-/*
- * Simple bitblt
- */
-
-static void
-fast_composite_solid_fill (pixman_implementation_t *imp,
-                           pixman_composite_info_t *info)
-{
-    PIXMAN_COMPOSITE_ARGS (info);
-    uint32_t src;
-
-    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
-
-    if (dest_image->bits.format == PIXMAN_a1)
-    {
-	src = src >> 31;
-    }
-    else if (dest_image->bits.format == PIXMAN_a8)
-    {
-	src = src >> 24;
-    }
-    else if (dest_image->bits.format == PIXMAN_r5g6b5 ||
-             dest_image->bits.format == PIXMAN_b5g6r5)
-    {
-	src = CONVERT_8888_TO_0565 (src);
-    }
-
-    pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
-                 PIXMAN_FORMAT_BPP (dest_image->bits.format),
-                 dest_x, dest_y,
-                 width, height,
-                 src);
-}
-
-static void
-fast_composite_src_memcpy (pixman_implementation_t *imp,
-			   pixman_composite_info_t *info)
-{
-    PIXMAN_COMPOSITE_ARGS (info);
-    int bpp = PIXMAN_FORMAT_BPP (dest_image->bits.format) / 8;
-    uint32_t n_bytes = width * bpp;
-    int dst_stride, src_stride;
-    uint8_t    *dst;
-    uint8_t    *src;
-
-    src_stride = src_image->bits.rowstride * 4;
-    dst_stride = dest_image->bits.rowstride * 4;
-
-    src = (uint8_t *)src_image->bits.bits + src_y * src_stride + src_x * bpp;
-    dst = (uint8_t *)dest_image->bits.bits + dest_y * dst_stride + dest_x * bpp;
-
-    while (height--)
-    {
-	memcpy (dst, src, n_bytes);
-
-	dst += dst_stride;
-	src += src_stride;
-    }
-}
-
-FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER)
-FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE)
-FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD)
-FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL)
-FAST_NEAREST (x888_8888_cover, x888, 8888, uint32_t, uint32_t, SRC, COVER)
-FAST_NEAREST (x888_8888_pad, x888, 8888, uint32_t, uint32_t, SRC, PAD)
-FAST_NEAREST (x888_8888_normal, x888, 8888, uint32_t, uint32_t, SRC, NORMAL)
-FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER)
-FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE)
-FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD)
-FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL)
-FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER)
-FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE)
-FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD)
-FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL)
-FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL)
-FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER)
-FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE)
-FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD)
-FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL)
-
-/* Use more unrolling for src_0565_0565 because it is typically CPU bound */
-static force_inline void
-scaled_nearest_scanline_565_565_SRC (uint16_t *       dst,
-				     const uint16_t * src,
-				     int32_t          w,
-				     pixman_fixed_t   vx,
-				     pixman_fixed_t   unit_x,
-				     pixman_fixed_t   max_vx,
-				     pixman_bool_t    fully_transparent_src)
-{
-    uint16_t tmp1, tmp2, tmp3, tmp4;
-    while ((w -= 4) >= 0)
-    {
-	tmp1 = src[pixman_fixed_to_int (vx)];
-	vx += unit_x;
-	tmp2 = src[pixman_fixed_to_int (vx)];
-	vx += unit_x;
-	tmp3 = src[pixman_fixed_to_int (vx)];
-	vx += unit_x;
-	tmp4 = src[pixman_fixed_to_int (vx)];
-	vx += unit_x;
-	*dst++ = tmp1;
-	*dst++ = tmp2;
-	*dst++ = tmp3;
-	*dst++ = tmp4;
-    }
-    if (w & 2)
-    {
-	tmp1 = src[pixman_fixed_to_int (vx)];
-	vx += unit_x;
-	tmp2 = src[pixman_fixed_to_int (vx)];
-	vx += unit_x;
-	*dst++ = tmp1;
-	*dst++ = tmp2;
-    }
-    if (w & 1)
-	*dst++ = src[pixman_fixed_to_int (vx)];
-}
-
-FAST_NEAREST_MAINLOOP (565_565_cover_SRC,
-		       scaled_nearest_scanline_565_565_SRC,
-		       uint16_t, uint16_t, COVER)
-FAST_NEAREST_MAINLOOP (565_565_none_SRC,
-		       scaled_nearest_scanline_565_565_SRC,
-		       uint16_t, uint16_t, NONE)
-FAST_NEAREST_MAINLOOP (565_565_pad_SRC,
-		       scaled_nearest_scanline_565_565_SRC,
-		       uint16_t, uint16_t, PAD)
-
-static force_inline uint32_t
-fetch_nearest (pixman_repeat_t src_repeat,
-	       pixman_format_code_t format,
-	       uint32_t *src, int x, int src_width)
-{
-    if (repeat (src_repeat, &x, src_width))
-    {
-	if (format == PIXMAN_x8r8g8b8)
-	    return *(src + x) | 0xff000000;
-	else
-	    return *(src + x);
-    }
-    else
-    {
-	return 0;
-    }
-}
-
-static force_inline void
-combine_over (uint32_t s, uint32_t *dst)
-{
-    if (s)
-    {
-	uint8_t ia = 0xff - (s >> 24);
-
-	if (ia)
-	    UN8x4_MUL_UN8_ADD_UN8x4 (*dst, ia, s);
-	else
-	    *dst = s;
-    }
-}
-
-static force_inline void
-combine_src (uint32_t s, uint32_t *dst)
-{
-    *dst = s;
-}
-
-static void
-fast_composite_scaled_nearest (pixman_implementation_t *imp,
-			       pixman_composite_info_t *info)
-{
-    PIXMAN_COMPOSITE_ARGS (info);
-    uint32_t       *dst_line;
-    uint32_t       *src_line;
-    int             dst_stride, src_stride;
-    int		    src_width, src_height;
-    pixman_repeat_t src_repeat;
-    pixman_fixed_t unit_x, unit_y;
-    pixman_format_code_t src_format;
-    pixman_vector_t v;
-    pixman_fixed_t vy;
-
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be
-     * transformed from destination space to source space
-     */
-    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, uint32_t, src_stride, src_line, 1);
-
-    /* reference point is the center of the pixel */
-    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    if (!pixman_transform_point_3d (src_image->common.transform, &v))
-	return;
-
-    unit_x = src_image->common.transform->matrix[0][0];
-    unit_y = src_image->common.transform->matrix[1][1];
-
-    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */
-    v.vector[0] -= pixman_fixed_e;
-    v.vector[1] -= pixman_fixed_e;
-
-    src_height = src_image->bits.height;
-    src_width = src_image->bits.width;
-    src_repeat = src_image->common.repeat;
-    src_format = src_image->bits.format;
-
-    vy = v.vector[1];
-    while (height--)
-    {
-        pixman_fixed_t vx = v.vector[0];
-	int y = pixman_fixed_to_int (vy);
-	uint32_t *dst = dst_line;
-
-	dst_line += dst_stride;
-
-        /* adjust the y location by a unit vector in the y direction
-         * this is equivalent to transforming y+1 of the destination point to source space */
-        vy += unit_y;
-
-	if (!repeat (src_repeat, &y, src_height))
-	{
-	    if (op == PIXMAN_OP_SRC)
-		memset (dst, 0, sizeof (*dst) * width);
-	}
-	else
-	{
-	    int w = width;
-
-	    uint32_t *src = src_line + y * src_stride;
-
-	    while (w >= 2)
-	    {
-		uint32_t s1, s2;
-		int x1, x2;
-
-		x1 = pixman_fixed_to_int (vx);
-		vx += unit_x;
-
-		x2 = pixman_fixed_to_int (vx);
-		vx += unit_x;
-
-		w -= 2;
-
-		s1 = fetch_nearest (src_repeat, src_format, src, x1, src_width);
-		s2 = fetch_nearest (src_repeat, src_format, src, x2, src_width);
-
-		if (op == PIXMAN_OP_OVER)
-		{
-		    combine_over (s1, dst++);
-		    combine_over (s2, dst++);
-		}
-		else
-		{
-		    combine_src (s1, dst++);
-		    combine_src (s2, dst++);
-		}
-	    }
-
-	    while (w--)
-	    {
-		uint32_t s;
-		int x;
-
-		x = pixman_fixed_to_int (vx);
-		vx += unit_x;
-
-		s = fetch_nearest (src_repeat, src_format, src, x, src_width);
-
-		if (op == PIXMAN_OP_OVER)
-		    combine_over (s, dst++);
-		else
-		    combine_src (s, dst++);
-	    }
-	}
-    }
-}
-
-#define CACHE_LINE_SIZE 64
-
-#define FAST_SIMPLE_ROTATE(suffix, pix_type)                                  \
-                                                                              \
-static void                                                                   \
-blt_rotated_90_trivial_##suffix (pix_type       *dst,                         \
-				 int             dst_stride,                  \
-				 const pix_type *src,                         \
-				 int             src_stride,                  \
-				 int             w,                           \
-				 int             h)                           \
-{                                                                             \
-    int x, y;                                                                 \
-    for (y = 0; y < h; y++)                                                   \
-    {                                                                         \
-	const pix_type *s = src + (h - y - 1);                                \
-	pix_type *d = dst + dst_stride * y;                                   \
-	for (x = 0; x < w; x++)                                               \
-	{                                                                     \
-	    *d++ = *s;                                                        \
-	    s += src_stride;                                                  \
-	}                                                                     \
-    }                                                                         \
-}                                                                             \
-                                                                              \
-static void                                                                   \
-blt_rotated_270_trivial_##suffix (pix_type       *dst,                        \
-				  int             dst_stride,                 \
-				  const pix_type *src,                        \
-				  int             src_stride,                 \
-				  int             w,                          \
-				  int             h)                          \
-{                                                                             \
-    int x, y;                                                                 \
-    for (y = 0; y < h; y++)                                                   \
-    {                                                                         \
-	const pix_type *s = src + src_stride * (w - 1) + y;                   \
-	pix_type *d = dst + dst_stride * y;                                   \
-	for (x = 0; x < w; x++)                                               \
-	{                                                                     \
-	    *d++ = *s;                                                        \
-	    s -= src_stride;                                                  \
-	}                                                                     \
-    }                                                                         \
-}                                                                             \
-                                                                              \
-static void                                                                   \
-blt_rotated_90_##suffix (pix_type       *dst,                                 \
-			 int             dst_stride,                          \
-			 const pix_type *src,                                 \
-			 int             src_stride,                          \
-			 int             W,                                   \
-			 int             H)                                   \
-{                                                                             \
-    int x;                                                                    \
-    int leading_pixels = 0, trailing_pixels = 0;                              \
-    const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type);                 \
-                                                                              \
-    /*                                                                        \
-     * split processing into handling destination as TILE_SIZExH cache line   \
-     * aligned vertical stripes (optimistically assuming that destination     \
-     * stride is a multiple of cache line, if not - it will be just a bit     \
-     * slower)                                                                \
-     */                                                                       \
-                                                                              \
-    if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1))                               \
-    {                                                                         \
-	leading_pixels = TILE_SIZE - (((uintptr_t)dst &                       \
-			    (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
-	if (leading_pixels > W)                                               \
-	    leading_pixels = W;                                               \
-                                                                              \
-	/* unaligned leading part NxH (where N < TILE_SIZE) */                \
-	blt_rotated_90_trivial_##suffix (                                     \
-	    dst,                                                              \
-	    dst_stride,                                                       \
-	    src,                                                              \
-	    src_stride,                                                       \
-	    leading_pixels,                                                   \
-	    H);                                                               \
-	                                                                      \
-	dst += leading_pixels;                                                \
-	src += leading_pixels * src_stride;                                   \
-	W -= leading_pixels;                                                  \
-    }                                                                         \
-                                                                              \
-    if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1))                         \
-    {                                                                         \
-	trailing_pixels = (((uintptr_t)(dst + W) &                            \
-			    (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
-	if (trailing_pixels > W)                                              \
-	    trailing_pixels = W;                                              \
-	W -= trailing_pixels;                                                 \
-    }                                                                         \
-                                                                              \
-    for (x = 0; x < W; x += TILE_SIZE)                                        \
-    {                                                                         \
-	/* aligned middle part TILE_SIZExH */                                 \
-	blt_rotated_90_trivial_##suffix (                                     \
-	    dst + x,                                                          \
-	    dst_stride,                                                       \
-	    src + src_stride * x,                                             \
-	    src_stride,                                                       \
-	    TILE_SIZE,                                                        \
-	    H);                                                               \
-    }                                                                         \
-                                                                              \
-    if (trailing_pixels)                                                      \
-    {                                                                         \
-	/* unaligned trailing part NxH (where N < TILE_SIZE) */               \
-	blt_rotated_90_trivial_##suffix (                                     \
-	    dst + W,                                                          \
-	    dst_stride,                                                       \
-	    src + W * src_stride,                                             \
-	    src_stride,                                                       \
-	    trailing_pixels,                                                  \
-	    H);                                                               \
-    }                                                                         \
-}                                                                             \
-                                                                              \
-static void                                                                   \
-blt_rotated_270_##suffix (pix_type       *dst,                                \
-			  int             dst_stride,                         \
-			  const pix_type *src,                                \
-			  int             src_stride,                         \
-			  int             W,                                  \
-			  int             H)                                  \
-{                                                                             \
-    int x;                                                                    \
-    int leading_pixels = 0, trailing_pixels = 0;                              \
-    const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type);                 \
-                                                                              \
-    /*                                                                        \
-     * split processing into handling destination as TILE_SIZExH cache line   \
-     * aligned vertical stripes (optimistically assuming that destination     \
-     * stride is a multiple of cache line, if not - it will be just a bit     \
-     * slower)                                                                \
-     */                                                                       \
-                                                                              \
-    if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1))                               \
-    {                                                                         \
-	leading_pixels = TILE_SIZE - (((uintptr_t)dst &                       \
-			    (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
-	if (leading_pixels > W)                                               \
-	    leading_pixels = W;                                               \
-                                                                              \
-	/* unaligned leading part NxH (where N < TILE_SIZE) */                \
-	blt_rotated_270_trivial_##suffix (                                    \
-	    dst,                                                              \
-	    dst_stride,                                                       \
-	    src + src_stride * (W - leading_pixels),                          \
-	    src_stride,                                                       \
-	    leading_pixels,                                                   \
-	    H);                                                               \
-	                                                                      \
-	dst += leading_pixels;                                                \
-	W -= leading_pixels;                                                  \
-    }                                                                         \
-                                                                              \
-    if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1))                         \
-    {                                                                         \
-	trailing_pixels = (((uintptr_t)(dst + W) &                            \
-			    (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
-	if (trailing_pixels > W)                                              \
-	    trailing_pixels = W;                                              \
-	W -= trailing_pixels;                                                 \
-	src += trailing_pixels * src_stride;                                  \
-    }                                                                         \
-                                                                              \
-    for (x = 0; x < W; x += TILE_SIZE)                                        \
-    {                                                                         \
-	/* aligned middle part TILE_SIZExH */                                 \
-	blt_rotated_270_trivial_##suffix (                                    \
-	    dst + x,                                                          \
-	    dst_stride,                                                       \
-	    src + src_stride * (W - x - TILE_SIZE),                           \
-	    src_stride,                                                       \
-	    TILE_SIZE,                                                        \
-	    H);                                                               \
-    }                                                                         \
-                                                                              \
-    if (trailing_pixels)                                                      \
-    {                                                                         \
-	/* unaligned trailing part NxH (where N < TILE_SIZE) */               \
-	blt_rotated_270_trivial_##suffix (                                    \
-	    dst + W,                                                          \
-	    dst_stride,                                                       \
-	    src - trailing_pixels * src_stride,                               \
-	    src_stride,                                                       \
-	    trailing_pixels,                                                  \
-	    H);                                                               \
-    }                                                                         \
-}                                                                             \
-                                                                              \
-static void                                                                   \
-fast_composite_rotate_90_##suffix (pixman_implementation_t *imp,              \
-				   pixman_composite_info_t *info)	      \
-{									      \
-    PIXMAN_COMPOSITE_ARGS (info);					      \
-    pix_type       *dst_line;						      \
-    pix_type       *src_line;                                                 \
-    int             dst_stride, src_stride;                                   \
-    int             src_x_t, src_y_t;                                         \
-                                                                              \
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, pix_type,              \
-			   dst_stride, dst_line, 1);                          \
-    src_x_t = -src_y + pixman_fixed_to_int (                                  \
-				src_image->common.transform->matrix[0][2] +   \
-				pixman_fixed_1 / 2 - pixman_fixed_e) - height;\
-    src_y_t = src_x + pixman_fixed_to_int (                                   \
-				src_image->common.transform->matrix[1][2] +   \
-				pixman_fixed_1 / 2 - pixman_fixed_e);         \
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type,             \
-			   src_stride, src_line, 1);                          \
-    blt_rotated_90_##suffix (dst_line, dst_stride, src_line, src_stride,      \
-			     width, height);                                  \
-}                                                                             \
-                                                                              \
-static void                                                                   \
-fast_composite_rotate_270_##suffix (pixman_implementation_t *imp,             \
-				    pixman_composite_info_t *info)            \
-{                                                                             \
-    PIXMAN_COMPOSITE_ARGS (info);					      \
-    pix_type       *dst_line;						      \
-    pix_type       *src_line;                                                 \
-    int             dst_stride, src_stride;                                   \
-    int             src_x_t, src_y_t;                                         \
-                                                                              \
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, pix_type,              \
-			   dst_stride, dst_line, 1);                          \
-    src_x_t = src_y + pixman_fixed_to_int (                                   \
-				src_image->common.transform->matrix[0][2] +   \
-				pixman_fixed_1 / 2 - pixman_fixed_e);         \
-    src_y_t = -src_x + pixman_fixed_to_int (                                  \
-				src_image->common.transform->matrix[1][2] +   \
-				pixman_fixed_1 / 2 - pixman_fixed_e) - width; \
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type,             \
-			   src_stride, src_line, 1);                          \
-    blt_rotated_270_##suffix (dst_line, dst_stride, src_line, src_stride,     \
-			      width, height);                                 \
-}
-
-FAST_SIMPLE_ROTATE (8, uint8_t)
-FAST_SIMPLE_ROTATE (565, uint16_t)
-FAST_SIMPLE_ROTATE (8888, uint32_t)
-
-static const pixman_fast_path_t c_fast_paths[] =
-{
-    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, fast_composite_over_n_8_0565),
-    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, fast_composite_over_n_8_0565),
-    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, fast_composite_over_n_8_0888),
-    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, fast_composite_over_n_8_0888),
-    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, fast_composite_over_n_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, fast_composite_over_n_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, fast_composite_over_n_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, fast_composite_over_n_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, fast_composite_over_n_1_8888),
-    PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, fast_composite_over_n_1_8888),
-    PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, fast_composite_over_n_1_8888),
-    PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, fast_composite_over_n_1_8888),
-    PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5,   fast_composite_over_n_1_0565),
-    PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5,   fast_composite_over_n_1_0565),
-    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, fast_composite_over_n_8888_8888_ca),
-    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, fast_composite_over_n_8888_8888_ca),
-    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, fast_composite_over_n_8888_0565_ca),
-    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, fast_composite_over_n_8888_8888_ca),
-    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, fast_composite_over_n_8888_8888_ca),
-    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, fast_composite_over_n_8888_0565_ca),
-    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, fast_composite_over_x888_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, fast_composite_over_x888_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, fast_composite_over_x888_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, fast_composite_over_x888_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, fast_composite_over_8888_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, fast_composite_over_8888_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, fast_composite_over_8888_0565),
-    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, fast_composite_over_8888_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, fast_composite_over_8888_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565),
-    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888),
-    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888),
-    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8),
-    PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1000_1000),
-    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca),
-    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8),
-    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8r8g8b8, fast_composite_solid_fill),
-    PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, fast_composite_solid_fill),
-    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, fast_composite_solid_fill),
-    PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, fast_composite_solid_fill),
-    PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, fast_composite_solid_fill),
-    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, fast_composite_solid_fill),
-    PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, fast_composite_solid_fill),
-    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, fast_composite_src_x888_8888),
-    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, fast_composite_src_x888_8888),
-    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
-    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, fast_composite_src_memcpy),
-    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
-    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
-    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, fast_composite_src_memcpy),
-    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
-    PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8x8, fast_composite_src_memcpy),
-    PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8a8, fast_composite_src_memcpy),
-    PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, fast_composite_src_memcpy),
-    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, fast_composite_src_memcpy),
-    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, fast_composite_src_memcpy),
-    PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, fast_composite_src_memcpy),
-    PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, fast_composite_src_memcpy),
-    PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
-    PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
-    PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy),
-    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
-    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
-    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
-    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
-    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8),
-    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8),
-
-    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888_8888),
-    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888_8888),
-    SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, 8888_8888),
-    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, 8888_8888),
-
-    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888_8888),
-    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, 8888_8888),
-
-    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, 8888_565),
-    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, 8888_565),
-
-    SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565),
-
-    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, x8r8g8b8, a8r8g8b8, x888_8888),
-    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, x8b8g8r8, a8b8g8r8, x888_8888),
-    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, x8r8g8b8, a8r8g8b8, x888_8888),
-    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, x8b8g8r8, a8b8g8r8, x888_8888),
-    SIMPLE_NEAREST_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, x888_8888),
-    SIMPLE_NEAREST_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, x888_8888),
-
-    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, 8888_8888),
-    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, 8888_8888),
-    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, 8888_8888),
-    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, 8888_8888),
-
-    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565),
-
-#define NEAREST_FAST_PATH(op,s,d)		\
-    {   PIXMAN_OP_ ## op,			\
-	PIXMAN_ ## s, SCALED_NEAREST_FLAGS,	\
-	PIXMAN_null, 0,				\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,	\
-	fast_composite_scaled_nearest,		\
-    }
-
-    NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8),
-    NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8),
-    NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8),
-    NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8),
-
-    NEAREST_FAST_PATH (SRC, x8r8g8b8, a8r8g8b8),
-    NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8),
-    NEAREST_FAST_PATH (SRC, x8b8g8r8, a8b8g8r8),
-    NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8),
-
-    NEAREST_FAST_PATH (OVER, x8r8g8b8, x8r8g8b8),
-    NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8),
-    NEAREST_FAST_PATH (OVER, x8b8g8r8, x8b8g8r8),
-    NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8),
-
-    NEAREST_FAST_PATH (OVER, x8r8g8b8, a8r8g8b8),
-    NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8),
-    NEAREST_FAST_PATH (OVER, x8b8g8r8, a8b8g8r8),
-    NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8),
-
-#define SIMPLE_ROTATE_FLAGS(angle)					  \
-    (FAST_PATH_ROTATE_ ## angle ## _TRANSFORM	|			  \
-     FAST_PATH_NEAREST_FILTER			|			  \
-     FAST_PATH_SAMPLES_COVER_CLIP		|			  \
-     FAST_PATH_STANDARD_FLAGS)
-
-#define SIMPLE_ROTATE_FAST_PATH(op,s,d,suffix)				  \
-    {   PIXMAN_OP_ ## op,						  \
-	PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (90),				  \
-	PIXMAN_null, 0,							  \
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				  \
-	fast_composite_rotate_90_##suffix,				  \
-    },									  \
-    {   PIXMAN_OP_ ## op,						  \
-	PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (270),			  \
-	PIXMAN_null, 0,							  \
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				  \
-	fast_composite_rotate_270_##suffix,				  \
-    }
-
-    SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888),
-    SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888),
-    SIMPLE_ROTATE_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888),
-    SIMPLE_ROTATE_FAST_PATH (SRC, r5g6b5, r5g6b5, 565),
-    SIMPLE_ROTATE_FAST_PATH (SRC, a8, a8, 8),
-
-    {   PIXMAN_OP_NONE	},
-};
-
-#ifdef WORDS_BIGENDIAN
-#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (32 - (offs) - (n)))
-#else
-#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (offs))
-#endif
-
-static force_inline void
-pixman_fill1_line (uint32_t *dst, int offs, int width, int v)
-{
-    if (offs)
-    {
-	int leading_pixels = 32 - offs;
-	if (leading_pixels >= width)
-	{
-	    if (v)
-		*dst |= A1_FILL_MASK (width, offs);
-	    else
-		*dst &= ~A1_FILL_MASK (width, offs);
-	    return;
-	}
-	else
-	{
-	    if (v)
-		*dst++ |= A1_FILL_MASK (leading_pixels, offs);
-	    else
-		*dst++ &= ~A1_FILL_MASK (leading_pixels, offs);
-	    width -= leading_pixels;
-	}
-    }
-    while (width >= 32)
-    {
-	if (v)
-	    *dst++ = 0xFFFFFFFF;
-	else
-	    *dst++ = 0;
-	width -= 32;
-    }
-    if (width > 0)
-    {
-	if (v)
-	    *dst |= A1_FILL_MASK (width, 0);
-	else
-	    *dst &= ~A1_FILL_MASK (width, 0);
-    }
-}
-
-static void
-pixman_fill1 (uint32_t *bits,
-              int       stride,
-              int       x,
-              int       y,
-              int       width,
-              int       height,
-              uint32_t  xor)
-{
-    uint32_t *dst = bits + y * stride + (x >> 5);
-    int offs = x & 31;
-
-    if (xor & 1)
-    {
-	while (height--)
-	{
-	    pixman_fill1_line (dst, offs, width, 1);
-	    dst += stride;
-	}
-    }
-    else
-    {
-	while (height--)
-	{
-	    pixman_fill1_line (dst, offs, width, 0);
-	    dst += stride;
-	}
-    }
-}
-
-static void
-pixman_fill8 (uint32_t *bits,
-              int       stride,
-              int       x,
-              int       y,
-              int       width,
-              int       height,
-              uint32_t xor)
-{
-    int byte_stride = stride * (int) sizeof (uint32_t);
-    uint8_t *dst = (uint8_t *) bits;
-    uint8_t v = xor & 0xff;
-    int i;
-
-    dst = dst + y * byte_stride + x;
-
-    while (height--)
-    {
-	for (i = 0; i < width; ++i)
-	    dst[i] = v;
-
-	dst += byte_stride;
-    }
-}
-
-static void
-pixman_fill16 (uint32_t *bits,
-               int       stride,
-               int       x,
-               int       y,
-               int       width,
-               int       height,
-               uint32_t xor)
-{
-    int short_stride =
-	(stride * (int)sizeof (uint32_t)) / (int)sizeof (uint16_t);
-    uint16_t *dst = (uint16_t *)bits;
-    uint16_t v = xor & 0xffff;
-    int i;
-
-    dst = dst + y * short_stride + x;
-
-    while (height--)
-    {
-	for (i = 0; i < width; ++i)
-	    dst[i] = v;
-
-	dst += short_stride;
-    }
-}
-
-static void
-pixman_fill32 (uint32_t *bits,
-               int       stride,
-               int       x,
-               int       y,
-               int       width,
-               int       height,
-               uint32_t  xor)
-{
-    int i;
-
-    bits = bits + y * stride + x;
-
-    while (height--)
-    {
-	for (i = 0; i < width; ++i)
-	    bits[i] = xor;
-
-	bits += stride;
-    }
-}
-
-static pixman_bool_t
-fast_path_fill (pixman_implementation_t *imp,
-                uint32_t *               bits,
-                int                      stride,
-                int                      bpp,
-                int                      x,
-                int                      y,
-                int                      width,
-                int                      height,
-                uint32_t		 xor)
-{
-    switch (bpp)
-    {
-    case 1:
-	pixman_fill1 (bits, stride, x, y, width, height, xor);
-	break;
-
-    case 8:
-	pixman_fill8 (bits, stride, x, y, width, height, xor);
-	break;
-
-    case 16:
-	pixman_fill16 (bits, stride, x, y, width, height, xor);
-	break;
-
-    case 32:
-	pixman_fill32 (bits, stride, x, y, width, height, xor);
-	break;
-
-    default:
-	return _pixman_implementation_fill (
-	    imp->delegate, bits, stride, bpp, x, y, width, height, xor);
-	break;
-    }
-
-    return TRUE;
-}
-
-pixman_implementation_t *
-_pixman_implementation_create_fast_path (pixman_implementation_t *fallback)
-{
-    pixman_implementation_t *imp = _pixman_implementation_create (fallback, c_fast_paths);
-
-    imp->fill = fast_path_fill;
-
-    return imp;
-}
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author:  Keith Packard, SuSE, Inc.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <string.h>
+#include <stdlib.h>
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include "pixman-inlines.h"
+
+static force_inline uint32_t
+fetch_24 (uint8_t *a)
+{
+    if (((unsigned long)a) & 1)
+    {
+#ifdef WORDS_BIGENDIAN
+	return (*a << 16) | (*(uint16_t *)(a + 1));
+#else
+	return *a | (*(uint16_t *)(a + 1) << 8);
+#endif
+    }
+    else
+    {
+#ifdef WORDS_BIGENDIAN
+	return (*(uint16_t *)a << 8) | *(a + 2);
+#else
+	return *(uint16_t *)a | (*(a + 2) << 16);
+#endif
+    }
+}
+
+static force_inline void
+store_24 (uint8_t *a,
+          uint32_t v)
+{
+    if (((unsigned long)a) & 1)
+    {
+#ifdef WORDS_BIGENDIAN
+	*a = (uint8_t) (v >> 16);
+	*(uint16_t *)(a + 1) = (uint16_t) (v);
+#else
+	*a = (uint8_t) (v);
+	*(uint16_t *)(a + 1) = (uint16_t) (v >> 8);
+#endif
+    }
+    else
+    {
+#ifdef WORDS_BIGENDIAN
+	*(uint16_t *)a = (uint16_t)(v >> 8);
+	*(a + 2) = (uint8_t)v;
+#else
+	*(uint16_t *)a = (uint16_t)v;
+	*(a + 2) = (uint8_t)(v >> 16);
+#endif
+    }
+}
+
+static force_inline uint32_t
+over (uint32_t src,
+      uint32_t dest)
+{
+    uint32_t a = ~src >> 24;
+
+    UN8x4_MUL_UN8_ADD_UN8x4 (dest, a, src);
+
+    return dest;
+}
+
+static uint32_t
+in (uint32_t x,
+    uint8_t  y)
+{
+    uint16_t a = y;
+
+    UN8x4_MUL_UN8 (x, a);
+
+    return x;
+}
+
+/*
+ * Naming convention:
+ *
+ *  op_src_mask_dest
+ */
+static void
+fast_composite_over_x888_8_8888 (pixman_implementation_t *imp,
+                                 pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *src, *src_line;
+    uint32_t    *dst, *dst_line;
+    uint8_t     *mask, *mask_line;
+    int src_stride, mask_stride, dst_stride;
+    uint8_t m;
+    uint32_t s, d;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	src = src_line;
+	src_line += src_stride;
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+
+	w = width;
+	while (w--)
+	{
+	    m = *mask++;
+	    if (m)
+	    {
+		s = *src | 0xff000000;
+
+		if (m == 0xff)
+		{
+		    *dst = s;
+		}
+		else
+		{
+		    d = in (s, m);
+		    *dst = over (d, *dst);
+		}
+	    }
+	    src++;
+	    dst++;
+	}
+    }
+}
+
+static void
+fast_composite_in_n_8_8 (pixman_implementation_t *imp,
+                         pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint8_t     *dst_line, *dst;
+    uint8_t     *mask_line, *mask, m;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint16_t t;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    if (srca == 0xff)
+    {
+	while (height--)
+	{
+	    dst = dst_line;
+	    dst_line += dst_stride;
+	    mask = mask_line;
+	    mask_line += mask_stride;
+	    w = width;
+
+	    while (w--)
+	    {
+		m = *mask++;
+
+		if (m == 0)
+		    *dst = 0;
+		else if (m != 0xff)
+		    *dst = MUL_UN8 (m, *dst, t);
+
+		dst++;
+	    }
+	}
+    }
+    else
+    {
+	while (height--)
+	{
+	    dst = dst_line;
+	    dst_line += dst_stride;
+	    mask = mask_line;
+	    mask_line += mask_stride;
+	    w = width;
+
+	    while (w--)
+	    {
+		m = *mask++;
+		m = MUL_UN8 (m, srca, t);
+
+		if (m == 0)
+		    *dst = 0;
+		else if (m != 0xff)
+		    *dst = MUL_UN8 (m, *dst, t);
+
+		dst++;
+	    }
+	}
+    }
+}
+
+static void
+fast_composite_in_8_8 (pixman_implementation_t *imp,
+                       pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint8_t s;
+    uint16_t t;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	{
+	    s = *src++;
+
+	    if (s == 0)
+		*dst = 0;
+	    else if (s != 0xff)
+		*dst = MUL_UN8 (s, *dst, t);
+
+	    dst++;
+	}
+    }
+}
+
+static void
+fast_composite_over_n_8_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint32_t    *dst_line, *dst, d;
+    uint8_t     *mask_line, *mask, m;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w--)
+	{
+	    m = *mask++;
+	    if (m == 0xff)
+	    {
+		if (srca == 0xff)
+		    *dst = src;
+		else
+		    *dst = over (src, *dst);
+	    }
+	    else if (m)
+	    {
+		d = in (src, m);
+		*dst = over (d, *dst);
+	    }
+	    dst++;
+	}
+    }
+}
+
+static void
+fast_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
+				   pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, s;
+    uint32_t    *dst_line, *dst, d;
+    uint32_t    *mask_line, *mask, ma;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w--)
+	{
+	    ma = *mask++;
+
+	    if (ma)
+	    {
+		d = *dst;
+		s = src;
+
+		UN8x4_MUL_UN8x4_ADD_UN8x4 (s, ma, d);
+
+		*dst = s;
+	    }
+
+	    dst++;
+	}
+    }
+}
+
+static void
+fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+                                    pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca, s;
+    uint32_t    *dst_line, *dst, d;
+    uint32_t    *mask_line, *mask, ma;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w--)
+	{
+	    ma = *mask++;
+	    if (ma == 0xffffffff)
+	    {
+		if (srca == 0xff)
+		    *dst = src;
+		else
+		    *dst = over (src, *dst);
+	    }
+	    else if (ma)
+	    {
+		d = *dst;
+		s = src;
+
+		UN8x4_MUL_UN8x4 (s, ma);
+		UN8x4_MUL_UN8 (ma, srca);
+		ma = ~ma;
+		UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
+
+		*dst = d;
+	    }
+
+	    dst++;
+	}
+    }
+}
+
+static void
+fast_composite_over_n_8_0888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint8_t     *dst_line, *dst;
+    uint32_t d;
+    uint8_t     *mask_line, *mask, m;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w--)
+	{
+	    m = *mask++;
+	    if (m == 0xff)
+	    {
+		if (srca == 0xff)
+		{
+		    d = src;
+		}
+		else
+		{
+		    d = fetch_24 (dst);
+		    d = over (src, d);
+		}
+		store_24 (dst, d);
+	    }
+	    else if (m)
+	    {
+		d = over (in (src, m), fetch_24 (dst));
+		store_24 (dst, d);
+	    }
+	    dst += 3;
+	}
+    }
+}
+
+static void
+fast_composite_over_n_8_0565 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint16_t    *dst_line, *dst;
+    uint32_t d;
+    uint8_t     *mask_line, *mask, m;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w--)
+	{
+	    m = *mask++;
+	    if (m == 0xff)
+	    {
+		if (srca == 0xff)
+		{
+		    d = src;
+		}
+		else
+		{
+		    d = *dst;
+		    d = over (src, CONVERT_0565_TO_0888 (d));
+		}
+		*dst = CONVERT_8888_TO_0565 (d);
+	    }
+	    else if (m)
+	    {
+		d = *dst;
+		d = over (in (src, m), CONVERT_0565_TO_0888 (d));
+		*dst = CONVERT_8888_TO_0565 (d);
+	    }
+	    dst++;
+	}
+    }
+}
+
+static void
+fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
+                                    pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t  src, srca, s;
+    uint16_t  src16;
+    uint16_t *dst_line, *dst;
+    uint32_t  d;
+    uint32_t *mask_line, *mask, ma;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+	return;
+
+    src16 = CONVERT_8888_TO_0565 (src);
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w--)
+	{
+	    ma = *mask++;
+	    if (ma == 0xffffffff)
+	    {
+		if (srca == 0xff)
+		{
+		    *dst = src16;
+		}
+		else
+		{
+		    d = *dst;
+		    d = over (src, CONVERT_0565_TO_0888 (d));
+		    *dst = CONVERT_8888_TO_0565 (d);
+		}
+	    }
+	    else if (ma)
+	    {
+		d = *dst;
+		d = CONVERT_0565_TO_0888 (d);
+
+		s = src;
+
+		UN8x4_MUL_UN8x4 (s, ma);
+		UN8x4_MUL_UN8 (ma, srca);
+		ma = ~ma;
+		UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
+
+		*dst = CONVERT_8888_TO_0565 (d);
+	    }
+	    dst++;
+	}
+    }
+}
+
+static void
+fast_composite_over_8888_8888 (pixman_implementation_t *imp,
+                               pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    uint8_t a;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	{
+	    s = *src++;
+	    a = s >> 24;
+	    if (a == 0xff)
+		*dst = s;
+	    else if (s)
+		*dst = over (s, *dst);
+	    dst++;
+	}
+    }
+}
+
+static void
+fast_composite_src_x888_8888 (pixman_implementation_t *imp,
+			      pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	    *dst++ = (*src++) | 0xff000000;
+    }
+}
+
+#if 0
+static void
+fast_composite_over_8888_0888 (pixman_implementation_t *imp,
+			       pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint32_t d;
+    uint32_t    *src_line, *src, s;
+    uint8_t a;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	{
+	    s = *src++;
+	    a = s >> 24;
+	    if (a)
+	    {
+		if (a == 0xff)
+		    d = s;
+		else
+		    d = over (s, fetch_24 (dst));
+
+		store_24 (dst, d);
+	    }
+	    dst += 3;
+	}
+    }
+}
+#endif
+
+static void
+fast_composite_over_8888_0565 (pixman_implementation_t *imp,
+                               pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst;
+    uint32_t d;
+    uint32_t    *src_line, *src, s;
+    uint8_t a;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	{
+	    s = *src++;
+	    a = s >> 24;
+	    if (s)
+	    {
+		if (a == 0xff)
+		{
+		    d = s;
+		}
+		else
+		{
+		    d = *dst;
+		    d = over (s, CONVERT_0565_TO_0888 (d));
+		}
+		*dst = CONVERT_8888_TO_0565 (d);
+	    }
+	    dst++;
+	}
+    }
+}
+
+static void
+fast_composite_src_x888_0565 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	{
+	    s = *src++;
+	    *dst = CONVERT_8888_TO_0565 (s);
+	    dst++;
+	}
+    }
+}
+
+static void
+fast_composite_add_8_8 (pixman_implementation_t *imp,
+			pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint8_t s, d;
+    uint16_t t;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	{
+	    s = *src++;
+	    if (s)
+	    {
+		if (s != 0xff)
+		{
+		    d = *dst;
+		    t = d + s;
+		    s = t | (0 - (t >> 8));
+		}
+		*dst = s;
+	    }
+	    dst++;
+	}
+    }
+}
+
+static void
+fast_composite_add_8888_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint32_t s, d;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	{
+	    s = *src++;
+	    if (s)
+	    {
+		if (s != 0xffffffff)
+		{
+		    d = *dst;
+		    if (d)
+			UN8x4_ADD_UN8x4 (s, d);
+		}
+		*dst = s;
+	    }
+	    dst++;
+	}
+    }
+}
+
+static void
+fast_composite_add_n_8_8 (pixman_implementation_t *imp,
+			  pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t src;
+    uint8_t sa;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    sa = (src >> 24);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w--)
+	{
+	    uint16_t tmp;
+	    uint16_t a;
+	    uint32_t m, d;
+	    uint32_t r;
+
+	    a = *mask++;
+	    d = *dst;
+
+	    m = MUL_UN8 (sa, a, tmp);
+	    r = ADD_UN8 (m, d, tmp);
+
+	    *dst++ = r;
+	}
+    }
+}
+
+#ifdef WORDS_BIGENDIAN /* pixel 0 of a 32-bit a1 word is its most significant bit */
+#define CREATE_BITMASK(n) (0x80000000 >> (n))
+#define UPDATE_BITMASK(n) ((n) >> 1)
+#else /* little endian: pixel 0 of a 32-bit a1 word is its least significant bit */
+#define CREATE_BITMASK(n) (1U << (n)) /* 1U: (1 << 31) would overflow a signed int */
+#define UPDATE_BITMASK(n) ((n) << 1)
+#endif
+
+#define TEST_BIT(p, n)	/* non-zero iff bit n of the uint32_t array p is set */ \
+    (*((p) + ((n) >> 5)) & CREATE_BITMASK ((n) & 31))
+#define SET_BIT(p, n)	/* no trailing ';': caller supplies it (if/else safe) */ \
+    do { *((p) + ((n) >> 5)) |= CREATE_BITMASK ((n) & 31); } while (0)
+
+/*
+ * ADD operator for two 1-bit-per-pixel images: every destination bit whose
+ * corresponding source bit is set gets set; all other destination bits are
+ * left as they are.
+ *
+ * The row pointers are fetched at x == 0 because the x offsets are applied
+ * per bit through TEST_BIT / SET_BIT.
+ */
+static void
+fast_composite_add_1000_1000 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *dst_row, *src_row;
+    int       dst_stride, src_stride;
+    int32_t   bit;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, src_y, uint32_t,
+                           src_stride, src_row, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, 0, dest_y, uint32_t,
+                           dst_stride, dst_row, 1);
+
+    for (; height != 0; height--)
+    {
+	/*
+	 * TODO: improve performance by processing uint32_t data instead
+	 *       of individual bits
+	 */
+	/* walk the row from its last bit (width - 1) down to bit 0 */
+	for (bit = width; bit != 0; )
+	{
+	    bit--;
+
+	    if (TEST_BIT (src_row, src_x + bit))
+	    {
+		SET_BIT (dst_row, dest_x + bit);
+	    }
+	}
+
+	dst_row += dst_stride;
+	src_row += src_stride;
+    }
+}
+
+/*
+ * OVER operator: solid source, 1-bit mask, 32-bit destination.
+ *
+ * Destination pixels whose mask bit is set receive the solid color: it is
+ * stored directly when its alpha byte is 0xff, and combined with the
+ * existing pixel through over() otherwise.  Pixels whose mask bit is clear
+ * are not touched.  Nothing is done for an empty width or an all-zero
+ * solid.
+ */
+static void
+fast_composite_over_n_1_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t  solid, alpha;
+    uint32_t *dst_row, *mask_row;
+    int       dst_stride, mask_stride;
+
+    /* every row reads its first mask word unconditionally */
+    if (width <= 0)
+	return;
+
+    solid = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    if (solid == 0)
+	return;
+    alpha = solid >> 24;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t,
+                           dst_stride, dst_row, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
+                           mask_stride, mask_row, 1);
+    /* skip to the mask word that holds bit mask_x */
+    mask_row += mask_x >> 5;
+
+    if (alpha == 0xff)
+    {
+	/* opaque solid: a plain store per selected pixel */
+	for (; height != 0; height--)
+	{
+	    uint32_t *d = dst_row;
+	    uint32_t *m = mask_row;
+	    uint32_t  word = *m++;
+	    uint32_t  bit = CREATE_BITMASK (mask_x & 31);
+	    int32_t   left;
+
+	    for (left = width; left != 0; left--, d++)
+	    {
+		/* mask ran off the current word: load the next one lazily */
+		if (bit == 0)
+		{
+		    word = *m++;
+		    bit = CREATE_BITMASK (0);
+		}
+
+		if (word & bit)
+		    *d = solid;
+
+		bit = UPDATE_BITMASK (bit);
+	    }
+
+	    dst_row += dst_stride;
+	    mask_row += mask_stride;
+	}
+    }
+    else
+    {
+	/* translucent solid: blend each selected pixel */
+	for (; height != 0; height--)
+	{
+	    uint32_t *d = dst_row;
+	    uint32_t *m = mask_row;
+	    uint32_t  word = *m++;
+	    uint32_t  bit = CREATE_BITMASK (mask_x & 31);
+	    int32_t   left;
+
+	    for (left = width; left != 0; left--, d++)
+	    {
+		if (bit == 0)
+		{
+		    word = *m++;
+		    bit = CREATE_BITMASK (0);
+		}
+
+		if (word & bit)
+		    *d = over (solid, *d);
+
+		bit = UPDATE_BITMASK (bit);
+	    }
+
+	    dst_row += dst_stride;
+	    mask_row += mask_stride;
+	}
+    }
+}
+
+/*
+ * OVER operator: solid source, 1-bit mask, 16-bit (565) destination.
+ *
+ * Destination pixels whose mask bit is set receive the solid color.  When
+ * the solid's alpha byte is 0xff the color is converted to 565 once and
+ * stored directly; otherwise each selected pixel is expanded with
+ * CONVERT_0565_TO_0888, combined through over() and converted back.
+ * Pixels whose mask bit is clear are not touched.  Nothing is done for an
+ * empty width or an all-zero solid.
+ */
+static void
+fast_composite_over_n_1_0565 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t  solid, alpha;
+    uint16_t *dst_row;
+    uint32_t *mask_row;
+    int       dst_stride, mask_stride;
+
+    /* every row reads its first mask word unconditionally */
+    if (width <= 0)
+	return;
+
+    solid = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    if (solid == 0)
+	return;
+    alpha = solid >> 24;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t,
+                           dst_stride, dst_row, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
+                           mask_stride, mask_row, 1);
+    /* skip to the mask word that holds bit mask_x */
+    mask_row += mask_x >> 5;
+
+    if (alpha == 0xff)
+    {
+	/* opaque solid: convert once, then plain stores */
+	uint16_t solid565 = CONVERT_8888_TO_0565 (solid);
+
+	for (; height != 0; height--)
+	{
+	    uint16_t *d = dst_row;
+	    uint32_t *m = mask_row;
+	    uint32_t  word = *m++;
+	    uint32_t  bit = CREATE_BITMASK (mask_x & 31);
+	    int32_t   left;
+
+	    for (left = width; left != 0; left--, d++)
+	    {
+		/* mask ran off the current word: load the next one lazily */
+		if (bit == 0)
+		{
+		    word = *m++;
+		    bit = CREATE_BITMASK (0);
+		}
+
+		if (word & bit)
+		    *d = solid565;
+
+		bit = UPDATE_BITMASK (bit);
+	    }
+
+	    dst_row += dst_stride;
+	    mask_row += mask_stride;
+	}
+    }
+    else
+    {
+	/* translucent solid: blend each selected pixel in 32-bit space */
+	for (; height != 0; height--)
+	{
+	    uint16_t *d = dst_row;
+	    uint32_t *m = mask_row;
+	    uint32_t  word = *m++;
+	    uint32_t  bit = CREATE_BITMASK (mask_x & 31);
+	    int32_t   left;
+
+	    for (left = width; left != 0; left--, d++)
+	    {
+		if (bit == 0)
+		{
+		    word = *m++;
+		    bit = CREATE_BITMASK (0);
+		}
+
+		if (word & bit)
+		{
+		    uint32_t blended = over (solid, CONVERT_0565_TO_0888 (*d));
+
+		    *d = CONVERT_8888_TO_0565 (blended);
+		}
+
+		bit = UPDATE_BITMASK (bit);
+	    }
+
+	    dst_row += dst_stride;
+	    mask_row += mask_stride;
+	}
+    }
+}
+
+/*
+ * Solid fill and simple bitblt
+ */
+
+/*
+ * SRC operator with a solid source: fill the destination rectangle with a
+ * single value through pixman_fill().
+ *
+ * The solid color is fetched for the destination format and then reduced
+ * to the destination's pixel size: the top bit for a1, the top byte for a8
+ * and a 565 conversion for the two 16-bit formats.  Any other format uses
+ * the fetched 32-bit value unchanged.  The return value of pixman_fill()
+ * is not checked.
+ */
+static void
+fast_composite_solid_fill (pixman_implementation_t *imp,
+                           pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    pixman_format_code_t fmt = dest_image->bits.format;
+    uint32_t             pixel;
+
+    pixel = _pixman_image_get_solid (imp, src_image, fmt);
+
+    switch (fmt)
+    {
+    case PIXMAN_a1:
+	pixel = pixel >> 31;
+	break;
+
+    case PIXMAN_a8:
+	pixel = pixel >> 24;
+	break;
+
+    case PIXMAN_r5g6b5:
+    case PIXMAN_b5g6r5:
+	pixel = CONVERT_8888_TO_0565 (pixel);
+	break;
+
+    default:
+	break;
+    }
+
+    pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
+                 PIXMAN_FORMAT_BPP (fmt),
+                 dest_x, dest_y,
+                 width, height,
+                 pixel);
+}
+
+/*
+ * SRC operator between images whose pixels can be copied verbatim: each
+ * scanline of the rectangle is transferred with one memcpy().
+ *
+ * The pixel size is taken from the destination format and converted to
+ * whole bytes; rowstride is multiplied by 4 to obtain a stride in bytes.
+ */
+static void
+fast_composite_src_memcpy (pixman_implementation_t *imp,
+			   pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    int       bytes_per_pixel = PIXMAN_FORMAT_BPP (dest_image->bits.format) / 8;
+    uint32_t  row_bytes = width * bytes_per_pixel;
+    int       src_pitch = src_image->bits.rowstride * 4;
+    int       dst_pitch = dest_image->bits.rowstride * 4;
+    uint8_t  *from;
+    uint8_t  *to;
+
+    /* first byte of the rectangle in the source ... */
+    from = (uint8_t *)src_image->bits.bits;
+    from += src_y * src_pitch;
+    from += src_x * bytes_per_pixel;
+
+    /* ... and in the destination */
+    to = (uint8_t *)dest_image->bits.bits;
+    to += dest_y * dst_pitch;
+    to += dest_x * bytes_per_pixel;
+
+    for (; height != 0; height--)
+    {
+	memcpy (to, from, row_bytes);
+
+	to += dst_pitch;
+	from += src_pitch;
+    }
+}
+
+/*
+ * Instantiate the nearest-neighbour scaling fast paths through the
+ * FAST_NEAREST macro (defined outside this section).  Judging from the
+ * values passed, the arguments are: name suffix, source format,
+ * destination format, source pixel type, destination pixel type, operator
+ * and repeat mode -- confirm against the macro definition.
+ *
+ * For 565 -> 565 SRC only the NORMAL repeat variant is generated here; the
+ * COVER, NONE and PAD variants are instantiated further down with
+ * FAST_NEAREST_MAINLOOP around a hand-unrolled scanline function.
+ */
+FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER)
+FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE)
+FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD)
+FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL)
+FAST_NEAREST (x888_8888_cover, x888, 8888, uint32_t, uint32_t, SRC, COVER)
+FAST_NEAREST (x888_8888_pad, x888, 8888, uint32_t, uint32_t, SRC, PAD)
+FAST_NEAREST (x888_8888_normal, x888, 8888, uint32_t, uint32_t, SRC, NORMAL)
+FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER)
+FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE)
+FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD)
+FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL)
+FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER)
+FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE)
+FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD)
+FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL)
+FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL)
+FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER)
+FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE)
+FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD)
+FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL)
+
+/*
+ * Nearest-neighbour scanline copy for 565 -> 565 with the SRC operator.
+ *
+ * Use more unrolling for src_0565_0565 because it is typically CPU bound:
+ * the main loop handles four pixels per iteration, reading all four source
+ * pixels before writing any of them.  vx is the fixed-point source
+ * position and advances by unit_x per pixel.  max_vx and
+ * fully_transparent_src are accepted but not used here.
+ */
+static force_inline void
+scaled_nearest_scanline_565_565_SRC (uint16_t *       dst,
+				     const uint16_t * src,
+				     int32_t          w,
+				     pixman_fixed_t   vx,
+				     pixman_fixed_t   unit_x,
+				     pixman_fixed_t   max_vx,
+				     pixman_bool_t    fully_transparent_src)
+{
+    uint16_t p0, p1, p2, p3;
+
+    for (w -= 4; w >= 0; w -= 4)
+    {
+	p0 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+	p1 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+	p2 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+	p3 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+
+	dst[0] = p0;
+	dst[1] = p1;
+	dst[2] = p2;
+	dst[3] = p3;
+	dst += 4;
+    }
+
+    /*
+     * w is now in -4..-1; its two low bits equal the number of pixels
+     * still to be written (0..3).
+     */
+    if (w & 2)
+    {
+	p0 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+	p1 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+
+	dst[0] = p0;
+	dst[1] = p1;
+	dst += 2;
+    }
+
+    if (w & 1)
+	dst[0] = src[pixman_fixed_to_int (vx)];
+}
+
+/*
+ * Build the COVER, NONE and PAD main loops for 565 -> 565 SRC scaling
+ * around the unrolled scanline function
+ * scaled_nearest_scanline_565_565_SRC.  FAST_NEAREST_MAINLOOP is defined
+ * outside this section.
+ */
+FAST_NEAREST_MAINLOOP (565_565_cover_SRC,
+		       scaled_nearest_scanline_565_565_SRC,
+		       uint16_t, uint16_t, COVER)
+FAST_NEAREST_MAINLOOP (565_565_none_SRC,
+		       scaled_nearest_scanline_565_565_SRC,
+		       uint16_t, uint16_t, NONE)
+FAST_NEAREST_MAINLOOP (565_565_pad_SRC,
+		       scaled_nearest_scanline_565_565_SRC,
+		       uint16_t, uint16_t, PAD)
+
+/*
+ * Fetch one 32-bit source pixel at column x of the scanline src.
+ *
+ * x is first passed through repeat() (defined elsewhere; presumably it
+ * maps x into the scanline according to src_repeat -- confirm).  When
+ * repeat() reports failure the result is 0.  For PIXMAN_x8r8g8b8 sources
+ * the top byte of the fetched pixel is forced to 0xff.
+ */
+static force_inline uint32_t
+fetch_nearest (pixman_repeat_t src_repeat,
+	       pixman_format_code_t format,
+	       uint32_t *src, int x, int src_width)
+{
+    uint32_t pixel;
+
+    if (!repeat (src_repeat, &x, src_width))
+	return 0;
+
+    pixel = src[x];
+
+    if (format == PIXMAN_x8r8g8b8)
+	pixel |= 0xff000000;
+
+    return pixel;
+}
+
+/*
+ * Combine source pixel s onto *dst for the OVER operator.
+ *
+ * A zero source leaves *dst untouched.  A source whose top byte is 0xff
+ * replaces *dst.  Otherwise *dst is updated through
+ * UN8x4_MUL_UN8_ADD_UN8x4 with the inverted source alpha and s.
+ */
+static force_inline void
+combine_over (uint32_t s, uint32_t *dst)
+{
+    uint8_t inv_alpha;
+
+    if (s == 0)
+	return;
+
+    inv_alpha = 0xff - (s >> 24);
+
+    if (inv_alpha == 0)
+	*dst = s;
+    else
+	UN8x4_MUL_UN8_ADD_UN8x4 (*dst, inv_alpha, s);
+}
+
+/* Combine for the SRC operator: the source pixel simply replaces *dst. */
+static force_inline void
+combine_src (uint32_t s, uint32_t *dst)
+{
+    dst[0] = s;
+}
+
+/*
+ * Generic nearest-neighbour scaled composite for 32-bit source and
+ * destination, handling the SRC and OVER operators.
+ *
+ * The centre of the first destination pixel is transformed into source
+ * space once; after that the source position is stepped by the transform's
+ * matrix[0][0] per destination column and matrix[1][1] per destination
+ * row.  Source rows and columns are passed through repeat(); a row for
+ * which repeat() fails is written as zeros for SRC and skipped otherwise.
+ */
+static void
+fast_composite_scaled_nearest (pixman_implementation_t *imp,
+			       pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t       *dst_line;
+    uint32_t       *src_line;
+    int             dst_stride, src_stride;
+    int		    src_width, src_height;
+    pixman_repeat_t src_repeat;
+    pixman_fixed_t unit_x, unit_y;
+    pixman_format_code_t src_format;
+    pixman_vector_t v;
+    pixman_fixed_t vy;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be
+     * transformed from destination space to source space
+     */
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, uint32_t, src_stride, src_line, 1);
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    /* nothing is drawn when the start point cannot be transformed */
+    if (!pixman_transform_point_3d (src_image->common.transform, &v))
+	return;
+
+    /* per-pixel steps in source space, taken from the matrix diagonal */
+    unit_x = src_image->common.transform->matrix[0][0];
+    unit_y = src_image->common.transform->matrix[1][1];
+
+    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */
+    v.vector[0] -= pixman_fixed_e;
+    v.vector[1] -= pixman_fixed_e;
+
+    src_height = src_image->bits.height;
+    src_width = src_image->bits.width;
+    src_repeat = src_image->common.repeat;
+    src_format = src_image->bits.format;
+
+    vy = v.vector[1];
+    while (height--)
+    {
+        pixman_fixed_t vx = v.vector[0];
+	int y = pixman_fixed_to_int (vy);
+	uint32_t *dst = dst_line;
+
+	dst_line += dst_stride;
+
+        /* adjust the y location by a unit vector in the y direction
+         * this is equivalent to transforming y+1 of the destination point to source space */
+        vy += unit_y;
+
+	if (!repeat (src_repeat, &y, src_height))
+	{
+	    /* no source row here: SRC writes zeros, other operators leave
+	     * the destination row untouched */
+	    if (op == PIXMAN_OP_SRC)
+		memset (dst, 0, sizeof (*dst) * width);
+	}
+	else
+	{
+	    int w = width;
+
+	    uint32_t *src = src_line + y * src_stride;
+
+	    /* main loop: two pixels per iteration, both fetched before
+	     * either is combined */
+	    while (w >= 2)
+	    {
+		uint32_t s1, s2;
+		int x1, x2;
+
+		x1 = pixman_fixed_to_int (vx);
+		vx += unit_x;
+
+		x2 = pixman_fixed_to_int (vx);
+		vx += unit_x;
+
+		w -= 2;
+
+		s1 = fetch_nearest (src_repeat, src_format, src, x1, src_width);
+		s2 = fetch_nearest (src_repeat, src_format, src, x2, src_width);
+
+		if (op == PIXMAN_OP_OVER)
+		{
+		    combine_over (s1, dst++);
+		    combine_over (s2, dst++);
+		}
+		else
+		{
+		    combine_src (s1, dst++);
+		    combine_src (s2, dst++);
+		}
+	    }
+
+	    /* at most one pixel is left over after the paired loop */
+	    while (w--)
+	    {
+		uint32_t s;
+		int x;
+
+		x = pixman_fixed_to_int (vx);
+		vx += unit_x;
+
+		s = fetch_nearest (src_repeat, src_format, src, x, src_width);
+
+		if (op == PIXMAN_OP_OVER)
+		    combine_over (s, dst++);
+		else
+		    combine_src (s, dst++);
+	    }
+	}
+    }
+}
+
+/*
+ * Cache line size in bytes assumed by the rotation code below; it is used
+ * to derive the tile width and to test destination pointer alignment.
+ */
+#define CACHE_LINE_SIZE 64
+
+/*
+ * FAST_SIMPLE_ROTATE (suffix, pix_type) generates six functions for one
+ * pixel type:
+ *
+ *   blt_rotated_90_trivial_<suffix>, blt_rotated_270_trivial_<suffix>
+ *       Plain loops that fill a w x h destination block row by row.  For
+ *       destination pixel (x, y) the 90 variant reads
+ *       src[(h - y - 1) + x * src_stride] and the 270 variant reads
+ *       src[(w - 1 - x) * src_stride + y].
+ *
+ *   blt_rotated_90_<suffix>, blt_rotated_270_<suffix>
+ *       Split the destination into an unaligned leading stripe, a run of
+ *       TILE_SIZE-wide stripes that start on a cache line boundary, and an
+ *       unaligned trailing stripe, and hand each stripe to the matching
+ *       trivial function.  TILE_SIZE is CACHE_LINE_SIZE / sizeof (pix_type).
+ *
+ *   fast_composite_rotate_90_<suffix>, fast_composite_rotate_270_<suffix>
+ *       Composite entry points.  They derive the source origin from
+ *       src_x / src_y and the translation entries matrix[0][2] and
+ *       matrix[1][2] of the source transform, then call the blt function.
+ */
+#define FAST_SIMPLE_ROTATE(suffix, pix_type)                                  \
+                                                                              \
+static void                                                                   \
+blt_rotated_90_trivial_##suffix (pix_type       *dst,                         \
+				 int             dst_stride,                  \
+				 const pix_type *src,                         \
+				 int             src_stride,                  \
+				 int             w,                           \
+				 int             h)                           \
+{                                                                             \
+    int x, y;                                                                 \
+    for (y = 0; y < h; y++)                                                   \
+    {                                                                         \
+	const pix_type *s = src + (h - y - 1);                                \
+	pix_type *d = dst + dst_stride * y;                                   \
+	for (x = 0; x < w; x++)                                               \
+	{                                                                     \
+	    *d++ = *s;                                                        \
+	    s += src_stride;                                                  \
+	}                                                                     \
+    }                                                                         \
+}                                                                             \
+                                                                              \
+static void                                                                   \
+blt_rotated_270_trivial_##suffix (pix_type       *dst,                        \
+				  int             dst_stride,                 \
+				  const pix_type *src,                        \
+				  int             src_stride,                 \
+				  int             w,                          \
+				  int             h)                          \
+{                                                                             \
+    int x, y;                                                                 \
+    for (y = 0; y < h; y++)                                                   \
+    {                                                                         \
+	const pix_type *s = src + src_stride * (w - 1) + y;                   \
+	pix_type *d = dst + dst_stride * y;                                   \
+	for (x = 0; x < w; x++)                                               \
+	{                                                                     \
+	    *d++ = *s;                                                        \
+	    s -= src_stride;                                                  \
+	}                                                                     \
+    }                                                                         \
+}                                                                             \
+                                                                              \
+static void                                                                   \
+blt_rotated_90_##suffix (pix_type       *dst,                                 \
+			 int             dst_stride,                          \
+			 const pix_type *src,                                 \
+			 int             src_stride,                          \
+			 int             W,                                   \
+			 int             H)                                   \
+{                                                                             \
+    int x;                                                                    \
+    int leading_pixels = 0, trailing_pixels = 0;                              \
+    const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type);                 \
+                                                                              \
+    /*                                                                        \
+     * split processing into handling destination as TILE_SIZExH cache line   \
+     * aligned vertical stripes (optimistically assuming that destination     \
+     * stride is a multiple of cache line, if not - it will be just a bit     \
+     * slower)                                                                \
+     */                                                                       \
+                                                                              \
+    if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1))                               \
+    {                                                                         \
+	leading_pixels = TILE_SIZE - (((uintptr_t)dst &                       \
+			    (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
+	if (leading_pixels > W)                                               \
+	    leading_pixels = W;                                               \
+                                                                              \
+	/* unaligned leading part NxH (where N < TILE_SIZE) */                \
+	blt_rotated_90_trivial_##suffix (                                     \
+	    dst,                                                              \
+	    dst_stride,                                                       \
+	    src,                                                              \
+	    src_stride,                                                       \
+	    leading_pixels,                                                   \
+	    H);                                                               \
+	                                                                      \
+	dst += leading_pixels;                                                \
+	src += leading_pixels * src_stride;                                   \
+	W -= leading_pixels;                                                  \
+    }                                                                         \
+                                                                              \
+    if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1))                         \
+    {                                                                         \
+	trailing_pixels = (((uintptr_t)(dst + W) &                            \
+			    (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
+	if (trailing_pixels > W)                                              \
+	    trailing_pixels = W;                                              \
+	W -= trailing_pixels;                                                 \
+    }                                                                         \
+                                                                              \
+    for (x = 0; x < W; x += TILE_SIZE)                                        \
+    {                                                                         \
+	/* aligned middle part TILE_SIZExH */                                 \
+	blt_rotated_90_trivial_##suffix (                                     \
+	    dst + x,                                                          \
+	    dst_stride,                                                       \
+	    src + src_stride * x,                                             \
+	    src_stride,                                                       \
+	    TILE_SIZE,                                                        \
+	    H);                                                               \
+    }                                                                         \
+                                                                              \
+    if (trailing_pixels)                                                      \
+    {                                                                         \
+	/* unaligned trailing part NxH (where N < TILE_SIZE) */               \
+	blt_rotated_90_trivial_##suffix (                                     \
+	    dst + W,                                                          \
+	    dst_stride,                                                       \
+	    src + W * src_stride,                                             \
+	    src_stride,                                                       \
+	    trailing_pixels,                                                  \
+	    H);                                                               \
+    }                                                                         \
+}                                                                             \
+                                                                              \
+static void                                                                   \
+blt_rotated_270_##suffix (pix_type       *dst,                                \
+			  int             dst_stride,                         \
+			  const pix_type *src,                                \
+			  int             src_stride,                         \
+			  int             W,                                  \
+			  int             H)                                  \
+{                                                                             \
+    int x;                                                                    \
+    int leading_pixels = 0, trailing_pixels = 0;                              \
+    const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type);                 \
+                                                                              \
+    /*                                                                        \
+     * split processing into handling destination as TILE_SIZExH cache line   \
+     * aligned vertical stripes (optimistically assuming that destination     \
+     * stride is a multiple of cache line, if not - it will be just a bit     \
+     * slower)                                                                \
+     */                                                                       \
+                                                                              \
+    if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1))                               \
+    {                                                                         \
+	leading_pixels = TILE_SIZE - (((uintptr_t)dst &                       \
+			    (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
+	if (leading_pixels > W)                                               \
+	    leading_pixels = W;                                               \
+                                                                              \
+	/* unaligned leading part NxH (where N < TILE_SIZE) */                \
+	blt_rotated_270_trivial_##suffix (                                    \
+	    dst,                                                              \
+	    dst_stride,                                                       \
+	    src + src_stride * (W - leading_pixels),                          \
+	    src_stride,                                                       \
+	    leading_pixels,                                                   \
+	    H);                                                               \
+	                                                                      \
+	dst += leading_pixels;                                                \
+	W -= leading_pixels;                                                  \
+    }                                                                         \
+                                                                              \
+    if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1))                         \
+    {                                                                         \
+	trailing_pixels = (((uintptr_t)(dst + W) &                            \
+			    (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
+	if (trailing_pixels > W)                                              \
+	    trailing_pixels = W;                                              \
+	W -= trailing_pixels;                                                 \
+	src += trailing_pixels * src_stride;                                  \
+    }                                                                         \
+                                                                              \
+    for (x = 0; x < W; x += TILE_SIZE)                                        \
+    {                                                                         \
+	/* aligned middle part TILE_SIZExH */                                 \
+	blt_rotated_270_trivial_##suffix (                                    \
+	    dst + x,                                                          \
+	    dst_stride,                                                       \
+	    src + src_stride * (W - x - TILE_SIZE),                           \
+	    src_stride,                                                       \
+	    TILE_SIZE,                                                        \
+	    H);                                                               \
+    }                                                                         \
+                                                                              \
+    if (trailing_pixels)                                                      \
+    {                                                                         \
+	/* unaligned trailing part NxH (where N < TILE_SIZE) */               \
+	blt_rotated_270_trivial_##suffix (                                    \
+	    dst + W,                                                          \
+	    dst_stride,                                                       \
+	    src - trailing_pixels * src_stride,                               \
+	    src_stride,                                                       \
+	    trailing_pixels,                                                  \
+	    H);                                                               \
+    }                                                                         \
+}                                                                             \
+                                                                              \
+static void                                                                   \
+fast_composite_rotate_90_##suffix (pixman_implementation_t *imp,              \
+				   pixman_composite_info_t *info)	      \
+{									      \
+    PIXMAN_COMPOSITE_ARGS (info);					      \
+    pix_type       *dst_line;						      \
+    pix_type       *src_line;                                                 \
+    int             dst_stride, src_stride;                                   \
+    int             src_x_t, src_y_t;                                         \
+                                                                              \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, pix_type,              \
+			   dst_stride, dst_line, 1);                          \
+    src_x_t = -src_y + pixman_fixed_to_int (                                  \
+				src_image->common.transform->matrix[0][2] +   \
+				pixman_fixed_1 / 2 - pixman_fixed_e) - height;\
+    src_y_t = src_x + pixman_fixed_to_int (                                   \
+				src_image->common.transform->matrix[1][2] +   \
+				pixman_fixed_1 / 2 - pixman_fixed_e);         \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type,             \
+			   src_stride, src_line, 1);                          \
+    blt_rotated_90_##suffix (dst_line, dst_stride, src_line, src_stride,      \
+			     width, height);                                  \
+}                                                                             \
+                                                                              \
+static void                                                                   \
+fast_composite_rotate_270_##suffix (pixman_implementation_t *imp,             \
+				    pixman_composite_info_t *info)            \
+{                                                                             \
+    PIXMAN_COMPOSITE_ARGS (info);					      \
+    pix_type       *dst_line;						      \
+    pix_type       *src_line;                                                 \
+    int             dst_stride, src_stride;                                   \
+    int             src_x_t, src_y_t;                                         \
+                                                                              \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, pix_type,              \
+			   dst_stride, dst_line, 1);                          \
+    src_x_t = src_y + pixman_fixed_to_int (                                   \
+				src_image->common.transform->matrix[0][2] +   \
+				pixman_fixed_1 / 2 - pixman_fixed_e);         \
+    src_y_t = -src_x + pixman_fixed_to_int (                                  \
+				src_image->common.transform->matrix[1][2] +   \
+				pixman_fixed_1 / 2 - pixman_fixed_e) - width; \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type,             \
+			   src_stride, src_line, 1);                          \
+    blt_rotated_270_##suffix (dst_line, dst_stride, src_line, src_stride,     \
+			      width, height);                                 \
+}
+
+/* Generate the rotation helpers for 8-, 16- and 32-bit pixel types. */
+FAST_SIMPLE_ROTATE (8, uint8_t)
+FAST_SIMPLE_ROTATE (565, uint16_t)
+FAST_SIMPLE_ROTATE (8888, uint32_t)
+
+static const pixman_fast_path_t c_fast_paths[] =
+{
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, fast_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, fast_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, fast_composite_over_n_8_0888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, fast_composite_over_n_8_0888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, fast_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, fast_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, fast_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, fast_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, fast_composite_over_n_1_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, fast_composite_over_n_1_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, fast_composite_over_n_1_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, fast_composite_over_n_1_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5,   fast_composite_over_n_1_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5,   fast_composite_over_n_1_0565),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, fast_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, fast_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, fast_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, fast_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, fast_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, fast_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, fast_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, fast_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, fast_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, fast_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, fast_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, fast_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, fast_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, fast_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, fast_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8),
+    PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1000_1000),
+    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8r8g8b8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, fast_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, fast_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8x8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8a8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8),
+    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, 8888_8888),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, 8888_8888),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, 8888_565),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, 8888_565),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565),
+
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, x8r8g8b8, a8r8g8b8, x888_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, x8b8g8r8, a8b8g8r8, x888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, x8r8g8b8, a8r8g8b8, x888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, x8b8g8r8, a8b8g8r8, x888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, x888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, x888_8888),
+
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, 8888_8888),
+
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565),
+
+#define NEAREST_FAST_PATH(op,s,d)		\
+    {   PIXMAN_OP_ ## op,			\
+	PIXMAN_ ## s, SCALED_NEAREST_FLAGS,	\
+	PIXMAN_null, 0,				\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,	\
+	fast_composite_scaled_nearest,		\
+    }
+
+    NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8),
+    NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8),
+    NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8),
+    NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8),
+
+    NEAREST_FAST_PATH (SRC, x8r8g8b8, a8r8g8b8),
+    NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8),
+    NEAREST_FAST_PATH (SRC, x8b8g8r8, a8b8g8r8),
+    NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8),
+
+    NEAREST_FAST_PATH (OVER, x8r8g8b8, x8r8g8b8),
+    NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8),
+    NEAREST_FAST_PATH (OVER, x8b8g8r8, x8b8g8r8),
+    NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8),
+
+    NEAREST_FAST_PATH (OVER, x8r8g8b8, a8r8g8b8),
+    NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8),
+    NEAREST_FAST_PATH (OVER, x8b8g8r8, a8b8g8r8),
+    NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8),
+
+#define SIMPLE_ROTATE_FLAGS(angle)					  \
+    (FAST_PATH_ROTATE_ ## angle ## _TRANSFORM	|			  \
+     FAST_PATH_NEAREST_FILTER			|			  \
+     FAST_PATH_SAMPLES_COVER_CLIP		|			  \
+     FAST_PATH_STANDARD_FLAGS)
+
+#define SIMPLE_ROTATE_FAST_PATH(op,s,d,suffix)				  \
+    {   PIXMAN_OP_ ## op,						  \
+	PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (90),				  \
+	PIXMAN_null, 0,							  \
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				  \
+	fast_composite_rotate_90_##suffix,				  \
+    },									  \
+    {   PIXMAN_OP_ ## op,						  \
+	PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (270),			  \
+	PIXMAN_null, 0,							  \
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				  \
+	fast_composite_rotate_270_##suffix,				  \
+    }
+
+    SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888),
+    SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888),
+    SIMPLE_ROTATE_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888),
+    SIMPLE_ROTATE_FAST_PATH (SRC, r5g6b5, r5g6b5, 565),
+    SIMPLE_ROTATE_FAST_PATH (SRC, a8, a8, 8),
+
+    {   PIXMAN_OP_NONE	},
+};
+
+#ifdef WORDS_BIGENDIAN /* pixel 0 is the most significant bit of a word */
+#define A1_FILL_MASK(n, offs) (((1U << (n)) - 1) << (32 - (offs) - (n)))
+#else /* pixel 0 is the least significant bit; 1U keeps n == 31 defined */
+#define A1_FILL_MASK(n, offs) (((1U << (n)) - 1) << (offs))
+#endif /* A1_FILL_MASK: n consecutive set bits starting at pixel offs */
+
+/* Set (v != 0) or clear (v == 0) a run of width bits in a 1 bpp scanline,
+ * starting offs bits into the word at dst. */
+static force_inline void
+pixman_fill1_line (uint32_t *dst, int offs, int width, int v)
+{
+    const uint32_t whole_word = v ? 0xFFFFFFFF : 0;
+    uint32_t mask;
+
+    if (offs)
+    {
+	/* Unaligned start: this word has only 32 - offs bits left. */
+	int room = 32 - offs;
+
+	mask = A1_FILL_MASK (room >= width ? width : room, offs);
+	if (v)
+	    *dst |= mask;
+	else
+	    *dst &= ~mask;
+
+	if (room >= width)
+	    return;		/* the whole run fit in the first word */
+	dst++;
+	width -= room;
+    }
+
+    /* Whole words in the middle are stored without masking. */
+    for (; width >= 32; width -= 32)
+	*dst++ = whole_word;
+
+    if (width > 0)
+    {
+	/* Trailing partial word: the run starts at bit offset 0. */
+	mask = A1_FILL_MASK (width, 0);
+	if (v)
+	    *dst |= mask;
+	else
+	    *dst &= ~mask;
+    }
+}
+
+/* Fill a width x height rectangle at (x, y) of a 1 bpp buffer with bit 0
+ * of xor; rows are stride uint32_t words apart.
+ */
+static void
+pixman_fill1 (uint32_t *bits,
+              int       stride,
+              int       x,
+              int       y,
+              int       width,
+              int       height,
+              uint32_t  xor)
+{
+    const int first_bit = x & 31;	/* bit offset inside the first word */
+    uint32_t *row = bits + y * stride + (x >> 5);
+
+    /* Both branches run the same loop; they differ only in the literal
+     * passed as v (presumably so the inlined helper sees a constant).
+     */
+    if (xor & 1)
+    {
+	for (; height--; row += stride)
+	    pixman_fill1_line (row, first_bit, width, 1);
+    }
+    else
+    {
+	for (; height--; row += stride)
+	    pixman_fill1_line (row, first_bit, width, 0);
+    }
+}
+
+/* Fill a width x height rectangle at (x, y) of an 8 bpp buffer with the
+ * low byte of xor.  stride is given in uint32_t words. */
+static void
+pixman_fill8 (uint32_t *bits,
+              int       stride,
+              int       x,
+              int       y,
+              int       width,
+              int       height,
+              uint32_t xor)
+{
+    /* Rows are addressed in bytes, so convert the word stride once. */
+    const int row_bytes = stride * (int) sizeof (uint32_t);
+    const uint8_t fill = xor & 0xff;
+    uint8_t *row = (uint8_t *) bits + y * row_bytes + x;
+
+    for (; height--; row += row_bytes)
+    {
+	int col;
+
+	for (col = 0; col < width; ++col)
+	    row[col] = fill;
+    }
+}
+
+/* Fill a width x height rectangle at (x, y) of a 16 bpp buffer with the
+ * low 16 bits of xor.  stride is given in uint32_t words. */
+static void
+pixman_fill16 (uint32_t *bits,
+               int       stride,
+               int       x,
+               int       y,
+               int       width,
+               int       height,
+               uint32_t xor)
+{
+    /* Rows are addressed in uint16_t units, so rescale the word stride. */
+    const int row_shorts =
+	(stride * (int)sizeof (uint32_t)) / (int)sizeof (uint16_t);
+    const uint16_t fill = xor & 0xffff;
+    uint16_t *row = (uint16_t *)bits + y * row_shorts + x;
+
+    for (; height--; row += row_shorts)
+    {
+	int col;
+
+	for (col = 0; col < width; ++col)
+	    row[col] = fill;
+    }
+}
+
+/* Fill a width x height rectangle at (x, y) of a 32 bpp buffer with xor.
+ * stride is the row pitch in uint32_t words. */
+static void
+pixman_fill32 (uint32_t *bits,
+               int       stride,
+               int       x,
+               int       y,
+               int       width,
+               int       height,
+               uint32_t  xor)
+{
+    uint32_t *row = bits + y * stride + x;
+
+    for (; height--; row += stride)
+    {
+	int col;
+
+	for (col = 0; col < width; ++col)
+	    row[col] = xor;
+    }
+}
+
+/* Fill hook: handles 1, 8, 16 and 32 bpp here and returns TRUE; any other
+ * depth is passed to imp->delegate and that call's result is returned. */
+static pixman_bool_t
+fast_path_fill (pixman_implementation_t *imp,
+                uint32_t *               bits,
+                int                      stride,
+                int                      bpp,
+                int                      x,
+                int                      y,
+                int                      width,
+                int                      height,
+                uint32_t		 xor)
+{
+    if (bpp == 1)
+    {
+	pixman_fill1 (bits, stride, x, y, width, height, xor);
+    }
+    else if (bpp == 8)
+    {
+	pixman_fill8 (bits, stride, x, y, width, height, xor);
+    }
+    else if (bpp == 16)
+    {
+	pixman_fill16 (bits, stride, x, y, width, height, xor);
+    }
+    else if (bpp == 32)
+    {
+	pixman_fill32 (bits, stride, x, y, width, height, xor);
+    }
+    else
+    {
+	return _pixman_implementation_fill (
+	    imp->delegate, bits, stride, bpp, x, y, width, height, xor);
+    }
+
+    return TRUE;
+}
+
+/* Create the implementation exposing c_fast_paths and fast_path_fill. */
+pixman_implementation_t *
+_pixman_implementation_create_fast_path (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *fast_impl =
+	_pixman_implementation_create (fallback, c_fast_paths);
+    fast_impl->fill = fast_path_fill;
+    return fast_impl;
+}
diff --git a/pixman/pixman/pixman-image.c b/pixman/pixman/pixman-image.c
index 584150dca..84bacf87e 100644
--- a/pixman/pixman/pixman-image.c
+++ b/pixman/pixman/pixman-image.c
@@ -1,781 +1,781 @@
-/*
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007 Red Hat, Inc.
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of SuSE not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission.  SuSE makes no representations about the
- * suitability of this software for any purpose.  It is provided "as is"
- * without express or implied warranty.
- *
- * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
- * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-
-#include "pixman-private.h"
-
-pixman_bool_t
-_pixman_init_gradient (gradient_t *                  gradient,
-                       const pixman_gradient_stop_t *stops,
-                       int                           n_stops)
-{
-    return_val_if_fail (n_stops > 0, FALSE);
-
-    gradient->stops = pixman_malloc_ab (n_stops, sizeof (pixman_gradient_stop_t));
-    if (!gradient->stops)
-	return FALSE;
-
-    memcpy (gradient->stops, stops, n_stops * sizeof (pixman_gradient_stop_t));
-
-    gradient->n_stops = n_stops;
-
-    return TRUE;
-}
-
-pixman_image_t *
-_pixman_image_allocate (void)
-{
-    pixman_image_t *image = malloc (sizeof (pixman_image_t));
-
-    if (image)
-    {
-	image_common_t *common = &image->common;
-
-	pixman_region32_init (&common->clip_region);
-
-	common->alpha_count = 0;
-	common->have_clip_region = FALSE;
-	common->clip_sources = FALSE;
-	common->transform = NULL;
-	common->repeat = PIXMAN_REPEAT_NONE;
-	common->filter = PIXMAN_FILTER_NEAREST;
-	common->filter_params = NULL;
-	common->n_filter_params = 0;
-	common->alpha_map = NULL;
-	common->component_alpha = FALSE;
-	common->ref_count = 1;
-	common->property_changed = NULL;
-	common->client_clip = FALSE;
-	common->destroy_func = NULL;
-	common->destroy_data = NULL;
-	common->dirty = TRUE;
-    }
-
-    return image;
-}
-
-static void
-image_property_changed (pixman_image_t *image)
-{
-    image->common.dirty = TRUE;
-}
-
-/* Ref Counting */
-PIXMAN_EXPORT pixman_image_t *
-pixman_image_ref (pixman_image_t *image)
-{
-    image->common.ref_count++;
-
-    return image;
-}
-
-/* returns TRUE when the image is freed */
-PIXMAN_EXPORT pixman_bool_t
-pixman_image_unref (pixman_image_t *image)
-{
-    image_common_t *common = (image_common_t *)image;
-
-    common->ref_count--;
-
-    if (common->ref_count == 0)
-    {
-	if (image->common.destroy_func)
-	    image->common.destroy_func (image, image->common.destroy_data);
-
-	pixman_region32_fini (&common->clip_region);
-
-	if (common->transform)
-	    free (common->transform);
-
-	if (common->filter_params)
-	    free (common->filter_params);
-
-	if (common->alpha_map)
-	    pixman_image_unref ((pixman_image_t *)common->alpha_map);
-
-	if (image->type == LINEAR ||
-	    image->type == RADIAL ||
-	    image->type == CONICAL)
-	{
-	    if (image->gradient.stops)
-		free (image->gradient.stops);
-	}
-
-	if (image->type == BITS && image->bits.free_me)
-	    free (image->bits.free_me);
-
-	free (image);
-
-	return TRUE;
-    }
-
-    return FALSE;
-}
-
-PIXMAN_EXPORT void
-pixman_image_set_destroy_function (pixman_image_t *            image,
-                                   pixman_image_destroy_func_t func,
-                                   void *                      data)
-{
-    image->common.destroy_func = func;
-    image->common.destroy_data = data;
-}
-
-PIXMAN_EXPORT void *
-pixman_image_get_destroy_data (pixman_image_t *image)
-{
-  return image->common.destroy_data;
-}
-
-void
-_pixman_image_reset_clip_region (pixman_image_t *image)
-{
-    image->common.have_clip_region = FALSE;
-}
-
-/* Executive Summary: This function is a no-op that only exists
- * for historical reasons.
- *
- * There used to be a bug in the X server where it would rely on
- * out-of-bounds accesses when it was asked to composite with a
- * window as the source. It would create a pixman image pointing
- * to some bogus position in memory, but then set a clip region
- * to the position where the actual bits were.
- *
- * Due to a bug in old versions of pixman, where it would not clip
- * against the image bounds when a clip region was set, this would
- * actually work. So when the pixman bug was fixed, a workaround was
- * added to allow certain out-of-bound accesses. This function disabled
- * those workarounds.
- *
- * Since 0.21.2, pixman doesn't do these workarounds anymore, so now
- * this function is a no-op.
- */
-PIXMAN_EXPORT void
-pixman_disable_out_of_bounds_workaround (void)
-{
-}
-
-static void
-compute_image_info (pixman_image_t *image)
-{
-    pixman_format_code_t code;
-    uint32_t flags = 0;
-
-    /* Transform */
-    if (!image->common.transform)
-    {
-	flags |= (FAST_PATH_ID_TRANSFORM	|
-		  FAST_PATH_X_UNIT_POSITIVE	|
-		  FAST_PATH_Y_UNIT_ZERO		|
-		  FAST_PATH_AFFINE_TRANSFORM);
-    }
-    else
-    {
-	flags |= FAST_PATH_HAS_TRANSFORM;
-
-	if (image->common.transform->matrix[2][0] == 0			&&
-	    image->common.transform->matrix[2][1] == 0			&&
-	    image->common.transform->matrix[2][2] == pixman_fixed_1)
-	{
-	    flags |= FAST_PATH_AFFINE_TRANSFORM;
-
-	    if (image->common.transform->matrix[0][1] == 0 &&
-		image->common.transform->matrix[1][0] == 0)
-	    {
-		if (image->common.transform->matrix[0][0] == -pixman_fixed_1 &&
-		    image->common.transform->matrix[1][1] == -pixman_fixed_1)
-		{
-		    flags |= FAST_PATH_ROTATE_180_TRANSFORM;
-		}
-		flags |= FAST_PATH_SCALE_TRANSFORM;
-	    }
-	    else if (image->common.transform->matrix[0][0] == 0 &&
-	             image->common.transform->matrix[1][1] == 0)
-	    {
-		pixman_fixed_t m01 = image->common.transform->matrix[0][1];
-		if (m01 == -image->common.transform->matrix[1][0])
-		{
-			if (m01 == -pixman_fixed_1)
-			    flags |= FAST_PATH_ROTATE_90_TRANSFORM;
-			else if (m01 == pixman_fixed_1)
-			    flags |= FAST_PATH_ROTATE_270_TRANSFORM;
-		}
-	    }
-	}
-
-	if (image->common.transform->matrix[0][0] > 0)
-	    flags |= FAST_PATH_X_UNIT_POSITIVE;
-
-	if (image->common.transform->matrix[1][0] == 0)
-	    flags |= FAST_PATH_Y_UNIT_ZERO;
-    }
-
-    /* Filter */
-    switch (image->common.filter)
-    {
-    case PIXMAN_FILTER_NEAREST:
-    case PIXMAN_FILTER_FAST:
-	flags |= (FAST_PATH_NEAREST_FILTER | FAST_PATH_NO_CONVOLUTION_FILTER);
-	break;
-
-    case PIXMAN_FILTER_BILINEAR:
-    case PIXMAN_FILTER_GOOD:
-    case PIXMAN_FILTER_BEST:
-	flags |= (FAST_PATH_BILINEAR_FILTER | FAST_PATH_NO_CONVOLUTION_FILTER);
-	break;
-
-    case PIXMAN_FILTER_CONVOLUTION:
-	break;
-
-    default:
-	flags |= FAST_PATH_NO_CONVOLUTION_FILTER;
-	break;
-    }
-
-    /* Repeat mode */
-    switch (image->common.repeat)
-    {
-    case PIXMAN_REPEAT_NONE:
-	flags |=
-	    FAST_PATH_NO_REFLECT_REPEAT		|
-	    FAST_PATH_NO_PAD_REPEAT		|
-	    FAST_PATH_NO_NORMAL_REPEAT;
-	break;
-
-    case PIXMAN_REPEAT_REFLECT:
-	flags |=
-	    FAST_PATH_NO_PAD_REPEAT		|
-	    FAST_PATH_NO_NONE_REPEAT		|
-	    FAST_PATH_NO_NORMAL_REPEAT;
-	break;
-
-    case PIXMAN_REPEAT_PAD:
-	flags |=
-	    FAST_PATH_NO_REFLECT_REPEAT		|
-	    FAST_PATH_NO_NONE_REPEAT		|
-	    FAST_PATH_NO_NORMAL_REPEAT;
-	break;
-
-    default:
-	flags |=
-	    FAST_PATH_NO_REFLECT_REPEAT		|
-	    FAST_PATH_NO_PAD_REPEAT		|
-	    FAST_PATH_NO_NONE_REPEAT;
-	break;
-    }
-
-    /* Component alpha */
-    if (image->common.component_alpha)
-	flags |= FAST_PATH_COMPONENT_ALPHA;
-    else
-	flags |= FAST_PATH_UNIFIED_ALPHA;
-
-    flags |= (FAST_PATH_NO_ACCESSORS | FAST_PATH_NARROW_FORMAT);
-
-    /* Type specific checks */
-    switch (image->type)
-    {
-    case SOLID:
-	code = PIXMAN_solid;
-
-	if (image->solid.color.alpha == 0xffff)
-	    flags |= FAST_PATH_IS_OPAQUE;
-	break;
-
-    case BITS:
-	if (image->bits.width == 1	&&
-	    image->bits.height == 1	&&
-	    image->common.repeat != PIXMAN_REPEAT_NONE)
-	{
-	    code = PIXMAN_solid;
-	}
-	else
-	{
-	    code = image->bits.format;
-	}
-
-	if (!PIXMAN_FORMAT_A (image->bits.format)				&&
-	    PIXMAN_FORMAT_TYPE (image->bits.format) != PIXMAN_TYPE_GRAY		&&
-	    PIXMAN_FORMAT_TYPE (image->bits.format) != PIXMAN_TYPE_COLOR)
-	{
-	    flags |= FAST_PATH_SAMPLES_OPAQUE;
-
-	    if (image->common.repeat != PIXMAN_REPEAT_NONE)
-		flags |= FAST_PATH_IS_OPAQUE;
-	}
-
-	if (image->bits.read_func || image->bits.write_func)
-	    flags &= ~FAST_PATH_NO_ACCESSORS;
-
-	if (PIXMAN_FORMAT_IS_WIDE (image->bits.format))
-	    flags &= ~FAST_PATH_NARROW_FORMAT;
-	break;
-
-    case RADIAL:
-	code = PIXMAN_unknown;
-
-	/*
-	 * As explained in pixman-radial-gradient.c, every point of
-	 * the plane has a valid associated radius (and thus will be
-	 * colored) if and only if a is negative (i.e. one of the two
-	 * circles contains the other one).
-	 */
-
-        if (image->radial.a >= 0)
-	    break;
-
-	/* Fall through */
-
-    case CONICAL:
-    case LINEAR:
-	code = PIXMAN_unknown;
-
-	if (image->common.repeat != PIXMAN_REPEAT_NONE)
-	{
-	    int i;
-
-	    flags |= FAST_PATH_IS_OPAQUE;
-	    for (i = 0; i < image->gradient.n_stops; ++i)
-	    {
-		if (image->gradient.stops[i].color.alpha != 0xffff)
-		{
-		    flags &= ~FAST_PATH_IS_OPAQUE;
-		    break;
-		}
-	    }
-	}
-	break;
-
-    default:
-	code = PIXMAN_unknown;
-	break;
-    }
-
-    /* Alpha map */
-    if (!image->common.alpha_map)
-    {
-	flags |= FAST_PATH_NO_ALPHA_MAP;
-    }
-    else
-    {
-	if (PIXMAN_FORMAT_IS_WIDE (image->common.alpha_map->format))
-	    flags &= ~FAST_PATH_NARROW_FORMAT;
-    }
-
-    /* Both alpha maps and convolution filters can introduce
-     * non-opaqueness in otherwise opaque images. Also
-     * an image with component alpha turned on is only opaque
-     * if all channels are opaque, so we simply turn it off
-     * unconditionally for those images.
-     */
-    if (image->common.alpha_map					||
-	image->common.filter == PIXMAN_FILTER_CONVOLUTION	||
-	image->common.component_alpha)
-    {
-	flags &= ~(FAST_PATH_IS_OPAQUE | FAST_PATH_SAMPLES_OPAQUE);
-    }
-
-    image->common.flags = flags;
-    image->common.extended_format_code = code;
-}
-
-void
-_pixman_image_validate (pixman_image_t *image)
-{
-    if (image->common.dirty)
-    {
-	compute_image_info (image);
-
-	/* It is important that property_changed is
-	 * called *after* compute_image_info() because
-	 * property_changed() can make use of the flags
-	 * to set up accessors etc.
-	 */
-	if (image->common.property_changed)
-	    image->common.property_changed (image);
-
-	image->common.dirty = FALSE;
-    }
-
-    if (image->common.alpha_map)
-	_pixman_image_validate ((pixman_image_t *)image->common.alpha_map);
-}
-
-PIXMAN_EXPORT pixman_bool_t
-pixman_image_set_clip_region32 (pixman_image_t *   image,
-                                pixman_region32_t *region)
-{
-    image_common_t *common = (image_common_t *)image;
-    pixman_bool_t result;
-
-    if (region)
-    {
-	if ((result = pixman_region32_copy (&common->clip_region, region)))
-	    image->common.have_clip_region = TRUE;
-    }
-    else
-    {
-	_pixman_image_reset_clip_region (image);
-
-	result = TRUE;
-    }
-
-    image_property_changed (image);
-
-    return result;
-}
-
-PIXMAN_EXPORT pixman_bool_t
-pixman_image_set_clip_region (pixman_image_t *   image,
-                              pixman_region16_t *region)
-{
-    image_common_t *common = (image_common_t *)image;
-    pixman_bool_t result;
-
-    if (region)
-    {
-	if ((result = pixman_region32_copy_from_region16 (&common->clip_region, region)))
-	    image->common.have_clip_region = TRUE;
-    }
-    else
-    {
-	_pixman_image_reset_clip_region (image);
-
-	result = TRUE;
-    }
-
-    image_property_changed (image);
-
-    return result;
-}
-
-PIXMAN_EXPORT void
-pixman_image_set_has_client_clip (pixman_image_t *image,
-                                  pixman_bool_t   client_clip)
-{
-    image->common.client_clip = client_clip;
-}
-
-PIXMAN_EXPORT pixman_bool_t
-pixman_image_set_transform (pixman_image_t *          image,
-                            const pixman_transform_t *transform)
-{
-    static const pixman_transform_t id =
-    {
-	{ { pixman_fixed_1, 0, 0 },
-	  { 0, pixman_fixed_1, 0 },
-	  { 0, 0, pixman_fixed_1 } }
-    };
-
-    image_common_t *common = (image_common_t *)image;
-    pixman_bool_t result;
-
-    if (common->transform == transform)
-	return TRUE;
-
-    if (!transform || memcmp (&id, transform, sizeof (pixman_transform_t)) == 0)
-    {
-	free (common->transform);
-	common->transform = NULL;
-	result = TRUE;
-
-	goto out;
-    }
-
-    if (common->transform &&
-	memcmp (common->transform, transform, sizeof (pixman_transform_t) == 0))
-    {
-	return TRUE;
-    }
-
-    if (common->transform == NULL)
-	common->transform = malloc (sizeof (pixman_transform_t));
-
-    if (common->transform == NULL)
-    {
-	result = FALSE;
-
-	goto out;
-    }
-
-    memcpy (common->transform, transform, sizeof(pixman_transform_t));
-
-    result = TRUE;
-
-out:
-    image_property_changed (image);
-
-    return result;
-}
-
-PIXMAN_EXPORT void
-pixman_image_set_repeat (pixman_image_t *image,
-                         pixman_repeat_t repeat)
-{
-    if (image->common.repeat == repeat)
-	return;
-
-    image->common.repeat = repeat;
-
-    image_property_changed (image);
-}
-
-PIXMAN_EXPORT pixman_bool_t
-pixman_image_set_filter (pixman_image_t *      image,
-                         pixman_filter_t       filter,
-                         const pixman_fixed_t *params,
-                         int                   n_params)
-{
-    image_common_t *common = (image_common_t *)image;
-    pixman_fixed_t *new_params;
-
-    if (params == common->filter_params && filter == common->filter)
-	return TRUE;
-
-    new_params = NULL;
-    if (params)
-    {
-	new_params = pixman_malloc_ab (n_params, sizeof (pixman_fixed_t));
-	if (!new_params)
-	    return FALSE;
-
-	memcpy (new_params,
-	        params, n_params * sizeof (pixman_fixed_t));
-    }
-
-    common->filter = filter;
-
-    if (common->filter_params)
-	free (common->filter_params);
-
-    common->filter_params = new_params;
-    common->n_filter_params = n_params;
-
-    image_property_changed (image);
-    return TRUE;
-}
-
-PIXMAN_EXPORT void
-pixman_image_set_source_clipping (pixman_image_t *image,
-                                  pixman_bool_t   clip_sources)
-{
-    if (image->common.clip_sources == clip_sources)
-	return;
-
-    image->common.clip_sources = clip_sources;
-
-    image_property_changed (image);
-}
-
-/* Unlike all the other property setters, this function does not
- * copy the content of indexed. Doing this copying is simply
- * way, way too expensive.
- */
-PIXMAN_EXPORT void
-pixman_image_set_indexed (pixman_image_t *        image,
-                          const pixman_indexed_t *indexed)
-{
-    bits_image_t *bits = (bits_image_t *)image;
-
-    if (bits->indexed == indexed)
-	return;
-
-    bits->indexed = indexed;
-
-    image_property_changed (image);
-}
-
-PIXMAN_EXPORT void
-pixman_image_set_alpha_map (pixman_image_t *image,
-                            pixman_image_t *alpha_map,
-                            int16_t         x,
-                            int16_t         y)
-{
-    image_common_t *common = (image_common_t *)image;
-
-    return_if_fail (!alpha_map || alpha_map->type == BITS);
-
-    if (alpha_map && common->alpha_count > 0)
-    {
-	/* If this image is being used as an alpha map itself,
-	 * then you can't give it an alpha map of its own.
-	 */
-	return;
-    }
-
-    if (alpha_map && alpha_map->common.alpha_map)
-    {
-	/* If the image has an alpha map of its own,
-	 * then it can't be used as an alpha map itself
-	 */
-	return;
-    }
-
-    if (common->alpha_map != (bits_image_t *)alpha_map)
-    {
-	if (common->alpha_map)
-	{
-	    common->alpha_map->common.alpha_count--;
-
-	    pixman_image_unref ((pixman_image_t *)common->alpha_map);
-	}
-
-	if (alpha_map)
-	{
-	    common->alpha_map = (bits_image_t *)pixman_image_ref (alpha_map);
-
-	    common->alpha_map->common.alpha_count++;
-	}
-	else
-	{
-	    common->alpha_map = NULL;
-	}
-    }
-
-    common->alpha_origin_x = x;
-    common->alpha_origin_y = y;
-
-    image_property_changed (image);
-}
-
-PIXMAN_EXPORT void
-pixman_image_set_component_alpha   (pixman_image_t *image,
-                                    pixman_bool_t   component_alpha)
-{
-    if (image->common.component_alpha == component_alpha)
-	return;
-
-    image->common.component_alpha = component_alpha;
-
-    image_property_changed (image);
-}
-
-PIXMAN_EXPORT pixman_bool_t
-pixman_image_get_component_alpha   (pixman_image_t       *image)
-{
-    return image->common.component_alpha;
-}
-
-PIXMAN_EXPORT void
-pixman_image_set_accessors (pixman_image_t *           image,
-                            pixman_read_memory_func_t  read_func,
-                            pixman_write_memory_func_t write_func)
-{
-    return_if_fail (image != NULL);
-
-    if (image->type == BITS)
-    {
-	image->bits.read_func = read_func;
-	image->bits.write_func = write_func;
-
-	image_property_changed (image);
-    }
-}
-
-PIXMAN_EXPORT uint32_t *
-pixman_image_get_data (pixman_image_t *image)
-{
-    if (image->type == BITS)
-	return image->bits.bits;
-
-    return NULL;
-}
-
-PIXMAN_EXPORT int
-pixman_image_get_width (pixman_image_t *image)
-{
-    if (image->type == BITS)
-	return image->bits.width;
-
-    return 0;
-}
-
-PIXMAN_EXPORT int
-pixman_image_get_height (pixman_image_t *image)
-{
-    if (image->type == BITS)
-	return image->bits.height;
-
-    return 0;
-}
-
-PIXMAN_EXPORT int
-pixman_image_get_stride (pixman_image_t *image)
-{
-    if (image->type == BITS)
-	return image->bits.rowstride * (int) sizeof (uint32_t);
-
-    return 0;
-}
-
-PIXMAN_EXPORT int
-pixman_image_get_depth (pixman_image_t *image)
-{
-    if (image->type == BITS)
-	return PIXMAN_FORMAT_DEPTH (image->bits.format);
-
-    return 0;
-}
-
-PIXMAN_EXPORT pixman_format_code_t
-pixman_image_get_format (pixman_image_t *image)
-{
-    if (image->type == BITS)
-	return image->bits.format;
-
-    return 0;
-}
-
-uint32_t
-_pixman_image_get_solid (pixman_implementation_t *imp,
-			 pixman_image_t *         image,
-                         pixman_format_code_t     format)
-{
-    uint32_t result;
-    pixman_iter_t iter;
-
-    _pixman_implementation_src_iter_init (
-	imp, &iter, image, 0, 0, 1, 1,
-	(uint8_t *)&result, ITER_NARROW);
-
-    result = *iter.get_scanline (&iter, NULL);
-
-    /* If necessary, convert RGB <--> BGR. */
-    if (PIXMAN_FORMAT_TYPE (format) != PIXMAN_TYPE_ARGB)
-    {
-	result = (((result & 0xff000000) >>  0) |
-	          ((result & 0x00ff0000) >> 16) |
-	          ((result & 0x0000ff00) >>  0) |
-	          ((result & 0x000000ff) << 16));
-    }
-
-    return result;
-}
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include "pixman-private.h"
+
+pixman_bool_t
+_pixman_init_gradient (gradient_t *                  gradient,
+                       const pixman_gradient_stop_t *stops,
+                       int                           n_stops)
+{
+    return_val_if_fail (n_stops > 0, FALSE);
+
+    gradient->stops = pixman_malloc_ab (n_stops, sizeof (pixman_gradient_stop_t));
+    if (!gradient->stops)
+	return FALSE;
+
+    memcpy (gradient->stops, stops, n_stops * sizeof (pixman_gradient_stop_t));
+
+    gradient->n_stops = n_stops;
+
+    return TRUE;
+}
+
+pixman_image_t *
+_pixman_image_allocate (void)
+{
+    pixman_image_t *image = malloc (sizeof (pixman_image_t));
+
+    if (image)
+    {
+	image_common_t *common = &image->common;
+
+	pixman_region32_init (&common->clip_region);
+
+	common->alpha_count = 0;
+	common->have_clip_region = FALSE;
+	common->clip_sources = FALSE;
+	common->transform = NULL;
+	common->repeat = PIXMAN_REPEAT_NONE;
+	common->filter = PIXMAN_FILTER_NEAREST;
+	common->filter_params = NULL;
+	common->n_filter_params = 0;
+	common->alpha_map = NULL;
+	common->component_alpha = FALSE;
+	common->ref_count = 1;
+	common->property_changed = NULL;
+	common->client_clip = FALSE;
+	common->destroy_func = NULL;
+	common->destroy_data = NULL;
+	common->dirty = TRUE;
+    }
+
+    return image;
+}
+
+static void
+image_property_changed (pixman_image_t *image)
+{
+    image->common.dirty = TRUE;
+}
+
+/* Ref Counting */
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_ref (pixman_image_t *image)
+{
+    image->common.ref_count++;
+
+    return image;
+}
+
+/* returns TRUE when the image is freed */
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_unref (pixman_image_t *image)
+{
+    image_common_t *common = (image_common_t *)image;
+
+    common->ref_count--;
+
+    if (common->ref_count == 0)
+    {
+	if (image->common.destroy_func)
+	    image->common.destroy_func (image, image->common.destroy_data);
+
+	pixman_region32_fini (&common->clip_region);
+
+	if (common->transform)
+	    free (common->transform);
+
+	if (common->filter_params)
+	    free (common->filter_params);
+
+	if (common->alpha_map)
+	    pixman_image_unref ((pixman_image_t *)common->alpha_map);
+
+	if (image->type == LINEAR ||
+	    image->type == RADIAL ||
+	    image->type == CONICAL)
+	{
+	    if (image->gradient.stops)
+		free (image->gradient.stops);
+	}
+
+	if (image->type == BITS && image->bits.free_me)
+	    free (image->bits.free_me);
+
+	free (image);
+
+	return TRUE;
+    }
+
+    return FALSE;
+}
+
+PIXMAN_EXPORT void
+pixman_image_set_destroy_function (pixman_image_t *            image,
+                                   pixman_image_destroy_func_t func,
+                                   void *                      data)
+{
+    image->common.destroy_func = func;
+    image->common.destroy_data = data;
+}
+
+PIXMAN_EXPORT void *
+pixman_image_get_destroy_data (pixman_image_t *image)
+{
+  return image->common.destroy_data;
+}
+
+void
+_pixman_image_reset_clip_region (pixman_image_t *image)
+{
+    image->common.have_clip_region = FALSE;
+}
+
+/* Executive Summary: This function is a no-op that only exists
+ * for historical reasons.
+ *
+ * There used to be a bug in the X server where it would rely on
+ * out-of-bounds accesses when it was asked to composite with a
+ * window as the source. It would create a pixman image pointing
+ * to some bogus position in memory, but then set a clip region
+ * to the position where the actual bits were.
+ *
+ * Due to a bug in old versions of pixman, where it would not clip
+ * against the image bounds when a clip region was set, this would
+ * actually work. So when the pixman bug was fixed, a workaround was
+ * added to allow certain out-of-bound accesses. This function disabled
+ * those workarounds.
+ *
+ * Since 0.21.2, pixman doesn't do these workarounds anymore, so now
+ * this function is a no-op.
+ */
+PIXMAN_EXPORT void
+pixman_disable_out_of_bounds_workaround (void)
+{
+}
+
+static void
+compute_image_info (pixman_image_t *image)
+{
+    pixman_format_code_t code;
+    uint32_t flags = 0;
+
+    /* Transform */
+    if (!image->common.transform)
+    {
+	flags |= (FAST_PATH_ID_TRANSFORM	|
+		  FAST_PATH_X_UNIT_POSITIVE	|
+		  FAST_PATH_Y_UNIT_ZERO		|
+		  FAST_PATH_AFFINE_TRANSFORM);
+    }
+    else
+    {
+	flags |= FAST_PATH_HAS_TRANSFORM;
+
+	if (image->common.transform->matrix[2][0] == 0			&&
+	    image->common.transform->matrix[2][1] == 0			&&
+	    image->common.transform->matrix[2][2] == pixman_fixed_1)
+	{
+	    flags |= FAST_PATH_AFFINE_TRANSFORM;
+
+	    if (image->common.transform->matrix[0][1] == 0 &&
+		image->common.transform->matrix[1][0] == 0)
+	    {
+		if (image->common.transform->matrix[0][0] == -pixman_fixed_1 &&
+		    image->common.transform->matrix[1][1] == -pixman_fixed_1)
+		{
+		    flags |= FAST_PATH_ROTATE_180_TRANSFORM;
+		}
+		flags |= FAST_PATH_SCALE_TRANSFORM;
+	    }
+	    else if (image->common.transform->matrix[0][0] == 0 &&
+	             image->common.transform->matrix[1][1] == 0)
+	    {
+		pixman_fixed_t m01 = image->common.transform->matrix[0][1];
+		if (m01 == -image->common.transform->matrix[1][0])
+		{
+			if (m01 == -pixman_fixed_1)
+			    flags |= FAST_PATH_ROTATE_90_TRANSFORM;
+			else if (m01 == pixman_fixed_1)
+			    flags |= FAST_PATH_ROTATE_270_TRANSFORM;
+		}
+	    }
+	}
+
+	if (image->common.transform->matrix[0][0] > 0)
+	    flags |= FAST_PATH_X_UNIT_POSITIVE;
+
+	if (image->common.transform->matrix[1][0] == 0)
+	    flags |= FAST_PATH_Y_UNIT_ZERO;
+    }
+
+    /* Filter */
+    switch (image->common.filter)
+    {
+    case PIXMAN_FILTER_NEAREST:
+    case PIXMAN_FILTER_FAST:
+	flags |= (FAST_PATH_NEAREST_FILTER | FAST_PATH_NO_CONVOLUTION_FILTER);
+	break;
+
+    case PIXMAN_FILTER_BILINEAR:
+    case PIXMAN_FILTER_GOOD:
+    case PIXMAN_FILTER_BEST:
+	flags |= (FAST_PATH_BILINEAR_FILTER | FAST_PATH_NO_CONVOLUTION_FILTER);
+	break;
+
+    case PIXMAN_FILTER_CONVOLUTION:
+	break;
+
+    default:
+	flags |= FAST_PATH_NO_CONVOLUTION_FILTER;
+	break;
+    }
+
+    /* Repeat mode */
+    switch (image->common.repeat)
+    {
+    case PIXMAN_REPEAT_NONE:
+	flags |=
+	    FAST_PATH_NO_REFLECT_REPEAT		|
+	    FAST_PATH_NO_PAD_REPEAT		|
+	    FAST_PATH_NO_NORMAL_REPEAT;
+	break;
+
+    case PIXMAN_REPEAT_REFLECT:
+	flags |=
+	    FAST_PATH_NO_PAD_REPEAT		|
+	    FAST_PATH_NO_NONE_REPEAT		|
+	    FAST_PATH_NO_NORMAL_REPEAT;
+	break;
+
+    case PIXMAN_REPEAT_PAD:
+	flags |=
+	    FAST_PATH_NO_REFLECT_REPEAT		|
+	    FAST_PATH_NO_NONE_REPEAT		|
+	    FAST_PATH_NO_NORMAL_REPEAT;
+	break;
+
+    default:
+	flags |=
+	    FAST_PATH_NO_REFLECT_REPEAT		|
+	    FAST_PATH_NO_PAD_REPEAT		|
+	    FAST_PATH_NO_NONE_REPEAT;
+	break;
+    }
+
+    /* Component alpha */
+    if (image->common.component_alpha)
+	flags |= FAST_PATH_COMPONENT_ALPHA;
+    else
+	flags |= FAST_PATH_UNIFIED_ALPHA;
+
+    flags |= (FAST_PATH_NO_ACCESSORS | FAST_PATH_NARROW_FORMAT);
+
+    /* Type specific checks */
+    switch (image->type)
+    {
+    case SOLID:
+	code = PIXMAN_solid;
+
+	if (image->solid.color.alpha == 0xffff)
+	    flags |= FAST_PATH_IS_OPAQUE;
+	break;
+
+    case BITS:
+	if (image->bits.width == 1	&&
+	    image->bits.height == 1	&&
+	    image->common.repeat != PIXMAN_REPEAT_NONE)
+	{
+	    code = PIXMAN_solid;
+	}
+	else
+	{
+	    code = image->bits.format;
+	}
+
+	if (!PIXMAN_FORMAT_A (image->bits.format)				&&
+	    PIXMAN_FORMAT_TYPE (image->bits.format) != PIXMAN_TYPE_GRAY		&&
+	    PIXMAN_FORMAT_TYPE (image->bits.format) != PIXMAN_TYPE_COLOR)
+	{
+	    flags |= FAST_PATH_SAMPLES_OPAQUE;
+
+	    if (image->common.repeat != PIXMAN_REPEAT_NONE)
+		flags |= FAST_PATH_IS_OPAQUE;
+	}
+
+	if (image->bits.read_func || image->bits.write_func)
+	    flags &= ~FAST_PATH_NO_ACCESSORS;
+
+	if (PIXMAN_FORMAT_IS_WIDE (image->bits.format))
+	    flags &= ~FAST_PATH_NARROW_FORMAT;
+	break;
+
+    case RADIAL:
+	code = PIXMAN_unknown;
+
+	/*
+	 * As explained in pixman-radial-gradient.c, every point of
+	 * the plane has a valid associated radius (and thus will be
+	 * colored) if and only if a is negative (i.e. one of the two
+	 * circles contains the other one).
+	 */
+
+        if (image->radial.a >= 0)
+	    break;
+
+	/* Fall through */
+
+    case CONICAL:
+    case LINEAR:
+	code = PIXMAN_unknown;
+
+	if (image->common.repeat != PIXMAN_REPEAT_NONE)
+	{
+	    int i;
+
+	    flags |= FAST_PATH_IS_OPAQUE;
+	    for (i = 0; i < image->gradient.n_stops; ++i)
+	    {
+		if (image->gradient.stops[i].color.alpha != 0xffff)
+		{
+		    flags &= ~FAST_PATH_IS_OPAQUE;
+		    break;
+		}
+	    }
+	}
+	break;
+
+    default:
+	code = PIXMAN_unknown;
+	break;
+    }
+
+    /* Alpha map */
+    if (!image->common.alpha_map)
+    {
+	flags |= FAST_PATH_NO_ALPHA_MAP;
+    }
+    else
+    {
+	if (PIXMAN_FORMAT_IS_WIDE (image->common.alpha_map->format))
+	    flags &= ~FAST_PATH_NARROW_FORMAT;
+    }
+
+    /* Both alpha maps and convolution filters can introduce
+     * non-opaqueness in otherwise opaque images. Also
+     * an image with component alpha turned on is only opaque
+     * if all channels are opaque, so we simply turn it off
+     * unconditionally for those images.
+     */
+    if (image->common.alpha_map					||
+	image->common.filter == PIXMAN_FILTER_CONVOLUTION	||
+	image->common.component_alpha)
+    {
+	flags &= ~(FAST_PATH_IS_OPAQUE | FAST_PATH_SAMPLES_OPAQUE);
+    }
+
+    image->common.flags = flags;
+    image->common.extended_format_code = code;
+}
+
+void
+_pixman_image_validate (pixman_image_t *image)
+{
+    if (image->common.dirty)
+    {
+	compute_image_info (image);
+
+	/* It is important that property_changed is
+	 * called *after* compute_image_info() because
+	 * property_changed() can make use of the flags
+	 * to set up accessors etc.
+	 */
+	if (image->common.property_changed)
+	    image->common.property_changed (image);
+
+	image->common.dirty = FALSE;
+    }
+
+    if (image->common.alpha_map)
+	_pixman_image_validate ((pixman_image_t *)image->common.alpha_map);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_set_clip_region32 (pixman_image_t *   image,
+                                pixman_region32_t *region)
+{
+    image_common_t *common = (image_common_t *)image;
+    pixman_bool_t result;
+
+    if (region)
+    {
+	if ((result = pixman_region32_copy (&common->clip_region, region)))
+	    image->common.have_clip_region = TRUE;
+    }
+    else
+    {
+	_pixman_image_reset_clip_region (image);
+
+	result = TRUE;
+    }
+
+    image_property_changed (image);
+
+    return result;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_set_clip_region (pixman_image_t *   image,
+                              pixman_region16_t *region)
+{
+    image_common_t *common = (image_common_t *)image;
+    pixman_bool_t result;
+
+    if (region)
+    {
+	if ((result = pixman_region32_copy_from_region16 (&common->clip_region, region)))
+	    image->common.have_clip_region = TRUE;
+    }
+    else
+    {
+	_pixman_image_reset_clip_region (image);
+
+	result = TRUE;
+    }
+
+    image_property_changed (image);
+
+    return result;
+}
+
+PIXMAN_EXPORT void
+pixman_image_set_has_client_clip (pixman_image_t *image,
+                                  pixman_bool_t   client_clip)
+{
+    image->common.client_clip = client_clip;
+}
+
+/* Set the transform of @image from a copy of @transform; NULL or the
+ * identity matrix clears it. Returns FALSE only if allocation fails.
+ */
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_set_transform (pixman_image_t *          image,
+                            const pixman_transform_t *transform)
+{
+    static const pixman_transform_t id =
+    {
+	{ { pixman_fixed_1, 0, 0 },
+	  { 0, pixman_fixed_1, 0 },
+	  { 0, 0, pixman_fixed_1 } }
+    };
+
+    image_common_t *common = (image_common_t *)image;
+    pixman_bool_t result;
+
+    /* Same pointer (or both NULL): nothing to update. */
+    if (common->transform == transform)
+	return TRUE;
+
+    if (!transform || memcmp (&id, transform, sizeof (pixman_transform_t)) == 0)
+    {
+	free (common->transform);
+	common->transform = NULL;
+	result = TRUE;
+	goto out;
+    }
+
+    /* Unchanged matrix: nothing to do. Note that "== 0" applies to
+     * memcmp's result, not to its size argument. */
+    if (common->transform &&
+	memcmp (common->transform, transform, sizeof (pixman_transform_t)) == 0)
+	return TRUE;
+
+    if (common->transform == NULL)
+	common->transform = malloc (sizeof (pixman_transform_t));
+
+    if (common->transform == NULL)
+    {
+	result = FALSE;
+	goto out;
+    }
+
+    memcpy (common->transform, transform, sizeof(pixman_transform_t));
+    result = TRUE;
+
+out:
+    image_property_changed (image);
+    return result;
+}
+
+PIXMAN_EXPORT void
+pixman_image_set_repeat (pixman_image_t *image,
+                         pixman_repeat_t repeat)
+{
+    if (image->common.repeat == repeat)
+	return;
+
+    image->common.repeat = repeat;
+
+    image_property_changed (image);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_set_filter (pixman_image_t *      image,
+                         pixman_filter_t       filter,
+                         const pixman_fixed_t *params,
+                         int                   n_params)
+{
+    image_common_t *common = (image_common_t *)image;
+    pixman_fixed_t *new_params;
+
+    if (params == common->filter_params && filter == common->filter)
+	return TRUE;
+
+    new_params = NULL;
+    if (params)
+    {
+	new_params = pixman_malloc_ab (n_params, sizeof (pixman_fixed_t));
+	if (!new_params)
+	    return FALSE;
+
+	memcpy (new_params,
+	        params, n_params * sizeof (pixman_fixed_t));
+    }
+
+    common->filter = filter;
+
+    if (common->filter_params)
+	free (common->filter_params);
+
+    common->filter_params = new_params;
+    common->n_filter_params = n_params;
+
+    image_property_changed (image);
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_image_set_source_clipping (pixman_image_t *image,
+                                  pixman_bool_t   clip_sources)
+{
+    if (image->common.clip_sources == clip_sources)
+	return;
+
+    image->common.clip_sources = clip_sources;
+
+    image_property_changed (image);
+}
+
+/* Unlike all the other property setters, this function does not
+ * copy the content of indexed. Doing this copying is simply
+ * way, way too expensive.
+ */
+PIXMAN_EXPORT void
+pixman_image_set_indexed (pixman_image_t *        image,
+                          const pixman_indexed_t *indexed)
+{
+    bits_image_t *bits = (bits_image_t *)image;
+
+    if (bits->indexed == indexed)
+	return;
+
+    bits->indexed = indexed;
+
+    image_property_changed (image);
+}
+
+PIXMAN_EXPORT void
+pixman_image_set_alpha_map (pixman_image_t *image,
+                            pixman_image_t *alpha_map,
+                            int16_t         x,
+                            int16_t         y)
+{
+    image_common_t *common = (image_common_t *)image;
+
+    return_if_fail (!alpha_map || alpha_map->type == BITS);
+
+    if (alpha_map && common->alpha_count > 0)
+    {
+	/* If this image is being used as an alpha map itself,
+	 * then you can't give it an alpha map of its own.
+	 */
+	return;
+    }
+
+    if (alpha_map && alpha_map->common.alpha_map)
+    {
+	/* If the image has an alpha map of its own,
+	 * then it can't be used as an alpha map itself
+	 */
+	return;
+    }
+
+    if (common->alpha_map != (bits_image_t *)alpha_map)
+    {
+	if (common->alpha_map)
+	{
+	    common->alpha_map->common.alpha_count--;
+
+	    pixman_image_unref ((pixman_image_t *)common->alpha_map);
+	}
+
+	if (alpha_map)
+	{
+	    common->alpha_map = (bits_image_t *)pixman_image_ref (alpha_map);
+
+	    common->alpha_map->common.alpha_count++;
+	}
+	else
+	{
+	    common->alpha_map = NULL;
+	}
+    }
+
+    common->alpha_origin_x = x;
+    common->alpha_origin_y = y;
+
+    image_property_changed (image);
+}
+
+PIXMAN_EXPORT void
+pixman_image_set_component_alpha   (pixman_image_t *image,
+                                    pixman_bool_t   component_alpha)
+{
+    if (image->common.component_alpha == component_alpha)
+	return;
+
+    image->common.component_alpha = component_alpha;
+
+    image_property_changed (image);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_get_component_alpha   (pixman_image_t       *image)
+{
+    return image->common.component_alpha;
+}
+
+PIXMAN_EXPORT void
+pixman_image_set_accessors (pixman_image_t *           image,
+                            pixman_read_memory_func_t  read_func,
+                            pixman_write_memory_func_t write_func)
+{
+    return_if_fail (image != NULL);
+
+    if (image->type == BITS)
+    {
+	image->bits.read_func = read_func;
+	image->bits.write_func = write_func;
+
+	image_property_changed (image);
+    }
+}
+
+PIXMAN_EXPORT uint32_t *
+pixman_image_get_data (pixman_image_t *image)
+{
+    if (image->type == BITS)
+	return image->bits.bits;
+
+    return NULL;
+}
+
+PIXMAN_EXPORT int
+pixman_image_get_width (pixman_image_t *image)
+{
+    if (image->type == BITS)
+	return image->bits.width;
+
+    return 0;
+}
+
+PIXMAN_EXPORT int
+pixman_image_get_height (pixman_image_t *image)
+{
+    if (image->type == BITS)
+	return image->bits.height;
+
+    return 0;
+}
+
+PIXMAN_EXPORT int
+pixman_image_get_stride (pixman_image_t *image)
+{
+    if (image->type == BITS)
+	return image->bits.rowstride * (int) sizeof (uint32_t);
+
+    return 0;
+}
+
+PIXMAN_EXPORT int
+pixman_image_get_depth (pixman_image_t *image)
+{
+    if (image->type == BITS)
+	return PIXMAN_FORMAT_DEPTH (image->bits.format);
+
+    return 0;
+}
+
+PIXMAN_EXPORT pixman_format_code_t
+pixman_image_get_format (pixman_image_t *image)
+{
+    if (image->type == BITS)
+	return image->bits.format;
+
+    return 0;
+}
+
+uint32_t
+_pixman_image_get_solid (pixman_implementation_t *imp,
+			 pixman_image_t *         image,
+                         pixman_format_code_t     format)
+{
+    uint32_t result;
+    pixman_iter_t iter;
+
+    _pixman_implementation_src_iter_init (
+	imp, &iter, image, 0, 0, 1, 1,
+	(uint8_t *)&result, ITER_NARROW);
+
+    result = *iter.get_scanline (&iter, NULL);
+
+    /* If necessary, convert RGB <--> BGR. */
+    if (PIXMAN_FORMAT_TYPE (format) != PIXMAN_TYPE_ARGB)
+    {
+	result = (((result & 0xff000000) >>  0) |
+	          ((result & 0x00ff0000) >> 16) |
+	          ((result & 0x0000ff00) >>  0) |
+	          ((result & 0x000000ff) << 16));
+    }
+
+    return result;
+}
diff --git a/pixman/pixman/pixman-inlines.h b/pixman/pixman/pixman-inlines.h
index 664d4a1a6..f1e0cbd77 100644
--- a/pixman/pixman/pixman-inlines.h
+++ b/pixman/pixman/pixman-inlines.h
@@ -1,1280 +1,1280 @@
-/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
-/*
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007 Red Hat, Inc.
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of SuSE not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission.  SuSE makes no representations about the
- * suitability of this software for any purpose.  It is provided "as is"
- * without express or implied warranty.
- *
- * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
- * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
- * Author:  Keith Packard, SuSE, Inc.
- */
-
-#ifndef PIXMAN_FAST_PATH_H__
-#define PIXMAN_FAST_PATH_H__
-
-#include "pixman-private.h"
-
-#define PIXMAN_REPEAT_COVER -1
-
-/* Flags describing input parameters to fast path macro template.
- * Turning on some flag values may indicate that
- * "some property X is available so template can use this" or
- * "some property X should be handled by template".
- *
- * FLAG_HAVE_SOLID_MASK
- *  Input mask is solid so template should handle this.
- *
- * FLAG_HAVE_NON_SOLID_MASK
- *  Input mask is bits mask so template should handle this.
- *
- * FLAG_HAVE_SOLID_MASK and FLAG_HAVE_NON_SOLID_MASK are mutually
- * exclusive. (It's not allowed to turn both flags on)
- */
-#define FLAG_NONE				(0)
-#define FLAG_HAVE_SOLID_MASK			(1 <<   1)
-#define FLAG_HAVE_NON_SOLID_MASK		(1 <<   2)
-
-/* To avoid too short repeated scanline function calls, extend source
- * scanlines having width less than below constant value.
- */
-#define REPEAT_NORMAL_MIN_WIDTH			64
-
-static force_inline pixman_bool_t
-repeat (pixman_repeat_t repeat, int *c, int size)
-{
-    if (repeat == PIXMAN_REPEAT_NONE)
-    {
-	if (*c < 0 || *c >= size)
-	    return FALSE;
-    }
-    else if (repeat == PIXMAN_REPEAT_NORMAL)
-    {
-	while (*c >= size)
-	    *c -= size;
-	while (*c < 0)
-	    *c += size;
-    }
-    else if (repeat == PIXMAN_REPEAT_PAD)
-    {
-	*c = CLIP (*c, 0, size - 1);
-    }
-    else /* REFLECT */
-    {
-	*c = MOD (*c, size * 2);
-	if (*c >= size)
-	    *c = size * 2 - *c - 1;
-    }
-    return TRUE;
-}
-
-#if SIZEOF_LONG > 4
-
-static force_inline uint32_t
-bilinear_interpolation (uint32_t tl, uint32_t tr,
-			uint32_t bl, uint32_t br,
-			int distx, int disty)
-{
-    uint64_t distxy, distxiy, distixy, distixiy;
-    uint64_t tl64, tr64, bl64, br64;
-    uint64_t f, r;
-
-    distxy = distx * disty;
-    distxiy = distx * (256 - disty);
-    distixy = (256 - distx) * disty;
-    distixiy = (256 - distx) * (256 - disty);
-
-    /* Alpha and Blue */
-    tl64 = tl & 0xff0000ff;
-    tr64 = tr & 0xff0000ff;
-    bl64 = bl & 0xff0000ff;
-    br64 = br & 0xff0000ff;
-
-    f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
-    r = f & 0x0000ff0000ff0000ull;
-
-    /* Red and Green */
-    tl64 = tl;
-    tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull);
-
-    tr64 = tr;
-    tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull);
-
-    bl64 = bl;
-    bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull);
-
-    br64 = br;
-    br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull);
-
-    f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
-    r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull);
-
-    return (uint32_t)(r >> 16);
-}
-
-#else
-
-static force_inline uint32_t
-bilinear_interpolation (uint32_t tl, uint32_t tr,
-			uint32_t bl, uint32_t br,
-			int distx, int disty)
-{
-    int distxy, distxiy, distixy, distixiy;
-    uint32_t f, r;
-
-    distxy = distx * disty;
-    distxiy = (distx << 8) - distxy;	/* distx * (256 - disty) */
-    distixy = (disty << 8) - distxy;	/* disty * (256 - distx) */
-    distixiy =
-	256 * 256 - (disty << 8) -
-	(distx << 8) + distxy;		/* (256 - distx) * (256 - disty) */
-
-    /* Blue */
-    r = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
-      + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy;
-
-    /* Green */
-    f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
-      + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy;
-    r |= f & 0xff000000;
-
-    tl >>= 16;
-    tr >>= 16;
-    bl >>= 16;
-    br >>= 16;
-    r >>= 16;
-
-    /* Red */
-    f = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
-      + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy;
-    r |= f & 0x00ff0000;
-
-    /* Alpha */
-    f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
-      + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy;
-    r |= f & 0xff000000;
-
-    return r;
-}
-
-#endif
-
-/*
- * For each scanline fetched from source image with PAD repeat:
- * - calculate how many pixels need to be padded on the left side
- * - calculate how many pixels need to be padded on the right side
- * - update width to only count pixels which are fetched from the image
- * All this information is returned via 'width', 'left_pad', 'right_pad'
- * arguments. The code is assuming that 'unit_x' is positive.
- *
- * Note: 64-bit math is used in order to avoid potential overflows, which
- *       is probably excessive in many cases. This particular function
- *       may need its own correctness test and performance tuning.
- */
-static force_inline void
-pad_repeat_get_scanline_bounds (int32_t         source_image_width,
-				pixman_fixed_t  vx,
-				pixman_fixed_t  unit_x,
-				int32_t *       width,
-				int32_t *       left_pad,
-				int32_t *       right_pad)
-{
-    int64_t max_vx = (int64_t) source_image_width << 16;
-    int64_t tmp;
-    if (vx < 0)
-    {
-	tmp = ((int64_t) unit_x - 1 - vx) / unit_x;
-	if (tmp > *width)
-	{
-	    *left_pad = *width;
-	    *width = 0;
-	}
-	else
-	{
-	    *left_pad = (int32_t) tmp;
-	    *width -= (int32_t) tmp;
-	}
-    }
-    else
-    {
-	*left_pad = 0;
-    }
-    tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad;
-    if (tmp < 0)
-    {
-	*right_pad = *width;
-	*width = 0;
-    }
-    else if (tmp >= *width)
-    {
-	*right_pad = 0;
-    }
-    else
-    {
-	*right_pad = *width - (int32_t) tmp;
-	*width = (int32_t) tmp;
-    }
-}
-
-/* A macroified version of specialized nearest scalers for some
- * common 8888 and 565 formats. It supports SRC and OVER ops.
- *
- * There are two repeat versions, one that handles repeat normal,
- * and one without repeat handling that only works if the src region
- * used is completely covered by the pre-repeated source samples.
- *
- * The loops are unrolled to process two pixels per iteration for better
- * performance on most CPU architectures (superscalar processors
- * can issue several operations simultaneously, other processors can hide
- * instructions latencies by pipelining operations). Unrolling more
- * does not make much sense because the compiler will start running out
- * of spare registers soon.
- */
-
-#define GET_8888_ALPHA(s) ((s) >> 24)
- /* This is not actually used since we don't have an OVER with
-    565 source, but it is needed to build. */
-#define GET_0565_ALPHA(s) 0xff
-#define GET_x888_ALPHA(s) 0xff
-
-#define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT,			\
-			      src_type_t, dst_type_t, OP, repeat_mode)				\
-static force_inline void									\
-scanline_func_name (dst_type_t       *dst,							\
-		    const src_type_t *src,							\
-		    int32_t           w,							\
-		    pixman_fixed_t    vx,							\
-		    pixman_fixed_t    unit_x,							\
-		    pixman_fixed_t    max_vx,							\
-		    pixman_bool_t     fully_transparent_src)					\
-{												\
-	uint32_t   d;										\
-	src_type_t s1, s2;									\
-	uint8_t    a1, a2;									\
-	int        x1, x2;									\
-												\
-	if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER && fully_transparent_src)			\
-	    return;										\
-												\
-	if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER)		\
-	    abort();										\
-												\
-	while ((w -= 2) >= 0)									\
-	{											\
-	    x1 = vx >> 16;									\
-	    vx += unit_x;									\
-	    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
-	    {											\
-		/* This works because we know that unit_x is positive */			\
-		while (vx >= max_vx)								\
-		    vx -= max_vx;								\
-	    }											\
-	    s1 = src[x1];									\
-												\
-	    x2 = vx >> 16;									\
-	    vx += unit_x;									\
-	    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
-	    {											\
-		/* This works because we know that unit_x is positive */			\
-		while (vx >= max_vx)								\
-		    vx -= max_vx;								\
-	    }											\
-	    s2 = src[x2];									\
-												\
-	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
-	    {											\
-		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\
-		a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2);						\
-												\
-		if (a1 == 0xff)									\
-		{										\
-		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
-		}										\
-		else if (s1)									\
-		{										\
-		    d = CONVERT_ ## DST_FORMAT ## _TO_8888 (*dst);				\
-		    s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1);				\
-		    a1 ^= 0xff;									\
-		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\
-		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
-		}										\
-		dst++;										\
-												\
-		if (a2 == 0xff)									\
-		{										\
-		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2);			\
-		}										\
-		else if (s2)									\
-		{										\
-		    d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);				\
-		    s2 = CONVERT_## SRC_FORMAT ## _TO_8888 (s2);				\
-		    a2 ^= 0xff;									\
-		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2);					\
-		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
-		}										\
-		dst++;										\
-	    }											\
-	    else /* PIXMAN_OP_SRC */								\
-	    {											\
-		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
-		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2);			\
-	    }											\
-	}											\
-												\
-	if (w & 1)										\
-	{											\
-	    x1 = vx >> 16;									\
-	    s1 = src[x1];									\
-												\
-	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
-	    {											\
-		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\
-												\
-		if (a1 == 0xff)									\
-		{										\
-		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
-		}										\
-		else if (s1)									\
-		{										\
-		    d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);				\
-		    s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1);				\
-		    a1 ^= 0xff;									\
-		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\
-		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
-		}										\
-		dst++;										\
-	    }											\
-	    else /* PIXMAN_OP_SRC */								\
-	    {											\
-		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
-	    }											\
-	}											\
-}
-
-#define FAST_NEAREST_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
-				  dst_type_t, repeat_mode, have_mask, mask_is_solid)		\
-static void											\
-fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,		\
-						   pixman_composite_info_t *info)               \
-{												\
-    PIXMAN_COMPOSITE_ARGS (info);					                        \
-    dst_type_t *dst_line;						                        \
-    mask_type_t *mask_line;									\
-    src_type_t *src_first_line;									\
-    int       y;										\
-    pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */		\
-    pixman_fixed_t max_vy;									\
-    pixman_vector_t v;										\
-    pixman_fixed_t vx, vy;									\
-    pixman_fixed_t unit_x, unit_y;								\
-    int32_t left_pad, right_pad;								\
-												\
-    src_type_t *src;										\
-    dst_type_t *dst;										\
-    mask_type_t solid_mask;									\
-    const mask_type_t *mask = &solid_mask;							\
-    int src_stride, mask_stride, dst_stride;							\
-												\
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1);	\
-    if (have_mask)										\
-    {												\
-	if (mask_is_solid)									\
-	    solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);	\
-	else											\
-	    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t,			\
-				   mask_stride, mask_line, 1);					\
-    }												\
-    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
-     * transformed from destination space to source space */					\
-    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\
-												\
-    /* reference point is the center of the pixel */						\
-    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\
-    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\
-    v.vector[2] = pixman_fixed_1;								\
-												\
-    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\
-	return;											\
-												\
-    unit_x = src_image->common.transform->matrix[0][0];						\
-    unit_y = src_image->common.transform->matrix[1][1];						\
-												\
-    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */			\
-    v.vector[0] -= pixman_fixed_e;								\
-    v.vector[1] -= pixman_fixed_e;								\
-												\
-    vx = v.vector[0];										\
-    vy = v.vector[1];										\
-												\
-    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)					\
-    {												\
-	/* Clamp repeating positions inside the actual samples */				\
-	max_vx = src_image->bits.width << 16;							\
-	max_vy = src_image->bits.height << 16;							\
-												\
-	repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx);						\
-	repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
-    }												\
-												\
-    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD ||					\
-	PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)					\
-    {												\
-	pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x,			\
-					&width, &left_pad, &right_pad);				\
-	vx += left_pad * unit_x;								\
-    }												\
-												\
-    while (--height >= 0)									\
-    {												\
-	dst = dst_line;										\
-	dst_line += dst_stride;									\
-	if (have_mask && !mask_is_solid)							\
-	{											\
-	    mask = mask_line;									\
-	    mask_line += mask_stride;								\
-	}											\
-												\
-	y = vy >> 16;										\
-	vy += unit_y;										\
-	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
-	    repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
-	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
-	{											\
-	    repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height);				\
-	    src = src_first_line + src_stride * y;						\
-	    if (left_pad > 0)									\
-	    {											\
-		scanline_func (mask, dst, src, left_pad, 0, 0, 0, FALSE);			\
-	    }											\
-	    if (width > 0)									\
-	    {											\
-		scanline_func (mask + (mask_is_solid ? 0 : left_pad),				\
-			       dst + left_pad, src, width, vx, unit_x, 0, FALSE);		\
-	    }											\
-	    if (right_pad > 0)									\
-	    {											\
-		scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),			\
-			       dst + left_pad + width, src + src_image->bits.width - 1,		\
-			       right_pad, 0, 0, 0, FALSE);					\
-	    }											\
-	}											\
-	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)				\
-	{											\
-	    static const src_type_t zero[1] = { 0 };						\
-	    if (y < 0 || y >= src_image->bits.height)						\
-	    {											\
-		scanline_func (mask, dst, zero, left_pad + width + right_pad, 0, 0, 0, TRUE);	\
-		continue;									\
-	    }											\
-	    src = src_first_line + src_stride * y;						\
-	    if (left_pad > 0)									\
-	    {											\
-		scanline_func (mask, dst, zero, left_pad, 0, 0, 0, TRUE);			\
-	    }											\
-	    if (width > 0)									\
-	    {											\
-		scanline_func (mask + (mask_is_solid ? 0 : left_pad),				\
-			       dst + left_pad, src, width, vx, unit_x, 0, FALSE);		\
-	    }											\
-	    if (right_pad > 0)									\
-	    {											\
-		scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),			\
-			       dst + left_pad + width, zero, right_pad, 0, 0, 0, TRUE);		\
-	    }											\
-	}											\
-	else											\
-	{											\
-	    src = src_first_line + src_stride * y;						\
-	    scanline_func (mask, dst, src, width, vx, unit_x, max_vx, FALSE);			\
-	}											\
-    }												\
-}
-
-/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */
-#define FAST_NEAREST_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
-				  dst_type_t, repeat_mode, have_mask, mask_is_solid)		\
-	FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,	\
-				  dst_type_t, repeat_mode, have_mask, mask_is_solid)
-
-#define FAST_NEAREST_MAINLOOP_NOMASK(scale_func_name, scanline_func, src_type_t, dst_type_t,	\
-			      repeat_mode)							\
-    static force_inline void									\
-    scanline_func##scale_func_name##_wrapper (							\
-		    const uint8_t    *mask,							\
-		    dst_type_t       *dst,							\
-		    const src_type_t *src,							\
-		    int32_t          w,								\
-		    pixman_fixed_t   vx,							\
-		    pixman_fixed_t   unit_x,							\
-		    pixman_fixed_t   max_vx,							\
-		    pixman_bool_t    fully_transparent_src)					\
-    {												\
-	scanline_func (dst, src, w, vx, unit_x, max_vx, fully_transparent_src);			\
-    }												\
-    FAST_NEAREST_MAINLOOP_INT (scale_func_name, scanline_func##scale_func_name##_wrapper,	\
-			       src_type_t, uint8_t, dst_type_t, repeat_mode, FALSE, FALSE)
-
-#define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func, src_type_t, dst_type_t,		\
-			      repeat_mode)							\
-	FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name, scanline_func, src_type_t,		\
-			      dst_type_t, repeat_mode)
-
-#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT,				\
-		     src_type_t, dst_type_t, OP, repeat_mode)				\
-    FAST_NEAREST_SCANLINE(scaled_nearest_scanline_ ## scale_func_name ## _ ## OP,	\
-			  SRC_FORMAT, DST_FORMAT, src_type_t, dst_type_t,		\
-			  OP, repeat_mode)						\
-    FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name ## _ ## OP,			\
-			  scaled_nearest_scanline_ ## scale_func_name ## _ ## OP,	\
-			  src_type_t, dst_type_t, repeat_mode)
-
-
-#define SCALED_NEAREST_FLAGS						\
-    (FAST_PATH_SCALE_TRANSFORM	|					\
-     FAST_PATH_NO_ALPHA_MAP	|					\
-     FAST_PATH_NEAREST_FILTER	|					\
-     FAST_PATH_NO_ACCESSORS	|					\
-     FAST_PATH_NARROW_FORMAT)
-
-#define SIMPLE_NEAREST_FAST_PATH_NORMAL(op,s,d,func)			\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_NEAREST_FLAGS		|				\
-	 FAST_PATH_NORMAL_REPEAT	|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_null, 0,							\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
-    }
-
-#define SIMPLE_NEAREST_FAST_PATH_PAD(op,s,d,func)			\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_NEAREST_FLAGS		|				\
-	 FAST_PATH_PAD_REPEAT		|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_null, 0,							\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,	\
-    }
-
-#define SIMPLE_NEAREST_FAST_PATH_NONE(op,s,d,func)			\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_NEAREST_FLAGS		|				\
-	 FAST_PATH_NONE_REPEAT		|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_null, 0,							\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\
-    }
-
-#define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func)			\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
-	PIXMAN_null, 0,							\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,	\
-    }
-
-#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_NEAREST_FLAGS		|				\
-	 FAST_PATH_NORMAL_REPEAT	|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
-    }
-
-#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_NEAREST_FLAGS		|				\
-	 FAST_PATH_PAD_REPEAT		|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,	\
-    }
-
-#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_NEAREST_FLAGS		|				\
-	 FAST_PATH_NONE_REPEAT		|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\
-    }
-
-#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
-	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,	\
-    }
-
-#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_NEAREST_FLAGS		|				\
-	 FAST_PATH_NORMAL_REPEAT	|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
-    }
-
-#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_NEAREST_FLAGS		|				\
-	 FAST_PATH_PAD_REPEAT		|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,	\
-    }
-
-#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_NEAREST_FLAGS		|				\
-	 FAST_PATH_NONE_REPEAT		|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\
-    }
-
-#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
-	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,	\
-    }
-
-/* Prefer the use of 'cover' variant, because it is faster */
-#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func)				\
-    SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),			\
-    SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),			\
-    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func),				\
-    SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
-
-#define SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func)			\
-    SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func),		\
-    SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func),		\
-    SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
-
-#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH(op,s,d,func)		\
-    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER (op,s,d,func),		\
-    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE (op,s,d,func),		\
-    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD (op,s,d,func)
-
-/*****************************************************************************/
-
-/*
- * Identify 5 zones in each scanline for bilinear scaling. Depending on
- * whether 2 pixels to be interpolated are fetched from the image itself,
- * from the padding area around it or from both image and padding area.
- */
-static force_inline void
-bilinear_pad_repeat_get_scanline_bounds (int32_t         source_image_width,
-					 pixman_fixed_t  vx,
-					 pixman_fixed_t  unit_x,
-					 int32_t *       left_pad,
-					 int32_t *       left_tz,
-					 int32_t *       width,
-					 int32_t *       right_tz,
-					 int32_t *       right_pad)
-{
-	int width1 = *width, left_pad1, right_pad1;
-	int width2 = *width, left_pad2, right_pad2;
-
-	pad_repeat_get_scanline_bounds (source_image_width, vx, unit_x,
-					&width1, &left_pad1, &right_pad1);
-	pad_repeat_get_scanline_bounds (source_image_width, vx + pixman_fixed_1,
-					unit_x, &width2, &left_pad2, &right_pad2);
-
-	*left_pad = left_pad2;
-	*left_tz = left_pad1 - left_pad2;
-	*right_tz = right_pad2 - right_pad1;
-	*right_pad = right_pad1;
-	*width -= *left_pad + *left_tz + *right_tz + *right_pad;
-}
-
-/*
- * Main loop template for single pass bilinear scaling. It needs to be
- * provided with 'scanline_func' which should do the compositing operation.
- * The needed function has the following prototype:
- *
- *	scanline_func (dst_type_t *       dst,
- *		       const mask_type_ * mask,
- *		       const src_type_t * src_top,
- *		       const src_type_t * src_bottom,
- *		       int32_t            width,
- *		       int                weight_top,
- *		       int                weight_bottom,
- *		       pixman_fixed_t     vx,
- *		       pixman_fixed_t     unit_x,
- *		       pixman_fixed_t     max_vx,
- *		       pixman_bool_t      zero_src)
- *
- * Where:
- *  dst                 - destination scanline buffer for storing results
- *  mask                - mask buffer (or single value for solid mask)
- *  src_top, src_bottom - two source scanlines
- *  width               - number of pixels to process
- *  weight_top          - weight of the top row for interpolation
- *  weight_bottom       - weight of the bottom row for interpolation
- *  vx                  - initial position for fetching the first pair of
- *                        pixels from the source buffer
- *  unit_x              - position increment needed to move to the next pair
- *                        of pixels
- *  max_vx              - image size as a fixed point value, can be used for
- *                        implementing NORMAL repeat (when it is supported)
- *  zero_src            - boolean hint variable, which is set to TRUE when
- *                        all source pixels are fetched from zero padding
- *                        zone for NONE repeat
- *
- * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to 256,
- *       but sometimes it may be less than that for NONE repeat when handling
- *       fuzzy antialiased top or bottom image edges. Also both top and
- *       bottom weight variables are guaranteed to have value in 0-255
- *       range and can fit into unsigned byte or be used with 8-bit SIMD
- *       multiplication instructions.
- */
-#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
-				  dst_type_t, repeat_mode, flags)				\
-static void											\
-fast_composite_scaled_bilinear ## scale_func_name (pixman_implementation_t *imp,		\
-						   pixman_composite_info_t *info)		\
-{												\
-    PIXMAN_COMPOSITE_ARGS (info);								\
-    dst_type_t *dst_line;									\
-    mask_type_t *mask_line;									\
-    src_type_t *src_first_line;									\
-    int       y1, y2;										\
-    pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */		\
-    pixman_vector_t v;										\
-    pixman_fixed_t vx, vy;									\
-    pixman_fixed_t unit_x, unit_y;								\
-    int32_t left_pad, left_tz, right_tz, right_pad;						\
-												\
-    dst_type_t *dst;										\
-    mask_type_t solid_mask;									\
-    const mask_type_t *mask = &solid_mask;							\
-    int src_stride, mask_stride, dst_stride;							\
-												\
-    int src_width;										\
-    pixman_fixed_t src_width_fixed;								\
-    int max_x;											\
-    pixman_bool_t need_src_extension;								\
-												\
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1);	\
-    if (flags & FLAG_HAVE_SOLID_MASK)								\
-    {												\
-	solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);	\
-	mask_stride = 0;									\
-    }												\
-    else if (flags & FLAG_HAVE_NON_SOLID_MASK)							\
-    {												\
-	PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t,				\
-			       mask_stride, mask_line, 1);					\
-    }												\
-												\
-    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
-     * transformed from destination space to source space */					\
-    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\
-												\
-    /* reference point is the center of the pixel */						\
-    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\
-    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\
-    v.vector[2] = pixman_fixed_1;								\
-												\
-    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\
-	return;											\
-												\
-    unit_x = src_image->common.transform->matrix[0][0];						\
-    unit_y = src_image->common.transform->matrix[1][1];						\
-												\
-    v.vector[0] -= pixman_fixed_1 / 2;								\
-    v.vector[1] -= pixman_fixed_1 / 2;								\
-												\
-    vy = v.vector[1];										\
-												\
-    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD ||					\
-	PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)					\
-    {												\
-	bilinear_pad_repeat_get_scanline_bounds (src_image->bits.width, v.vector[0], unit_x,	\
-					&left_pad, &left_tz, &width, &right_tz, &right_pad);	\
-	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
-	{											\
-	    /* PAD repeat does not need special handling for 'transition zones' and */		\
-	    /* they can be combined with 'padding zones' safely */				\
-	    left_pad += left_tz;								\
-	    right_pad += right_tz;								\
-	    left_tz = right_tz = 0;								\
-	}											\
-	v.vector[0] += left_pad * unit_x;							\
-    }												\
-												\
-    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)					\
-    {												\
-	vx = v.vector[0];									\
-	repeat (PIXMAN_REPEAT_NORMAL, &vx, pixman_int_to_fixed(src_image->bits.width));		\
-	max_x = pixman_fixed_to_int (vx + (width - 1) * unit_x) + 1;				\
-												\
-	if (src_image->bits.width < REPEAT_NORMAL_MIN_WIDTH)					\
-	{											\
-	    src_width = 0;									\
-												\
-	    while (src_width < REPEAT_NORMAL_MIN_WIDTH && src_width <= max_x)			\
-		src_width += src_image->bits.width;						\
-												\
-	    need_src_extension = TRUE;								\
-	}											\
-	else											\
-	{											\
-	    src_width = src_image->bits.width;							\
-	    need_src_extension = FALSE;								\
-	}											\
-												\
-	src_width_fixed = pixman_int_to_fixed (src_width);					\
-    }												\
-												\
-    while (--height >= 0)									\
-    {												\
-	int weight1, weight2;									\
-	dst = dst_line;										\
-	dst_line += dst_stride;									\
-	vx = v.vector[0];									\
-	if (flags & FLAG_HAVE_NON_SOLID_MASK)							\
-	{											\
-	    mask = mask_line;									\
-	    mask_line += mask_stride;								\
-	}											\
-												\
-	y1 = pixman_fixed_to_int (vy);								\
-	weight2 = (vy >> 8) & 0xff;								\
-	if (weight2)										\
-	{											\
-	    /* normal case, both row weights are in 0-255 range and fit unsigned byte */	\
-	    y2 = y1 + 1;									\
-	    weight1 = 256 - weight2;								\
-	}											\
-	else											\
-	{											\
-	    /* set both top and bottom row to the same scanline, and weights to 128+128 */	\
-	    y2 = y1;										\
-	    weight1 = weight2 = 128;								\
-	}											\
-	vy += unit_y;										\
-	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
-	{											\
-	    src_type_t *src1, *src2;								\
-	    src_type_t buf1[2];									\
-	    src_type_t buf2[2];									\
-	    repeat (PIXMAN_REPEAT_PAD, &y1, src_image->bits.height);				\
-	    repeat (PIXMAN_REPEAT_PAD, &y2, src_image->bits.height);				\
-	    src1 = src_first_line + src_stride * y1;						\
-	    src2 = src_first_line + src_stride * y2;						\
-												\
-	    if (left_pad > 0)									\
-	    {											\
-		buf1[0] = buf1[1] = src1[0];							\
-		buf2[0] = buf2[1] = src2[0];							\
-		scanline_func (dst, mask,							\
-			       buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, FALSE);		\
-		dst += left_pad;								\
-		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
-		    mask += left_pad;								\
-	    }											\
-	    if (width > 0)									\
-	    {											\
-		scanline_func (dst, mask,							\
-			       src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);	\
-		dst += width;									\
-		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
-		    mask += width;								\
-	    }											\
-	    if (right_pad > 0)									\
-	    {											\
-		buf1[0] = buf1[1] = src1[src_image->bits.width - 1];				\
-		buf2[0] = buf2[1] = src2[src_image->bits.width - 1];				\
-		scanline_func (dst, mask,							\
-			       buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, FALSE);	\
-	    }											\
-	}											\
-	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)				\
-	{											\
-	    src_type_t *src1, *src2;								\
-	    src_type_t buf1[2];									\
-	    src_type_t buf2[2];									\
-	    /* handle top/bottom zero padding by just setting weights to 0 if needed */		\
-	    if (y1 < 0)										\
-	    {											\
-		weight1 = 0;									\
-		y1 = 0;										\
-	    }											\
-	    if (y1 >= src_image->bits.height)							\
-	    {											\
-		weight1 = 0;									\
-		y1 = src_image->bits.height - 1;						\
-	    }											\
-	    if (y2 < 0)										\
-	    {											\
-		weight2 = 0;									\
-		y2 = 0;										\
-	    }											\
-	    if (y2 >= src_image->bits.height)							\
-	    {											\
-		weight2 = 0;									\
-		y2 = src_image->bits.height - 1;						\
-	    }											\
-	    src1 = src_first_line + src_stride * y1;						\
-	    src2 = src_first_line + src_stride * y2;						\
-												\
-	    if (left_pad > 0)									\
-	    {											\
-		buf1[0] = buf1[1] = 0;								\
-		buf2[0] = buf2[1] = 0;								\
-		scanline_func (dst, mask,							\
-			       buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, TRUE);		\
-		dst += left_pad;								\
-		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
-		    mask += left_pad;								\
-	    }											\
-	    if (left_tz > 0)									\
-	    {											\
-		buf1[0] = 0;									\
-		buf1[1] = src1[0];								\
-		buf2[0] = 0;									\
-		buf2[1] = src2[0];								\
-		scanline_func (dst, mask,							\
-			       buf1, buf2, left_tz, weight1, weight2,				\
-			       pixman_fixed_frac (vx), unit_x, 0, FALSE);			\
-		dst += left_tz;									\
-		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
-		    mask += left_tz;								\
-		vx += left_tz * unit_x;								\
-	    }											\
-	    if (width > 0)									\
-	    {											\
-		scanline_func (dst, mask,							\
-			       src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);	\
-		dst += width;									\
-		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
-		    mask += width;								\
-		vx += width * unit_x;								\
-	    }											\
-	    if (right_tz > 0)									\
-	    {											\
-		buf1[0] = src1[src_image->bits.width - 1];					\
-		buf1[1] = 0;									\
-		buf2[0] = src2[src_image->bits.width - 1];					\
-		buf2[1] = 0;									\
-		scanline_func (dst, mask,							\
-			       buf1, buf2, right_tz, weight1, weight2,				\
-			       pixman_fixed_frac (vx), unit_x, 0, FALSE);			\
-		dst += right_tz;								\
-		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
-		    mask += right_tz;								\
-	    }											\
-	    if (right_pad > 0)									\
-	    {											\
-		buf1[0] = buf1[1] = 0;								\
-		buf2[0] = buf2[1] = 0;								\
-		scanline_func (dst, mask,							\
-			       buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, TRUE);		\
-	    }											\
-	}											\
-	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
-	{											\
-	    int32_t	    num_pixels;								\
-	    int32_t	    width_remain;							\
-	    src_type_t *    src_line_top;							\
-	    src_type_t *    src_line_bottom;							\
-	    src_type_t	    buf1[2];								\
-	    src_type_t	    buf2[2];								\
-	    src_type_t	    extended_src_line0[REPEAT_NORMAL_MIN_WIDTH*2];			\
-	    src_type_t	    extended_src_line1[REPEAT_NORMAL_MIN_WIDTH*2];			\
-	    int		    i, j;								\
-												\
-	    repeat (PIXMAN_REPEAT_NORMAL, &y1, src_image->bits.height);				\
-	    repeat (PIXMAN_REPEAT_NORMAL, &y2, src_image->bits.height);				\
-	    src_line_top = src_first_line + src_stride * y1;					\
-	    src_line_bottom = src_first_line + src_stride * y2;					\
-												\
-	    if (need_src_extension)								\
-	    {											\
-		for (i=0; i<src_width;)								\
-		{										\
-		    for (j=0; j<src_image->bits.width; j++, i++)				\
-		    {										\
-			extended_src_line0[i] = src_line_top[j];				\
-			extended_src_line1[i] = src_line_bottom[j];				\
-		    }										\
-		}										\
-												\
-		src_line_top = &extended_src_line0[0];						\
-		src_line_bottom = &extended_src_line1[0];					\
-	    }											\
-												\
-	    /* Top & Bottom wrap around buffer */						\
-	    buf1[0] = src_line_top[src_width - 1];						\
-	    buf1[1] = src_line_top[0];								\
-	    buf2[0] = src_line_bottom[src_width - 1];						\
-	    buf2[1] = src_line_bottom[0];							\
-												\
-	    width_remain = width;								\
-												\
-	    while (width_remain > 0)								\
-	    {											\
-		/* We use src_width_fixed because it can make vx in original source range */	\
-		repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);				\
-												\
-		/* Wrap around part */								\
-		if (pixman_fixed_to_int (vx) == src_width - 1)					\
-		{										\
-		    /* for positive unit_x							\
-		     * num_pixels = max(n) + 1, where vx + n*unit_x < src_width_fixed		\
-		     *										\
-		     * vx is in range [0, src_width_fixed - pixman_fixed_e]			\
-		     * So we are safe from overflow.						\
-		     */										\
-		    num_pixels = ((src_width_fixed - vx - pixman_fixed_e) / unit_x) + 1;	\
-												\
-		    if (num_pixels > width_remain)						\
-			num_pixels = width_remain;						\
-												\
-		    scanline_func (dst, mask, buf1, buf2, num_pixels,				\
-				   weight1, weight2, pixman_fixed_frac(vx),			\
-				   unit_x, src_width_fixed, FALSE);				\
-												\
-		    width_remain -= num_pixels;							\
-		    vx += num_pixels * unit_x;							\
-		    dst += num_pixels;								\
-												\
-		    if (flags & FLAG_HAVE_NON_SOLID_MASK)					\
-			mask += num_pixels;							\
-												\
-		    repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);			\
-		}										\
-												\
-		/* Normal scanline composite */							\
-		if (pixman_fixed_to_int (vx) != src_width - 1 && width_remain > 0)		\
-		{										\
-		    /* for positive unit_x							\
-		     * num_pixels = max(n) + 1, where vx + n*unit_x < (src_width_fixed - 1)	\
-		     *										\
-		     * vx is in range [0, src_width_fixed - pixman_fixed_e]			\
-		     * So we are safe from overflow here.					\
-		     */										\
-		    num_pixels = ((src_width_fixed - pixman_fixed_1 - vx - pixman_fixed_e)	\
-				  / unit_x) + 1;						\
-												\
-		    if (num_pixels > width_remain)						\
-			num_pixels = width_remain;						\
-												\
-		    scanline_func (dst, mask, src_line_top, src_line_bottom, num_pixels,	\
-				   weight1, weight2, vx, unit_x, src_width_fixed, FALSE);	\
-												\
-		    width_remain -= num_pixels;							\
-		    vx += num_pixels * unit_x;							\
-		    dst += num_pixels;								\
-												\
-		    if (flags & FLAG_HAVE_NON_SOLID_MASK)					\
-		        mask += num_pixels;							\
-		}										\
-	    }											\
-	}											\
-	else											\
-	{											\
-	    scanline_func (dst, mask, src_first_line + src_stride * y1,				\
-			   src_first_line + src_stride * y2, width,				\
-			   weight1, weight2, vx, unit_x, max_vx, FALSE);			\
-	}											\
-    }												\
-}
-
-/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */
-#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
-				  dst_type_t, repeat_mode, flags)				\
-	FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,\
-				  dst_type_t, repeat_mode, flags)
-
-#define SCALED_BILINEAR_FLAGS						\
-    (FAST_PATH_SCALE_TRANSFORM	|					\
-     FAST_PATH_NO_ALPHA_MAP	|					\
-     FAST_PATH_BILINEAR_FILTER	|					\
-     FAST_PATH_NO_ACCESSORS	|					\
-     FAST_PATH_NARROW_FORMAT)
-
-#define SIMPLE_BILINEAR_FAST_PATH_PAD(op,s,d,func)			\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_BILINEAR_FLAGS		|				\
-	 FAST_PATH_PAD_REPEAT		|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_null, 0,							\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,	\
-    }
-
-#define SIMPLE_BILINEAR_FAST_PATH_NONE(op,s,d,func)			\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_BILINEAR_FLAGS		|				\
-	 FAST_PATH_NONE_REPEAT		|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_null, 0,							\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,	\
-    }
-
-#define SIMPLE_BILINEAR_FAST_PATH_COVER(op,s,d,func)			\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
-	PIXMAN_null, 0,							\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,	\
-    }
-
-#define SIMPLE_BILINEAR_FAST_PATH_NORMAL(op,s,d,func)			\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_BILINEAR_FLAGS		|				\
-	 FAST_PATH_NORMAL_REPEAT	|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_null, 0,							\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op,	\
-    }
-
-#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_BILINEAR_FLAGS		|				\
-	 FAST_PATH_PAD_REPEAT		|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,	\
-    }
-
-#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_BILINEAR_FLAGS		|				\
-	 FAST_PATH_NONE_REPEAT		|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,	\
-    }
-
-#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
-	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,	\
-    }
-
-#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NORMAL(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_BILINEAR_FLAGS		|				\
-	 FAST_PATH_NORMAL_REPEAT	|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op,	\
-    }
-
-#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_BILINEAR_FLAGS		|				\
-	 FAST_PATH_PAD_REPEAT		|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,	\
-    }
-
-#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_BILINEAR_FLAGS		|				\
-	 FAST_PATH_NONE_REPEAT		|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,	\
-    }
-
-#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
-	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,	\
-    }
-
-#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NORMAL(op,s,d,func)	\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_BILINEAR_FLAGS		|				\
-	 FAST_PATH_NORMAL_REPEAT	|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op,	\
-    }
-
-/* Prefer the use of 'cover' variant, because it is faster */
-#define SIMPLE_BILINEAR_FAST_PATH(op,s,d,func)				\
-    SIMPLE_BILINEAR_FAST_PATH_COVER (op,s,d,func),			\
-    SIMPLE_BILINEAR_FAST_PATH_NONE (op,s,d,func),			\
-    SIMPLE_BILINEAR_FAST_PATH_PAD (op,s,d,func),			\
-    SIMPLE_BILINEAR_FAST_PATH_NORMAL (op,s,d,func)
-
-#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH(op,s,d,func)			\
-    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER (op,s,d,func),		\
-    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE (op,s,d,func),		\
-    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD (op,s,d,func),		\
-    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NORMAL (op,s,d,func)
-
-#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH(op,s,d,func)		\
-    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER (op,s,d,func),		\
-    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE (op,s,d,func),		\
-    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD (op,s,d,func),		\
-    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NORMAL (op,s,d,func)
-
-#endif
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author:  Keith Packard, SuSE, Inc.
+ */
+
+#ifndef PIXMAN_FAST_PATH_H__
+#define PIXMAN_FAST_PATH_H__
+
+#include "pixman-private.h"
+
+#define PIXMAN_REPEAT_COVER -1
+
+/* Flags describing input parameters to fast path macro template.
+ * Turning on some flag values may indicate that
+ * "some property X is available so template can use this" or
+ * "some property X should be handled by template".
+ *
+ * FLAG_HAVE_SOLID_MASK
+ *  Input mask is solid so template should handle this.
+ *
+ * FLAG_HAVE_NON_SOLID_MASK
+ *  Input mask is bits mask so template should handle this.
+ *
+ * FLAG_HAVE_SOLID_MASK and FLAG_HAVE_NON_SOLID_MASK are mutually
+ * exclusive. (It's not allowed to turn both flags on)
+ */
+#define FLAG_NONE				(0)
+#define FLAG_HAVE_SOLID_MASK			(1 <<   1)
+#define FLAG_HAVE_NON_SOLID_MASK		(1 <<   2)
+
+/* To avoid repeated scanline function calls that are too short, extend
+ * source scanlines whose width is less than the constant value below.
+ */
+#define REPEAT_NORMAL_MIN_WIDTH			64
+
+static force_inline pixman_bool_t	/* FALSE only for NONE repeat with *c out of range */
+repeat (pixman_repeat_t repeat, int *c, int size)	/* remaps *c in place */
+{
+    if (repeat == PIXMAN_REPEAT_NONE)
+    {
+	if (*c < 0 || *c >= size)
+	    return FALSE;	/* outside [0, size): *c is left unchanged */
+    }
+    else if (repeat == PIXMAN_REPEAT_NORMAL)
+    {
+	while (*c >= size)	/* wrap into [0, size) one period at a time */
+	    *c -= size;
+	while (*c < 0)
+	    *c += size;
+    }
+    else if (repeat == PIXMAN_REPEAT_PAD)
+    {
+	*c = CLIP (*c, 0, size - 1);	/* presumably clamps to [0, size - 1]; CLIP is defined elsewhere */
+    }
+    else /* REFLECT */
+    {
+	*c = MOD (*c, size * 2);	/* presumably a non-negative offset within a 2 * size period */
+	if (*c >= size)
+	    *c = size * 2 - *c - 1;	/* mirror the second half of the period back below size */
+    }
+    return TRUE;
+}
+
+#if SIZEOF_LONG > 4
+
+static force_inline uint32_t	/* blends four packed 4x8-bit pixels, 64-bit variant */
+bilinear_interpolation (uint32_t tl, uint32_t tr,
+			uint32_t bl, uint32_t br,
+			int distx, int disty)
+{
+    uint64_t distxy, distxiy, distixy, distixiy;
+    uint64_t tl64, tr64, bl64, br64;
+    uint64_t f, r;
+
+    distxy = distx * disty;	/* weight applied to br */
+    distxiy = distx * (256 - disty);	/* weight applied to tr */
+    distixy = (256 - distx) * disty;	/* weight applied to bl */
+    distixiy = (256 - distx) * (256 - disty);	/* weight applied to tl; the four sum to 256 * 256 */
+
+    /* Alpha and Blue */
+    tl64 = tl & 0xff0000ff;
+    tr64 = tr & 0xff0000ff;
+    bl64 = bl & 0xff0000ff;
+    br64 = br & 0xff0000ff;
+
+    f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
+    r = f & 0x0000ff0000ff0000ull;	/* weighted bytes now sit 16 bits above their input position */
+
+    /* Red and Green */
+    tl64 = tl;
+    tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull);	/* bits 16-23 -> 32-39, bits 8-15 kept */
+
+    tr64 = tr;
+    tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull);
+
+    bl64 = bl;
+    bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull);
+
+    br64 = br;
+    br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull);
+
+    f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
+    r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull);	/* undo the extra 16-bit shift on the upper byte */
+
+    return (uint32_t)(r >> 16);	/* drop the 16 weight bits; bytes return to their input positions */
+}
+
+#else
+
+static force_inline uint32_t	/* blends four packed 4x8-bit pixels, 32-bit variant */
+bilinear_interpolation (uint32_t tl, uint32_t tr,
+			uint32_t bl, uint32_t br,
+			int distx, int disty)
+{
+    int distxy, distxiy, distixy, distixiy;
+    uint32_t f, r;
+
+    distxy = distx * disty;	/* weight applied to br */
+    distxiy = (distx << 8) - distxy;	/* distx * (256 - disty) */
+    distixy = (disty << 8) - distxy;	/* disty * (256 - distx) */
+    distixiy =
+	256 * 256 - (disty << 8) -
+	(distx << 8) + distxy;		/* (256 - distx) * (256 - disty) */
+
+    /* Blue: the weighted byte lands in bits 16-23 */
+    r = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
+      + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy;
+
+    /* Green: the weighted byte lands in bits 24-31 */
+    f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
+      + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy;
+    r |= f & 0xff000000;
+
+    tl >>= 16;	/* bring the upper two bytes of each corner down to bits 0-15 */
+    tr >>= 16;
+    bl >>= 16;
+    br >>= 16;
+    r >>= 16;	/* blue -> bits 0-7, green -> bits 8-15 */
+
+    /* Red: the weighted byte lands in bits 16-23 */
+    f = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
+      + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy;
+    r |= f & 0x00ff0000;
+
+    /* Alpha: the weighted byte lands in bits 24-31 */
+    f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
+      + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy;
+    r |= f & 0xff000000;
+
+    return r;
+}
+
+#endif
+
+/*
+ * For each scanline fetched from source image with PAD repeat:
+ * - calculate how many pixels need to be padded on the left side
+ * - calculate how many pixels need to be padded on the right side
+ * - update width to only count pixels which are fetched from the image
+ * All this information is returned via 'width', 'left_pad', 'right_pad'
+ * arguments. The code is assuming that 'unit_x' is positive.
+ *
+ * Note: 64-bit math is used in order to avoid potential overflows, which
+ *       is probably excessive in many cases. This particular function
+ *       may need its own correctness test and performance tuning.
+ */
+static force_inline void
+pad_repeat_get_scanline_bounds (int32_t         source_image_width,
+				pixman_fixed_t  vx,
+				pixman_fixed_t  unit_x,
+				int32_t *       width,
+				int32_t *       left_pad,
+				int32_t *       right_pad)
+{
+    int64_t max_vx = (int64_t) source_image_width << 16;	/* image width on the same scale as vx */
+    int64_t tmp;
+    if (vx < 0)	/* the scanline starts left of the image */
+    {
+	tmp = ((int64_t) unit_x - 1 - vx) / unit_x;	/* ceil (-vx / unit_x) for positive unit_x */
+	if (tmp > *width)
+	{
+	    *left_pad = *width;	/* every pixel is left padding */
+	    *width = 0;
+	}
+	else
+	{
+	    *left_pad = (int32_t) tmp;
+	    *width -= (int32_t) tmp;
+	}
+    }
+    else
+    {
+	*left_pad = 0;
+    }
+    tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad;	/* pixels after left_pad with vx < max_vx */
+    if (tmp < 0)
+    {
+	*right_pad = *width;	/* all remaining pixels are right padding */
+	*width = 0;
+    }
+    else if (tmp >= *width)
+    {
+	*right_pad = 0;	/* all remaining pixels fall inside the image */
+    }
+    else
+    {
+	*right_pad = *width - (int32_t) tmp;	/* tail beyond the right edge */
+	*width = (int32_t) tmp;
+    }
+}
+
+/* A macroified version of specialized nearest scalers for some
+ * common 8888 and 565 formats. It supports SRC and OVER ops.
+ *
+ * There are two repeat versions, one that handles repeat normal,
+ * and one without repeat handling that only works if the src region
+ * used is completely covered by the pre-repeated source samples.
+ *
+ * The loops are unrolled to process two pixels per iteration for better
+ * performance on most CPU architectures (superscalar processors
+ * can issue several operations simultaneously, other processors can hide
+ * instruction latencies by pipelining operations). Unrolling more
+ * does not make much sense because the compiler will start running out
+ * of spare registers soon.
+ */
+
+#define GET_8888_ALPHA(s) ((s) >> 24)
+ /* This is not actually used since we don't have an OVER with
+    565 source, but it is needed to build. */
+#define GET_0565_ALPHA(s) 0xff
+#define GET_x888_ALPHA(s) 0xff
+
+#define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT,			\
+			      src_type_t, dst_type_t, OP, repeat_mode)				\
+static force_inline void									\
+scanline_func_name (dst_type_t       *dst,							\
+		    const src_type_t *src,							\
+		    int32_t           w,							\
+		    pixman_fixed_t    vx,							\
+		    pixman_fixed_t    unit_x,							\
+		    pixman_fixed_t    max_vx,							\
+		    pixman_bool_t     fully_transparent_src)					\
+{												\
+	uint32_t   d;										\
+	src_type_t s1, s2;									\
+	uint8_t    a1, a2;									\
+	int        x1, x2;									\
+												\
+	if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER && fully_transparent_src)			\
+	    return;										\
+												\
+	if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER)		\
+	    abort();										\
+												\
+	while ((w -= 2) >= 0)									\
+	{											\
+	    x1 = vx >> 16;									\
+	    vx += unit_x;									\
+	    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
+	    {											\
+		/* This works because we know that unit_x is positive */			\
+		while (vx >= max_vx)								\
+		    vx -= max_vx;								\
+	    }											\
+	    s1 = src[x1];									\
+												\
+	    x2 = vx >> 16;									\
+	    vx += unit_x;									\
+	    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
+	    {											\
+		/* This works because we know that unit_x is positive */			\
+		while (vx >= max_vx)								\
+		    vx -= max_vx;								\
+	    }											\
+	    s2 = src[x2];									\
+												\
+	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
+	    {											\
+		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\
+		a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2);						\
+												\
+		if (a1 == 0xff)									\
+		{										\
+		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+		}										\
+		else if (s1)									\
+		{										\
+		    d = CONVERT_ ## DST_FORMAT ## _TO_8888 (*dst);				\
+		    s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1);				\
+		    a1 ^= 0xff;									\
+		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\
+		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
+		}										\
+		dst++;										\
+												\
+		if (a2 == 0xff)									\
+		{										\
+		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2);			\
+		}										\
+		else if (s2)									\
+		{										\
+		    d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);				\
+		    s2 = CONVERT_## SRC_FORMAT ## _TO_8888 (s2);				\
+		    a2 ^= 0xff;									\
+		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2);					\
+		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
+		}										\
+		dst++;										\
+	    }											\
+	    else /* PIXMAN_OP_SRC */								\
+	    {											\
+		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2);			\
+	    }											\
+	}											\
+												\
+	if (w & 1)										\
+	{											\
+	    x1 = vx >> 16;									\
+	    s1 = src[x1];									\
+												\
+	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
+	    {											\
+		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\
+												\
+		if (a1 == 0xff)									\
+		{										\
+		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+		}										\
+		else if (s1)									\
+		{										\
+		    d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);				\
+		    s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1);				\
+		    a1 ^= 0xff;									\
+		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\
+		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
+		}										\
+		dst++;										\
+	    }											\
+	    else /* PIXMAN_OP_SRC */								\
+	    {											\
+		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+	    }											\
+	}											\
+}
+
+#define FAST_NEAREST_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
+				  dst_type_t, repeat_mode, have_mask, mask_is_solid)		\
+static void											\
+fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,		\
+						   pixman_composite_info_t *info)               \
+{												\
+    PIXMAN_COMPOSITE_ARGS (info);					                        \
+    dst_type_t *dst_line;						                        \
+    mask_type_t *mask_line;									\
+    src_type_t *src_first_line;									\
+    int       y;										\
+    pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */		\
+    pixman_fixed_t max_vy;									\
+    pixman_vector_t v;										\
+    pixman_fixed_t vx, vy;									\
+    pixman_fixed_t unit_x, unit_y;								\
+    int32_t left_pad, right_pad;								\
+												\
+    src_type_t *src;										\
+    dst_type_t *dst;										\
+    mask_type_t solid_mask;									\
+    const mask_type_t *mask = &solid_mask;							\
+    int src_stride, mask_stride, dst_stride;							\
+												\
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1);	\
+    if (have_mask)										\
+    {												\
+	if (mask_is_solid)									\
+	    solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);	\
+	else											\
+	    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t,			\
+				   mask_stride, mask_line, 1);					\
+    }												\
+    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
+     * transformed from destination space to source space */					\
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\
+												\
+    /* reference point is the center of the pixel */						\
+    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\
+    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\
+    v.vector[2] = pixman_fixed_1;								\
+												\
+    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\
+	return;											\
+												\
+    unit_x = src_image->common.transform->matrix[0][0];						\
+    unit_y = src_image->common.transform->matrix[1][1];						\
+												\
+    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */			\
+    v.vector[0] -= pixman_fixed_e;								\
+    v.vector[1] -= pixman_fixed_e;								\
+												\
+    vx = v.vector[0];										\
+    vy = v.vector[1];										\
+												\
+    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)					\
+    {												\
+	/* Clamp repeating positions inside the actual samples */				\
+	max_vx = src_image->bits.width << 16;							\
+	max_vy = src_image->bits.height << 16;							\
+												\
+	repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx);						\
+	repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
+    }												\
+												\
+    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD ||					\
+	PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)					\
+    {												\
+	pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x,			\
+					&width, &left_pad, &right_pad);				\
+	vx += left_pad * unit_x;								\
+    }												\
+												\
+    while (--height >= 0)									\
+    {												\
+	dst = dst_line;										\
+	dst_line += dst_stride;									\
+	if (have_mask && !mask_is_solid)							\
+	{											\
+	    mask = mask_line;									\
+	    mask_line += mask_stride;								\
+	}											\
+												\
+	y = vy >> 16;										\
+	vy += unit_y;										\
+	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
+	    repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
+	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
+	{											\
+	    repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height);				\
+	    src = src_first_line + src_stride * y;						\
+	    if (left_pad > 0)									\
+	    {											\
+		scanline_func (mask, dst, src, left_pad, 0, 0, 0, FALSE);			\
+	    }											\
+	    if (width > 0)									\
+	    {											\
+		scanline_func (mask + (mask_is_solid ? 0 : left_pad),				\
+			       dst + left_pad, src, width, vx, unit_x, 0, FALSE);		\
+	    }											\
+	    if (right_pad > 0)									\
+	    {											\
+		scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),			\
+			       dst + left_pad + width, src + src_image->bits.width - 1,		\
+			       right_pad, 0, 0, 0, FALSE);					\
+	    }											\
+	}											\
+	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)				\
+	{											\
+	    static const src_type_t zero[1] = { 0 };						\
+	    if (y < 0 || y >= src_image->bits.height)						\
+	    {											\
+		scanline_func (mask, dst, zero, left_pad + width + right_pad, 0, 0, 0, TRUE);	\
+		continue;									\
+	    }											\
+	    src = src_first_line + src_stride * y;						\
+	    if (left_pad > 0)									\
+	    {											\
+		scanline_func (mask, dst, zero, left_pad, 0, 0, 0, TRUE);			\
+	    }											\
+	    if (width > 0)									\
+	    {											\
+		scanline_func (mask + (mask_is_solid ? 0 : left_pad),				\
+			       dst + left_pad, src, width, vx, unit_x, 0, FALSE);		\
+	    }											\
+	    if (right_pad > 0)									\
+	    {											\
+		scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),			\
+			       dst + left_pad + width, zero, right_pad, 0, 0, 0, TRUE);		\
+	    }											\
+	}											\
+	else											\
+	{											\
+	    src = src_first_line + src_stride * y;						\
+	    scanline_func (mask, dst, src, width, vx, unit_x, max_vx, FALSE);			\
+	}											\
+    }												\
+}
+
+/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */
+#define FAST_NEAREST_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
+				  dst_type_t, repeat_mode, have_mask, mask_is_solid)		\
+	FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,	\
+				  dst_type_t, repeat_mode, have_mask, mask_is_solid)
+
+#define FAST_NEAREST_MAINLOOP_NOMASK(scale_func_name, scanline_func, src_type_t, dst_type_t,	\
+			      repeat_mode)							\
+    static force_inline void									\
+    scanline_func##scale_func_name##_wrapper (							\
+		    const uint8_t    *mask,							\
+		    dst_type_t       *dst,							\
+		    const src_type_t *src,							\
+		    int32_t          w,								\
+		    pixman_fixed_t   vx,							\
+		    pixman_fixed_t   unit_x,							\
+		    pixman_fixed_t   max_vx,							\
+		    pixman_bool_t    fully_transparent_src)					\
+    {												\
+	scanline_func (dst, src, w, vx, unit_x, max_vx, fully_transparent_src);			\
+    }												\
+    FAST_NEAREST_MAINLOOP_INT (scale_func_name, scanline_func##scale_func_name##_wrapper,	\
+			       src_type_t, uint8_t, dst_type_t, repeat_mode, FALSE, FALSE)
+
+#define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func, src_type_t, dst_type_t,		\
+			      repeat_mode)							\
+	FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name, scanline_func, src_type_t,		\
+			      dst_type_t, repeat_mode)
+
+#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT,				\
+		     src_type_t, dst_type_t, OP, repeat_mode)				\
+    FAST_NEAREST_SCANLINE(scaled_nearest_scanline_ ## scale_func_name ## _ ## OP,	\
+			  SRC_FORMAT, DST_FORMAT, src_type_t, dst_type_t,		\
+			  OP, repeat_mode)						\
+    FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name ## _ ## OP,			\
+			  scaled_nearest_scanline_ ## scale_func_name ## _ ## OP,	\
+			  src_type_t, dst_type_t, repeat_mode)
+
+
+#define SCALED_NEAREST_FLAGS						\
+    (FAST_PATH_SCALE_TRANSFORM	|					\
+     FAST_PATH_NO_ALPHA_MAP	|					\
+     FAST_PATH_NEAREST_FILTER	|					\
+     FAST_PATH_NO_ACCESSORS	|					\
+     FAST_PATH_NARROW_FORMAT)
+
+#define SIMPLE_NEAREST_FAST_PATH_NORMAL(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_NORMAL_REPEAT	|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_FAST_PATH_PAD(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_PAD_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_FAST_PATH_NONE(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_NONE_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_NORMAL_REPEAT	|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_PAD_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_NONE_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_NORMAL_REPEAT	|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_PAD_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_NONE_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,	\
+    }
+
+/* Prefer the use of 'cover' variant, because it is faster */
+#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func)				\
+    SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),			\
+    SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),			\
+    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func),				\
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func)			\
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func),		\
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func),		\
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH(op,s,d,func)		\
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER (op,s,d,func),		\
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE (op,s,d,func),		\
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD (op,s,d,func)
+
+/*****************************************************************************/
+
+/*
+ * Identify 5 zones in each scanline for bilinear scaling, depending on
+ * whether the 2 pixels to be interpolated are fetched from the image itself,
+ * from the padding area around it or from both image and padding area.
+ */
+static force_inline void
+bilinear_pad_repeat_get_scanline_bounds (int32_t         source_image_width,
+					 pixman_fixed_t  vx,
+					 pixman_fixed_t  unit_x,
+					 int32_t *       left_pad,
+					 int32_t *       left_tz,
+					 int32_t *       width,
+					 int32_t *       right_tz,
+					 int32_t *       right_pad)
+{
+	int width1 = *width, left_pad1, right_pad1;	/* bounds for the sample at vx */
+	int width2 = *width, left_pad2, right_pad2;	/* bounds for the sample at vx + pixman_fixed_1 */
+
+	pad_repeat_get_scanline_bounds (source_image_width, vx, unit_x,
+					&width1, &left_pad1, &right_pad1);
+	pad_repeat_get_scanline_bounds (source_image_width, vx + pixman_fixed_1,
+					unit_x, &width2, &left_pad2, &right_pad2);
+
+	*left_pad = left_pad2;	/* both samples are left of the image */
+	*left_tz = left_pad1 - left_pad2;	/* only the first sample is left of the image */
+	*right_tz = right_pad2 - right_pad1;	/* only the second sample is past the right edge */
+	*right_pad = right_pad1;	/* both samples are past the right edge */
+	*width -= *left_pad + *left_tz + *right_tz + *right_pad;	/* remainder: both samples inside */
+}
+
+/*
+ * Main loop template for single pass bilinear scaling. It needs to be
+ * provided with 'scanline_func' which should do the compositing operation.
+ * The needed function has the following prototype:
+ *
+ *	scanline_func (dst_type_t *       dst,
+ *		       const mask_type_t * mask,
+ *		       const src_type_t * src_top,
+ *		       const src_type_t * src_bottom,
+ *		       int32_t            width,
+ *		       int                weight_top,
+ *		       int                weight_bottom,
+ *		       pixman_fixed_t     vx,
+ *		       pixman_fixed_t     unit_x,
+ *		       pixman_fixed_t     max_vx,
+ *		       pixman_bool_t      zero_src)
+ *
+ * Where:
+ *  dst                 - destination scanline buffer for storing results
+ *  mask                - mask buffer (or single value for solid mask)
+ *  src_top, src_bottom - two source scanlines
+ *  width               - number of pixels to process
+ *  weight_top          - weight of the top row for interpolation
+ *  weight_bottom       - weight of the bottom row for interpolation
+ *  vx                  - initial position for fetching the first pair of
+ *                        pixels from the source buffer
+ *  unit_x              - position increment needed to move to the next pair
+ *                        of pixels
+ *  max_vx              - image size as a fixed point value, can be used for
+ *                        implementing NORMAL repeat (when it is supported)
+ *  zero_src            - boolean hint variable, which is set to TRUE when
+ *                        all source pixels are fetched from zero padding
+ *                        zone for NONE repeat
+ *
+ * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to 256,
+ *       but sometimes it may be less than that for NONE repeat when handling
+ *       fuzzy antialiased top or bottom image edges. Also both top and
+ *       bottom weight variables are guaranteed to have value in 0-255
+ *       range and can fit into unsigned byte or be used with 8-bit SIMD
+ *       multiplication instructions.
+ */
+#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
+				  dst_type_t, repeat_mode, flags)				\
+static void											\
+fast_composite_scaled_bilinear ## scale_func_name (pixman_implementation_t *imp,		\
+						   pixman_composite_info_t *info)		\
+{												\
+    PIXMAN_COMPOSITE_ARGS (info);								\
+    dst_type_t *dst_line;									\
+    mask_type_t *mask_line;									\
+    src_type_t *src_first_line;									\
+    int       y1, y2;										\
+    pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */		\
+    pixman_vector_t v;										\
+    pixman_fixed_t vx, vy;									\
+    pixman_fixed_t unit_x, unit_y;								\
+    int32_t left_pad, left_tz, right_tz, right_pad;						\
+												\
+    dst_type_t *dst;										\
+    mask_type_t solid_mask;									\
+    const mask_type_t *mask = &solid_mask;							\
+    int src_stride, mask_stride, dst_stride;							\
+												\
+    int src_width;										\
+    pixman_fixed_t src_width_fixed;								\
+    int max_x;											\
+    pixman_bool_t need_src_extension;								\
+    /* Set up dst_line/dst_stride, then the mask: a solid value or a per-line pointer. */	\
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1);	\
+    if (flags & FLAG_HAVE_SOLID_MASK)								\
+    {												\
+	solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);	\
+	mask_stride = 0;									\
+    }												\
+    else if (flags & FLAG_HAVE_NON_SOLID_MASK)							\
+    {												\
+	PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t,				\
+			       mask_stride, mask_line, 1);					\
+    }												\
+												\
+    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
+     * transformed from destination space to source space */					\
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\
+												\
+    /* reference point is the center of the pixel */						\
+    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\
+    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\
+    v.vector[2] = pixman_fixed_1;								\
+												\
+    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\
+	return;											\
+    /* Per-destination-pixel source steps are read from the transform's diagonal. */		\
+    unit_x = src_image->common.transform->matrix[0][0];						\
+    unit_y = src_image->common.transform->matrix[1][1];						\
+    /* Remove the half-pixel centre offset that was added before the transform. */		\
+    v.vector[0] -= pixman_fixed_1 / 2;								\
+    v.vector[1] -= pixman_fixed_1 / 2;								\
+												\
+    vy = v.vector[1];										\
+    /* PAD/NONE: split the scanline into padding, transition and remaining zones. */		\
+    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD ||					\
+	PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)					\
+    {												\
+	bilinear_pad_repeat_get_scanline_bounds (src_image->bits.width, v.vector[0], unit_x,	\
+					&left_pad, &left_tz, &width, &right_tz, &right_pad);	\
+	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
+	{											\
+	    /* PAD repeat does not need special handling for 'transition zones' and */		\
+	    /* they can be combined with 'padding zones' safely */				\
+	    left_pad += left_tz;								\
+	    right_pad += right_tz;								\
+	    left_tz = right_tz = 0;								\
+	}											\
+	v.vector[0] += left_pad * unit_x;							\
+    }												\
+    /* NORMAL: wrap the start x into the image and choose the working source width. */		\
+    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)					\
+    {												\
+	vx = v.vector[0];									\
+	repeat (PIXMAN_REPEAT_NORMAL, &vx, pixman_int_to_fixed(src_image->bits.width));		\
+	max_x = pixman_fixed_to_int (vx + (width - 1) * unit_x) + 1;				\
+												\
+	if (src_image->bits.width < REPEAT_NORMAL_MIN_WIDTH)					\
+	{											\
+	    src_width = 0;									\
+												\
+	    while (src_width < REPEAT_NORMAL_MIN_WIDTH && src_width <= max_x)			\
+		src_width += src_image->bits.width;						\
+												\
+	    need_src_extension = TRUE;								\
+	}											\
+	else											\
+	{											\
+	    src_width = src_image->bits.width;							\
+	    need_src_extension = FALSE;								\
+	}											\
+												\
+	src_width_fixed = pixman_int_to_fixed (src_width);					\
+    }												\
+    /* One iteration per destination scanline. */						\
+    while (--height >= 0)									\
+    {												\
+	int weight1, weight2;									\
+	dst = dst_line;										\
+	dst_line += dst_stride;									\
+	vx = v.vector[0];									\
+	if (flags & FLAG_HAVE_NON_SOLID_MASK)							\
+	{											\
+	    mask = mask_line;									\
+	    mask_line += mask_stride;								\
+	}											\
+	/* Split vy into the top row index (y1) and an 8-bit weight for the bottom row. */	\
+	y1 = pixman_fixed_to_int (vy);								\
+	weight2 = (vy >> 8) & 0xff;								\
+	if (weight2)										\
+	{											\
+	    /* normal case, both row weights are in 0-255 range and fit unsigned byte */	\
+	    y2 = y1 + 1;									\
+	    weight1 = 256 - weight2;								\
+	}											\
+	else											\
+	{											\
+	    /* set both top and bottom row to the same scanline, and weights to 128+128 */	\
+	    y2 = y1;										\
+	    weight1 = weight2 = 128;								\
+	}											\
+	vy += unit_y;										\
+	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
+	{											\
+	    src_type_t *src1, *src2;								\
+	    src_type_t buf1[2];									\
+	    src_type_t buf2[2];									\
+	    repeat (PIXMAN_REPEAT_PAD, &y1, src_image->bits.height);				\
+	    repeat (PIXMAN_REPEAT_PAD, &y2, src_image->bits.height);				\
+	    src1 = src_first_line + src_stride * y1;						\
+	    src2 = src_first_line + src_stride * y2;						\
+	    /* Emitted in order: left padding, in-image span, right padding. */		\
+	    if (left_pad > 0)									\
+	    {											\
+		buf1[0] = buf1[1] = src1[0];							\
+		buf2[0] = buf2[1] = src2[0];							\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, FALSE);		\
+		dst += left_pad;								\
+		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
+		    mask += left_pad;								\
+	    }											\
+	    if (width > 0)									\
+	    {											\
+		scanline_func (dst, mask,							\
+			       src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);	\
+		dst += width;									\
+		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
+		    mask += width;								\
+	    }											\
+	    if (right_pad > 0)									\
+	    {											\
+		buf1[0] = buf1[1] = src1[src_image->bits.width - 1];				\
+		buf2[0] = buf2[1] = src2[src_image->bits.width - 1];				\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, FALSE);	\
+	    }											\
+	}											\
+	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)				\
+	{											\
+	    src_type_t *src1, *src2;								\
+	    src_type_t buf1[2];									\
+	    src_type_t buf2[2];									\
+	    /* handle top/bottom zero padding by just setting weights to 0 if needed */		\
+	    if (y1 < 0)										\
+	    {											\
+		weight1 = 0;									\
+		y1 = 0;										\
+	    }											\
+	    if (y1 >= src_image->bits.height)							\
+	    {											\
+		weight1 = 0;									\
+		y1 = src_image->bits.height - 1;						\
+	    }											\
+	    if (y2 < 0)										\
+	    {											\
+		weight2 = 0;									\
+		y2 = 0;										\
+	    }											\
+	    if (y2 >= src_image->bits.height)							\
+	    {											\
+		weight2 = 0;									\
+		y2 = src_image->bits.height - 1;						\
+	    }											\
+	    src1 = src_first_line + src_stride * y1;						\
+	    src2 = src_first_line + src_stride * y2;						\
+	    /* Emitted in order: left pad, left tz, in-image span, right tz, right pad. */	\
+	    if (left_pad > 0)									\
+	    {											\
+		buf1[0] = buf1[1] = 0;								\
+		buf2[0] = buf2[1] = 0;								\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, TRUE);		\
+		dst += left_pad;								\
+		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
+		    mask += left_pad;								\
+	    }											\
+	    if (left_tz > 0)									\
+	    {											\
+		buf1[0] = 0;									\
+		buf1[1] = src1[0];								\
+		buf2[0] = 0;									\
+		buf2[1] = src2[0];								\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, left_tz, weight1, weight2,				\
+			       pixman_fixed_frac (vx), unit_x, 0, FALSE);			\
+		dst += left_tz;									\
+		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
+		    mask += left_tz;								\
+		vx += left_tz * unit_x;								\
+	    }											\
+	    if (width > 0)									\
+	    {											\
+		scanline_func (dst, mask,							\
+			       src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);	\
+		dst += width;									\
+		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
+		    mask += width;								\
+		vx += width * unit_x;								\
+	    }											\
+	    if (right_tz > 0)									\
+	    {											\
+		buf1[0] = src1[src_image->bits.width - 1];					\
+		buf1[1] = 0;									\
+		buf2[0] = src2[src_image->bits.width - 1];					\
+		buf2[1] = 0;									\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, right_tz, weight1, weight2,				\
+			       pixman_fixed_frac (vx), unit_x, 0, FALSE);			\
+		dst += right_tz;								\
+		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
+		    mask += right_tz;								\
+	    }											\
+	    if (right_pad > 0)									\
+	    {											\
+		buf1[0] = buf1[1] = 0;								\
+		buf2[0] = buf2[1] = 0;								\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, TRUE);		\
+	    }											\
+	}											\
+	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
+	{											\
+	    int32_t	    num_pixels;								\
+	    int32_t	    width_remain;							\
+	    src_type_t *    src_line_top;							\
+	    src_type_t *    src_line_bottom;							\
+	    src_type_t	    buf1[2];								\
+	    src_type_t	    buf2[2];								\
+	    src_type_t	    extended_src_line0[REPEAT_NORMAL_MIN_WIDTH*2];			\
+	    src_type_t	    extended_src_line1[REPEAT_NORMAL_MIN_WIDTH*2];			\
+	    int		    i, j;								\
+												\
+	    repeat (PIXMAN_REPEAT_NORMAL, &y1, src_image->bits.height);				\
+	    repeat (PIXMAN_REPEAT_NORMAL, &y2, src_image->bits.height);				\
+	    src_line_top = src_first_line + src_stride * y1;					\
+	    src_line_bottom = src_first_line + src_stride * y2;					\
+	    /* Narrow sources: tile both rows into the local arrays up to src_width. */	\
+	    if (need_src_extension)								\
+	    {											\
+		for (i=0; i<src_width;)								\
+		{										\
+		    for (j=0; j<src_image->bits.width; j++, i++)				\
+		    {										\
+			extended_src_line0[i] = src_line_top[j];				\
+			extended_src_line1[i] = src_line_bottom[j];				\
+		    }										\
+		}										\
+												\
+		src_line_top = &extended_src_line0[0];						\
+		src_line_bottom = &extended_src_line1[0];					\
+	    }											\
+												\
+	    /* Top & Bottom wrap around buffer */						\
+	    buf1[0] = src_line_top[src_width - 1];						\
+	    buf1[1] = src_line_top[0];								\
+	    buf2[0] = src_line_bottom[src_width - 1];						\
+	    buf2[1] = src_line_bottom[0];							\
+												\
+	    width_remain = width;								\
+	    /* Alternate wrap-around runs and in-image runs until the scanline is done. */	\
+	    while (width_remain > 0)								\
+	    {											\
+		/* We use src_width_fixed because it can make vx in original source range */	\
+		repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);				\
+												\
+		/* Wrap around part */								\
+		if (pixman_fixed_to_int (vx) == src_width - 1)					\
+		{										\
+		    /* for positive unit_x							\
+		     * num_pixels = max(n) + 1, where vx + n*unit_x < src_width_fixed		\
+		     *										\
+		     * vx is in range [0, src_width_fixed - pixman_fixed_e]			\
+		     * So we are safe from overflow.						\
+		     */										\
+		    num_pixels = ((src_width_fixed - vx - pixman_fixed_e) / unit_x) + 1;	\
+												\
+		    if (num_pixels > width_remain)						\
+			num_pixels = width_remain;						\
+												\
+		    scanline_func (dst, mask, buf1, buf2, num_pixels,				\
+				   weight1, weight2, pixman_fixed_frac(vx),			\
+				   unit_x, src_width_fixed, FALSE);				\
+												\
+		    width_remain -= num_pixels;							\
+		    vx += num_pixels * unit_x;							\
+		    dst += num_pixels;								\
+												\
+		    if (flags & FLAG_HAVE_NON_SOLID_MASK)					\
+			mask += num_pixels;							\
+												\
+		    repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);			\
+		}										\
+												\
+		/* Normal scanline composite */							\
+		if (pixman_fixed_to_int (vx) != src_width - 1 && width_remain > 0)		\
+		{										\
+		    /* for positive unit_x							\
+		     * num_pixels = max(n) + 1, where vx + n*unit_x < (src_width_fixed - 1)	\
+		     *										\
+		     * vx is in range [0, src_width_fixed - pixman_fixed_e]			\
+		     * So we are safe from overflow here.					\
+		     */										\
+		    num_pixels = ((src_width_fixed - pixman_fixed_1 - vx - pixman_fixed_e)	\
+				  / unit_x) + 1;						\
+												\
+		    if (num_pixels > width_remain)						\
+			num_pixels = width_remain;						\
+												\
+		    scanline_func (dst, mask, src_line_top, src_line_bottom, num_pixels,	\
+				   weight1, weight2, vx, unit_x, src_width_fixed, FALSE);	\
+												\
+		    width_remain -= num_pixels;							\
+		    vx += num_pixels * unit_x;							\
+		    dst += num_pixels;								\
+												\
+		    if (flags & FLAG_HAVE_NON_SOLID_MASK)					\
+		        mask += num_pixels;							\
+		}										\
+	    }											\
+	}											\
+	else											\
+	{											\
+	    scanline_func (dst, mask, src_first_line + src_stride * y1,				\
+			   src_first_line + src_stride * y2, width,				\
+			   weight1, weight2, vx, unit_x, max_vx, FALSE);			\
+	}											\
+    }												\
+}
+/* Wrapper: prepends '_' to scale_func_name and forwards all arguments to FAST_BILINEAR_MAINLOOP_INT. */
+/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */
+#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
+				  dst_type_t, repeat_mode, flags)				\
+	FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,\
+				  dst_type_t, repeat_mode, flags)
+/* Flag set OR-ed into the third field (presumably the source flags) of every SIMPLE_BILINEAR_* entry below. */
+#define SCALED_BILINEAR_FLAGS						\
+    (FAST_PATH_SCALE_TRANSFORM	|					\
+     FAST_PATH_NO_ALPHA_MAP	|					\
+     FAST_PATH_BILINEAR_FILTER	|					\
+     FAST_PATH_NO_ACCESSORS	|					\
+     FAST_PATH_NARROW_FORMAT)
+/* Table entry: no mask (PIXMAN_null), PAD repeat, FAST_PATH_X_UNIT_POSITIVE; handler is ..._<func>_pad_<op>. */
+#define SIMPLE_BILINEAR_FAST_PATH_PAD(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_PAD_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,	\
+    }
+/* Table entry: no mask (PIXMAN_null), NONE repeat, FAST_PATH_X_UNIT_POSITIVE; handler is ..._<func>_none_<op>. */
+#define SIMPLE_BILINEAR_FAST_PATH_NONE(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_NONE_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,	\
+    }
+/* Table entry: no mask (PIXMAN_null), FAST_PATH_SAMPLES_COVER_CLIP; handler is ..._<func>_cover_<op>. */
+#define SIMPLE_BILINEAR_FAST_PATH_COVER(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,	\
+    }
+/* Table entry: no mask (PIXMAN_null), NORMAL repeat, FAST_PATH_X_UNIT_POSITIVE; handler is ..._<func>_normal_<op>. */
+#define SIMPLE_BILINEAR_FAST_PATH_NORMAL(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_NORMAL_REPEAT	|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op,	\
+    }
+/* Table entry: PIXMAN_a8 mask, PAD repeat, FAST_PATH_X_UNIT_POSITIVE; handler is ..._<func>_pad_<op>. */
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_PAD_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,	\
+    }
+/* Table entry: PIXMAN_a8 mask, NONE repeat, FAST_PATH_X_UNIT_POSITIVE; handler is ..._<func>_none_<op>. */
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_NONE_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,	\
+    }
+/* Table entry: PIXMAN_a8 mask, FAST_PATH_SAMPLES_COVER_CLIP; handler is ..._<func>_cover_<op>. */
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,	\
+    }
+/* Table entry: PIXMAN_a8 mask, NORMAL repeat, FAST_PATH_X_UNIT_POSITIVE; handler is ..._<func>_normal_<op>. */
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NORMAL(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_NORMAL_REPEAT	|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op,	\
+    }
+/* Table entry: PIXMAN_solid mask, PAD repeat, FAST_PATH_X_UNIT_POSITIVE; handler is ..._<func>_pad_<op>. */
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_PAD_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,	\
+    }
+/* Table entry: PIXMAN_solid mask, NONE repeat, FAST_PATH_X_UNIT_POSITIVE; handler is ..._<func>_none_<op>. */
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_NONE_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,	\
+    }
+/* Table entry: PIXMAN_solid mask, FAST_PATH_SAMPLES_COVER_CLIP; handler is ..._<func>_cover_<op>. */
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,	\
+    }
+/* Table entry: PIXMAN_solid mask, NORMAL repeat, FAST_PATH_X_UNIT_POSITIVE; handler is ..._<func>_normal_<op>. */
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NORMAL(op,s,d,func)	\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_NORMAL_REPEAT	|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op,	\
+    }
+
+/* Expands to four unmasked entries (COVER, NONE, PAD, NORMAL); 'cover' comes first because it is faster and preferred. */
+#define SIMPLE_BILINEAR_FAST_PATH(op,s,d,func)				\
+    SIMPLE_BILINEAR_FAST_PATH_COVER (op,s,d,func),			\
+    SIMPLE_BILINEAR_FAST_PATH_NONE (op,s,d,func),			\
+    SIMPLE_BILINEAR_FAST_PATH_PAD (op,s,d,func),			\
+    SIMPLE_BILINEAR_FAST_PATH_NORMAL (op,s,d,func)
+/* Expands to four a8-mask entries in the order COVER, NONE, PAD, NORMAL. */
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH(op,s,d,func)			\
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER (op,s,d,func),		\
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE (op,s,d,func),		\
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD (op,s,d,func),		\
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NORMAL (op,s,d,func)
+/* Expands to four solid-mask entries in the order COVER, NONE, PAD, NORMAL. */
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH(op,s,d,func)		\
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER (op,s,d,func),		\
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE (op,s,d,func),		\
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD (op,s,d,func),		\
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NORMAL (op,s,d,func)
+
+#endif
diff --git a/pixman/pixman/pixman-trap.c b/pixman/pixman/pixman-trap.c
index f57d0caa9..c6d90da24 100644
--- a/pixman/pixman/pixman-trap.c
+++ b/pixman/pixman/pixman-trap.c
@@ -1,668 +1,668 @@
-/*
- * Copyright © 2002 Keith Packard, member of The XFree86 Project, Inc.
- * Copyright © 2004 Keith Packard
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Keith Packard not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission.  Keith Packard makes no
- * representations about the suitability of this software for any purpose.  It
- * is provided "as is" without express or implied warranty.
- *
- * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
- * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
- * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR
- * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
- * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
- * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
- * PERFORMANCE OF THIS SOFTWARE.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "pixman-private.h"
-
-/*
- * Compute the smallest value greater than or equal to y which is on a
- * grid row.
- */
-
-PIXMAN_EXPORT pixman_fixed_t
-pixman_sample_ceil_y (pixman_fixed_t y, int n)
-{
-    pixman_fixed_t f = pixman_fixed_frac (y);
-    pixman_fixed_t i = pixman_fixed_floor (y);
-
-    f = DIV (f - Y_FRAC_FIRST (n) + (STEP_Y_SMALL (n) - pixman_fixed_e), STEP_Y_SMALL (n)) * STEP_Y_SMALL (n) +
-	Y_FRAC_FIRST (n);
-    
-    if (f > Y_FRAC_LAST (n))
-    {
-	if (pixman_fixed_to_int (i) == 0x7fff)
-	{
-	    f = 0xffff; /* saturate */
-	}
-	else
-	{
-	    f = Y_FRAC_FIRST (n);
-	    i += pixman_fixed_1;
-	}
-    }
-    return (i | f);
-}
-
-/*
- * Compute the largest value strictly less than y which is on a
- * grid row.
- */
-PIXMAN_EXPORT pixman_fixed_t
-pixman_sample_floor_y (pixman_fixed_t y,
-                       int            n)
-{
-    pixman_fixed_t f = pixman_fixed_frac (y);
-    pixman_fixed_t i = pixman_fixed_floor (y);
-
-    f = DIV (f - pixman_fixed_e - Y_FRAC_FIRST (n), STEP_Y_SMALL (n)) * STEP_Y_SMALL (n) +
-	Y_FRAC_FIRST (n);
-
-    if (f < Y_FRAC_FIRST (n))
-    {
-	if (pixman_fixed_to_int (i) == 0x8000)
-	{
-	    f = 0; /* saturate */
-	}
-	else
-	{
-	    f = Y_FRAC_LAST (n);
-	    i -= pixman_fixed_1;
-	}
-    }
-    return (i | f);
-}
-
-/*
- * Step an edge by any amount (including negative values)
- */
-PIXMAN_EXPORT void
-pixman_edge_step (pixman_edge_t *e,
-                  int            n)
-{
-    pixman_fixed_48_16_t ne;
-
-    e->x += n * e->stepx;
-
-    ne = e->e + n * (pixman_fixed_48_16_t) e->dx;
-
-    if (n >= 0)
-    {
-	if (ne > 0)
-	{
-	    int nx = (ne + e->dy - 1) / e->dy;
-	    e->e = ne - nx * (pixman_fixed_48_16_t) e->dy;
-	    e->x += nx * e->signdx;
-	}
-    }
-    else
-    {
-	if (ne <= -e->dy)
-	{
-	    int nx = (-ne) / e->dy;
-	    e->e = ne + nx * (pixman_fixed_48_16_t) e->dy;
-	    e->x -= nx * e->signdx;
-	}
-    }
-}
-
-/*
- * A private routine to initialize the multi-step
- * elements of an edge structure
- */
-static void
-_pixman_edge_multi_init (pixman_edge_t * e,
-                         int             n,
-                         pixman_fixed_t *stepx_p,
-                         pixman_fixed_t *dx_p)
-{
-    pixman_fixed_t stepx;
-    pixman_fixed_48_16_t ne;
-
-    ne = n * (pixman_fixed_48_16_t) e->dx;
-    stepx = n * e->stepx;
-
-    if (ne > 0)
-    {
-	pixman_fixed_48_16_t nx = ne / e->dy;
-	ne -= nx * (pixman_fixed_48_16_t)e->dy;
-	stepx += nx * e->signdx;
-    }
-
-    *dx_p = ne;
-    *stepx_p = stepx;
-}
-
-/*
- * Initialize one edge structure given the line endpoints and a
- * starting y value
- */
-PIXMAN_EXPORT void
-pixman_edge_init (pixman_edge_t *e,
-                  int            n,
-                  pixman_fixed_t y_start,
-                  pixman_fixed_t x_top,
-                  pixman_fixed_t y_top,
-                  pixman_fixed_t x_bot,
-                  pixman_fixed_t y_bot)
-{
-    pixman_fixed_t dx, dy;
-
-    e->x = x_top;
-    e->e = 0;
-    dx = x_bot - x_top;
-    dy = y_bot - y_top;
-    e->dy = dy;
-    e->dx = 0;
-
-    if (dy)
-    {
-	if (dx >= 0)
-	{
-	    e->signdx = 1;
-	    e->stepx = dx / dy;
-	    e->dx = dx % dy;
-	    e->e = -dy;
-	}
-	else
-	{
-	    e->signdx = -1;
-	    e->stepx = -(-dx / dy);
-	    e->dx = -dx % dy;
-	    e->e = 0;
-	}
-
-	_pixman_edge_multi_init (e, STEP_Y_SMALL (n),
-				 &e->stepx_small, &e->dx_small);
-
-	_pixman_edge_multi_init (e, STEP_Y_BIG (n),
-				 &e->stepx_big, &e->dx_big);
-    }
-    pixman_edge_step (e, y_start - y_top);
-}
-
-/*
- * Initialize one edge structure given a line, starting y value
- * and a pixel offset for the line
- */
-PIXMAN_EXPORT void
-pixman_line_fixed_edge_init (pixman_edge_t *            e,
-                             int                        n,
-                             pixman_fixed_t             y,
-                             const pixman_line_fixed_t *line,
-                             int                        x_off,
-                             int                        y_off)
-{
-    pixman_fixed_t x_off_fixed = pixman_int_to_fixed (x_off);
-    pixman_fixed_t y_off_fixed = pixman_int_to_fixed (y_off);
-    const pixman_point_fixed_t *top, *bot;
-
-    if (line->p1.y <= line->p2.y)
-    {
-	top = &line->p1;
-	bot = &line->p2;
-    }
-    else
-    {
-	top = &line->p2;
-	bot = &line->p1;
-    }
-    
-    pixman_edge_init (e, n, y,
-                      top->x + x_off_fixed,
-                      top->y + y_off_fixed,
-                      bot->x + x_off_fixed,
-                      bot->y + y_off_fixed);
-}
-
-PIXMAN_EXPORT void
-pixman_add_traps (pixman_image_t * image,
-                  int16_t          x_off,
-                  int16_t          y_off,
-                  int              ntrap,
-                  pixman_trap_t *  traps)
-{
-    int bpp;
-    int height;
-
-    pixman_fixed_t x_off_fixed;
-    pixman_fixed_t y_off_fixed;
-    pixman_edge_t l, r;
-    pixman_fixed_t t, b;
-
-    _pixman_image_validate (image);
-    
-    height = image->bits.height;
-    bpp = PIXMAN_FORMAT_BPP (image->bits.format);
-
-    x_off_fixed = pixman_int_to_fixed (x_off);
-    y_off_fixed = pixman_int_to_fixed (y_off);
-
-    while (ntrap--)
-    {
-	t = traps->top.y + y_off_fixed;
-	if (t < 0)
-	    t = 0;
-	t = pixman_sample_ceil_y (t, bpp);
-
-	b = traps->bot.y + y_off_fixed;
-	if (pixman_fixed_to_int (b) >= height)
-	    b = pixman_int_to_fixed (height) - 1;
-	b = pixman_sample_floor_y (b, bpp);
-
-	if (b >= t)
-	{
-	    /* initialize edge walkers */
-	    pixman_edge_init (&l, bpp, t,
-	                      traps->top.l + x_off_fixed,
-	                      traps->top.y + y_off_fixed,
-	                      traps->bot.l + x_off_fixed,
-	                      traps->bot.y + y_off_fixed);
-
-	    pixman_edge_init (&r, bpp, t,
-	                      traps->top.r + x_off_fixed,
-	                      traps->top.y + y_off_fixed,
-	                      traps->bot.r + x_off_fixed,
-	                      traps->bot.y + y_off_fixed);
-
-	    pixman_rasterize_edges (image, &l, &r, t, b);
-	}
-
-	traps++;
-    }
-}
-
-#if 0
-static void
-dump_image (pixman_image_t *image,
-            const char *    title)
-{
-    int i, j;
-
-    if (!image->type == BITS)
-	printf ("%s is not a regular image\n", title);
-
-    if (!image->bits.format == PIXMAN_a8)
-	printf ("%s is not an alpha mask\n", title);
-
-    printf ("\n\n\n%s: \n", title);
-
-    for (i = 0; i < image->bits.height; ++i)
-    {
-	uint8_t *line =
-	    (uint8_t *)&(image->bits.bits[i * image->bits.rowstride]);
-
-	for (j = 0; j < image->bits.width; ++j)
-	    printf ("%c", line[j] ? '#' : ' ');
-
-	printf ("\n");
-    }
-}
-#endif
-
-PIXMAN_EXPORT void
-pixman_add_trapezoids (pixman_image_t *          image,
-                       int16_t                   x_off,
-                       int                       y_off,
-                       int                       ntraps,
-                       const pixman_trapezoid_t *traps)
-{
-    int i;
-
-#if 0
-    dump_image (image, "before");
-#endif
-
-    for (i = 0; i < ntraps; ++i)
-    {
-	const pixman_trapezoid_t *trap = &(traps[i]);
-
-	if (!pixman_trapezoid_valid (trap))
-	    continue;
-
-	pixman_rasterize_trapezoid (image, trap, x_off, y_off);
-    }
-
-#if 0
-    dump_image (image, "after");
-#endif
-}
-
-PIXMAN_EXPORT void
-pixman_rasterize_trapezoid (pixman_image_t *          image,
-                            const pixman_trapezoid_t *trap,
-                            int                       x_off,
-                            int                       y_off)
-{
-    int bpp;
-    int height;
-
-    pixman_fixed_t y_off_fixed;
-    pixman_edge_t l, r;
-    pixman_fixed_t t, b;
-
-    return_if_fail (image->type == BITS);
-
-    _pixman_image_validate (image);
-    
-    if (!pixman_trapezoid_valid (trap))
-	return;
-
-    height = image->bits.height;
-    bpp = PIXMAN_FORMAT_BPP (image->bits.format);
-
-    y_off_fixed = pixman_int_to_fixed (y_off);
-
-    t = trap->top + y_off_fixed;
-    if (t < 0)
-	t = 0;
-    t = pixman_sample_ceil_y (t, bpp);
-
-    b = trap->bottom + y_off_fixed;
-    if (pixman_fixed_to_int (b) >= height)
-	b = pixman_int_to_fixed (height) - 1;
-    b = pixman_sample_floor_y (b, bpp);
-    
-    if (b >= t)
-    {
-	/* initialize edge walkers */
-	pixman_line_fixed_edge_init (&l, bpp, t, &trap->left, x_off, y_off);
-	pixman_line_fixed_edge_init (&r, bpp, t, &trap->right, x_off, y_off);
-
-	pixman_rasterize_edges (image, &l, &r, t, b);
-    }
-}
-
-/*
- * pixman_composite_trapezoids()
- *
- * All the trapezoids are conceptually rendered to an infinitely big image.
- * The (0, 0) coordinates of this image are then aligned with the (x, y)
- * coordinates of the source image, and then both images are aligned with
- * the (x, y) coordinates of the destination. Then, in principle, compositing
- * of these three images takes place across the entire destination.
- *
- * FIXME: However, there is currently a bug, where we restrict this compositing
- * to the bounding box of the trapezoids. This is incorrect for operators such
- * as SRC and IN where blank source pixels do have an effect on the destination.
- */
-PIXMAN_EXPORT void
-pixman_composite_trapezoids (pixman_op_t		op,
-			     pixman_image_t *		src,
-			     pixman_image_t *		dst,
-			     pixman_format_code_t	mask_format,
-			     int			x_src,
-			     int			y_src,
-			     int			x_dst,
-			     int			y_dst,
-			     int			n_traps,
-			     const pixman_trapezoid_t *	traps)
-{
-    int i;
-
-    if (n_traps <= 0)
-	return;
-
-    _pixman_image_validate (src);
-    _pixman_image_validate (dst);
-
-    if (op == PIXMAN_OP_ADD &&
-	(src->common.flags & FAST_PATH_IS_OPAQUE)		&&
-	(mask_format == dst->common.extended_format_code)	&&
-	!(dst->common.have_clip_region))
-    {
-	for (i = 0; i < n_traps; ++i)
-	{
-	    const pixman_trapezoid_t *trap = &(traps[i]);
-	    
-	    if (!pixman_trapezoid_valid (trap))
-		continue;
-	    
-	    pixman_rasterize_trapezoid (dst, trap, x_dst, y_dst);
-	}
-    }
-    else
-    {
-	pixman_image_t *tmp;
-	pixman_box32_t box;
-	
-	box.x1 = INT32_MAX;
-	box.y1 = INT32_MAX;
-	box.x2 = INT32_MIN;
-	box.y2 = INT32_MIN;
-	
-	for (i = 0; i < n_traps; ++i)
-	{
-	    const pixman_trapezoid_t *trap = &(traps[i]);
-	    int y1, y2;
-	    
-	    if (!pixman_trapezoid_valid (trap))
-		continue;
-	    
-	    y1 = pixman_fixed_to_int (trap->top);
-	    if (y1 < box.y1)
-		box.y1 = y1;
-	    
-	    y2 = pixman_fixed_to_int (pixman_fixed_ceil (trap->bottom));
-	    if (y2 > box.y2)
-		box.y2 = y2;
-	    
-#define EXTEND_MIN(x)							\
-	    if (pixman_fixed_to_int ((x)) < box.x1)			\
-		box.x1 = pixman_fixed_to_int ((x));
-#define EXTEND_MAX(x)							\
-	    if (pixman_fixed_to_int (pixman_fixed_ceil ((x))) > box.x2)	\
-		box.x2 = pixman_fixed_to_int (pixman_fixed_ceil ((x)));
-	    
-#define EXTEND(x)							\
-	    EXTEND_MIN(x);						\
-	    EXTEND_MAX(x);
-	    
-	    EXTEND(trap->left.p1.x);
-	    EXTEND(trap->left.p2.x);
-	    EXTEND(trap->right.p1.x);
-	    EXTEND(trap->right.p2.x);
-	}
-	
-	if (box.x1 >= box.x2 || box.y1 >= box.y2)
-	    return;
-	
-	tmp = pixman_image_create_bits (
-	    mask_format, box.x2 - box.x1, box.y2 - box.y1, NULL, -1);
-	
-	for (i = 0; i < n_traps; ++i)
-	{
-	    const pixman_trapezoid_t *trap = &(traps[i]);
-	    
-	    if (!pixman_trapezoid_valid (trap))
-		continue;
-	    
-	    pixman_rasterize_trapezoid (tmp, trap, - box.x1, - box.y1);
-	}
-	
-	pixman_image_composite (op, src, tmp, dst,
-				x_src + box.x1, y_src + box.y1,
-				0, 0,
-				x_dst + box.x1, y_dst + box.y1,
-				box.x2 - box.x1, box.y2 - box.y1);
-	
-	pixman_image_unref (tmp);
-    }
-}
-
-static int
-greater_y (const pixman_point_fixed_t *a, const pixman_point_fixed_t *b)
-{
-    if (a->y == b->y)
-	return a->x > b->x;
-    return a->y > b->y;
-}
-
-/*
- * Note that the definition of this function is a bit odd because
- * of the X coordinate space (y increasing downwards).
- */
-static int
-clockwise (const pixman_point_fixed_t *ref,
-	   const pixman_point_fixed_t *a,
-	   const pixman_point_fixed_t *b)
-{
-    pixman_point_fixed_t	ad, bd;
-
-    ad.x = a->x - ref->x;
-    ad.y = a->y - ref->y;
-    bd.x = b->x - ref->x;
-    bd.y = b->y - ref->y;
-
-    return ((pixman_fixed_32_32_t) bd.y * ad.x -
-	    (pixman_fixed_32_32_t) ad.y * bd.x) < 0;
-}
-
-static void
-triangle_to_trapezoids (const pixman_triangle_t *tri, pixman_trapezoid_t *traps)
-{
-    const pixman_point_fixed_t *top, *left, *right, *tmp;
-
-    top = &tri->p1;
-    left = &tri->p2;
-    right = &tri->p3;
-
-    if (greater_y (top, left))
-    {
-	tmp = left;
-	left = top;
-	top = tmp;
-    }
-
-    if (greater_y (top, right))
-    {
-	tmp = right;
-	right = top;
-	top = tmp;
-    }
-
-    if (clockwise (top, right, left))
-    {
-	tmp = right;
-	right = left;
-	left = tmp;
-    }
-    
-    /*
-     * Two cases:
-     *
-     *		+		+
-     *	       / \             / \
-     *	      /   \           /	  \
-     *	     /     +         +	   \
-     *      /    --           --    \
-     *     /   --               --   \
-     *    / ---                   --- \
-     *	 +--                         --+
-     */
-
-    traps->top = top->y;
-    traps->left.p1 = *top;
-    traps->left.p2 = *left;
-    traps->right.p1 = *top;
-    traps->right.p2 = *right;
-
-    if (right->y < left->y)
-	traps->bottom = right->y;
-    else
-	traps->bottom = left->y;
-
-    traps++;
-
-    *traps = *(traps - 1);
-    
-    if (right->y < left->y)
-    {
-	traps->top = right->y;
-	traps->bottom = left->y;
-	traps->right.p1 = *right;
-	traps->right.p2 = *left;
-    }
-    else
-    {
-	traps->top = left->y;
-	traps->bottom = right->y;
-	traps->left.p1 = *left;
-	traps->left.p2 = *right;
-    }
-}
-
-static pixman_trapezoid_t *
-convert_triangles (int n_tris, const pixman_triangle_t *tris)
-{
-    pixman_trapezoid_t *traps;
-    int i;
-
-    if (n_tris <= 0)
-	return NULL;
-    
-    traps = pixman_malloc_ab (n_tris, 2 * sizeof (pixman_trapezoid_t));
-    if (!traps)
-	return NULL;
-
-    for (i = 0; i < n_tris; ++i)
-	triangle_to_trapezoids (&(tris[i]), traps + 2 * i);
-
-    return traps;
-}
-
-PIXMAN_EXPORT void
-pixman_composite_triangles (pixman_op_t			op,
-			    pixman_image_t *		src,
-			    pixman_image_t *		dst,
-			    pixman_format_code_t	mask_format,
-			    int				x_src,
-			    int				y_src,
-			    int				x_dst,
-			    int				y_dst,
-			    int				n_tris,
-			    const pixman_triangle_t *	tris)
-{
-    pixman_trapezoid_t *traps;
-
-    if ((traps = convert_triangles (n_tris, tris)))
-    {
-	pixman_composite_trapezoids (op, src, dst, mask_format,
-				     x_src, y_src, x_dst, y_dst,
-				     n_tris * 2, traps);
-	
-	free (traps);
-    }
-}
-
-PIXMAN_EXPORT void
-pixman_add_triangles (pixman_image_t          *image,
-		      int32_t	               x_off,
-		      int32_t	               y_off,
-		      int	               n_tris,
-		      const pixman_triangle_t *tris)
-{
-    pixman_trapezoid_t *traps;
-
-    if ((traps = convert_triangles (n_tris, tris)))
-    {
-	pixman_add_trapezoids (image, x_off, y_off,
-			       n_tris * 2, traps);
-
-	free (traps);
-    }
-}
+/*
+ * Copyright © 2002 Keith Packard, member of The XFree86 Project, Inc.
+ * Copyright © 2004 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "pixman-private.h"
+
+/*
+ * Compute the smallest value greater than or equal to y which is on a
+ * grid row.
+ */
+
+PIXMAN_EXPORT pixman_fixed_t
+pixman_sample_ceil_y (pixman_fixed_t y, int n)
+{
+    pixman_fixed_t f = pixman_fixed_frac (y);
+    pixman_fixed_t i = pixman_fixed_floor (y);
+
+    f = DIV (f - Y_FRAC_FIRST (n) + (STEP_Y_SMALL (n) - pixman_fixed_e), STEP_Y_SMALL (n)) * STEP_Y_SMALL (n) +
+	Y_FRAC_FIRST (n);
+    
+    if (f > Y_FRAC_LAST (n))
+    {
+	if (pixman_fixed_to_int (i) == 0x7fff)
+	{
+	    f = 0xffff; /* saturate */
+	}
+	else
+	{
+	    f = Y_FRAC_FIRST (n);
+	    i += pixman_fixed_1;
+	}
+    }
+    return (i | f);
+}
+
+/*
+ * Compute the largest value strictly less than y which is on a
+ * grid row.
+ */
+PIXMAN_EXPORT pixman_fixed_t
+pixman_sample_floor_y (pixman_fixed_t y,
+                       int            n)
+{
+    pixman_fixed_t f = pixman_fixed_frac (y);
+    pixman_fixed_t i = pixman_fixed_floor (y);
+
+    f = DIV (f - pixman_fixed_e - Y_FRAC_FIRST (n), STEP_Y_SMALL (n)) * STEP_Y_SMALL (n) +
+	Y_FRAC_FIRST (n);
+
+    if (f < Y_FRAC_FIRST (n))
+    {
+	if (pixman_fixed_to_int (i) == 0x8000)
+	{
+	    f = 0; /* saturate */
+	}
+	else
+	{
+	    f = Y_FRAC_LAST (n);
+	    i -= pixman_fixed_1;
+	}
+    }
+    return (i | f);
+}
+
+/*
+ * Step an edge by any amount (including negative values)
+ */
+PIXMAN_EXPORT void
+pixman_edge_step (pixman_edge_t *e,
+                  int            n)
+{
+    pixman_fixed_48_16_t ne;
+
+    e->x += n * e->stepx;
+
+    ne = e->e + n * (pixman_fixed_48_16_t) e->dx;
+
+    if (n >= 0)
+    {
+	if (ne > 0)
+	{
+	    int nx = (ne + e->dy - 1) / e->dy;
+	    e->e = ne - nx * (pixman_fixed_48_16_t) e->dy;
+	    e->x += nx * e->signdx;
+	}
+    }
+    else
+    {
+	if (ne <= -e->dy)
+	{
+	    int nx = (-ne) / e->dy;
+	    e->e = ne + nx * (pixman_fixed_48_16_t) e->dy;
+	    e->x -= nx * e->signdx;
+	}
+    }
+}
+
+/*
+ * A private routine to initialize the multi-step
+ * elements of an edge structure
+ */
+static void
+_pixman_edge_multi_init (pixman_edge_t * e,
+                         int             n,
+                         pixman_fixed_t *stepx_p,
+                         pixman_fixed_t *dx_p)
+{
+    pixman_fixed_t stepx;
+    pixman_fixed_48_16_t ne;
+
+    ne = n * (pixman_fixed_48_16_t) e->dx;
+    stepx = n * e->stepx;
+
+    if (ne > 0)
+    {
+	pixman_fixed_48_16_t nx = ne / e->dy;
+	ne -= nx * (pixman_fixed_48_16_t)e->dy;
+	stepx += nx * e->signdx;
+    }
+
+    *dx_p = ne;
+    *stepx_p = stepx;
+}
+
+/*
+ * Initialize one edge structure given the line endpoints and a
+ * starting y value
+ */
+PIXMAN_EXPORT void
+pixman_edge_init (pixman_edge_t *e,
+                  int            n,
+                  pixman_fixed_t y_start,
+                  pixman_fixed_t x_top,
+                  pixman_fixed_t y_top,
+                  pixman_fixed_t x_bot,
+                  pixman_fixed_t y_bot)
+{
+    pixman_fixed_t dx, dy;
+
+    e->x = x_top;
+    e->e = 0;
+    dx = x_bot - x_top;
+    dy = y_bot - y_top;
+    e->dy = dy;
+    e->dx = 0;
+
+    if (dy)
+    {
+	if (dx >= 0)
+	{
+	    e->signdx = 1;
+	    e->stepx = dx / dy;
+	    e->dx = dx % dy;
+	    e->e = -dy;
+	}
+	else
+	{
+	    e->signdx = -1;
+	    e->stepx = -(-dx / dy);
+	    e->dx = -dx % dy;
+	    e->e = 0;
+	}
+
+	_pixman_edge_multi_init (e, STEP_Y_SMALL (n),
+				 &e->stepx_small, &e->dx_small);
+
+	_pixman_edge_multi_init (e, STEP_Y_BIG (n),
+				 &e->stepx_big, &e->dx_big);
+    }
+    pixman_edge_step (e, y_start - y_top);
+}
+
+/*
+ * Initialize one edge structure given a line, starting y value
+ * and a pixel offset for the line
+ */
+PIXMAN_EXPORT void
+pixman_line_fixed_edge_init (pixman_edge_t *            e,
+                             int                        n,
+                             pixman_fixed_t             y,
+                             const pixman_line_fixed_t *line,
+                             int                        x_off,
+                             int                        y_off)
+{
+    pixman_fixed_t x_off_fixed = pixman_int_to_fixed (x_off);
+    pixman_fixed_t y_off_fixed = pixman_int_to_fixed (y_off);
+    const pixman_point_fixed_t *top, *bot;
+
+    if (line->p1.y <= line->p2.y)
+    {
+	top = &line->p1;
+	bot = &line->p2;
+    }
+    else
+    {
+	top = &line->p2;
+	bot = &line->p1;
+    }
+    
+    pixman_edge_init (e, n, y,
+                      top->x + x_off_fixed,
+                      top->y + y_off_fixed,
+                      bot->x + x_off_fixed,
+                      bot->y + y_off_fixed);
+}
+
+PIXMAN_EXPORT void
+pixman_add_traps (pixman_image_t * image,
+                  int16_t          x_off,
+                  int16_t          y_off,
+                  int              ntrap,
+                  pixman_trap_t *  traps)
+{
+    int bpp;
+    int height;
+
+    pixman_fixed_t x_off_fixed;
+    pixman_fixed_t y_off_fixed;
+    pixman_edge_t l, r;
+    pixman_fixed_t t, b;
+
+    _pixman_image_validate (image);
+    
+    height = image->bits.height;
+    bpp = PIXMAN_FORMAT_BPP (image->bits.format);
+
+    x_off_fixed = pixman_int_to_fixed (x_off);
+    y_off_fixed = pixman_int_to_fixed (y_off);
+
+    while (ntrap--)
+    {
+	t = traps->top.y + y_off_fixed;
+	if (t < 0)
+	    t = 0;
+	t = pixman_sample_ceil_y (t, bpp);
+
+	b = traps->bot.y + y_off_fixed;
+	if (pixman_fixed_to_int (b) >= height)
+	    b = pixman_int_to_fixed (height) - 1;
+	b = pixman_sample_floor_y (b, bpp);
+
+	if (b >= t)
+	{
+	    /* initialize edge walkers */
+	    pixman_edge_init (&l, bpp, t,
+	                      traps->top.l + x_off_fixed,
+	                      traps->top.y + y_off_fixed,
+	                      traps->bot.l + x_off_fixed,
+	                      traps->bot.y + y_off_fixed);
+
+	    pixman_edge_init (&r, bpp, t,
+	                      traps->top.r + x_off_fixed,
+	                      traps->top.y + y_off_fixed,
+	                      traps->bot.r + x_off_fixed,
+	                      traps->bot.y + y_off_fixed);
+
+	    pixman_rasterize_edges (image, &l, &r, t, b);
+	}
+
+	traps++;
+    }
+}
+
+#if 0
+static void
+dump_image (pixman_image_t *image,
+            const char *    title)
+{
+    int i, j;
+
+    if (!image->type == BITS)
+	printf ("%s is not a regular image\n", title);
+
+    if (!image->bits.format == PIXMAN_a8)
+	printf ("%s is not an alpha mask\n", title);
+
+    printf ("\n\n\n%s: \n", title);
+
+    for (i = 0; i < image->bits.height; ++i)
+    {
+	uint8_t *line =
+	    (uint8_t *)&(image->bits.bits[i * image->bits.rowstride]);
+
+	for (j = 0; j < image->bits.width; ++j)
+	    printf ("%c", line[j] ? '#' : ' ');
+
+	printf ("\n");
+    }
+}
+#endif
+
+PIXMAN_EXPORT void
+pixman_add_trapezoids (pixman_image_t *          image,
+                       int16_t                   x_off,
+                       int                       y_off,
+                       int                       ntraps,
+                       const pixman_trapezoid_t *traps)
+{
+    int i;
+
+#if 0
+    dump_image (image, "before");
+#endif
+
+    for (i = 0; i < ntraps; ++i)
+    {
+	const pixman_trapezoid_t *trap = &(traps[i]);
+
+	if (!pixman_trapezoid_valid (trap))
+	    continue;
+
+	pixman_rasterize_trapezoid (image, trap, x_off, y_off);
+    }
+
+#if 0
+    dump_image (image, "after");
+#endif
+}
+
+PIXMAN_EXPORT void
+pixman_rasterize_trapezoid (pixman_image_t *          image,
+                            const pixman_trapezoid_t *trap,
+                            int                       x_off,
+                            int                       y_off)
+{
+    int bpp;
+    int height;
+
+    pixman_fixed_t y_off_fixed;
+    pixman_edge_t l, r;
+    pixman_fixed_t t, b;
+
+    return_if_fail (image->type == BITS);
+
+    _pixman_image_validate (image);
+    
+    if (!pixman_trapezoid_valid (trap))
+	return;
+
+    height = image->bits.height;
+    bpp = PIXMAN_FORMAT_BPP (image->bits.format);
+
+    y_off_fixed = pixman_int_to_fixed (y_off);
+
+    t = trap->top + y_off_fixed;
+    if (t < 0)
+	t = 0;
+    t = pixman_sample_ceil_y (t, bpp);
+
+    b = trap->bottom + y_off_fixed;
+    if (pixman_fixed_to_int (b) >= height)
+	b = pixman_int_to_fixed (height) - 1;
+    b = pixman_sample_floor_y (b, bpp);
+    
+    if (b >= t)
+    {
+	/* initialize edge walkers */
+	pixman_line_fixed_edge_init (&l, bpp, t, &trap->left, x_off, y_off);
+	pixman_line_fixed_edge_init (&r, bpp, t, &trap->right, x_off, y_off);
+
+	pixman_rasterize_edges (image, &l, &r, t, b);
+    }
+}
+
+/*
+ * pixman_composite_trapezoids()
+ *
+ * All the trapezoids are conceptually rendered to an infinitely big image.
+ * The (0, 0) coordinates of this image are then aligned with the (x, y)
+ * coordinates of the source image, and then both images are aligned with
+ * the (x, y) coordinates of the destination. Then, in principle, compositing
+ * of these three images takes place across the entire destination.
+ *
+ * FIXME: However, there is currently a bug, where we restrict this compositing
+ * to the bounding box of the trapezoids. This is incorrect for operators such
+ * as SRC and IN where blank source pixels do have an effect on the destination.
+ */
+PIXMAN_EXPORT void
+pixman_composite_trapezoids (pixman_op_t		op,
+			     pixman_image_t *		src,
+			     pixman_image_t *		dst,
+			     pixman_format_code_t	mask_format,
+			     int			x_src,
+			     int			y_src,
+			     int			x_dst,
+			     int			y_dst,
+			     int			n_traps,
+			     const pixman_trapezoid_t *	traps)
+{
+    int i;
+
+    if (n_traps <= 0)
+	return;
+
+    _pixman_image_validate (src);
+    _pixman_image_validate (dst);
+
+    if (op == PIXMAN_OP_ADD &&
+	(src->common.flags & FAST_PATH_IS_OPAQUE)		&&
+	(mask_format == dst->common.extended_format_code)	&&
+	!(dst->common.have_clip_region))
+    {
+	for (i = 0; i < n_traps; ++i)
+	{
+	    const pixman_trapezoid_t *trap = &(traps[i]);
+	    
+	    if (!pixman_trapezoid_valid (trap))
+		continue;
+	    
+	    pixman_rasterize_trapezoid (dst, trap, x_dst, y_dst);
+	}
+    }
+    else
+    {
+	pixman_image_t *tmp;
+	pixman_box32_t box;
+	
+	box.x1 = INT32_MAX;
+	box.y1 = INT32_MAX;
+	box.x2 = INT32_MIN;
+	box.y2 = INT32_MIN;
+	
+	for (i = 0; i < n_traps; ++i)
+	{
+	    const pixman_trapezoid_t *trap = &(traps[i]);
+	    int y1, y2;
+	    
+	    if (!pixman_trapezoid_valid (trap))
+		continue;
+	    
+	    y1 = pixman_fixed_to_int (trap->top);
+	    if (y1 < box.y1)
+		box.y1 = y1;
+	    
+	    y2 = pixman_fixed_to_int (pixman_fixed_ceil (trap->bottom));
+	    if (y2 > box.y2)
+		box.y2 = y2;
+	    
+#define EXTEND_MIN(x)							\
+	    if (pixman_fixed_to_int ((x)) < box.x1)			\
+		box.x1 = pixman_fixed_to_int ((x));
+#define EXTEND_MAX(x)							\
+	    if (pixman_fixed_to_int (pixman_fixed_ceil ((x))) > box.x2)	\
+		box.x2 = pixman_fixed_to_int (pixman_fixed_ceil ((x)));
+	    
+#define EXTEND(x)							\
+	    EXTEND_MIN(x);						\
+	    EXTEND_MAX(x);
+	    
+	    EXTEND(trap->left.p1.x);
+	    EXTEND(trap->left.p2.x);
+	    EXTEND(trap->right.p1.x);
+	    EXTEND(trap->right.p2.x);
+	}
+	
+	if (box.x1 >= box.x2 || box.y1 >= box.y2)
+	    return;
+	
+	tmp = pixman_image_create_bits (
+	    mask_format, box.x2 - box.x1, box.y2 - box.y1, NULL, -1);
+	
+	for (i = 0; i < n_traps; ++i)
+	{
+	    const pixman_trapezoid_t *trap = &(traps[i]);
+	    
+	    if (!pixman_trapezoid_valid (trap))
+		continue;
+	    
+	    pixman_rasterize_trapezoid (tmp, trap, - box.x1, - box.y1);
+	}
+	
+	pixman_image_composite (op, src, tmp, dst,
+				x_src + box.x1, y_src + box.y1,
+				0, 0,
+				x_dst + box.x1, y_dst + box.y1,
+				box.x2 - box.x1, box.y2 - box.y1);
+	
+	pixman_image_unref (tmp);
+    }
+}
+
+static int
+greater_y (const pixman_point_fixed_t *a, const pixman_point_fixed_t *b)
+{
+    if (a->y == b->y)
+	return a->x > b->x;
+    return a->y > b->y;
+}
+
+/*
+ * Note that the definition of this function is a bit odd because
+ * of the X coordinate space (y increasing downwards).
+ */
+static int
+clockwise (const pixman_point_fixed_t *ref,
+	   const pixman_point_fixed_t *a,
+	   const pixman_point_fixed_t *b)
+{
+    pixman_point_fixed_t	ad, bd;
+
+    ad.x = a->x - ref->x;
+    ad.y = a->y - ref->y;
+    bd.x = b->x - ref->x;
+    bd.y = b->y - ref->y;
+
+    return ((pixman_fixed_32_32_t) bd.y * ad.x -
+	    (pixman_fixed_32_32_t) ad.y * bd.x) < 0;
+}
+
+static void
+triangle_to_trapezoids (const pixman_triangle_t *tri, pixman_trapezoid_t *traps)
+{
+    const pixman_point_fixed_t *top, *left, *right, *tmp;
+
+    top = &tri->p1;
+    left = &tri->p2;
+    right = &tri->p3;
+
+    if (greater_y (top, left))
+    {
+	tmp = left;
+	left = top;
+	top = tmp;
+    }
+
+    if (greater_y (top, right))
+    {
+	tmp = right;
+	right = top;
+	top = tmp;
+    }
+
+    if (clockwise (top, right, left))
+    {
+	tmp = right;
+	right = left;
+	left = tmp;
+    }
+    
+    /*
+     * Two cases:
+     *
+     *		+		+
+     *	       / \             / \
+     *	      /   \           /	  \
+     *	     /     +         +	   \
+     *      /    --           --    \
+     *     /   --               --   \
+     *    / ---                   --- \
+     *	 +--                         --+
+     */
+
+    traps->top = top->y;
+    traps->left.p1 = *top;
+    traps->left.p2 = *left;
+    traps->right.p1 = *top;
+    traps->right.p2 = *right;
+
+    if (right->y < left->y)
+	traps->bottom = right->y;
+    else
+	traps->bottom = left->y;
+
+    traps++;
+
+    *traps = *(traps - 1);
+    
+    if (right->y < left->y)
+    {
+	traps->top = right->y;
+	traps->bottom = left->y;
+	traps->right.p1 = *right;
+	traps->right.p2 = *left;
+    }
+    else
+    {
+	traps->top = left->y;
+	traps->bottom = right->y;
+	traps->left.p1 = *left;
+	traps->left.p2 = *right;
+    }
+}
+
+static pixman_trapezoid_t *
+convert_triangles (int n_tris, const pixman_triangle_t *tris)
+{
+    pixman_trapezoid_t *traps;
+    int i;
+
+    if (n_tris <= 0)
+	return NULL;
+    
+    traps = pixman_malloc_ab (n_tris, 2 * sizeof (pixman_trapezoid_t));
+    if (!traps)
+	return NULL;
+
+    for (i = 0; i < n_tris; ++i)
+	triangle_to_trapezoids (&(tris[i]), traps + 2 * i);
+
+    return traps;
+}
+
+PIXMAN_EXPORT void
+pixman_composite_triangles (pixman_op_t			op,
+			    pixman_image_t *		src,
+			    pixman_image_t *		dst,
+			    pixman_format_code_t	mask_format,
+			    int				x_src,
+			    int				y_src,
+			    int				x_dst,
+			    int				y_dst,
+			    int				n_tris,
+			    const pixman_triangle_t *	tris)
+{
+    pixman_trapezoid_t *traps;
+
+    if ((traps = convert_triangles (n_tris, tris)))
+    {
+	pixman_composite_trapezoids (op, src, dst, mask_format,
+				     x_src, y_src, x_dst, y_dst,
+				     n_tris * 2, traps);
+	
+	free (traps);
+    }
+}
+
+PIXMAN_EXPORT void
+pixman_add_triangles (pixman_image_t          *image,
+		      int32_t	               x_off,
+		      int32_t	               y_off,
+		      int	               n_tris,
+		      const pixman_triangle_t *tris)
+{
+    pixman_trapezoid_t *traps;
+
+    if ((traps = convert_triangles (n_tris, tris)))
+    {
+	pixman_add_trapezoids (image, x_off, y_off,
+			       n_tris * 2, traps);
+
+	free (traps);
+    }
+}
diff --git a/pixman/pixman/pixman.h b/pixman/pixman/pixman.h
index fb281b0c9..c57092a4c 100644
--- a/pixman/pixman/pixman.h
+++ b/pixman/pixman/pixman.h
@@ -1,990 +1,990 @@
-/***********************************************************
-
-Copyright 1987, 1998  The Open Group
-
-Permission to use, copy, modify, distribute, and sell this software and its
-documentation for any purpose is hereby granted without fee, provided that
-the above copyright notice appear in all copies and that both that
-copyright notice and this permission notice appear in supporting
-documentation.
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
-AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-Except as contained in this notice, the name of The Open Group shall not be
-used in advertising or otherwise to promote the sale, use or other dealings
-in this Software without prior written authorization from The Open Group.
-
-Copyright 1987 by Digital Equipment Corporation, Maynard, Massachusetts.
-
-                        All Rights Reserved
-
-Permission to use, copy, modify, and distribute this software and its
-documentation for any purpose and without fee is hereby granted,
-provided that the above copyright notice appear in all copies and that
-both that copyright notice and this permission notice appear in
-supporting documentation, and that the name of Digital not be
-used in advertising or publicity pertaining to distribution of the
-software without specific, written prior permission.
-
-DIGITAL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
-ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL
-DIGITAL BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR
-ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
-WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
-ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
-SOFTWARE.
-
-******************************************************************/
-/*
- * Copyright © 1998, 2004 Keith Packard
- * Copyright   2007 Red Hat, Inc.
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Keith Packard not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission.  Keith Packard makes no
- * representations about the suitability of this software for any purpose.  It
- * is provided "as is" without express or implied warranty.
- *
- * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
- * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
- * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR
- * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
- * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
- * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
- * PERFORMANCE OF THIS SOFTWARE.
- */
-
-#ifndef PIXMAN_H__
-#define PIXMAN_H__
-
-#include <pixman-version.h>
-
-#ifdef  __cplusplus
-#define PIXMAN_BEGIN_DECLS extern "C" {
-#define PIXMAN_END_DECLS }
-#else
-#define PIXMAN_BEGIN_DECLS
-#define PIXMAN_END_DECLS
-#endif
-
-PIXMAN_BEGIN_DECLS
-
-/*
- * Standard integers
- */
-
-#if !defined (PIXMAN_DONT_DEFINE_STDINT)
-
-#if defined (_SVR4) || defined (SVR4) || defined (__OpenBSD__) || defined (_sgi) || defined (__sun) || defined (sun) || defined (__digital__) || defined (__HP_cc)
-#  include <inttypes.h>
-/* VS 2010 (_MSC_VER 1600) has stdint.h */
-#elif defined (_MSC_VER) && _MSC_VER < 1600
-typedef __int8 int8_t;
-typedef unsigned __int8 uint8_t;
-typedef __int16 int16_t;
-typedef unsigned __int16 uint16_t;
-typedef __int32 int32_t;
-typedef unsigned __int32 uint32_t;
-typedef __int64 int64_t;
-typedef unsigned __int64 uint64_t;
-#elif defined (_AIX)
-#  include <sys/inttypes.h>
-#else
-#  include <stdint.h>
-#endif
-
-#endif
-
-/*
- * Boolean
- */
-typedef int pixman_bool_t;
-
-/*
- * Fixpoint numbers
- */
-typedef int64_t			pixman_fixed_32_32_t;
-typedef pixman_fixed_32_32_t	pixman_fixed_48_16_t;
-typedef uint32_t		pixman_fixed_1_31_t;
-typedef uint32_t		pixman_fixed_1_16_t;
-typedef int32_t			pixman_fixed_16_16_t;
-typedef pixman_fixed_16_16_t	pixman_fixed_t;
-
-#define pixman_fixed_e			((pixman_fixed_t) 1)
-#define pixman_fixed_1			(pixman_int_to_fixed(1))
-#define pixman_fixed_1_minus_e		(pixman_fixed_1 - pixman_fixed_e)
-#define pixman_fixed_minus_1		(pixman_int_to_fixed(-1))
-#define pixman_fixed_to_int(f)		((int) ((f) >> 16))
-#define pixman_int_to_fixed(i)		((pixman_fixed_t) ((i) << 16))
-#define pixman_fixed_to_double(f)	(double) ((f) / (double) pixman_fixed_1)
-#define pixman_double_to_fixed(d)	((pixman_fixed_t) ((d) * 65536.0))
-#define pixman_fixed_frac(f)		((f) & pixman_fixed_1_minus_e)
-#define pixman_fixed_floor(f)		((f) & ~pixman_fixed_1_minus_e)
-#define pixman_fixed_ceil(f)		pixman_fixed_floor ((f) + pixman_fixed_1_minus_e)
-#define pixman_fixed_fraction(f)	((f) & pixman_fixed_1_minus_e)
-#define pixman_fixed_mod_2(f)		((f) & (pixman_fixed1 | pixman_fixed_1_minus_e))
-#define pixman_max_fixed_48_16		((pixman_fixed_48_16_t) 0x7fffffff)
-#define pixman_min_fixed_48_16		(-((pixman_fixed_48_16_t) 1 << 31))
-
-/*
- * Misc structs
- */
-typedef struct pixman_color pixman_color_t;
-typedef struct pixman_point_fixed pixman_point_fixed_t;
-typedef struct pixman_line_fixed pixman_line_fixed_t;
-typedef struct pixman_vector pixman_vector_t;
-typedef struct pixman_transform pixman_transform_t;
-
-struct pixman_color
-{
-    uint16_t	red;
-    uint16_t    green;
-    uint16_t    blue;
-    uint16_t    alpha;
-};
-
-struct pixman_point_fixed
-{
-    pixman_fixed_t	x;
-    pixman_fixed_t	y;
-};
-
-struct pixman_line_fixed
-{
-    pixman_point_fixed_t	p1, p2;
-};
-
-/*
- * Fixed point matrices
- */
-
-struct pixman_vector
-{
-    pixman_fixed_t	vector[3];
-};
-
-struct pixman_transform
-{
-    pixman_fixed_t	matrix[3][3];
-};
-
-/* forward declaration (sorry) */
-struct pixman_box16;
-typedef  union pixman_image		pixman_image_t;
-
-void          pixman_transform_init_identity    (struct pixman_transform       *matrix);
-pixman_bool_t pixman_transform_point_3d         (const struct pixman_transform *transform,
-						 struct pixman_vector          *vector);
-pixman_bool_t pixman_transform_point            (const struct pixman_transform *transform,
-						 struct pixman_vector          *vector);
-pixman_bool_t pixman_transform_multiply         (struct pixman_transform       *dst,
-						 const struct pixman_transform *l,
-						 const struct pixman_transform *r);
-void          pixman_transform_init_scale       (struct pixman_transform       *t,
-						 pixman_fixed_t                 sx,
-						 pixman_fixed_t                 sy);
-pixman_bool_t pixman_transform_scale            (struct pixman_transform       *forward,
-						 struct pixman_transform       *reverse,
-						 pixman_fixed_t                 sx,
-						 pixman_fixed_t                 sy);
-void          pixman_transform_init_rotate      (struct pixman_transform       *t,
-						 pixman_fixed_t                 cos,
-						 pixman_fixed_t                 sin);
-pixman_bool_t pixman_transform_rotate           (struct pixman_transform       *forward,
-						 struct pixman_transform       *reverse,
-						 pixman_fixed_t                 c,
-						 pixman_fixed_t                 s);
-void          pixman_transform_init_translate   (struct pixman_transform       *t,
-						 pixman_fixed_t                 tx,
-						 pixman_fixed_t                 ty);
-pixman_bool_t pixman_transform_translate        (struct pixman_transform       *forward,
-						 struct pixman_transform       *reverse,
-						 pixman_fixed_t                 tx,
-						 pixman_fixed_t                 ty);
-pixman_bool_t pixman_transform_bounds           (const struct pixman_transform *matrix,
-						 struct pixman_box16           *b);
-pixman_bool_t pixman_transform_invert           (struct pixman_transform       *dst,
-						 const struct pixman_transform *src);
-pixman_bool_t pixman_transform_is_identity      (const struct pixman_transform *t);
-pixman_bool_t pixman_transform_is_scale         (const struct pixman_transform *t);
-pixman_bool_t pixman_transform_is_int_translate (const struct pixman_transform *t);
-pixman_bool_t pixman_transform_is_inverse       (const struct pixman_transform *a,
-						 const struct pixman_transform *b);
-
-/*
- * Floating point matrices
- */
-struct pixman_f_vector
-{
-    double  v[3];
-};
-
-struct pixman_f_transform
-{
-    double  m[3][3];
-};
-
-pixman_bool_t pixman_transform_from_pixman_f_transform (struct pixman_transform         *t,
-							const struct pixman_f_transform *ft);
-void          pixman_f_transform_from_pixman_transform (struct pixman_f_transform       *ft,
-							const struct pixman_transform   *t);
-pixman_bool_t pixman_f_transform_invert                (struct pixman_f_transform       *dst,
-							const struct pixman_f_transform *src);
-pixman_bool_t pixman_f_transform_point                 (const struct pixman_f_transform *t,
-							struct pixman_f_vector          *v);
-void          pixman_f_transform_point_3d              (const struct pixman_f_transform *t,
-							struct pixman_f_vector          *v);
-void          pixman_f_transform_multiply              (struct pixman_f_transform       *dst,
-							const struct pixman_f_transform *l,
-							const struct pixman_f_transform *r);
-void          pixman_f_transform_init_scale            (struct pixman_f_transform       *t,
-							double                           sx,
-							double                           sy);
-pixman_bool_t pixman_f_transform_scale                 (struct pixman_f_transform       *forward,
-							struct pixman_f_transform       *reverse,
-							double                           sx,
-							double                           sy);
-void          pixman_f_transform_init_rotate           (struct pixman_f_transform       *t,
-							double                           cos,
-							double                           sin);
-pixman_bool_t pixman_f_transform_rotate                (struct pixman_f_transform       *forward,
-							struct pixman_f_transform       *reverse,
-							double                           c,
-							double                           s);
-void          pixman_f_transform_init_translate        (struct pixman_f_transform       *t,
-							double                           tx,
-							double                           ty);
-pixman_bool_t pixman_f_transform_translate             (struct pixman_f_transform       *forward,
-							struct pixman_f_transform       *reverse,
-							double                           tx,
-							double                           ty);
-pixman_bool_t pixman_f_transform_bounds                (const struct pixman_f_transform *t,
-							struct pixman_box16             *b);
-void          pixman_f_transform_init_identity         (struct pixman_f_transform       *t);
-
-typedef enum
-{
-    PIXMAN_REPEAT_NONE,
-    PIXMAN_REPEAT_NORMAL,
-    PIXMAN_REPEAT_PAD,
-    PIXMAN_REPEAT_REFLECT
-} pixman_repeat_t;
-
-typedef enum
-{
-    PIXMAN_FILTER_FAST,
-    PIXMAN_FILTER_GOOD,
-    PIXMAN_FILTER_BEST,
-    PIXMAN_FILTER_NEAREST,
-    PIXMAN_FILTER_BILINEAR,
-    PIXMAN_FILTER_CONVOLUTION
-} pixman_filter_t;
-
-typedef enum
-{
-    PIXMAN_OP_CLEAR			= 0x00,
-    PIXMAN_OP_SRC			= 0x01,
-    PIXMAN_OP_DST			= 0x02,
-    PIXMAN_OP_OVER			= 0x03,
-    PIXMAN_OP_OVER_REVERSE		= 0x04,
-    PIXMAN_OP_IN			= 0x05,
-    PIXMAN_OP_IN_REVERSE		= 0x06,
-    PIXMAN_OP_OUT			= 0x07,
-    PIXMAN_OP_OUT_REVERSE		= 0x08,
-    PIXMAN_OP_ATOP			= 0x09,
-    PIXMAN_OP_ATOP_REVERSE		= 0x0a,
-    PIXMAN_OP_XOR			= 0x0b,
-    PIXMAN_OP_ADD			= 0x0c,
-    PIXMAN_OP_SATURATE			= 0x0d,
-
-    PIXMAN_OP_DISJOINT_CLEAR		= 0x10,
-    PIXMAN_OP_DISJOINT_SRC		= 0x11,
-    PIXMAN_OP_DISJOINT_DST		= 0x12,
-    PIXMAN_OP_DISJOINT_OVER		= 0x13,
-    PIXMAN_OP_DISJOINT_OVER_REVERSE	= 0x14,
-    PIXMAN_OP_DISJOINT_IN		= 0x15,
-    PIXMAN_OP_DISJOINT_IN_REVERSE	= 0x16,
-    PIXMAN_OP_DISJOINT_OUT		= 0x17,
-    PIXMAN_OP_DISJOINT_OUT_REVERSE	= 0x18,
-    PIXMAN_OP_DISJOINT_ATOP		= 0x19,
-    PIXMAN_OP_DISJOINT_ATOP_REVERSE	= 0x1a,
-    PIXMAN_OP_DISJOINT_XOR		= 0x1b,
-
-    PIXMAN_OP_CONJOINT_CLEAR		= 0x20,
-    PIXMAN_OP_CONJOINT_SRC		= 0x21,
-    PIXMAN_OP_CONJOINT_DST		= 0x22,
-    PIXMAN_OP_CONJOINT_OVER		= 0x23,
-    PIXMAN_OP_CONJOINT_OVER_REVERSE	= 0x24,
-    PIXMAN_OP_CONJOINT_IN		= 0x25,
-    PIXMAN_OP_CONJOINT_IN_REVERSE	= 0x26,
-    PIXMAN_OP_CONJOINT_OUT		= 0x27,
-    PIXMAN_OP_CONJOINT_OUT_REVERSE	= 0x28,
-    PIXMAN_OP_CONJOINT_ATOP		= 0x29,
-    PIXMAN_OP_CONJOINT_ATOP_REVERSE	= 0x2a,
-    PIXMAN_OP_CONJOINT_XOR		= 0x2b,
-
-    PIXMAN_OP_MULTIPLY                  = 0x30,
-    PIXMAN_OP_SCREEN                    = 0x31,
-    PIXMAN_OP_OVERLAY                   = 0x32,
-    PIXMAN_OP_DARKEN                    = 0x33,
-    PIXMAN_OP_LIGHTEN                   = 0x34,
-    PIXMAN_OP_COLOR_DODGE               = 0x35,
-    PIXMAN_OP_COLOR_BURN                = 0x36,
-    PIXMAN_OP_HARD_LIGHT                = 0x37,
-    PIXMAN_OP_SOFT_LIGHT                = 0x38,
-    PIXMAN_OP_DIFFERENCE                = 0x39,
-    PIXMAN_OP_EXCLUSION                 = 0x3a,
-    PIXMAN_OP_HSL_HUE			= 0x3b,
-    PIXMAN_OP_HSL_SATURATION		= 0x3c,
-    PIXMAN_OP_HSL_COLOR			= 0x3d,
-    PIXMAN_OP_HSL_LUMINOSITY		= 0x3e
-
-#ifdef PIXMAN_USE_INTERNAL_API
-    ,
-    PIXMAN_N_OPERATORS,
-    PIXMAN_OP_NONE = PIXMAN_N_OPERATORS
-#endif
-} pixman_op_t;
-
-/*
- * Regions
- */
-typedef struct pixman_region16_data	pixman_region16_data_t;
-typedef struct pixman_box16		pixman_box16_t;
-typedef struct pixman_rectangle16	pixman_rectangle16_t;
-typedef struct pixman_region16		pixman_region16_t;
-
-struct pixman_region16_data {
-    long		size;
-    long		numRects;
-/*  pixman_box16_t	rects[size];   in memory but not explicitly declared */
-};
-
-struct pixman_rectangle16
-{
-    int16_t	x, y;
-    uint16_t	width, height;
-};
-
-struct pixman_box16
-{
-    int16_t x1, y1, x2, y2;
-};
-
-struct pixman_region16
-{
-    pixman_box16_t          extents;
-    pixman_region16_data_t *data;
-};
-
-typedef enum
-{
-    PIXMAN_REGION_OUT,
-    PIXMAN_REGION_IN,
-    PIXMAN_REGION_PART
-} pixman_region_overlap_t;
-
-/* This function exists only to make it possible to preserve
- * the X ABI - it should go away at first opportunity.
- */
-void pixman_region_set_static_pointers (pixman_box16_t         *empty_box,
-					pixman_region16_data_t *empty_data,
-					pixman_region16_data_t *broken_data);
-
-/* creation/destruction */
-void                    pixman_region_init               (pixman_region16_t *region);
-void                    pixman_region_init_rect          (pixman_region16_t *region,
-							  int                x,
-							  int                y,
-							  unsigned int       width,
-							  unsigned int       height);
-pixman_bool_t           pixman_region_init_rects         (pixman_region16_t *region,
-							  const pixman_box16_t *boxes,
-							  int                count);
-void                    pixman_region_init_with_extents  (pixman_region16_t *region,
-							  pixman_box16_t    *extents);
-void                    pixman_region_init_from_image    (pixman_region16_t *region,
-							  pixman_image_t    *image);
-void                    pixman_region_fini               (pixman_region16_t *region);
-
-
-/* manipulation */
-void                    pixman_region_translate          (pixman_region16_t *region,
-							  int                x,
-							  int                y);
-pixman_bool_t           pixman_region_copy               (pixman_region16_t *dest,
-							  pixman_region16_t *source);
-pixman_bool_t           pixman_region_intersect          (pixman_region16_t *new_reg,
-							  pixman_region16_t *reg1,
-							  pixman_region16_t *reg2);
-pixman_bool_t           pixman_region_union              (pixman_region16_t *new_reg,
-							  pixman_region16_t *reg1,
-							  pixman_region16_t *reg2);
-pixman_bool_t           pixman_region_union_rect         (pixman_region16_t *dest,
-							  pixman_region16_t *source,
-							  int                x,
-							  int                y,
-							  unsigned int       width,
-							  unsigned int       height);
-pixman_bool_t		pixman_region_intersect_rect     (pixman_region16_t *dest,
-							  pixman_region16_t *source,
-							  int                x,
-							  int                y,
-							  unsigned int       width,
-							  unsigned int       height);
-pixman_bool_t           pixman_region_subtract           (pixman_region16_t *reg_d,
-							  pixman_region16_t *reg_m,
-							  pixman_region16_t *reg_s);
-pixman_bool_t           pixman_region_inverse            (pixman_region16_t *new_reg,
-							  pixman_region16_t *reg1,
-							  pixman_box16_t    *inv_rect);
-pixman_bool_t           pixman_region_contains_point     (pixman_region16_t *region,
-							  int                x,
-							  int                y,
-							  pixman_box16_t    *box);
-pixman_region_overlap_t pixman_region_contains_rectangle (pixman_region16_t *region,
-							  pixman_box16_t    *prect);
-pixman_bool_t           pixman_region_not_empty          (pixman_region16_t *region);
-pixman_box16_t *        pixman_region_extents            (pixman_region16_t *region);
-int                     pixman_region_n_rects            (pixman_region16_t *region);
-pixman_box16_t *        pixman_region_rectangles         (pixman_region16_t *region,
-							  int               *n_rects);
-pixman_bool_t           pixman_region_equal              (pixman_region16_t *region1,
-							  pixman_region16_t *region2);
-pixman_bool_t           pixman_region_selfcheck          (pixman_region16_t *region);
-void                    pixman_region_reset              (pixman_region16_t *region,
-							  pixman_box16_t    *box);
-/*
- * 32 bit regions
- */
-typedef struct pixman_region32_data	pixman_region32_data_t;
-typedef struct pixman_box32		pixman_box32_t;
-typedef struct pixman_rectangle32	pixman_rectangle32_t;
-typedef struct pixman_region32		pixman_region32_t;
-
-struct pixman_region32_data {
-    long		size;
-    long		numRects;
-/*  pixman_box32_t	rects[size];   in memory but not explicitly declared */
-};
-
-struct pixman_rectangle32
-{
-    int32_t x, y;
-    uint32_t width, height;
-};
-
-struct pixman_box32
-{
-    int32_t x1, y1, x2, y2;
-};
-
-struct pixman_region32
-{
-    pixman_box32_t          extents;
-    pixman_region32_data_t  *data;
-};
-
-/* creation/destruction */
-void                    pixman_region32_init               (pixman_region32_t *region);
-void                    pixman_region32_init_rect          (pixman_region32_t *region,
-							    int                x,
-							    int                y,
-							    unsigned int       width,
-							    unsigned int       height);
-pixman_bool_t           pixman_region32_init_rects         (pixman_region32_t *region,
-							    const pixman_box32_t *boxes,
-							    int                count);
-void                    pixman_region32_init_with_extents  (pixman_region32_t *region,
-							    pixman_box32_t    *extents);
-void                    pixman_region32_init_from_image    (pixman_region32_t *region,
-							    pixman_image_t    *image);
-void                    pixman_region32_fini               (pixman_region32_t *region);
-
-
-/* manipulation */
-void                    pixman_region32_translate          (pixman_region32_t *region,
-							    int                x,
-							    int                y);
-pixman_bool_t           pixman_region32_copy               (pixman_region32_t *dest,
-							    pixman_region32_t *source);
-pixman_bool_t           pixman_region32_intersect          (pixman_region32_t *new_reg,
-							    pixman_region32_t *reg1,
-							    pixman_region32_t *reg2);
-pixman_bool_t           pixman_region32_union              (pixman_region32_t *new_reg,
-							    pixman_region32_t *reg1,
-							    pixman_region32_t *reg2);
-pixman_bool_t		pixman_region32_intersect_rect     (pixman_region32_t *dest,
-							    pixman_region32_t *source,
-							    int                x,
-							    int                y,
-							    unsigned int       width,
-							    unsigned int       height);
-pixman_bool_t           pixman_region32_union_rect         (pixman_region32_t *dest,
-							    pixman_region32_t *source,
-							    int                x,
-							    int                y,
-							    unsigned int       width,
-							    unsigned int       height);
-pixman_bool_t           pixman_region32_subtract           (pixman_region32_t *reg_d,
-							    pixman_region32_t *reg_m,
-							    pixman_region32_t *reg_s);
-pixman_bool_t           pixman_region32_inverse            (pixman_region32_t *new_reg,
-							    pixman_region32_t *reg1,
-							    pixman_box32_t    *inv_rect);
-pixman_bool_t           pixman_region32_contains_point     (pixman_region32_t *region,
-							    int                x,
-							    int                y,
-							    pixman_box32_t    *box);
-pixman_region_overlap_t pixman_region32_contains_rectangle (pixman_region32_t *region,
-							    pixman_box32_t    *prect);
-pixman_bool_t           pixman_region32_not_empty          (pixman_region32_t *region);
-pixman_box32_t *        pixman_region32_extents            (pixman_region32_t *region);
-int                     pixman_region32_n_rects            (pixman_region32_t *region);
-pixman_box32_t *        pixman_region32_rectangles         (pixman_region32_t *region,
-							    int               *n_rects);
-pixman_bool_t           pixman_region32_equal              (pixman_region32_t *region1,
-							    pixman_region32_t *region2);
-pixman_bool_t           pixman_region32_selfcheck          (pixman_region32_t *region);
-void                    pixman_region32_reset              (pixman_region32_t *region,
-							    pixman_box32_t    *box);
-
-
-/* Copy / Fill / Misc */
-pixman_bool_t pixman_blt                (uint32_t           *src_bits,
-					 uint32_t           *dst_bits,
-					 int                 src_stride,
-					 int                 dst_stride,
-					 int                 src_bpp,
-					 int                 dst_bpp,
-					 int                 src_x,
-					 int                 src_y,
-					 int                 dest_x,
-					 int                 dest_y,
-					 int                 width,
-					 int                 height);
-pixman_bool_t pixman_fill               (uint32_t           *bits,
-					 int                 stride,
-					 int                 bpp,
-					 int                 x,
-					 int                 y,
-					 int                 width,
-					 int                 height,
-					 uint32_t            _xor);
-
-int           pixman_version            (void);
-const char*   pixman_version_string     (void);
-
-/*
- * Images
- */
-typedef struct pixman_indexed		pixman_indexed_t;
-typedef struct pixman_gradient_stop	pixman_gradient_stop_t;
-
-typedef uint32_t (* pixman_read_memory_func_t) (const void *src, int size);
-typedef void     (* pixman_write_memory_func_t) (void *dst, uint32_t value, int size);
-
-typedef void     (* pixman_image_destroy_func_t) (pixman_image_t *image, void *data);
-
-struct pixman_gradient_stop {
-    pixman_fixed_t x;
-    pixman_color_t color;
-};
-
-#define PIXMAN_MAX_INDEXED  256 /* XXX depth must be <= 8 */
-
-#if PIXMAN_MAX_INDEXED <= 256
-typedef uint8_t pixman_index_type;
-#endif
-
-struct pixman_indexed
-{
-    pixman_bool_t       color;
-    uint32_t		rgba[PIXMAN_MAX_INDEXED];
-    pixman_index_type	ent[32768];
-};
-
-/*
- * While the protocol is generous in format support, the
- * sample implementation allows only packed RGB and GBR
- * representations for data to simplify software rendering,
- */
-#define PIXMAN_FORMAT(bpp,type,a,r,g,b)	(((bpp) << 24) |  \
-					 ((type) << 16) | \
-					 ((a) << 12) |	  \
-					 ((r) << 8) |	  \
-					 ((g) << 4) |	  \
-					 ((b)))
-
-#define PIXMAN_FORMAT_BPP(f)	(((f) >> 24)       )
-#define PIXMAN_FORMAT_TYPE(f)	(((f) >> 16) & 0xff)
-#define PIXMAN_FORMAT_A(f)	(((f) >> 12) & 0x0f)
-#define PIXMAN_FORMAT_R(f)	(((f) >>  8) & 0x0f)
-#define PIXMAN_FORMAT_G(f)	(((f) >>  4) & 0x0f)
-#define PIXMAN_FORMAT_B(f)	(((f)      ) & 0x0f)
-#define PIXMAN_FORMAT_RGB(f)	(((f)      ) & 0xfff)
-#define PIXMAN_FORMAT_VIS(f)	(((f)      ) & 0xffff)
-#define PIXMAN_FORMAT_DEPTH(f)	(PIXMAN_FORMAT_A(f) +	\
-				 PIXMAN_FORMAT_R(f) +	\
-				 PIXMAN_FORMAT_G(f) +	\
-				 PIXMAN_FORMAT_B(f))
-
-#define PIXMAN_TYPE_OTHER	0
-#define PIXMAN_TYPE_A		1
-#define PIXMAN_TYPE_ARGB	2
-#define PIXMAN_TYPE_ABGR	3
-#define PIXMAN_TYPE_COLOR	4
-#define PIXMAN_TYPE_GRAY	5
-#define PIXMAN_TYPE_YUY2	6
-#define PIXMAN_TYPE_YV12	7
-#define PIXMAN_TYPE_BGRA	8
-#define PIXMAN_TYPE_RGBA	9
-
-#define PIXMAN_FORMAT_COLOR(f)				\
-	(PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_ARGB ||	\
-	 PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_ABGR ||	\
-	 PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_BGRA ||	\
-	 PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_RGBA)
-
-/* 32bpp formats */
-typedef enum {
-    PIXMAN_a8r8g8b8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,8,8,8,8),
-    PIXMAN_x8r8g8b8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,8,8,8),
-    PIXMAN_a8b8g8r8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,8,8,8,8),
-    PIXMAN_x8b8g8r8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,0,8,8,8),
-    PIXMAN_b8g8r8a8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_BGRA,8,8,8,8),
-    PIXMAN_b8g8r8x8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_BGRA,0,8,8,8),
-    PIXMAN_r8g8b8a8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_RGBA,8,8,8,8),
-    PIXMAN_r8g8b8x8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_RGBA,0,8,8,8),
-    PIXMAN_x14r6g6b6 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,6,6,6),
-    PIXMAN_x2r10g10b10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,10,10,10),
-    PIXMAN_a2r10g10b10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,2,10,10,10),
-    PIXMAN_x2b10g10r10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,0,10,10,10),
-    PIXMAN_a2b10g10r10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,2,10,10,10),
-
-/* 24bpp formats */
-    PIXMAN_r8g8b8 =	 PIXMAN_FORMAT(24,PIXMAN_TYPE_ARGB,0,8,8,8),
-    PIXMAN_b8g8r8 =	 PIXMAN_FORMAT(24,PIXMAN_TYPE_ABGR,0,8,8,8),
-
-/* 16bpp formats */
-    PIXMAN_r5g6b5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,5,6,5),
-    PIXMAN_b5g6r5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,5,6,5),
-
-    PIXMAN_a1r5g5b5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,1,5,5,5),
-    PIXMAN_x1r5g5b5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,5,5,5),
-    PIXMAN_a1b5g5r5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,1,5,5,5),
-    PIXMAN_x1b5g5r5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,5,5,5),
-    PIXMAN_a4r4g4b4 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,4,4,4,4),
-    PIXMAN_x4r4g4b4 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,4,4,4),
-    PIXMAN_a4b4g4r4 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,4,4,4,4),
-    PIXMAN_x4b4g4r4 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,4,4,4),
-
-/* 8bpp formats */
-    PIXMAN_a8 =		 PIXMAN_FORMAT(8,PIXMAN_TYPE_A,8,0,0,0),
-    PIXMAN_r3g3b2 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_ARGB,0,3,3,2),
-    PIXMAN_b2g3r3 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_ABGR,0,3,3,2),
-    PIXMAN_a2r2g2b2 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_ARGB,2,2,2,2),
-    PIXMAN_a2b2g2r2 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_ABGR,2,2,2,2),
-
-    PIXMAN_c8 =		 PIXMAN_FORMAT(8,PIXMAN_TYPE_COLOR,0,0,0,0),
-    PIXMAN_g8 =		 PIXMAN_FORMAT(8,PIXMAN_TYPE_GRAY,0,0,0,0),
-
-    PIXMAN_x4a4 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_A,4,0,0,0),
-
-    PIXMAN_x4c4 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_COLOR,0,0,0,0),
-    PIXMAN_x4g4 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_GRAY,0,0,0,0),
-
-/* 4bpp formats */
-    PIXMAN_a4 =		 PIXMAN_FORMAT(4,PIXMAN_TYPE_A,4,0,0,0),
-    PIXMAN_r1g2b1 =	 PIXMAN_FORMAT(4,PIXMAN_TYPE_ARGB,0,1,2,1),
-    PIXMAN_b1g2r1 =	 PIXMAN_FORMAT(4,PIXMAN_TYPE_ABGR,0,1,2,1),
-    PIXMAN_a1r1g1b1 =	 PIXMAN_FORMAT(4,PIXMAN_TYPE_ARGB,1,1,1,1),
-    PIXMAN_a1b1g1r1 =	 PIXMAN_FORMAT(4,PIXMAN_TYPE_ABGR,1,1,1,1),
-
-    PIXMAN_c4 =		 PIXMAN_FORMAT(4,PIXMAN_TYPE_COLOR,0,0,0,0),
-    PIXMAN_g4 =		 PIXMAN_FORMAT(4,PIXMAN_TYPE_GRAY,0,0,0,0),
-
-/* 1bpp formats */
-    PIXMAN_a1 =		 PIXMAN_FORMAT(1,PIXMAN_TYPE_A,1,0,0,0),
-
-    PIXMAN_g1 =		 PIXMAN_FORMAT(1,PIXMAN_TYPE_GRAY,0,0,0,0),
-
-/* YUV formats */
-    PIXMAN_yuy2 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_YUY2,0,0,0,0),
-    PIXMAN_yv12 =	 PIXMAN_FORMAT(12,PIXMAN_TYPE_YV12,0,0,0,0)
-} pixman_format_code_t;
-
-/* Querying supported format values. */
-pixman_bool_t pixman_format_supported_destination (pixman_format_code_t format);
-pixman_bool_t pixman_format_supported_source      (pixman_format_code_t format);
-
-/* Constructors */
-pixman_image_t *pixman_image_create_solid_fill       (pixman_color_t               *color);
-pixman_image_t *pixman_image_create_linear_gradient  (pixman_point_fixed_t         *p1,
-						      pixman_point_fixed_t         *p2,
-						      const pixman_gradient_stop_t *stops,
-						      int                           n_stops);
-pixman_image_t *pixman_image_create_radial_gradient  (pixman_point_fixed_t         *inner,
-						      pixman_point_fixed_t         *outer,
-						      pixman_fixed_t                inner_radius,
-						      pixman_fixed_t                outer_radius,
-						      const pixman_gradient_stop_t *stops,
-						      int                           n_stops);
-pixman_image_t *pixman_image_create_conical_gradient (pixman_point_fixed_t         *center,
-						      pixman_fixed_t                angle,
-						      const pixman_gradient_stop_t *stops,
-						      int                           n_stops);
-pixman_image_t *pixman_image_create_bits             (pixman_format_code_t          format,
-						      int                           width,
-						      int                           height,
-						      uint32_t                     *bits,
-						      int                           rowstride_bytes);
-
-/* Destructor */
-pixman_image_t *pixman_image_ref                     (pixman_image_t               *image);
-pixman_bool_t   pixman_image_unref                   (pixman_image_t               *image);
-
-void		pixman_image_set_destroy_function    (pixman_image_t		   *image,
-						      pixman_image_destroy_func_t   function,
-						      void			   *data);
-void *		pixman_image_get_destroy_data        (pixman_image_t		   *image);
-
-/* Set properties */
-pixman_bool_t   pixman_image_set_clip_region         (pixman_image_t               *image,
-						      pixman_region16_t            *region);
-pixman_bool_t   pixman_image_set_clip_region32       (pixman_image_t               *image,
-						      pixman_region32_t            *region);
-void		pixman_image_set_has_client_clip     (pixman_image_t               *image,
-						      pixman_bool_t		    clien_clip);
-pixman_bool_t   pixman_image_set_transform           (pixman_image_t               *image,
-						      const pixman_transform_t     *transform);
-void            pixman_image_set_repeat              (pixman_image_t               *image,
-						      pixman_repeat_t               repeat);
-pixman_bool_t   pixman_image_set_filter              (pixman_image_t               *image,
-						      pixman_filter_t               filter,
-						      const pixman_fixed_t         *filter_params,
-						      int                           n_filter_params);
-void		pixman_image_set_source_clipping     (pixman_image_t		   *image,
-						      pixman_bool_t                 source_clipping);
-void            pixman_image_set_alpha_map           (pixman_image_t               *image,
-						      pixman_image_t               *alpha_map,
-						      int16_t                       x,
-						      int16_t                       y);
-void            pixman_image_set_component_alpha     (pixman_image_t               *image,
-						      pixman_bool_t                 component_alpha);
-pixman_bool_t   pixman_image_get_component_alpha     (pixman_image_t               *image);
-void		pixman_image_set_accessors	     (pixman_image_t		   *image,
-						      pixman_read_memory_func_t	    read_func,
-						      pixman_write_memory_func_t    write_func);
-void		pixman_image_set_indexed	     (pixman_image_t		   *image,
-						      const pixman_indexed_t	   *indexed);
-uint32_t       *pixman_image_get_data                (pixman_image_t               *image);
-int		pixman_image_get_width               (pixman_image_t               *image);
-int             pixman_image_get_height              (pixman_image_t               *image);
-int		pixman_image_get_stride              (pixman_image_t               *image); /* in bytes */
-int		pixman_image_get_depth               (pixman_image_t		   *image);
-pixman_format_code_t pixman_image_get_format	     (pixman_image_t		   *image);
-pixman_bool_t	pixman_image_fill_rectangles	     (pixman_op_t		    op,
-						      pixman_image_t		   *image,
-						      pixman_color_t		   *color,
-						      int			    n_rects,
-						      const pixman_rectangle16_t   *rects);
-pixman_bool_t   pixman_image_fill_boxes              (pixman_op_t                   op,
-                                                      pixman_image_t               *dest,
-                                                      pixman_color_t               *color,
-                                                      int                           n_boxes,
-                                                      const pixman_box32_t         *boxes);
-
-/* Composite */
-pixman_bool_t pixman_compute_composite_region (pixman_region16_t *region,
-					       pixman_image_t    *src_image,
-					       pixman_image_t    *mask_image,
-					       pixman_image_t    *dest_image,
-					       int16_t            src_x,
-					       int16_t            src_y,
-					       int16_t            mask_x,
-					       int16_t            mask_y,
-					       int16_t            dest_x,
-					       int16_t            dest_y,
-					       uint16_t           width,
-					       uint16_t           height);
-void          pixman_image_composite          (pixman_op_t        op,
-					       pixman_image_t    *src,
-					       pixman_image_t    *mask,
-					       pixman_image_t    *dest,
-					       int16_t            src_x,
-					       int16_t            src_y,
-					       int16_t            mask_x,
-					       int16_t            mask_y,
-					       int16_t            dest_x,
-					       int16_t            dest_y,
-					       uint16_t           width,
-					       uint16_t           height);
-void          pixman_image_composite32        (pixman_op_t        op,
-					       pixman_image_t    *src,
-					       pixman_image_t    *mask,
-					       pixman_image_t    *dest,
-					       int32_t            src_x,
-					       int32_t            src_y,
-					       int32_t            mask_x,
-					       int32_t            mask_y,
-					       int32_t            dest_x,
-					       int32_t            dest_y,
-					       int32_t            width,
-					       int32_t            height);
-
-/* Executive Summary: This function is a no-op that only exists
- * for historical reasons.
- *
- * There used to be a bug in the X server where it would rely on
- * out-of-bounds accesses when it was asked to composite with a
- * window as the source. It would create a pixman image pointing
- * to some bogus position in memory, but then set a clip region
- * to the position where the actual bits were.
- *
- * Due to a bug in old versions of pixman, where it would not clip
- * against the image bounds when a clip region was set, this would
- * actually work. So when the pixman bug was fixed, a workaround was
- * added to allow certain out-of-bound accesses. This function disabled
- * those workarounds.
- *
- * Since 0.21.2, pixman doesn't do these workarounds anymore, so now this
- * function is a no-op.
- */
-void pixman_disable_out_of_bounds_workaround (void);
-
-/*
- * Trapezoids
- */
-typedef struct pixman_edge pixman_edge_t;
-typedef struct pixman_trapezoid pixman_trapezoid_t;
-typedef struct pixman_trap pixman_trap_t;
-typedef struct pixman_span_fix pixman_span_fix_t;
-typedef struct pixman_triangle pixman_triangle_t;
-
-/*
- * An edge structure.  This represents a single polygon edge
- * and can be quickly stepped across small or large gaps in the
- * sample grid
- */
-struct pixman_edge
-{
-    pixman_fixed_t	x;
-    pixman_fixed_t	e;
-    pixman_fixed_t	stepx;
-    pixman_fixed_t	signdx;
-    pixman_fixed_t	dy;
-    pixman_fixed_t	dx;
-
-    pixman_fixed_t	stepx_small;
-    pixman_fixed_t	stepx_big;
-    pixman_fixed_t	dx_small;
-    pixman_fixed_t	dx_big;
-};
-
-struct pixman_trapezoid
-{
-    pixman_fixed_t	top, bottom;
-    pixman_line_fixed_t	left, right;
-};
-
-struct pixman_triangle
-{
-    pixman_point_fixed_t p1, p2, p3;
-};
-
-/* whether 't' is a well defined not obviously empty trapezoid */
-#define pixman_trapezoid_valid(t)				   \
-    ((t)->left.p1.y != (t)->left.p2.y &&			   \
-     (t)->right.p1.y != (t)->right.p2.y &&			   \
-     (int) ((t)->bottom - (t)->top) > 0)
-
-struct pixman_span_fix
-{
-    pixman_fixed_t	l, r, y;
-};
-
-struct pixman_trap
-{
-    pixman_span_fix_t	top, bot;
-};
-
-pixman_fixed_t pixman_sample_ceil_y        (pixman_fixed_t             y,
-					    int                        bpp);
-pixman_fixed_t pixman_sample_floor_y       (pixman_fixed_t             y,
-					    int                        bpp);
-void           pixman_edge_step            (pixman_edge_t             *e,
-					    int                        n);
-void           pixman_edge_init            (pixman_edge_t             *e,
-					    int                        bpp,
-					    pixman_fixed_t             y_start,
-					    pixman_fixed_t             x_top,
-					    pixman_fixed_t             y_top,
-					    pixman_fixed_t             x_bot,
-					    pixman_fixed_t             y_bot);
-void           pixman_line_fixed_edge_init (pixman_edge_t             *e,
-					    int                        bpp,
-					    pixman_fixed_t             y,
-					    const pixman_line_fixed_t *line,
-					    int                        x_off,
-					    int                        y_off);
-void           pixman_rasterize_edges      (pixman_image_t            *image,
-					    pixman_edge_t             *l,
-					    pixman_edge_t             *r,
-					    pixman_fixed_t             t,
-					    pixman_fixed_t             b);
-void           pixman_add_traps            (pixman_image_t            *image,
-					    int16_t                    x_off,
-					    int16_t                    y_off,
-					    int                        ntrap,
-					    pixman_trap_t             *traps);
-void           pixman_add_trapezoids       (pixman_image_t            *image,
-					    int16_t                    x_off,
-					    int                        y_off,
-					    int                        ntraps,
-					    const pixman_trapezoid_t  *traps);
-void           pixman_rasterize_trapezoid  (pixman_image_t            *image,
-					    const pixman_trapezoid_t  *trap,
-					    int                        x_off,
-					    int                        y_off);
-void          pixman_composite_trapezoids (pixman_op_t		       op,
-					   pixman_image_t *	       src,
-					   pixman_image_t *	       dst,
-					   pixman_format_code_t	       mask_format,
-					   int			       x_src,
-					   int			       y_src,
-					   int			       x_dst,
-					   int			       y_dst,
-					   int			       n_traps,
-					   const pixman_trapezoid_t *  traps);
-void          pixman_composite_triangles (pixman_op_t		       op,
-					  pixman_image_t *	       src,
-					  pixman_image_t *	       dst,
-					  pixman_format_code_t	       mask_format,
-					  int			       x_src,
-					  int			       y_src,
-					  int			       x_dst,
-					  int			       y_dst,
-					  int			       n_tris,
-					  const pixman_triangle_t *    tris);
-void	      pixman_add_triangles       (pixman_image_t              *image,
-					  int32_t	               x_off,
-					  int32_t	               y_off,
-					  int	                       n_tris,
-					  const pixman_triangle_t     *tris);
-
-PIXMAN_END_DECLS
-
-#endif /* PIXMAN_H__ */
+/***********************************************************
+
+Copyright 1987, 1998  The Open Group
+
+Permission to use, copy, modify, distribute, and sell this software and its
+documentation for any purpose is hereby granted without fee, provided that
+the above copyright notice appear in all copies and that both that
+copyright notice and this permission notice appear in supporting
+documentation.
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+Except as contained in this notice, the name of The Open Group shall not be
+used in advertising or otherwise to promote the sale, use or other dealings
+in this Software without prior written authorization from The Open Group.
+
+Copyright 1987 by Digital Equipment Corporation, Maynard, Massachusetts.
+
+                        All Rights Reserved
+
+Permission to use, copy, modify, and distribute this software and its
+documentation for any purpose and without fee is hereby granted,
+provided that the above copyright notice appear in all copies and that
+both that copyright notice and this permission notice appear in
+supporting documentation, and that the name of Digital not be
+used in advertising or publicity pertaining to distribution of the
+software without specific, written prior permission.
+
+DIGITAL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
+ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL
+DIGITAL BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR
+ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+SOFTWARE.
+
+******************************************************************/
+/*
+ * Copyright © 1998, 2004 Keith Packard
+ * Copyright   2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef PIXMAN_H__
+#define PIXMAN_H__
+
+#include <pixman-version.h>
+
+#ifdef  __cplusplus
+#define PIXMAN_BEGIN_DECLS extern "C" {
+#define PIXMAN_END_DECLS }
+#else
+#define PIXMAN_BEGIN_DECLS
+#define PIXMAN_END_DECLS
+#endif
+
+PIXMAN_BEGIN_DECLS
+
+/*
+ * Standard integers
+ */
+
+#if !defined (PIXMAN_DONT_DEFINE_STDINT)
+
+#if defined (_SVR4) || defined (SVR4) || defined (__OpenBSD__) || defined (_sgi) || defined (__sun) || defined (sun) || defined (__digital__) || defined (__HP_cc)
+#  include <inttypes.h>
+/* VS 2010 (_MSC_VER 1600) has stdint.h */
+#elif defined (_MSC_VER) && _MSC_VER < 1600
+typedef __int8 int8_t;
+typedef unsigned __int8 uint8_t;
+typedef __int16 int16_t;
+typedef unsigned __int16 uint16_t;
+typedef __int32 int32_t;
+typedef unsigned __int32 uint32_t;
+typedef __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+#elif defined (_AIX)
+#  include <sys/inttypes.h>
+#else
+#  include <stdint.h>
+#endif
+
+#endif
+
+/*
+ * Boolean
+ */
+typedef int pixman_bool_t;
+
+/*
+ * Fixpoint numbers. pixman_fixed_t is signed 16.16: 1.0 == 1 << 16, pixman_fixed_e == 1.
+ */
+typedef int64_t			pixman_fixed_32_32_t;
+typedef pixman_fixed_32_32_t	pixman_fixed_48_16_t;
+typedef uint32_t		pixman_fixed_1_31_t;
+typedef uint32_t		pixman_fixed_1_16_t;
+typedef int32_t			pixman_fixed_16_16_t;
+typedef pixman_fixed_16_16_t	pixman_fixed_t;
+
+#define pixman_fixed_e			((pixman_fixed_t) 1)
+#define pixman_fixed_1			(pixman_int_to_fixed(1))
+#define pixman_fixed_1_minus_e		(pixman_fixed_1 - pixman_fixed_e)
+#define pixman_fixed_minus_1		(pixman_int_to_fixed(-1))
+#define pixman_fixed_to_int(f)		((int) ((f) >> 16))
+#define pixman_int_to_fixed(i)		((pixman_fixed_t) ((uint32_t) (i) << 16)) /* shift as unsigned: i may be negative */
+#define pixman_fixed_to_double(f)	((double) ((f) / (double) pixman_fixed_1))
+#define pixman_double_to_fixed(d)	((pixman_fixed_t) ((d) * 65536.0))
+#define pixman_fixed_frac(f)		((f) & pixman_fixed_1_minus_e)
+#define pixman_fixed_floor(f)		((f) & ~pixman_fixed_1_minus_e)
+#define pixman_fixed_ceil(f)		pixman_fixed_floor ((f) + pixman_fixed_1_minus_e)
+#define pixman_fixed_fraction(f)	((f) & pixman_fixed_1_minus_e)
+#define pixman_fixed_mod_2(f)		((f) & (pixman_fixed_1 | pixman_fixed_1_minus_e)) /* keeps the low 17 bits */
+#define pixman_max_fixed_48_16		((pixman_fixed_48_16_t) 0x7fffffff)
+#define pixman_min_fixed_48_16		(-((pixman_fixed_48_16_t) 1 << 31))
+
+/*
+ * Misc structs
+ */
+typedef struct pixman_color pixman_color_t;
+typedef struct pixman_point_fixed pixman_point_fixed_t;
+typedef struct pixman_line_fixed pixman_line_fixed_t;
+typedef struct pixman_vector pixman_vector_t;
+typedef struct pixman_transform pixman_transform_t;
+
+struct pixman_color
+{
+    uint16_t	red;
+    uint16_t    green;
+    uint16_t    blue;
+    uint16_t    alpha;
+};
+
+struct pixman_point_fixed
+{
+    pixman_fixed_t	x;
+    pixman_fixed_t	y;
+};
+
+struct pixman_line_fixed
+{
+    pixman_point_fixed_t	p1, p2;
+};
+
+/*
+ * Fixed point matrices
+ */
+
+struct pixman_vector
+{
+    pixman_fixed_t	vector[3];
+};
+
+struct pixman_transform
+{
+    pixman_fixed_t	matrix[3][3];
+};
+
+/* forward declaration (sorry) */
+struct pixman_box16;
+typedef  union pixman_image		pixman_image_t;
+
+void          pixman_transform_init_identity    (struct pixman_transform       *matrix);
+pixman_bool_t pixman_transform_point_3d         (const struct pixman_transform *transform,
+						 struct pixman_vector          *vector);
+pixman_bool_t pixman_transform_point            (const struct pixman_transform *transform,
+						 struct pixman_vector          *vector);
+pixman_bool_t pixman_transform_multiply         (struct pixman_transform       *dst,
+						 const struct pixman_transform *l,
+						 const struct pixman_transform *r);
+void          pixman_transform_init_scale       (struct pixman_transform       *t,
+						 pixman_fixed_t                 sx,
+						 pixman_fixed_t                 sy);
+pixman_bool_t pixman_transform_scale            (struct pixman_transform       *forward,
+						 struct pixman_transform       *reverse,
+						 pixman_fixed_t                 sx,
+						 pixman_fixed_t                 sy);
+void          pixman_transform_init_rotate      (struct pixman_transform       *t,
+						 pixman_fixed_t                 c,  /* cosine; not named "cos" to avoid shadowing <math.h> */
+						 pixman_fixed_t                 s); /* sine; matches pixman_transform_rotate() */
+pixman_bool_t pixman_transform_rotate           (struct pixman_transform       *forward,
+						 struct pixman_transform       *reverse,
+						 pixman_fixed_t                 c,
+						 pixman_fixed_t                 s);
+void          pixman_transform_init_translate   (struct pixman_transform       *t,
+						 pixman_fixed_t                 tx,
+						 pixman_fixed_t                 ty);
+pixman_bool_t pixman_transform_translate        (struct pixman_transform       *forward,
+						 struct pixman_transform       *reverse,
+						 pixman_fixed_t                 tx,
+						 pixman_fixed_t                 ty);
+pixman_bool_t pixman_transform_bounds           (const struct pixman_transform *matrix,
+						 struct pixman_box16           *b);
+pixman_bool_t pixman_transform_invert           (struct pixman_transform       *dst,
+						 const struct pixman_transform *src);
+pixman_bool_t pixman_transform_is_identity      (const struct pixman_transform *t);
+pixman_bool_t pixman_transform_is_scale         (const struct pixman_transform *t);
+pixman_bool_t pixman_transform_is_int_translate (const struct pixman_transform *t);
+pixman_bool_t pixman_transform_is_inverse       (const struct pixman_transform *a,
+						 const struct pixman_transform *b);
+
+/*
+ * Floating point matrices
+ */
+struct pixman_f_vector
+{
+    double  v[3];
+};
+
+struct pixman_f_transform
+{
+    double  m[3][3];
+};
+
+pixman_bool_t pixman_transform_from_pixman_f_transform (struct pixman_transform         *t,
+							const struct pixman_f_transform *ft);
+void          pixman_f_transform_from_pixman_transform (struct pixman_f_transform       *ft,
+							const struct pixman_transform   *t);
+pixman_bool_t pixman_f_transform_invert                (struct pixman_f_transform       *dst,
+							const struct pixman_f_transform *src);
+pixman_bool_t pixman_f_transform_point                 (const struct pixman_f_transform *t,
+							struct pixman_f_vector          *v);
+void          pixman_f_transform_point_3d              (const struct pixman_f_transform *t,
+							struct pixman_f_vector          *v);
+void          pixman_f_transform_multiply              (struct pixman_f_transform       *dst,
+							const struct pixman_f_transform *l,
+							const struct pixman_f_transform *r);
+void          pixman_f_transform_init_scale            (struct pixman_f_transform       *t,
+							double                           sx,
+							double                           sy);
+pixman_bool_t pixman_f_transform_scale                 (struct pixman_f_transform       *forward,
+							struct pixman_f_transform       *reverse,
+							double                           sx,
+							double                           sy);
+void          pixman_f_transform_init_rotate           (struct pixman_f_transform       *t,
+							double                           c,  /* cosine; not named "cos" to avoid shadowing <math.h> */
+							double                           s); /* sine; matches pixman_f_transform_rotate() */
+pixman_bool_t pixman_f_transform_rotate                (struct pixman_f_transform       *forward,
+							struct pixman_f_transform       *reverse,
+							double                           c,
+							double                           s);
+void          pixman_f_transform_init_translate        (struct pixman_f_transform       *t,
+							double                           tx,
+							double                           ty);
+pixman_bool_t pixman_f_transform_translate             (struct pixman_f_transform       *forward,
+							struct pixman_f_transform       *reverse,
+							double                           tx,
+							double                           ty);
+pixman_bool_t pixman_f_transform_bounds                (const struct pixman_f_transform *t,
+							struct pixman_box16             *b);
+void          pixman_f_transform_init_identity         (struct pixman_f_transform       *t);
+
+typedef enum
+{
+    PIXMAN_REPEAT_NONE,
+    PIXMAN_REPEAT_NORMAL,
+    PIXMAN_REPEAT_PAD,
+    PIXMAN_REPEAT_REFLECT
+} pixman_repeat_t;
+
+typedef enum
+{
+    PIXMAN_FILTER_FAST,
+    PIXMAN_FILTER_GOOD,
+    PIXMAN_FILTER_BEST,
+    PIXMAN_FILTER_NEAREST,
+    PIXMAN_FILTER_BILINEAR,
+    PIXMAN_FILTER_CONVOLUTION
+} pixman_filter_t;
+
+typedef enum
+{
+    PIXMAN_OP_CLEAR			= 0x00,
+    PIXMAN_OP_SRC			= 0x01,
+    PIXMAN_OP_DST			= 0x02,
+    PIXMAN_OP_OVER			= 0x03,
+    PIXMAN_OP_OVER_REVERSE		= 0x04,
+    PIXMAN_OP_IN			= 0x05,
+    PIXMAN_OP_IN_REVERSE		= 0x06,
+    PIXMAN_OP_OUT			= 0x07,
+    PIXMAN_OP_OUT_REVERSE		= 0x08,
+    PIXMAN_OP_ATOP			= 0x09,
+    PIXMAN_OP_ATOP_REVERSE		= 0x0a,
+    PIXMAN_OP_XOR			= 0x0b,
+    PIXMAN_OP_ADD			= 0x0c,
+    PIXMAN_OP_SATURATE			= 0x0d,
+
+    PIXMAN_OP_DISJOINT_CLEAR		= 0x10,
+    PIXMAN_OP_DISJOINT_SRC		= 0x11,
+    PIXMAN_OP_DISJOINT_DST		= 0x12,
+    PIXMAN_OP_DISJOINT_OVER		= 0x13,
+    PIXMAN_OP_DISJOINT_OVER_REVERSE	= 0x14,
+    PIXMAN_OP_DISJOINT_IN		= 0x15,
+    PIXMAN_OP_DISJOINT_IN_REVERSE	= 0x16,
+    PIXMAN_OP_DISJOINT_OUT		= 0x17,
+    PIXMAN_OP_DISJOINT_OUT_REVERSE	= 0x18,
+    PIXMAN_OP_DISJOINT_ATOP		= 0x19,
+    PIXMAN_OP_DISJOINT_ATOP_REVERSE	= 0x1a,
+    PIXMAN_OP_DISJOINT_XOR		= 0x1b,
+
+    PIXMAN_OP_CONJOINT_CLEAR		= 0x20,
+    PIXMAN_OP_CONJOINT_SRC		= 0x21,
+    PIXMAN_OP_CONJOINT_DST		= 0x22,
+    PIXMAN_OP_CONJOINT_OVER		= 0x23,
+    PIXMAN_OP_CONJOINT_OVER_REVERSE	= 0x24,
+    PIXMAN_OP_CONJOINT_IN		= 0x25,
+    PIXMAN_OP_CONJOINT_IN_REVERSE	= 0x26,
+    PIXMAN_OP_CONJOINT_OUT		= 0x27,
+    PIXMAN_OP_CONJOINT_OUT_REVERSE	= 0x28,
+    PIXMAN_OP_CONJOINT_ATOP		= 0x29,
+    PIXMAN_OP_CONJOINT_ATOP_REVERSE	= 0x2a,
+    PIXMAN_OP_CONJOINT_XOR		= 0x2b,
+
+    PIXMAN_OP_MULTIPLY                  = 0x30,
+    PIXMAN_OP_SCREEN                    = 0x31,
+    PIXMAN_OP_OVERLAY                   = 0x32,
+    PIXMAN_OP_DARKEN                    = 0x33,
+    PIXMAN_OP_LIGHTEN                   = 0x34,
+    PIXMAN_OP_COLOR_DODGE               = 0x35,
+    PIXMAN_OP_COLOR_BURN                = 0x36,
+    PIXMAN_OP_HARD_LIGHT                = 0x37,
+    PIXMAN_OP_SOFT_LIGHT                = 0x38,
+    PIXMAN_OP_DIFFERENCE                = 0x39,
+    PIXMAN_OP_EXCLUSION                 = 0x3a,
+    PIXMAN_OP_HSL_HUE			= 0x3b,
+    PIXMAN_OP_HSL_SATURATION		= 0x3c,
+    PIXMAN_OP_HSL_COLOR			= 0x3d,
+    PIXMAN_OP_HSL_LUMINOSITY		= 0x3e
+
+#ifdef PIXMAN_USE_INTERNAL_API
+    ,
+    PIXMAN_N_OPERATORS,
+    PIXMAN_OP_NONE = PIXMAN_N_OPERATORS
+#endif
+} pixman_op_t;
+
+/*
+ * Regions
+ */
+typedef struct pixman_region16_data	pixman_region16_data_t;
+typedef struct pixman_box16		pixman_box16_t;
+typedef struct pixman_rectangle16	pixman_rectangle16_t;
+typedef struct pixman_region16		pixman_region16_t;
+
+struct pixman_region16_data {
+    long		size;
+    long		numRects;
+/*  pixman_box16_t	rects[size];   in memory but not explicitly declared */
+};
+
+struct pixman_rectangle16
+{
+    int16_t	x, y;
+    uint16_t	width, height;
+};
+
+struct pixman_box16
+{
+    int16_t x1, y1, x2, y2;
+};
+
+struct pixman_region16
+{
+    pixman_box16_t          extents;
+    pixman_region16_data_t *data;
+};
+
+typedef enum
+{
+    PIXMAN_REGION_OUT,
+    PIXMAN_REGION_IN,
+    PIXMAN_REGION_PART
+} pixman_region_overlap_t;
+
+/* This function exists only to make it possible to preserve
+ * the X ABI - it should go away at first opportunity.
+ */
+void pixman_region_set_static_pointers (pixman_box16_t         *empty_box,
+					pixman_region16_data_t *empty_data,
+					pixman_region16_data_t *broken_data);
+
+/* creation/destruction */
+void                    pixman_region_init               (pixman_region16_t *region);
+void                    pixman_region_init_rect          (pixman_region16_t *region,
+							  int                x,
+							  int                y,
+							  unsigned int       width,
+							  unsigned int       height);
+pixman_bool_t           pixman_region_init_rects         (pixman_region16_t *region,
+							  const pixman_box16_t *boxes,
+							  int                count);
+void                    pixman_region_init_with_extents  (pixman_region16_t *region,
+							  pixman_box16_t    *extents);
+void                    pixman_region_init_from_image    (pixman_region16_t *region,
+							  pixman_image_t    *image);
+void                    pixman_region_fini               (pixman_region16_t *region);
+
+
+/* manipulation */
+void                    pixman_region_translate          (pixman_region16_t *region,
+							  int                x,
+							  int                y);
+pixman_bool_t           pixman_region_copy               (pixman_region16_t *dest,
+							  pixman_region16_t *source);
+pixman_bool_t           pixman_region_intersect          (pixman_region16_t *new_reg,
+							  pixman_region16_t *reg1,
+							  pixman_region16_t *reg2);
+pixman_bool_t           pixman_region_union              (pixman_region16_t *new_reg,
+							  pixman_region16_t *reg1,
+							  pixman_region16_t *reg2);
+pixman_bool_t           pixman_region_union_rect         (pixman_region16_t *dest,
+							  pixman_region16_t *source,
+							  int                x,
+							  int                y,
+							  unsigned int       width,
+							  unsigned int       height);
+pixman_bool_t		pixman_region_intersect_rect     (pixman_region16_t *dest,
+							  pixman_region16_t *source,
+							  int                x,
+							  int                y,
+							  unsigned int       width,
+							  unsigned int       height);
+pixman_bool_t           pixman_region_subtract           (pixman_region16_t *reg_d,
+							  pixman_region16_t *reg_m,
+							  pixman_region16_t *reg_s);
+pixman_bool_t           pixman_region_inverse            (pixman_region16_t *new_reg,
+							  pixman_region16_t *reg1,
+							  pixman_box16_t    *inv_rect);
+pixman_bool_t           pixman_region_contains_point     (pixman_region16_t *region,
+							  int                x,
+							  int                y,
+							  pixman_box16_t    *box);
+pixman_region_overlap_t pixman_region_contains_rectangle (pixman_region16_t *region,
+							  pixman_box16_t    *prect);
+pixman_bool_t           pixman_region_not_empty          (pixman_region16_t *region);
+pixman_box16_t *        pixman_region_extents            (pixman_region16_t *region);
+int                     pixman_region_n_rects            (pixman_region16_t *region);
+pixman_box16_t *        pixman_region_rectangles         (pixman_region16_t *region,
+							  int               *n_rects);
+pixman_bool_t           pixman_region_equal              (pixman_region16_t *region1,
+							  pixman_region16_t *region2);
+pixman_bool_t           pixman_region_selfcheck          (pixman_region16_t *region);
+void                    pixman_region_reset              (pixman_region16_t *region,
+							  pixman_box16_t    *box);
+/*
+ * 32 bit regions
+ */
+typedef struct pixman_region32_data	pixman_region32_data_t;
+typedef struct pixman_box32		pixman_box32_t;
+typedef struct pixman_rectangle32	pixman_rectangle32_t;
+typedef struct pixman_region32		pixman_region32_t;
+
+struct pixman_region32_data {
+    long		size;
+    long		numRects;
+/*  pixman_box32_t	rects[size];   in memory but not explicitly declared */
+};
+
+struct pixman_rectangle32
+{
+    int32_t x, y;
+    uint32_t width, height;
+};
+
+struct pixman_box32
+{
+    int32_t x1, y1, x2, y2;
+};
+
+struct pixman_region32
+{
+    pixman_box32_t          extents;
+    pixman_region32_data_t  *data;
+};
+
+/* creation/destruction */
+void                    pixman_region32_init               (pixman_region32_t *region);
+void                    pixman_region32_init_rect          (pixman_region32_t *region,
+							    int                x,
+							    int                y,
+							    unsigned int       width,
+							    unsigned int       height);
+pixman_bool_t           pixman_region32_init_rects         (pixman_region32_t *region,
+							    const pixman_box32_t *boxes,
+							    int                count);
+void                    pixman_region32_init_with_extents  (pixman_region32_t *region,
+							    pixman_box32_t    *extents);
+void                    pixman_region32_init_from_image    (pixman_region32_t *region,
+							    pixman_image_t    *image);
+void                    pixman_region32_fini               (pixman_region32_t *region);
+
+
+/* manipulation */
+void                    pixman_region32_translate          (pixman_region32_t *region,
+							    int                x,
+							    int                y);
+pixman_bool_t           pixman_region32_copy               (pixman_region32_t *dest,
+							    pixman_region32_t *source);
+pixman_bool_t           pixman_region32_intersect          (pixman_region32_t *new_reg,
+							    pixman_region32_t *reg1,
+							    pixman_region32_t *reg2);
+pixman_bool_t           pixman_region32_union              (pixman_region32_t *new_reg,
+							    pixman_region32_t *reg1,
+							    pixman_region32_t *reg2);
+pixman_bool_t		pixman_region32_intersect_rect     (pixman_region32_t *dest,
+							    pixman_region32_t *source,
+							    int                x,
+							    int                y,
+							    unsigned int       width,
+							    unsigned int       height);
+pixman_bool_t           pixman_region32_union_rect         (pixman_region32_t *dest,
+							    pixman_region32_t *source,
+							    int                x,
+							    int                y,
+							    unsigned int       width,
+							    unsigned int       height);
+pixman_bool_t           pixman_region32_subtract           (pixman_region32_t *reg_d,
+							    pixman_region32_t *reg_m,
+							    pixman_region32_t *reg_s);
+pixman_bool_t           pixman_region32_inverse            (pixman_region32_t *new_reg,
+							    pixman_region32_t *reg1,
+							    pixman_box32_t    *inv_rect);
+pixman_bool_t           pixman_region32_contains_point     (pixman_region32_t *region,
+							    int                x,
+							    int                y,
+							    pixman_box32_t    *box);
+pixman_region_overlap_t pixman_region32_contains_rectangle (pixman_region32_t *region,
+							    pixman_box32_t    *prect);
+pixman_bool_t           pixman_region32_not_empty          (pixman_region32_t *region);
+pixman_box32_t *        pixman_region32_extents            (pixman_region32_t *region);
+int                     pixman_region32_n_rects            (pixman_region32_t *region);
+pixman_box32_t *        pixman_region32_rectangles         (pixman_region32_t *region,
+							    int               *n_rects);
+pixman_bool_t           pixman_region32_equal              (pixman_region32_t *region1,
+							    pixman_region32_t *region2);
+pixman_bool_t           pixman_region32_selfcheck          (pixman_region32_t *region);
+void                    pixman_region32_reset              (pixman_region32_t *region,
+							    pixman_box32_t    *box);
+
+
+/* Copy / Fill / Misc */
+pixman_bool_t pixman_blt                (uint32_t           *src_bits,
+					 uint32_t           *dst_bits,
+					 int                 src_stride,
+					 int                 dst_stride,
+					 int                 src_bpp,
+					 int                 dst_bpp,
+					 int                 src_x,
+					 int                 src_y,
+					 int                 dest_x,
+					 int                 dest_y,
+					 int                 width,
+					 int                 height);
+pixman_bool_t pixman_fill               (uint32_t           *bits,
+					 int                 stride,
+					 int                 bpp,
+					 int                 x,
+					 int                 y,
+					 int                 width,
+					 int                 height,
+					 uint32_t            _xor);
+
+int           pixman_version            (void);
+const char*   pixman_version_string     (void);
+
+/*
+ * Images
+ */
+typedef struct pixman_indexed		pixman_indexed_t;
+typedef struct pixman_gradient_stop	pixman_gradient_stop_t;
+
+typedef uint32_t (* pixman_read_memory_func_t) (const void *src, int size);
+typedef void     (* pixman_write_memory_func_t) (void *dst, uint32_t value, int size);
+
+typedef void     (* pixman_image_destroy_func_t) (pixman_image_t *image, void *data);
+
+struct pixman_gradient_stop {
+    pixman_fixed_t x;
+    pixman_color_t color;
+};
+
+#define PIXMAN_MAX_INDEXED  256 /* XXX depth must be <= 8 */
+
+#if PIXMAN_MAX_INDEXED <= 256
+typedef uint8_t pixman_index_type;
+#endif
+
+struct pixman_indexed
+{
+    pixman_bool_t       color;
+    uint32_t		rgba[PIXMAN_MAX_INDEXED];
+    pixman_index_type	ent[32768];
+};
+
+/*
+ * While the protocol is generous in format support, the
+ * sample implementation allows only packed RGB and BGR
+ * representations for data to simplify software rendering.
+ */
+#define PIXMAN_FORMAT(bpp,type,a,r,g,b)	(((bpp) << 24) |  \
+					 ((type) << 16) | \
+					 ((a) << 12) |	  \
+					 ((r) << 8) |	  \
+					 ((g) << 4) |	  \
+					 ((b)))
+
+#define PIXMAN_FORMAT_BPP(f)	(((f) >> 24)       )
+#define PIXMAN_FORMAT_TYPE(f)	(((f) >> 16) & 0xff)
+#define PIXMAN_FORMAT_A(f)	(((f) >> 12) & 0x0f)
+#define PIXMAN_FORMAT_R(f)	(((f) >>  8) & 0x0f)
+#define PIXMAN_FORMAT_G(f)	(((f) >>  4) & 0x0f)
+#define PIXMAN_FORMAT_B(f)	(((f)      ) & 0x0f)
+#define PIXMAN_FORMAT_RGB(f)	(((f)      ) & 0xfff)
+#define PIXMAN_FORMAT_VIS(f)	(((f)      ) & 0xffff)
+#define PIXMAN_FORMAT_DEPTH(f)	(PIXMAN_FORMAT_A(f) +	\
+				 PIXMAN_FORMAT_R(f) +	\
+				 PIXMAN_FORMAT_G(f) +	\
+				 PIXMAN_FORMAT_B(f))
+
+#define PIXMAN_TYPE_OTHER	0
+#define PIXMAN_TYPE_A		1
+#define PIXMAN_TYPE_ARGB	2
+#define PIXMAN_TYPE_ABGR	3
+#define PIXMAN_TYPE_COLOR	4
+#define PIXMAN_TYPE_GRAY	5
+#define PIXMAN_TYPE_YUY2	6
+#define PIXMAN_TYPE_YV12	7
+#define PIXMAN_TYPE_BGRA	8
+#define PIXMAN_TYPE_RGBA	9
+
+#define PIXMAN_FORMAT_COLOR(f)				\
+	(PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_ARGB ||	\
+	 PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_ABGR ||	\
+	 PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_BGRA ||	\
+	 PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_RGBA)
+
+/* 32bpp formats */
+typedef enum { /* each code is PIXMAN_FORMAT(bpp, type, a, r, g, b) */
+    PIXMAN_a8r8g8b8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,8,8,8,8),
+    PIXMAN_x8r8g8b8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,8,8,8),
+    PIXMAN_a8b8g8r8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,8,8,8,8),
+    PIXMAN_x8b8g8r8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,0,8,8,8),
+    PIXMAN_b8g8r8a8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_BGRA,8,8,8,8),
+    PIXMAN_b8g8r8x8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_BGRA,0,8,8,8),
+    PIXMAN_r8g8b8a8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_RGBA,8,8,8,8),
+    PIXMAN_r8g8b8x8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_RGBA,0,8,8,8),
+    PIXMAN_x14r6g6b6 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,6,6,6),
+    PIXMAN_x2r10g10b10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,10,10,10),
+    PIXMAN_a2r10g10b10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,2,10,10,10),
+    PIXMAN_x2b10g10r10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,0,10,10,10),
+    PIXMAN_a2b10g10r10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,2,10,10,10),
+
+/* 24bpp formats */
+    PIXMAN_r8g8b8 =	 PIXMAN_FORMAT(24,PIXMAN_TYPE_ARGB,0,8,8,8),
+    PIXMAN_b8g8r8 =	 PIXMAN_FORMAT(24,PIXMAN_TYPE_ABGR,0,8,8,8),
+
+/* 16bpp formats */
+    PIXMAN_r5g6b5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,5,6,5),
+    PIXMAN_b5g6r5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,5,6,5),
+
+    PIXMAN_a1r5g5b5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,1,5,5,5),
+    PIXMAN_x1r5g5b5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,5,5,5),
+    PIXMAN_a1b5g5r5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,1,5,5,5),
+    PIXMAN_x1b5g5r5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,5,5,5),
+    PIXMAN_a4r4g4b4 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,4,4,4,4),
+    PIXMAN_x4r4g4b4 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,4,4,4),
+    PIXMAN_a4b4g4r4 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,4,4,4,4),
+    PIXMAN_x4b4g4r4 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,4,4,4),
+
+/* 8bpp formats */
+    PIXMAN_a8 =		 PIXMAN_FORMAT(8,PIXMAN_TYPE_A,8,0,0,0),
+    PIXMAN_r3g3b2 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_ARGB,0,3,3,2),
+    PIXMAN_b2g3r3 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_ABGR,0,3,3,2),
+    PIXMAN_a2r2g2b2 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_ARGB,2,2,2,2),
+    PIXMAN_a2b2g2r2 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_ABGR,2,2,2,2),
+
+    PIXMAN_c8 =		 PIXMAN_FORMAT(8,PIXMAN_TYPE_COLOR,0,0,0,0),
+    PIXMAN_g8 =		 PIXMAN_FORMAT(8,PIXMAN_TYPE_GRAY,0,0,0,0),
+
+    PIXMAN_x4a4 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_A,4,0,0,0),
+
+    PIXMAN_x4c4 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_COLOR,0,0,0,0), /* NOTE: same encoded value as PIXMAN_c8 */
+    PIXMAN_x4g4 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_GRAY,0,0,0,0), /* NOTE: same encoded value as PIXMAN_g8 */
+
+/* 4bpp formats */
+    PIXMAN_a4 =		 PIXMAN_FORMAT(4,PIXMAN_TYPE_A,4,0,0,0),
+    PIXMAN_r1g2b1 =	 PIXMAN_FORMAT(4,PIXMAN_TYPE_ARGB,0,1,2,1),
+    PIXMAN_b1g2r1 =	 PIXMAN_FORMAT(4,PIXMAN_TYPE_ABGR,0,1,2,1),
+    PIXMAN_a1r1g1b1 =	 PIXMAN_FORMAT(4,PIXMAN_TYPE_ARGB,1,1,1,1),
+    PIXMAN_a1b1g1r1 =	 PIXMAN_FORMAT(4,PIXMAN_TYPE_ABGR,1,1,1,1),
+
+    PIXMAN_c4 =		 PIXMAN_FORMAT(4,PIXMAN_TYPE_COLOR,0,0,0,0),
+    PIXMAN_g4 =		 PIXMAN_FORMAT(4,PIXMAN_TYPE_GRAY,0,0,0,0),
+
+/* 1bpp formats */
+    PIXMAN_a1 =		 PIXMAN_FORMAT(1,PIXMAN_TYPE_A,1,0,0,0),
+
+    PIXMAN_g1 =		 PIXMAN_FORMAT(1,PIXMAN_TYPE_GRAY,0,0,0,0),
+
+/* YUV formats */
+    PIXMAN_yuy2 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_YUY2,0,0,0,0),
+    PIXMAN_yv12 =	 PIXMAN_FORMAT(12,PIXMAN_TYPE_YV12,0,0,0,0)
+} pixman_format_code_t;
+
+/* Querying supported format values. */
+pixman_bool_t pixman_format_supported_destination (pixman_format_code_t format);
+pixman_bool_t pixman_format_supported_source      (pixman_format_code_t format);
+
+/* Constructors */
+pixman_image_t *pixman_image_create_solid_fill       (pixman_color_t               *color);
+pixman_image_t *pixman_image_create_linear_gradient  (pixman_point_fixed_t         *p1,
+						      pixman_point_fixed_t         *p2,
+						      const pixman_gradient_stop_t *stops,
+						      int                           n_stops);
+pixman_image_t *pixman_image_create_radial_gradient  (pixman_point_fixed_t         *inner,
+						      pixman_point_fixed_t         *outer,
+						      pixman_fixed_t                inner_radius,
+						      pixman_fixed_t                outer_radius,
+						      const pixman_gradient_stop_t *stops,
+						      int                           n_stops);
+pixman_image_t *pixman_image_create_conical_gradient (pixman_point_fixed_t         *center,
+						      pixman_fixed_t                angle,
+						      const pixman_gradient_stop_t *stops,
+						      int                           n_stops);
+pixman_image_t *pixman_image_create_bits             (pixman_format_code_t          format,
+						      int                           width,
+						      int                           height,
+						      uint32_t                     *bits,
+						      int                           rowstride_bytes);
+
+/* Destructor */
+pixman_image_t *pixman_image_ref                     (pixman_image_t               *image);
+pixman_bool_t   pixman_image_unref                   (pixman_image_t               *image);
+
+void		pixman_image_set_destroy_function    (pixman_image_t		   *image,
+						      pixman_image_destroy_func_t   function,
+						      void			   *data);
+void *		pixman_image_get_destroy_data        (pixman_image_t		   *image);
+
+/* Set properties */
+pixman_bool_t   pixman_image_set_clip_region         (pixman_image_t               *image,
+						      pixman_region16_t            *region);
+pixman_bool_t   pixman_image_set_clip_region32       (pixman_image_t               *image,
+						      pixman_region32_t            *region);
+void		pixman_image_set_has_client_clip     (pixman_image_t               *image,
+						      pixman_bool_t		    client_clip);
+pixman_bool_t   pixman_image_set_transform           (pixman_image_t               *image,
+						      const pixman_transform_t     *transform);
+void            pixman_image_set_repeat              (pixman_image_t               *image,
+						      pixman_repeat_t               repeat);
+pixman_bool_t   pixman_image_set_filter              (pixman_image_t               *image,
+						      pixman_filter_t               filter,
+						      const pixman_fixed_t         *filter_params,
+						      int                           n_filter_params);
+void		pixman_image_set_source_clipping     (pixman_image_t		   *image,
+						      pixman_bool_t                 source_clipping);
+void            pixman_image_set_alpha_map           (pixman_image_t               *image,
+						      pixman_image_t               *alpha_map,
+						      int16_t                       x,
+						      int16_t                       y);
+void            pixman_image_set_component_alpha     (pixman_image_t               *image,
+						      pixman_bool_t                 component_alpha);
+pixman_bool_t   pixman_image_get_component_alpha     (pixman_image_t               *image);
+void		pixman_image_set_accessors	     (pixman_image_t		   *image,
+						      pixman_read_memory_func_t	    read_func,
+						      pixman_write_memory_func_t    write_func);
+void		pixman_image_set_indexed	     (pixman_image_t		   *image,
+						      const pixman_indexed_t	   *indexed);
+uint32_t       *pixman_image_get_data                (pixman_image_t               *image);
+int		pixman_image_get_width               (pixman_image_t               *image);
+int             pixman_image_get_height              (pixman_image_t               *image);
+int		pixman_image_get_stride              (pixman_image_t               *image); /* in bytes */
+int		pixman_image_get_depth               (pixman_image_t		   *image);
+pixman_format_code_t pixman_image_get_format	     (pixman_image_t		   *image);
+pixman_bool_t	pixman_image_fill_rectangles	     (pixman_op_t		    op,
+						      pixman_image_t		   *image,
+						      pixman_color_t		   *color,
+						      int			    n_rects,
+						      const pixman_rectangle16_t   *rects);
+pixman_bool_t   pixman_image_fill_boxes              (pixman_op_t                   op,
+                                                      pixman_image_t               *dest,
+                                                      pixman_color_t               *color,
+                                                      int                           n_boxes,
+                                                      const pixman_box32_t         *boxes);
+
+/* Composite */
+pixman_bool_t pixman_compute_composite_region (pixman_region16_t *region,
+					       pixman_image_t    *src_image,
+					       pixman_image_t    *mask_image,
+					       pixman_image_t    *dest_image,
+					       int16_t            src_x,
+					       int16_t            src_y,
+					       int16_t            mask_x,
+					       int16_t            mask_y,
+					       int16_t            dest_x,
+					       int16_t            dest_y,
+					       uint16_t           width,
+					       uint16_t           height);
+void          pixman_image_composite          (pixman_op_t        op,
+					       pixman_image_t    *src,
+					       pixman_image_t    *mask,
+					       pixman_image_t    *dest,
+					       int16_t            src_x,
+					       int16_t            src_y,
+					       int16_t            mask_x,
+					       int16_t            mask_y,
+					       int16_t            dest_x,
+					       int16_t            dest_y,
+					       uint16_t           width,
+					       uint16_t           height);
+void          pixman_image_composite32        (pixman_op_t        op,
+					       pixman_image_t    *src,
+					       pixman_image_t    *mask,
+					       pixman_image_t    *dest,
+					       int32_t            src_x,
+					       int32_t            src_y,
+					       int32_t            mask_x,
+					       int32_t            mask_y,
+					       int32_t            dest_x,
+					       int32_t            dest_y,
+					       int32_t            width,
+					       int32_t            height);
+
+/* Executive Summary: This function is a no-op that only exists
+ * for historical reasons.
+ *
+ * There used to be a bug in the X server where it would rely on
+ * out-of-bounds accesses when it was asked to composite with a
+ * window as the source. It would create a pixman image pointing
+ * to some bogus position in memory, but then set a clip region
+ * to the position where the actual bits were.
+ *
+ * Due to a bug in old versions of pixman, where it would not clip
+ * against the image bounds when a clip region was set, this would
+ * actually work. So when the pixman bug was fixed, a workaround was
+ * added to allow certain out-of-bound accesses. This function disabled
+ * those workarounds.
+ *
+ * Since 0.21.2, pixman doesn't do these workarounds anymore, so now this
+ * function is a no-op.
+ */
+void pixman_disable_out_of_bounds_workaround (void);
+
+/*
+ * Trapezoids
+ */
+typedef struct pixman_edge pixman_edge_t;
+typedef struct pixman_trapezoid pixman_trapezoid_t;
+typedef struct pixman_trap pixman_trap_t;
+typedef struct pixman_span_fix pixman_span_fix_t;
+typedef struct pixman_triangle pixman_triangle_t;
+
+/*
+ * An edge structure.  This represents a single polygon edge
+ * and can be quickly stepped across small or large gaps in the
+ * sample grid
+ */
+struct pixman_edge
+{
+    pixman_fixed_t	x;
+    pixman_fixed_t	e;
+    pixman_fixed_t	stepx;
+    pixman_fixed_t	signdx;
+    pixman_fixed_t	dy;
+    pixman_fixed_t	dx;
+
+    pixman_fixed_t	stepx_small;
+    pixman_fixed_t	stepx_big;
+    pixman_fixed_t	dx_small;
+    pixman_fixed_t	dx_big;
+};
+
+struct pixman_trapezoid
+{
+    pixman_fixed_t	top, bottom;
+    pixman_line_fixed_t	left, right;
+};
+
+struct pixman_triangle
+{
+    pixman_point_fixed_t p1, p2, p3;
+};
+
+/* whether 't' is a well defined not obviously empty trapezoid */
+#define pixman_trapezoid_valid(t)				   \
+    ((t)->left.p1.y != (t)->left.p2.y &&			   \
+     (t)->right.p1.y != (t)->right.p2.y &&			   \
+     (int) ((t)->bottom - (t)->top) > 0)
+
+struct pixman_span_fix
+{
+    pixman_fixed_t	l, r, y;
+};
+
+struct pixman_trap
+{
+    pixman_span_fix_t	top, bot;
+};
+
+pixman_fixed_t pixman_sample_ceil_y        (pixman_fixed_t             y,
+					    int                        bpp);
+pixman_fixed_t pixman_sample_floor_y       (pixman_fixed_t             y,
+					    int                        bpp);
+void           pixman_edge_step            (pixman_edge_t             *e,
+					    int                        n);
+void           pixman_edge_init            (pixman_edge_t             *e,
+					    int                        bpp,
+					    pixman_fixed_t             y_start,
+					    pixman_fixed_t             x_top,
+					    pixman_fixed_t             y_top,
+					    pixman_fixed_t             x_bot,
+					    pixman_fixed_t             y_bot);
+void           pixman_line_fixed_edge_init (pixman_edge_t             *e,
+					    int                        bpp,
+					    pixman_fixed_t             y,
+					    const pixman_line_fixed_t *line,
+					    int                        x_off,
+					    int                        y_off);
+void           pixman_rasterize_edges      (pixman_image_t            *image,
+					    pixman_edge_t             *l,
+					    pixman_edge_t             *r,
+					    pixman_fixed_t             t,
+					    pixman_fixed_t             b);
+void           pixman_add_traps            (pixman_image_t            *image,
+					    int16_t                    x_off,
+					    int16_t                    y_off,
+					    int                        ntrap,
+					    pixman_trap_t             *traps);
+void           pixman_add_trapezoids       (pixman_image_t            *image,
+					    int16_t                    x_off,
+					    int                        y_off,
+					    int                        ntraps,
+					    const pixman_trapezoid_t  *traps);
+void           pixman_rasterize_trapezoid  (pixman_image_t            *image,
+					    const pixman_trapezoid_t  *trap,
+					    int                        x_off,
+					    int                        y_off);
+void          pixman_composite_trapezoids (pixman_op_t		       op,
+					   pixman_image_t *	       src,
+					   pixman_image_t *	       dst,
+					   pixman_format_code_t	       mask_format,
+					   int			       x_src,
+					   int			       y_src,
+					   int			       x_dst,
+					   int			       y_dst,
+					   int			       n_traps,
+					   const pixman_trapezoid_t *  traps);
+void          pixman_composite_triangles (pixman_op_t		       op,
+					  pixman_image_t *	       src,
+					  pixman_image_t *	       dst,
+					  pixman_format_code_t	       mask_format,
+					  int			       x_src,
+					  int			       y_src,
+					  int			       x_dst,
+					  int			       y_dst,
+					  int			       n_tris,
+					  const pixman_triangle_t *    tris);
+void	      pixman_add_triangles       (pixman_image_t              *image,
+					  int32_t	               x_off,
+					  int32_t	               y_off,
+					  int	                       n_tris,
+					  const pixman_triangle_t     *tris);
+
+PIXMAN_END_DECLS
+
+#endif /* PIXMAN_H__ */
diff --git a/pixman/test/Makefile.am b/pixman/test/Makefile.am
index 56d56aeb1..52ef8ad96 100644
--- a/pixman/test/Makefile.am
+++ b/pixman/test/Makefile.am
@@ -1,50 +1,50 @@
-AM_CFLAGS = @OPENMP_CFLAGS@
-AM_LDFLAGS = @OPENMP_CFLAGS@ @TESTPROGS_EXTRA_LDFLAGS@
-LDADD = $(top_builddir)/pixman/libpixman-1.la -lm -lpng
-INCLUDES = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman
-
-TESTPROGRAMS =			\
-	a1-trap-test		\
-	pdf-op-test		\
-	region-test		\
-	region-translate-test	\
-	fetch-test		\
-	oob-test		\
-	trap-crasher		\
-	alpha-loop		\
-	scaling-crash-test	\
-	scaling-helpers-test	\
-	gradient-crash-test	\
-	region-contains-test	\
-	alphamap		\
-	stress-test		\
-	composite-traps-test	\
-	blitters-test		\
-	scaling-test		\
-	affine-test		\
-	composite
-
-pdf_op_test_SOURCES = pdf-op-test.c utils.c utils.h
-region_test_SOURCES = region-test.c utils.c utils.h
-blitters_test_SOURCES = blitters-test.c utils.c utils.h
-region_contains_test_SOURCES = region-contains-test.c utils.c utils.h
-composite_traps_test_SOURCES = composite-traps-test.c utils.c utils.h
-scaling_test_SOURCES = scaling-test.c utils.c utils.h
-affine_test_SOURCES = affine-test.c utils.c utils.h
-alphamap_SOURCES = alphamap.c utils.c utils.h
-alpha_loop_SOURCES = alpha-loop.c utils.c utils.h
-composite_SOURCES = composite.c utils.c utils.h
-gradient_crash_test_SOURCES = gradient-crash-test.c utils.c utils.h
-stress_test_SOURCES = stress-test.c utils.c utils.h
-scaling_helpers_test_SOURCES = scaling-helpers-test.c utils.c utils.h
-
-# Benchmarks
-
-BENCHMARKS =			\
-	lowlevel-blt-bench
-
-lowlevel_blt_bench_SOURCES = lowlevel-blt-bench.c utils.c utils.h
-
-noinst_PROGRAMS = $(TESTPROGRAMS) $(BENCHMARKS)
-
-TESTS = $(TESTPROGRAMS)
+AM_CFLAGS = @OPENMP_CFLAGS@
+AM_LDFLAGS = @OPENMP_CFLAGS@ @TESTPROGS_EXTRA_LDFLAGS@
+LDADD = $(top_builddir)/pixman/libpixman-1.la -lm -lpng
+INCLUDES = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman
+
+TESTPROGRAMS =			\
+	a1-trap-test		\
+	pdf-op-test		\
+	region-test		\
+	region-translate-test	\
+	fetch-test		\
+	oob-test		\
+	trap-crasher		\
+	alpha-loop		\
+	scaling-crash-test	\
+	scaling-helpers-test	\
+	gradient-crash-test	\
+	region-contains-test	\
+	alphamap		\
+	stress-test		\
+	composite-traps-test	\
+	blitters-test		\
+	scaling-test		\
+	affine-test		\
+	composite
+
+pdf_op_test_SOURCES = pdf-op-test.c utils.c utils.h
+region_test_SOURCES = region-test.c utils.c utils.h
+blitters_test_SOURCES = blitters-test.c utils.c utils.h
+region_contains_test_SOURCES = region-contains-test.c utils.c utils.h
+composite_traps_test_SOURCES = composite-traps-test.c utils.c utils.h
+scaling_test_SOURCES = scaling-test.c utils.c utils.h
+affine_test_SOURCES = affine-test.c utils.c utils.h
+alphamap_SOURCES = alphamap.c utils.c utils.h
+alpha_loop_SOURCES = alpha-loop.c utils.c utils.h
+composite_SOURCES = composite.c utils.c utils.h
+gradient_crash_test_SOURCES = gradient-crash-test.c utils.c utils.h
+stress_test_SOURCES = stress-test.c utils.c utils.h
+scaling_helpers_test_SOURCES = scaling-helpers-test.c utils.c utils.h
+
+# Benchmarks
+
+BENCHMARKS =			\
+	lowlevel-blt-bench
+
+lowlevel_blt_bench_SOURCES = lowlevel-blt-bench.c utils.c utils.h
+
+noinst_PROGRAMS = $(TESTPROGRAMS) $(BENCHMARKS)
+
+TESTS = $(TESTPROGRAMS)
diff --git a/pixman/test/blitters-test.c b/pixman/test/blitters-test.c
index ba81fc71a..594ec548b 100644
--- a/pixman/test/blitters-test.c
+++ b/pixman/test/blitters-test.c
@@ -1,428 +1,428 @@
-/*
- * Test program, which stresses the use of different color formats and
- * compositing operations.
- *
- * Script 'fuzzer-find-diff.pl' can be used to narrow down the problem in
- * the case of test failure.
- */
-#include <assert.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <config.h>
-#include "utils.h"
-
-static pixman_indexed_t rgb_palette[9];
-static pixman_indexed_t y_palette[9];
-
-/* The first eight format in the list are by far the most widely
- * used formats, so we test those more than the others
- */
-#define N_MOST_LIKELY_FORMATS 8
-
-/* Create random image for testing purposes */
-static pixman_image_t *
-create_random_image (pixman_format_code_t *allowed_formats,
-		     int                   max_width,
-		     int                   max_height,
-		     int                   max_extra_stride,
-		     pixman_format_code_t *used_fmt)
-{
-    int n = 0, i, width, height, stride;
-    pixman_format_code_t fmt;
-    uint32_t *buf;
-    pixman_image_t *img;
-
-    while (allowed_formats[n] != PIXMAN_null)
-	n++;
-
-    if (n > N_MOST_LIKELY_FORMATS && lcg_rand_n (4) != 0)
-	n = N_MOST_LIKELY_FORMATS;
-    fmt = allowed_formats[lcg_rand_n (n)];
-
-    width = lcg_rand_n (max_width) + 1;
-    height = lcg_rand_n (max_height) + 1;
-    stride = (width * PIXMAN_FORMAT_BPP (fmt) + 7) / 8 +
-	lcg_rand_n (max_extra_stride + 1);
-    stride = (stride + 3) & ~3;
-
-    /* do the allocation */
-    buf = aligned_malloc (64, stride * height);
-
-    /* initialize image with random data */
-    for (i = 0; i < stride * height; i++)
-    {
-	/* generation is biased to having more 0 or 255 bytes as
-	 * they are more likely to be special-cased in code
-	 */
-	*((uint8_t *)buf + i) = lcg_rand_n (4) ? lcg_rand_n (256) :
-	    (lcg_rand_n (2) ? 0 : 255);
-    }
-
-    img = pixman_image_create_bits (fmt, width, height, buf, stride);
-
-    if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_COLOR)
-    {
-	pixman_image_set_indexed (img, &(rgb_palette[PIXMAN_FORMAT_BPP (fmt)]));
-    }
-    else if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_GRAY)
-    {
-	pixman_image_set_indexed (img, &(y_palette[PIXMAN_FORMAT_BPP (fmt)]));
-    }
-
-    image_endian_swap (img);
-
-    if (used_fmt) *used_fmt = fmt;
-    return img;
-}
-
-/* Free random image, and optionally update crc32 based on its data */
-static uint32_t
-free_random_image (uint32_t initcrc,
-		   pixman_image_t *img,
-		   pixman_format_code_t fmt)
-{
-    uint32_t crc32 = 0;
-    int stride = pixman_image_get_stride (img);
-    uint32_t *data = pixman_image_get_data (img);
-    int height = pixman_image_get_height (img);
-
-    if (fmt != PIXMAN_null)
-    {
-	/* mask unused 'x' part */
-	if (PIXMAN_FORMAT_BPP (fmt) - PIXMAN_FORMAT_DEPTH (fmt) &&
-	    PIXMAN_FORMAT_DEPTH (fmt) != 0)
-	{
-	    int i;
-	    uint32_t *data = pixman_image_get_data (img);
-	    uint32_t mask = (1 << PIXMAN_FORMAT_DEPTH (fmt)) - 1;
-
-	    if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_BGRA ||
-		PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_RGBA)
-	    {
-		mask <<= (PIXMAN_FORMAT_BPP (fmt) - PIXMAN_FORMAT_DEPTH (fmt));
-	    }
-
-	    for (i = 0; i < 32; i++)
-		mask |= mask << (i * PIXMAN_FORMAT_BPP (fmt));
-
-	    for (i = 0; i < stride * height / 4; i++)
-		data[i] &= mask;
-	}
-
-	/* swap endiannes in order to provide identical results on both big
-	 * and litte endian systems
-	 */
-	image_endian_swap (img);
-	crc32 = compute_crc32 (initcrc, data, stride * height);
-    }
-
-    pixman_image_unref (img);
-    free (data);
-
-    return crc32;
-}
-
-static pixman_op_t op_list[] = {
-    PIXMAN_OP_SRC,
-    PIXMAN_OP_OVER,
-    PIXMAN_OP_ADD,
-    PIXMAN_OP_CLEAR,
-    PIXMAN_OP_SRC,
-    PIXMAN_OP_DST,
-    PIXMAN_OP_OVER,
-    PIXMAN_OP_OVER_REVERSE,
-    PIXMAN_OP_IN,
-    PIXMAN_OP_IN_REVERSE,
-    PIXMAN_OP_OUT,
-    PIXMAN_OP_OUT_REVERSE,
-    PIXMAN_OP_ATOP,
-    PIXMAN_OP_ATOP_REVERSE,
-    PIXMAN_OP_XOR,
-    PIXMAN_OP_ADD,
-    PIXMAN_OP_SATURATE,
-    PIXMAN_OP_DISJOINT_CLEAR,
-    PIXMAN_OP_DISJOINT_SRC,
-    PIXMAN_OP_DISJOINT_DST,
-    PIXMAN_OP_DISJOINT_OVER,
-    PIXMAN_OP_DISJOINT_OVER_REVERSE,
-    PIXMAN_OP_DISJOINT_IN,
-    PIXMAN_OP_DISJOINT_IN_REVERSE,
-    PIXMAN_OP_DISJOINT_OUT,
-    PIXMAN_OP_DISJOINT_OUT_REVERSE,
-    PIXMAN_OP_DISJOINT_ATOP,
-    PIXMAN_OP_DISJOINT_ATOP_REVERSE,
-    PIXMAN_OP_DISJOINT_XOR,
-    PIXMAN_OP_CONJOINT_CLEAR,
-    PIXMAN_OP_CONJOINT_SRC,
-    PIXMAN_OP_CONJOINT_DST,
-    PIXMAN_OP_CONJOINT_OVER,
-    PIXMAN_OP_CONJOINT_OVER_REVERSE,
-    PIXMAN_OP_CONJOINT_IN,
-    PIXMAN_OP_CONJOINT_IN_REVERSE,
-    PIXMAN_OP_CONJOINT_OUT,
-    PIXMAN_OP_CONJOINT_OUT_REVERSE,
-    PIXMAN_OP_CONJOINT_ATOP,
-    PIXMAN_OP_CONJOINT_ATOP_REVERSE,
-    PIXMAN_OP_CONJOINT_XOR,
-    PIXMAN_OP_MULTIPLY,
-    PIXMAN_OP_SCREEN,
-    PIXMAN_OP_OVERLAY,
-    PIXMAN_OP_DARKEN,
-    PIXMAN_OP_LIGHTEN,
-    PIXMAN_OP_COLOR_DODGE,
-    PIXMAN_OP_COLOR_BURN,
-    PIXMAN_OP_HARD_LIGHT,
-    PIXMAN_OP_DIFFERENCE,
-    PIXMAN_OP_EXCLUSION,
-#if 0 /* these use floating point math and are not always bitexact on different platforms */
-    PIXMAN_OP_SOFT_LIGHT,
-    PIXMAN_OP_HSL_HUE,
-    PIXMAN_OP_HSL_SATURATION,
-    PIXMAN_OP_HSL_COLOR,
-    PIXMAN_OP_HSL_LUMINOSITY,
-#endif
-};
-
-static pixman_format_code_t img_fmt_list[] = {
-    PIXMAN_a8r8g8b8,
-    PIXMAN_a8b8g8r8,
-    PIXMAN_x8r8g8b8,
-    PIXMAN_x8b8g8r8,
-    PIXMAN_r5g6b5,
-    PIXMAN_b5g6r5,
-    PIXMAN_a8,
-    PIXMAN_a1,
-    PIXMAN_r3g3b2,
-    PIXMAN_b8g8r8a8,
-    PIXMAN_b8g8r8x8,
-    PIXMAN_r8g8b8a8,
-    PIXMAN_r8g8b8x8,
-    PIXMAN_x14r6g6b6,
-    PIXMAN_r8g8b8,
-    PIXMAN_b8g8r8,
-    PIXMAN_x2r10g10b10,
-    PIXMAN_a2r10g10b10,
-    PIXMAN_x2b10g10r10,
-    PIXMAN_a2b10g10r10,
-    PIXMAN_a1r5g5b5,
-    PIXMAN_x1r5g5b5,
-    PIXMAN_a1b5g5r5,
-    PIXMAN_x1b5g5r5,
-    PIXMAN_a4r4g4b4,
-    PIXMAN_x4r4g4b4,
-    PIXMAN_a4b4g4r4,
-    PIXMAN_x4b4g4r4,
-    PIXMAN_r3g3b2,
-    PIXMAN_b2g3r3,
-    PIXMAN_a2r2g2b2,
-    PIXMAN_a2b2g2r2,
-    PIXMAN_c8,
-    PIXMAN_g8,
-    PIXMAN_x4c4,
-    PIXMAN_x4g4,
-    PIXMAN_c4,
-    PIXMAN_g4,
-    PIXMAN_g1,
-    PIXMAN_x4a4,
-    PIXMAN_a4,
-    PIXMAN_r1g2b1,
-    PIXMAN_b1g2r1,
-    PIXMAN_a1r1g1b1,
-    PIXMAN_a1b1g1r1,
-    PIXMAN_null
-};
-
-static pixman_format_code_t mask_fmt_list[] = {
-    PIXMAN_a8r8g8b8,
-    PIXMAN_a8,
-    PIXMAN_a4,
-    PIXMAN_a1,
-    PIXMAN_null
-};
-
-
-/*
- * Composite operation with pseudorandom images
- */
-uint32_t
-test_composite (int testnum, int verbose)
-{
-    int i;
-    pixman_image_t *src_img = NULL;
-    pixman_image_t *dst_img = NULL;
-    pixman_image_t *mask_img = NULL;
-    int src_width, src_height;
-    int dst_width, dst_height;
-    int src_stride, dst_stride;
-    int src_x, src_y;
-    int dst_x, dst_y;
-    int mask_x, mask_y;
-    int w, h;
-    pixman_op_t op;
-    pixman_format_code_t src_fmt, dst_fmt, mask_fmt;
-    uint32_t *dstbuf, *srcbuf, *maskbuf;
-    uint32_t crc32;
-    int max_width, max_height, max_extra_stride;
-    FLOAT_REGS_CORRUPTION_DETECTOR_START ();
-
-    max_width = max_height = 24 + testnum / 10000;
-    max_extra_stride = 4 + testnum / 1000000;
-
-    if (max_width > 256)
-	max_width = 256;
-
-    if (max_height > 16)
-	max_height = 16;
-
-    if (max_extra_stride > 8)
-	max_extra_stride = 8;
-
-    lcg_srand (testnum);
-
-    op = op_list[lcg_rand_n (sizeof (op_list) / sizeof (op_list[0]))];
-
-    if (lcg_rand_n (8))
-    {
-	/* normal image */
-	src_img = create_random_image (img_fmt_list, max_width, max_height,
-				       max_extra_stride, &src_fmt);
-    }
-    else
-    {
-	/* solid case */
-	src_img = create_random_image (img_fmt_list, 1, 1,
-				       max_extra_stride, &src_fmt);
-
-	pixman_image_set_repeat (src_img, PIXMAN_REPEAT_NORMAL);
-    }
-
-    dst_img = create_random_image (img_fmt_list, max_width, max_height,
-				   max_extra_stride, &dst_fmt);
-
-    src_width = pixman_image_get_width (src_img);
-    src_height = pixman_image_get_height (src_img);
-    src_stride = pixman_image_get_stride (src_img);
-
-    dst_width = pixman_image_get_width (dst_img);
-    dst_height = pixman_image_get_height (dst_img);
-    dst_stride = pixman_image_get_stride (dst_img);
-
-    dstbuf = pixman_image_get_data (dst_img);
-    srcbuf = pixman_image_get_data (src_img);
-
-    src_x = lcg_rand_n (src_width);
-    src_y = lcg_rand_n (src_height);
-    dst_x = lcg_rand_n (dst_width);
-    dst_y = lcg_rand_n (dst_height);
-
-    mask_img = NULL;
-    mask_fmt = PIXMAN_null;
-    mask_x = 0;
-    mask_y = 0;
-    maskbuf = NULL;
-
-    if ((src_fmt == PIXMAN_x8r8g8b8 || src_fmt == PIXMAN_x8b8g8r8) &&
-	(lcg_rand_n (4) == 0))
-    {
-	/* PIXBUF */
-	mask_fmt = lcg_rand_n (2) ? PIXMAN_a8r8g8b8 : PIXMAN_a8b8g8r8;
-	mask_img = pixman_image_create_bits (mask_fmt,
-	                                     src_width,
-	                                     src_height,
-	                                     srcbuf,
-	                                     src_stride);
-	mask_x = src_x;
-	mask_y = src_y;
-	maskbuf = srcbuf;
-    }
-    else if (lcg_rand_n (2))
-    {
-	if (lcg_rand_n (2))
-	{
-	    mask_img = create_random_image (mask_fmt_list, max_width, max_height,
-					   max_extra_stride, &mask_fmt);
-	}
-	else
-	{
-	    /* solid case */
-	    mask_img = create_random_image (mask_fmt_list, 1, 1,
-					   max_extra_stride, &mask_fmt);
-	    pixman_image_set_repeat (mask_img, PIXMAN_REPEAT_NORMAL);
-	}
-
-	if (lcg_rand_n (2))
-	    pixman_image_set_component_alpha (mask_img, 1);
-
-	mask_x = lcg_rand_n (pixman_image_get_width (mask_img));
-	mask_y = lcg_rand_n (pixman_image_get_height (mask_img));
-    }
-
-
-    w = lcg_rand_n (dst_width - dst_x + 1);
-    h = lcg_rand_n (dst_height - dst_y + 1);
-
-    if (verbose)
-    {
-	printf ("op=%d, src_fmt=%08X, dst_fmt=%08X, mask_fmt=%08X\n",
-	    op, src_fmt, dst_fmt, mask_fmt);
-	printf ("src_width=%d, src_height=%d, dst_width=%d, dst_height=%d\n",
-	    src_width, src_height, dst_width, dst_height);
-	printf ("src_x=%d, src_y=%d, dst_x=%d, dst_y=%d\n",
-	    src_x, src_y, dst_x, dst_y);
-	printf ("src_stride=%d, dst_stride=%d\n",
-	    src_stride, dst_stride);
-	printf ("w=%d, h=%d\n", w, h);
-    }
-
-    pixman_image_composite (op, src_img, mask_img, dst_img,
-			    src_x, src_y, mask_x, mask_y, dst_x, dst_y, w, h);
-
-    if (verbose)
-    {
-	int j;
-
-	printf ("---\n");
-	for (i = 0; i < dst_height; i++)
-	{
-	    for (j = 0; j < dst_stride; j++)
-	    {
-		if (j == (dst_width * PIXMAN_FORMAT_BPP (dst_fmt) + 7) / 8)
-		    printf ("| ");
-
-		printf ("%02X ", *((uint8_t *)dstbuf + i * dst_stride + j));
-	    }
-	    printf ("\n");
-	}
-	printf ("---\n");
-    }
-
-    free_random_image (0, src_img, PIXMAN_null);
-    crc32 = free_random_image (0, dst_img, dst_fmt);
-
-    if (mask_img)
-    {
-	if (srcbuf == maskbuf)
-	    pixman_image_unref(mask_img);
-	else
-	    free_random_image (0, mask_img, PIXMAN_null);
-    }
-
-    FLOAT_REGS_CORRUPTION_DETECTOR_FINISH ();
-    return crc32;
-}
-
-int
-main (int argc, const char *argv[])
-{
-    int i;
-
-    for (i = 1; i <= 8; i++)
-    {
-	initialize_palette (&(rgb_palette[i]), i, TRUE);
-	initialize_palette (&(y_palette[i]), i, FALSE);
-    }
-
-    return fuzzer_test_main("blitters", 2000000,
-			    0xB610300B,
-			    test_composite, argc, argv);
-}
+/*
+ * Test program, which stresses the use of different color formats and
+ * compositing operations.
+ *
+ * Script 'fuzzer-find-diff.pl' can be used to narrow down the problem in
+ * the case of test failure.
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <config.h>
+#include "utils.h"
+
+static pixman_indexed_t rgb_palette[9];
+static pixman_indexed_t y_palette[9];
+
+/* The first eight format in the list are by far the most widely
+ * used formats, so we test those more than the others
+ */
+#define N_MOST_LIKELY_FORMATS 8
+
+/* Create random image for testing purposes */
+static pixman_image_t *
+create_random_image (pixman_format_code_t *allowed_formats,
+		     int                   max_width,
+		     int                   max_height,
+		     int                   max_extra_stride,
+		     pixman_format_code_t *used_fmt)
+{
+    int n = 0, i, width, height, stride;
+    pixman_format_code_t fmt;
+    uint32_t *buf;
+    pixman_image_t *img;
+
+    while (allowed_formats[n] != PIXMAN_null)
+	n++;
+
+    if (n > N_MOST_LIKELY_FORMATS && lcg_rand_n (4) != 0)
+	n = N_MOST_LIKELY_FORMATS;
+    fmt = allowed_formats[lcg_rand_n (n)];
+
+    width = lcg_rand_n (max_width) + 1;
+    height = lcg_rand_n (max_height) + 1;
+    stride = (width * PIXMAN_FORMAT_BPP (fmt) + 7) / 8 +
+	lcg_rand_n (max_extra_stride + 1);
+    stride = (stride + 3) & ~3;
+
+    /* do the allocation */
+    buf = aligned_malloc (64, stride * height);
+
+    /* initialize image with random data */
+    for (i = 0; i < stride * height; i++)
+    {
+	/* generation is biased to having more 0 or 255 bytes as
+	 * they are more likely to be special-cased in code
+	 */
+	*((uint8_t *)buf + i) = lcg_rand_n (4) ? lcg_rand_n (256) :
+	    (lcg_rand_n (2) ? 0 : 255);
+    }
+
+    img = pixman_image_create_bits (fmt, width, height, buf, stride);
+
+    if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_COLOR)
+    {
+	pixman_image_set_indexed (img, &(rgb_palette[PIXMAN_FORMAT_BPP (fmt)]));
+    }
+    else if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_GRAY)
+    {
+	pixman_image_set_indexed (img, &(y_palette[PIXMAN_FORMAT_BPP (fmt)]));
+    }
+
+    image_endian_swap (img);
+
+    if (used_fmt) *used_fmt = fmt;
+    return img;
+}
+
+/* Free random image, and optionally update crc32 based on its data */
+static uint32_t
+free_random_image (uint32_t initcrc,
+		   pixman_image_t *img,
+		   pixman_format_code_t fmt)
+{
+    uint32_t crc32 = 0;
+    int stride = pixman_image_get_stride (img);
+    uint32_t *data = pixman_image_get_data (img);
+    int height = pixman_image_get_height (img);
+
+    if (fmt != PIXMAN_null)
+    {
+	/* mask unused 'x' part */
+	if (PIXMAN_FORMAT_BPP (fmt) - PIXMAN_FORMAT_DEPTH (fmt) &&
+	    PIXMAN_FORMAT_DEPTH (fmt) != 0)
+	{
+	    int i;
+	    uint32_t *data = pixman_image_get_data (img);
+	    uint32_t mask = (1 << PIXMAN_FORMAT_DEPTH (fmt)) - 1;
+
+	    if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_BGRA ||
+		PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_RGBA)
+	    {
+		mask <<= (PIXMAN_FORMAT_BPP (fmt) - PIXMAN_FORMAT_DEPTH (fmt));
+	    }
+
+	    for (i = 0; i < 32; i++)
+		mask |= mask << (i * PIXMAN_FORMAT_BPP (fmt));
+
+	    for (i = 0; i < stride * height / 4; i++)
+		data[i] &= mask;
+	}
+
+	/* swap endiannes in order to provide identical results on both big
+	 * and litte endian systems
+	 */
+	image_endian_swap (img);
+	crc32 = compute_crc32 (initcrc, data, stride * height);
+    }
+
+    pixman_image_unref (img);
+    free (data);
+
+    return crc32;
+}
+
+static pixman_op_t op_list[] = {
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_CLEAR,
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_DST,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_OVER_REVERSE,
+    PIXMAN_OP_IN,
+    PIXMAN_OP_IN_REVERSE,
+    PIXMAN_OP_OUT,
+    PIXMAN_OP_OUT_REVERSE,
+    PIXMAN_OP_ATOP,
+    PIXMAN_OP_ATOP_REVERSE,
+    PIXMAN_OP_XOR,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_SATURATE,
+    PIXMAN_OP_DISJOINT_CLEAR,
+    PIXMAN_OP_DISJOINT_SRC,
+    PIXMAN_OP_DISJOINT_DST,
+    PIXMAN_OP_DISJOINT_OVER,
+    PIXMAN_OP_DISJOINT_OVER_REVERSE,
+    PIXMAN_OP_DISJOINT_IN,
+    PIXMAN_OP_DISJOINT_IN_REVERSE,
+    PIXMAN_OP_DISJOINT_OUT,
+    PIXMAN_OP_DISJOINT_OUT_REVERSE,
+    PIXMAN_OP_DISJOINT_ATOP,
+    PIXMAN_OP_DISJOINT_ATOP_REVERSE,
+    PIXMAN_OP_DISJOINT_XOR,
+    PIXMAN_OP_CONJOINT_CLEAR,
+    PIXMAN_OP_CONJOINT_SRC,
+    PIXMAN_OP_CONJOINT_DST,
+    PIXMAN_OP_CONJOINT_OVER,
+    PIXMAN_OP_CONJOINT_OVER_REVERSE,
+    PIXMAN_OP_CONJOINT_IN,
+    PIXMAN_OP_CONJOINT_IN_REVERSE,
+    PIXMAN_OP_CONJOINT_OUT,
+    PIXMAN_OP_CONJOINT_OUT_REVERSE,
+    PIXMAN_OP_CONJOINT_ATOP,
+    PIXMAN_OP_CONJOINT_ATOP_REVERSE,
+    PIXMAN_OP_CONJOINT_XOR,
+    PIXMAN_OP_MULTIPLY,
+    PIXMAN_OP_SCREEN,
+    PIXMAN_OP_OVERLAY,
+    PIXMAN_OP_DARKEN,
+    PIXMAN_OP_LIGHTEN,
+    PIXMAN_OP_COLOR_DODGE,
+    PIXMAN_OP_COLOR_BURN,
+    PIXMAN_OP_HARD_LIGHT,
+    PIXMAN_OP_DIFFERENCE,
+    PIXMAN_OP_EXCLUSION,
+#if 0 /* these use floating point math and are not always bitexact on different platforms */
+    PIXMAN_OP_SOFT_LIGHT,
+    PIXMAN_OP_HSL_HUE,
+    PIXMAN_OP_HSL_SATURATION,
+    PIXMAN_OP_HSL_COLOR,
+    PIXMAN_OP_HSL_LUMINOSITY,
+#endif
+};
+
+static pixman_format_code_t img_fmt_list[] = {
+    PIXMAN_a8r8g8b8,
+    PIXMAN_a8b8g8r8,
+    PIXMAN_x8r8g8b8,
+    PIXMAN_x8b8g8r8,
+    PIXMAN_r5g6b5,
+    PIXMAN_b5g6r5,
+    PIXMAN_a8,
+    PIXMAN_a1,
+    PIXMAN_r3g3b2,
+    PIXMAN_b8g8r8a8,
+    PIXMAN_b8g8r8x8,
+    PIXMAN_r8g8b8a8,
+    PIXMAN_r8g8b8x8,
+    PIXMAN_x14r6g6b6,
+    PIXMAN_r8g8b8,
+    PIXMAN_b8g8r8,
+    PIXMAN_x2r10g10b10,
+    PIXMAN_a2r10g10b10,
+    PIXMAN_x2b10g10r10,
+    PIXMAN_a2b10g10r10,
+    PIXMAN_a1r5g5b5,
+    PIXMAN_x1r5g5b5,
+    PIXMAN_a1b5g5r5,
+    PIXMAN_x1b5g5r5,
+    PIXMAN_a4r4g4b4,
+    PIXMAN_x4r4g4b4,
+    PIXMAN_a4b4g4r4,
+    PIXMAN_x4b4g4r4,
+    PIXMAN_r3g3b2,
+    PIXMAN_b2g3r3,
+    PIXMAN_a2r2g2b2,
+    PIXMAN_a2b2g2r2,
+    PIXMAN_c8,
+    PIXMAN_g8,
+    PIXMAN_x4c4,
+    PIXMAN_x4g4,
+    PIXMAN_c4,
+    PIXMAN_g4,
+    PIXMAN_g1,
+    PIXMAN_x4a4,
+    PIXMAN_a4,
+    PIXMAN_r1g2b1,
+    PIXMAN_b1g2r1,
+    PIXMAN_a1r1g1b1,
+    PIXMAN_a1b1g1r1,
+    PIXMAN_null
+};
+
+static pixman_format_code_t mask_fmt_list[] = {
+    PIXMAN_a8r8g8b8,
+    PIXMAN_a8,
+    PIXMAN_a4,
+    PIXMAN_a1,
+    PIXMAN_null
+};
+
+
+/*
+ * Composite operation with pseudorandom images
+ */
+uint32_t
+test_composite (int testnum, int verbose)
+{
+    int i;
+    pixman_image_t *src_img = NULL;
+    pixman_image_t *dst_img = NULL;
+    pixman_image_t *mask_img = NULL;
+    int src_width, src_height;
+    int dst_width, dst_height;
+    int src_stride, dst_stride;
+    int src_x, src_y;
+    int dst_x, dst_y;
+    int mask_x, mask_y;
+    int w, h;
+    pixman_op_t op;
+    pixman_format_code_t src_fmt, dst_fmt, mask_fmt;
+    uint32_t *dstbuf, *srcbuf, *maskbuf;
+    uint32_t crc32;
+    int max_width, max_height, max_extra_stride;
+    FLOAT_REGS_CORRUPTION_DETECTOR_START ();
+
+    max_width = max_height = 24 + testnum / 10000;
+    max_extra_stride = 4 + testnum / 1000000;
+
+    if (max_width > 256)
+	max_width = 256;
+
+    if (max_height > 16)
+	max_height = 16;
+
+    if (max_extra_stride > 8)
+	max_extra_stride = 8;
+
+    lcg_srand (testnum);
+
+    op = op_list[lcg_rand_n (sizeof (op_list) / sizeof (op_list[0]))];
+
+    if (lcg_rand_n (8))
+    {
+	/* normal image */
+	src_img = create_random_image (img_fmt_list, max_width, max_height,
+				       max_extra_stride, &src_fmt);
+    }
+    else
+    {
+	/* solid case */
+	src_img = create_random_image (img_fmt_list, 1, 1,
+				       max_extra_stride, &src_fmt);
+
+	pixman_image_set_repeat (src_img, PIXMAN_REPEAT_NORMAL);
+    }
+
+    dst_img = create_random_image (img_fmt_list, max_width, max_height,
+				   max_extra_stride, &dst_fmt);
+
+    src_width = pixman_image_get_width (src_img);
+    src_height = pixman_image_get_height (src_img);
+    src_stride = pixman_image_get_stride (src_img);
+
+    dst_width = pixman_image_get_width (dst_img);
+    dst_height = pixman_image_get_height (dst_img);
+    dst_stride = pixman_image_get_stride (dst_img);
+
+    dstbuf = pixman_image_get_data (dst_img);
+    srcbuf = pixman_image_get_data (src_img);
+
+    src_x = lcg_rand_n (src_width);
+    src_y = lcg_rand_n (src_height);
+    dst_x = lcg_rand_n (dst_width);
+    dst_y = lcg_rand_n (dst_height);
+
+    mask_img = NULL;
+    mask_fmt = PIXMAN_null;
+    mask_x = 0;
+    mask_y = 0;
+    maskbuf = NULL;
+
+    if ((src_fmt == PIXMAN_x8r8g8b8 || src_fmt == PIXMAN_x8b8g8r8) &&
+	(lcg_rand_n (4) == 0))
+    {
+	/* PIXBUF */
+	mask_fmt = lcg_rand_n (2) ? PIXMAN_a8r8g8b8 : PIXMAN_a8b8g8r8;
+	mask_img = pixman_image_create_bits (mask_fmt,
+	                                     src_width,
+	                                     src_height,
+	                                     srcbuf,
+	                                     src_stride);
+	mask_x = src_x;
+	mask_y = src_y;
+	maskbuf = srcbuf;
+    }
+    else if (lcg_rand_n (2))
+    {
+	if (lcg_rand_n (2))
+	{
+	    mask_img = create_random_image (mask_fmt_list, max_width, max_height,
+					   max_extra_stride, &mask_fmt);
+	}
+	else
+	{
+	    /* solid case */
+	    mask_img = create_random_image (mask_fmt_list, 1, 1,
+					   max_extra_stride, &mask_fmt);
+	    pixman_image_set_repeat (mask_img, PIXMAN_REPEAT_NORMAL);
+	}
+
+	if (lcg_rand_n (2))
+	    pixman_image_set_component_alpha (mask_img, 1);
+
+	mask_x = lcg_rand_n (pixman_image_get_width (mask_img));
+	mask_y = lcg_rand_n (pixman_image_get_height (mask_img));
+    }
+
+
+    w = lcg_rand_n (dst_width - dst_x + 1);
+    h = lcg_rand_n (dst_height - dst_y + 1);
+
+    if (verbose)
+    {
+	printf ("op=%d, src_fmt=%08X, dst_fmt=%08X, mask_fmt=%08X\n",
+	    op, src_fmt, dst_fmt, mask_fmt);
+	printf ("src_width=%d, src_height=%d, dst_width=%d, dst_height=%d\n",
+	    src_width, src_height, dst_width, dst_height);
+	printf ("src_x=%d, src_y=%d, dst_x=%d, dst_y=%d\n",
+	    src_x, src_y, dst_x, dst_y);
+	printf ("src_stride=%d, dst_stride=%d\n",
+	    src_stride, dst_stride);
+	printf ("w=%d, h=%d\n", w, h);
+    }
+
+    pixman_image_composite (op, src_img, mask_img, dst_img,
+			    src_x, src_y, mask_x, mask_y, dst_x, dst_y, w, h);
+
+    if (verbose)
+    {
+	int j;
+
+	printf ("---\n");
+	for (i = 0; i < dst_height; i++)
+	{
+	    for (j = 0; j < dst_stride; j++)
+	    {
+		if (j == (dst_width * PIXMAN_FORMAT_BPP (dst_fmt) + 7) / 8)
+		    printf ("| ");
+
+		printf ("%02X ", *((uint8_t *)dstbuf + i * dst_stride + j));
+	    }
+	    printf ("\n");
+	}
+	printf ("---\n");
+    }
+
+    free_random_image (0, src_img, PIXMAN_null);
+    crc32 = free_random_image (0, dst_img, dst_fmt);
+
+    if (mask_img)
+    {
+	if (srcbuf == maskbuf)
+	    pixman_image_unref(mask_img);
+	else
+	    free_random_image (0, mask_img, PIXMAN_null);
+    }
+
+    FLOAT_REGS_CORRUPTION_DETECTOR_FINISH ();
+    return crc32;
+}
+
+int
+main (int argc, const char *argv[])
+{
+    int i;
+
+    for (i = 1; i <= 8; i++)
+    {
+	initialize_palette (&(rgb_palette[i]), i, TRUE);
+	initialize_palette (&(y_palette[i]), i, FALSE);
+    }
+
+    return fuzzer_test_main("blitters", 2000000,
+			    0xB610300B,
+			    test_composite, argc, argv);
+}
diff --git a/pixman/test/composite-traps-test.c b/pixman/test/composite-traps-test.c
index fceeb1c7d..fa6d8a988 100644
--- a/pixman/test/composite-traps-test.c
+++ b/pixman/test/composite-traps-test.c
@@ -1,257 +1,257 @@
-/* Based loosely on scaling-test */
-
-#include <assert.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include "utils.h"
-
-#define MAX_SRC_WIDTH  48
-#define MAX_SRC_HEIGHT 48
-#define MAX_DST_WIDTH  48
-#define MAX_DST_HEIGHT 48
-#define MAX_STRIDE     4
-
-static pixman_format_code_t formats[] =
-{
-    PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_r5g6b5, PIXMAN_a1, PIXMAN_a4
-};
-
-static pixman_format_code_t mask_formats[] =
-{
-    PIXMAN_a1, PIXMAN_a4, PIXMAN_a8,
-};
-
-static pixman_op_t operators[] =
-{
-    PIXMAN_OP_OVER, PIXMAN_OP_ADD, PIXMAN_OP_SRC, PIXMAN_OP_IN
-};
-
-#define RANDOM_ELT(array)						\
-    ((array)[lcg_rand_n(ARRAY_LENGTH((array)))])
-
-static void
-destroy_bits (pixman_image_t *image, void *data)
-{
-    fence_free (data);
-}
-
-static pixman_fixed_t
-random_fixed (int n)
-{
-    return lcg_rand_N (n << 16);
-}
-
-/*
- * Composite operation with pseudorandom images
- */
-uint32_t
-test_composite (int      testnum,
-		int      verbose)
-{
-    int                i;
-    pixman_image_t *   src_img;
-    pixman_image_t *   dst_img;
-    pixman_region16_t  clip;
-    int                dst_width, dst_height;
-    int                dst_stride;
-    int                dst_x, dst_y;
-    int                dst_bpp;
-    pixman_op_t        op;
-    uint32_t *         dst_bits;
-    uint32_t           crc32;
-    pixman_format_code_t mask_format, dst_format;
-    pixman_trapezoid_t *traps;
-    int src_x, src_y;
-    int n_traps;
-
-    static pixman_color_t colors[] =
-    {
-	{ 0xffff, 0xffff, 0xffff, 0xffff },
-	{ 0x0000, 0x0000, 0x0000, 0x0000 },
-	{ 0xabcd, 0xabcd, 0x0000, 0xabcd },
-	{ 0x0000, 0x0000, 0x0000, 0xffff },
-	{ 0x0101, 0x0101, 0x0101, 0x0101 },
-	{ 0x7777, 0x6666, 0x5555, 0x9999 },
-    };
-    
-    FLOAT_REGS_CORRUPTION_DETECTOR_START ();
-
-    lcg_srand (testnum);
-
-    op = RANDOM_ELT (operators);
-    mask_format = RANDOM_ELT (mask_formats);
-
-    /* Create source image */
-    
-    if (lcg_rand_n (4) == 0)
-    {
-	src_img = pixman_image_create_solid_fill (
-	    &(colors[lcg_rand_n (ARRAY_LENGTH (colors))]));
-
-	src_x = 10;
-	src_y = 234;
-    }
-    else
-    {
-	pixman_format_code_t src_format = RANDOM_ELT(formats);
-	int src_bpp = (PIXMAN_FORMAT_BPP (src_format) + 7) / 8;
-	int src_width = lcg_rand_n (MAX_SRC_WIDTH) + 1;
-	int src_height = lcg_rand_n (MAX_SRC_HEIGHT) + 1;
-	int src_stride = src_width * src_bpp + lcg_rand_n (MAX_STRIDE) * src_bpp;
-	uint32_t *bits;
-
-	src_x = -(src_width / 4) + lcg_rand_n (src_width * 3 / 2);
-	src_y = -(src_height / 4) + lcg_rand_n (src_height * 3 / 2);
-
-	src_stride = (src_stride + 3) & ~3;
-	
-	bits = (uint32_t *)make_random_bytes (src_stride * src_height);
-
-	src_img = pixman_image_create_bits (
-	    src_format, src_width, src_height, bits, src_stride);
-
-	pixman_image_set_destroy_function (src_img, destroy_bits, bits);
-
-	if (lcg_rand_n (8) == 0)
-	{
-	    pixman_box16_t clip_boxes[2];
-	    int            n = lcg_rand_n (2) + 1;
-	    
-	    for (i = 0; i < n; i++)
-	    {
-		clip_boxes[i].x1 = lcg_rand_n (src_width);
-		clip_boxes[i].y1 = lcg_rand_n (src_height);
-		clip_boxes[i].x2 =
-		    clip_boxes[i].x1 + lcg_rand_n (src_width - clip_boxes[i].x1);
-		clip_boxes[i].y2 =
-		    clip_boxes[i].y1 + lcg_rand_n (src_height - clip_boxes[i].y1);
-		
-		if (verbose)
-		{
-		    printf ("source clip box: [%d,%d-%d,%d]\n",
-			    clip_boxes[i].x1, clip_boxes[i].y1,
-			    clip_boxes[i].x2, clip_boxes[i].y2);
-		}
-	    }
-	    
-	    pixman_region_init_rects (&clip, clip_boxes, n);
-	    pixman_image_set_clip_region (src_img, &clip);
-	    pixman_image_set_source_clipping (src_img, 1);
-	    pixman_region_fini (&clip);
-	}
-
-	image_endian_swap (src_img);
-    }
-
-    /* Create destination image */
-    {
-	dst_format = RANDOM_ELT(formats);
-	dst_bpp = (PIXMAN_FORMAT_BPP (dst_format) + 7) / 8;
-	dst_width = lcg_rand_n (MAX_DST_WIDTH) + 1;
-	dst_height = lcg_rand_n (MAX_DST_HEIGHT) + 1;
-	dst_stride = dst_width * dst_bpp + lcg_rand_n (MAX_STRIDE) * dst_bpp;
-	dst_stride = (dst_stride + 3) & ~3;
-	
-	dst_bits = (uint32_t *)make_random_bytes (dst_stride * dst_height);
-
-	dst_x = -(dst_width / 4) + lcg_rand_n (dst_width * 3 / 2);
-	dst_y = -(dst_height / 4) + lcg_rand_n (dst_height * 3 / 2);
-	
-	dst_img = pixman_image_create_bits (
-	    dst_format, dst_width, dst_height, dst_bits, dst_stride);
-
-	image_endian_swap (dst_img);
-    }
-
-    /* Create traps */
-    {
-	int i;
-
-	n_traps = lcg_rand_n (25);
-	traps = fence_malloc (n_traps * sizeof (pixman_trapezoid_t));
-
-	for (i = 0; i < n_traps; ++i)
-	{
-	    pixman_trapezoid_t *t = &(traps[i]);
-	    
-	    t->top = random_fixed (MAX_DST_HEIGHT) - MAX_DST_HEIGHT / 2;
-	    t->bottom = t->top + random_fixed (MAX_DST_HEIGHT);
-	    t->left.p1.x = random_fixed (MAX_DST_WIDTH) - MAX_DST_WIDTH / 2;
-	    t->left.p1.y = t->top - random_fixed (50);
-	    t->left.p2.x = random_fixed (MAX_DST_WIDTH) - MAX_DST_WIDTH / 2;
-	    t->left.p2.y = t->bottom + random_fixed (50);
-	    t->right.p1.x = t->left.p1.x + random_fixed (MAX_DST_WIDTH);
-	    t->right.p1.y = t->top - random_fixed (50);
-	    t->right.p2.x = t->left.p2.x + random_fixed (MAX_DST_WIDTH);
-	    t->right.p2.y = t->bottom - random_fixed (50);
-	}
-    }
-    
-    if (lcg_rand_n (8) == 0)
-    {
-	pixman_box16_t clip_boxes[2];
-	int            n = lcg_rand_n (2) + 1;
-	for (i = 0; i < n; i++)
-	{
-	    clip_boxes[i].x1 = lcg_rand_n (dst_width);
-	    clip_boxes[i].y1 = lcg_rand_n (dst_height);
-	    clip_boxes[i].x2 =
-		clip_boxes[i].x1 + lcg_rand_n (dst_width - clip_boxes[i].x1);
-	    clip_boxes[i].y2 =
-		clip_boxes[i].y1 + lcg_rand_n (dst_height - clip_boxes[i].y1);
-
-	    if (verbose)
-	    {
-		printf ("destination clip box: [%d,%d-%d,%d]\n",
-		        clip_boxes[i].x1, clip_boxes[i].y1,
-		        clip_boxes[i].x2, clip_boxes[i].y2);
-	    }
-	}
-	pixman_region_init_rects (&clip, clip_boxes, n);
-	pixman_image_set_clip_region (dst_img, &clip);
-	pixman_region_fini (&clip);
-    }
-
-    pixman_composite_trapezoids (op, src_img, dst_img, mask_format,
-				 src_x, src_y, dst_x, dst_y, n_traps, traps);
-
-    if (dst_format == PIXMAN_x8r8g8b8)
-    {
-	/* ignore unused part */
-	for (i = 0; i < dst_stride * dst_height / 4; i++)
-	    dst_bits[i] &= 0xFFFFFF;
-    }
-
-    image_endian_swap (dst_img);
-
-    if (verbose)
-    {
-	int j;
-	
-	for (i = 0; i < dst_height; i++)
-	{
-	    for (j = 0; j < dst_stride; j++)
-		printf ("%02X ", *((uint8_t *)dst_bits + i * dst_stride + j));
-
-	    printf ("\n");
-	}
-    }
-
-    crc32 = compute_crc32 (0, dst_bits, dst_stride * dst_height);
-
-    fence_free (dst_bits);
-    
-    pixman_image_unref (src_img);
-    pixman_image_unref (dst_img);
-    fence_free (traps);
-
-    FLOAT_REGS_CORRUPTION_DETECTOR_FINISH ();
-    return crc32;
-}
-
-int
-main (int argc, const char *argv[])
-{
-    return fuzzer_test_main("composite traps", 40000, 0xE3112106,
-			    test_composite, argc, argv);
-}
+/* Based loosely on scaling-test */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+#define MAX_SRC_WIDTH  48
+#define MAX_SRC_HEIGHT 48
+#define MAX_DST_WIDTH  48
+#define MAX_DST_HEIGHT 48
+#define MAX_STRIDE     4
+
+static pixman_format_code_t formats[] =
+{
+    PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_r5g6b5, PIXMAN_a1, PIXMAN_a4
+};
+
+static pixman_format_code_t mask_formats[] =
+{
+    PIXMAN_a1, PIXMAN_a4, PIXMAN_a8,
+};
+
+static pixman_op_t operators[] =
+{
+    PIXMAN_OP_OVER, PIXMAN_OP_ADD, PIXMAN_OP_SRC, PIXMAN_OP_IN
+};
+
+#define RANDOM_ELT(array)						\
+    ((array)[lcg_rand_n(ARRAY_LENGTH((array)))])
+
+static void
+destroy_bits (pixman_image_t *image, void *data)
+{
+    fence_free (data);
+}
+
+static pixman_fixed_t
+random_fixed (int n)
+{
+    return lcg_rand_N (n << 16);
+}
+
+/*
+ * Composite operation with pseudorandom images
+ */
+uint32_t
+test_composite (int      testnum,
+		int      verbose)
+{
+    int                i;
+    pixman_image_t *   src_img;
+    pixman_image_t *   dst_img;
+    pixman_region16_t  clip;
+    int                dst_width, dst_height;
+    int                dst_stride;
+    int                dst_x, dst_y;
+    int                dst_bpp;
+    pixman_op_t        op;
+    uint32_t *         dst_bits;
+    uint32_t           crc32;
+    pixman_format_code_t mask_format, dst_format;
+    pixman_trapezoid_t *traps;
+    int src_x, src_y;
+    int n_traps;
+
+    static pixman_color_t colors[] =
+    {
+	{ 0xffff, 0xffff, 0xffff, 0xffff },
+	{ 0x0000, 0x0000, 0x0000, 0x0000 },
+	{ 0xabcd, 0xabcd, 0x0000, 0xabcd },
+	{ 0x0000, 0x0000, 0x0000, 0xffff },
+	{ 0x0101, 0x0101, 0x0101, 0x0101 },
+	{ 0x7777, 0x6666, 0x5555, 0x9999 },
+    };
+    
+    FLOAT_REGS_CORRUPTION_DETECTOR_START ();
+
+    lcg_srand (testnum);
+
+    op = RANDOM_ELT (operators);
+    mask_format = RANDOM_ELT (mask_formats);
+
+    /* Create source image */
+    
+    if (lcg_rand_n (4) == 0)
+    {
+	src_img = pixman_image_create_solid_fill (
+	    &(colors[lcg_rand_n (ARRAY_LENGTH (colors))]));
+
+	src_x = 10;
+	src_y = 234;
+    }
+    else
+    {
+	pixman_format_code_t src_format = RANDOM_ELT(formats);
+	int src_bpp = (PIXMAN_FORMAT_BPP (src_format) + 7) / 8;
+	int src_width = lcg_rand_n (MAX_SRC_WIDTH) + 1;
+	int src_height = lcg_rand_n (MAX_SRC_HEIGHT) + 1;
+	int src_stride = src_width * src_bpp + lcg_rand_n (MAX_STRIDE) * src_bpp;
+	uint32_t *bits;
+
+	src_x = -(src_width / 4) + lcg_rand_n (src_width * 3 / 2);
+	src_y = -(src_height / 4) + lcg_rand_n (src_height * 3 / 2);
+
+	src_stride = (src_stride + 3) & ~3;
+	
+	bits = (uint32_t *)make_random_bytes (src_stride * src_height);
+
+	src_img = pixman_image_create_bits (
+	    src_format, src_width, src_height, bits, src_stride);
+
+	pixman_image_set_destroy_function (src_img, destroy_bits, bits);
+
+	if (lcg_rand_n (8) == 0)
+	{
+	    pixman_box16_t clip_boxes[2];
+	    int            n = lcg_rand_n (2) + 1;
+	    
+	    for (i = 0; i < n; i++)
+	    {
+		clip_boxes[i].x1 = lcg_rand_n (src_width);
+		clip_boxes[i].y1 = lcg_rand_n (src_height);
+		clip_boxes[i].x2 =
+		    clip_boxes[i].x1 + lcg_rand_n (src_width - clip_boxes[i].x1);
+		clip_boxes[i].y2 =
+		    clip_boxes[i].y1 + lcg_rand_n (src_height - clip_boxes[i].y1);
+		
+		if (verbose)
+		{
+		    printf ("source clip box: [%d,%d-%d,%d]\n",
+			    clip_boxes[i].x1, clip_boxes[i].y1,
+			    clip_boxes[i].x2, clip_boxes[i].y2);
+		}
+	    }
+	    
+	    pixman_region_init_rects (&clip, clip_boxes, n);
+	    pixman_image_set_clip_region (src_img, &clip);
+	    pixman_image_set_source_clipping (src_img, 1);
+	    pixman_region_fini (&clip);
+	}
+
+	image_endian_swap (src_img);
+    }
+
+    /* Create destination image */
+    {
+	dst_format = RANDOM_ELT(formats);
+	dst_bpp = (PIXMAN_FORMAT_BPP (dst_format) + 7) / 8;
+	dst_width = lcg_rand_n (MAX_DST_WIDTH) + 1;
+	dst_height = lcg_rand_n (MAX_DST_HEIGHT) + 1;
+	dst_stride = dst_width * dst_bpp + lcg_rand_n (MAX_STRIDE) * dst_bpp;
+	dst_stride = (dst_stride + 3) & ~3;
+	
+	dst_bits = (uint32_t *)make_random_bytes (dst_stride * dst_height);
+
+	dst_x = -(dst_width / 4) + lcg_rand_n (dst_width * 3 / 2);
+	dst_y = -(dst_height / 4) + lcg_rand_n (dst_height * 3 / 2);
+	
+	dst_img = pixman_image_create_bits (
+	    dst_format, dst_width, dst_height, dst_bits, dst_stride);
+
+	image_endian_swap (dst_img);
+    }
+
+    /* Create traps */
+    {
+	int i;
+
+	n_traps = lcg_rand_n (25);
+	traps = fence_malloc (n_traps * sizeof (pixman_trapezoid_t));
+
+	for (i = 0; i < n_traps; ++i)
+	{
+	    pixman_trapezoid_t *t = &(traps[i]);
+	    
+	    t->top = random_fixed (MAX_DST_HEIGHT) - MAX_DST_HEIGHT / 2;
+	    t->bottom = t->top + random_fixed (MAX_DST_HEIGHT);
+	    t->left.p1.x = random_fixed (MAX_DST_WIDTH) - MAX_DST_WIDTH / 2;
+	    t->left.p1.y = t->top - random_fixed (50);
+	    t->left.p2.x = random_fixed (MAX_DST_WIDTH) - MAX_DST_WIDTH / 2;
+	    t->left.p2.y = t->bottom + random_fixed (50);
+	    t->right.p1.x = t->left.p1.x + random_fixed (MAX_DST_WIDTH);
+	    t->right.p1.y = t->top - random_fixed (50);
+	    t->right.p2.x = t->left.p2.x + random_fixed (MAX_DST_WIDTH);
+	    t->right.p2.y = t->bottom - random_fixed (50);
+	}
+    }
+    
+    if (lcg_rand_n (8) == 0)
+    {
+	pixman_box16_t clip_boxes[2];
+	int            n = lcg_rand_n (2) + 1;
+	for (i = 0; i < n; i++)
+	{
+	    clip_boxes[i].x1 = lcg_rand_n (dst_width);
+	    clip_boxes[i].y1 = lcg_rand_n (dst_height);
+	    clip_boxes[i].x2 =
+		clip_boxes[i].x1 + lcg_rand_n (dst_width - clip_boxes[i].x1);
+	    clip_boxes[i].y2 =
+		clip_boxes[i].y1 + lcg_rand_n (dst_height - clip_boxes[i].y1);
+
+	    if (verbose)
+	    {
+		printf ("destination clip box: [%d,%d-%d,%d]\n",
+		        clip_boxes[i].x1, clip_boxes[i].y1,
+		        clip_boxes[i].x2, clip_boxes[i].y2);
+	    }
+	}
+	pixman_region_init_rects (&clip, clip_boxes, n);
+	pixman_image_set_clip_region (dst_img, &clip);
+	pixman_region_fini (&clip);
+    }
+
+    pixman_composite_trapezoids (op, src_img, dst_img, mask_format,
+				 src_x, src_y, dst_x, dst_y, n_traps, traps);
+
+    if (dst_format == PIXMAN_x8r8g8b8)
+    {
+	/* ignore unused part */
+	for (i = 0; i < dst_stride * dst_height / 4; i++)
+	    dst_bits[i] &= 0xFFFFFF;
+    }
+
+    image_endian_swap (dst_img);
+
+    if (verbose)
+    {
+	int j;
+	
+	for (i = 0; i < dst_height; i++)
+	{
+	    for (j = 0; j < dst_stride; j++)
+		printf ("%02X ", *((uint8_t *)dst_bits + i * dst_stride + j));
+
+	    printf ("\n");
+	}
+    }
+
+    crc32 = compute_crc32 (0, dst_bits, dst_stride * dst_height);
+
+    fence_free (dst_bits);
+    
+    pixman_image_unref (src_img);
+    pixman_image_unref (dst_img);
+    fence_free (traps);
+
+    FLOAT_REGS_CORRUPTION_DETECTOR_FINISH ();
+    return crc32;
+}
+
+int
+main (int argc, const char *argv[])
+{
+    return fuzzer_test_main("composite traps", 40000, 0xE3112106,
+			    test_composite, argc, argv);
+}
diff --git a/pixman/test/composite.c b/pixman/test/composite.c
index fccf18b53..edea9a96b 100644
--- a/pixman/test/composite.c
+++ b/pixman/test/composite.c
@@ -1,921 +1,921 @@
-/*
- * Copyright © 2005 Eric Anholt
- * Copyright © 2009 Chris Wilson
- * Copyright © 2010 Soeren Sandmann
- * Copyright © 2010 Red Hat, Inc.
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Eric Anholt not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission.  Eric Anholt makes no
- * representations about the suitability of this software for any purpose.  It
- * is provided "as is" without express or implied warranty.
- *
- * ERIC ANHOLT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
- * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
- * EVENT SHALL ERIC ANHOLT BE LIABLE FOR ANY SPECIAL, INDIRECT OR
- * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
- * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
- * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
- * PERFORMANCE OF THIS SOFTWARE.
- */
-#define PIXMAN_USE_INTERNAL_API
-#include <pixman.h>
-#include <stdio.h>
-#include <stdlib.h> /* abort() */
-#include <math.h>
-#include <config.h>
-#include <time.h>
-#include "utils.h"
-
-typedef struct color_t color_t;
-typedef struct format_t format_t;
-typedef struct image_t image_t;
-typedef struct operator_t operator_t;
-
-struct color_t
-{
-    double r, g, b, a;
-};
-
-struct format_t
-{
-    pixman_format_code_t format;
-    const char *name;
-};
-
-static const color_t colors[] =
-{
-    { 1.0, 1.0, 1.0, 1.0 },
-    { 1.0, 1.0, 1.0, 0.0 },
-    { 0.0, 0.0, 0.0, 1.0 },
-    { 0.0, 0.0, 0.0, 0.0 },
-    { 1.0, 0.0, 0.0, 1.0 },
-    { 0.0, 1.0, 0.0, 1.0 },
-    { 0.0, 0.0, 1.0, 1.0 },
-    { 0.5, 0.0, 0.0, 0.5 },
-};
-
-static uint16_t
-_color_double_to_short (double d)
-{
-    uint32_t i;
-
-    i = (uint32_t) (d * 65536);
-    i -= (i >> 16);
-
-    return i;
-}
-
-static void
-compute_pixman_color (const color_t *color,
-		      pixman_color_t *out)
-{
-    out->red   = _color_double_to_short (color->r);
-    out->green = _color_double_to_short (color->g);
-    out->blue  = _color_double_to_short (color->b);
-    out->alpha = _color_double_to_short (color->a);
-}
-
-#define REPEAT 0x01000000
-#define FLAGS  0xff000000
-
-static const int sizes[] =
-{
-    0,
-    1,
-    1 | REPEAT,
-    10
-};
-
-static const format_t formats[] =
-{
-#define P(x) { PIXMAN_##x, #x }
-
-    /* 32 bpp formats */
-    P(a8r8g8b8),
-    P(x8r8g8b8),
-    P(a8b8g8r8),
-    P(x8b8g8r8),
-    P(b8g8r8a8),
-    P(b8g8r8x8),
-    P(r8g8b8a8),
-    P(r8g8b8x8),
-    P(x2r10g10b10),
-    P(x2b10g10r10),
-    P(a2r10g10b10),
-    P(a2b10g10r10),
-
-    /* 24 bpp formats */
-    P(r8g8b8),
-    P(b8g8r8),
-    P(r5g6b5),
-    P(b5g6r5),
-
-    /* 16 bpp formats */
-    P(x1r5g5b5),
-    P(x1b5g5r5),
-    P(a1r5g5b5),
-    P(a1b5g5r5),
-    P(a4b4g4r4),
-    P(x4b4g4r4),
-    P(a4r4g4b4),
-    P(x4r4g4b4),
-
-    /* 8 bpp formats */
-    P(a8),
-    P(r3g3b2),
-    P(b2g3r3),
-    P(a2r2g2b2),
-    P(a2b2g2r2),
-    P(x4a4),
-
-    /* 4 bpp formats */
-    P(a4),
-    P(r1g2b1),
-    P(b1g2r1),
-    P(a1r1g1b1),
-    P(a1b1g1r1),
-
-    /* 1 bpp formats */
-    P(a1)
-#undef P
-};
-
-struct image_t
-{
-    pixman_image_t *image;
-    const format_t *format;
-    const color_t *color;
-    pixman_repeat_t repeat;
-    int size;
-};
-
-struct operator_t
-{
-    pixman_op_t op;
-    const char *name;
-};
-
-static const operator_t operators[] =
-{
-#define P(x) { PIXMAN_OP_##x, #x }
-    P(CLEAR),
-    P(SRC),
-    P(DST),
-    P(OVER),
-    P(OVER_REVERSE),
-    P(IN),
-    P(IN_REVERSE),
-    P(OUT),
-    P(OUT_REVERSE),
-    P(ATOP),
-    P(ATOP_REVERSE),
-    P(XOR),
-    P(ADD),
-    P(SATURATE),
-
-    P(DISJOINT_CLEAR),
-    P(DISJOINT_SRC),
-    P(DISJOINT_DST),
-    P(DISJOINT_OVER),
-    P(DISJOINT_OVER_REVERSE),
-    P(DISJOINT_IN),
-    P(DISJOINT_IN_REVERSE),
-    P(DISJOINT_OUT),
-    P(DISJOINT_OUT_REVERSE),
-    P(DISJOINT_ATOP),
-    P(DISJOINT_ATOP_REVERSE),
-    P(DISJOINT_XOR),
-
-    P(CONJOINT_CLEAR),
-    P(CONJOINT_SRC),
-    P(CONJOINT_DST),
-    P(CONJOINT_OVER),
-    P(CONJOINT_OVER_REVERSE),
-    P(CONJOINT_IN),
-    P(CONJOINT_IN_REVERSE),
-    P(CONJOINT_OUT),
-    P(CONJOINT_OUT_REVERSE),
-    P(CONJOINT_ATOP),
-    P(CONJOINT_ATOP_REVERSE),
-    P(CONJOINT_XOR),
-#undef P
-};
-
-static double
-calc_op (pixman_op_t op, double src, double dst, double srca, double dsta)
-{
-#define mult_chan(src, dst, Fa, Fb) MIN ((src) * (Fa) + (dst) * (Fb), 1.0)
-
-    double Fa, Fb;
-
-    switch (op)
-    {
-    case PIXMAN_OP_CLEAR:
-    case PIXMAN_OP_DISJOINT_CLEAR:
-    case PIXMAN_OP_CONJOINT_CLEAR:
-	return mult_chan (src, dst, 0.0, 0.0);
-
-    case PIXMAN_OP_SRC:
-    case PIXMAN_OP_DISJOINT_SRC:
-    case PIXMAN_OP_CONJOINT_SRC:
-	return mult_chan (src, dst, 1.0, 0.0);
-
-    case PIXMAN_OP_DST:
-    case PIXMAN_OP_DISJOINT_DST:
-    case PIXMAN_OP_CONJOINT_DST:
-	return mult_chan (src, dst, 0.0, 1.0);
-
-    case PIXMAN_OP_OVER:
-	return mult_chan (src, dst, 1.0, 1.0 - srca);
-
-    case PIXMAN_OP_OVER_REVERSE:
-	return mult_chan (src, dst, 1.0 - dsta, 1.0);
-
-    case PIXMAN_OP_IN:
-	return mult_chan (src, dst, dsta, 0.0);
-
-    case PIXMAN_OP_IN_REVERSE:
-	return mult_chan (src, dst, 0.0, srca);
-
-    case PIXMAN_OP_OUT:
-	return mult_chan (src, dst, 1.0 - dsta, 0.0);
-
-    case PIXMAN_OP_OUT_REVERSE:
-	return mult_chan (src, dst, 0.0, 1.0 - srca);
-
-    case PIXMAN_OP_ATOP:
-	return mult_chan (src, dst, dsta, 1.0 - srca);
-
-    case PIXMAN_OP_ATOP_REVERSE:
-	return mult_chan (src, dst, 1.0 - dsta,  srca);
-
-    case PIXMAN_OP_XOR:
-	return mult_chan (src, dst, 1.0 - dsta, 1.0 - srca);
-
-    case PIXMAN_OP_ADD:
-	return mult_chan (src, dst, 1.0, 1.0);
-
-    case PIXMAN_OP_SATURATE:
-    case PIXMAN_OP_DISJOINT_OVER_REVERSE:
-	if (srca == 0.0)
-	    Fa = 1.0;
-	else
-	    Fa = MIN (1.0, (1.0 - dsta) / srca);
-	return mult_chan (src, dst, Fa, 1.0);
-
-    case PIXMAN_OP_DISJOINT_OVER:
-	if (dsta == 0.0)
-	    Fb = 1.0;
-	else
-	    Fb = MIN (1.0, (1.0 - srca) / dsta);
-	return mult_chan (src, dst, 1.0, Fb);
-
-    case PIXMAN_OP_DISJOINT_IN:
-	if (srca == 0.0)
-	    Fa = 0.0;
-	else
-	    Fa = MAX (0.0, 1.0 - (1.0 - dsta) / srca);
-	return mult_chan (src, dst, Fa, 0.0);
-
-    case PIXMAN_OP_DISJOINT_IN_REVERSE:
-	if (dsta == 0.0)
-	    Fb = 0.0;
-	else
-	    Fb = MAX (0.0, 1.0 - (1.0 - srca) / dsta);
-	return mult_chan (src, dst, 0.0, Fb);
-
-    case PIXMAN_OP_DISJOINT_OUT:
-	if (srca == 0.0)
-	    Fa = 1.0;
-	else
-	    Fa = MIN (1.0, (1.0 - dsta) / srca);
-	return mult_chan (src, dst, Fa, 0.0);
-
-    case PIXMAN_OP_DISJOINT_OUT_REVERSE:
-	if (dsta == 0.0)
-	    Fb = 1.0;
-	else
-	    Fb = MIN (1.0, (1.0 - srca) / dsta);
-	return mult_chan (src, dst, 0.0, Fb);
-
-    case PIXMAN_OP_DISJOINT_ATOP:
-	if (srca == 0.0)
-	    Fa = 0.0;
-	else
-	    Fa = MAX (0.0, 1.0 - (1.0 - dsta) / srca);
-	if (dsta == 0.0)
-	    Fb = 1.0;
-	else
-	    Fb = MIN (1.0, (1.0 - srca) / dsta);
-	return mult_chan (src, dst, Fa, Fb);
-
-    case PIXMAN_OP_DISJOINT_ATOP_REVERSE:
-	if (srca == 0.0)
-	    Fa = 1.0;
-	else
-	    Fa = MIN (1.0, (1.0 - dsta) / srca);
-	if (dsta == 0.0)
-	    Fb = 0.0;
-	else
-	    Fb = MAX (0.0, 1.0 - (1.0 - srca) / dsta);
-	return mult_chan (src, dst, Fa, Fb);
-
-    case PIXMAN_OP_DISJOINT_XOR:
-	if (srca == 0.0)
-	    Fa = 1.0;
-	else
-	    Fa = MIN (1.0, (1.0 - dsta) / srca);
-	if (dsta == 0.0)
-	    Fb = 1.0;
-	else
-	    Fb = MIN (1.0, (1.0 - srca) / dsta);
-	return mult_chan (src, dst, Fa, Fb);
-
-    case PIXMAN_OP_CONJOINT_OVER:
-	if (dsta == 0.0)
-	    Fb = 0.0;
-	else
-	    Fb = MAX (0.0, 1.0 - srca / dsta);
-	return mult_chan (src, dst, 1.0, Fb);
-
-    case PIXMAN_OP_CONJOINT_OVER_REVERSE:
-	if (srca == 0.0)
-	    Fa = 0.0;
-	else
-	    Fa = MAX (0.0, 1.0 - dsta / srca);
-	return mult_chan (src, dst, Fa, 1.0);
-
-    case PIXMAN_OP_CONJOINT_IN:
-	if (srca == 0.0)
-	    Fa = 1.0;
-	else
-	    Fa = MIN (1.0, dsta / srca);
-	return mult_chan (src, dst, Fa, 0.0);
-
-    case PIXMAN_OP_CONJOINT_IN_REVERSE:
-	if (dsta == 0.0)
-	    Fb = 1.0;
-	else
-	    Fb = MIN (1.0, srca / dsta);
-	return mult_chan (src, dst, 0.0, Fb);
-
-    case PIXMAN_OP_CONJOINT_OUT:
-	if (srca == 0.0)
-	    Fa = 0.0;
-	else
-	    Fa = MAX (0.0, 1.0 - dsta / srca);
-	return mult_chan (src, dst, Fa, 0.0);
-
-    case PIXMAN_OP_CONJOINT_OUT_REVERSE:
-	if (dsta == 0.0)
-	    Fb = 0.0;
-	else
-	    Fb = MAX (0.0, 1.0 - srca / dsta);
-	return mult_chan (src, dst, 0.0, Fb);
-
-    case PIXMAN_OP_CONJOINT_ATOP:
-	if (srca == 0.0)
-	    Fa = 1.0;
-	else
-	    Fa = MIN (1.0, dsta / srca);
-	if (dsta == 0.0)
-	    Fb = 0.0;
-	else
-	    Fb = MAX (0.0, 1.0 - srca / dsta);
-	return mult_chan (src, dst, Fa, Fb);
-
-    case PIXMAN_OP_CONJOINT_ATOP_REVERSE:
-	if (srca == 0.0)
-	    Fa = 0.0;
-	else
-	    Fa = MAX (0.0, 1.0 - dsta / srca);
-	if (dsta == 0.0)
-	    Fb = 1.0;
-	else
-	    Fb = MIN (1.0, srca / dsta);
-	return mult_chan (src, dst, Fa, Fb);
-
-    case PIXMAN_OP_CONJOINT_XOR:
-	if (srca == 0.0)
-	    Fa = 0.0;
-	else
-	    Fa = MAX (0.0, 1.0 - dsta / srca);
-	if (dsta == 0.0)
-	    Fb = 0.0;
-	else
-	    Fb = MAX (0.0, 1.0 - srca / dsta);
-	return mult_chan (src, dst, Fa, Fb);
-
-    case PIXMAN_OP_MULTIPLY:
-    case PIXMAN_OP_SCREEN:
-    case PIXMAN_OP_OVERLAY:
-    case PIXMAN_OP_DARKEN:
-    case PIXMAN_OP_LIGHTEN:
-    case PIXMAN_OP_COLOR_DODGE:
-    case PIXMAN_OP_COLOR_BURN:
-    case PIXMAN_OP_HARD_LIGHT:
-    case PIXMAN_OP_SOFT_LIGHT:
-    case PIXMAN_OP_DIFFERENCE:
-    case PIXMAN_OP_EXCLUSION:
-    case PIXMAN_OP_HSL_HUE:
-    case PIXMAN_OP_HSL_SATURATION:
-    case PIXMAN_OP_HSL_COLOR:
-    case PIXMAN_OP_HSL_LUMINOSITY:
-    default:
-	abort();
-	return 0; /* silence MSVC */
-    }
-#undef mult_chan
-}
-
-static void
-do_composite (pixman_op_t op,
-	      const color_t *src,
-	      const color_t *mask,
-	      const color_t *dst,
-	      color_t *result,
-	      pixman_bool_t component_alpha)
-{
-    color_t srcval, srcalpha;
-
-    if (mask == NULL)
-    {
-	srcval = *src;
-
-	srcalpha.r = src->a;
-	srcalpha.g = src->a;
-	srcalpha.b = src->a;
-	srcalpha.a = src->a;
-    }
-    else if (component_alpha)
-    {
-	srcval.r = src->r * mask->r;
-	srcval.g = src->g * mask->g;
-	srcval.b = src->b * mask->b;
-	srcval.a = src->a * mask->a;
-
-	srcalpha.r = src->a * mask->r;
-	srcalpha.g = src->a * mask->g;
-	srcalpha.b = src->a * mask->b;
-	srcalpha.a = src->a * mask->a;
-    }
-    else
-    {
-	srcval.r = src->r * mask->a;
-	srcval.g = src->g * mask->a;
-	srcval.b = src->b * mask->a;
-	srcval.a = src->a * mask->a;
-
-	srcalpha.r = src->a * mask->a;
-	srcalpha.g = src->a * mask->a;
-	srcalpha.b = src->a * mask->a;
-	srcalpha.a = src->a * mask->a;
-    }
-
-    result->r = calc_op (op, srcval.r, dst->r, srcalpha.r, dst->a);
-    result->g = calc_op (op, srcval.g, dst->g, srcalpha.g, dst->a);
-    result->b = calc_op (op, srcval.b, dst->b, srcalpha.b, dst->a);
-    result->a = calc_op (op, srcval.a, dst->a, srcalpha.a, dst->a);
-}
-
-static void
-color_correct (pixman_format_code_t format,
-	       color_t *color)
-{
-#define MASK(x) ((1 << (x)) - 1)
-#define round_pix(pix, m)						\
-    ((int)((pix) * (MASK(m)) + .5) / (double) (MASK(m)))
-
-    if (PIXMAN_FORMAT_R (format) == 0)
-    {
-	color->r = 0.0;
-	color->g = 0.0;
-	color->b = 0.0;
-    }
-    else
-    {
-	color->r = round_pix (color->r, PIXMAN_FORMAT_R (format));
-	color->g = round_pix (color->g, PIXMAN_FORMAT_G (format));
-	color->b = round_pix (color->b, PIXMAN_FORMAT_B (format));
-    }
-
-    if (PIXMAN_FORMAT_A (format) == 0)
-	color->a = 1.0;
-    else
-	color->a = round_pix (color->a, PIXMAN_FORMAT_A (format));
-
-#undef round_pix
-#undef MASK
-}
-
-static void
-get_pixel (pixman_image_t *image,
-	   pixman_format_code_t format,
-	   color_t *color)
-{
-#define MASK(N) ((1UL << (N))-1)
-
-    unsigned long rs, gs, bs, as;
-    int a, r, g, b;
-    unsigned long val;
-
-    val = *(unsigned long *) pixman_image_get_data (image);
-#ifdef WORDS_BIGENDIAN
-    val >>= 8 * sizeof(val) - PIXMAN_FORMAT_BPP (format);
-#endif
-
-    /* Number of bits in each channel */
-    a = PIXMAN_FORMAT_A (format);
-    r = PIXMAN_FORMAT_R (format);
-    g = PIXMAN_FORMAT_G (format);
-    b = PIXMAN_FORMAT_B (format);
-
-    switch (PIXMAN_FORMAT_TYPE (format))
-    {
-    case PIXMAN_TYPE_ARGB:
-        bs = 0;
-        gs = b + bs;
-        rs = g + gs;
-        as = r + rs;
-	break;
-
-    case PIXMAN_TYPE_ABGR:
-        rs = 0;
-        gs = r + rs;
-        bs = g + gs;
-        as = b + bs;
-	break;
-
-    case PIXMAN_TYPE_BGRA:
-        as = 0;
-	rs = PIXMAN_FORMAT_BPP (format) - (b + g + r);
-        gs = r + rs;
-        bs = g + gs;
-	break;
-
-    case PIXMAN_TYPE_RGBA:
-	as = 0;
-	bs = PIXMAN_FORMAT_BPP (format) - (b + g + r);
-	gs = b + bs;
-	rs = g + gs;
-	break;
-
-    case PIXMAN_TYPE_A:
-        as = 0;
-        rs = 0;
-        gs = 0;
-        bs = 0;
-	break;
-
-    case PIXMAN_TYPE_OTHER:
-    case PIXMAN_TYPE_COLOR:
-    case PIXMAN_TYPE_GRAY:
-    case PIXMAN_TYPE_YUY2:
-    case PIXMAN_TYPE_YV12:
-    default:
-	abort ();
-        as = 0;
-        rs = 0;
-        gs = 0;
-        bs = 0;
-	break;
-    }
-
-    if (MASK (a) != 0)
-	color->a = ((val >> as) & MASK (a)) / (double) MASK (a);
-    else
-	color->a = 1.0;
-
-    if (MASK (r) != 0)
-    {
-	color->r = ((val >> rs) & MASK (r)) / (double) MASK (r);
-	color->g = ((val >> gs) & MASK (g)) / (double) MASK (g);
-	color->b = ((val >> bs) & MASK (b)) / (double) MASK (b);
-    }
-    else
-    {
-	color->r = 0.0;
-	color->g = 0.0;
-	color->b = 0.0;
-    }
-
-#undef MASK
-}
-
-static double
-eval_diff (color_t *expected, color_t *test, pixman_format_code_t format)
-{
-    double rscale, gscale, bscale, ascale;
-    double rdiff, gdiff, bdiff, adiff;
-
-    rscale = 1.0 * ((1 << PIXMAN_FORMAT_R (format)) - 1);
-    gscale = 1.0 * ((1 << PIXMAN_FORMAT_G (format)) - 1);
-    bscale = 1.0 * ((1 << PIXMAN_FORMAT_B (format)) - 1);
-    ascale = 1.0 * ((1 << PIXMAN_FORMAT_A (format)) - 1);
-
-    rdiff = fabs (test->r - expected->r) * rscale;
-    bdiff = fabs (test->g - expected->g) * gscale;
-    gdiff = fabs (test->b - expected->b) * bscale;
-    adiff = fabs (test->a - expected->a) * ascale;
-
-    return MAX (MAX (MAX (rdiff, gdiff), bdiff), adiff);
-}
-
-static char *
-describe_image (image_t *info, char *buf)
-{
-    if (info->size)
-    {
-	sprintf (buf, "%s %dx%d%s",
-		 info->format->name,
-		 info->size, info->size,
-		 info->repeat ? "R" :"");
-    }
-    else
-    {
-	sprintf (buf, "solid");
-    }
-
-    return buf;
-}
-
-/* Test a composite of a given operation, source, mask, and destination
- * picture.
- * Fills the window, and samples from the 0,0 pixel corner.
- */
-static pixman_bool_t
-composite_test (image_t *dst,
-		const operator_t *op,
-		image_t *src,
-		image_t *mask,
-		pixman_bool_t component_alpha)
-{
-    pixman_color_t fill;
-    pixman_rectangle16_t rect;
-    color_t expected, result, tdst, tsrc, tmsk;
-    double diff;
-    pixman_bool_t success = TRUE;
-
-    compute_pixman_color (dst->color, &fill);
-    rect.x = rect.y = 0;
-    rect.width = rect.height = dst->size;
-    pixman_image_fill_rectangles (PIXMAN_OP_SRC, dst->image,
-				  &fill, 1, &rect);
-
-    if (mask != NULL)
-    {
-	pixman_image_set_component_alpha (mask->image, component_alpha);
-	pixman_image_composite (op->op, src->image, mask->image, dst->image,
-				0, 0,
-				0, 0,
-				0, 0,
-				dst->size, dst->size);
-
-	tmsk = *mask->color;
-	if (mask->size)
-	{
-	    color_correct (mask->format->format, &tmsk);
-
-	    if (component_alpha &&
-		PIXMAN_FORMAT_R (mask->format->format) == 0)
-	    {
-		/* Ax component-alpha masks expand alpha into
-		 * all color channels.
-		 */
-		tmsk.r = tmsk.g = tmsk.b = tmsk.a;
-	    }
-	}
-    }
-    else
-    {
-	pixman_image_composite (op->op, src->image, NULL, dst->image,
-				0, 0,
-				0, 0,
-				0, 0,
-				dst->size, dst->size);
-    }
-    get_pixel (dst->image, dst->format->format, &result);
-
-    tdst = *dst->color;
-    color_correct (dst->format->format, &tdst);
-    tsrc = *src->color;
-    if (src->size)
-	color_correct (src->format->format, &tsrc);
-    do_composite (op->op, &tsrc, mask ? &tmsk : NULL, &tdst,
-		  &expected, component_alpha);
-    color_correct (dst->format->format, &expected);
-
-    diff = eval_diff (&expected, &result, dst->format->format);
-
-    /* FIXME: We should find out what deviation is acceptable. 3.0
-     * is clearly absurd for 2 bit formats for example. On the other
-     * hand currently 1.0 does not work.
-     */
-    if (diff > 3.0)
-    {
-	char buf[40];
-
-	sprintf (buf, "%s %scomposite",
-		 op->name,
-		 component_alpha ? "CA " : "");
-
-	printf ("%s test error of %.4f --\n"
-		"           R    G    B    A\n"
-		"got:       %.2f %.2f %.2f %.2f [%08lx]\n"
-		"expected:  %.2f %.2f %.2f %.2f\n",
-		buf, diff,
-		result.r, result.g, result.b, result.a,
-		*(unsigned long *) pixman_image_get_data (dst->image),
-		expected.r, expected.g, expected.b, expected.a);
-
-	if (mask != NULL)
-	{
-	    printf ("src color: %.2f %.2f %.2f %.2f\n"
-		    "msk color: %.2f %.2f %.2f %.2f\n"
-		    "dst color: %.2f %.2f %.2f %.2f\n",
-		    src->color->r, src->color->g,
-		    src->color->b, src->color->a,
-		    mask->color->r, mask->color->g,
-		    mask->color->b, mask->color->a,
-		    dst->color->r, dst->color->g,
-		    dst->color->b, dst->color->a);
-	    printf ("src: %s, ", describe_image (src, buf));
-	    printf ("mask: %s, ", describe_image (mask, buf));
-	    printf ("dst: %s\n\n", describe_image (dst, buf));
-	}
-	else
-	{
-	    printf ("src color: %.2f %.2f %.2f %.2f\n"
-		    "dst color: %.2f %.2f %.2f %.2f\n",
-		    src->color->r, src->color->g,
-		    src->color->b, src->color->a,
-		    dst->color->r, dst->color->g,
-		    dst->color->b, dst->color->a);
-	    printf ("src: %s, ", describe_image (src, buf));
-	    printf ("dst: %s\n\n", describe_image (dst, buf));
-	}
-
-	success = FALSE;
-    }
-
-    return success;
-}
-
-static void
-image_init (image_t *info,
-	    int color,
-	    int format,
-	    int size)
-{
-    pixman_color_t fill;
-
-    info->color = &colors[color];
-    compute_pixman_color (info->color, &fill);
-
-    info->format = &formats[format];
-    info->size = sizes[size] & ~FLAGS;
-    info->repeat = PIXMAN_REPEAT_NONE;
-
-    if (info->size)
-    {
-	pixman_rectangle16_t rect;
-
-	info->image = pixman_image_create_bits (info->format->format,
-						info->size, info->size,
-						NULL, 0);
-
-	rect.x = rect.y = 0;
-	rect.width = rect.height = info->size;
-	pixman_image_fill_rectangles (PIXMAN_OP_SRC, info->image, &fill,
-				      1, &rect);
-
-	if (size & REPEAT)
-	{
-	    pixman_image_set_repeat (info->image, PIXMAN_REPEAT_NORMAL);
-	    info->repeat = PIXMAN_REPEAT_NORMAL;
-	}
-    }
-    else
-    {
-	info->image = pixman_image_create_solid_fill (&fill);
-    }
-}
-
-static void
-image_fini (image_t *info)
-{
-    pixman_image_unref (info->image);
-}
-
-static int
-random_size (void)
-{
-    return lcg_rand_n (ARRAY_LENGTH (sizes));
-}
-
-static int
-random_color (void)
-{
-    return lcg_rand_n (ARRAY_LENGTH (colors));
-}
-
-static int
-random_format (void)
-{
-    return lcg_rand_n (ARRAY_LENGTH (formats));
-}
-
-static pixman_bool_t
-run_test (uint32_t seed)
-{
-    image_t src, mask, dst;
-    const operator_t *op;
-    int ca;
-    int ok;
-
-    lcg_srand (seed);
-    
-    image_init (&dst, random_color(), random_format(), 1);
-    image_init (&src, random_color(), random_format(), random_size());
-    image_init (&mask, random_color(), random_format(), random_size());
-
-    op = &(operators [lcg_rand_n (ARRAY_LENGTH (operators))]);
-
-    ca = lcg_rand_n (3);
-
-    switch (ca)
-    {
-    case 0:
-	ok = composite_test (&dst, op, &src, NULL, FALSE);
-	break;
-    case 1:
-	ok = composite_test (&dst, op, &src, &mask, FALSE);
-	break;
-    case 2:
-	ok = composite_test (&dst, op, &src, &mask,
-			     mask.size? TRUE : FALSE);
-	break;
-    default:
-	ok = FALSE;
-	break;
-    }
-
-    image_fini (&src);
-    image_fini (&mask);
-    image_fini (&dst);
-
-    return ok;
-}
-
-int
-main (int argc, char **argv)
-{
-#define N_TESTS (8 * 1024 * 1024)
-    int result = 0;
-    uint32_t i, seed;
-
-    if (argc > 1)
-    {
-	char *end;
-	
-	i = strtol (argv[1], &end, 0);
-
-	if (end != argv[1])
-	{
-	    if (!run_test (i))
-		return 1;
-	    else
-		return 0;
-	}
-	else
-	{
-	    printf ("Usage:\n\n   %s <number>\n\n", argv[0]);
-	    return -1;
-	}
-    }
-
-    if (getenv ("PIXMAN_RANDOMIZE_TESTS"))
-	seed = get_random_seed();
-    else
-	seed = 1;
-    
-#ifdef USE_OPENMP
-#   pragma omp parallel for default(none) shared(result, argv, seed)
-#endif
-    for (i = 0; i <= N_TESTS; ++i)
-    {
-	if (!result && !run_test (i + seed))
-	{
-	    printf ("Test 0x%08X failed.\n", seed + i);
-	    
-	    result = seed + i;
-	}
-    }
-    
-    return result;
-}
+/*
+ * Copyright © 2005 Eric Anholt
+ * Copyright © 2009 Chris Wilson
+ * Copyright © 2010 Soeren Sandmann
+ * Copyright © 2010 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Eric Anholt not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Eric Anholt makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * ERIC ANHOLT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL ERIC ANHOLT BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+#define PIXMAN_USE_INTERNAL_API
+#include <pixman.h>
+#include <stdio.h>
+#include <stdlib.h> /* abort() */
+#include <math.h>
+#include <config.h>
+#include <time.h>
+#include "utils.h"
+
+typedef struct color_t color_t;
+typedef struct format_t format_t;
+typedef struct image_t image_t;
+typedef struct operator_t operator_t;
+
+struct color_t
+{
+    double r, g, b, a;
+};
+
+struct format_t
+{
+    pixman_format_code_t format;
+    const char *name;
+};
+
+static const color_t colors[] =
+{
+    { 1.0, 1.0, 1.0, 1.0 },
+    { 1.0, 1.0, 1.0, 0.0 },
+    { 0.0, 0.0, 0.0, 1.0 },
+    { 0.0, 0.0, 0.0, 0.0 },
+    { 1.0, 0.0, 0.0, 1.0 },
+    { 0.0, 1.0, 0.0, 1.0 },
+    { 0.0, 0.0, 1.0, 1.0 },
+    { 0.5, 0.0, 0.0, 0.5 },
+};
+
+static uint16_t
+_color_double_to_short (double d)
+{
+    uint32_t i;
+
+    i = (uint32_t) (d * 65536);
+    i -= (i >> 16);
+
+    return i;
+}
+
+static void
+compute_pixman_color (const color_t *color,
+		      pixman_color_t *out)
+{
+    out->red   = _color_double_to_short (color->r);
+    out->green = _color_double_to_short (color->g);
+    out->blue  = _color_double_to_short (color->b);
+    out->alpha = _color_double_to_short (color->a);
+}
+
+#define REPEAT 0x01000000
+#define FLAGS  0xff000000
+
+static const int sizes[] =
+{
+    0,
+    1,
+    1 | REPEAT,
+    10
+};
+
+static const format_t formats[] =
+{
+#define P(x) { PIXMAN_##x, #x }
+
+    /* 32 bpp formats */
+    P(a8r8g8b8),
+    P(x8r8g8b8),
+    P(a8b8g8r8),
+    P(x8b8g8r8),
+    P(b8g8r8a8),
+    P(b8g8r8x8),
+    P(r8g8b8a8),
+    P(r8g8b8x8),
+    P(x2r10g10b10),
+    P(x2b10g10r10),
+    P(a2r10g10b10),
+    P(a2b10g10r10),
+
+    /* 24 bpp formats */
+    P(r8g8b8),
+    P(b8g8r8),
+    P(r5g6b5),
+    P(b5g6r5),
+
+    /* 16 bpp formats */
+    P(x1r5g5b5),
+    P(x1b5g5r5),
+    P(a1r5g5b5),
+    P(a1b5g5r5),
+    P(a4b4g4r4),
+    P(x4b4g4r4),
+    P(a4r4g4b4),
+    P(x4r4g4b4),
+
+    /* 8 bpp formats */
+    P(a8),
+    P(r3g3b2),
+    P(b2g3r3),
+    P(a2r2g2b2),
+    P(a2b2g2r2),
+    P(x4a4),
+
+    /* 4 bpp formats */
+    P(a4),
+    P(r1g2b1),
+    P(b1g2r1),
+    P(a1r1g1b1),
+    P(a1b1g1r1),
+
+    /* 1 bpp formats */
+    P(a1)
+#undef P
+};
+
+struct image_t
+{
+    pixman_image_t *image;
+    const format_t *format;
+    const color_t *color;
+    pixman_repeat_t repeat;
+    int size;
+};
+
+struct operator_t
+{
+    pixman_op_t op;
+    const char *name;
+};
+
+static const operator_t operators[] =
+{
+#define P(x) { PIXMAN_OP_##x, #x }
+    P(CLEAR),
+    P(SRC),
+    P(DST),
+    P(OVER),
+    P(OVER_REVERSE),
+    P(IN),
+    P(IN_REVERSE),
+    P(OUT),
+    P(OUT_REVERSE),
+    P(ATOP),
+    P(ATOP_REVERSE),
+    P(XOR),
+    P(ADD),
+    P(SATURATE),
+
+    P(DISJOINT_CLEAR),
+    P(DISJOINT_SRC),
+    P(DISJOINT_DST),
+    P(DISJOINT_OVER),
+    P(DISJOINT_OVER_REVERSE),
+    P(DISJOINT_IN),
+    P(DISJOINT_IN_REVERSE),
+    P(DISJOINT_OUT),
+    P(DISJOINT_OUT_REVERSE),
+    P(DISJOINT_ATOP),
+    P(DISJOINT_ATOP_REVERSE),
+    P(DISJOINT_XOR),
+
+    P(CONJOINT_CLEAR),
+    P(CONJOINT_SRC),
+    P(CONJOINT_DST),
+    P(CONJOINT_OVER),
+    P(CONJOINT_OVER_REVERSE),
+    P(CONJOINT_IN),
+    P(CONJOINT_IN_REVERSE),
+    P(CONJOINT_OUT),
+    P(CONJOINT_OUT_REVERSE),
+    P(CONJOINT_ATOP),
+    P(CONJOINT_ATOP_REVERSE),
+    P(CONJOINT_XOR),
+#undef P
+};
+
+static double
+calc_op (pixman_op_t op, double src, double dst, double srca, double dsta)
+{
+#define mult_chan(src, dst, Fa, Fb) MIN ((src) * (Fa) + (dst) * (Fb), 1.0)
+
+    double Fa, Fb;
+
+    switch (op)
+    {
+    case PIXMAN_OP_CLEAR:
+    case PIXMAN_OP_DISJOINT_CLEAR:
+    case PIXMAN_OP_CONJOINT_CLEAR:
+	return mult_chan (src, dst, 0.0, 0.0);
+
+    case PIXMAN_OP_SRC:
+    case PIXMAN_OP_DISJOINT_SRC:
+    case PIXMAN_OP_CONJOINT_SRC:
+	return mult_chan (src, dst, 1.0, 0.0);
+
+    case PIXMAN_OP_DST:
+    case PIXMAN_OP_DISJOINT_DST:
+    case PIXMAN_OP_CONJOINT_DST:
+	return mult_chan (src, dst, 0.0, 1.0);
+
+    case PIXMAN_OP_OVER:
+	return mult_chan (src, dst, 1.0, 1.0 - srca);
+
+    case PIXMAN_OP_OVER_REVERSE:
+	return mult_chan (src, dst, 1.0 - dsta, 1.0);
+
+    case PIXMAN_OP_IN:
+	return mult_chan (src, dst, dsta, 0.0);
+
+    case PIXMAN_OP_IN_REVERSE:
+	return mult_chan (src, dst, 0.0, srca);
+
+    case PIXMAN_OP_OUT:
+	return mult_chan (src, dst, 1.0 - dsta, 0.0);
+
+    case PIXMAN_OP_OUT_REVERSE:
+	return mult_chan (src, dst, 0.0, 1.0 - srca);
+
+    case PIXMAN_OP_ATOP:
+	return mult_chan (src, dst, dsta, 1.0 - srca);
+
+    case PIXMAN_OP_ATOP_REVERSE:
+	return mult_chan (src, dst, 1.0 - dsta,  srca);
+
+    case PIXMAN_OP_XOR:
+	return mult_chan (src, dst, 1.0 - dsta, 1.0 - srca);
+
+    case PIXMAN_OP_ADD:
+	return mult_chan (src, dst, 1.0, 1.0);
+
+    case PIXMAN_OP_SATURATE:
+    case PIXMAN_OP_DISJOINT_OVER_REVERSE:
+	if (srca == 0.0)
+	    Fa = 1.0;
+	else
+	    Fa = MIN (1.0, (1.0 - dsta) / srca);
+	return mult_chan (src, dst, Fa, 1.0);
+
+    case PIXMAN_OP_DISJOINT_OVER:
+	if (dsta == 0.0)
+	    Fb = 1.0;
+	else
+	    Fb = MIN (1.0, (1.0 - srca) / dsta);
+	return mult_chan (src, dst, 1.0, Fb);
+
+    case PIXMAN_OP_DISJOINT_IN:
+	if (srca == 0.0)
+	    Fa = 0.0;
+	else
+	    Fa = MAX (0.0, 1.0 - (1.0 - dsta) / srca);
+	return mult_chan (src, dst, Fa, 0.0);
+
+    case PIXMAN_OP_DISJOINT_IN_REVERSE:
+	if (dsta == 0.0)
+	    Fb = 0.0;
+	else
+	    Fb = MAX (0.0, 1.0 - (1.0 - srca) / dsta);
+	return mult_chan (src, dst, 0.0, Fb);
+
+    case PIXMAN_OP_DISJOINT_OUT:
+	if (srca == 0.0)
+	    Fa = 1.0;
+	else
+	    Fa = MIN (1.0, (1.0 - dsta) / srca);
+	return mult_chan (src, dst, Fa, 0.0);
+
+    case PIXMAN_OP_DISJOINT_OUT_REVERSE:
+	if (dsta == 0.0)
+	    Fb = 1.0;
+	else
+	    Fb = MIN (1.0, (1.0 - srca) / dsta);
+	return mult_chan (src, dst, 0.0, Fb);
+
+    case PIXMAN_OP_DISJOINT_ATOP:
+	if (srca == 0.0)
+	    Fa = 0.0;
+	else
+	    Fa = MAX (0.0, 1.0 - (1.0 - dsta) / srca);
+	if (dsta == 0.0)
+	    Fb = 1.0;
+	else
+	    Fb = MIN (1.0, (1.0 - srca) / dsta);
+	return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_DISJOINT_ATOP_REVERSE:
+	if (srca == 0.0)
+	    Fa = 1.0;
+	else
+	    Fa = MIN (1.0, (1.0 - dsta) / srca);
+	if (dsta == 0.0)
+	    Fb = 0.0;
+	else
+	    Fb = MAX (0.0, 1.0 - (1.0 - srca) / dsta);
+	return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_DISJOINT_XOR:
+	if (srca == 0.0)
+	    Fa = 1.0;
+	else
+	    Fa = MIN (1.0, (1.0 - dsta) / srca);
+	if (dsta == 0.0)
+	    Fb = 1.0;
+	else
+	    Fb = MIN (1.0, (1.0 - srca) / dsta);
+	return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_CONJOINT_OVER:
+	if (dsta == 0.0)
+	    Fb = 0.0;
+	else
+	    Fb = MAX (0.0, 1.0 - srca / dsta);
+	return mult_chan (src, dst, 1.0, Fb);
+
+    case PIXMAN_OP_CONJOINT_OVER_REVERSE:
+	if (srca == 0.0)
+	    Fa = 0.0;
+	else
+	    Fa = MAX (0.0, 1.0 - dsta / srca);
+	return mult_chan (src, dst, Fa, 1.0);
+
+    case PIXMAN_OP_CONJOINT_IN:
+	if (srca == 0.0)
+	    Fa = 1.0;
+	else
+	    Fa = MIN (1.0, dsta / srca);
+	return mult_chan (src, dst, Fa, 0.0);
+
+    case PIXMAN_OP_CONJOINT_IN_REVERSE:
+	if (dsta == 0.0)
+	    Fb = 1.0;
+	else
+	    Fb = MIN (1.0, srca / dsta);
+	return mult_chan (src, dst, 0.0, Fb);
+
+    case PIXMAN_OP_CONJOINT_OUT:
+	if (srca == 0.0)
+	    Fa = 0.0;
+	else
+	    Fa = MAX (0.0, 1.0 - dsta / srca);
+	return mult_chan (src, dst, Fa, 0.0);
+
+    case PIXMAN_OP_CONJOINT_OUT_REVERSE:
+	if (dsta == 0.0)
+	    Fb = 0.0;
+	else
+	    Fb = MAX (0.0, 1.0 - srca / dsta);
+	return mult_chan (src, dst, 0.0, Fb);
+
+    case PIXMAN_OP_CONJOINT_ATOP:
+	if (srca == 0.0)
+	    Fa = 1.0;
+	else
+	    Fa = MIN (1.0, dsta / srca);
+	if (dsta == 0.0)
+	    Fb = 0.0;
+	else
+	    Fb = MAX (0.0, 1.0 - srca / dsta);
+	return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_CONJOINT_ATOP_REVERSE:
+	if (srca == 0.0)
+	    Fa = 0.0;
+	else
+	    Fa = MAX (0.0, 1.0 - dsta / srca);
+	if (dsta == 0.0)
+	    Fb = 1.0;
+	else
+	    Fb = MIN (1.0, srca / dsta);
+	return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_CONJOINT_XOR:
+	if (srca == 0.0)
+	    Fa = 0.0;
+	else
+	    Fa = MAX (0.0, 1.0 - dsta / srca);
+	if (dsta == 0.0)
+	    Fb = 0.0;
+	else
+	    Fb = MAX (0.0, 1.0 - srca / dsta);
+	return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_MULTIPLY:
+    case PIXMAN_OP_SCREEN:
+    case PIXMAN_OP_OVERLAY:
+    case PIXMAN_OP_DARKEN:
+    case PIXMAN_OP_LIGHTEN:
+    case PIXMAN_OP_COLOR_DODGE:
+    case PIXMAN_OP_COLOR_BURN:
+    case PIXMAN_OP_HARD_LIGHT:
+    case PIXMAN_OP_SOFT_LIGHT:
+    case PIXMAN_OP_DIFFERENCE:
+    case PIXMAN_OP_EXCLUSION:
+    case PIXMAN_OP_HSL_HUE:
+    case PIXMAN_OP_HSL_SATURATION:
+    case PIXMAN_OP_HSL_COLOR:
+    case PIXMAN_OP_HSL_LUMINOSITY:
+    default:
+	abort();
+	return 0; /* silence MSVC */
+    }
+#undef mult_chan
+}
+
+static void
+do_composite (pixman_op_t op,
+	      const color_t *src,
+	      const color_t *mask,
+	      const color_t *dst,
+	      color_t *result,
+	      pixman_bool_t component_alpha)
+{
+    color_t srcval, srcalpha;
+
+    if (mask == NULL)
+    {
+	srcval = *src;
+
+	srcalpha.r = src->a;
+	srcalpha.g = src->a;
+	srcalpha.b = src->a;
+	srcalpha.a = src->a;
+    }
+    else if (component_alpha)
+    {
+	srcval.r = src->r * mask->r;
+	srcval.g = src->g * mask->g;
+	srcval.b = src->b * mask->b;
+	srcval.a = src->a * mask->a;
+
+	srcalpha.r = src->a * mask->r;
+	srcalpha.g = src->a * mask->g;
+	srcalpha.b = src->a * mask->b;
+	srcalpha.a = src->a * mask->a;
+    }
+    else
+    {
+	srcval.r = src->r * mask->a;
+	srcval.g = src->g * mask->a;
+	srcval.b = src->b * mask->a;
+	srcval.a = src->a * mask->a;
+
+	srcalpha.r = src->a * mask->a;
+	srcalpha.g = src->a * mask->a;
+	srcalpha.b = src->a * mask->a;
+	srcalpha.a = src->a * mask->a;
+    }
+
+    result->r = calc_op (op, srcval.r, dst->r, srcalpha.r, dst->a);
+    result->g = calc_op (op, srcval.g, dst->g, srcalpha.g, dst->a);
+    result->b = calc_op (op, srcval.b, dst->b, srcalpha.b, dst->a);
+    result->a = calc_op (op, srcval.a, dst->a, srcalpha.a, dst->a);
+}
+
+static void
+color_correct (pixman_format_code_t format,
+	       color_t *color)
+{
+#define MASK(x) ((1 << (x)) - 1)
+#define round_pix(pix, m)						\
+    ((int)((pix) * (MASK(m)) + .5) / (double) (MASK(m)))
+
+    if (PIXMAN_FORMAT_R (format) == 0)
+    {
+	color->r = 0.0;
+	color->g = 0.0;
+	color->b = 0.0;
+    }
+    else
+    {
+	color->r = round_pix (color->r, PIXMAN_FORMAT_R (format));
+	color->g = round_pix (color->g, PIXMAN_FORMAT_G (format));
+	color->b = round_pix (color->b, PIXMAN_FORMAT_B (format));
+    }
+
+    if (PIXMAN_FORMAT_A (format) == 0)
+	color->a = 1.0;
+    else
+	color->a = round_pix (color->a, PIXMAN_FORMAT_A (format));
+
+#undef round_pix
+#undef MASK
+}
+
+static void
+get_pixel (pixman_image_t *image,
+	   pixman_format_code_t format,
+	   color_t *color)
+{
+#define MASK(N) ((1UL << (N))-1)
+
+    unsigned long rs, gs, bs, as;
+    int a, r, g, b;
+    unsigned long val;
+
+    val = *(unsigned long *) pixman_image_get_data (image);
+#ifdef WORDS_BIGENDIAN
+    val >>= 8 * sizeof(val) - PIXMAN_FORMAT_BPP (format);
+#endif
+
+    /* Number of bits in each channel */
+    a = PIXMAN_FORMAT_A (format);
+    r = PIXMAN_FORMAT_R (format);
+    g = PIXMAN_FORMAT_G (format);
+    b = PIXMAN_FORMAT_B (format);
+
+    switch (PIXMAN_FORMAT_TYPE (format))
+    {
+    case PIXMAN_TYPE_ARGB:
+        bs = 0;
+        gs = b + bs;
+        rs = g + gs;
+        as = r + rs;
+	break;
+
+    case PIXMAN_TYPE_ABGR:
+        rs = 0;
+        gs = r + rs;
+        bs = g + gs;
+        as = b + bs;
+	break;
+
+    case PIXMAN_TYPE_BGRA:
+        as = 0;
+	rs = PIXMAN_FORMAT_BPP (format) - (b + g + r);
+        gs = r + rs;
+        bs = g + gs;
+	break;
+
+    case PIXMAN_TYPE_RGBA:
+	as = 0;
+	bs = PIXMAN_FORMAT_BPP (format) - (b + g + r);
+	gs = b + bs;
+	rs = g + gs;
+	break;
+
+    case PIXMAN_TYPE_A:
+        as = 0;
+        rs = 0;
+        gs = 0;
+        bs = 0;
+	break;
+
+    case PIXMAN_TYPE_OTHER:
+    case PIXMAN_TYPE_COLOR:
+    case PIXMAN_TYPE_GRAY:
+    case PIXMAN_TYPE_YUY2:
+    case PIXMAN_TYPE_YV12:
+    default:
+	abort ();
+        as = 0;
+        rs = 0;
+        gs = 0;
+        bs = 0;
+	break;
+    }
+
+    if (MASK (a) != 0)
+	color->a = ((val >> as) & MASK (a)) / (double) MASK (a);
+    else
+	color->a = 1.0;
+
+    if (MASK (r) != 0)
+    {
+	color->r = ((val >> rs) & MASK (r)) / (double) MASK (r);
+	color->g = ((val >> gs) & MASK (g)) / (double) MASK (g);
+	color->b = ((val >> bs) & MASK (b)) / (double) MASK (b);
+    }
+    else
+    {
+	color->r = 0.0;
+	color->g = 0.0;
+	color->b = 0.0;
+    }
+
+#undef MASK
+}
+
+static double
+eval_diff (color_t *expected, color_t *test, pixman_format_code_t format)
+{
+    double rscale, gscale, bscale, ascale;
+    double rdiff, gdiff, bdiff, adiff;
+
+    rscale = 1.0 * ((1 << PIXMAN_FORMAT_R (format)) - 1);
+    gscale = 1.0 * ((1 << PIXMAN_FORMAT_G (format)) - 1);
+    bscale = 1.0 * ((1 << PIXMAN_FORMAT_B (format)) - 1);
+    ascale = 1.0 * ((1 << PIXMAN_FORMAT_A (format)) - 1);
+
+    rdiff = fabs (test->r - expected->r) * rscale;
+    bdiff = fabs (test->g - expected->g) * gscale;
+    gdiff = fabs (test->b - expected->b) * bscale;
+    adiff = fabs (test->a - expected->a) * ascale;
+
+    return MAX (MAX (MAX (rdiff, gdiff), bdiff), adiff);
+}
+
+static char *
+describe_image (image_t *info, char *buf)
+{
+    if (info->size)
+    {
+	sprintf (buf, "%s %dx%d%s",
+		 info->format->name,
+		 info->size, info->size,
+		 info->repeat ? "R" :"");
+    }
+    else
+    {
+	sprintf (buf, "solid");
+    }
+
+    return buf;
+}
+
+/* Test a composite of a given operation, source, mask, and destination
+ * picture.
+ * Fills the window, and samples from the 0,0 pixel corner.
+ */
+static pixman_bool_t
+composite_test (image_t *dst,
+		const operator_t *op,
+		image_t *src,
+		image_t *mask,
+		pixman_bool_t component_alpha)
+{
+    pixman_color_t fill;
+    pixman_rectangle16_t rect;
+    color_t expected, result, tdst, tsrc, tmsk;
+    double diff;
+    pixman_bool_t success = TRUE;
+
+    compute_pixman_color (dst->color, &fill);
+    rect.x = rect.y = 0;
+    rect.width = rect.height = dst->size;
+    pixman_image_fill_rectangles (PIXMAN_OP_SRC, dst->image,
+				  &fill, 1, &rect);
+
+    if (mask != NULL)
+    {
+	pixman_image_set_component_alpha (mask->image, component_alpha);
+	pixman_image_composite (op->op, src->image, mask->image, dst->image,
+				0, 0,
+				0, 0,
+				0, 0,
+				dst->size, dst->size);
+
+	tmsk = *mask->color;
+	if (mask->size)
+	{
+	    color_correct (mask->format->format, &tmsk);
+
+	    if (component_alpha &&
+		PIXMAN_FORMAT_R (mask->format->format) == 0)
+	    {
+		/* Ax component-alpha masks expand alpha into
+		 * all color channels.
+		 */
+		tmsk.r = tmsk.g = tmsk.b = tmsk.a;
+	    }
+	}
+    }
+    else
+    {
+	pixman_image_composite (op->op, src->image, NULL, dst->image,
+				0, 0,
+				0, 0,
+				0, 0,
+				dst->size, dst->size);
+    }
+    get_pixel (dst->image, dst->format->format, &result);
+
+    tdst = *dst->color;
+    color_correct (dst->format->format, &tdst);
+    tsrc = *src->color;
+    if (src->size)
+	color_correct (src->format->format, &tsrc);
+    do_composite (op->op, &tsrc, mask ? &tmsk : NULL, &tdst,
+		  &expected, component_alpha);
+    color_correct (dst->format->format, &expected);
+
+    diff = eval_diff (&expected, &result, dst->format->format);
+
+    /* FIXME: We should find out what deviation is acceptable. 3.0
+     * is clearly absurd for 2 bit formats for example. On the other
+     * hand currently 1.0 does not work.
+     */
+    if (diff > 3.0)
+    {
+	char buf[40];
+
+	sprintf (buf, "%s %scomposite",
+		 op->name,
+		 component_alpha ? "CA " : "");
+
+	printf ("%s test error of %.4f --\n"
+		"           R    G    B    A\n"
+		"got:       %.2f %.2f %.2f %.2f [%08lx]\n"
+		"expected:  %.2f %.2f %.2f %.2f\n",
+		buf, diff,
+		result.r, result.g, result.b, result.a,
+		*(unsigned long *) pixman_image_get_data (dst->image),
+		expected.r, expected.g, expected.b, expected.a);
+
+	if (mask != NULL)
+	{
+	    printf ("src color: %.2f %.2f %.2f %.2f\n"
+		    "msk color: %.2f %.2f %.2f %.2f\n"
+		    "dst color: %.2f %.2f %.2f %.2f\n",
+		    src->color->r, src->color->g,
+		    src->color->b, src->color->a,
+		    mask->color->r, mask->color->g,
+		    mask->color->b, mask->color->a,
+		    dst->color->r, dst->color->g,
+		    dst->color->b, dst->color->a);
+	    printf ("src: %s, ", describe_image (src, buf));
+	    printf ("mask: %s, ", describe_image (mask, buf));
+	    printf ("dst: %s\n\n", describe_image (dst, buf));
+	}
+	else
+	{
+	    printf ("src color: %.2f %.2f %.2f %.2f\n"
+		    "dst color: %.2f %.2f %.2f %.2f\n",
+		    src->color->r, src->color->g,
+		    src->color->b, src->color->a,
+		    dst->color->r, dst->color->g,
+		    dst->color->b, dst->color->a);
+	    printf ("src: %s, ", describe_image (src, buf));
+	    printf ("dst: %s\n\n", describe_image (dst, buf));
+	}
+
+	success = FALSE;
+    }
+
+    return success;
+}
+
+static void
+image_init (image_t *info,
+	    int color,
+	    int format,
+	    int size)
+{
+    pixman_color_t fill;
+
+    info->color = &colors[color];
+    compute_pixman_color (info->color, &fill);
+
+    info->format = &formats[format];
+    info->size = sizes[size] & ~FLAGS;
+    info->repeat = PIXMAN_REPEAT_NONE;
+
+    if (info->size)
+    {
+	pixman_rectangle16_t rect;
+
+	info->image = pixman_image_create_bits (info->format->format,
+						info->size, info->size,
+						NULL, 0);
+
+	rect.x = rect.y = 0;
+	rect.width = rect.height = info->size;
+	pixman_image_fill_rectangles (PIXMAN_OP_SRC, info->image, &fill,
+				      1, &rect);
+
+	if (size & REPEAT)
+	{
+	    pixman_image_set_repeat (info->image, PIXMAN_REPEAT_NORMAL);
+	    info->repeat = PIXMAN_REPEAT_NORMAL;
+	}
+    }
+    else
+    {
+	info->image = pixman_image_create_solid_fill (&fill);
+    }
+}
+
+static void
+image_fini (image_t *info)
+{
+    pixman_image_unref (info->image);
+}
+
+static int
+random_size (void)
+{
+    return lcg_rand_n (ARRAY_LENGTH (sizes));
+}
+
+static int
+random_color (void)
+{
+    return lcg_rand_n (ARRAY_LENGTH (colors));
+}
+
+static int
+random_format (void)
+{
+    return lcg_rand_n (ARRAY_LENGTH (formats));
+}
+
+static pixman_bool_t
+run_test (uint32_t seed)
+{
+    image_t src, mask, dst;
+    const operator_t *op;
+    int ca;
+    int ok;
+
+    lcg_srand (seed);
+    
+    image_init (&dst, random_color(), random_format(), 1);
+    image_init (&src, random_color(), random_format(), random_size());
+    image_init (&mask, random_color(), random_format(), random_size());
+
+    op = &(operators [lcg_rand_n (ARRAY_LENGTH (operators))]);
+
+    ca = lcg_rand_n (3);
+
+    switch (ca)
+    {
+    case 0:
+	ok = composite_test (&dst, op, &src, NULL, FALSE);
+	break;
+    case 1:
+	ok = composite_test (&dst, op, &src, &mask, FALSE);
+	break;
+    case 2:
+	ok = composite_test (&dst, op, &src, &mask,
+			     mask.size? TRUE : FALSE);
+	break;
+    default:
+	ok = FALSE;
+	break;
+    }
+
+    image_fini (&src);
+    image_fini (&mask);
+    image_fini (&dst);
+
+    return ok;
+}
+
+int
+main (int argc, char **argv)
+{
+#define N_TESTS (8 * 1024 * 1024)
+    int result = 0;
+    uint32_t i, seed;
+
+    if (argc > 1)
+    {
+	char *end;
+	
+	i = strtol (argv[1], &end, 0);
+
+	if (end != argv[1])
+	{
+	    if (!run_test (i))
+		return 1;
+	    else
+		return 0;
+	}
+	else
+	{
+	    printf ("Usage:\n\n   %s <number>\n\n", argv[0]);
+	    return -1;
+	}
+    }
+
+    if (getenv ("PIXMAN_RANDOMIZE_TESTS"))
+	seed = get_random_seed();
+    else
+	seed = 1;
+    
+#ifdef USE_OPENMP
+#   pragma omp parallel for default(none) shared(result, argv, seed)
+#endif
+    for (i = 0; i <= N_TESTS; ++i)
+    {
+	if (!result && !run_test (i + seed))
+	{
+	    printf ("Test 0x%08X failed.\n", seed + i);
+	    
+	    result = seed + i;
+	}
+    }
+    
+    return result;
+}
diff --git a/pixman/test/fetch-test.c b/pixman/test/fetch-test.c
index cbb1ee63e..feb98d9b7 100644
--- a/pixman/test/fetch-test.c
+++ b/pixman/test/fetch-test.c
@@ -1,206 +1,206 @@
-#include <assert.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include "pixman.h"
-#include <config.h>
-
-#define SIZE 1024
-
-static pixman_indexed_t mono_palette =
-{
-    0, { 0x00000000, 0x00ffffff },
-};
-
-
-typedef struct {
-    pixman_format_code_t format;
-    int width, height;
-    int stride;
-    uint32_t src[SIZE];
-    uint32_t dst[SIZE];
-    pixman_indexed_t *indexed;
-} testcase_t;
-
-static testcase_t testcases[] =
-{
-    {
-	PIXMAN_a8r8g8b8,
-	2, 2,
-	8,
-	{ 0x00112233, 0x44556677,
-	  0x8899aabb, 0xccddeeff },
-	{ 0x00112233, 0x44556677,
-	  0x8899aabb, 0xccddeeff },
-	NULL,
-    },
-    {
-	PIXMAN_r8g8b8a8,
-	2, 2,
-	8,
-	{ 0x11223300, 0x55667744,
-	  0x99aabb88, 0xddeeffcc },
-	{ 0x00112233, 0x44556677,
-	  0x8899aabb, 0xccddeeff },
-	NULL,
-    },
-    {
-	PIXMAN_g1,
-	8, 2,
-	4,
-#ifdef WORDS_BIGENDIAN
-	{
-	    0xaa000000,
-	    0x55000000
-	},
-#else
-	{
-	    0x00000055,
-	    0x000000aa
-	},
-#endif
-	{
-	    0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000,
-	    0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff
-	},
-	&mono_palette,
-    },
-#if 0
-    {
-	PIXMAN_g8,
-	4, 2,
-	4,
-	{ 0x01234567,
-	  0x89abcdef },
-	{ 0x00010101, 0x00232323, 0x00454545, 0x00676767,
-	  0x00898989, 0x00ababab, 0x00cdcdcd, 0x00efefef, },
-    },
-#endif
-    /* FIXME: make this work on big endian */
-    {
-	PIXMAN_yv12,
-	8, 2,
-	8,
-#ifdef WORDS_BIGENDIAN
-	{
-	    0x00ff00ff, 0x00ff00ff,
-	    0xff00ff00, 0xff00ff00,
-	    0x80ff8000,
-	    0x800080ff
-	},
-#else
-	{
-	    0xff00ff00, 0xff00ff00,
-	    0x00ff00ff, 0x00ff00ff,
-	    0x0080ff80,
-	    0xff800080
-	},
-#endif
-	{
-	    0xff000000, 0xffffffff, 0xffb80000, 0xffffe113,
-	    0xff000000, 0xffffffff, 0xff0023ee, 0xff4affff,
-	    0xffffffff, 0xff000000, 0xffffe113, 0xffb80000,
-	    0xffffffff, 0xff000000, 0xff4affff, 0xff0023ee,
-	},
-    },
-};
-
-int n_test_cases = sizeof(testcases)/sizeof(testcases[0]);
-
-
-static uint32_t
-reader (const void *src, int size)
-{
-    switch (size)
-    {
-    case 1:
-	return *(uint8_t *)src;
-    case 2:
-	return *(uint16_t *)src;
-    case 4:
-	return *(uint32_t *)src;
-    default:
-	assert(0);
-	return 0; /* silence MSVC */
-    }
-}
-
-
-static void
-writer (void *src, uint32_t value, int size)
-{
-    switch (size)
-    {
-    case 1:
-	*(uint8_t *)src = value;
-	break;
-    case 2:
-	*(uint16_t *)src = value;
-	break;
-    case 4:
-	*(uint32_t *)src = value;
-	break;
-    default:
-	assert(0);
-    }
-}
-
-
-int
-main (int argc, char **argv)
-{
-    uint32_t dst[SIZE];
-    pixman_image_t *src_img;
-    pixman_image_t *dst_img;
-    int i, j, x, y;
-    int ret = 0;
-
-    for (i = 0; i < n_test_cases; ++i)
-    {
-	for (j = 0; j < 2; ++j)
-	{
-	    src_img = pixman_image_create_bits (testcases[i].format,
-						testcases[i].width,
-						testcases[i].height,
-						testcases[i].src,
-						testcases[i].stride);
-	    pixman_image_set_indexed(src_img, testcases[i].indexed);
-
-	    dst_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
-						testcases[i].width,
-						testcases[i].height,
-						dst,
-						testcases[i].width*4);
-
-	    if (j)
-	    {
-		pixman_image_set_accessors (src_img, reader, writer);
-		pixman_image_set_accessors (dst_img, reader, writer);
-	    }
-
-	    pixman_image_composite (PIXMAN_OP_SRC, src_img, NULL, dst_img,
-				    0, 0, 0, 0, 0, 0, testcases[i].width, testcases[i].height);
-
-	    pixman_image_unref (src_img);
-	    pixman_image_unref (dst_img);
-
-	    for (y = 0; y < testcases[i].height; ++y)
-	    {
-		for (x = 0; x < testcases[i].width; ++x)
-		{
-		    int offset = y * testcases[i].width + x;
-
-		    if (dst[offset] != testcases[i].dst[offset])
-		    {
-			printf ("test %i%c: pixel mismatch at (x=%d,y=%d): %08x expected, %08x obtained\n",
-			        i + 1, 'a' + j,
-			        x, y,
-			        testcases[i].dst[offset], dst[offset]);
-			ret = 1;
-		    }
-		}
-	    }
-	}
-    }
-
-    return ret;
-}
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "pixman.h"
+#include <config.h>
+
+#define SIZE 1024
+
+static pixman_indexed_t mono_palette =
+{
+    0, { 0x00000000, 0x00ffffff },
+};
+
+
+typedef struct {
+    pixman_format_code_t format;
+    int width, height;
+    int stride;
+    uint32_t src[SIZE];
+    uint32_t dst[SIZE];
+    pixman_indexed_t *indexed;
+} testcase_t;
+
+static testcase_t testcases[] =
+{
+    {
+	PIXMAN_a8r8g8b8,
+	2, 2,
+	8,
+	{ 0x00112233, 0x44556677,
+	  0x8899aabb, 0xccddeeff },
+	{ 0x00112233, 0x44556677,
+	  0x8899aabb, 0xccddeeff },
+	NULL,
+    },
+    {
+	PIXMAN_r8g8b8a8,
+	2, 2,
+	8,
+	{ 0x11223300, 0x55667744,
+	  0x99aabb88, 0xddeeffcc },
+	{ 0x00112233, 0x44556677,
+	  0x8899aabb, 0xccddeeff },
+	NULL,
+    },
+    {
+	PIXMAN_g1,
+	8, 2,
+	4,
+#ifdef WORDS_BIGENDIAN
+	{
+	    0xaa000000,
+	    0x55000000
+	},
+#else
+	{
+	    0x00000055,
+	    0x000000aa
+	},
+#endif
+	{
+	    0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000,
+	    0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff
+	},
+	&mono_palette,
+    },
+#if 0
+    {
+	PIXMAN_g8,
+	4, 2,
+	4,
+	{ 0x01234567,
+	  0x89abcdef },
+	{ 0x00010101, 0x00232323, 0x00454545, 0x00676767,
+	  0x00898989, 0x00ababab, 0x00cdcdcd, 0x00efefef, },
+    },
+#endif
+    /* FIXME: make this work on big endian */
+    {
+	PIXMAN_yv12,
+	8, 2,
+	8,
+#ifdef WORDS_BIGENDIAN
+	{
+	    0x00ff00ff, 0x00ff00ff,
+	    0xff00ff00, 0xff00ff00,
+	    0x80ff8000,
+	    0x800080ff
+	},
+#else
+	{
+	    0xff00ff00, 0xff00ff00,
+	    0x00ff00ff, 0x00ff00ff,
+	    0x0080ff80,
+	    0xff800080
+	},
+#endif
+	{
+	    0xff000000, 0xffffffff, 0xffb80000, 0xffffe113,
+	    0xff000000, 0xffffffff, 0xff0023ee, 0xff4affff,
+	    0xffffffff, 0xff000000, 0xffffe113, 0xffb80000,
+	    0xffffffff, 0xff000000, 0xff4affff, 0xff0023ee,
+	},
+    },
+};
+
+int n_test_cases = sizeof(testcases)/sizeof(testcases[0]);
+
+
+static uint32_t
+reader (const void *src, int size)
+{
+    switch (size)
+    {
+    case 1:
+	return *(uint8_t *)src;
+    case 2:
+	return *(uint16_t *)src;
+    case 4:
+	return *(uint32_t *)src;
+    default:
+	assert(0);
+	return 0; /* silence MSVC */
+    }
+}
+
+
+static void
+writer (void *src, uint32_t value, int size)
+{
+    switch (size)
+    {
+    case 1:
+	*(uint8_t *)src = value;
+	break;
+    case 2:
+	*(uint16_t *)src = value;
+	break;
+    case 4:
+	*(uint32_t *)src = value;
+	break;
+    default:
+	assert(0);
+    }
+}
+
+
+int
+main (int argc, char **argv)
+{
+    uint32_t dst[SIZE];
+    pixman_image_t *src_img;
+    pixman_image_t *dst_img;
+    int i, j, x, y;
+    int ret = 0;
+
+    for (i = 0; i < n_test_cases; ++i)
+    {
+	for (j = 0; j < 2; ++j)
+	{
+	    src_img = pixman_image_create_bits (testcases[i].format,
+						testcases[i].width,
+						testcases[i].height,
+						testcases[i].src,
+						testcases[i].stride);
+	    pixman_image_set_indexed(src_img, testcases[i].indexed);
+
+	    dst_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+						testcases[i].width,
+						testcases[i].height,
+						dst,
+						testcases[i].width*4);
+
+	    if (j)
+	    {
+		pixman_image_set_accessors (src_img, reader, writer);
+		pixman_image_set_accessors (dst_img, reader, writer);
+	    }
+
+	    pixman_image_composite (PIXMAN_OP_SRC, src_img, NULL, dst_img,
+				    0, 0, 0, 0, 0, 0, testcases[i].width, testcases[i].height);
+
+	    pixman_image_unref (src_img);
+	    pixman_image_unref (dst_img);
+
+	    for (y = 0; y < testcases[i].height; ++y)
+	    {
+		for (x = 0; x < testcases[i].width; ++x)
+		{
+		    int offset = y * testcases[i].width + x;
+
+		    if (dst[offset] != testcases[i].dst[offset])
+		    {
+			printf ("test %i%c: pixel mismatch at (x=%d,y=%d): %08x expected, %08x obtained\n",
+			        i + 1, 'a' + j,
+			        x, y,
+			        testcases[i].dst[offset], dst[offset]);
+			ret = 1;
+		    }
+		}
+	    }
+	}
+    }
+
+    return ret;
+}
diff --git a/pixman/test/stress-test.c b/pixman/test/stress-test.c
index 92b5f0bca..571420ab0 100644
--- a/pixman/test/stress-test.c
+++ b/pixman/test/stress-test.c
@@ -1,872 +1,872 @@
-#include <stdio.h>
-#include "utils.h"
-#include <sys/types.h>
-
-#if 0
-#define fence_malloc malloc
-#define fence_free free
-#define make_random_bytes malloc
-#endif
-
-static const pixman_format_code_t image_formats[] =
-{
-    PIXMAN_a8r8g8b8,
-    PIXMAN_x8r8g8b8,
-    PIXMAN_r5g6b5,
-    PIXMAN_r3g3b2,
-    PIXMAN_a8,
-    PIXMAN_a8b8g8r8,
-    PIXMAN_x8b8g8r8,
-    PIXMAN_b8g8r8a8,
-    PIXMAN_b8g8r8x8,
-    PIXMAN_r8g8b8a8,
-    PIXMAN_r8g8b8x8,
-    PIXMAN_x14r6g6b6,
-    PIXMAN_r8g8b8,
-    PIXMAN_b8g8r8,
-    PIXMAN_r5g6b5,
-    PIXMAN_b5g6r5,
-    PIXMAN_x2r10g10b10,
-    PIXMAN_a2r10g10b10,
-    PIXMAN_x2b10g10r10,
-    PIXMAN_a2b10g10r10,
-    PIXMAN_a1r5g5b5,
-    PIXMAN_x1r5g5b5,
-    PIXMAN_a1b5g5r5,
-    PIXMAN_x1b5g5r5,
-    PIXMAN_a4r4g4b4,
-    PIXMAN_x4r4g4b4,
-    PIXMAN_a4b4g4r4,
-    PIXMAN_x4b4g4r4,
-    PIXMAN_a8,
-    PIXMAN_r3g3b2,
-    PIXMAN_b2g3r3,
-    PIXMAN_a2r2g2b2,
-    PIXMAN_a2b2g2r2,
-    PIXMAN_c8,
-    PIXMAN_g8,
-    PIXMAN_x4c4,
-    PIXMAN_x4g4,
-    PIXMAN_c4,
-    PIXMAN_g4,
-    PIXMAN_g1,
-    PIXMAN_x4a4,
-    PIXMAN_a4,
-    PIXMAN_r1g2b1,
-    PIXMAN_b1g2r1,
-    PIXMAN_a1r1g1b1,
-    PIXMAN_a1b1g1r1,
-    PIXMAN_a1
-};
-
-static pixman_filter_t filters[] =
-{
-    PIXMAN_FILTER_NEAREST,
-    PIXMAN_FILTER_BILINEAR,
-    PIXMAN_FILTER_FAST,
-    PIXMAN_FILTER_GOOD,
-    PIXMAN_FILTER_BEST,
-    PIXMAN_FILTER_CONVOLUTION
-};
-
-static int
-get_size (void)
-{
-    switch (lcg_rand_n (28))
-    {
-    case 0:
-	return 1;
-
-    case 1:
-	return 2;
-
-    default:
-    case 2:
-	return lcg_rand_n (200);
-
-    case 4:
-	return lcg_rand_n (2000) + 1000;
-
-    case 5:
-	return 65535;
-
-    case 6:
-	return 65536;
-
-    case 7:
-	return lcg_rand_N (64000) + 63000;
-    }
-}
-
-static void
-destroy (pixman_image_t *image, void *data)
-{
-    if (image->type == BITS && image->bits.free_me != image->bits.bits)
-    {
-	uint32_t *bits;
-
-	if (image->bits.bits != (void *)0x01)
-	{
-	    bits = image->bits.bits;
-
-	    if (image->bits.rowstride < 0)
-		bits -= (- image->bits.rowstride * (image->bits.height - 1));
-
-	    fence_free (bits);
-	}
-    }
-
-    free (data);
-}
-
-static uint32_t
-real_reader (const void *src, int size)
-{
-    switch (size)
-    {
-    case 1:
-	return *(uint8_t *)src;
-    case 2:
-	return *(uint16_t *)src;
-    case 4:
-	return *(uint32_t *)src;
-    default:
-	assert (0);
-	return 0; /* silence MSVC */
-    }
-}
-
-static void
-real_writer (void *src, uint32_t value, int size)
-{
-    switch (size)
-    {
-    case 1:
-	*(uint8_t *)src = value;
-	break;
-
-    case 2:
-	*(uint16_t *)src = value;
-	break;
-
-    case 4:
-	*(uint32_t *)src = value;
-	break;
-
-    default:
-	assert (0);
-	break;
-    }
-}
-
-static uint32_t
-fake_reader (const void *src, int size)
-{
-    uint32_t r = lcg_rand_u32 ();
-
-    assert (size == 1 || size == 2 || size == 4);
-    return r & ((1 << (size * 8)) - 1);
-}
-
-static void
-fake_writer (void *src, uint32_t value, int size)
-{
-    assert (size == 1 || size == 2 || size == 4);
-}
-
-static int32_t
-log_rand (void)
-{
-    uint32_t mask;
-
-    mask = (1 << lcg_rand_n (31)) - 1;
-
-    return (lcg_rand () & mask) - (mask >> 1);
-}
-
-static pixman_image_t *
-create_random_bits_image (void)
-{
-    pixman_format_code_t format;
-    pixman_indexed_t *indexed;
-    pixman_image_t *image;
-    int width, height, stride;
-    uint32_t *bits;
-    pixman_read_memory_func_t read_func = NULL;
-    pixman_write_memory_func_t write_func = NULL;
-    pixman_filter_t filter;
-    pixman_fixed_t *coefficients = NULL;
-    int n_coefficients = 0;
-
-    /* format */
-    format = image_formats[lcg_rand_n (ARRAY_LENGTH (image_formats))];
-
-    indexed = NULL;
-    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_COLOR)
-    {
-	indexed = malloc (sizeof (pixman_indexed_t));
-
-	initialize_palette (indexed, PIXMAN_FORMAT_BPP (format), TRUE);
-    }
-    else if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_GRAY)
-    {
-	indexed = malloc (sizeof (pixman_indexed_t));
-
-	initialize_palette (indexed, PIXMAN_FORMAT_BPP (format), FALSE);
-    }
-    else
-    {
-	indexed = NULL;
-    }
-
-    /* size */
-    width = get_size ();
-    height = get_size ();
-
-    if ((uint64_t)width * height > 200000)
-    {
-	if (lcg_rand_n(2) == 0)
-	    height = 200000 / width;
-	else
-	    width = 200000 / height;
-    }
-
-    if (height == 0)
-	height = 1;
-    if (width == 0)
-	width = 1;
-
-    /* bits */
-    switch (lcg_rand_n (7))
-    {
-    default:
-    case 0:
-	stride = width * PIXMAN_FORMAT_BPP (format) + lcg_rand_n (17);
-	stride = (stride + 3) & (~3);
-	bits = (uint32_t *)make_random_bytes (height * stride);
-	break;
-
-    case 1:
-	stride = 0;
-	bits = NULL;
-	break;
-
-    case 2: /* Zero-filled */
-	stride = width * PIXMAN_FORMAT_BPP (format) + lcg_rand_n (17);
-	stride = (stride + 3) & (~3);
-	bits = fence_malloc (height * stride);
-	if (!bits)
-	    return NULL;
-	memset (bits, 0, height * stride);
-	break;
-
-    case 3: /* Filled with 0xFF */
-	stride = width * PIXMAN_FORMAT_BPP (format) + lcg_rand_n (17);
-	stride = (stride + 3) & (~3);
-	bits = fence_malloc (height * stride);
-	if (!bits)
-	    return NULL;
-	memset (bits, 0xff, height * stride);
-	break;
-
-    case 4: /* bits is a bad pointer, has read/write functions */
-	stride = 232;
-	bits = (void *)0x01;
-	read_func = fake_reader;
-	write_func = fake_writer;
-	break;
-
-    case 5: /* bits is a real pointer, has read/write functions */
-	stride = width * PIXMAN_FORMAT_BPP (format) + lcg_rand_n (17);
-	stride = (stride + 3) & (~3);
-	bits = fence_malloc (height * stride);
-	if (!bits)
-	    return NULL;
-	memset (bits, 0xff, height * stride);
-	read_func = real_reader;
-	write_func = real_writer;
-	break;
-
-    case 6: /* bits is a real pointer, stride is negative */
-	stride = (width * PIXMAN_FORMAT_BPP (format) + lcg_rand_n (17));
-	stride = (stride + 3) & (~3);
-	bits = (uint32_t *)make_random_bytes (height * stride);
-	if (!bits)
-	    return NULL;
-	bits += ((height - 1) * stride) / 4;
-	stride = - stride;
-	break;
-    }
-
-    /* Filter */
-    filter = filters[lcg_rand_n (ARRAY_LENGTH (filters))];
-    if (filter == PIXMAN_FILTER_CONVOLUTION)
-    {
-	int width = lcg_rand_n (17);
-	int height = lcg_rand_n (19);
-
-	n_coefficients = width * height + 2;
-	coefficients = malloc (n_coefficients * sizeof (pixman_fixed_t));
-
-	if (coefficients)
-	{
-	    int i;
-
-	    for (i = 0; i < width * height; ++i)
-		coefficients[i + 2] = lcg_rand_u32();
-
-	    coefficients[0] = width << 16;
-	    coefficients[1] = height << 16;
-	}
-	else
-	{
-	    filter = PIXMAN_FILTER_BEST;
-	}
-    }
-
-    /* Finally create the image */
-    image = pixman_image_create_bits (format, width, height, bits, stride);
-    if (!image)
-	return NULL;
-
-    pixman_image_set_indexed (image, indexed);
-    pixman_image_set_destroy_function (image, destroy, indexed);
-    pixman_image_set_accessors (image, read_func, write_func);
-    pixman_image_set_filter (image, filter, coefficients, n_coefficients);
-
-    return image;
-}
-
-static pixman_repeat_t repeats[] =
-{
-    PIXMAN_REPEAT_NONE,
-    PIXMAN_REPEAT_NORMAL,
-    PIXMAN_REPEAT_REFLECT,
-    PIXMAN_REPEAT_PAD
-};
-
-static uint32_t
-absolute (int32_t i)
-{
-    return i < 0? -i : i;
-}
-
-static void
-set_general_properties (pixman_image_t *image, pixman_bool_t allow_alpha_map)
-{
-    pixman_repeat_t repeat;
-
-    /* Set properties that are generic to all images */
-
-    /* Repeat */
-    repeat = repeats[lcg_rand_n (ARRAY_LENGTH (repeats))];
-    pixman_image_set_repeat (image, repeat);
-
-    /* Alpha map */
-    if (allow_alpha_map && lcg_rand_n (3) == 0)
-    {
-	pixman_image_t *alpha_map;
-	int16_t x, y;
-
-	alpha_map = create_random_bits_image ();
-
-	if (alpha_map)
-	{
-	    set_general_properties (alpha_map, FALSE);
-
-	    x = lcg_rand_N (100000) - 65536;
-	    y = lcg_rand_N (100000) - 65536;
-
-	    pixman_image_set_alpha_map (image, alpha_map, x, y);
-
-	    pixman_image_unref (alpha_map);
-	}
-    }
-
-    /* Component alpha */
-    pixman_image_set_component_alpha (image, lcg_rand_n (3) == 0);
-
-    /* Clip region */
-    if (lcg_rand_n (8) != 0)
-    {
-	pixman_region32_t region;
-	int i, n_rects;
-
-	pixman_region32_init (&region);
-
-	switch (lcg_rand_n (10))
-	{
-	case 0:
-	    n_rects = 0;
-	    break;
-
-	case 1: case 2: case 3:
-	    n_rects = 1;
-	    break;
-
-	case 4: case 5:
-	    n_rects = 2;
-	    break;
-
-	case 6: case 7:
-	    n_rects = 3;
-
-	default:
-	    n_rects = lcg_rand_n (100);
-	    break;
-	}
-
-	for (i = 0; i < n_rects; ++i)
-	{
-	    uint32_t width, height;
-	    int x, y;
-
-	    x = log_rand();
-	    y = log_rand();
-	    width = absolute (log_rand ()) + 1;
-	    height = absolute (log_rand ()) + 1;
-
-	    pixman_region32_union_rect (
-		&region, &region, x, y, width, height);
-	}
-
-	pixman_image_set_clip_region32 (image, &region);
-
-	pixman_region32_fini (&region);
-    }
-
-    /* Whether source clipping is enabled */
-    pixman_image_set_source_clipping (image, !!lcg_rand_n (2));
-
-    /* Client clip */
-    pixman_image_set_has_client_clip (image, !!lcg_rand_n (2));
-
-    /* Transform */
-    if (lcg_rand_n (5) < 2)
-    {
-	pixman_transform_t xform;
-	int i, j, k;
-	uint32_t tx, ty, sx, sy;
-	uint32_t c, s;
-
-	memset (&xform, 0, sizeof xform);
-	xform.matrix[0][0] = pixman_fixed_1;
-	xform.matrix[1][1] = pixman_fixed_1;
-	xform.matrix[2][2] = pixman_fixed_1;
-
-	for (k = 0; k < 3; ++k)
-	{
-	    switch (lcg_rand_n (4))
-	    {
-	    case 0:
-		/* rotation */
-		c = lcg_rand_N (2 * 65536) - 65536;
-		s = lcg_rand_N (2 * 65536) - 65536;
-		pixman_transform_rotate (&xform, NULL, c, s);
-		break;
-
-	    case 1:
-		/* translation */
-		tx = lcg_rand_u32();
-		ty = lcg_rand_u32();
-		pixman_transform_translate (&xform, NULL, tx, ty);
-		break;
-
-	    case 2:
-		/* scale */
-		sx = lcg_rand_u32();
-		sy = lcg_rand_u32();
-		pixman_transform_scale (&xform, NULL, sx, sy);
-		break;
-
-	    case 3:
-		if (lcg_rand_n (16) == 0)
-		{
-		    /* random */
-		    for (i = 0; i < 3; ++i)
-			for (j = 0; j < 3; ++j)
-			    xform.matrix[i][j] = lcg_rand_u32();
-		    break;
-		}
-		else if (lcg_rand_n (16) == 0)
-		{
-		    /* zero */
-		    memset (&xform, 0, sizeof xform);
-		}
-		break;
-	    }
-	}
-
-	pixman_image_set_transform (image, &xform);
-    }
-}
-
-static pixman_color_t
-random_color (void)
-{
-    pixman_color_t color =
-    {
-	lcg_rand() & 0xffff,
-	lcg_rand() & 0xffff,
-	lcg_rand() & 0xffff,
-	lcg_rand() & 0xffff,
-    };
-
-    return color;
-}
-
-
-static pixman_image_t *
-create_random_solid_image (void)
-{
-    pixman_color_t color = random_color();
-    pixman_image_t *image = pixman_image_create_solid_fill (&color);
-
-    return image;
-}
-
-static pixman_gradient_stop_t *
-create_random_stops (int *n_stops)
-{
-    pixman_fixed_t step;
-    pixman_fixed_t s;
-    int i;
-    pixman_gradient_stop_t *stops;
-
-    *n_stops = lcg_rand_n (50) + 1;
-
-    step = pixman_fixed_1 / *n_stops;
-
-    stops = malloc (*n_stops * sizeof (pixman_gradient_stop_t));
-
-    s = 0;
-    for (i = 0; i < (*n_stops) - 1; ++i)
-    {
-	stops[i].x = s;
-	stops[i].color = random_color();
-
-	s += step;
-    }
-
-    stops[*n_stops - 1].x = pixman_fixed_1;
-    stops[*n_stops - 1].color = random_color();
-
-    return stops;
-}
-
-static pixman_point_fixed_t
-create_random_point (void)
-{
-    pixman_point_fixed_t p;
-
-    p.x = log_rand ();
-    p.y = log_rand ();
-
-    return p;
-}
-
-static pixman_image_t *
-create_random_linear_image (void)
-{
-    int n_stops;
-    pixman_gradient_stop_t *stops;
-    pixman_point_fixed_t p1, p2;
-    pixman_image_t *result;
-
-    stops = create_random_stops (&n_stops);
-    if (!stops)
-	return NULL;
-
-    p1 = create_random_point ();
-    p2 = create_random_point ();
-
-    result = pixman_image_create_linear_gradient (&p1, &p2, stops, n_stops);
-
-    free (stops);
-
-    return result;
-}
-
-static pixman_image_t *
-create_random_radial_image (void)
-{
-    int n_stops;
-    pixman_gradient_stop_t *stops;
-    pixman_point_fixed_t inner_c, outer_c;
-    pixman_fixed_t inner_r, outer_r;
-    pixman_image_t *result;
-
-    inner_c = create_random_point();
-    outer_c = create_random_point();
-    inner_r = lcg_rand();
-    outer_r = lcg_rand();
-
-    stops = create_random_stops (&n_stops);
-
-    if (!stops)
-	return NULL;
-
-    result = pixman_image_create_radial_gradient (
-	&inner_c, &outer_c, inner_r, outer_r, stops, n_stops);
-
-    free (stops);
-
-    return result;
-}
-
-static pixman_image_t *
-create_random_conical_image (void)
-{
-    pixman_gradient_stop_t *stops;
-    int n_stops;
-    pixman_point_fixed_t c;
-    pixman_fixed_t angle;
-    pixman_image_t *result;
-
-    c = create_random_point();
-    angle = lcg_rand();
-
-    stops = create_random_stops (&n_stops);
-
-    if (!stops)
-	return NULL;
-
-    result = pixman_image_create_conical_gradient (&c, angle, stops, n_stops);
-
-    free (stops);
-
-    return result;
-}
-
-static pixman_image_t *
-create_random_image (void)
-{
-    pixman_image_t *result;
-
-    switch (lcg_rand_n (5))
-    {
-    default:
-    case 0:
-	result = create_random_bits_image ();
-	break;
-
-    case 1:
-	result = create_random_solid_image ();
-	break;
-
-    case 2:
-	result = create_random_linear_image ();
-	break;
-
-    case 3:
-	result = create_random_radial_image ();
-	break;
-
-    case 4:
-	result = create_random_conical_image ();
-	break;
-    }
-
-    if (result)
-	set_general_properties (result, TRUE);
-
-    return result;
-}
-
-static const pixman_op_t op_list[] =
-{
-    PIXMAN_OP_SRC,
-    PIXMAN_OP_OVER,
-    PIXMAN_OP_ADD,
-    PIXMAN_OP_CLEAR,
-    PIXMAN_OP_SRC,
-    PIXMAN_OP_DST,
-    PIXMAN_OP_OVER,
-    PIXMAN_OP_OVER_REVERSE,
-    PIXMAN_OP_IN,
-    PIXMAN_OP_IN_REVERSE,
-    PIXMAN_OP_OUT,
-    PIXMAN_OP_OUT_REVERSE,
-    PIXMAN_OP_ATOP,
-    PIXMAN_OP_ATOP_REVERSE,
-    PIXMAN_OP_XOR,
-    PIXMAN_OP_ADD,
-    PIXMAN_OP_SATURATE,
-    PIXMAN_OP_DISJOINT_CLEAR,
-    PIXMAN_OP_DISJOINT_SRC,
-    PIXMAN_OP_DISJOINT_DST,
-    PIXMAN_OP_DISJOINT_OVER,
-    PIXMAN_OP_DISJOINT_OVER_REVERSE,
-    PIXMAN_OP_DISJOINT_IN,
-    PIXMAN_OP_DISJOINT_IN_REVERSE,
-    PIXMAN_OP_DISJOINT_OUT,
-    PIXMAN_OP_DISJOINT_OUT_REVERSE,
-    PIXMAN_OP_DISJOINT_ATOP,
-    PIXMAN_OP_DISJOINT_ATOP_REVERSE,
-    PIXMAN_OP_DISJOINT_XOR,
-    PIXMAN_OP_CONJOINT_CLEAR,
-    PIXMAN_OP_CONJOINT_SRC,
-    PIXMAN_OP_CONJOINT_DST,
-    PIXMAN_OP_CONJOINT_OVER,
-    PIXMAN_OP_CONJOINT_OVER_REVERSE,
-    PIXMAN_OP_CONJOINT_IN,
-    PIXMAN_OP_CONJOINT_IN_REVERSE,
-    PIXMAN_OP_CONJOINT_OUT,
-    PIXMAN_OP_CONJOINT_OUT_REVERSE,
-    PIXMAN_OP_CONJOINT_ATOP,
-    PIXMAN_OP_CONJOINT_ATOP_REVERSE,
-    PIXMAN_OP_CONJOINT_XOR,
-    PIXMAN_OP_MULTIPLY,
-    PIXMAN_OP_SCREEN,
-    PIXMAN_OP_OVERLAY,
-    PIXMAN_OP_DARKEN,
-    PIXMAN_OP_LIGHTEN,
-    PIXMAN_OP_COLOR_DODGE,
-    PIXMAN_OP_COLOR_BURN,
-    PIXMAN_OP_HARD_LIGHT,
-    PIXMAN_OP_DIFFERENCE,
-    PIXMAN_OP_EXCLUSION,
-    PIXMAN_OP_SOFT_LIGHT,
-    PIXMAN_OP_HSL_HUE,
-    PIXMAN_OP_HSL_SATURATION,
-    PIXMAN_OP_HSL_COLOR,
-    PIXMAN_OP_HSL_LUMINOSITY,
-};
-
-static void
-run_test (uint32_t seed, pixman_bool_t verbose, uint32_t mod)
-{
-    pixman_image_t *source, *mask, *dest;
-    pixman_op_t op;
-
-    if (verbose)
-    {
-	if (mod == 0 || (seed % mod) == 0)
-	    printf ("Seed 0x%08x\n", seed);
-    }
-	    
-    lcg_srand (seed);
-
-    source = create_random_image ();
-    mask   = create_random_image ();
-    dest   = create_random_bits_image ();
-
-    if (source && mask && dest)
-    {
-	set_general_properties (dest, TRUE);
-
-	op = op_list [lcg_rand_n (ARRAY_LENGTH (op_list))];
-
-	pixman_image_composite32 (op,
-				  source, mask, dest,
-				  log_rand(), log_rand(),
-				  log_rand(), log_rand(),
-				  log_rand(), log_rand(),
-				  absolute (log_rand()),
-				  absolute (log_rand()));
-    }
-    if (source)
-	pixman_image_unref (source);
-    if (mask)
-	pixman_image_unref (mask);
-    if (dest)
-	pixman_image_unref (dest);
-}
-
-static pixman_bool_t
-get_int (char *s, uint32_t *i)
-{
-    char *end;
-    int p;
-
-    p = strtol (s, &end, 0);
-
-    if (end != s && *end == 0)
-    {
-	*i = p;
-	return TRUE;
-    }
-
-    return FALSE;
-}
-
-int
-main (int argc, char **argv)
-{
-    int verbose = FALSE;
-    uint32_t seed = 1;
-    uint32_t n_tests = 0xffffffff;
-    uint32_t mod = 0;
-    pixman_bool_t use_threads = TRUE;
-    uint32_t i;
-
-    pixman_disable_out_of_bounds_workaround ();
-
-    enable_fp_exceptions();
-
-    if (getenv ("VERBOSE") != NULL)
-	verbose = TRUE;
-
-    for (i = 1; i < argc; ++i)
-    {
-	if (strcmp (argv[i], "-v") == 0)
-	{
-	    verbose = TRUE;
-
-	    if (i + 1 < argc)
-	    {
-		get_int (argv[i + 1], &mod);
-		i++;
-	    }
-	}
-	else if (strcmp (argv[i], "-s") == 0 && i + 1 < argc)
-	{
-	    get_int (argv[i + 1], &seed);
-	    use_threads = FALSE;
-	    i++;
-	}
-	else if (strcmp (argv[i], "-n") == 0 && i + 1 < argc)
-	{
-	    get_int (argv[i + 1], &n_tests);
-	    i++;
-	}
-	else
-	{
-	    if (strcmp (argv[i], "-h") != 0)
-		printf ("Unknown option '%s'\n\n", argv[i]);
-
-	    printf ("Options:\n\n"
-		    "-n <number>        Number of tests to run\n"
-		    "-s <seed> 	        Seed of first test (ignored if PIXMAN_RANDOMIZE_TESTS is set)\n"
-		    "-v                 Print out seeds\n"
-		    "-v <n>             Print out every n'th seed\n\n");
-
-	    exit (-1);
-	}
-    }
-
-    if (n_tests == 0xffffffff)
-	n_tests = 8000;
-
-    if (getenv ("PIXMAN_RANDOMIZE_TESTS"))
-    {
-	seed = get_random_seed();
-	printf ("First seed: 0x%08x\n", seed);
-    }
-
-    if (use_threads)
-    {
-#ifdef USE_OPENMP
-#   pragma omp parallel for default(none) shared(verbose, n_tests, mod, seed)
-#endif
-	for (i = seed; i < seed + n_tests; ++i)
-	    run_test (i, verbose, mod);
-    }
-    else
-    {
-	for (i = seed; i < seed + n_tests; ++i)
-	    run_test (i, verbose, mod);
-    }
-
-    return 0;
-}
+#include <stdio.h>
+#include "utils.h"
+#include <sys/types.h>
+
+#if 0
+#define fence_malloc malloc
+#define fence_free free
+#define make_random_bytes malloc
+#endif
+
+static const pixman_format_code_t image_formats[] =
+{
+    PIXMAN_a8r8g8b8,
+    PIXMAN_x8r8g8b8,
+    PIXMAN_r5g6b5,
+    PIXMAN_r3g3b2,
+    PIXMAN_a8,
+    PIXMAN_a8b8g8r8,
+    PIXMAN_x8b8g8r8,
+    PIXMAN_b8g8r8a8,
+    PIXMAN_b8g8r8x8,
+    PIXMAN_r8g8b8a8,
+    PIXMAN_r8g8b8x8,
+    PIXMAN_x14r6g6b6,
+    PIXMAN_r8g8b8,
+    PIXMAN_b8g8r8,
+    PIXMAN_r5g6b5,
+    PIXMAN_b5g6r5,
+    PIXMAN_x2r10g10b10,
+    PIXMAN_a2r10g10b10,
+    PIXMAN_x2b10g10r10,
+    PIXMAN_a2b10g10r10,
+    PIXMAN_a1r5g5b5,
+    PIXMAN_x1r5g5b5,
+    PIXMAN_a1b5g5r5,
+    PIXMAN_x1b5g5r5,
+    PIXMAN_a4r4g4b4,
+    PIXMAN_x4r4g4b4,
+    PIXMAN_a4b4g4r4,
+    PIXMAN_x4b4g4r4,
+    PIXMAN_a8,
+    PIXMAN_r3g3b2,
+    PIXMAN_b2g3r3,
+    PIXMAN_a2r2g2b2,
+    PIXMAN_a2b2g2r2,
+    PIXMAN_c8,
+    PIXMAN_g8,
+    PIXMAN_x4c4,
+    PIXMAN_x4g4,
+    PIXMAN_c4,
+    PIXMAN_g4,
+    PIXMAN_g1,
+    PIXMAN_x4a4,
+    PIXMAN_a4,
+    PIXMAN_r1g2b1,
+    PIXMAN_b1g2r1,
+    PIXMAN_a1r1g1b1,
+    PIXMAN_a1b1g1r1,
+    PIXMAN_a1
+};
+
+static pixman_filter_t filters[] =
+{
+    PIXMAN_FILTER_NEAREST,
+    PIXMAN_FILTER_BILINEAR,
+    PIXMAN_FILTER_FAST,
+    PIXMAN_FILTER_GOOD,
+    PIXMAN_FILTER_BEST,
+    PIXMAN_FILTER_CONVOLUTION
+};
+
+static int
+get_size (void)
+{
+    switch (lcg_rand_n (28))
+    {
+    case 0:
+	return 1;
+
+    case 1:
+	return 2;
+
+    default:
+    case 2:
+	return lcg_rand_n (200);
+
+    case 4:
+	return lcg_rand_n (2000) + 1000;
+
+    case 5:
+	return 65535;
+
+    case 6:
+	return 65536;
+
+    case 7:
+	return lcg_rand_N (64000) + 63000;
+    }
+}
+
+static void
+destroy (pixman_image_t *image, void *data)
+{
+    if (image->type == BITS && image->bits.free_me != image->bits.bits)
+    {
+	uint32_t *bits;
+
+	if (image->bits.bits != (void *)0x01)
+	{
+	    bits = image->bits.bits;
+
+	    if (image->bits.rowstride < 0)
+		bits -= (- image->bits.rowstride * (image->bits.height - 1));
+
+	    fence_free (bits);
+	}
+    }
+
+    free (data);
+}
+
+static uint32_t
+real_reader (const void *src, int size)
+{
+    switch (size)
+    {
+    case 1:
+	return *(uint8_t *)src;
+    case 2:
+	return *(uint16_t *)src;
+    case 4:
+	return *(uint32_t *)src;
+    default:
+	assert (0);
+	return 0; /* silence MSVC */
+    }
+}
+
+static void
+real_writer (void *src, uint32_t value, int size)
+{
+    switch (size)
+    {
+    case 1:
+	*(uint8_t *)src = value;
+	break;
+
+    case 2:
+	*(uint16_t *)src = value;
+	break;
+
+    case 4:
+	*(uint32_t *)src = value;
+	break;
+
+    default:
+	assert (0);
+	break;
+    }
+}
+
+static uint32_t
+fake_reader (const void *src, int size)
+{
+    uint32_t r = lcg_rand_u32 ();
+
+    assert (size == 1 || size == 2 || size == 4);
+    return r & ((1 << (size * 8)) - 1);
+}
+
+static void
+fake_writer (void *src, uint32_t value, int size)
+{
+    assert (size == 1 || size == 2 || size == 4);
+}
+
+static int32_t
+log_rand (void)
+{
+    uint32_t mask;
+
+    mask = (1 << lcg_rand_n (31)) - 1;
+
+    return (lcg_rand () & mask) - (mask >> 1);
+}
+
+static pixman_image_t *
+create_random_bits_image (void)
+{
+    pixman_format_code_t format;
+    pixman_indexed_t *indexed;
+    pixman_image_t *image;
+    int width, height, stride;
+    uint32_t *bits;
+    pixman_read_memory_func_t read_func = NULL;
+    pixman_write_memory_func_t write_func = NULL;
+    pixman_filter_t filter;
+    pixman_fixed_t *coefficients = NULL;
+    int n_coefficients = 0;
+
+    /* format */
+    format = image_formats[lcg_rand_n (ARRAY_LENGTH (image_formats))];
+
+    indexed = NULL;
+    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_COLOR)
+    {
+	indexed = malloc (sizeof (pixman_indexed_t));
+
+	initialize_palette (indexed, PIXMAN_FORMAT_BPP (format), TRUE);
+    }
+    else if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_GRAY)
+    {
+	indexed = malloc (sizeof (pixman_indexed_t));
+
+	initialize_palette (indexed, PIXMAN_FORMAT_BPP (format), FALSE);
+    }
+    else
+    {
+	indexed = NULL;
+    }
+
+    /* size */
+    width = get_size ();
+    height = get_size ();
+
+    if ((uint64_t)width * height > 200000)
+    {
+	if (lcg_rand_n(2) == 0)
+	    height = 200000 / width;
+	else
+	    width = 200000 / height;
+    }
+
+    if (height == 0)
+	height = 1;
+    if (width == 0)
+	width = 1;
+
+    /* bits */
+    switch (lcg_rand_n (7))
+    {
+    default:
+    case 0:
+	stride = width * PIXMAN_FORMAT_BPP (format) + lcg_rand_n (17);
+	stride = (stride + 3) & (~3);
+	bits = (uint32_t *)make_random_bytes (height * stride);
+	break;
+
+    case 1:
+	stride = 0;
+	bits = NULL;
+	break;
+
+    case 2: /* Zero-filled */
+	stride = width * PIXMAN_FORMAT_BPP (format) + lcg_rand_n (17);
+	stride = (stride + 3) & (~3);
+	bits = fence_malloc (height * stride);
+	if (!bits)
+	    return NULL;
+	memset (bits, 0, height * stride);
+	break;
+
+    case 3: /* Filled with 0xFF */
+	stride = width * PIXMAN_FORMAT_BPP (format) + lcg_rand_n (17);
+	stride = (stride + 3) & (~3);
+	bits = fence_malloc (height * stride);
+	if (!bits)
+	    return NULL;
+	memset (bits, 0xff, height * stride);
+	break;
+
+    case 4: /* bits is a bad pointer, has read/write functions */
+	stride = 232;
+	bits = (void *)0x01;
+	read_func = fake_reader;
+	write_func = fake_writer;
+	break;
+
+    case 5: /* bits is a real pointer, has read/write functions */
+	stride = width * PIXMAN_FORMAT_BPP (format) + lcg_rand_n (17);
+	stride = (stride + 3) & (~3);
+	bits = fence_malloc (height * stride);
+	if (!bits)
+	    return NULL;
+	memset (bits, 0xff, height * stride);
+	read_func = real_reader;
+	write_func = real_writer;
+	break;
+
+    case 6: /* bits is a real pointer, stride is negative */
+	stride = (width * PIXMAN_FORMAT_BPP (format) + lcg_rand_n (17));
+	stride = (stride + 3) & (~3);
+	bits = (uint32_t *)make_random_bytes (height * stride);
+	if (!bits)
+	    return NULL;
+	bits += ((height - 1) * stride) / 4;
+	stride = - stride;
+	break;
+    }
+
+    /* Filter */
+    filter = filters[lcg_rand_n (ARRAY_LENGTH (filters))];
+    if (filter == PIXMAN_FILTER_CONVOLUTION)
+    {
+	int width = lcg_rand_n (17);
+	int height = lcg_rand_n (19);
+
+	n_coefficients = width * height + 2;
+	coefficients = malloc (n_coefficients * sizeof (pixman_fixed_t));
+
+	if (coefficients)
+	{
+	    int i;
+
+	    for (i = 0; i < width * height; ++i)
+		coefficients[i + 2] = lcg_rand_u32();
+
+	    coefficients[0] = width << 16;
+	    coefficients[1] = height << 16;
+	}
+	else
+	{
+	    filter = PIXMAN_FILTER_BEST;
+	}
+    }
+
+    /* Finally create the image */
+    image = pixman_image_create_bits (format, width, height, bits, stride);
+    if (!image)
+	return NULL;
+
+    pixman_image_set_indexed (image, indexed);
+    pixman_image_set_destroy_function (image, destroy, indexed);
+    pixman_image_set_accessors (image, read_func, write_func);
+    pixman_image_set_filter (image, filter, coefficients, n_coefficients);
+
+    return image;
+}
+
+static pixman_repeat_t repeats[] =
+{
+    PIXMAN_REPEAT_NONE,
+    PIXMAN_REPEAT_NORMAL,
+    PIXMAN_REPEAT_REFLECT,
+    PIXMAN_REPEAT_PAD
+};
+
+static uint32_t
+absolute (int32_t i)
+{
+    return i < 0? -i : i;
+}
+
+static void
+set_general_properties (pixman_image_t *image, pixman_bool_t allow_alpha_map)
+{
+    pixman_repeat_t repeat;
+
+    /* Set properties that are generic to all images */
+
+    /* Repeat */
+    repeat = repeats[lcg_rand_n (ARRAY_LENGTH (repeats))];
+    pixman_image_set_repeat (image, repeat);
+
+    /* Alpha map */
+    if (allow_alpha_map && lcg_rand_n (3) == 0)
+    {
+	pixman_image_t *alpha_map;
+	int16_t x, y;
+
+	alpha_map = create_random_bits_image ();
+
+	if (alpha_map)
+	{
+	    set_general_properties (alpha_map, FALSE);
+
+	    x = lcg_rand_N (100000) - 65536;
+	    y = lcg_rand_N (100000) - 65536;
+
+	    pixman_image_set_alpha_map (image, alpha_map, x, y);
+
+	    pixman_image_unref (alpha_map);
+	}
+    }
+
+    /* Component alpha */
+    pixman_image_set_component_alpha (image, lcg_rand_n (3) == 0);
+
+    /* Clip region */
+    if (lcg_rand_n (8) != 0)
+    {
+	pixman_region32_t region;
+	int i, n_rects;
+
+	pixman_region32_init (&region);
+
+	switch (lcg_rand_n (10))
+	{
+	case 0:
+	    n_rects = 0;
+	    break;
+
+	case 1: case 2: case 3:
+	    n_rects = 1;
+	    break;
+
+	case 4: case 5:
+	    n_rects = 2;
+	    break;
+
+	case 6: case 7:
+	    n_rects = 3;
+
+	default:
+	    n_rects = lcg_rand_n (100);
+	    break;
+	}
+
+	for (i = 0; i < n_rects; ++i)
+	{
+	    uint32_t width, height;
+	    int x, y;
+
+	    x = log_rand();
+	    y = log_rand();
+	    width = absolute (log_rand ()) + 1;
+	    height = absolute (log_rand ()) + 1;
+
+	    pixman_region32_union_rect (
+		&region, &region, x, y, width, height);
+	}
+
+	pixman_image_set_clip_region32 (image, &region);
+
+	pixman_region32_fini (&region);
+    }
+
+    /* Whether source clipping is enabled */
+    pixman_image_set_source_clipping (image, !!lcg_rand_n (2));
+
+    /* Client clip */
+    pixman_image_set_has_client_clip (image, !!lcg_rand_n (2));
+
+    /* Transform */
+    if (lcg_rand_n (5) < 2)
+    {
+	pixman_transform_t xform;
+	int i, j, k;
+	uint32_t tx, ty, sx, sy;
+	uint32_t c, s;
+
+	memset (&xform, 0, sizeof xform);
+	xform.matrix[0][0] = pixman_fixed_1;
+	xform.matrix[1][1] = pixman_fixed_1;
+	xform.matrix[2][2] = pixman_fixed_1;
+
+	for (k = 0; k < 3; ++k)
+	{
+	    switch (lcg_rand_n (4))
+	    {
+	    case 0:
+		/* rotation */
+		c = lcg_rand_N (2 * 65536) - 65536;
+		s = lcg_rand_N (2 * 65536) - 65536;
+		pixman_transform_rotate (&xform, NULL, c, s);
+		break;
+
+	    case 1:
+		/* translation */
+		tx = lcg_rand_u32();
+		ty = lcg_rand_u32();
+		pixman_transform_translate (&xform, NULL, tx, ty);
+		break;
+
+	    case 2:
+		/* scale */
+		sx = lcg_rand_u32();
+		sy = lcg_rand_u32();
+		pixman_transform_scale (&xform, NULL, sx, sy);
+		break;
+
+	    case 3:
+		if (lcg_rand_n (16) == 0)
+		{
+		    /* random */
+		    for (i = 0; i < 3; ++i)
+			for (j = 0; j < 3; ++j)
+			    xform.matrix[i][j] = lcg_rand_u32();
+		    break;
+		}
+		else if (lcg_rand_n (16) == 0)
+		{
+		    /* zero */
+		    memset (&xform, 0, sizeof xform);
+		}
+		break;
+	    }
+	}
+
+	pixman_image_set_transform (image, &xform);
+    }
+}
+
+static pixman_color_t
+random_color (void)
+{
+    pixman_color_t color =
+    {
+	lcg_rand() & 0xffff,
+	lcg_rand() & 0xffff,
+	lcg_rand() & 0xffff,
+	lcg_rand() & 0xffff,
+    };
+
+    return color;
+}
+
+
+static pixman_image_t *
+create_random_solid_image (void)
+{
+    pixman_color_t color = random_color();
+    pixman_image_t *image = pixman_image_create_solid_fill (&color);
+
+    return image;
+}
+
+static pixman_gradient_stop_t *
+create_random_stops (int *n_stops)
+{
+    pixman_fixed_t step;
+    pixman_fixed_t s;
+    int i;
+    pixman_gradient_stop_t *stops;
+
+    *n_stops = lcg_rand_n (50) + 1;
+
+    step = pixman_fixed_1 / *n_stops;
+
+    stops = malloc (*n_stops * sizeof (pixman_gradient_stop_t));
+
+    s = 0;
+    for (i = 0; i < (*n_stops) - 1; ++i)
+    {
+	stops[i].x = s;
+	stops[i].color = random_color();
+
+	s += step;
+    }
+
+    stops[*n_stops - 1].x = pixman_fixed_1;
+    stops[*n_stops - 1].color = random_color();
+
+    return stops;
+}
+
+static pixman_point_fixed_t
+create_random_point (void)
+{
+    pixman_point_fixed_t p;
+
+    p.x = log_rand ();
+    p.y = log_rand ();
+
+    return p;
+}
+
+static pixman_image_t *
+create_random_linear_image (void)
+{
+    int n_stops;
+    pixman_gradient_stop_t *stops;
+    pixman_point_fixed_t p1, p2;
+    pixman_image_t *result;
+
+    stops = create_random_stops (&n_stops);
+    if (!stops)
+	return NULL;
+
+    p1 = create_random_point ();
+    p2 = create_random_point ();
+
+    result = pixman_image_create_linear_gradient (&p1, &p2, stops, n_stops);
+
+    free (stops);
+
+    return result;
+}
+
+static pixman_image_t *
+create_random_radial_image (void)
+{
+    int n_stops;
+    pixman_gradient_stop_t *stops;
+    pixman_point_fixed_t inner_c, outer_c;
+    pixman_fixed_t inner_r, outer_r;
+    pixman_image_t *result;
+
+    inner_c = create_random_point();
+    outer_c = create_random_point();
+    inner_r = lcg_rand();
+    outer_r = lcg_rand();
+
+    stops = create_random_stops (&n_stops);
+
+    if (!stops)
+	return NULL;
+
+    result = pixman_image_create_radial_gradient (
+	&inner_c, &outer_c, inner_r, outer_r, stops, n_stops);
+
+    free (stops);
+
+    return result;
+}
+
+static pixman_image_t *
+create_random_conical_image (void)
+{
+    pixman_point_fixed_t center;
+    pixman_fixed_t start_angle;
+    pixman_gradient_stop_t *stop_list;
+    pixman_image_t *gradient;
+    int stop_count;
+
+    /* Center and angle are drawn before the stops */
+    center = create_random_point();
+    start_angle = lcg_rand();
+
+    stop_list = create_random_stops (&stop_count);
+    if (stop_list == NULL)
+	return NULL;
+
+    gradient = pixman_image_create_conical_gradient (
+	&center, start_angle, stop_list, stop_count);
+    /* Free the stop array before handing the image back */
+    free (stop_list);
+    return gradient;
+}
+
+static pixman_image_t *
+create_random_image (void)
+{
+    pixman_image_t *image;
+
+    /* Pick one of the five image kinds; bits images double as fallback */
+    switch (lcg_rand_n (5))
+    {
+    case 1:
+	image = create_random_solid_image ();
+	break;
+
+    case 2:
+	image = create_random_linear_image ();
+	break;
+
+    case 3:
+	image = create_random_radial_image ();
+	break;
+
+    case 4:
+	image = create_random_conical_image ();
+	break;
+
+    case 0:
+    default:
+	image = create_random_bits_image ();
+	break;
+    }
+    if (image == NULL)
+	return NULL;
+    set_general_properties (image, TRUE);
+    return image;
+}
+
+static const pixman_op_t op_list[] =
+{   /* Operators that run_test() indexes with lcg_rand_n() */
+    PIXMAN_OP_SRC,	/* SRC, OVER and ADD appear twice in this table, */
+    PIXMAN_OP_OVER,	/* presumably so they are picked more often -    */
+    PIXMAN_OP_ADD,	/* TODO confirm the duplication is intentional   */
+    PIXMAN_OP_CLEAR,
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_DST,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_OVER_REVERSE,
+    PIXMAN_OP_IN,
+    PIXMAN_OP_IN_REVERSE,
+    PIXMAN_OP_OUT,
+    PIXMAN_OP_OUT_REVERSE,
+    PIXMAN_OP_ATOP,
+    PIXMAN_OP_ATOP_REVERSE,
+    PIXMAN_OP_XOR,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_SATURATE,
+    PIXMAN_OP_DISJOINT_CLEAR,
+    PIXMAN_OP_DISJOINT_SRC,
+    PIXMAN_OP_DISJOINT_DST,
+    PIXMAN_OP_DISJOINT_OVER,
+    PIXMAN_OP_DISJOINT_OVER_REVERSE,
+    PIXMAN_OP_DISJOINT_IN,
+    PIXMAN_OP_DISJOINT_IN_REVERSE,
+    PIXMAN_OP_DISJOINT_OUT,
+    PIXMAN_OP_DISJOINT_OUT_REVERSE,
+    PIXMAN_OP_DISJOINT_ATOP,
+    PIXMAN_OP_DISJOINT_ATOP_REVERSE,
+    PIXMAN_OP_DISJOINT_XOR,
+    PIXMAN_OP_CONJOINT_CLEAR,
+    PIXMAN_OP_CONJOINT_SRC,
+    PIXMAN_OP_CONJOINT_DST,
+    PIXMAN_OP_CONJOINT_OVER,
+    PIXMAN_OP_CONJOINT_OVER_REVERSE,
+    PIXMAN_OP_CONJOINT_IN,
+    PIXMAN_OP_CONJOINT_IN_REVERSE,
+    PIXMAN_OP_CONJOINT_OUT,
+    PIXMAN_OP_CONJOINT_OUT_REVERSE,
+    PIXMAN_OP_CONJOINT_ATOP,
+    PIXMAN_OP_CONJOINT_ATOP_REVERSE,
+    PIXMAN_OP_CONJOINT_XOR,
+    PIXMAN_OP_MULTIPLY,
+    PIXMAN_OP_SCREEN,
+    PIXMAN_OP_OVERLAY,
+    PIXMAN_OP_DARKEN,
+    PIXMAN_OP_LIGHTEN,
+    PIXMAN_OP_COLOR_DODGE,
+    PIXMAN_OP_COLOR_BURN,
+    PIXMAN_OP_HARD_LIGHT,
+    PIXMAN_OP_DIFFERENCE,
+    PIXMAN_OP_EXCLUSION,
+    PIXMAN_OP_SOFT_LIGHT,
+    PIXMAN_OP_HSL_HUE,
+    PIXMAN_OP_HSL_SATURATION,
+    PIXMAN_OP_HSL_COLOR,
+    PIXMAN_OP_HSL_LUMINOSITY,
+};
+
+static void
+run_test (uint32_t seed, pixman_bool_t verbose, uint32_t mod)
+{
+    pixman_image_t *source, *mask, *dest;
+
+    /* With mod == 0 every seed is reported, otherwise only multiples */
+    if (verbose && (mod == 0 || (seed % mod) == 0))
+	printf ("Seed 0x%08x\n", seed);
+
+    lcg_srand (seed);
+
+    source = create_random_image ();
+    mask   = create_random_image ();
+    dest   = create_random_bits_image ();
+
+    if (source && mask && dest)
+    {
+	pixman_op_t op;
+
+	set_general_properties (dest, TRUE);
+
+	op = op_list [lcg_rand_n (ARRAY_LENGTH (op_list))];
+
+	pixman_image_composite32 (op,
+				  source, mask, dest,
+				  log_rand(), log_rand(),
+				  log_rand(), log_rand(),
+				  log_rand(), log_rand(),
+				  absolute (log_rand()),
+				  absolute (log_rand()));
+    }
+    /* Release whichever of the three images were actually created */
+    if (source)
+	pixman_image_unref (source);
+    if (mask)
+	pixman_image_unref (mask);
+    if (dest)
+	pixman_image_unref (dest);
+}
+
+static pixman_bool_t
+get_int (char *s, uint32_t *i)
+{
+    char *end;			/* first character strtoul did not consume */
+    unsigned long value;	/* unsigned: strtol clamps at 0x7fffffff if long is 32 bits */
+    /* Parses s (base auto-detected); stores into *i only on a full match */
+    value = strtoul (s, &end, 0);
+    /* Accept only if at least one char was consumed and nothing trails */
+    if (end != s && *end == 0)
+    {
+	*i = (uint32_t) value;
+	return TRUE;
+    }
+
+    return FALSE;
+}
+
+int
+main (int argc, char **argv)
+{
+    int verbose = FALSE;
+    uint32_t seed = 1;
+    uint32_t n_tests = 0xffffffff;	/* sentinel: "-n" not given */
+    uint32_t mod = 0;
+    pixman_bool_t use_threads = TRUE;
+    uint32_t i;
+    int arg;
+
+    pixman_disable_out_of_bounds_workaround ();
+    enable_fp_exceptions();
+
+    if (getenv ("VERBOSE") != NULL)
+	verbose = TRUE;
+
+    for (arg = 1; arg < argc; ++arg)
+    {
+	if (strcmp (argv[arg], "-v") == 0)
+	{
+	    verbose = TRUE;
+	    /* The count is optional: skip the next argument only if numeric */
+	    if (arg + 1 < argc && get_int (argv[arg + 1], &mod))
+		arg++;
+	}
+	else if (strcmp (argv[arg], "-s") == 0 && arg + 1 < argc &&
+		 get_int (argv[arg + 1], &seed))
+	{
+	    /* An explicit seed selects the non-OpenMP loop below */
+	    use_threads = FALSE;
+	    arg++;
+	}
+	else if (strcmp (argv[arg], "-n") == 0 && arg + 1 < argc &&
+		 get_int (argv[arg + 1], &n_tests))
+	{
+	    arg++;
+	}
+	else
+	{
+	    /* Also reached for -s/-n with a missing or malformed value */
+	    if (strcmp (argv[arg], "-h") != 0)
+		printf ("Unknown option '%s'\n\n", argv[arg]);
+
+	    printf ("Options:\n\n"
+		    "-n <number>        Number of tests to run\n"
+		    "-s <seed> 	        Seed of first test (ignored if PIXMAN_RANDOMIZE_TESTS is set)\n"
+		    "-v                 Print out seeds\n"
+		    "-v <n>             Print out every n'th seed\n\n");
+
+	    exit (-1);
+	}
+    }
+
+    if (n_tests == 0xffffffff)
+	n_tests = 8000;
+
+    if (getenv ("PIXMAN_RANDOMIZE_TESTS"))
+    {
+	seed = get_random_seed();
+	printf ("First seed: 0x%08x\n", seed);
+    }
+
+    /* Index from 0: "i < seed + n_tests" wraps to no tests for big seeds */
+    if (use_threads)
+    {
+#ifdef USE_OPENMP
+#   pragma omp parallel for default(none) shared(verbose, n_tests, mod, seed)
+#endif
+	for (i = 0; i < n_tests; ++i)
+	    run_test (seed + i, verbose, mod);
+    }
+    else
+    {
+	for (i = 0; i < n_tests; ++i)
+	    run_test (seed + i, verbose, mod);
+    }
+
+    return 0;
+}
diff --git a/pixman/test/trap-crasher.c b/pixman/test/trap-crasher.c
index 96f3b0bab..7485e62fd 100644
--- a/pixman/test/trap-crasher.c
+++ b/pixman/test/trap-crasher.c
@@ -1,27 +1,27 @@
-#include <stdlib.h>
-#include <pixman.h>
-
-int
-main()
-{
-    pixman_image_t *dst;
-    pixman_trapezoid_t traps[1] = {
-	{
-	    2147483646,
-	    2147483647,
-	    {
-		{ 0, 0 },
-		{ 0, 2147483647 }
-	    },
-	    {
-		{ 65536, 0 },
-		{ 0, 2147483647 }
-	    }
-	},
-    };
-
-    dst = pixman_image_create_bits (PIXMAN_a8, 1, 1, NULL, -1);
-
-    pixman_add_trapezoids (dst, 0, 0, sizeof (traps)/sizeof (traps[0]), traps);
-    return (0);
-}
+#include <stdlib.h>
+#include <pixman.h>
+
+int
+main()
+{
+    pixman_image_t *dst;
+    pixman_trapezoid_t traps[1] = {	/* one trapezoid with extreme coordinates */
+	{
+	    2147483646,		/* INT32_MAX - 1 */
+	    2147483647,		/* INT32_MAX */
+	    {
+		{ 0, 0 },
+		{ 0, 2147483647 }
+	    },
+	    {
+		{ 65536, 0 },	/* presumably 1.0 in 16.16 fixed point - confirm */
+		{ 0, 2147483647 }
+	    }
+	},
+    };
+    /* NOTE(review): dst is neither checked for NULL nor unreferenced */
+    dst = pixman_image_create_bits (PIXMAN_a8, 1, 1, NULL, -1);
+    /* Passes the whole traps array (element count via sizeof) at offset 0, 0 */
+    pixman_add_trapezoids (dst, 0, 0, sizeof (traps)/sizeof (traps[0]), traps);
+    return (0);
+}
diff --git a/pixman/test/utils.c b/pixman/test/utils.c
index 46b48415f..adabd75dd 100644
--- a/pixman/test/utils.c
+++ b/pixman/test/utils.c
@@ -1,704 +1,704 @@
-#define _GNU_SOURCE
-
-#include "utils.h"
-#include <signal.h>
-
-#ifdef HAVE_GETTIMEOFDAY
-#include <sys/time.h>
-#else
-#include <time.h>
-#endif
-
-#ifdef HAVE_UNISTD_H
-#include <unistd.h>
-#endif
-
-#ifdef HAVE_SYS_MMAN_H
-#include <sys/mman.h>
-#endif
-
-#ifdef HAVE_FENV_H
-#include <fenv.h>
-#endif
-
-#ifdef HAVE_LIBPNG
-#include <png.h>
-#endif
-
-/* Random number seed
- */
-
-uint32_t lcg_seed;
-
-/*----------------------------------------------------------------------------*\
- *  CRC-32 version 2.0.0 by Craig Bruce, 2006-04-29.
- *
- *  This program generates the CRC-32 values for the files named in the
- *  command-line arguments.  These are the same CRC-32 values used by GZIP,
- *  PKZIP, and ZMODEM.  The Crc32_ComputeBuf () can also be detached and
- *  used independently.
- *
- *  THIS PROGRAM IS PUBLIC-DOMAIN SOFTWARE.
- *
- *  Based on the byte-oriented implementation "File Verification Using CRC"
- *  by Mark R. Nelson in Dr. Dobb's Journal, May 1992, pp. 64-67.
- *
- *  v1.0.0: original release.
- *  v1.0.1: fixed printf formats.
- *  v1.0.2: fixed something else.
- *  v1.0.3: replaced CRC constant table by generator function.
- *  v1.0.4: reformatted code, made ANSI C.  1994-12-05.
- *  v2.0.0: rewrote to use memory buffer & static table, 2006-04-29.
-\*----------------------------------------------------------------------------*/
-
-/*----------------------------------------------------------------------------*\
- *  NAME:
- *     Crc32_ComputeBuf () - computes the CRC-32 value of a memory buffer
- *  DESCRIPTION:
- *     Computes or accumulates the CRC-32 value for a memory buffer.
- *     The 'inCrc32' gives a previously accumulated CRC-32 value to allow
- *     a CRC to be generated for multiple sequential buffer-fuls of data.
- *     The 'inCrc32' for the first buffer must be zero.
- *  ARGUMENTS:
- *     inCrc32 - accumulated CRC-32 value, must be 0 on first call
- *     buf     - buffer to compute CRC-32 value for
- *     bufLen  - number of bytes in buffer
- *  RETURNS:
- *     crc32 - computed CRC-32 value
- *  ERRORS:
- *     (no errors are possible)
-\*----------------------------------------------------------------------------*/
-
-uint32_t
-compute_crc32 (uint32_t    in_crc32,
-	       const void *buf,
-	       size_t      buf_len)
-{
-    static const uint32_t crc_table[256] = {
-	0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F,
-	0xE963A535, 0x9E6495A3, 0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988,
-	0x09B64C2B, 0x7EB17CBD,	0xE7B82D07, 0x90BF1D91, 0x1DB71064, 0x6AB020F2,
-	0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
-	0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC,	0x14015C4F, 0x63066CD9,
-	0xFA0F3D63, 0x8D080DF5, 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172,
-	0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, 0x35B5A8FA, 0x42B2986C,
-	0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
-	0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423,
-	0xCFBA9599, 0xB8BDA50F, 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924,
-	0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, 0x76DC4190, 0x01DB7106,
-	0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
-	0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D,
-	0x91646C97, 0xE6635C01, 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E,
-	0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, 0x65B0D9C6, 0x12B7E950,
-	0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
-	0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7,
-	0xA4D1C46D, 0xD3D6F4FB, 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0,
-	0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9, 0x5005713C, 0x270241AA,
-	0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
-	0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81,
-	0xB7BD5C3B, 0xC0BA6CAD, 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A,
-	0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683, 0xE3630B12, 0x94643B84,
-	0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
-	0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB,
-	0x196C3671, 0x6E6B06E7, 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC,
-	0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, 0xD6D6A3E8, 0xA1D1937E,
-	0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
-	0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55,
-	0x316E8EEF, 0x4669BE79, 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236,
-	0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, 0xC5BA3BBE, 0xB2BD0B28,
-	0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
-	0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F,
-	0x72076785, 0x05005713, 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38,
-	0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21, 0x86D3D2D4, 0xF1D4E242,
-	0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
-	0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69,
-	0x616BFFD3, 0x166CCF45, 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2,
-	0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB, 0xAED16A4A, 0xD9D65ADC,
-	0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
-	0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693,
-	0x54DE5729, 0x23D967BF, 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94,
-	0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
-    };
-
-    uint32_t              crc32;
-    unsigned char *       byte_buf;
-    size_t                i;
-
-    /* accumulate crc32 for buffer */
-    crc32 = in_crc32 ^ 0xFFFFFFFF;
-    byte_buf = (unsigned char*) buf;
-
-    for (i = 0; i < buf_len; i++)
-	crc32 = (crc32 >> 8) ^ crc_table[(crc32 ^ byte_buf[i]) & 0xFF];
-
-    return (crc32 ^ 0xFFFFFFFF);
-}
-
-pixman_bool_t
-is_little_endian (void)
-{
-    volatile uint16_t endian_check_var = 0x1234;
-
-    return (*(volatile uint8_t *)&endian_check_var == 0x34);
-}
-
-/* perform endian conversion of pixel data
- */
-void
-image_endian_swap (pixman_image_t *img)
-{
-    int stride = pixman_image_get_stride (img);
-    uint32_t *data = pixman_image_get_data (img);
-    int height = pixman_image_get_height (img);
-    int bpp = PIXMAN_FORMAT_BPP (pixman_image_get_format (img));
-    int i, j;
-
-    /* swap bytes only on big endian systems */
-    if (is_little_endian())
-	return;
-
-    if (bpp == 8)
-	return;
-
-    for (i = 0; i < height; i++)
-    {
-	uint8_t *line_data = (uint8_t *)data + stride * i;
-	
-	switch (bpp)
-	{
-	case 1:
-	    for (j = 0; j < stride; j++)
-	    {
-		line_data[j] =
-		    ((line_data[j] & 0x80) >> 7) |
-		    ((line_data[j] & 0x40) >> 5) |
-		    ((line_data[j] & 0x20) >> 3) |
-		    ((line_data[j] & 0x10) >> 1) |
-		    ((line_data[j] & 0x08) << 1) |
-		    ((line_data[j] & 0x04) << 3) |
-		    ((line_data[j] & 0x02) << 5) |
-		    ((line_data[j] & 0x01) << 7);
-	    }
-	    break;
-	case 4:
-	    for (j = 0; j < stride; j++)
-	    {
-		line_data[j] = (line_data[j] >> 4) | (line_data[j] << 4);
-	    }
-	    break;
-	case 16:
-	    for (j = 0; j + 2 <= stride; j += 2)
-	    {
-		char t1 = line_data[j + 0];
-		char t2 = line_data[j + 1];
-
-		line_data[j + 1] = t1;
-		line_data[j + 0] = t2;
-	    }
-	    break;
-	case 24:
-	    for (j = 0; j + 3 <= stride; j += 3)
-	    {
-		char t1 = line_data[j + 0];
-		char t2 = line_data[j + 1];
-		char t3 = line_data[j + 2];
-
-		line_data[j + 2] = t1;
-		line_data[j + 1] = t2;
-		line_data[j + 0] = t3;
-	    }
-	    break;
-	case 32:
-	    for (j = 0; j + 4 <= stride; j += 4)
-	    {
-		char t1 = line_data[j + 0];
-		char t2 = line_data[j + 1];
-		char t3 = line_data[j + 2];
-		char t4 = line_data[j + 3];
-
-		line_data[j + 3] = t1;
-		line_data[j + 2] = t2;
-		line_data[j + 1] = t3;
-		line_data[j + 0] = t4;
-	    }
-	    break;
-	default:
-	    assert (FALSE);
-	    break;
-	}
-    }
-}
-
-#define N_LEADING_PROTECTED	10
-#define N_TRAILING_PROTECTED	10
-
-typedef struct
-{
-    void *addr;
-    uint32_t len;
-    uint8_t *trailing;
-    int n_bytes;
-} info_t;
-
-#if defined(HAVE_MPROTECT) && defined(HAVE_GETPAGESIZE) && defined(HAVE_SYS_MMAN_H) && defined(HAVE_MMAP)
-
-/* This is apparently necessary on at least OS X */
-#ifndef MAP_ANONYMOUS
-#define MAP_ANONYMOUS MAP_ANON
-#endif
-
-void *
-fence_malloc (int64_t len)
-{
-    unsigned long page_size = getpagesize();
-    unsigned long page_mask = page_size - 1;
-    uint32_t n_payload_bytes = (len + page_mask) & ~page_mask;
-    uint32_t n_bytes =
-	(page_size * (N_LEADING_PROTECTED + N_TRAILING_PROTECTED + 2) +
-	 n_payload_bytes) & ~page_mask;
-    uint8_t *initial_page;
-    uint8_t *leading_protected;
-    uint8_t *trailing_protected;
-    uint8_t *payload;
-    uint8_t *addr;
-
-    if (len < 0)
-	abort();
-    
-    addr = mmap (NULL, n_bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS,
-		 -1, 0);
-
-    if (addr == MAP_FAILED)
-    {
-	printf ("mmap failed on %lld %u\n", (long long int)len, n_bytes);
-	return NULL;
-    }
-
-    initial_page = (uint8_t *)(((unsigned long)addr + page_mask) & ~page_mask);
-    leading_protected = initial_page + page_size;
-    payload = leading_protected + N_LEADING_PROTECTED * page_size;
-    trailing_protected = payload + n_payload_bytes;
-
-    ((info_t *)initial_page)->addr = addr;
-    ((info_t *)initial_page)->len = len;
-    ((info_t *)initial_page)->trailing = trailing_protected;
-    ((info_t *)initial_page)->n_bytes = n_bytes;
-
-    if ((mprotect (leading_protected, N_LEADING_PROTECTED * page_size,
-		  PROT_NONE) == -1) ||
-	(mprotect (trailing_protected, N_TRAILING_PROTECTED * page_size,
-		  PROT_NONE) == -1))
-    {
-	munmap (addr, n_bytes);
-	return NULL;
-    }
-
-    return payload;
-}
-
-void
-fence_free (void *data)
-{
-    uint32_t page_size = getpagesize();
-    uint8_t *payload = data;
-    uint8_t *leading_protected = payload - N_LEADING_PROTECTED * page_size;
-    uint8_t *initial_page = leading_protected - page_size;
-    info_t *info = (info_t *)initial_page;
-
-    munmap (info->addr, info->n_bytes);
-}
-
-#else
-
-void *
-fence_malloc (int64_t len)
-{
-    return malloc (len);
-}
-
-void
-fence_free (void *data)
-{
-    free (data);
-}
-
-#endif
-
-uint8_t *
-make_random_bytes (int n_bytes)
-{
-    uint8_t *bytes = fence_malloc (n_bytes);
-    int i;
-
-    if (!bytes)
-	return NULL;
-
-    for (i = 0; i < n_bytes; ++i)
-	bytes[i] = lcg_rand () & 0xff;
-
-    return bytes;
-}
-
-#ifdef HAVE_LIBPNG
-
-static void
-pngify_pixels (uint32_t *pixels, int n_pixels)
-{
-    int i;
-
-    for (i = 0; i < n_pixels; ++i)
-    {
-	uint32_t p = pixels[i];
-	uint8_t *out = (uint8_t *)&(pixels[i]);
-	uint8_t a, r, g, b;
-
-	a = (p & 0xff000000) >> 24;
-	r = (p & 0x00ff0000) >> 16;
-	g = (p & 0x0000ff00) >> 8;
-	b = (p & 0x000000ff) >> 0;
-
-	if (a != 0)
-	{
-	    r = (r * 255) / a;
-	    g = (g * 255) / a;
-	    b = (b * 255) / a;
-	}
-
-	*out++ = r;
-	*out++ = g;
-	*out++ = b;
-	*out++ = a;
-    }
-}
-
-pixman_bool_t
-write_png (pixman_image_t *image, const char *filename)
-{
-    int width = pixman_image_get_width (image);
-    int height = pixman_image_get_height (image);
-    int stride = width * 4;
-    uint32_t *data = malloc (height * stride);
-    pixman_image_t *copy;
-    png_struct *write_struct;
-    png_info *info_struct;
-    pixman_bool_t result = FALSE;
-    FILE *f = fopen (filename, "wb");
-    png_bytep *row_pointers;
-    int i;
-
-    if (!f)
-	return FALSE;
-
-    row_pointers = malloc (height * sizeof (png_bytep));
-
-    copy = pixman_image_create_bits (
-	PIXMAN_a8r8g8b8, width, height, data, stride);
-
-    pixman_image_composite32 (
-	PIXMAN_OP_SRC, image, NULL, copy, 0, 0, 0, 0, 0, 0, width, height);
-
-    pngify_pixels (data, height * width);
-
-    for (i = 0; i < height; ++i)
-	row_pointers[i] = (png_bytep)(data + i * width);
-
-    if (!(write_struct = png_create_write_struct (
-	      PNG_LIBPNG_VER_STRING, NULL, NULL, NULL)))
-	goto out1;
-
-    if (!(info_struct = png_create_info_struct (write_struct)))
-	goto out2;
-
-    png_init_io (write_struct, f);
-
-    png_set_IHDR (write_struct, info_struct, width, height,
-		  8, PNG_COLOR_TYPE_RGB_ALPHA,
-		  PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE,
-		  PNG_FILTER_TYPE_BASE);
-
-    png_write_info (write_struct, info_struct);
-
-    png_write_image (write_struct, row_pointers);
-
-    png_write_end (write_struct, NULL);
-
-    result = TRUE;
-
-out2:
-    png_destroy_write_struct (&write_struct, &info_struct);
-
-out1:
-    if (fclose (f) != 0)
-	result = FALSE;
-
-    pixman_image_unref (copy);
-    free (row_pointers);
-    free (data);
-    return result;
-}
-
-#else /* no libpng */
-
-pixman_bool_t
-write_png (pixman_image_t *image, const char *filename)
-{
-    return FALSE;
-}
-
-#endif
-
-/*
- * A function, which can be used as a core part of the test programs,
- * intended to detect various problems with the help of fuzzing input
- * to pixman API (according to some templates, aka "smart" fuzzing).
- * Some general information about such testing can be found here:
- * http://en.wikipedia.org/wiki/Fuzz_testing
- *
- * It may help detecting:
- *  - crashes on bad handling of valid or reasonably invalid input to
- *    pixman API.
- *  - deviations from the behavior of older pixman releases.
- *  - deviations from the behavior of the same pixman release, but
- *    configured in a different way (for example with SIMD optimizations
- *    disabled), or running on a different OS or hardware.
- *
- * The test is performed by calling a callback function a huge number
- * of times. The callback function is expected to run some snippet of
- * pixman code with pseudorandom variations to the data feeded to
- * pixman API. A result of running each callback function should be
- * some deterministic value which depends on test number (test number
- * can be used as a seed for PRNG). When 'verbose' argument is nonzero,
- * callback function is expected to print to stdout some information
- * about what it does.
- *
- * Return values from many small tests are accumulated together and
- * used as final checksum, which can be compared to some expected
- * value. Running the tests not individually, but in a batch helps
- * to reduce process start overhead and also allows to parallelize
- * testing and utilize multiple CPU cores.
- *
- * The resulting executable can be run without any arguments. In
- * this case it runs a batch of tests starting from 1 and up to
- * 'default_number_of_iterations'. The resulting checksum is
- * compared with 'expected_checksum' and FAIL or PASS verdict
- * depends on the result of this comparison.
- *
- * If the executable is run with 2 numbers provided as command line
- * arguments, they specify the starting and ending numbers for a test
- * batch.
- *
- * If the executable is run with only one number provided as a command
- * line argument, then this number is used to call the callback function
- * once, and also with verbose flag set.
- */
-int
-fuzzer_test_main (const char *test_name,
-		  int         default_number_of_iterations,
-		  uint32_t    expected_checksum,
-		  uint32_t    (*test_function)(int testnum, int verbose),
-		  int         argc,
-		  const char *argv[])
-{
-    int i, n1 = 1, n2 = 0;
-    uint32_t checksum = 0;
-    int verbose = getenv ("VERBOSE") != NULL;
-
-    if (argc >= 3)
-    {
-	n1 = atoi (argv[1]);
-	n2 = atoi (argv[2]);
-	if (n2 < n1)
-	{
-	    printf ("invalid test range\n");
-	    return 1;
-	}
-    }
-    else if (argc >= 2)
-    {
-	n2 = atoi (argv[1]);
-	checksum = test_function (n2, 1);
-	printf ("%d: checksum=%08X\n", n2, checksum);
-	return 0;
-    }
-    else
-    {
-	n1 = 1;
-	n2 = default_number_of_iterations;
-    }
-
-#ifdef USE_OPENMP
-    #pragma omp parallel for reduction(+:checksum) default(none) \
-					shared(n1, n2, test_function, verbose)
-#endif
-    for (i = n1; i <= n2; i++)
-    {
-	uint32_t crc = test_function (i, 0);
-	if (verbose)
-	    printf ("%d: %08X\n", i, crc);
-	checksum += crc;
-    }
-
-    if (n1 == 1 && n2 == default_number_of_iterations)
-    {
-	if (checksum == expected_checksum)
-	{
-	    printf ("%s test passed (checksum=%08X)\n",
-		    test_name, checksum);
-	}
-	else
-	{
-	    printf ("%s test failed! (checksum=%08X, expected %08X)\n",
-		    test_name, checksum, expected_checksum);
-	    return 1;
-	}
-    }
-    else
-    {
-	printf ("%d-%d: checksum=%08X\n", n1, n2, checksum);
-    }
-
-    return 0;
-}
-
-/* Try to obtain current time in seconds */
-double
-gettime (void)
-{
-#ifdef HAVE_GETTIMEOFDAY
-    struct timeval tv;
-
-    gettimeofday (&tv, NULL);
-    return (double)((int64_t)tv.tv_sec * 1000000 + tv.tv_usec) / 1000000.;
-#else
-    return (double)clock() / (double)CLOCKS_PER_SEC;
-#endif
-}
-
-uint32_t
-get_random_seed (void)
-{
-    double d = gettime();
-
-    lcg_srand (*(uint32_t *)&d);
-
-    return lcg_rand_u32 ();
-}
-
-static const char *global_msg;
-
-static void
-on_alarm (int signo)
-{
-    printf ("%s\n", global_msg);
-    exit (1);
-}
-
-void
-fail_after (int seconds, const char *msg)
-{
-#ifdef HAVE_SIGACTION
-#ifdef HAVE_ALARM
-    struct sigaction action;
-
-    global_msg = msg;
-
-    memset (&action, 0, sizeof (action));
-    action.sa_handler = on_alarm;
-
-    alarm (seconds);
-
-    sigaction (SIGALRM, &action, NULL);
-#endif
-#endif
-}
-
-void
-enable_fp_exceptions (void)
-{
-#ifdef HAVE_FENV_H
-#ifdef HAVE_FEENABLEEXCEPT
-    /* Note: we don't enable the FE_INEXACT trap because
-     * that happens quite commonly. It is possible that
-     * over- and underflow should similarly be considered
-     * okay, but for now the test suite passes with them
-     * enabled, and it's useful to know if they start
-     * occuring.
-     */
-    feenableexcept (FE_DIVBYZERO	|
-		    FE_INVALID		|
-		    FE_OVERFLOW		|
-		    FE_UNDERFLOW);
-#endif
-#endif
-}
-
-void *
-aligned_malloc (size_t align, size_t size)
-{
-    void *result;
-
-#ifdef HAVE_POSIX_MEMALIGN
-    if (posix_memalign (&result, align, size) != 0)
-      result = NULL;
-#else
-    result = malloc (size);
-#endif
-
-    return result;
-}
-
-#define CONVERT_15(c, is_rgb)						\
-    (is_rgb?								\
-     ((((c) >> 3) & 0x001f) |						\
-      (((c) >> 6) & 0x03e0) |						\
-      (((c) >> 9) & 0x7c00)) :						\
-     (((((c) >> 16) & 0xff) * 153 +					\
-       (((c) >>  8) & 0xff) * 301 +					\
-       (((c)      ) & 0xff) * 58) >> 2))
-
-void
-initialize_palette (pixman_indexed_t *palette, uint32_t depth, int is_rgb)
-{
-    int i;
-    uint32_t mask = (1 << depth) - 1;
-
-    for (i = 0; i < 32768; ++i)
-	palette->ent[i] = lcg_rand() & mask;
-
-    memset (palette->rgba, 0, sizeof (palette->rgba));
-
-    for (i = 0; i < mask + 1; ++i)
-    {
-	uint32_t rgba24;
- 	pixman_bool_t retry;
-	uint32_t i15;
-
-	/* We filled the rgb->index map with random numbers, but we
-	 * do need the ability to round trip, that is if some indexed
-	 * color expands to an argb24, then the 15 bit version of that
-	 * color must map back to the index. Anything else, we don't
-	 * care about too much.
-	 */
-	do
-	{
-	    uint32_t old_idx;
-
-	    rgba24 = lcg_rand();
-	    i15 = CONVERT_15 (rgba24, is_rgb);
-
-	    old_idx = palette->ent[i15];
-	    if (CONVERT_15 (palette->rgba[old_idx], is_rgb) == i15)
-		retry = 1;
-	    else
-		retry = 0;
-	} while (retry);
-
-	palette->rgba[i] = rgba24;
-	palette->ent[i15] = i;
-    }
-
-    for (i = 0; i < mask + 1; ++i)
-    {
-	assert (palette->ent[CONVERT_15 (palette->rgba[i], is_rgb)] == i);
-    }
-}
+#define _GNU_SOURCE
+
+#include "utils.h"
+#include <signal.h>
+
+#ifdef HAVE_GETTIMEOFDAY
+#include <sys/time.h>
+#else
+#include <time.h>
+#endif
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+
+#ifdef HAVE_FENV_H
+#include <fenv.h>
+#endif
+
+#ifdef HAVE_LIBPNG
+#include <png.h>
+#endif
+
+/* Random number seed
+ */
+
+uint32_t lcg_seed;
+
+/*----------------------------------------------------------------------------*\
+ *  CRC-32 version 2.0.0 by Craig Bruce, 2006-04-29.
+ *
+ *  This program generates the CRC-32 values for the files named in the
+ *  command-line arguments.  These are the same CRC-32 values used by GZIP,
+ *  PKZIP, and ZMODEM.  The Crc32_ComputeBuf () can also be detached and
+ *  used independently.
+ *
+ *  THIS PROGRAM IS PUBLIC-DOMAIN SOFTWARE.
+ *
+ *  Based on the byte-oriented implementation "File Verification Using CRC"
+ *  by Mark R. Nelson in Dr. Dobb's Journal, May 1992, pp. 64-67.
+ *
+ *  v1.0.0: original release.
+ *  v1.0.1: fixed printf formats.
+ *  v1.0.2: fixed something else.
+ *  v1.0.3: replaced CRC constant table by generator function.
+ *  v1.0.4: reformatted code, made ANSI C.  1994-12-05.
+ *  v2.0.0: rewrote to use memory buffer & static table, 2006-04-29.
+\*----------------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------------*\
+ *  NAME:
+ *     compute_crc32 () - computes the CRC-32 value of a memory buffer
+ *  DESCRIPTION:
+ *     Computes or accumulates the CRC-32 value for a memory buffer.
+ *     The 'in_crc32' gives a previously accumulated CRC-32 value to allow
+ *     a CRC to be generated for multiple sequential buffer-fuls of data.
+ *     The 'in_crc32' for the first buffer must be zero.
+ *  ARGUMENTS:
+ *     in_crc32 - accumulated CRC-32 value, must be 0 on first call
+ *     buf      - buffer to compute CRC-32 value for
+ *     buf_len  - number of bytes in buffer
+ *  RETURNS:
+ *     crc32 - computed CRC-32 value
+ *  ERRORS:
+ *     (no errors are possible)
+\*----------------------------------------------------------------------------*/
+
+uint32_t
+compute_crc32 (uint32_t    in_crc32,
+	       const void *buf,
+	       size_t      buf_len)
+{
+    static const uint32_t crc_table[256] = {
+	0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F,
+	0xE963A535, 0x9E6495A3, 0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988,
+	0x09B64C2B, 0x7EB17CBD,	0xE7B82D07, 0x90BF1D91, 0x1DB71064, 0x6AB020F2,
+	0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
+	0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC,	0x14015C4F, 0x63066CD9,
+	0xFA0F3D63, 0x8D080DF5, 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172,
+	0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, 0x35B5A8FA, 0x42B2986C,
+	0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
+	0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423,
+	0xCFBA9599, 0xB8BDA50F, 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924,
+	0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, 0x76DC4190, 0x01DB7106,
+	0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
+	0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D,
+	0x91646C97, 0xE6635C01, 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E,
+	0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, 0x65B0D9C6, 0x12B7E950,
+	0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
+	0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7,
+	0xA4D1C46D, 0xD3D6F4FB, 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0,
+	0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9, 0x5005713C, 0x270241AA,
+	0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
+	0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81,
+	0xB7BD5C3B, 0xC0BA6CAD, 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A,
+	0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683, 0xE3630B12, 0x94643B84,
+	0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
+	0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB,
+	0x196C3671, 0x6E6B06E7, 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC,
+	0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, 0xD6D6A3E8, 0xA1D1937E,
+	0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
+	0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55,
+	0x316E8EEF, 0x4669BE79, 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236,
+	0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, 0xC5BA3BBE, 0xB2BD0B28,
+	0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
+	0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F,
+	0x72076785, 0x05005713, 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38,
+	0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21, 0x86D3D2D4, 0xF1D4E242,
+	0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
+	0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69,
+	0x616BFFD3, 0x166CCF45, 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2,
+	0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB, 0xAED16A4A, 0xD9D65ADC,
+	0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
+	0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693,
+	0x54DE5729, 0x23D967BF, 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94,
+	0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
+    };
+
+    const unsigned char *cursor = (const unsigned char *) buf;
+    const unsigned char *stop = cursor + buf_len;
+    uint32_t state;
+
+    /* The running value is kept bit-inverted while bytes are folded in */
+    state = in_crc32 ^ 0xFFFFFFFF;
+
+    /* One table lookup per input byte */
+    for (; cursor != stop; ++cursor)
+	state = crc_table[(state ^ *cursor) & 0xFF] ^ (state >> 8);
+
+    return (state ^ 0xFFFFFFFF);
+}
+
+pixman_bool_t
+is_little_endian (void)
+{
+    volatile uint16_t probe = 0x1234;	/* high byte 0x12, low byte 0x34 */
+    volatile uint8_t *first_byte = (volatile uint8_t *)&probe;
+    return (*first_byte == 0x34);	/* low byte stored first: little endian */
+}
+
+/* Perform endian conversion of pixel data, in place.
+ *
+ * Nothing is done on little endian hosts or for 8 bpp images. Otherwise
+ * every row of 'stride' bytes is rewritten according to the format's bpp:
+ *   1 bpp        - the bit order inside each byte is mirrored
+ *   4 bpp        - the two nibbles of each byte trade places
+ *   16/24/32 bpp - the bytes of each complete pixel are reversed
+ * Any other depth trips an assertion.
+ */
+void
+image_endian_swap (pixman_image_t *img)
+{
+    int stride = pixman_image_get_stride (img);
+    uint32_t *data = pixman_image_get_data (img);
+    int height = pixman_image_get_height (img);
+    int bpp = PIXMAN_FORMAT_BPP (pixman_image_get_format (img));
+    int bytes_per_pixel = bpp / 8;
+    int y;
+
+    /* Swap bytes only on big endian systems; single-byte pixels have
+     * no byte order to fix up.
+     */
+    if (is_little_endian () || bpp == 8)
+	return;
+
+    for (y = 0; y < height; y++)
+    {
+	uint8_t *row = (uint8_t *)data + stride * y;
+	int x;
+
+	switch (bpp)
+	{
+	case 1:
+	    for (x = 0; x < stride; x++)
+	    {
+		uint8_t original = row[x];
+		uint8_t mirrored = 0;
+		int bit;
+
+		/* bit k of the input becomes bit (7 - k) of the output */
+		for (bit = 0; bit < 8; bit++)
+		{
+		    if (original & (1 << bit))
+			mirrored |= 0x80 >> bit;
+		}
+
+		row[x] = mirrored;
+	    }
+	    break;
+
+	case 4:
+	    for (x = 0; x < stride; x++)
+	    {
+		uint8_t original = row[x];
+
+		row[x] = (original << 4) | (original >> 4);
+	    }
+	    break;
+
+	case 16:
+	case 24:
+	case 32:
+	    /* A trailing partial pixel (stride not a multiple of the
+	     * pixel size) is left untouched.
+	     */
+	    for (x = 0; x + bytes_per_pixel <= stride; x += bytes_per_pixel)
+	    {
+		int lo = x;
+		int hi = x + bytes_per_pixel - 1;
+
+		while (lo < hi)
+		{
+		    uint8_t tmp = row[lo];
+
+		    row[lo++] = row[hi];
+		    row[hi--] = tmp;
+		}
+	    }
+	    break;
+
+	default:
+	    assert (FALSE);
+	    break;
+	}
+    }
+}
+
+#define N_LEADING_PROTECTED	10	/* pages made inaccessible before the payload */
+#define N_TRAILING_PROTECTED	10	/* pages made inaccessible after the payload */
+
+typedef struct	/* bookkeeping record fence_malloc() writes into its first page */
+{
+    void *addr;		/* address returned by mmap() */
+    uint32_t len;	/* length the caller asked for */
+    uint8_t *trailing;	/* start of the trailing protected pages */
+    int n_bytes;	/* size of the mapping; fence_free() hands it to munmap() */
+} info_t;
+
+#if defined(HAVE_MPROTECT) && defined(HAVE_GETPAGESIZE) && defined(HAVE_SYS_MMAN_H) && defined(HAVE_MMAP)
+
+/* This is apparently necessary on at least OS X */
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+
+/* Returns 'len' bytes (rounded up to whole pages) between PROT_NONE pages,
+ * or NULL if the request is too large or cannot be mapped. */
+void *
+fence_malloc (int64_t len)
+{
+    unsigned long page_size = getpagesize();
+    unsigned long page_mask = page_size - 1;
+    /* info_t keeps the mapping size in an int: cap requests accordingly */
+    int64_t max_len = (int64_t)0x7fffffff -
+	(int64_t)page_size * (N_LEADING_PROTECTED + N_TRAILING_PROTECTED + 3);
+    uint32_t n_payload_bytes, n_bytes;
+    uint8_t *initial_page, *leading_protected, *trailing_protected;
+    uint8_t *payload, *addr;
+
+    if (len < 0)
+	abort();
+    if (len > max_len)	/* would not fit the 32 bit sizes used below */
+	return NULL;
+    n_payload_bytes = (len + page_mask) & ~page_mask;
+    n_bytes = (page_size * (N_LEADING_PROTECTED + N_TRAILING_PROTECTED + 2) +
+	       n_payload_bytes) & ~page_mask;
+    addr = mmap (NULL, n_bytes, PROT_READ | PROT_WRITE,
+		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (addr == MAP_FAILED)
+    {
+	printf ("mmap failed on %lld %u\n", (long long int)len, n_bytes);
+	return NULL;
+    }
+
+    /* From initial_page on: info page, leading guards, payload, trailing guards */
+    initial_page = (uint8_t *)(((unsigned long)addr + page_mask) & ~page_mask);
+    leading_protected = initial_page + page_size;
+    payload = leading_protected + N_LEADING_PROTECTED * page_size;
+    trailing_protected = payload + n_payload_bytes;
+    ((info_t *)initial_page)->addr = addr;
+    ((info_t *)initial_page)->len = len;
+    ((info_t *)initial_page)->trailing = trailing_protected;
+    ((info_t *)initial_page)->n_bytes = n_bytes;
+    if ((mprotect (leading_protected, N_LEADING_PROTECTED * page_size,
+		  PROT_NONE) == -1) ||
+	(mprotect (trailing_protected, N_TRAILING_PROTECTED * page_size,
+		  PROT_NONE) == -1))
+    {
+	munmap (addr, n_bytes);
+	return NULL;
+    }
+    return payload;
+}
+
+void
+fence_free (void *data)
+{
+    uint32_t page_size = getpagesize();
+    info_t *info;
+    if (!data)	/* nothing to release; mirrors free (NULL) in the fallback */
+	return;
+    /* The info_t page sits just below the leading protected pages. */
+    info = (info_t *)((uint8_t *)data - (N_LEADING_PROTECTED + 1) * page_size);
+    munmap (info->addr, info->n_bytes);
+}
+
+#else
+
+void *
+fence_malloc (int64_t len)	/* fallback build: no protected pages at all */
+{
+    return malloc (len);	/* plain heap block; 'len' is not validated here */
+}
+
+void
+fence_free (void *data)	/* counterpart of the fallback fence_malloc() */
+{
+    free (data);
+}
+
+#endif
+
+/* Returns n_bytes fence_malloc()ed bytes, each the low 8 bits of an lcg_rand() draw */
+uint8_t *
+make_random_bytes (int n_bytes)
+{
+    uint8_t *buffer = fence_malloc (n_bytes);
+    uint8_t *cursor = buffer;
+    int remaining;
+
+    if (buffer == NULL)
+	return NULL;
+    for (remaining = n_bytes; remaining > 0; remaining--)
+	*cursor++ = lcg_rand () & 0xff;
+    return buffer;
+}
+
+#ifdef HAVE_LIBPNG
+
+/* Convert a8r8g8b8 pixels in place to the bytes R, G, B, A, dividing the
+ * colour channels by alpha (i.e. undoing premultiplication) on the way.
+ */
+static void
+pngify_pixels (uint32_t *pixels, int n_pixels)
+{
+    int i;
+
+    for (i = 0; i < n_pixels; ++i)
+    {
+	uint32_t p = pixels[i];
+	uint8_t *out = (uint8_t *)&(pixels[i]);
+	uint32_t a = (p >> 24) & 0xff;
+	int shift;
+
+	for (shift = 16; shift >= 0; shift -= 8)	/* r, then g, then b */
+	{
+	    uint32_t c = (p >> shift) & 0xff;
+
+	    /* A channel larger than alpha is not valid premultiplied data;
+	     * clamp the quotient instead of letting it wrap in 8 bits. */
+	    if (a != 0)
+		c = (c * 255) / a;
+	    *out++ = c > 255 ? 255 : c;
+	}
+
+	*out++ = a;
+    }
+}
+
+/* Write 'image' to 'filename' as an 8 bit RGBA PNG, going through a
+ * temporary a8r8g8b8 copy. Returns TRUE on success and FALSE on failure;
+ * the buffers, the copy and the file are released on every path. */
+pixman_bool_t
+write_png (pixman_image_t *image, const char *filename)
+{
+    int width = pixman_image_get_width (image);
+    int height = pixman_image_get_height (image);
+    int stride = width * 4;
+    uint32_t *data = malloc (height * stride);
+    png_bytep *row_pointers = malloc (height * sizeof (png_bytep));
+    pixman_image_t *copy = NULL;
+    png_struct *write_struct;
+    png_info *info_struct;
+    pixman_bool_t result = FALSE;
+    FILE *f;
+    int i;
+
+    /* Leave through out0 on early failures so that the buffers are freed
+     * even when an allocation fails or the file cannot be opened. */
+    if (!data || !row_pointers)
+	goto out0;
+    if (!(copy = pixman_image_create_bits (
+	      PIXMAN_a8r8g8b8, width, height, data, stride)))
+	goto out0;
+    if (!(f = fopen (filename, "wb")))
+	goto out0;
+
+    pixman_image_composite32 (
+	PIXMAN_OP_SRC, image, NULL, copy, 0, 0, 0, 0, 0, 0, width, height);
+    pngify_pixels (data, height * width);
+
+    for (i = 0; i < height; ++i)
+	row_pointers[i] = (png_bytep)(data + i * width);
+
+    if (!(write_struct = png_create_write_struct (
+	      PNG_LIBPNG_VER_STRING, NULL, NULL, NULL)))
+	goto out1;
+    if (!(info_struct = png_create_info_struct (write_struct)))
+	goto out2;
+
+    png_init_io (write_struct, f);
+    png_set_IHDR (write_struct, info_struct, width, height,
+		  8, PNG_COLOR_TYPE_RGB_ALPHA,
+		  PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE,
+		  PNG_FILTER_TYPE_BASE);
+    png_write_info (write_struct, info_struct);
+    png_write_image (write_struct, row_pointers);
+    png_write_end (write_struct, NULL);
+    result = TRUE;
+
+out2:
+    png_destroy_write_struct (&write_struct, &info_struct);
+
+out1:
+    if (fclose (f) != 0)
+	result = FALSE;
+
+out0:
+    if (copy)
+	pixman_image_unref (copy);
+    free (row_pointers);
+    free (data);
+    return result;
+}
+
+#else /* no libpng */
+
+pixman_bool_t
+write_png (pixman_image_t *image, const char *filename)	/* no-libpng stub */
+{
+    return FALSE;	/* nothing is written, so always report failure */
+}
+
+#endif
+
+/*
+ * A function, which can be used as a core part of the test programs,
+ * intended to detect various problems with the help of fuzzing input
+ * to pixman API (according to some templates, aka "smart" fuzzing).
+ * Some general information about such testing can be found here:
+ * http://en.wikipedia.org/wiki/Fuzz_testing
+ *
+ * It may help detecting:
+ *  - crashes on bad handling of valid or reasonably invalid input to
+ *    pixman API.
+ *  - deviations from the behavior of older pixman releases.
+ *  - deviations from the behavior of the same pixman release, but
+ *    configured in a different way (for example with SIMD optimizations
+ *    disabled), or running on a different OS or hardware.
+ *
+ * The test is performed by calling a callback function a huge number
+ * of times. The callback function is expected to run some snippet of
+ * pixman code with pseudorandom variations to the data fed to
+ * pixman API. A result of running each callback function should be
+ * some deterministic value which depends on test number (test number
+ * can be used as a seed for PRNG). When 'verbose' argument is nonzero,
+ * callback function is expected to print to stdout some information
+ * about what it does.
+ *
+ * Return values from many small tests are accumulated together and
+ * used as final checksum, which can be compared to some expected
+ * value. Running the tests not individually, but in a batch helps
+ * to reduce process start overhead and also allows to parallelize
+ * testing and utilize multiple CPU cores.
+ *
+ * The resulting executable can be run without any arguments. In
+ * this case it runs a batch of tests starting from 1 and up to
+ * 'default_number_of_iterations'. The resulting checksum is
+ * compared with 'expected_checksum' and FAIL or PASS verdict
+ * depends on the result of this comparison.
+ *
+ * If the executable is run with 2 numbers provided as command line
+ * arguments, they specify the starting and ending numbers for a test
+ * batch.
+ *
+ * If the executable is run with only one number provided as a command
+ * line argument, then this number is used to call the callback function
+ * once, and also with verbose flag set.
+ */
+int
+fuzzer_test_main (const char *test_name,
+		  int         default_number_of_iterations,
+		  uint32_t    expected_checksum,
+		  uint32_t    (*test_function)(int testnum, int verbose),
+		  int         argc,
+		  const char *argv[])
+{
+    int verbose = getenv ("VERBOSE") != NULL;
+    int n1 = 1, n2 = default_number_of_iterations;
+    uint32_t checksum = 0;
+    int is_default_batch;
+    int i;
+
+    /* Exactly one number: run that single test with the verbose flag set */
+    if (argc == 2)
+    {
+	int testnum = atoi (argv[1]);
+
+	checksum = test_function (testnum, 1);
+	printf ("%d: checksum=%08X\n", testnum, checksum);
+	return 0;
+    }
+
+    /* Two numbers: an explicit, inclusive range of test numbers */
+    if (argc >= 3)
+    {
+	n1 = atoi (argv[1]);
+	n2 = atoi (argv[2]);
+	if (n1 > n2)
+	{
+	    printf ("invalid test range\n");
+	    return 1;
+	}
+    }
+
+    /* Sum the per-test results; each one is also printed when the
+     * VERBOSE environment variable is set. */
+#ifdef USE_OPENMP
+    #pragma omp parallel for reduction(+:checksum) default(none) \
+					shared(n1, n2, test_function, verbose)
+#endif
+    for (i = n1; i <= n2; i++)
+    {
+	uint32_t crc = test_function (i, 0);
+	if (verbose)
+	    printf ("%d: %08X\n", i, crc);
+	checksum += crc;
+    }
+
+    /* Only the default batch has a reference checksum to compare with */
+    is_default_batch = (n1 == 1 && n2 == default_number_of_iterations);
+    if (!is_default_batch)
+    {
+	printf ("%d-%d: checksum=%08X\n", n1, n2, checksum);
+	return 0;
+    }
+
+    if (checksum != expected_checksum)
+    {
+	printf ("%s test failed! (checksum=%08X, expected %08X)\n",
+		test_name, checksum, expected_checksum);
+	return 1;
+    }
+
+    printf ("%s test passed (checksum=%08X)\n", test_name, checksum);
+    return 0;
+}
+
+/* Try to obtain current time in seconds: the gettimeofday() reading when
+ * that function is available, the scaled clock() reading otherwise */
+double
+gettime (void)
+{
+#ifdef HAVE_GETTIMEOFDAY
+    struct timeval now;
+    gettimeofday (&now, NULL);
+    return ((int64_t)now.tv_sec * 1000000 + now.tv_usec) / 1000000.;
+#else
+    return clock() / (double)CLOCKS_PER_SEC;
+#endif
+}
+
+uint32_t
+get_random_seed (void)
+{
+    double now = gettime();
+    uint32_t seed;	/* first 4 bytes of 'now', as the old pointer cast read */
+    memcpy (&seed, &now, sizeof (seed));	/* aliasing-safe copy */
+    lcg_srand (seed);
+    return lcg_rand_u32 ();
+}
+
+static const char *global_msg;	/* text on_alarm() prints; set by fail_after() */
+
+static void
+on_alarm (int signo)	/* installed for SIGALRM by fail_after(); signo is unused */
+{
+    printf ("%s\n", global_msg);	/* NOTE(review): printf/exit are not async-signal-safe */
+    exit (1);
+}
+
+/* After 'seconds', print 'msg' and exit (1); a no-op without sigaction/alarm */
+void
+fail_after (int seconds, const char *msg)
+{
+#ifdef HAVE_SIGACTION
+#ifdef HAVE_ALARM
+    struct sigaction action;
+
+    global_msg = msg;
+    memset (&action, 0, sizeof (action));
+    action.sa_handler = on_alarm;
+    /* Install the handler before arming the timer, so that SIGALRM can
+     * never arrive while its default action is still in place. */
+    sigaction (SIGALRM, &action, NULL);
+    alarm (seconds);
+#endif
+#endif
+}
+
+void
+enable_fp_exceptions (void)	/* empty unless both HAVE_ macros below are defined */
+{
+#ifdef HAVE_FENV_H
+#ifdef HAVE_FEENABLEEXCEPT
+    /* Note: we don't enable the FE_INEXACT trap because
+     * that happens quite commonly. It is possible that
+     * over- and underflow should similarly be considered
+     * okay, but for now the test suite passes with them
+     * enabled, and it's useful to know if they start
+     * occurring.
+     */
+    feenableexcept (FE_DIVBYZERO	|
+		    FE_INVALID		|
+		    FE_OVERFLOW		|
+		    FE_UNDERFLOW);
+#endif
+#endif
+}
+
+/* Try to get a chunk of 'size' bytes aligned to 'align'; NULL on failure */
+void *
+aligned_malloc (size_t align, size_t size)
+{
+#ifdef HAVE_POSIX_MEMALIGN
+    void *chunk = NULL;
+
+    /* posix_memalign() reports failure through a nonzero return value */
+    return posix_memalign (&chunk, align, size) == 0 ? chunk : NULL;
+#else
+    /* NOTE: without posix_memalign() the 'align' request is not honoured */
+    return malloc (size);
+#endif
+}
+
+/* Index into pixman_indexed_t.ent for colour c: the top five bits of each
+ * colour byte if is_rgb, otherwise a weighted sum of the three bytes. */
+#define CONVERT_15(c, is_rgb)						\
+    (is_rgb?								\
+     ((((c) >> 3) & 0x001f) |						\
+      (((c) >> 6) & 0x03e0) |						\
+      (((c) >> 9) & 0x7c00)) :						\
+     (((((c) >> 16) & 0xff) * 153 +					\
+       (((c) >>  8) & 0xff) * 301 +					\
+       (((c)      ) & 0xff) * 58) >> 2))
+
+void
+initialize_palette (pixman_indexed_t *palette, uint32_t depth, int is_rgb)
+{
+    uint32_t mask = (1 << depth) - 1;
+    uint32_t n_colors = mask + 1;
+    uint32_t idx;
+    int slot;
+
+    /* Start from a random colour -> index map */
+    for (slot = 0; slot < 32768; ++slot)
+	palette->ent[slot] = lcg_rand() & mask;
+
+    /* No palette colour has been chosen yet */
+    memset (palette->rgba, 0, sizeof (palette->rgba));
+
+    for (idx = 0; idx < n_colors; ++idx)
+    {
+	uint32_t rgba24;
+	uint32_t i15;
+
+	/* Round trips are required: the 15 bit version of every palette
+	 * colour must map back to that colour's index. So draw colours
+	 * until one lands on a slot whose current owner does not itself
+	 * round trip through it, then take the slot over.
+	 */
+	for (;;)
+	{
+	    rgba24 = lcg_rand();
+	    i15 = CONVERT_15 (rgba24, is_rgb);
+
+	    if (CONVERT_15 (palette->rgba[palette->ent[i15]], is_rgb) != i15)
+		break;
+	}
+
+	palette->rgba[idx] = rgba24;
+	palette->ent[i15] = idx;
+    }
+
+    /* Verify the round trip property for the whole palette */
+    for (idx = 0; idx < n_colors; ++idx)
+    {
+	assert (palette->ent[CONVERT_15 (palette->rgba[idx], is_rgb)] == idx);
+    }
+}
diff --git a/pixman/test/utils.h b/pixman/test/utils.h
index cd9a3c263..3790483db 100644
--- a/pixman/test/utils.h
+++ b/pixman/test/utils.h
@@ -1,151 +1,151 @@
-#include <stdlib.h>
-#include <config.h>
-#include <assert.h>
-#include "pixman-private.h" /* For 'inline' definition */
-
-#define ARRAY_LENGTH(A) ((int) (sizeof (A) / sizeof ((A) [0])))
-
-/* A primitive pseudorandom number generator,
- * taken from POSIX.1-2001 example
- */
-
-extern uint32_t lcg_seed;
-#ifdef USE_OPENMP
-#pragma omp threadprivate(lcg_seed)
-#endif
-
-static inline uint32_t
-lcg_rand (void)
-{
-    lcg_seed = lcg_seed * 1103515245 + 12345;
-    return ((uint32_t)(lcg_seed / 65536) % 32768);
-}
-
-static inline void
-lcg_srand (uint32_t seed)
-{
-    lcg_seed = seed;
-}
-
-static inline uint32_t
-lcg_rand_n (int max)
-{
-    return lcg_rand () % max;
-}
-
-static inline uint32_t
-lcg_rand_N (int max)
-{
-    uint32_t lo = lcg_rand ();
-    uint32_t hi = lcg_rand () << 15;
-    return (lo | hi) % max;
-}
-
-static inline uint32_t
-lcg_rand_u32 (void)
-{
-    /* This uses the 10/11 most significant bits from the 3 lcg results
-     * (and mixes them with the low from the adjacent one).
-     */
-    uint32_t lo = lcg_rand() >> -(32 - 15 - 11 * 2);
-    uint32_t mid = lcg_rand() << (32 - 15 - 11 * 1);
-    uint32_t hi = lcg_rand() << (32 - 15 - 11 * 0);
-
-    return (hi ^ mid ^ lo);
-}
-
-/* CRC 32 computation
- */
-uint32_t
-compute_crc32 (uint32_t    in_crc32,
-	       const void *buf,
-	       size_t      buf_len);
-
-/* Returns TRUE if running on a little endian system */
-pixman_bool_t
-is_little_endian (void);
-
-/* perform endian conversion of pixel data
- */
-void
-image_endian_swap (pixman_image_t *img);
-
-/* Allocate memory that is bounded by protected pages,
- * so that out-of-bounds access will cause segfaults
- */
-void *
-fence_malloc (int64_t len);
-
-void
-fence_free (void *data);
-
-/* Generate n_bytes random bytes in fence_malloced memory */
-uint8_t *
-make_random_bytes (int n_bytes);
-
-/* Return current time in seconds */
-double
-gettime (void);
-
-uint32_t
-get_random_seed (void);
-
-/* main body of the fuzzer test */
-int
-fuzzer_test_main (const char *test_name,
-		  int         default_number_of_iterations,
-		  uint32_t    expected_checksum,
-		  uint32_t    (*test_function)(int testnum, int verbose),
-		  int         argc,
-		  const char *argv[]);
-
-void
-fail_after (int seconds, const char *msg);
-
-/* If possible, enable traps for floating point exceptions */
-void enable_fp_exceptions(void);
-
-pixman_bool_t
-write_png (pixman_image_t *image, const char *filename);
-
-/* A pair of macros which can help to detect corruption of
- * floating point registers after a function call. This may
- * happen if _mm_empty() call is forgotten in MMX/SSE2 fast
- * path code, or ARM NEON assembly optimized function forgets
- * to save/restore d8-d15 registers before use.
- */
-
-#define FLOAT_REGS_CORRUPTION_DETECTOR_START()                 \
-    static volatile double frcd_volatile_constant1 = 123451;   \
-    static volatile double frcd_volatile_constant2 = 123452;   \
-    static volatile double frcd_volatile_constant3 = 123453;   \
-    static volatile double frcd_volatile_constant4 = 123454;   \
-    static volatile double frcd_volatile_constant5 = 123455;   \
-    static volatile double frcd_volatile_constant6 = 123456;   \
-    static volatile double frcd_volatile_constant7 = 123457;   \
-    static volatile double frcd_volatile_constant8 = 123458;   \
-    double frcd_canary_variable1 = frcd_volatile_constant1;    \
-    double frcd_canary_variable2 = frcd_volatile_constant2;    \
-    double frcd_canary_variable3 = frcd_volatile_constant3;    \
-    double frcd_canary_variable4 = frcd_volatile_constant4;    \
-    double frcd_canary_variable5 = frcd_volatile_constant5;    \
-    double frcd_canary_variable6 = frcd_volatile_constant6;    \
-    double frcd_canary_variable7 = frcd_volatile_constant7;    \
-    double frcd_canary_variable8 = frcd_volatile_constant8;
-
-#define FLOAT_REGS_CORRUPTION_DETECTOR_FINISH()                \
-    assert (frcd_canary_variable1 == frcd_volatile_constant1); \
-    assert (frcd_canary_variable2 == frcd_volatile_constant2); \
-    assert (frcd_canary_variable3 == frcd_volatile_constant3); \
-    assert (frcd_canary_variable4 == frcd_volatile_constant4); \
-    assert (frcd_canary_variable5 == frcd_volatile_constant5); \
-    assert (frcd_canary_variable6 == frcd_volatile_constant6); \
-    assert (frcd_canary_variable7 == frcd_volatile_constant7); \
-    assert (frcd_canary_variable8 == frcd_volatile_constant8);
-
-/* Try to get an aligned memory chunk */
-void *
-aligned_malloc (size_t align, size_t size);
-
-void
-initialize_palette (pixman_indexed_t *palette, uint32_t depth, int is_rgb);
+#include <stdlib.h>
+#include <config.h>
+#include <assert.h>
+#include "pixman-private.h" /* For 'inline' definition */
+
+#define ARRAY_LENGTH(A) ((int) (sizeof (A) / sizeof ((A) [0])))
+
+/* A primitive pseudorandom number generator,
+ * taken from POSIX.1-2001 example
+ */
+
+extern uint32_t lcg_seed;
+#ifdef USE_OPENMP
+#pragma omp threadprivate(lcg_seed)
+#endif
+
+static inline uint32_t
+lcg_rand (void)
+{
+    lcg_seed = 12345 + 1103515245 * lcg_seed;	/* advance the 32 bit state */
+    return (lcg_seed >> 16) & 0x7fff;		/* bits 16..30 of the new state */
+}
+
+static inline void
+lcg_srand (uint32_t seed)	/* restart the lcg_rand() sequence from 'seed' */
+{
+    lcg_seed = seed;
+}
+
+static inline uint32_t
+lcg_rand_n (int max)	/* one lcg_rand() draw reduced modulo max; max must not be 0 */
+{
+    return lcg_rand () % max;
+}
+
+static inline uint32_t
+lcg_rand_N (int max)
+{
+    uint32_t bits = lcg_rand ();	/* first draw: bits 0..14 */
+    bits |= lcg_rand () << 15;		/* second draw: bits 15..29 */
+
+    return bits % max;
+}
+
+static inline uint32_t
+lcg_rand_u32 (void)
+{
+    /* XOR three consecutive 15 bit draws at overlapping positions: the
+     * first supplies bits 0..9 (its five low bits are dropped), the
+     * second bits 6..20 and the third bits 17..31.
+     */
+    uint32_t value = lcg_rand() >> 5;
+    value ^= lcg_rand() << 6;
+    value ^= lcg_rand() << 17;
+    return value;
+}
+
+/* CRC 32 computation
+ */
+uint32_t
+compute_crc32 (uint32_t    in_crc32,
+	       const void *buf,
+	       size_t      buf_len);
+
+/* Returns TRUE if running on a little endian system */
+pixman_bool_t
+is_little_endian (void);
+
+/* perform endian conversion of pixel data
+ */
+void
+image_endian_swap (pixman_image_t *img);
+
+/* Allocate memory that is bounded by protected pages,
+ * so that out-of-bounds access will cause segfaults
+ */
+void *
+fence_malloc (int64_t len);
+
+void
+fence_free (void *data);
+
+/* Generate n_bytes random bytes in fence_malloced memory */
+uint8_t *
+make_random_bytes (int n_bytes);
+
+/* Return current time in seconds */
+double
+gettime (void);
+
+uint32_t
+get_random_seed (void);
+
+/* main body of the fuzzer test */
+int
+fuzzer_test_main (const char *test_name,
+		  int         default_number_of_iterations,
+		  uint32_t    expected_checksum,
+		  uint32_t    (*test_function)(int testnum, int verbose),
+		  int         argc,
+		  const char *argv[]);
+
+void
+fail_after (int seconds, const char *msg);
+
+/* If possible, enable traps for floating point exceptions */
+void enable_fp_exceptions(void);
+
+pixman_bool_t
+write_png (pixman_image_t *image, const char *filename);
+
+/* A pair of macros which can help to detect corruption of floating point
+ * registers after a function call. This may happen if the _mm_empty() call
+ * is forgotten in MMX/SSE2 fast path code, or if an ARM NEON assembly
+ * function forgets to save/restore the d8-d15 registers before use.
+ * START declares eight canary doubles; FINISH, used later in the same
+ * scope, asserts that each one still equals its volatile constant. */
+
+#define FLOAT_REGS_CORRUPTION_DETECTOR_START()                 \
+    static volatile double frcd_volatile_constant1 = 123451;   \
+    static volatile double frcd_volatile_constant2 = 123452;   \
+    static volatile double frcd_volatile_constant3 = 123453;   \
+    static volatile double frcd_volatile_constant4 = 123454;   \
+    static volatile double frcd_volatile_constant5 = 123455;   \
+    static volatile double frcd_volatile_constant6 = 123456;   \
+    static volatile double frcd_volatile_constant7 = 123457;   \
+    static volatile double frcd_volatile_constant8 = 123458;   \
+    double frcd_canary_variable1 = frcd_volatile_constant1;    \
+    double frcd_canary_variable2 = frcd_volatile_constant2;    \
+    double frcd_canary_variable3 = frcd_volatile_constant3;    \
+    double frcd_canary_variable4 = frcd_volatile_constant4;    \
+    double frcd_canary_variable5 = frcd_volatile_constant5;    \
+    double frcd_canary_variable6 = frcd_volatile_constant6;    \
+    double frcd_canary_variable7 = frcd_volatile_constant7;    \
+    double frcd_canary_variable8 = frcd_volatile_constant8;
+
+#define FLOAT_REGS_CORRUPTION_DETECTOR_FINISH()                \
+    assert (frcd_canary_variable1 == frcd_volatile_constant1); \
+    assert (frcd_canary_variable2 == frcd_volatile_constant2); \
+    assert (frcd_canary_variable3 == frcd_volatile_constant3); \
+    assert (frcd_canary_variable4 == frcd_volatile_constant4); \
+    assert (frcd_canary_variable5 == frcd_volatile_constant5); \
+    assert (frcd_canary_variable6 == frcd_volatile_constant6); \
+    assert (frcd_canary_variable7 == frcd_volatile_constant7); \
+    assert (frcd_canary_variable8 == frcd_volatile_constant8);
+
+/* Try to get an aligned memory chunk */
+void *
+aligned_malloc (size_t align, size_t size);
+
+void
+initialize_palette (pixman_indexed_t *palette, uint32_t depth, int is_rgb);
-- 
cgit v1.2.3