17 files changed, 3553 insertions, 3472 deletions
diff --git a/pixman/.gitignore b/pixman/.gitignore
new file mode 100644
index 000000000..98612c91f
--- /dev/null
+++ b/pixman/.gitignore
@@ -0,0 +1,81 @@
+Makefile
+Makefile.in
+.deps
+.libs
+.msg
+*.pc
+*.lo
+*.la
+*.a
+*.o
+*~
+aclocal.m4
+autom4te.cache
+compile
+config.guess
+config.log
+config.status
+config.sub
+configure
+depcomp
+install-sh
+libtool
+ltmain.sh
+missing
+stamp-h?
+config.h
+config.h.in
+.*.swp
+demos/alpha-test
+demos/checkerboard
+demos/clip-in
+demos/clip-test
+demos/composite-test
+demos/convolution-test
+demos/gradient-test
+demos/quad2quad
+demos/radial-test
+demos/screen-test
+demos/trap-test
+demos/tri-test
+pixman/pixman-combine32.c
+pixman/pixman-combine32.h
+pixman/pixman-combine64.c
+pixman/pixman-combine64.h
+pixman/pixman-version.h
+test/a1-trap-test
+test/affine-test
+test/alpha-loop
+test/alphamap
+test/alpha-test
+test/blitters-test
+test/clip-in
+test/clip-test
+test/composite
+test/composite-test
+test/composite-traps-test
+test/convolution-test
+test/fetch-test
+test/gradient-crash-test
+test/gradient-test
+test/lowlevel-blt-bench
+test/oob-test
+test/pdf-op-test
+test/region-contains-test
+test/region-test
+test/region-translate
+test/region-translate-test
+test/scaling-crash-test
+test/scaling-helpers-test
+test/scaling-test
+test/screen-test
+test/stress-test
+test/trap-crasher
+test/trap-test
+test/window-test
+*.pdb
+*.dll
+*.lib
+*.ilk
+*.obj
+*.exe
diff --git a/pixman/COPYING b/pixman/COPYING
index 11b022bc2..6168dea56 100644
--- a/pixman/COPYING
+++ b/pixman/COPYING
@@ -1,42 +1,42 @@
-The following is the MIT license, agreed upon by most contributors.
-Copyright holders of new code should use this license statement where
-possible. They may also add themselves to the list below.
-
-/*
- * Copyright 1987, 1988, 1989, 1998  The Open Group
- * Copyright 1987, 1988, 1989 Digital Equipment Corporation
- * Copyright 1999, 2004, 2008 Keith Packard
- * Copyright 2000 SuSE, Inc.
- * Copyright 2000 Keith Packard, member of The XFree86 Project, Inc.
- * Copyright 2004, 2005, 2007, 2008, 2009, 2010 Red Hat, Inc.
- * Copyright 2004 Nicholas Miell
- * Copyright 2005 Lars Knoll & Zack Rusin, Trolltech
- * Copyright 2005 Trolltech AS
- * Copyright 2007 Luca Barbato
- * Copyright 2008 Aaron Plattner, NVIDIA Corporation
- * Copyright 2008 Rodrigo Kumpera
- * Copyright 2008 André Tupinambá
- * Copyright 2008 Mozilla Corporation
- * Copyright 2008 Frederic Plourde
- * Copyright 2009, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2009, 2010 Nokia Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
+The following is the MIT license, agreed upon by most contributors.
+Copyright holders of new code should use this license statement where
+possible. They may also add themselves to the list below.
+
+/*
+ * Copyright 1987, 1988, 1989, 1998  The Open Group
+ * Copyright 1987, 1988, 1989 Digital Equipment Corporation
+ * Copyright 1999, 2004, 2008 Keith Packard
+ * Copyright 2000 SuSE, Inc.
+ * Copyright 2000 Keith Packard, member of The XFree86 Project, Inc.
+ * Copyright 2004, 2005, 2007, 2008, 2009, 2010 Red Hat, Inc.
+ * Copyright 2004 Nicholas Miell
+ * Copyright 2005 Lars Knoll & Zack Rusin, Trolltech
+ * Copyright 2005 Trolltech AS
+ * Copyright 2007 Luca Barbato
+ * Copyright 2008 Aaron Plattner, NVIDIA Corporation
+ * Copyright 2008 Rodrigo Kumpera
+ * Copyright 2008 André Tupinambá
+ * Copyright 2008 Mozilla Corporation
+ * Copyright 2008 Frederic Plourde
+ * Copyright 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2009, 2010 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
diff --git a/pixman/INSTALL b/pixman/INSTALL
index 5458714e1..cf1202b66 100644
--- a/pixman/INSTALL
+++ b/pixman/INSTALL
@@ -1,234 +1,234 @@
-Installation Instructions
-*************************
-
-Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005,
-2006 Free Software Foundation, Inc.
-
-This file is free documentation; the Free Software Foundation gives
-unlimited permission to copy, distribute and modify it.
-
-Basic Installation
-==================
-
-Briefly, the shell commands `./configure; make; make install' should
-configure, build, and install this package.  The following
-more-detailed instructions are generic; see the `README' file for
-instructions specific to this package.
-
-   The `configure' shell script attempts to guess correct values for
-various system-dependent variables used during compilation.  It uses
-those values to create a `Makefile' in each directory of the package.
-It may also create one or more `.h' files containing system-dependent
-definitions.  Finally, it creates a shell script `config.status' that
-you can run in the future to recreate the current configuration, and a
-file `config.log' containing compiler output (useful mainly for
-debugging `configure').
-
-   It can also use an optional file (typically called `config.cache'
-and enabled with `--cache-file=config.cache' or simply `-C') that saves
-the results of its tests to speed up reconfiguring.  Caching is
-disabled by default to prevent problems with accidental use of stale
-cache files.
-
-   If you need to do unusual things to compile the package, please try
-to figure out how `configure' could check whether to do them, and mail
-diffs or instructions to the address given in the `README' so they can
-be considered for the next release.  If you are using the cache, and at
-some point `config.cache' contains results you don't want to keep, you
-may remove or edit it.
-
-   The file `configure.ac' (or `configure.in') is used to create
-`configure' by a program called `autoconf'.  You need `configure.ac' if
-you want to change it or regenerate `configure' using a newer version
-of `autoconf'.
-
-The simplest way to compile this package is:
-
-  1. `cd' to the directory containing the package's source code and type
-     `./configure' to configure the package for your system.
-
-     Running `configure' might take a while.  While running, it prints
-     some messages telling which features it is checking for.
-
-  2. Type `make' to compile the package.
-
-  3. Optionally, type `make check' to run any self-tests that come with
-     the package.
-
-  4. Type `make install' to install the programs and any data files and
-     documentation.
-
-  5. You can remove the program binaries and object files from the
-     source code directory by typing `make clean'.  To also remove the
-     files that `configure' created (so you can compile the package for
-     a different kind of computer), type `make distclean'.  There is
-     also a `make maintainer-clean' target, but that is intended mainly
-     for the package's developers.  If you use it, you may have to get
-     all sorts of other programs in order to regenerate files that came
-     with the distribution.
-
-Compilers and Options
-=====================
-
-Some systems require unusual options for compilation or linking that the
-`configure' script does not know about.  Run `./configure --help' for
-details on some of the pertinent environment variables.
-
-   You can give `configure' initial values for configuration parameters
-by setting variables in the command line or in the environment.  Here
-is an example:
-
-     ./configure CC=c99 CFLAGS=-g LIBS=-lposix
-
-   *Note Defining Variables::, for more details.
-
-Compiling For Multiple Architectures
-====================================
-
-You can compile the package for more than one kind of computer at the
-same time, by placing the object files for each architecture in their
-own directory.  To do this, you can use GNU `make'.  `cd' to the
-directory where you want the object files and executables to go and run
-the `configure' script.  `configure' automatically checks for the
-source code in the directory that `configure' is in and in `..'.
-
-   With a non-GNU `make', it is safer to compile the package for one
-architecture at a time in the source code directory.  After you have
-installed the package for one architecture, use `make distclean' before
-reconfiguring for another architecture.
-
-Installation Names
-==================
-
-By default, `make install' installs the package's commands under
-`/usr/local/bin', include files under `/usr/local/include', etc.  You
-can specify an installation prefix other than `/usr/local' by giving
-`configure' the option `--prefix=PREFIX'.
-
-   You can specify separate installation prefixes for
-architecture-specific files and architecture-independent files.  If you
-pass the option `--exec-prefix=PREFIX' to `configure', the package uses
-PREFIX as the prefix for installing programs and libraries.
-Documentation and other data files still use the regular prefix.
-
-   In addition, if you use an unusual directory layout you can give
-options like `--bindir=DIR' to specify different values for particular
-kinds of files.  Run `configure --help' for a list of the directories
-you can set and what kinds of files go in them.
-
-   If the package supports it, you can cause programs to be installed
-with an extra prefix or suffix on their names by giving `configure' the
-option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'.
-
-Optional Features
-=================
-
-Some packages pay attention to `--enable-FEATURE' options to
-`configure', where FEATURE indicates an optional part of the package.
-They may also pay attention to `--with-PACKAGE' options, where PACKAGE
-is something like `gnu-as' or `x' (for the X Window System).  The
-`README' should mention any `--enable-' and `--with-' options that the
-package recognizes.
-
-   For packages that use the X Window System, `configure' can usually
-find the X include and library files automatically, but if it doesn't,
-you can use the `configure' options `--x-includes=DIR' and
-`--x-libraries=DIR' to specify their locations.
-
-Specifying the System Type
-==========================
-
-There may be some features `configure' cannot figure out automatically,
-but needs to determine by the type of machine the package will run on.
-Usually, assuming the package is built to be run on the _same_
-architectures, `configure' can figure that out, but if it prints a
-message saying it cannot guess the machine type, give it the
-`--build=TYPE' option.  TYPE can either be a short name for the system
-type, such as `sun4', or a canonical name which has the form:
-
-     CPU-COMPANY-SYSTEM
-
-where SYSTEM can have one of these forms:
-
-     OS KERNEL-OS
-
-   See the file `config.sub' for the possible values of each field.  If
-`config.sub' isn't included in this package, then this package doesn't
-need to know the machine type.
-
-   If you are _building_ compiler tools for cross-compiling, you should
-use the option `--target=TYPE' to select the type of system they will
-produce code for.
-
-   If you want to _use_ a cross compiler, that generates code for a
-platform different from the build platform, you should specify the
-"host" platform (i.e., that on which the generated programs will
-eventually be run) with `--host=TYPE'.
-
-Sharing Defaults
-================
-
-If you want to set default values for `configure' scripts to share, you
-can create a site shell script called `config.site' that gives default
-values for variables like `CC', `cache_file', and `prefix'.
-`configure' looks for `PREFIX/share/config.site' if it exists, then
-`PREFIX/etc/config.site' if it exists.  Or, you can set the
-`CONFIG_SITE' environment variable to the location of the site script.
-A warning: not all `configure' scripts look for a site script.
-
-Defining Variables
-==================
-
-Variables not defined in a site shell script can be set in the
-environment passed to `configure'.  However, some packages may run
-configure again during the build, and the customized values of these
-variables may be lost.  In order to avoid this problem, you should set
-them in the `configure' command line, using `VAR=value'.  For example:
-
-     ./configure CC=/usr/local2/bin/gcc
-
-causes the specified `gcc' to be used as the C compiler (unless it is
-overridden in the site shell script).
-
-Unfortunately, this technique does not work for `CONFIG_SHELL' due to
-an Autoconf bug.  Until the bug is fixed you can use this workaround:
-
-     CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash
-
-`configure' Invocation
-======================
-
-`configure' recognizes the following options to control how it operates.
-
-`--help'
-`-h'
-     Print a summary of the options to `configure', and exit.
-
-`--version'
-`-V'
-     Print the version of Autoconf used to generate the `configure'
-     script, and exit.
-
-`--cache-file=FILE'
-     Enable the cache: use and save the results of the tests in FILE,
-     traditionally `config.cache'.  FILE defaults to `/dev/null' to
-     disable caching.
-
-`--config-cache'
-`-C'
-     Alias for `--cache-file=config.cache'.
-
-`--quiet'
-`--silent'
-`-q'
-     Do not print messages saying which checks are being made.  To
-     suppress all normal output, redirect it to `/dev/null' (any error
-     messages will still be shown).
-
-`--srcdir=DIR'
-     Look for the package's source code in directory DIR.  Usually
-     `configure' can determine that directory automatically.
-
-`configure' also accepts some other, not widely useful, options.  Run
-`configure --help' for more details.
-
+Installation Instructions
+*************************
+
+Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005,
+2006 Free Software Foundation, Inc.
+
+This file is free documentation; the Free Software Foundation gives
+unlimited permission to copy, distribute and modify it.
+
+Basic Installation
+==================
+
+Briefly, the shell commands `./configure; make; make install' should
+configure, build, and install this package.  The following
+more-detailed instructions are generic; see the `README' file for
+instructions specific to this package.
+
+   The `configure' shell script attempts to guess correct values for
+various system-dependent variables used during compilation.  It uses
+those values to create a `Makefile' in each directory of the package.
+It may also create one or more `.h' files containing system-dependent
+definitions.  Finally, it creates a shell script `config.status' that
+you can run in the future to recreate the current configuration, and a
+file `config.log' containing compiler output (useful mainly for
+debugging `configure').
+
+   It can also use an optional file (typically called `config.cache'
+and enabled with `--cache-file=config.cache' or simply `-C') that saves
+the results of its tests to speed up reconfiguring.  Caching is
+disabled by default to prevent problems with accidental use of stale
+cache files.
+
+   If you need to do unusual things to compile the package, please try
+to figure out how `configure' could check whether to do them, and mail
+diffs or instructions to the address given in the `README' so they can
+be considered for the next release.  If you are using the cache, and at
+some point `config.cache' contains results you don't want to keep, you
+may remove or edit it.
+
+   The file `configure.ac' (or `configure.in') is used to create
+`configure' by a program called `autoconf'.  You need `configure.ac' if
+you want to change it or regenerate `configure' using a newer version
+of `autoconf'.
+
+The simplest way to compile this package is:
+
+  1. `cd' to the directory containing the package's source code and type
+     `./configure' to configure the package for your system.
+
+     Running `configure' might take a while.  While running, it prints
+     some messages telling which features it is checking for.
+
+  2. Type `make' to compile the package.
+
+  3. Optionally, type `make check' to run any self-tests that come with
+     the package.
+
+  4. Type `make install' to install the programs and any data files and
+     documentation.
+
+  5. You can remove the program binaries and object files from the
+     source code directory by typing `make clean'.  To also remove the
+     files that `configure' created (so you can compile the package for
+     a different kind of computer), type `make distclean'.  There is
+     also a `make maintainer-clean' target, but that is intended mainly
+     for the package's developers.  If you use it, you may have to get
+     all sorts of other programs in order to regenerate files that came
+     with the distribution.
+
+Compilers and Options
+=====================
+
+Some systems require unusual options for compilation or linking that the
+`configure' script does not know about.  Run `./configure --help' for
+details on some of the pertinent environment variables.
+
+   You can give `configure' initial values for configuration parameters
+by setting variables in the command line or in the environment.  Here
+is an example:
+
+     ./configure CC=c99 CFLAGS=-g LIBS=-lposix
+
+   *Note Defining Variables::, for more details.
+
+Compiling For Multiple Architectures
+====================================
+
+You can compile the package for more than one kind of computer at the
+same time, by placing the object files for each architecture in their
+own directory.  To do this, you can use GNU `make'.  `cd' to the
+directory where you want the object files and executables to go and run
+the `configure' script.  `configure' automatically checks for the
+source code in the directory that `configure' is in and in `..'.
+
+   With a non-GNU `make', it is safer to compile the package for one
+architecture at a time in the source code directory.  After you have
+installed the package for one architecture, use `make distclean' before
+reconfiguring for another architecture.
+
+Installation Names
+==================
+
+By default, `make install' installs the package's commands under
+`/usr/local/bin', include files under `/usr/local/include', etc.  You
+can specify an installation prefix other than `/usr/local' by giving
+`configure' the option `--prefix=PREFIX'.
+
+   You can specify separate installation prefixes for
+architecture-specific files and architecture-independent files.  If you
+pass the option `--exec-prefix=PREFIX' to `configure', the package uses
+PREFIX as the prefix for installing programs and libraries.
+Documentation and other data files still use the regular prefix.
+
+   In addition, if you use an unusual directory layout you can give
+options like `--bindir=DIR' to specify different values for particular
+kinds of files.  Run `configure --help' for a list of the directories
+you can set and what kinds of files go in them.
+
+   If the package supports it, you can cause programs to be installed
+with an extra prefix or suffix on their names by giving `configure' the
+option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'.
+
+Optional Features
+=================
+
+Some packages pay attention to `--enable-FEATURE' options to
+`configure', where FEATURE indicates an optional part of the package.
+They may also pay attention to `--with-PACKAGE' options, where PACKAGE
+is something like `gnu-as' or `x' (for the X Window System).  The
+`README' should mention any `--enable-' and `--with-' options that the
+package recognizes.
+
+   For packages that use the X Window System, `configure' can usually
+find the X include and library files automatically, but if it doesn't,
+you can use the `configure' options `--x-includes=DIR' and
+`--x-libraries=DIR' to specify their locations.
+
+Specifying the System Type
+==========================
+
+There may be some features `configure' cannot figure out automatically,
+but needs to determine by the type of machine the package will run on.
+Usually, assuming the package is built to be run on the _same_
+architectures, `configure' can figure that out, but if it prints a
+message saying it cannot guess the machine type, give it the
+`--build=TYPE' option.  TYPE can either be a short name for the system
+type, such as `sun4', or a canonical name which has the form:
+
+     CPU-COMPANY-SYSTEM
+
+where SYSTEM can have one of these forms:
+
+     OS KERNEL-OS
+
+   See the file `config.sub' for the possible values of each field.  If
+`config.sub' isn't included in this package, then this package doesn't
+need to know the machine type.
+
+   If you are _building_ compiler tools for cross-compiling, you should
+use the option `--target=TYPE' to select the type of system they will
+produce code for.
+
+   If you want to _use_ a cross compiler, that generates code for a
+platform different from the build platform, you should specify the
+"host" platform (i.e., that on which the generated programs will
+eventually be run) with `--host=TYPE'.
+
+Sharing Defaults
+================
+
+If you want to set default values for `configure' scripts to share, you
+can create a site shell script called `config.site' that gives default
+values for variables like `CC', `cache_file', and `prefix'.
+`configure' looks for `PREFIX/share/config.site' if it exists, then
+`PREFIX/etc/config.site' if it exists.  Or, you can set the
+`CONFIG_SITE' environment variable to the location of the site script.
+A warning: not all `configure' scripts look for a site script.
+
+Defining Variables
+==================
+
+Variables not defined in a site shell script can be set in the
+environment passed to `configure'.  However, some packages may run
+configure again during the build, and the customized values of these
+variables may be lost.  In order to avoid this problem, you should set
+them in the `configure' command line, using `VAR=value'.  For example:
+
+     ./configure CC=/usr/local2/bin/gcc
+
+causes the specified `gcc' to be used as the C compiler (unless it is
+overridden in the site shell script).
+
+Unfortunately, this technique does not work for `CONFIG_SHELL' due to
+an Autoconf bug.  Until the bug is fixed you can use this workaround:
+
+     CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash
+
+`configure' Invocation
+======================
+
+`configure' recognizes the following options to control how it operates.
+
+`--help'
+`-h'
+     Print a summary of the options to `configure', and exit.
+
+`--version'
+`-V'
+     Print the version of Autoconf used to generate the `configure'
+     script, and exit.
+
+`--cache-file=FILE'
+     Enable the cache: use and save the results of the tests in FILE,
+     traditionally `config.cache'.  FILE defaults to `/dev/null' to
+     disable caching.
+
+`--config-cache'
+`-C'
+     Alias for `--cache-file=config.cache'.
+
+`--quiet'
+`--silent'
+`-q'
+     Do not print messages saying which checks are being made.  To
+     suppress all normal output, redirect it to `/dev/null' (any error
+     messages will still be shown).
+
+`--srcdir=DIR'
+     Look for the package's source code in directory DIR.  Usually
+     `configure' can determine that directory automatically.
+
+`configure' also accepts some other, not widely useful, options.  Run
+`configure --help' for more details.
+
diff --git a/pixman/TODO b/pixman/TODO
index 4434ec7cb..465abe7b5 100644
--- a/pixman/TODO
+++ b/pixman/TODO
@@ -1,271 +1,271 @@
-  - Testing
-    - Test implementations against each other
-    - Test both with and without the operator strength reduction.
-      They shold be identical.
-
-  - SSE 2 issues:
-
-      - Use MM_HINT_NTA instead of MM_HINT_T0
-
-      - Use of fbCompositeOver_x888x8x8888sse2()
-
-  - Update the RLEASING file
-
-  - Things to keep in mind if breaking ABI:
-
-      - There should be a guard #ifndef I_AM_EITHER_CAIRO_OR_THE_X_SERVER
-
-      - X server will require 16.16 essentially forever. Can we get
-        the required precision by simply adding offset_x/y to the
-        relevant rendering API?
-
-      - Get rid of workaround for X server bug.
-
-      - pixman_image_set_indexed() should copy its argument, and X
-        should be ported over to use a pixman_image as the
-        representation of a Picture, rather than creating one on each
-        operation.
-
-      - We should get rid of pixman_set_static_pointers()
-
-      - We should get rid of the various trapezoid helper functions().
-        (They only exist because they are theoretically available to
-        drivers).
-
-      - 16 bit regions should be deleted
-
-      - There should only be one trap rasterization API.
-
-      - The PIXMAN_g8/c8/etc formats should use the A channel
-        to indicate the actual depth. That way PIXMAN_x4c4 and PIXMAN_c8
-	won't collide.
-
-  - Maybe bite the bullet and make configure.ac generate a pixman-types.h
-    file that can be included from pixman.h to avoid the #ifdef magic
-    in pixman.h
-
-  - Make pixman_region_point_in() survive a NULL box, then fix up
-    pixman-compose.c
-
-      - Possibly look into inlining the fetch functions
-
-  - There is a bug with source clipping demonstrated by clip-test in the
-    test directory. If we interprete source clipping as given in
-    destination coordinates, which is probably the only sane choice,
-    then the result should have two red bars down the sides.
-    
-  - Test suite
-
-  - Add a general way of dealing with architecture specific
-    fast-paths.  The current idea is to have each operation that can
-    be optimized is called through a function pointer that is
-    initially set to an initialization function that is responsible for
-    setting the function pointer to the appropriate fast-path.
-
-  - Go through things marked FIXME
-
-  - Add calls to prepare and finish access where necessary.  grep for
-    ACCESS_MEM, and make sure they are correctly wrapped in prepare
-    and finish.
-
-  - restore READ/WRITE in the fbcompose combiners since they sometimes
-    store directly to destination drawables.
-
-  - It probably makes sense to move the more strange X region API
-    into pixman as well, but guarded with PIXMAN_XORG_COMPATIBILITY
-
-  - Reinstate the FbBits typedef? At the moment we don't
-    even have the FbBits type; we just use uint32_t everywhere.
-
-    Keith says in bug 2335:
-
-        The 64-bit code in fb (pixman) is probably broken; it hasn't been
-        used in quite some time as PCI (and AGP) is 32-bits wide, so
-        doing things 64-bits at a time is a net loss.  To quickly fix
-        this, I suggest just using 32-bit datatypes by setting
-        IC_SHIFT to 5 for all machines.
-
-  - Consider optimizing the 8/16 bit solid fills in pixman-util.c by
-    storing more than one value at a time.
-
-  - Add an image cache to prevent excessive malloc/free. Note that pixman
-    needs to be thread safe when used from cairo.
-
-  - Moving to 24.8 coordinates. This is tricky because X is still
-    defined as 16.16 and will be basically forever. It's possible we
-    could do this by adding extra offset_x/y parameters to the
-    trapezoid calls. The X server could then just call the API with
-    (0, 0). Cairo would have to make sure that the delta *within* a
-    batch of trapezoids does not exceed 16 bit.
-
-  - Consider adding actual backends. Brain dump:
-
-    A backend is something that knows how to
-
-      - Create images
-      - Composite three images
-      - Rasterize trapezoids
-      - Do solid fills and blits
-
-    These operations are provided by a vtable that the backend will
-    create when it is initialized. Initial backends:
-
-      - VMX
-      - SSE2
-      - MMX
-      - Plain Old C
-
-    When the SIMD backends are initialized, they will be passed a
-    pointer to the Plain Old C backend that they can use for fallback
-    purposes.
-
-    Images would gain a vtable as well that would contain things like
-
-      - Read scanline
-      - Write scanline
-
-    (Or even read_patch/write_patch as suggested by Keith a while
-    back).
-
-    This could simplify the compositing code considerably.
-
-  - Review the pixman_format_code_t enum to make sure it will support
-    future formats. Some formats we will probably need:
-
-    	   ARGB/ABGR with 16/32/64 bit integer/floating channels
-	   YUV2,
-	   YV12
-
-    Also we may need the ability to distinguish between PICT_c8 and
-    PICT_x4c4. (This could be done by interpreting the A channel as
-    the depth for TYPE_COLOR and TYPE_GRAY formats).
-
-    A possibility may be to reserve the two top bits and make them
-    encode "number of places to shift the channel widths given" Since
-    these bits are 00 at the moment everything will continue to work,
-    but these additional widths will be allowed:
-
-    	     All even widths between 18-32
-	     All multiples of four widths between 33 and 64
-	     All multiples of eight between 64 and 128
-
-    This means things like r21g22b21 won't work - is that worth
-    worrying about? I don't think so. And of course the bpp field
-    can't handle a depth of over 256, so > 64 bit channels arent'
-    really all that useful.
-
-    We could reserve one extra bit to indicate floating point, but
-    we may also just add 
-
-       	   PIXMAN_TYPE_ARGB_FLOAT
-	   PIXMAN_TYPE_BGRA_FLOAT
-	   PIXMAN_TYPE_A_FLOAT
-    
-    image types. With five bits we can support up to 32 different
-    format types, which should be enough for everybody, even if we
-    decide to support all the various video formats here:
-
-    	        http://www.fourcc.org/yuv.php
-
-    It may make sense to have a PIXMAN_TYPE_YUV, and then use the
-    channel bits to specify the exact subtype.
-
-    Another possibility is to add 
-
-      	  PIXMAN_TYPE_ARGB_W
-	  PIXMAN_TYPE_ARGB_WW
-    
-    where the channel widths would get 16 and 32 added to them,
-    respectively.
-
-    What about color spaces such a linear vs. srGB etc.?
-
-
-done:
-
-- Use pixmanFillsse2 and pixmanBltsse2
-
-- Be consistent about calling sse2 sse2
-
-- Rename "SSE" to "MMX_EXTENSIONS". (Deleted mmx extensions).
-
-- Commented-out uses of fbCompositeCopyAreasse2()
-
-- Consider whether calling regions region16 is really such a great
-  idea. Vlad wants 32 bit regions for Cairo. This will break X server
-  ABI, but should otherwise be mostly harmless, though a
-  pixman_region_get_boxes16() may be useful.
-
-- Altivec signal issue (Company has fix, there is also a patch by
-  dwmw2 in rawhide).
-
-- Behdad's MMX issue - see list
-
-- SSE2 issues:
-    - Crashes in Mozilla because of unaligned stack. Possible fixes
-        - Make use of gcc 4.2 feature to align the stack
-        - Write some sort of trampoline that aligns the stack
-          before calling SSE functions.
-
-- Get rid of the switch-of-doom; replace it with a big table
-  describing the various fast paths.
-
-- Make source clipping optional.
-    - done: source clipping happens through an indirection.
-        still needs to make the indirection settable. (And call it
-        from X)
-
-- Run cairo test suite; fix bugs
-	- one bug in source-scale-clip
-
- - Remove the warning suppression in the ACCESS_MEM macro and fix the
-    warnings that are real
-	- irrelevant now.
-
-- make the wrapper functions global instead of image specific
-	- this won't work since pixman is linked to both fb and wfb
-
-- Add non-mmx solid fill
-
-- Make sure the endian-ness macros are defined correctly.
-
-- The rectangles in a region probably shouldn't be returned const as
-  the X server will be changing them.
-
-- Right now we _always_ have a clip region, which is empty by default.
-  Why does this work at all? It probably doesn't. The server
-  distinguishes two cases, one where nothing is clipped (CT_NONE), and
-  one where there is a clip region (CT_REGION).
-
-- Default clip region should be the full image
-
-  - Test if pseudo color still works. It does, but it also shows that
-    copying a pixman_indexed_t on every composite operation is not
-    going to fly. So, for now set_indexed() does not copy the 
-    indexed table. 
-
-    Also just the malloc() to allocate a pixman image shows up pretty
-    high.
-
-    Options include
-
-      - Make all the setters not copy their arguments
-
-      - Possibly combined with going back to the stack allocated 
-        approach that we already use for regions.
-
-      - Keep a cached pixman_image_t around for every picture. It would
-        have to be kept uptodate every time something changes about the
-        picture.
-
-      - Break the X server ABI and simply have the relevant parameter
-        stored in the pixman image. This would have the additional benefits
-        that:
-
-          - We can get rid of the annoying repeat field which is duplicated
-            elsewhere.
-
-          - We can use pixman_color_t and pixman_gradient_stop_t
-            etc. instead of the types that are defined in
-            renderproto.h
-
+  - Testing
+    - Test implementations against each other
+    - Test both with and without the operator strength reduction.
+      They shold be identical.
+
+  - SSE 2 issues:
+
+      - Use MM_HINT_NTA instead of MM_HINT_T0
+
+      - Use of fbCompositeOver_x888x8x8888sse2()
+
+  - Update the RLEASING file
+
+  - Things to keep in mind if breaking ABI:
+
+      - There should be a guard #ifndef I_AM_EITHER_CAIRO_OR_THE_X_SERVER
+
+      - X server will require 16.16 essentially forever. Can we get
+        the required precision by simply adding offset_x/y to the
+        relevant rendering API?
+
+      - Get rid of workaround for X server bug.
+
+      - pixman_image_set_indexed() should copy its argument, and X
+        should be ported over to use a pixman_image as the
+        representation of a Picture, rather than creating one on each
+        operation.
+
+      - We should get rid of pixman_set_static_pointers()
+
+      - We should get rid of the various trapezoid helper functions().
+        (They only exist because they are theoretically available to
+        drivers).
+
+      - 16 bit regions should be deleted
+
+      - There should only be one trap rasterization API.
+
+      - The PIXMAN_g8/c8/etc formats should use the A channel
+        to indicate the actual depth. That way PIXMAN_x4c4 and PIXMAN_c8
+	won't collide.
+
+  - Maybe bite the bullet and make configure.ac generate a pixman-types.h
+    file that can be included from pixman.h to avoid the #ifdef magic
+    in pixman.h
+
+  - Make pixman_region_point_in() survive a NULL box, then fix up
+    pixman-compose.c
+
+      - Possibly look into inlining the fetch functions
+
+  - There is a bug with source clipping demonstrated by clip-test in the
+    test directory. If we interprete source clipping as given in
+    destination coordinates, which is probably the only sane choice,
+    then the result should have two red bars down the sides.
+    
+  - Test suite
+
+  - Add a general way of dealing with architecture specific
+    fast-paths.  The current idea is to have each operation that can
+    be optimized is called through a function pointer that is
+    initially set to an initialization function that is responsible for
+    setting the function pointer to the appropriate fast-path.
+
+  - Go through things marked FIXME
+
+  - Add calls to prepare and finish access where necessary.  grep for
+    ACCESS_MEM, and make sure they are correctly wrapped in prepare
+    and finish.
+
+  - restore READ/WRITE in the fbcompose combiners since they sometimes
+    store directly to destination drawables.
+
+  - It probably makes sense to move the more strange X region API
+    into pixman as well, but guarded with PIXMAN_XORG_COMPATIBILITY
+
+  - Reinstate the FbBits typedef? At the moment we don't
+    even have the FbBits type; we just use uint32_t everywhere.
+
+    Keith says in bug 2335:
+
+        The 64-bit code in fb (pixman) is probably broken; it hasn't been
+        used in quite some time as PCI (and AGP) is 32-bits wide, so
+        doing things 64-bits at a time is a net loss.  To quickly fix
+        this, I suggest just using 32-bit datatypes by setting
+        IC_SHIFT to 5 for all machines.
+
+  - Consider optimizing the 8/16 bit solid fills in pixman-util.c by
+    storing more than one value at a time.
+
+  - Add an image cache to prevent excessive malloc/free. Note that pixman
+    needs to be thread safe when used from cairo.
+
+  - Moving to 24.8 coordinates. This is tricky because X is still
+    defined as 16.16 and will be basically forever. It's possible we
+    could do this by adding extra offset_x/y parameters to the
+    trapezoid calls. The X server could then just call the API with
+    (0, 0). Cairo would have to make sure that the delta *within* a
+    batch of trapezoids does not exceed 16 bit.
+
+  - Consider adding actual backends. Brain dump:
+
+    A backend is something that knows how to
+
+      - Create images
+      - Composite three images
+      - Rasterize trapezoids
+      - Do solid fills and blits
+
+    These operations are provided by a vtable that the backend will
+    create when it is initialized. Initial backends:
+
+      - VMX
+      - SSE2
+      - MMX
+      - Plain Old C
+
+    When the SIMD backends are initialized, they will be passed a
+    pointer to the Plain Old C backend that they can use for fallback
+    purposes.
+
+    Images would gain a vtable as well that would contain things like
+
+      - Read scanline
+      - Write scanline
+
+    (Or even read_patch/write_patch as suggested by Keith a while
+    back).
+
+    This could simplify the compositing code considerably.
+
+  - Review the pixman_format_code_t enum to make sure it will support
+    future formats. Some formats we will probably need:
+
+    	   ARGB/ABGR with 16/32/64 bit integer/floating channels
+	   YUV2,
+	   YV12
+
+    Also we may need the ability to distinguish between PICT_c8 and
+    PICT_x4c4. (This could be done by interpreting the A channel as
+    the depth for TYPE_COLOR and TYPE_GRAY formats).
+
+    A possibility may be to reserve the two top bits and make them
+    encode "number of places to shift the channel widths given" Since
+    these bits are 00 at the moment everything will continue to work,
+    but these additional widths will be allowed:
+
+    	     All even widths between 18-32
+	     All multiples of four widths between 33 and 64
+	     All multiples of eight between 64 and 128
+
+    This means things like r21g22b21 won't work - is that worth
+    worrying about? I don't think so. And of course the bpp field
+    can't handle a depth of over 256, so > 64 bit channels arent'
+    really all that useful.
+
+    We could reserve one extra bit to indicate floating point, but
+    we may also just add 
+
+       	   PIXMAN_TYPE_ARGB_FLOAT
+	   PIXMAN_TYPE_BGRA_FLOAT
+	   PIXMAN_TYPE_A_FLOAT
+    
+    image types. With five bits we can support up to 32 different
+    format types, which should be enough for everybody, even if we
+    decide to support all the various video formats here:
+
+    	        http://www.fourcc.org/yuv.php
+
+    It may make sense to have a PIXMAN_TYPE_YUV, and then use the
+    channel bits to specify the exact subtype.
+
+    Another possibility is to add 
+
+      	  PIXMAN_TYPE_ARGB_W
+	  PIXMAN_TYPE_ARGB_WW
+    
+    where the channel widths would get 16 and 32 added to them,
+    respectively.
+
+    What about color spaces such a linear vs. srGB etc.?
+
+
+done:
+
+- Use pixmanFillsse2 and pixmanBltsse2
+
+- Be consistent about calling sse2 sse2
+
+- Rename "SSE" to "MMX_EXTENSIONS". (Deleted mmx extensions).
+
+- Commented-out uses of fbCompositeCopyAreasse2()
+
+- Consider whether calling regions region16 is really such a great
+  idea. Vlad wants 32 bit regions for Cairo. This will break X server
+  ABI, but should otherwise be mostly harmless, though a
+  pixman_region_get_boxes16() may be useful.
+
+- Altivec signal issue (Company has fix, there is also a patch by
+  dwmw2 in rawhide).
+
+- Behdad's MMX issue - see list
+
+- SSE2 issues:
+    - Crashes in Mozilla because of unaligned stack. Possible fixes
+        - Make use of gcc 4.2 feature to align the stack
+        - Write some sort of trampoline that aligns the stack
+          before calling SSE functions.
+
+- Get rid of the switch-of-doom; replace it with a big table
+  describing the various fast paths.
+
+- Make source clipping optional.
+    - done: source clipping happens through an indirection.
+        still needs to make the indirection settable. (And call it
+        from X)
+
+- Run cairo test suite; fix bugs
+	- one bug in source-scale-clip
+
+ - Remove the warning suppression in the ACCESS_MEM macro and fix the
+    warnings that are real
+	- irrelevant now.
+
+- make the wrapper functions global instead of image specific
+	- this won't work since pixman is linked to both fb and wfb
+
+- Add non-mmx solid fill
+
+- Make sure the endian-ness macros are defined correctly.
+
+- The rectangles in a region probably shouldn't be returned const as
+  the X server will be changing them.
+
+- Right now we _always_ have a clip region, which is empty by default.
+  Why does this work at all? It probably doesn't. The server
+  distinguishes two cases, one where nothing is clipped (CT_NONE), and
+  one where there is a clip region (CT_REGION).
+
+- Default clip region should be the full image
+
+  - Test if pseudo color still works. It does, but it also shows that
+    copying a pixman_indexed_t on every composite operation is not
+    going to fly. So, for now set_indexed() does not copy the 
+    indexed table. 
+
+    Also just the malloc() to allocate a pixman image shows up pretty
+    high.
+
+    Options include
+
+      - Make all the setters not copy their arguments
+
+      - Possibly combined with going back to the stack allocated 
+        approach that we already use for regions.
+
+      - Keep a cached pixman_image_t around for every picture. It would
+        have to be kept uptodate every time something changes about the
+        picture.
+
+      - Break the X server ABI and simply have the relevant parameter
+        stored in the pixman image. This would have the additional benefits
+        that:
+
+          - We can get rid of the annoying repeat field which is duplicated
+            elsewhere.
+
+          - We can use pixman_color_t and pixman_gradient_stop_t
+            etc. instead of the types that are defined in
+            renderproto.h
+
diff --git a/pixman/pixman-1-uninstalled.pc.in b/pixman/pixman-1-uninstalled.pc.in
index e0347d010..c15e86547 100644
--- a/pixman/pixman-1-uninstalled.pc.in
+++ b/pixman/pixman-1-uninstalled.pc.in
@@ -1,5 +1,5 @@
-Name: Pixman
-Description: The pixman library (version 1)
-Version: @PACKAGE_VERSION@
-Cflags: -I${pc_top_builddir}/${pcfiledir}/pixman
-Libs: ${pc_top_builddir}/${pcfiledir}/pixman/libpixman-1.la
+Name: Pixman
+Description: The pixman library (version 1)
+Version: @PACKAGE_VERSION@
+Cflags: -I${pc_top_builddir}/${pcfiledir}/pixman
+Libs: ${pc_top_builddir}/${pcfiledir}/pixman/libpixman-1.la
diff --git a/pixman/pixman-1.pc.in b/pixman/pixman-1.pc.in
index 936d95db0..e44361749 100644
--- a/pixman/pixman-1.pc.in
+++ b/pixman/pixman-1.pc.in
@@ -1,11 +1,11 @@
-prefix=@prefix@
-exec_prefix=@exec_prefix@
-libdir=@libdir@
-includedir=@includedir@
-
-Name: Pixman
-Description: The pixman library (version 1)
-Version: @PACKAGE_VERSION@
-Cflags: -I${includedir}/pixman-1 @DEP_CFLAGS@
-Libs: -L${libdir} -lpixman-1 @DEP_LIBS@
-
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: Pixman
+Description: The pixman library (version 1)
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/pixman-1 @DEP_CFLAGS@
+Libs: -L${libdir} -lpixman-1 @DEP_LIBS@
+
diff --git a/pixman/pixman/make-combine.pl b/pixman/pixman/make-combine.pl
index 210a5da12..417bdf085 100644
--- a/pixman/pixman/make-combine.pl
+++ b/pixman/pixman/make-combine.pl
@@ -1,86 +1,86 @@
-$usage = "Usage: combine.pl { 8 | 16 } < pixman-combine.c.template";
-
-$#ARGV == 0 or die $usage;
-
-# Get the component size.
-$size = int($ARGV[0]);
-$size == 8 or $size == 16 or die $usage;
-
-$pixel_size = $size * 4;
-$half_pixel_size = $size * 2;
-
-sub mask {
-    my $str = shift;
-    my $suffix;
-    $suffix = "ULL" if $size > 8;
-
-    return "0x" . $str . $suffix;
-}
-
-# Generate mask strings.
-$nibbles = $size / 4;
-$mask = "f" x $nibbles;
-$zero_mask = "0" x $nibbles;
-$one_half = "8" . "0" x ($nibbles - 1);
-
-print "/* WARNING: This file is generated by combine.pl from combine.inc.\n";
-print "   Please edit one of those files rather than this one. */\n";
-print "\n";
-
-print "#line 1 \"pixman-combine.c.template\"\n";
-
-$mask_ = mask($mask);
-$one_half_ = mask($one_half);
-$g_mask = mask($mask . $zero_mask);
-$b_mask = mask($mask . $zero_mask x 2);
-$a_mask = mask($mask . $zero_mask x 3);
-$rb_mask = mask($mask . $zero_mask . $mask);
-$ag_mask = mask($mask . $zero_mask . $mask . $zero_mask);
-$rb_one_half = mask($one_half . $zero_mask . $one_half);
-$rb_mask_plus_one = mask("1" . $zero_mask x 2 . "1" .  $zero_mask);
-
-while (<STDIN>) {
-    # Mask and 1/2 value for a single component.
-    s/#define COMPONENT_SIZE\b/$& $size/;
-    s/#define MASK\b/$& $mask_/;
-    s/#define ONE_HALF\b/$& $one_half_/;
-
-    # Shifts and masks for green, blue, and alpha.
-    s/#define G_SHIFT\b/$& $size/;
-    s/#define R_SHIFT\b/$& $size * 2/;
-    s/#define A_SHIFT\b/$& $size * 3/;
-    s/#define G_MASK\b/$& $g_mask/;
-    s/#define R_MASK\b/$& $b_mask/;
-    s/#define A_MASK\b/$& $a_mask/;
-
-    # Special values for dealing with red + blue at the same time.
-    s/#define RB_MASK\b/$& $rb_mask/;
-    s/#define AG_MASK\b/$& $ag_mask/;
-    s/#define RB_ONE_HALF\b/$& $rb_one_half/;
-    s/#define RB_MASK_PLUS_ONE\b/$& $rb_mask_plus_one/;
-
-    # Add 32/64 suffix to combining function types.
-    s/\bCombineFunc\b/CombineFunc$pixel_size/;
-    s/\bFbComposeFunctions\b/FbComposeFunctions$pixel_size/;
-    s/combine_width/combine_$pixel_size/;
-    s/_pixman_setup_combiner_functions_width/_pixman_setup_combiner_functions_$pixel_size/;
-    s/UNc/UN$size/g;
-    s/ALPHA_c/ALPHA_$size/g;
-    s/RED_c/RED_$size/g;
-    s/GREEN_c/GREEN_$size/g;
-    s/BLUE_c/BLUE_$size/g;
-
-    # Convert comp*_t values into the appropriate real types.
-    s/comp1_t/uint${size}_t/g;
-    s/comp2_t/uint${half_pixel_size}_t/g;
-    s/comp4_t/uint${pixel_size}_t/g;
-
-    # Change the function table name for the 64-bit version.
-    s/pixman_composeFunctions/pixman_composeFunctions64/ if $size == 16;
-
-    # Change the header for the 64-bit version
-    s/pixman-combine.h/pixman-combine64.h/ if $size == 16;
-    s/pixman-combine.h/pixman-combine32.h/ if $size == 8;
-
-    print;
-}
+$usage = "Usage: combine.pl { 8 | 16 } < pixman-combine.c.template";
+
+$#ARGV == 0 or die $usage;
+
+# Get the component size.
+$size = int($ARGV[0]);
+$size == 8 or $size == 16 or die $usage;
+
+$pixel_size = $size * 4;
+$half_pixel_size = $size * 2;
+
+sub mask {
+    my $str = shift;
+    my $suffix;
+    $suffix = "ULL" if $size > 8;
+
+    return "0x" . $str . $suffix;
+}
+
+# Generate mask strings.
+$nibbles = $size / 4;
+$mask = "f" x $nibbles;
+$zero_mask = "0" x $nibbles;
+$one_half = "8" . "0" x ($nibbles - 1);
+
+print "/* WARNING: This file is generated by combine.pl from combine.inc.\n";
+print "   Please edit one of those files rather than this one. */\n";
+print "\n";
+
+print "#line 1 \"pixman-combine.c.template\"\n";
+
+$mask_ = mask($mask);
+$one_half_ = mask($one_half);
+$g_mask = mask($mask . $zero_mask);
+$b_mask = mask($mask . $zero_mask x 2);
+$a_mask = mask($mask . $zero_mask x 3);
+$rb_mask = mask($mask . $zero_mask . $mask);
+$ag_mask = mask($mask . $zero_mask . $mask . $zero_mask);
+$rb_one_half = mask($one_half . $zero_mask . $one_half);
+$rb_mask_plus_one = mask("1" . $zero_mask x 2 . "1" .  $zero_mask);
+
+while (<STDIN>) {
+    # Mask and 1/2 value for a single component.
+    s/#define COMPONENT_SIZE\b/$& $size/;
+    s/#define MASK\b/$& $mask_/;
+    s/#define ONE_HALF\b/$& $one_half_/;
+
+    # Shifts and masks for green, blue, and alpha.
+    s/#define G_SHIFT\b/$& $size/;
+    s/#define R_SHIFT\b/$& $size * 2/;
+    s/#define A_SHIFT\b/$& $size * 3/;
+    s/#define G_MASK\b/$& $g_mask/;
+    s/#define R_MASK\b/$& $b_mask/;
+    s/#define A_MASK\b/$& $a_mask/;
+
+    # Special values for dealing with red + blue at the same time.
+    s/#define RB_MASK\b/$& $rb_mask/;
+    s/#define AG_MASK\b/$& $ag_mask/;
+    s/#define RB_ONE_HALF\b/$& $rb_one_half/;
+    s/#define RB_MASK_PLUS_ONE\b/$& $rb_mask_plus_one/;
+
+    # Add 32/64 suffix to combining function types.
+    s/\bCombineFunc\b/CombineFunc$pixel_size/;
+    s/\bFbComposeFunctions\b/FbComposeFunctions$pixel_size/;
+    s/combine_width/combine_$pixel_size/;
+    s/_pixman_setup_combiner_functions_width/_pixman_setup_combiner_functions_$pixel_size/;
+    s/UNc/UN$size/g;
+    s/ALPHA_c/ALPHA_$size/g;
+    s/RED_c/RED_$size/g;
+    s/GREEN_c/GREEN_$size/g;
+    s/BLUE_c/BLUE_$size/g;
+
+    # Convert comp*_t values into the appropriate real types.
+    s/comp1_t/uint${size}_t/g;
+    s/comp2_t/uint${half_pixel_size}_t/g;
+    s/comp4_t/uint${pixel_size}_t/g;
+
+    # Change the function table name for the 64-bit version.
+    s/pixman_composeFunctions/pixman_composeFunctions64/ if $size == 16;
+
+    # Change the header for the 64-bit version
+    s/pixman-combine.h/pixman-combine64.h/ if $size == 16;
+    s/pixman-combine.h/pixman-combine32.h/ if $size == 8;
+
+    print;
+}
diff --git a/pixman/pixman/pixman-access-accessors.c b/pixman/pixman/pixman-access-accessors.c
index 3263582f1..bde67a70e 100644
--- a/pixman/pixman/pixman-access-accessors.c
+++ b/pixman/pixman/pixman-access-accessors.c
@@ -1,3 +1,3 @@
-#define PIXMAN_FB_ACCESSORS
-
-#include "pixman-access.c"
+#define PIXMAN_FB_ACCESSORS
+
+#include "pixman-access.c"
diff --git a/pixman/pixman/pixman-arm-neon-asm.h b/pixman/pixman/pixman-arm-neon-asm.h
index 0ba67d05f..97adc6a87 100644
--- a/pixman/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman/pixman-arm-neon-asm.h
@@ -1,1177 +1,1177 @@
-/*
- * Copyright © 2009 Nokia Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
- */
-
-/*
- * This file contains a macro ('generate_composite_function') which can
- * construct 2D image processing functions, based on a common template.
- * Any combinations of source, destination and mask images with 8bpp,
- * 16bpp, 24bpp, 32bpp color formats are supported.
- *
- * This macro takes care of:
- *  - handling of leading and trailing unaligned pixels
- *  - doing most of the work related to L2 cache preload
- *  - encourages the use of software pipelining for better instructions
- *    scheduling
- *
- * The user of this macro has to provide some configuration parameters
- * (bit depths for the images, prefetch distance, etc.) and a set of
- * macros, which should implement basic code chunks responsible for
- * pixels processing. See 'pixman-arm-neon-asm.S' file for the usage
- * examples.
- *
- * TODO:
- *  - try overlapped pixel method (from Ian Rickards) when processing
- *    exactly two blocks of pixels
- *  - maybe add an option to do reverse scanline processing
- */
-
-/*
- * Bit flags for 'generate_composite_function' macro which are used
- * to tune generated functions behavior.
- */
-.set FLAG_DST_WRITEONLY,       0
-.set FLAG_DST_READWRITE,       1
-.set FLAG_DEINTERLEAVE_32BPP,  2
-
-/*
- * Offset in stack where mask and source pointer/stride can be accessed
- * from 'init' macro. This is useful for doing special handling for solid mask.
- */
-.set ARGS_STACK_OFFSET,        40
-
-/*
- * Constants for selecting preferable prefetch type.
- */
-.set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
-.set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
-.set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */
-
-/*
- * Definitions of supplementary pixld/pixst macros (for partial load/store of
- * pixel data).
- */
-
-.macro pixldst1 op, elem_size, reg1, mem_operand, abits
-.if abits > 0
-    op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
-.else
-    op&.&elem_size {d&reg1}, [&mem_operand&]!
-.endif
-.endm
-
-.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
-.if abits > 0
-    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
-.else
-    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
-.endif
-.endm
-
-.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
-.if abits > 0
-    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
-.else
-    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
-.endif
-.endm
-
-.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
-    op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
-.endm
-
-.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
-    op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
-.endm
-
-.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
-    op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
-.endm
-
-.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
-.if numbytes == 32
-    pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
-                              %(basereg+6), %(basereg+7), mem_operand, abits
-.elseif numbytes == 16
-    pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
-.elseif numbytes == 8
-    pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
-.elseif numbytes == 4
-    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
-        pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
-    .elseif elem_size == 16
-        pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
-        pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
-    .else
-        pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
-        pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
-        pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
-        pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
-    .endif
-.elseif numbytes == 2
-    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
-        pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
-    .else
-        pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
-        pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
-    .endif
-.elseif numbytes == 1
-    pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
-.else
-    .error "unsupported size: numbytes"
-.endif
-.endm
-
-.macro pixld numpix, bpp, basereg, mem_operand, abits=0
-.if bpp > 0
-.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
-    pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
-                      %(basereg+6), %(basereg+7), mem_operand, abits
-.elseif (bpp == 24) && (numpix == 8)
-    pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
-.elseif (bpp == 24) && (numpix == 4)
-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
-.elseif (bpp == 24) && (numpix == 2)
-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
-.elseif (bpp == 24) && (numpix == 1)
-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
-.else
-    pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
-.endif
-.endif
-.endm
-
-.macro pixst numpix, bpp, basereg, mem_operand, abits=0
-.if bpp > 0
-.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
-    pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
-                      %(basereg+6), %(basereg+7), mem_operand, abits
-.elseif (bpp == 24) && (numpix == 8)
-    pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
-.elseif (bpp == 24) && (numpix == 4)
-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
-.elseif (bpp == 24) && (numpix == 2)
-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
-.elseif (bpp == 24) && (numpix == 1)
-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
-.else
-    pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
-.endif
-.endif
-.endm
-
-.macro pixld_a numpix, bpp, basereg, mem_operand
-.if (bpp * numpix) <= 128
-    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
-.else
-    pixld numpix, bpp, basereg, mem_operand, 128
-.endif
-.endm
-
-.macro pixst_a numpix, bpp, basereg, mem_operand
-.if (bpp * numpix) <= 128
-    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
-.else
-    pixst numpix, bpp, basereg, mem_operand, 128
-.endif
-.endm
-
-/*
- * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
- * aliases to be defined)
- */
-.macro pixld1_s elem_size, reg1, mem_operand
-.if elem_size == 16
-    mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X
-    add     TMP1, mem_operand, TMP1, asl #1
-    mov     TMP2, VX, asr #16
-    add     VX, VX, UNIT_X
-    add     TMP2, mem_operand, TMP2, asl #1
-    vld1.16 {d&reg1&[0]}, [TMP1, :16]
-    mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X
-    add     TMP1, mem_operand, TMP1, asl #1
-    vld1.16 {d&reg1&[1]}, [TMP2, :16]
-    mov     TMP2, VX, asr #16
-    add     VX, VX, UNIT_X
-    add     TMP2, mem_operand, TMP2, asl #1
-    vld1.16 {d&reg1&[2]}, [TMP1, :16]
-    vld1.16 {d&reg1&[3]}, [TMP2, :16]
-.elseif elem_size == 32
-    mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X
-    add     TMP1, mem_operand, TMP1, asl #2
-    mov     TMP2, VX, asr #16
-    add     VX, VX, UNIT_X
-    add     TMP2, mem_operand, TMP2, asl #2
-    vld1.32 {d&reg1&[0]}, [TMP1, :32]
-    vld1.32 {d&reg1&[1]}, [TMP2, :32]
-.else
-    .error "unsupported"
-.endif
-.endm
-
-.macro pixld2_s elem_size, reg1, reg2, mem_operand
-.if elem_size == 32
-    mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X, asl #1
-    add     TMP1, mem_operand, TMP1, asl #2
-    mov     TMP2, VX, asr #16
-    sub     VX, VX, UNIT_X
-    add     TMP2, mem_operand, TMP2, asl #2
-    vld1.32 {d&reg1&[0]}, [TMP1, :32]
-    mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X, asl #1
-    add     TMP1, mem_operand, TMP1, asl #2
-    vld1.32 {d&reg2&[0]}, [TMP2, :32]
-    mov     TMP2, VX, asr #16
-    add     VX, VX, UNIT_X
-    add     TMP2, mem_operand, TMP2, asl #2
-    vld1.32 {d&reg1&[1]}, [TMP1, :32]
-    vld1.32 {d&reg2&[1]}, [TMP2, :32]
-.else
-    pixld1_s elem_size, reg1, mem_operand
-    pixld1_s elem_size, reg2, mem_operand
-.endif
-.endm
-
-.macro pixld0_s elem_size, reg1, idx, mem_operand
-.if elem_size == 16
-    mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X
-    add     TMP1, mem_operand, TMP1, asl #1
-    vld1.16 {d&reg1&[idx]}, [TMP1, :16]
-.elseif elem_size == 32
-    mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X
-    add     TMP1, mem_operand, TMP1, asl #2
-    vld1.32 {d&reg1&[idx]}, [TMP1, :32]
-.endif
-.endm
-
-.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
-.if numbytes == 32
-    pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
-    pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
-    pixdeinterleave elem_size, %(basereg+4)
-.elseif numbytes == 16
-    pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
-.elseif numbytes == 8
-    pixld1_s elem_size, %(basereg+1), mem_operand
-.elseif numbytes == 4
-    .if elem_size == 32
-        pixld0_s elem_size, %(basereg+0), 1, mem_operand
-    .elseif elem_size == 16
-        pixld0_s elem_size, %(basereg+0), 2, mem_operand
-        pixld0_s elem_size, %(basereg+0), 3, mem_operand
-    .else
-        pixld0_s elem_size, %(basereg+0), 4, mem_operand
-        pixld0_s elem_size, %(basereg+0), 5, mem_operand
-        pixld0_s elem_size, %(basereg+0), 6, mem_operand
-        pixld0_s elem_size, %(basereg+0), 7, mem_operand
-    .endif
-.elseif numbytes == 2
-    .if elem_size == 16
-        pixld0_s elem_size, %(basereg+0), 1, mem_operand
-    .else
-        pixld0_s elem_size, %(basereg+0), 2, mem_operand
-        pixld0_s elem_size, %(basereg+0), 3, mem_operand
-    .endif
-.elseif numbytes == 1
-    pixld0_s elem_size, %(basereg+0), 1, mem_operand
-.else
-    .error "unsupported size: numbytes"
-.endif
-.endm
-
-.macro pixld_s numpix, bpp, basereg, mem_operand
-.if bpp > 0
-    pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
-.endif
-.endm
-
-.macro vuzp8 reg1, reg2
-    vuzp.8 d&reg1, d&reg2
-.endm
-
-.macro vzip8 reg1, reg2
-    vzip.8 d&reg1, d&reg2
-.endm
-
-/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
-.macro pixdeinterleave bpp, basereg
-.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
-    vuzp8 %(basereg+0), %(basereg+1)
-    vuzp8 %(basereg+2), %(basereg+3)
-    vuzp8 %(basereg+1), %(basereg+3)
-    vuzp8 %(basereg+0), %(basereg+2)
-.endif
-.endm
-
-/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
-.macro pixinterleave bpp, basereg
-.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
-    vzip8 %(basereg+0), %(basereg+2)
-    vzip8 %(basereg+1), %(basereg+3)
-    vzip8 %(basereg+2), %(basereg+3)
-    vzip8 %(basereg+0), %(basereg+1)
-.endif
-.endm
-
-/*
- * This is a macro for implementing cache preload. The main idea is that
- * cache preload logic is mostly independent from the rest of pixels
- * processing code. It starts at the top left pixel and moves forward
- * across pixels and can jump across scanlines. Prefetch distance is
- * handled in an 'incremental' way: it starts from 0 and advances to the
- * optimal distance over time. After reaching optimal prefetch distance,
- * it is kept constant. There are some checks which prevent prefetching
- * unneeded pixel lines below the image (but it still can prefetch a bit
- * more data on the right side of the image - not a big issue and may
- * be actually helpful when rendering text glyphs). Additional trick is
- * the use of LDR instruction for prefetch instead of PLD when moving to
- * the next line, the point is that we have a high chance of getting TLB
- * miss in this case, and PLD would be useless.
- *
- * This sounds like it may introduce a noticeable overhead (when working with
- * fully cached data). But in reality, due to having a separate pipeline and
- * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can
- * execute simultaneously with NEON and be completely shadowed by it. Thus
- * we get no performance overhead at all (*). This looks like a very nice
- * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
- * but still can implement some rather advanced prefetch logic in sofware
- * for almost zero cost!
- *
- * (*) The overhead of the prefetcher is visible when running some trivial
- * pixels processing like simple copy. Anyway, having prefetch is a must
- * when working with the graphics data.
- */
-.macro PF a, x:vararg
-.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
-    a x
-.endif
-.endm
-
-.macro cache_preload std_increment, boost_increment
-.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
-.if regs_shortage
-    PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
-.endif
-.if std_increment != 0
-    PF add PF_X, PF_X, #std_increment
-.endif
-    PF tst PF_CTL, #0xF
-    PF addne PF_X, PF_X, #boost_increment
-    PF subne PF_CTL, PF_CTL, #1
-    PF cmp PF_X, ORIG_W
-.if src_bpp_shift >= 0
-    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
-.endif
-.if dst_r_bpp != 0
-    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
-.endif
-.if mask_bpp_shift >= 0
-    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
-.endif
-    PF subge PF_X, PF_X, ORIG_W
-    PF subges PF_CTL, PF_CTL, #0x10
-.if src_bpp_shift >= 0
-    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
-.endif
-.if dst_r_bpp != 0
-    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
-.endif
-.if mask_bpp_shift >= 0
-    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
-.endif
-.endif
-.endm
-
-.macro cache_preload_simple
-.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
-.if src_bpp > 0
-    pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
-.endif
-.if dst_r_bpp > 0
-    pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
-.endif
-.if mask_bpp > 0
-    pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
-.endif
-.endif
-.endm
-
-.macro fetch_mask_pixblock
-    pixld       pixblock_size, mask_bpp, \
-                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
-.endm
-
-/*
- * Macro which is used to process leading pixels until destination
- * pointer is properly aligned (at 16 bytes boundary). When destination
- * buffer uses 16bpp format, this is unnecessary, or even pointless.
- */
-.macro ensure_destination_ptr_alignment process_pixblock_head, \
-                                        process_pixblock_tail, \
-                                        process_pixblock_tail_head
-.if dst_w_bpp != 24
-    tst         DST_R, #0xF
-    beq         2f
-
-.irp lowbit, 1, 2, 4, 8, 16
-local skip1
-.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
-.if lowbit < 16 /* we don't need more than 16-byte alignment */
-    tst         DST_R, #lowbit
-    beq         1f
-.endif
-    pixld_src   (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
-    pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
-.if dst_r_bpp > 0
-    pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
-.else
-    add         DST_R, DST_R, #lowbit
-.endif
-    PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
-    sub         W, W, #(lowbit * 8 / dst_w_bpp)
-1:
-.endif
-.endr
-    pixdeinterleave src_bpp, src_basereg
-    pixdeinterleave mask_bpp, mask_basereg
-    pixdeinterleave dst_r_bpp, dst_r_basereg
-
-    process_pixblock_head
-    cache_preload 0, pixblock_size
-    cache_preload_simple
-    process_pixblock_tail
-
-    pixinterleave dst_w_bpp, dst_w_basereg
-.irp lowbit, 1, 2, 4, 8, 16
-.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
-.if lowbit < 16 /* we don't need more than 16-byte alignment */
-    tst         DST_W, #lowbit
-    beq         1f
-.endif
-    pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
-1:
-.endif
-.endr
-.endif
-2:
-.endm
-
-/*
- * Special code for processing up to (pixblock_size - 1) remaining
- * trailing pixels. As SIMD processing performs operation on
- * pixblock_size pixels, anything smaller than this has to be loaded
- * and stored in a special way. Loading and storing of pixel data is
- * performed in such a way that we fill some 'slots' in the NEON
- * registers (some slots naturally are unused), then perform compositing
- * operation as usual. In the end, the data is taken from these 'slots'
- * and saved to memory.
- *
- * cache_preload_flag - allows to suppress prefetch if
- *                      set to 0
- * dst_aligned_flag   - selects whether destination buffer
- *                      is aligned
- */
-.macro process_trailing_pixels cache_preload_flag, \
-                               dst_aligned_flag, \
-                               process_pixblock_head, \
-                               process_pixblock_tail, \
-                               process_pixblock_tail_head
-    tst         W, #(pixblock_size - 1)
-    beq         2f
-.irp chunk_size, 16, 8, 4, 2, 1
-.if pixblock_size > chunk_size
-    tst         W, #chunk_size
-    beq         1f
-    pixld_src   chunk_size, src_bpp, src_basereg, SRC
-    pixld       chunk_size, mask_bpp, mask_basereg, MASK
-.if dst_aligned_flag != 0
-    pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
-.else
-    pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R
-.endif
-.if cache_preload_flag != 0
-    PF add      PF_X, PF_X, #chunk_size
-.endif
-1:
-.endif
-.endr
-    pixdeinterleave src_bpp, src_basereg
-    pixdeinterleave mask_bpp, mask_basereg
-    pixdeinterleave dst_r_bpp, dst_r_basereg
-
-    process_pixblock_head
-.if cache_preload_flag != 0
-    cache_preload 0, pixblock_size
-    cache_preload_simple
-.endif
-    process_pixblock_tail
-    pixinterleave dst_w_bpp, dst_w_basereg
-.irp chunk_size, 16, 8, 4, 2, 1
-.if pixblock_size > chunk_size
-    tst         W, #chunk_size
-    beq         1f
-.if dst_aligned_flag != 0
-    pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W
-.else
-    pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W
-.endif
-1:
-.endif
-.endr
-2:
-.endm
-
-/*
- * Macro, which performs all the needed operations to switch to the next
- * scanline and start the next loop iteration unless all the scanlines
- * are already processed.
- */
-.macro advance_to_next_scanline start_of_loop_label
-.if regs_shortage
-    ldrd        W, [sp] /* load W and H (width and height) from stack */
-.else
-    mov         W, ORIG_W
-.endif
-    add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
-.if src_bpp != 0
-    add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
-.endif
-.if mask_bpp != 0
-    add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
-.endif
-.if (dst_w_bpp != 24)
-    sub         DST_W, DST_W, W, lsl #dst_bpp_shift
-.endif
-.if (src_bpp != 24) && (src_bpp != 0)
-    sub         SRC, SRC, W, lsl #src_bpp_shift
-.endif
-.if (mask_bpp != 24) && (mask_bpp != 0)
-    sub         MASK, MASK, W, lsl #mask_bpp_shift
-.endif
-    subs        H, H, #1
-    mov         DST_R, DST_W
-.if regs_shortage
-    str         H, [sp, #4] /* save updated height to stack */
-.endif
-    bge         start_of_loop_label
-.endm
-
-/*
- * Registers are allocated in the following way by default:
- * d0, d1, d2, d3     - reserved for loading source pixel data
- * d4, d5, d6, d7     - reserved for loading destination pixel data
- * d24, d25, d26, d27 - reserved for loading mask pixel data
- * d28, d29, d30, d31 - final destination pixel data for writeback to memory
- */
-.macro generate_composite_function fname, \
-                                   src_bpp_, \
-                                   mask_bpp_, \
-                                   dst_w_bpp_, \
-                                   flags, \
-                                   pixblock_size_, \
-                                   prefetch_distance, \
-                                   init, \
-                                   cleanup, \
-                                   process_pixblock_head, \
-                                   process_pixblock_tail, \
-                                   process_pixblock_tail_head, \
-                                   dst_w_basereg_ = 28, \
-                                   dst_r_basereg_ = 4, \
-                                   src_basereg_   = 0, \
-                                   mask_basereg_  = 24
-
-    .func fname
-    .global fname
-    /* For ELF format also set function visibility to hidden */
-#ifdef __ELF__
-    .hidden fname
-    .type fname, %function
-#endif
-fname:
-    push        {r4-r12, lr}        /* save all registers */
-
-/*
- * Select prefetch type for this function. If prefetch distance is
- * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
- * has to be used instead of ADVANCED.
- */
-    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
-.if prefetch_distance == 0
-    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
-.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
-        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
-    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
-.endif
-
-/*
- * Make some macro arguments globally visible and accessible
- * from other macros
- */
-    .set src_bpp, src_bpp_
-    .set mask_bpp, mask_bpp_
-    .set dst_w_bpp, dst_w_bpp_
-    .set pixblock_size, pixblock_size_
-    .set dst_w_basereg, dst_w_basereg_
-    .set dst_r_basereg, dst_r_basereg_
-    .set src_basereg, src_basereg_
-    .set mask_basereg, mask_basereg_
-
-    .macro pixld_src x:vararg
-        pixld x
-    .endm
-    .macro fetch_src_pixblock
-        pixld_src   pixblock_size, src_bpp, \
-                    (src_basereg - pixblock_size * src_bpp / 64), SRC
-    .endm
-/*
- * Assign symbolic names to registers
- */
-    W           .req        r0      /* width (is updated during processing) */
-    H           .req        r1      /* height (is updated during processing) */
-    DST_W       .req        r2      /* destination buffer pointer for writes */
-    DST_STRIDE  .req        r3      /* destination image stride */
-    SRC         .req        r4      /* source buffer pointer */
-    SRC_STRIDE  .req        r5      /* source image stride */
-    DST_R       .req        r6      /* destination buffer pointer for reads */
-
-    MASK        .req        r7      /* mask pointer */
-    MASK_STRIDE .req        r8      /* mask stride */
-
-    PF_CTL      .req        r9      /* combined lines counter and prefetch */
-                                    /* distance increment counter */
-    PF_X        .req        r10     /* pixel index in a scanline for current */
-                                    /* pretetch position */
-    PF_SRC      .req        r11     /* pointer to source scanline start */
-                                    /* for prefetch purposes */
-    PF_DST      .req        r12     /* pointer to destination scanline start */
-                                    /* for prefetch purposes */
-    PF_MASK     .req        r14     /* pointer to mask scanline start */
-                                    /* for prefetch purposes */
-/*
- * Check whether we have enough registers for all the local variables.
- * If we don't have enough registers, original width and height are
- * kept on top of stack (and 'regs_shortage' variable is set to indicate
- * this for the rest of code). Even if there are enough registers, the
- * allocation scheme may be a bit different depending on whether source
- * or mask is not used.
- */
-.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
-    ORIG_W      .req        r10     /* saved original width */
-    DUMMY       .req        r12     /* temporary register */
-    .set        regs_shortage, 0
-.elseif mask_bpp == 0
-    ORIG_W      .req        r7      /* saved original width */
-    DUMMY       .req        r8      /* temporary register */
-    .set        regs_shortage, 0
-.elseif src_bpp == 0
-    ORIG_W      .req        r4      /* saved original width */
-    DUMMY       .req        r5      /* temporary register */
-    .set        regs_shortage, 0
-.else
-    ORIG_W      .req        r1      /* saved original width */
-    DUMMY       .req        r1      /* temporary register */
-    .set        regs_shortage, 1
-.endif
-
-    .set mask_bpp_shift, -1
-.if src_bpp == 32
-    .set src_bpp_shift, 2
-.elseif src_bpp == 24
-    .set src_bpp_shift, 0
-.elseif src_bpp == 16
-    .set src_bpp_shift, 1
-.elseif src_bpp == 8
-    .set src_bpp_shift, 0
-.elseif src_bpp == 0
-    .set src_bpp_shift, -1
-.else
-    .error "requested src bpp (src_bpp) is not supported"
-.endif
-.if mask_bpp == 32
-    .set mask_bpp_shift, 2
-.elseif mask_bpp == 24
-    .set mask_bpp_shift, 0
-.elseif mask_bpp == 8
-    .set mask_bpp_shift, 0
-.elseif mask_bpp == 0
-    .set mask_bpp_shift, -1
-.else
-    .error "requested mask bpp (mask_bpp) is not supported"
-.endif
-.if dst_w_bpp == 32
-    .set dst_bpp_shift, 2
-.elseif dst_w_bpp == 24
-    .set dst_bpp_shift, 0
-.elseif dst_w_bpp == 16
-    .set dst_bpp_shift, 1
-.elseif dst_w_bpp == 8
-    .set dst_bpp_shift, 0
-.else
-    .error "requested dst bpp (dst_w_bpp) is not supported"
-.endif
-
-.if (((flags) & FLAG_DST_READWRITE) != 0)
-    .set dst_r_bpp, dst_w_bpp
-.else
-    .set dst_r_bpp, 0
-.endif
-.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
-    .set DEINTERLEAVE_32BPP_ENABLED, 1
-.else
-    .set DEINTERLEAVE_32BPP_ENABLED, 0
-.endif
-
-.if prefetch_distance < 0 || prefetch_distance > 15
-    .error "invalid prefetch distance (prefetch_distance)"
-.endif
-
-.if src_bpp > 0
-    ldr         SRC, [sp, #40]
-.endif
-.if mask_bpp > 0
-    ldr         MASK, [sp, #48]
-.endif
-    PF mov      PF_X, #0
-.if src_bpp > 0
-    ldr         SRC_STRIDE, [sp, #44]
-.endif
-.if mask_bpp > 0
-    ldr         MASK_STRIDE, [sp, #52]
-.endif
-    mov         DST_R, DST_W
-
-.if src_bpp == 24
-    sub         SRC_STRIDE, SRC_STRIDE, W
-    sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
-.endif
-.if mask_bpp == 24
-    sub         MASK_STRIDE, MASK_STRIDE, W
-    sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
-.endif
-.if dst_w_bpp == 24
-    sub         DST_STRIDE, DST_STRIDE, W
-    sub         DST_STRIDE, DST_STRIDE, W, lsl #1
-.endif
-
-/*
- * Setup advanced prefetcher initial state
- */
-    PF mov      PF_SRC, SRC
-    PF mov      PF_DST, DST_R
-    PF mov      PF_MASK, MASK
-    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
-    PF mov      PF_CTL, H, lsl #4
-    PF add      PF_CTL, #(prefetch_distance - 0x10)
-
-    init
-.if regs_shortage
-    push        {r0, r1}
-.endif
-    subs        H, H, #1
-.if regs_shortage
-    str         H, [sp, #4] /* save updated height to stack */
-.else
-    mov         ORIG_W, W
-.endif
-    blt         9f
-    cmp         W, #(pixblock_size * 2)
-    blt         8f
-/*
- * This is the start of the pipelined loop, which if optimized for
- * long scanlines
- */
-0:
-    ensure_destination_ptr_alignment process_pixblock_head, \
-                                     process_pixblock_tail, \
-                                     process_pixblock_tail_head
-
-    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
-    pixld_a     pixblock_size, dst_r_bpp, \
-                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
-    fetch_src_pixblock
-    pixld       pixblock_size, mask_bpp, \
-                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
-    PF add      PF_X, PF_X, #pixblock_size
-    process_pixblock_head
-    cache_preload 0, pixblock_size
-    cache_preload_simple
-    subs        W, W, #(pixblock_size * 2)
-    blt         2f
-1:
-    process_pixblock_tail_head
-    cache_preload_simple
-    subs        W, W, #pixblock_size
-    bge         1b
-2:
-    process_pixblock_tail
-    pixst_a     pixblock_size, dst_w_bpp, \
-                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
-
-    /* Process the remaining trailing pixels in the scanline */
-    process_trailing_pixels 1, 1, \
-                            process_pixblock_head, \
-                            process_pixblock_tail, \
-                            process_pixblock_tail_head
-    advance_to_next_scanline 0b
-
-.if regs_shortage
-    pop         {r0, r1}
-.endif
-    cleanup
-    pop         {r4-r12, pc}  /* exit */
-/*
- * This is the start of the loop, designed to process images with small width
- * (less than pixblock_size * 2 pixels). In this case neither pipelining
- * nor prefetch are used.
- */
-8:
-    /* Process exactly pixblock_size pixels if needed */
-    tst         W, #pixblock_size
-    beq         1f
-    pixld       pixblock_size, dst_r_bpp, \
-                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
-    fetch_src_pixblock
-    pixld       pixblock_size, mask_bpp, \
-                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
-    process_pixblock_head
-    process_pixblock_tail
-    pixst       pixblock_size, dst_w_bpp, \
-                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
-1:
-    /* Process the remaining trailing pixels in the scanline */
-    process_trailing_pixels 0, 0, \
-                            process_pixblock_head, \
-                            process_pixblock_tail, \
-                            process_pixblock_tail_head
-    advance_to_next_scanline 8b
-9:
-.if regs_shortage
-    pop         {r0, r1}
-.endif
-    cleanup
-    pop         {r4-r12, pc}  /* exit */
-
-    .purgem     fetch_src_pixblock
-    .purgem     pixld_src
-
-    .unreq      SRC
-    .unreq      MASK
-    .unreq      DST_R
-    .unreq      DST_W
-    .unreq      ORIG_W
-    .unreq      W
-    .unreq      H
-    .unreq      SRC_STRIDE
-    .unreq      DST_STRIDE
-    .unreq      MASK_STRIDE
-    .unreq      PF_CTL
-    .unreq      PF_X
-    .unreq      PF_SRC
-    .unreq      PF_DST
-    .unreq      PF_MASK
-    .unreq      DUMMY
-    .endfunc
-.endm
-
-/*
- * A simplified variant of function generation template for a single
- * scanline processing (for implementing pixman combine functions)
- */
-.macro generate_composite_function_scanline        use_nearest_scaling, \
-                                                   fname, \
-                                                   src_bpp_, \
-                                                   mask_bpp_, \
-                                                   dst_w_bpp_, \
-                                                   flags, \
-                                                   pixblock_size_, \
-                                                   init, \
-                                                   cleanup, \
-                                                   process_pixblock_head, \
-                                                   process_pixblock_tail, \
-                                                   process_pixblock_tail_head, \
-                                                   dst_w_basereg_ = 28, \
-                                                   dst_r_basereg_ = 4, \
-                                                   src_basereg_   = 0, \
-                                                   mask_basereg_  = 24
-
-    .func fname
-    .global fname
-    /* For ELF format also set function visibility to hidden */
-#ifdef __ELF__
-    .hidden fname
-    .type fname, %function
-#endif
-fname:
-    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
-/*
- * Make some macro arguments globally visible and accessible
- * from other macros
- */
-    .set src_bpp, src_bpp_
-    .set mask_bpp, mask_bpp_
-    .set dst_w_bpp, dst_w_bpp_
-    .set pixblock_size, pixblock_size_
-    .set dst_w_basereg, dst_w_basereg_
-    .set dst_r_basereg, dst_r_basereg_
-    .set src_basereg, src_basereg_
-    .set mask_basereg, mask_basereg_
-
-.if use_nearest_scaling != 0
-    /*
-     * Assign symbolic names to registers for nearest scaling
-     */
-    W           .req        r0
-    DST_W       .req        r1
-    SRC         .req        r2
-    VX          .req        r3
-    UNIT_X      .req        ip
-    MASK        .req        lr
-    TMP1        .req        r4
-    TMP2        .req        r5
-    DST_R       .req        r6
-
-    .macro pixld_src x:vararg
-        pixld_s x
-    .endm
-
-    ldr         UNIT_X, [sp]
-    push        {r4-r6, lr}
-    .if mask_bpp != 0
-    ldr         MASK, [sp, #(16 + 4)]
-    .endif
-.else
-    /*
-     * Assign symbolic names to registers
-     */
-    W           .req        r0      /* width (is updated during processing) */
-    DST_W       .req        r1      /* destination buffer pointer for writes */
-    SRC         .req        r2      /* source buffer pointer */
-    DST_R       .req        ip      /* destination buffer pointer for reads */
-    MASK        .req        r3      /* mask pointer */
-
-    .macro pixld_src x:vararg
-        pixld x
-    .endm
-.endif
-
-.if (((flags) & FLAG_DST_READWRITE) != 0)
-    .set dst_r_bpp, dst_w_bpp
-.else
-    .set dst_r_bpp, 0
-.endif
-.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
-    .set DEINTERLEAVE_32BPP_ENABLED, 1
-.else
-    .set DEINTERLEAVE_32BPP_ENABLED, 0
-.endif
-
-    .macro fetch_src_pixblock
-        pixld_src   pixblock_size, src_bpp, \
-                    (src_basereg - pixblock_size * src_bpp / 64), SRC
-    .endm
-
-    init
-    mov         DST_R, DST_W
-
-    cmp         W, #pixblock_size
-    blt         8f
-
-    ensure_destination_ptr_alignment process_pixblock_head, \
-                                     process_pixblock_tail, \
-                                     process_pixblock_tail_head
-
-    subs        W, W, #pixblock_size
-    blt         7f
-
-    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
-    pixld_a     pixblock_size, dst_r_bpp, \
-                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
-    fetch_src_pixblock
-    pixld       pixblock_size, mask_bpp, \
-                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
-    process_pixblock_head
-    subs        W, W, #pixblock_size
-    blt         2f
-1:
-    process_pixblock_tail_head
-    subs        W, W, #pixblock_size
-    bge         1b
-2:
-    process_pixblock_tail
-    pixst_a     pixblock_size, dst_w_bpp, \
-                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
-7:
-    /* Process the remaining trailing pixels in the scanline (dst aligned) */
-    process_trailing_pixels 0, 1, \
-                            process_pixblock_head, \
-                            process_pixblock_tail, \
-                            process_pixblock_tail_head
-
-    cleanup
-.if use_nearest_scaling != 0
-    pop         {r4-r6, pc}  /* exit */
-.else
-    bx          lr  /* exit */
-.endif
-8:
-    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
-    process_trailing_pixels 0, 0, \
-                            process_pixblock_head, \
-                            process_pixblock_tail, \
-                            process_pixblock_tail_head
-
-    cleanup
-
-.if use_nearest_scaling != 0
-    pop         {r4-r6, pc}  /* exit */
-
-    .unreq      DST_R
-    .unreq      SRC
-    .unreq      W
-    .unreq      VX
-    .unreq      UNIT_X
-    .unreq      TMP1
-    .unreq      TMP2
-    .unreq      DST_W
-    .unreq      MASK
-
-.else
-    bx          lr  /* exit */
-
-    .unreq      SRC
-    .unreq      MASK
-    .unreq      DST_R
-    .unreq      DST_W
-    .unreq      W
-.endif
-
-    .purgem     fetch_src_pixblock
-    .purgem     pixld_src
-
-    .endfunc
-.endm
-
-.macro generate_composite_function_single_scanline x:vararg
-    generate_composite_function_scanline 0, x
-.endm
-
-.macro generate_composite_function_nearest_scanline x:vararg
-    generate_composite_function_scanline 1, x
-.endm
-
-/* Default prologue/epilogue, nothing special needs to be done */
-
-.macro default_init
-.endm
-
-.macro default_cleanup
-.endm
-
-/*
- * Prologue/epilogue variant which additionally saves/restores d8-d15
- * registers (they need to be saved/restored by callee according to ABI).
- * This is required if the code needs to use all the NEON registers.
- */
-
-.macro default_init_need_all_regs
-    vpush       {d8-d15}
-.endm
-
-.macro default_cleanup_need_all_regs
-    vpop        {d8-d15}
-.endm
-
-/******************************************************************************/
-
-/*
- * Conversion of 8 r5g6b6 pixels packed in 128-bit register (in)
- * into a planar a8r8g8b8 format (with a, r, g, b color components
- * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
- *
- * Warning: the conversion is destructive and the original
- *          value (in) is lost.
- */
-.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
-    vshrn.u16   out_r, in,    #8
-    vshrn.u16   out_g, in,    #3
-    vsli.u16    in,    in,    #5
-    vmov.u8     out_a, #255
-    vsri.u8     out_r, out_r, #5
-    vsri.u8     out_g, out_g, #6
-    vshrn.u16   out_b, in,    #2
-.endm
-
-.macro convert_0565_to_x888 in, out_r, out_g, out_b
-    vshrn.u16   out_r, in,    #8
-    vshrn.u16   out_g, in,    #3
-    vsli.u16    in,    in,    #5
-    vsri.u8     out_r, out_r, #5
-    vsri.u8     out_g, out_g, #6
-    vshrn.u16   out_b, in,    #2
-.endm
-
-/*
- * Conversion from planar a8r8g8b8 format (with a, r, g, b color components
- * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b6
- * pixels packed in 128-bit register (out). Requires two temporary 128-bit
- * registers (tmp1, tmp2)
- */
-.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
-    vshll.u8    tmp1, in_g, #8
-    vshll.u8    out, in_r, #8
-    vshll.u8    tmp2, in_b, #8
-    vsri.u16    out, tmp1, #5
-    vsri.u16    out, tmp2, #11
-.endm
-
-/*
- * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
- * returned in (out0, out1) registers pair. Requires one temporary
- * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
- * value from 'in' is lost
- */
-.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
-    vshl.u16    out0, in,   #5  /* G top 6 bits */
-    vshl.u16    tmp,  in,   #11 /* B top 5 bits */
-    vsri.u16    in,   in,   #5  /* R is ready in top bits */
-    vsri.u16    out0, out0, #6  /* G is ready in top bits */
-    vsri.u16    tmp,  tmp,  #5  /* B is ready in top bits */
-    vshr.u16    out1, in,   #8  /* R is in place */
-    vsri.u16    out0, tmp,  #8  /* G & B is in place */
-    vzip.u16    out0, out1      /* everything is in place */
-.endm
+/*
+ * Copyright © 2009 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+/*
+ * This file contains a macro ('generate_composite_function') which can
+ * construct 2D image processing functions, based on a common template.
+ * Any combinations of source, destination and mask images with 8bpp,
+ * 16bpp, 24bpp, 32bpp color formats are supported.
+ *
+ * This macro takes care of:
+ *  - handling of leading and trailing unaligned pixels
+ *  - doing most of the work related to L2 cache preload
+ *  - encourages the use of software pipelining for better instructions
+ *    scheduling
+ *
+ * The user of this macro has to provide some configuration parameters
+ * (bit depths for the images, prefetch distance, etc.) and a set of
+ * macros, which should implement basic code chunks responsible for
+ * pixels processing. See 'pixman-arm-neon-asm.S' file for the usage
+ * examples.
+ *
+ * TODO:
+ *  - try overlapped pixel method (from Ian Rickards) when processing
+ *    exactly two blocks of pixels
+ *  - maybe add an option to do reverse scanline processing
+ */
+
+/*
+ * Bit flags for 'generate_composite_function' macro which are used
+ * to tune generated functions behavior.
+ */
+.set FLAG_DST_WRITEONLY,       0
+.set FLAG_DST_READWRITE,       1
+.set FLAG_DEINTERLEAVE_32BPP,  2
+
+/*
+ * Offset in stack where mask and source pointer/stride can be accessed
+ * from 'init' macro. This is useful for doing special handling for solid mask.
+ */
+.set ARGS_STACK_OFFSET,        40
+
+/*
+ * Constants for selecting preferable prefetch type.
+ */
+.set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
+.set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
+.set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */
+
+/*
+ * Definitions of supplementary pixld/pixst macros (for partial load/store of
+ * pixel data).
+ */
+
+.macro pixldst1 op, elem_size, reg1, mem_operand, abits
+.if abits > 0
+    op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
+.else
+    op&.&elem_size {d&reg1}, [&mem_operand&]!
+.endif
+.endm
+
+.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
+.if abits > 0
+    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
+.else
+    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
+.endif
+.endm
+
+.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
+.if abits > 0
+    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
+.else
+    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
+.endif
+.endm
+
+.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
+    op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
+.endm
+
+.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
+    op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
+.endm
+
+.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
+    op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
+.endm
+
+.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
+.if numbytes == 32
+    pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
+                              %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif numbytes == 16
+    pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
+.elseif numbytes == 8
+    pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
+.elseif numbytes == 4
+    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
+        pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
+    .elseif elem_size == 16
+        pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
+        pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
+    .else
+        pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
+        pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
+        pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
+        pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
+    .endif
+.elseif numbytes == 2
+    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
+        pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
+    .else
+        pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
+        pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
+    .endif
+.elseif numbytes == 1
+    pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
+.else
+    .error "unsupported size: numbytes"
+.endif
+.endm
+
+.macro pixld numpix, bpp, basereg, mem_operand, abits=0
+.if bpp > 0
+.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+    pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
+                      %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif (bpp == 24) && (numpix == 8)
+    pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
+.elseif (bpp == 24) && (numpix == 4)
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
+.elseif (bpp == 24) && (numpix == 2)
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
+.elseif (bpp == 24) && (numpix == 1)
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.else
+    pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
+.endif
+.endif
+.endm
+
+.macro pixst numpix, bpp, basereg, mem_operand, abits=0
+.if bpp > 0
+.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+    pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
+                      %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif (bpp == 24) && (numpix == 8)
+    pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
+.elseif (bpp == 24) && (numpix == 4)
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
+.elseif (bpp == 24) && (numpix == 2)
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
+.elseif (bpp == 24) && (numpix == 1)
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.else
+    pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
+.endif
+.endif
+.endm
+
+.macro pixld_a numpix, bpp, basereg, mem_operand
+.if (bpp * numpix) <= 128
+    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.else
+    pixld numpix, bpp, basereg, mem_operand, 128
+.endif
+.endm
+
+.macro pixst_a numpix, bpp, basereg, mem_operand
+.if (bpp * numpix) <= 128
+    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.else
+    pixst numpix, bpp, basereg, mem_operand, 128
+.endif
+.endm
+
+/*
+ * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
+ * aliases to be defined)
+ */
+.macro pixld1_s elem_size, reg1, mem_operand
+.if elem_size == 16
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP1, mem_operand, TMP1, asl #1
+    mov     TMP2, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #1
+    vld1.16 {d&reg1&[0]}, [TMP1, :16]
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP1, mem_operand, TMP1, asl #1
+    vld1.16 {d&reg1&[1]}, [TMP2, :16]
+    mov     TMP2, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #1
+    vld1.16 {d&reg1&[2]}, [TMP1, :16]
+    vld1.16 {d&reg1&[3]}, [TMP2, :16]
+.elseif elem_size == 32
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP1, mem_operand, TMP1, asl #2
+    mov     TMP2, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #2
+    vld1.32 {d&reg1&[0]}, [TMP1, :32]
+    vld1.32 {d&reg1&[1]}, [TMP2, :32]
+.else
+    .error "unsupported"
+.endif
+.endm
+
+.macro pixld2_s elem_size, reg1, reg2, mem_operand
+.if elem_size == 32
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X, asl #1
+    add     TMP1, mem_operand, TMP1, asl #2
+    mov     TMP2, VX, asr #16
+    sub     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #2
+    vld1.32 {d&reg1&[0]}, [TMP1, :32]
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X, asl #1
+    add     TMP1, mem_operand, TMP1, asl #2
+    vld1.32 {d&reg2&[0]}, [TMP2, :32]
+    mov     TMP2, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #2
+    vld1.32 {d&reg1&[1]}, [TMP1, :32]
+    vld1.32 {d&reg2&[1]}, [TMP2, :32]
+.else
+    pixld1_s elem_size, reg1, mem_operand
+    pixld1_s elem_size, reg2, mem_operand
+.endif
+.endm
+
+.macro pixld0_s elem_size, reg1, idx, mem_operand
+.if elem_size == 16
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP1, mem_operand, TMP1, asl #1
+    vld1.16 {d&reg1&[idx]}, [TMP1, :16]
+.elseif elem_size == 32
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP1, mem_operand, TMP1, asl #2
+    vld1.32 {d&reg1&[idx]}, [TMP1, :32]
+.endif
+.endm
+
+.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
+.if numbytes == 32
+    pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
+    pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
+    pixdeinterleave elem_size, %(basereg+4)
+.elseif numbytes == 16
+    pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
+.elseif numbytes == 8
+    pixld1_s elem_size, %(basereg+1), mem_operand
+.elseif numbytes == 4
+    .if elem_size == 32
+        pixld0_s elem_size, %(basereg+0), 1, mem_operand
+    .elseif elem_size == 16
+        pixld0_s elem_size, %(basereg+0), 2, mem_operand
+        pixld0_s elem_size, %(basereg+0), 3, mem_operand
+    .else
+        pixld0_s elem_size, %(basereg+0), 4, mem_operand
+        pixld0_s elem_size, %(basereg+0), 5, mem_operand
+        pixld0_s elem_size, %(basereg+0), 6, mem_operand
+        pixld0_s elem_size, %(basereg+0), 7, mem_operand
+    .endif
+.elseif numbytes == 2
+    .if elem_size == 16
+        pixld0_s elem_size, %(basereg+0), 1, mem_operand
+    .else
+        pixld0_s elem_size, %(basereg+0), 2, mem_operand
+        pixld0_s elem_size, %(basereg+0), 3, mem_operand
+    .endif
+.elseif numbytes == 1
+    pixld0_s elem_size, %(basereg+0), 1, mem_operand
+.else
+    .error "unsupported size: numbytes"
+.endif
+.endm
+
+.macro pixld_s numpix, bpp, basereg, mem_operand
+.if bpp > 0
+    pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
+.endif
+.endm
+
+.macro vuzp8 reg1, reg2
+    vuzp.8 d&reg1, d&reg2
+.endm
+
+.macro vzip8 reg1, reg2
+    vzip.8 d&reg1, d&reg2
+.endm
+
+/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
+.macro pixdeinterleave bpp, basereg
+.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+    vuzp8 %(basereg+0), %(basereg+1)
+    vuzp8 %(basereg+2), %(basereg+3)
+    vuzp8 %(basereg+1), %(basereg+3)
+    vuzp8 %(basereg+0), %(basereg+2)
+.endif
+.endm
+
+/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
+.macro pixinterleave bpp, basereg
+.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+    vzip8 %(basereg+0), %(basereg+2)
+    vzip8 %(basereg+1), %(basereg+3)
+    vzip8 %(basereg+2), %(basereg+3)
+    vzip8 %(basereg+0), %(basereg+1)
+.endif
+.endm
+
+/*
+ * This is a macro for implementing cache preload. The main idea is that
+ * cache preload logic is mostly independent from the rest of pixels
+ * processing code. It starts at the top left pixel and moves forward
+ * across pixels and can jump across scanlines. Prefetch distance is
+ * handled in an 'incremental' way: it starts from 0 and advances to the
+ * optimal distance over time. After reaching optimal prefetch distance,
+ * it is kept constant. There are some checks which prevent prefetching
+ * unneeded pixel lines below the image (but it still can prefetch a bit
+ * more data on the right side of the image - not a big issue and may
+ * be actually helpful when rendering text glyphs). Additional trick is
+ * the use of LDR instruction for prefetch instead of PLD when moving to
+ * the next line, the point is that we have a high chance of getting TLB
+ * miss in this case, and PLD would be useless.
+ *
+ * This sounds like it may introduce a noticeable overhead (when working with
+ * fully cached data). But in reality, due to having a separate pipeline and
+ * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can
+ * execute simultaneously with NEON and be completely shadowed by it. Thus
+ * we get no performance overhead at all (*). This looks like a very nice
+ * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
+ * but still can implement some rather advanced prefetch logic in sofware
+ * for almost zero cost!
+ *
+ * (*) The overhead of the prefetcher is visible when running some trivial
+ * pixels processing like simple copy. Anyway, having prefetch is a must
+ * when working with the graphics data.
+ */
+.macro PF a, x:vararg
+.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
+    a x
+.endif
+.endm
+
+.macro cache_preload std_increment, boost_increment
+.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
+.if regs_shortage
+    PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
+.endif
+.if std_increment != 0
+    PF add PF_X, PF_X, #std_increment
+.endif
+    PF tst PF_CTL, #0xF
+    PF addne PF_X, PF_X, #boost_increment
+    PF subne PF_CTL, PF_CTL, #1
+    PF cmp PF_X, ORIG_W
+.if src_bpp_shift >= 0
+    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+.endif
+.if dst_r_bpp != 0
+    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+.endif
+.if mask_bpp_shift >= 0
+    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+.endif
+    PF subge PF_X, PF_X, ORIG_W
+    PF subges PF_CTL, PF_CTL, #0x10
+.if src_bpp_shift >= 0
+    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+.endif
+.if dst_r_bpp != 0
+    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+.endif
+.if mask_bpp_shift >= 0
+    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+.endif
+.endif
+.endm
+
+.macro cache_preload_simple
+.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
+.if src_bpp > 0
+    pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
+.endif
+.if dst_r_bpp > 0
+    pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
+.endif
+.if mask_bpp > 0
+    pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
+.endif
+.endif
+.endm
+
+.macro fetch_mask_pixblock
+    pixld       pixblock_size, mask_bpp, \
+                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+.endm
+
+/*
+ * Macro which is used to process leading pixels until destination
+ * pointer is properly aligned (at 16 bytes boundary). When destination
+ * buffer uses 16bpp format, this is unnecessary, or even pointless.
+ */
+.macro ensure_destination_ptr_alignment process_pixblock_head, \
+                                        process_pixblock_tail, \
+                                        process_pixblock_tail_head
+.if dst_w_bpp != 24
+    tst         DST_R, #0xF
+    beq         2f
+
+.irp lowbit, 1, 2, 4, 8, 16
+local skip1
+.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if lowbit < 16 /* we don't need more than 16-byte alignment */
+    tst         DST_R, #lowbit
+    beq         1f
+.endif
+    pixld_src   (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
+    pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
+.if dst_r_bpp > 0
+    pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
+.else
+    add         DST_R, DST_R, #lowbit
+.endif
+    PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
+    sub         W, W, #(lowbit * 8 / dst_w_bpp)
+1:
+.endif
+.endr
+    pixdeinterleave src_bpp, src_basereg
+    pixdeinterleave mask_bpp, mask_basereg
+    pixdeinterleave dst_r_bpp, dst_r_basereg
+
+    process_pixblock_head
+    cache_preload 0, pixblock_size
+    cache_preload_simple
+    process_pixblock_tail
+
+    pixinterleave dst_w_bpp, dst_w_basereg
+.irp lowbit, 1, 2, 4, 8, 16
+.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if lowbit < 16 /* we don't need more than 16-byte alignment */
+    tst         DST_W, #lowbit
+    beq         1f
+.endif
+    pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
+1:
+.endif
+.endr
+.endif
+2:
+.endm
+
+/*
+ * Special code for processing up to (pixblock_size - 1) remaining
+ * trailing pixels. As SIMD processing performs operation on
+ * pixblock_size pixels, anything smaller than this has to be loaded
+ * and stored in a special way. Loading and storing of pixel data is
+ * performed in such a way that we fill some 'slots' in the NEON
+ * registers (some slots naturally are unused), then perform compositing
+ * operation as usual. In the end, the data is taken from these 'slots'
+ * and saved to memory.
+ *
+ * cache_preload_flag - allows to suppress prefetch if
+ *                      set to 0
+ * dst_aligned_flag   - selects whether destination buffer
+ *                      is aligned
+ */
+.macro process_trailing_pixels cache_preload_flag, \
+                               dst_aligned_flag, \
+                               process_pixblock_head, \
+                               process_pixblock_tail, \
+                               process_pixblock_tail_head
+    tst         W, #(pixblock_size - 1)
+    beq         2f
+.irp chunk_size, 16, 8, 4, 2, 1
+.if pixblock_size > chunk_size
+    tst         W, #chunk_size
+    beq         1f
+    pixld_src   chunk_size, src_bpp, src_basereg, SRC
+    pixld       chunk_size, mask_bpp, mask_basereg, MASK
+.if dst_aligned_flag != 0
+    pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+.else
+    pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+.endif
+.if cache_preload_flag != 0
+    PF add      PF_X, PF_X, #chunk_size
+.endif
+1:
+.endif
+.endr
+    pixdeinterleave src_bpp, src_basereg
+    pixdeinterleave mask_bpp, mask_basereg
+    pixdeinterleave dst_r_bpp, dst_r_basereg
+
+    process_pixblock_head
+.if cache_preload_flag != 0
+    cache_preload 0, pixblock_size
+    cache_preload_simple
+.endif
+    process_pixblock_tail
+    pixinterleave dst_w_bpp, dst_w_basereg
+.irp chunk_size, 16, 8, 4, 2, 1
+.if pixblock_size > chunk_size
+    tst         W, #chunk_size
+    beq         1f
+.if dst_aligned_flag != 0
+    pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+.else
+    pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+.endif
+1:
+.endif
+.endr
+2:
+.endm
+
+/*
+ * Macro, which performs all the needed operations to switch to the next
+ * scanline and start the next loop iteration unless all the scanlines
+ * are already processed.
+ */
+.macro advance_to_next_scanline start_of_loop_label
+.if regs_shortage
+    ldrd        W, [sp] /* load W and H (width and height) from stack */
+.else
+    mov         W, ORIG_W
+.endif
+    add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
+.if src_bpp != 0
+    add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
+.endif
+.if mask_bpp != 0
+    add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
+.endif
+.if (dst_w_bpp != 24)
+    sub         DST_W, DST_W, W, lsl #dst_bpp_shift
+.endif
+.if (src_bpp != 24) && (src_bpp != 0)
+    sub         SRC, SRC, W, lsl #src_bpp_shift
+.endif
+.if (mask_bpp != 24) && (mask_bpp != 0)
+    sub         MASK, MASK, W, lsl #mask_bpp_shift
+.endif
+    subs        H, H, #1
+    mov         DST_R, DST_W
+.if regs_shortage
+    str         H, [sp, #4] /* save updated height to stack */
+.endif
+    bge         start_of_loop_label
+.endm
+
+/*
+ * Registers are allocated in the following way by default:
+ * d0, d1, d2, d3     - reserved for loading source pixel data
+ * d4, d5, d6, d7     - reserved for loading destination pixel data
+ * d24, d25, d26, d27 - reserved for loading mask pixel data
+ * d28, d29, d30, d31 - final destination pixel data for writeback to memory
+ */
+.macro generate_composite_function fname, \
+                                   src_bpp_, \
+                                   mask_bpp_, \
+                                   dst_w_bpp_, \
+                                   flags, \
+                                   pixblock_size_, \
+                                   prefetch_distance, \
+                                   init, \
+                                   cleanup, \
+                                   process_pixblock_head, \
+                                   process_pixblock_tail, \
+                                   process_pixblock_tail_head, \
+                                   dst_w_basereg_ = 28, \
+                                   dst_r_basereg_ = 4, \
+                                   src_basereg_   = 0, \
+                                   mask_basereg_  = 24
+
+    .func fname
+    .global fname
+    /* For ELF format also set function visibility to hidden */
+#ifdef __ELF__
+    .hidden fname
+    .type fname, %function
+#endif
+fname:
+    push        {r4-r12, lr}        /* save all registers */
+
+/*
+ * Select prefetch type for this function. If prefetch distance is
+ * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
+ * has to be used instead of ADVANCED.
+ */
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
+.if prefetch_distance == 0
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
+        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
+.endif
+
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+    .set src_bpp, src_bpp_
+    .set mask_bpp, mask_bpp_
+    .set dst_w_bpp, dst_w_bpp_
+    .set pixblock_size, pixblock_size_
+    .set dst_w_basereg, dst_w_basereg_
+    .set dst_r_basereg, dst_r_basereg_
+    .set src_basereg, src_basereg_
+    .set mask_basereg, mask_basereg_
+
+    .macro pixld_src x:vararg
+        pixld x
+    .endm
+    .macro fetch_src_pixblock
+        pixld_src   pixblock_size, src_bpp, \
+                    (src_basereg - pixblock_size * src_bpp / 64), SRC
+    .endm
+/*
+ * Assign symbolic names to registers
+ */
+    W           .req        r0      /* width (is updated during processing) */
+    H           .req        r1      /* height (is updated during processing) */
+    DST_W       .req        r2      /* destination buffer pointer for writes */
+    DST_STRIDE  .req        r3      /* destination image stride */
+    SRC         .req        r4      /* source buffer pointer */
+    SRC_STRIDE  .req        r5      /* source image stride */
+    DST_R       .req        r6      /* destination buffer pointer for reads */
+
+    MASK        .req        r7      /* mask pointer */
+    MASK_STRIDE .req        r8      /* mask stride */
+
+    PF_CTL      .req        r9      /* combined lines counter and prefetch */
+                                    /* distance increment counter */
+    PF_X        .req        r10     /* pixel index in a scanline for current */
+                                    /* pretetch position */
+    PF_SRC      .req        r11     /* pointer to source scanline start */
+                                    /* for prefetch purposes */
+    PF_DST      .req        r12     /* pointer to destination scanline start */
+                                    /* for prefetch purposes */
+    PF_MASK     .req        r14     /* pointer to mask scanline start */
+                                    /* for prefetch purposes */
+/*
+ * Check whether we have enough registers for all the local variables.
+ * If we don't have enough registers, original width and height are
+ * kept on top of stack (and 'regs_shortage' variable is set to indicate
+ * this for the rest of code). Even if there are enough registers, the
+ * allocation scheme may be a bit different depending on whether source
+ * or mask is not used.
+ */
+.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
+    ORIG_W      .req        r10     /* saved original width */
+    DUMMY       .req        r12     /* temporary register */
+    .set        regs_shortage, 0
+.elseif mask_bpp == 0
+    ORIG_W      .req        r7      /* saved original width */
+    DUMMY       .req        r8      /* temporary register */
+    .set        regs_shortage, 0
+.elseif src_bpp == 0
+    ORIG_W      .req        r4      /* saved original width */
+    DUMMY       .req        r5      /* temporary register */
+    .set        regs_shortage, 0
+.else
+    ORIG_W      .req        r1      /* saved original width */
+    DUMMY       .req        r1      /* temporary register */
+    .set        regs_shortage, 1
+.endif
+
+    .set mask_bpp_shift, -1
+.if src_bpp == 32
+    .set src_bpp_shift, 2
+.elseif src_bpp == 24
+    .set src_bpp_shift, 0
+.elseif src_bpp == 16
+    .set src_bpp_shift, 1
+.elseif src_bpp == 8
+    .set src_bpp_shift, 0
+.elseif src_bpp == 0
+    .set src_bpp_shift, -1
+.else
+    .error "requested src bpp (src_bpp) is not supported"
+.endif
+.if mask_bpp == 32
+    .set mask_bpp_shift, 2
+.elseif mask_bpp == 24
+    .set mask_bpp_shift, 0
+.elseif mask_bpp == 8
+    .set mask_bpp_shift, 0
+.elseif mask_bpp == 0
+    .set mask_bpp_shift, -1
+.else
+    .error "requested mask bpp (mask_bpp) is not supported"
+.endif
+.if dst_w_bpp == 32
+    .set dst_bpp_shift, 2
+.elseif dst_w_bpp == 24
+    .set dst_bpp_shift, 0
+.elseif dst_w_bpp == 16
+    .set dst_bpp_shift, 1
+.elseif dst_w_bpp == 8
+    .set dst_bpp_shift, 0
+.else
+    .error "requested dst bpp (dst_w_bpp) is not supported"
+.endif
+
+.if (((flags) & FLAG_DST_READWRITE) != 0)
+    .set dst_r_bpp, dst_w_bpp
+.else
+    .set dst_r_bpp, 0
+.endif
+.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+    .set DEINTERLEAVE_32BPP_ENABLED, 1
+.else
+    .set DEINTERLEAVE_32BPP_ENABLED, 0
+.endif
+
+.if prefetch_distance < 0 || prefetch_distance > 15
+    .error "invalid prefetch distance (prefetch_distance)"
+.endif
+
+.if src_bpp > 0
+    ldr         SRC, [sp, #40]
+.endif
+.if mask_bpp > 0
+    ldr         MASK, [sp, #48]
+.endif
+    PF mov      PF_X, #0
+.if src_bpp > 0
+    ldr         SRC_STRIDE, [sp, #44]
+.endif
+.if mask_bpp > 0
+    ldr         MASK_STRIDE, [sp, #52]
+.endif
+    mov         DST_R, DST_W
+
+.if src_bpp == 24
+    sub         SRC_STRIDE, SRC_STRIDE, W
+    sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
+.endif
+.if mask_bpp == 24
+    sub         MASK_STRIDE, MASK_STRIDE, W
+    sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
+.endif
+.if dst_w_bpp == 24
+    sub         DST_STRIDE, DST_STRIDE, W
+    sub         DST_STRIDE, DST_STRIDE, W, lsl #1
+.endif
+
+/*
+ * Setup advanced prefetcher initial state
+ */
+    PF mov      PF_SRC, SRC
+    PF mov      PF_DST, DST_R
+    PF mov      PF_MASK, MASK
+    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
+    PF mov      PF_CTL, H, lsl #4
+    PF add      PF_CTL, #(prefetch_distance - 0x10)
+
+    init
+.if regs_shortage
+    push        {r0, r1}
+.endif
+    subs        H, H, #1
+.if regs_shortage
+    str         H, [sp, #4] /* save updated height to stack */
+.else
+    mov         ORIG_W, W
+.endif
+    blt         9f
+    cmp         W, #(pixblock_size * 2)
+    blt         8f
+/*
+ * This is the start of the pipelined loop, which if optimized for
+ * long scanlines
+ */
+0:
+    ensure_destination_ptr_alignment process_pixblock_head, \
+                                     process_pixblock_tail, \
+                                     process_pixblock_tail_head
+
+    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
+    pixld_a     pixblock_size, dst_r_bpp, \
+                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+    fetch_src_pixblock
+    pixld       pixblock_size, mask_bpp, \
+                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+    PF add      PF_X, PF_X, #pixblock_size
+    process_pixblock_head
+    cache_preload 0, pixblock_size
+    cache_preload_simple
+    subs        W, W, #(pixblock_size * 2)
+    blt         2f
+1:
+    process_pixblock_tail_head
+    cache_preload_simple
+    subs        W, W, #pixblock_size
+    bge         1b
+2:
+    process_pixblock_tail
+    pixst_a     pixblock_size, dst_w_bpp, \
+                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+
+    /* Process the remaining trailing pixels in the scanline */
+    process_trailing_pixels 1, 1, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+    advance_to_next_scanline 0b
+
+.if regs_shortage
+    pop         {r0, r1}
+.endif
+    cleanup
+    pop         {r4-r12, pc}  /* exit */
+/*
+ * This is the start of the loop, designed to process images with small width
+ * (less than pixblock_size * 2 pixels). In this case neither pipelining
+ * nor prefetch are used.
+ */
+8:
+    /* Process exactly pixblock_size pixels if needed */
+    tst         W, #pixblock_size
+    beq         1f
+    pixld       pixblock_size, dst_r_bpp, \
+                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+    fetch_src_pixblock
+    pixld       pixblock_size, mask_bpp, \
+                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+    process_pixblock_head
+    process_pixblock_tail
+    pixst       pixblock_size, dst_w_bpp, \
+                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+1:
+    /* Process the remaining trailing pixels in the scanline */
+    process_trailing_pixels 0, 0, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+    advance_to_next_scanline 8b
+9:
+.if regs_shortage
+    pop         {r0, r1}
+.endif
+    cleanup
+    pop         {r4-r12, pc}  /* exit */
+
+    .purgem     fetch_src_pixblock
+    .purgem     pixld_src
+
+    .unreq      SRC
+    .unreq      MASK
+    .unreq      DST_R
+    .unreq      DST_W
+    .unreq      ORIG_W
+    .unreq      W
+    .unreq      H
+    .unreq      SRC_STRIDE
+    .unreq      DST_STRIDE
+    .unreq      MASK_STRIDE
+    .unreq      PF_CTL
+    .unreq      PF_X
+    .unreq      PF_SRC
+    .unreq      PF_DST
+    .unreq      PF_MASK
+    .unreq      DUMMY
+    .endfunc
+.endm
+
+/*
+ * A simplified variant of function generation template for a single
+ * scanline processing (for implementing pixman combine functions)
+ */
+.macro generate_composite_function_scanline        use_nearest_scaling, \
+                                                   fname, \
+                                                   src_bpp_, \
+                                                   mask_bpp_, \
+                                                   dst_w_bpp_, \
+                                                   flags, \
+                                                   pixblock_size_, \
+                                                   init, \
+                                                   cleanup, \
+                                                   process_pixblock_head, \
+                                                   process_pixblock_tail, \
+                                                   process_pixblock_tail_head, \
+                                                   dst_w_basereg_ = 28, \
+                                                   dst_r_basereg_ = 4, \
+                                                   src_basereg_   = 0, \
+                                                   mask_basereg_  = 24
+
+    .func fname
+    .global fname
+    /* For ELF format also set function visibility to hidden */
+#ifdef __ELF__
+    .hidden fname
+    .type fname, %function
+#endif
+fname:
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+    .set src_bpp, src_bpp_
+    .set mask_bpp, mask_bpp_
+    .set dst_w_bpp, dst_w_bpp_
+    .set pixblock_size, pixblock_size_
+    .set dst_w_basereg, dst_w_basereg_
+    .set dst_r_basereg, dst_r_basereg_
+    .set src_basereg, src_basereg_
+    .set mask_basereg, mask_basereg_
+
+.if use_nearest_scaling != 0
+    /*
+     * Assign symbolic names to registers for nearest scaling
+     */
+    W           .req        r0
+    DST_W       .req        r1
+    SRC         .req        r2
+    VX          .req        r3
+    UNIT_X      .req        ip
+    MASK        .req        lr
+    TMP1        .req        r4
+    TMP2        .req        r5
+    DST_R       .req        r6
+
+    .macro pixld_src x:vararg
+        pixld_s x
+    .endm
+
+    ldr         UNIT_X, [sp]
+    push        {r4-r6, lr}
+    .if mask_bpp != 0
+    ldr         MASK, [sp, #(16 + 4)]
+    .endif
+.else
+    /*
+     * Assign symbolic names to registers
+     */
+    W           .req        r0      /* width (is updated during processing) */
+    DST_W       .req        r1      /* destination buffer pointer for writes */
+    SRC         .req        r2      /* source buffer pointer */
+    DST_R       .req        ip      /* destination buffer pointer for reads */
+    MASK        .req        r3      /* mask pointer */
+
+    .macro pixld_src x:vararg
+        pixld x
+    .endm
+.endif
+
+.if (((flags) & FLAG_DST_READWRITE) != 0)
+    .set dst_r_bpp, dst_w_bpp
+.else
+    .set dst_r_bpp, 0
+.endif
+.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+    .set DEINTERLEAVE_32BPP_ENABLED, 1
+.else
+    .set DEINTERLEAVE_32BPP_ENABLED, 0
+.endif
+
+    .macro fetch_src_pixblock
+        pixld_src   pixblock_size, src_bpp, \
+                    (src_basereg - pixblock_size * src_bpp / 64), SRC
+    .endm
+
+    init
+    mov         DST_R, DST_W
+
+    cmp         W, #pixblock_size
+    blt         8f
+
+    ensure_destination_ptr_alignment process_pixblock_head, \
+                                     process_pixblock_tail, \
+                                     process_pixblock_tail_head
+
+    subs        W, W, #pixblock_size
+    blt         7f
+
+    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
+    pixld_a     pixblock_size, dst_r_bpp, \
+                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+    fetch_src_pixblock
+    pixld       pixblock_size, mask_bpp, \
+                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+    process_pixblock_head
+    subs        W, W, #pixblock_size
+    blt         2f
+1:
+    process_pixblock_tail_head
+    subs        W, W, #pixblock_size
+    bge         1b
+2:
+    process_pixblock_tail
+    pixst_a     pixblock_size, dst_w_bpp, \
+                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+7:
+    /* Process the remaining trailing pixels in the scanline (dst aligned) */
+    process_trailing_pixels 0, 1, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+
+    cleanup
+.if use_nearest_scaling != 0
+    pop         {r4-r6, pc}  /* exit */
+.else
+    bx          lr  /* exit */
+.endif
+8:
+    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
+    process_trailing_pixels 0, 0, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+
+    cleanup
+
+.if use_nearest_scaling != 0
+    pop         {r4-r6, pc}  /* exit */
+
+    .unreq      DST_R
+    .unreq      SRC
+    .unreq      W
+    .unreq      VX
+    .unreq      UNIT_X
+    .unreq      TMP1
+    .unreq      TMP2
+    .unreq      DST_W
+    .unreq      MASK
+
+.else
+    bx          lr  /* exit */
+
+    .unreq      SRC
+    .unreq      MASK
+    .unreq      DST_R
+    .unreq      DST_W
+    .unreq      W
+.endif
+
+    .purgem     fetch_src_pixblock
+    .purgem     pixld_src
+
+    .endfunc
+.endm
+
+.macro generate_composite_function_single_scanline x:vararg
+    generate_composite_function_scanline 0, x
+.endm
+
+.macro generate_composite_function_nearest_scanline x:vararg
+    generate_composite_function_scanline 1, x
+.endm
+
+/* Default prologue/epilogue, nothing special needs to be done */
+
+.macro default_init
+.endm
+
+.macro default_cleanup
+.endm
+
+/*
+ * Prologue/epilogue variant which additionally saves/restores d8-d15
+ * registers (they need to be saved/restored by callee according to ABI).
+ * This is required if the code needs to use all the NEON registers.
+ */
+
+.macro default_init_need_all_regs
+    vpush       {d8-d15}
+.endm
+
+.macro default_cleanup_need_all_regs
+    vpop        {d8-d15}
+.endm
+
+/******************************************************************************/
+
+/*
+ * Conversion of 8 r5g6b6 pixels packed in 128-bit register (in)
+ * into a planar a8r8g8b8 format (with a, r, g, b color components
+ * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
+ *
+ * Warning: the conversion is destructive and the original
+ *          value (in) is lost.
+ */
+.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
+    vshrn.u16   out_r, in,    #8
+    vshrn.u16   out_g, in,    #3
+    vsli.u16    in,    in,    #5
+    vmov.u8     out_a, #255
+    vsri.u8     out_r, out_r, #5
+    vsri.u8     out_g, out_g, #6
+    vshrn.u16   out_b, in,    #2
+.endm
+
+.macro convert_0565_to_x888 in, out_r, out_g, out_b
+    vshrn.u16   out_r, in,    #8
+    vshrn.u16   out_g, in,    #3
+    vsli.u16    in,    in,    #5
+    vsri.u8     out_r, out_r, #5
+    vsri.u8     out_g, out_g, #6
+    vshrn.u16   out_b, in,    #2
+.endm
+
+/*
+ * Conversion from planar a8r8g8b8 format (with a, r, g, b color components
+ * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b6
+ * pixels packed in 128-bit register (out). Requires two temporary 128-bit
+ * registers (tmp1, tmp2)
+ */
+.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
+    vshll.u8    tmp1, in_g, #8
+    vshll.u8    out, in_r, #8
+    vshll.u8    tmp2, in_b, #8
+    vsri.u16    out, tmp1, #5
+    vsri.u16    out, tmp2, #11
+.endm
+
+/*
+ * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
+ * returned in (out0, out1) registers pair. Requires one temporary
+ * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
+ * value from 'in' is lost
+ */
+.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
+    vshl.u16    out0, in,   #5  /* G top 6 bits */
+    vshl.u16    tmp,  in,   #11 /* B top 5 bits */
+    vsri.u16    in,   in,   #5  /* R is ready in top bits */
+    vsri.u16    out0, out0, #6  /* G is ready in top bits */
+    vsri.u16    tmp,  tmp,  #5  /* B is ready in top bits */
+    vshr.u16    out1, in,   #8  /* R is in place */
+    vsri.u16    out0, tmp,  #8  /* G & B is in place */
+    vzip.u16    out0, out1      /* everything is in place */
+.endm
diff --git a/pixman/pixman/pixman-edge-accessors.c b/pixman/pixman/pixman-edge-accessors.c
index ea3a31e2f..0f2c56e74 100644
--- a/pixman/pixman/pixman-edge-accessors.c
+++ b/pixman/pixman/pixman-edge-accessors.c
@@ -1,4 +1,4 @@
-
-#define PIXMAN_FB_ACCESSORS
-
-#include "pixman-edge.c"
+
+#define PIXMAN_FB_ACCESSORS
+
+#include "pixman-edge.c"
diff --git a/pixman/pixman/pixman-edge-imp.h b/pixman/pixman/pixman-edge-imp.h
index a4698eddb..20ffda896 100644
--- a/pixman/pixman/pixman-edge-imp.h
+++ b/pixman/pixman/pixman-edge-imp.h
@@ -1,182 +1,182 @@
-/*
- * Copyright © 2004 Keith Packard
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Keith Packard not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission.  Keith Packard makes no
- * representations about the suitability of this software for any purpose.  It
- * is provided "as is" without express or implied warranty.
- *
- * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
- * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
- * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR
- * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
- * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
- * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
- * PERFORMANCE OF THIS SOFTWARE.
- */
-
-#ifndef rasterize_span
-#endif
-
-static void
-RASTERIZE_EDGES (pixman_image_t  *image,
-		pixman_edge_t	*l,
-		pixman_edge_t	*r,
-		pixman_fixed_t		t,
-		pixman_fixed_t		b)
-{
-    pixman_fixed_t  y = t;
-    uint32_t  *line;
-    uint32_t *buf = (image)->bits.bits;
-    int stride = (image)->bits.rowstride;
-    int width = (image)->bits.width;
-
-    line = buf + pixman_fixed_to_int (y) * stride;
-
-    for (;;)
-    {
-	pixman_fixed_t	lx;
-	pixman_fixed_t      rx;
-	int	lxi;
-	int rxi;
-
-	lx = l->x;
-	rx = r->x;
-#if N_BITS == 1
-	/* For the non-antialiased case, round the coordinates up, in effect
-	 * sampling just slightly to the left of the pixel. This is so that
-	 * when the sample point lies exactly on the line, we round towards
-	 * north-west.
-	 *
-	 * (The AA case does a similar  adjustment in RENDER_SAMPLES_X)
-	 */
-	lx += X_FRAC_FIRST(1) - pixman_fixed_e;
-	rx += X_FRAC_FIRST(1) - pixman_fixed_e;
-#endif
-	/* clip X */
-	if (lx < 0)
-	    lx = 0;
-	if (pixman_fixed_to_int (rx) >= width)
-#if N_BITS == 1
-	    rx = pixman_int_to_fixed (width);
-#else
-	    /* Use the last pixel of the scanline, covered 100%.
-	     * We can't use the first pixel following the scanline,
-	     * because accessing it could result in a buffer overrun.
-	     */
-	    rx = pixman_int_to_fixed (width) - 1;
-#endif
-
-	/* Skip empty (or backwards) sections */
-	if (rx > lx)
-	{
-
-	    /* Find pixel bounds for span */
-	    lxi = pixman_fixed_to_int (lx);
-	    rxi = pixman_fixed_to_int (rx);
-
-#if N_BITS == 1
-	    {
-
-#define LEFT_MASK(x)							\
-		(((x) & 0x1f) ?						\
-		 SCREEN_SHIFT_RIGHT (0xffffffff, (x) & 0x1f) : 0)
-#define RIGHT_MASK(x)							\
-		(((32 - (x)) & 0x1f) ?					\
-		 SCREEN_SHIFT_LEFT (0xffffffff, (32 - (x)) & 0x1f) : 0)
-		
-#define MASK_BITS(x,w,l,n,r) {						\
-		    n = (w);						\
-		    r = RIGHT_MASK ((x) + n);				\
-		    l = LEFT_MASK (x);					\
-		    if (l) {						\
-			n -= 32 - ((x) & 0x1f);				\
-			if (n < 0) {					\
-			    n = 0;					\
-			    l &= r;					\
-			    r = 0;					\
-			}						\
-		    }							\
-		    n >>= 5;						\
-		}
-		
-		uint32_t  *a = line;
-		uint32_t  startmask;
-		uint32_t  endmask;
-		int	    nmiddle;
-		int	    width = rxi - lxi;
-		int	    x = lxi;
-		
-		a += x >> 5;
-		x &= 0x1f;
-		
-		MASK_BITS (x, width, startmask, nmiddle, endmask);
-
-		if (startmask) {
-		    WRITE(image, a, READ(image, a) | startmask);
-		    a++;
-		}
-		while (nmiddle--)
-		    WRITE(image, a++, 0xffffffff);
-		if (endmask)
-		    WRITE(image, a, READ(image, a) | endmask);
-	    }
-#else
-	    {
-		DEFINE_ALPHA(line,lxi);
-		int	    lxs;
-		int     rxs;
-
-		/* Sample coverage for edge pixels */
-		lxs = RENDER_SAMPLES_X (lx, N_BITS);
-		rxs = RENDER_SAMPLES_X (rx, N_BITS);
-
-		/* Add coverage across row */
-		if (lxi == rxi)
-		{
-		    ADD_ALPHA (rxs - lxs);
-		}
-		else
-		{
-		    int	xi;
-
-		    ADD_ALPHA (N_X_FRAC(N_BITS) - lxs);
-		    STEP_ALPHA;
-		    for (xi = lxi + 1; xi < rxi; xi++)
-		    {
-			ADD_ALPHA (N_X_FRAC(N_BITS));
-			STEP_ALPHA;
-		    }
-		    ADD_ALPHA (rxs);
-		}
-	    }
-#endif
-	}
-
-	if (y == b)
-	    break;
-
-#if N_BITS > 1
-	if (pixman_fixed_frac (y) != Y_FRAC_LAST(N_BITS))
-	{
-	    RENDER_EDGE_STEP_SMALL (l);
-	    RENDER_EDGE_STEP_SMALL (r);
-	    y += STEP_Y_SMALL(N_BITS);
-	}
-	else
-#endif
-	{
-	    RENDER_EDGE_STEP_BIG (l);
-	    RENDER_EDGE_STEP_BIG (r);
-	    y += STEP_Y_BIG(N_BITS);
-	    line += stride;
-	}
-    }
-}
-
-#undef rasterize_span
+/*
+ * Copyright © 2004 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef rasterize_span
+#endif
+
+static void
+RASTERIZE_EDGES (pixman_image_t  *image,
+		pixman_edge_t	*l,
+		pixman_edge_t	*r,
+		pixman_fixed_t		t,
+		pixman_fixed_t		b)
+{
+    pixman_fixed_t  y = t;
+    uint32_t  *line;
+    uint32_t *buf = (image)->bits.bits;
+    int stride = (image)->bits.rowstride;
+    int width = (image)->bits.width;
+
+    line = buf + pixman_fixed_to_int (y) * stride;
+
+    for (;;)
+    {
+	pixman_fixed_t	lx;
+	pixman_fixed_t      rx;
+	int	lxi;
+	int rxi;
+
+	lx = l->x;
+	rx = r->x;
+#if N_BITS == 1
+	/* For the non-antialiased case, round the coordinates up, in effect
+	 * sampling just slightly to the left of the pixel. This is so that
+	 * when the sample point lies exactly on the line, we round towards
+	 * north-west.
+	 *
+	 * (The AA case does a similar  adjustment in RENDER_SAMPLES_X)
+	 */
+	lx += X_FRAC_FIRST(1) - pixman_fixed_e;
+	rx += X_FRAC_FIRST(1) - pixman_fixed_e;
+#endif
+	/* clip X */
+	if (lx < 0)
+	    lx = 0;
+	if (pixman_fixed_to_int (rx) >= width)
+#if N_BITS == 1
+	    rx = pixman_int_to_fixed (width);
+#else
+	    /* Use the last pixel of the scanline, covered 100%.
+	     * We can't use the first pixel following the scanline,
+	     * because accessing it could result in a buffer overrun.
+	     */
+	    rx = pixman_int_to_fixed (width) - 1;
+#endif
+
+	/* Skip empty (or backwards) sections */
+	if (rx > lx)
+	{
+
+	    /* Find pixel bounds for span */
+	    lxi = pixman_fixed_to_int (lx);
+	    rxi = pixman_fixed_to_int (rx);
+
+#if N_BITS == 1
+	    {
+
+#define LEFT_MASK(x)							\
+		(((x) & 0x1f) ?						\
+		 SCREEN_SHIFT_RIGHT (0xffffffff, (x) & 0x1f) : 0)
+#define RIGHT_MASK(x)							\
+		(((32 - (x)) & 0x1f) ?					\
+		 SCREEN_SHIFT_LEFT (0xffffffff, (32 - (x)) & 0x1f) : 0)
+		
+#define MASK_BITS(x,w,l,n,r) {						\
+		    n = (w);						\
+		    r = RIGHT_MASK ((x) + n);				\
+		    l = LEFT_MASK (x);					\
+		    if (l) {						\
+			n -= 32 - ((x) & 0x1f);				\
+			if (n < 0) {					\
+			    n = 0;					\
+			    l &= r;					\
+			    r = 0;					\
+			}						\
+		    }							\
+		    n >>= 5;						\
+		}
+		
+		uint32_t  *a = line;
+		uint32_t  startmask;
+		uint32_t  endmask;
+		int	    nmiddle;
+		int	    width = rxi - lxi;
+		int	    x = lxi;
+		
+		a += x >> 5;
+		x &= 0x1f;
+		
+		MASK_BITS (x, width, startmask, nmiddle, endmask);
+
+		if (startmask) {
+		    WRITE(image, a, READ(image, a) | startmask);
+		    a++;
+		}
+		while (nmiddle--)
+		    WRITE(image, a++, 0xffffffff);
+		if (endmask)
+		    WRITE(image, a, READ(image, a) | endmask);
+	    }
+#else
+	    {
+		DEFINE_ALPHA(line,lxi);
+		int	    lxs;
+		int     rxs;
+
+		/* Sample coverage for edge pixels */
+		lxs = RENDER_SAMPLES_X (lx, N_BITS);
+		rxs = RENDER_SAMPLES_X (rx, N_BITS);
+
+		/* Add coverage across row */
+		if (lxi == rxi)
+		{
+		    ADD_ALPHA (rxs - lxs);
+		}
+		else
+		{
+		    int	xi;
+
+		    ADD_ALPHA (N_X_FRAC(N_BITS) - lxs);
+		    STEP_ALPHA;
+		    for (xi = lxi + 1; xi < rxi; xi++)
+		    {
+			ADD_ALPHA (N_X_FRAC(N_BITS));
+			STEP_ALPHA;
+		    }
+		    ADD_ALPHA (rxs);
+		}
+	    }
+#endif
+	}
+
+	if (y == b)
+	    break;
+
+#if N_BITS > 1
+	if (pixman_fixed_frac (y) != Y_FRAC_LAST(N_BITS))
+	{
+	    RENDER_EDGE_STEP_SMALL (l);
+	    RENDER_EDGE_STEP_SMALL (r);
+	    y += STEP_Y_SMALL(N_BITS);
+	}
+	else
+#endif
+	{
+	    RENDER_EDGE_STEP_BIG (l);
+	    RENDER_EDGE_STEP_BIG (r);
+	    y += STEP_Y_BIG(N_BITS);
+	    line += stride;
+	}
+    }
+}
+
+#undef rasterize_span
diff --git a/pixman/pixman/pixman-edge.c b/pixman/pixman/pixman-edge.c
index 8d498ab44..22b0158ba 100644
--- a/pixman/pixman/pixman-edge.c
+++ b/pixman/pixman/pixman-edge.c
@@ -1,384 +1,384 @@
-/*
- * Copyright © 2004 Keith Packard
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Keith Packard not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission.  Keith Packard makes no
- * representations about the suitability of this software for any purpose.  It
- * is provided "as is" without express or implied warranty.
- *
- * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
- * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
- * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR
- * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
- * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
- * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
- * PERFORMANCE OF THIS SOFTWARE.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#include <string.h>
-
-#include "pixman-private.h"
-#include "pixman-accessor.h"
-
-/*
- * Step across a small sample grid gap
- */
-#define RENDER_EDGE_STEP_SMALL(edge)					\
-    {									\
-	edge->x += edge->stepx_small;					\
-	edge->e += edge->dx_small;					\
-	if (edge->e > 0)						\
-	{								\
-	    edge->e -= edge->dy;					\
-	    edge->x += edge->signdx;					\
-	}								\
-    }
-
-/*
- * Step across a large sample grid gap
- */
-#define RENDER_EDGE_STEP_BIG(edge)					\
-    {									\
-	edge->x += edge->stepx_big;					\
-	edge->e += edge->dx_big;					\
-	if (edge->e > 0)						\
-	{								\
-	    edge->e -= edge->dy;					\
-	    edge->x += edge->signdx;					\
-	}								\
-    }
-
-#ifdef PIXMAN_FB_ACCESSORS
-#define PIXMAN_RASTERIZE_EDGES pixman_rasterize_edges_accessors
-#else
-#define PIXMAN_RASTERIZE_EDGES pixman_rasterize_edges_no_accessors
-#endif
-
-/*
- * 4 bit alpha
- */
-
-#define N_BITS  4
-#define RASTERIZE_EDGES rasterize_edges_4
-
-#ifndef WORDS_BIGENDIAN
-#define SHIFT_4(o)      ((o) << 2)
-#else
-#define SHIFT_4(o)      ((1 - (o)) << 2)
-#endif
-
-#define GET_4(x, o)      (((x) >> SHIFT_4 (o)) & 0xf)
-#define PUT_4(x, o, v)							\
-    (((x) & ~(0xf << SHIFT_4 (o))) | (((v) & 0xf) << SHIFT_4 (o)))
-
-#define DEFINE_ALPHA(line, x)						\
-    uint8_t   *__ap = (uint8_t *) line + ((x) >> 1);			\
-    int __ao = (x) & 1
-
-#define STEP_ALPHA      ((__ap += __ao), (__ao ^= 1))
-
-#define ADD_ALPHA(a)							\
-    {									\
-        uint8_t __o = READ (image, __ap);				\
-        uint8_t __a = (a) + GET_4 (__o, __ao);				\
-        WRITE (image, __ap, PUT_4 (__o, __ao, __a | (0 - ((__a) >> 4)))); \
-    }
-
-#include "pixman-edge-imp.h"
-
-#undef ADD_ALPHA
-#undef STEP_ALPHA
-#undef DEFINE_ALPHA
-#undef RASTERIZE_EDGES
-#undef N_BITS
-
-
-/*
- * 1 bit alpha
- */
-
-#define N_BITS 1
-#define RASTERIZE_EDGES rasterize_edges_1
-
-#include "pixman-edge-imp.h"
-
-#undef RASTERIZE_EDGES
-#undef N_BITS
-
-/*
- * 8 bit alpha
- */
-
-static force_inline uint8_t
-clip255 (int x)
-{
-    if (x > 255)
-	return 255;
-
-    return x;
-}
-
-#define ADD_SATURATE_8(buf, val, length)				\
-    do									\
-    {									\
-        int i__ = (length);						\
-        uint8_t *buf__ = (buf);						\
-        int val__ = (val);						\
-									\
-        while (i__--)							\
-        {								\
-            WRITE (image, (buf__), clip255 (READ (image, (buf__)) + (val__))); \
-            (buf__)++;							\
-	}								\
-    } while (0)
-
-/*
- * We want to detect the case where we add the same value to a long
- * span of pixels.  The triangles on the end are filled in while we
- * count how many sub-pixel scanlines contribute to the middle section.
- *
- *                 +--------------------------+
- *  fill_height =|   \                      /
- *                     +------------------+
- *                      |================|
- *                   fill_start       fill_end
- */
-static void
-rasterize_edges_8 (pixman_image_t *image,
-                   pixman_edge_t * l,
-                   pixman_edge_t * r,
-                   pixman_fixed_t  t,
-                   pixman_fixed_t  b)
-{
-    pixman_fixed_t y = t;
-    uint32_t  *line;
-    int fill_start = -1, fill_end = -1;
-    int fill_size = 0;
-    uint32_t *buf = (image)->bits.bits;
-    int stride = (image)->bits.rowstride;
-    int width = (image)->bits.width;
-
-    line = buf + pixman_fixed_to_int (y) * stride;
-
-    for (;;)
-    {
-        uint8_t *ap = (uint8_t *) line;
-        pixman_fixed_t lx, rx;
-        int lxi, rxi;
-
-        /* clip X */
-        lx = l->x;
-        if (lx < 0)
-	    lx = 0;
-
-        rx = r->x;
-
-        if (pixman_fixed_to_int (rx) >= width)
-	{
-	    /* Use the last pixel of the scanline, covered 100%.
-	     * We can't use the first pixel following the scanline,
-	     * because accessing it could result in a buffer overrun.
-	     */
-	    rx = pixman_int_to_fixed (width) - 1;
-	}
-
-        /* Skip empty (or backwards) sections */
-        if (rx > lx)
-        {
-            int lxs, rxs;
-
-            /* Find pixel bounds for span. */
-            lxi = pixman_fixed_to_int (lx);
-            rxi = pixman_fixed_to_int (rx);
-
-            /* Sample coverage for edge pixels */
-            lxs = RENDER_SAMPLES_X (lx, 8);
-            rxs = RENDER_SAMPLES_X (rx, 8);
-
-            /* Add coverage across row */
-            if (lxi == rxi)
-            {
-                WRITE (image, ap + lxi,
-		       clip255 (READ (image, ap + lxi) + rxs - lxs));
-	    }
-            else
-            {
-                WRITE (image, ap + lxi,
-		       clip255 (READ (image, ap + lxi) + N_X_FRAC (8) - lxs));
-
-                /* Move forward so that lxi/rxi is the pixel span */
-                lxi++;
-
-                /* Don't bother trying to optimize the fill unless
-		 * the span is longer than 4 pixels. */
-                if (rxi - lxi > 4)
-                {
-                    if (fill_start < 0)
-                    {
-                        fill_start = lxi;
-                        fill_end = rxi;
-                        fill_size++;
-		    }
-                    else
-                    {
-                        if (lxi >= fill_end || rxi < fill_start)
-                        {
-                            /* We're beyond what we saved, just fill it */
-                            ADD_SATURATE_8 (ap + fill_start,
-                                            fill_size * N_X_FRAC (8),
-                                            fill_end - fill_start);
-                            fill_start = lxi;
-                            fill_end = rxi;
-                            fill_size = 1;
-			}
-                        else
-                        {
-                            /* Update fill_start */
-                            if (lxi > fill_start)
-                            {
-                                ADD_SATURATE_8 (ap + fill_start,
-                                                fill_size * N_X_FRAC (8),
-                                                lxi - fill_start);
-                                fill_start = lxi;
-			    }
-                            else if (lxi < fill_start)
-                            {
-                                ADD_SATURATE_8 (ap + lxi, N_X_FRAC (8),
-                                                fill_start - lxi);
-			    }
-
-                            /* Update fill_end */
-                            if (rxi < fill_end)
-                            {
-                                ADD_SATURATE_8 (ap + rxi,
-                                                fill_size * N_X_FRAC (8),
-                                                fill_end - rxi);
-                                fill_end = rxi;
-			    }
-                            else if (fill_end < rxi)
-                            {
-                                ADD_SATURATE_8 (ap + fill_end,
-                                                N_X_FRAC (8),
-                                                rxi - fill_end);
-			    }
-                            fill_size++;
-			}
-		    }
-		}
-                else
-                {
-                    ADD_SATURATE_8 (ap + lxi, N_X_FRAC (8), rxi - lxi);
-		}
-
-                WRITE (image, ap + rxi, clip255 (READ (image, ap + rxi) + rxs));
-	    }
-	}
-
-        if (y == b)
-        {
-            /* We're done, make sure we clean up any remaining fill. */
-            if (fill_start != fill_end)
-            {
-                if (fill_size == N_Y_FRAC (8))
-                {
-                    MEMSET_WRAPPED (image, ap + fill_start,
-				    0xff, fill_end - fill_start);
-		}
-                else
-                {
-                    ADD_SATURATE_8 (ap + fill_start, fill_size * N_X_FRAC (8),
-                                    fill_end - fill_start);
-		}
-	    }
-            break;
-	}
-
-        if (pixman_fixed_frac (y) != Y_FRAC_LAST (8))
-        {
-            RENDER_EDGE_STEP_SMALL (l);
-            RENDER_EDGE_STEP_SMALL (r);
-            y += STEP_Y_SMALL (8);
-	}
-        else
-        {
-            RENDER_EDGE_STEP_BIG (l);
-            RENDER_EDGE_STEP_BIG (r);
-            y += STEP_Y_BIG (8);
-            if (fill_start != fill_end)
-            {
-                if (fill_size == N_Y_FRAC (8))
-                {
-                    MEMSET_WRAPPED (image, ap + fill_start,
-				    0xff, fill_end - fill_start);
-		}
-                else
-                {
-                    ADD_SATURATE_8 (ap + fill_start, fill_size * N_X_FRAC (8),
-                                    fill_end - fill_start);
-		}
-		
-                fill_start = fill_end = -1;
-                fill_size = 0;
-	    }
-	    
-            line += stride;
-	}
-    }
-}
-
-#ifndef PIXMAN_FB_ACCESSORS
-static
-#endif
-void
-PIXMAN_RASTERIZE_EDGES (pixman_image_t *image,
-                        pixman_edge_t * l,
-                        pixman_edge_t * r,
-                        pixman_fixed_t  t,
-                        pixman_fixed_t  b)
-{
-    switch (PIXMAN_FORMAT_BPP (image->bits.format))
-    {
-    case 1:
-	rasterize_edges_1 (image, l, r, t, b);
-	break;
-
-    case 4:
-	rasterize_edges_4 (image, l, r, t, b);
-	break;
-
-    case 8:
-	rasterize_edges_8 (image, l, r, t, b);
-	break;
-
-    default:
-        break;
-    }
-}
-
-#ifndef PIXMAN_FB_ACCESSORS
-
-PIXMAN_EXPORT void
-pixman_rasterize_edges (pixman_image_t *image,
-                        pixman_edge_t * l,
-                        pixman_edge_t * r,
-                        pixman_fixed_t  t,
-                        pixman_fixed_t  b)
-{
-    return_if_fail (image->type == BITS);
-    
-    if (image->bits.read_func || image->bits.write_func)
-	pixman_rasterize_edges_accessors (image, l, r, t, b);
-    else
-	pixman_rasterize_edges_no_accessors (image, l, r, t, b);
-}
-
-#endif
+/*
+ * Copyright © 2004 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <string.h>
+
+#include "pixman-private.h"
+#include "pixman-accessor.h"
+
+/*
+ * Step across a small sample grid gap
+ */
+#define RENDER_EDGE_STEP_SMALL(edge)					\
+    {									\
+	edge->x += edge->stepx_small;					\
+	edge->e += edge->dx_small;					\
+	if (edge->e > 0)						\
+	{								\
+	    edge->e -= edge->dy;					\
+	    edge->x += edge->signdx;					\
+	}								\
+    }
+
+/*
+ * Step across a large sample grid gap
+ */
+#define RENDER_EDGE_STEP_BIG(edge)					\
+    {									\
+	edge->x += edge->stepx_big;					\
+	edge->e += edge->dx_big;					\
+	if (edge->e > 0)						\
+	{								\
+	    edge->e -= edge->dy;					\
+	    edge->x += edge->signdx;					\
+	}								\
+    }
+
+#ifdef PIXMAN_FB_ACCESSORS
+#define PIXMAN_RASTERIZE_EDGES pixman_rasterize_edges_accessors
+#else
+#define PIXMAN_RASTERIZE_EDGES pixman_rasterize_edges_no_accessors
+#endif
+
+/*
+ * 4 bit alpha
+ */
+
+#define N_BITS  4
+#define RASTERIZE_EDGES rasterize_edges_4
+
+#ifndef WORDS_BIGENDIAN
+#define SHIFT_4(o)      ((o) << 2)
+#else
+#define SHIFT_4(o)      ((1 - (o)) << 2)
+#endif
+
+#define GET_4(x, o)      (((x) >> SHIFT_4 (o)) & 0xf)
+#define PUT_4(x, o, v)							\
+    (((x) & ~(0xf << SHIFT_4 (o))) | (((v) & 0xf) << SHIFT_4 (o)))
+
+#define DEFINE_ALPHA(line, x)						\
+    uint8_t   *__ap = (uint8_t *) line + ((x) >> 1);			\
+    int __ao = (x) & 1
+
+#define STEP_ALPHA      ((__ap += __ao), (__ao ^= 1))
+
+#define ADD_ALPHA(a)							\
+    {									\
+        uint8_t __o = READ (image, __ap);				\
+        uint8_t __a = (a) + GET_4 (__o, __ao);				\
+        WRITE (image, __ap, PUT_4 (__o, __ao, __a | (0 - ((__a) >> 4)))); \
+    }
+
+#include "pixman-edge-imp.h"
+
+#undef ADD_ALPHA
+#undef STEP_ALPHA
+#undef DEFINE_ALPHA
+#undef RASTERIZE_EDGES
+#undef N_BITS
+
+
+/*
+ * 1 bit alpha
+ */
+
+#define N_BITS 1
+#define RASTERIZE_EDGES rasterize_edges_1
+
+#include "pixman-edge-imp.h"
+
+#undef RASTERIZE_EDGES
+#undef N_BITS
+
+/*
+ * 8 bit alpha
+ */
+
+static force_inline uint8_t
+clip255 (int x)
+{
+    if (x > 255)
+	return 255;
+
+    return x;
+}
+
+#define ADD_SATURATE_8(buf, val, length)				\
+    do									\
+    {									\
+        int i__ = (length);						\
+        uint8_t *buf__ = (buf);						\
+        int val__ = (val);						\
+									\
+        while (i__--)							\
+        {								\
+            WRITE (image, (buf__), clip255 (READ (image, (buf__)) + (val__))); \
+            (buf__)++;							\
+	}								\
+    } while (0)
+
+/*
+ * We want to detect the case where we add the same value to a long
+ * span of pixels.  The triangles on the end are filled in while we
+ * count how many sub-pixel scanlines contribute to the middle section.
+ *
+ *                 +--------------------------+
+ *  fill_height =|   \                      /
+ *                     +------------------+
+ *                      |================|
+ *                   fill_start       fill_end
+ */
+static void
+rasterize_edges_8 (pixman_image_t *image,
+                   pixman_edge_t * l,
+                   pixman_edge_t * r,
+                   pixman_fixed_t  t,
+                   pixman_fixed_t  b)
+{
+    pixman_fixed_t y = t;
+    uint32_t  *line;
+    int fill_start = -1, fill_end = -1;
+    int fill_size = 0;
+    uint32_t *buf = (image)->bits.bits;
+    int stride = (image)->bits.rowstride;
+    int width = (image)->bits.width;
+
+    line = buf + pixman_fixed_to_int (y) * stride;
+
+    for (;;)
+    {
+        uint8_t *ap = (uint8_t *) line;
+        pixman_fixed_t lx, rx;
+        int lxi, rxi;
+
+        /* clip X */
+        lx = l->x;
+        if (lx < 0)
+	    lx = 0;
+
+        rx = r->x;
+
+        if (pixman_fixed_to_int (rx) >= width)
+	{
+	    /* Use the last pixel of the scanline, covered 100%.
+	     * We can't use the first pixel following the scanline,
+	     * because accessing it could result in a buffer overrun.
+	     */
+	    rx = pixman_int_to_fixed (width) - 1;
+	}
+
+        /* Skip empty (or backwards) sections */
+        if (rx > lx)
+        {
+            int lxs, rxs;
+
+            /* Find pixel bounds for span. */
+            lxi = pixman_fixed_to_int (lx);
+            rxi = pixman_fixed_to_int (rx);
+
+            /* Sample coverage for edge pixels */
+            lxs = RENDER_SAMPLES_X (lx, 8);
+            rxs = RENDER_SAMPLES_X (rx, 8);
+
+            /* Add coverage across row */
+            if (lxi == rxi)
+            {
+                WRITE (image, ap + lxi,
+		       clip255 (READ (image, ap + lxi) + rxs - lxs));
+	    }
+            else
+            {
+                WRITE (image, ap + lxi,
+		       clip255 (READ (image, ap + lxi) + N_X_FRAC (8) - lxs));
+
+                /* Move forward so that lxi/rxi is the pixel span */
+                lxi++;
+
+                /* Don't bother trying to optimize the fill unless
+		 * the span is longer than 4 pixels. */
+                if (rxi - lxi > 4)
+                {
+                    if (fill_start < 0)
+                    {
+                        fill_start = lxi;
+                        fill_end = rxi;
+                        fill_size++;
+		    }
+                    else
+                    {
+                        if (lxi >= fill_end || rxi < fill_start)
+                        {
+                            /* We're beyond what we saved, just fill it */
+                            ADD_SATURATE_8 (ap + fill_start,
+                                            fill_size * N_X_FRAC (8),
+                                            fill_end - fill_start);
+                            fill_start = lxi;
+                            fill_end = rxi;
+                            fill_size = 1;
+			}
+                        else
+                        {
+                            /* Update fill_start */
+                            if (lxi > fill_start)
+                            {
+                                ADD_SATURATE_8 (ap + fill_start,
+                                                fill_size * N_X_FRAC (8),
+                                                lxi - fill_start);
+                                fill_start = lxi;
+			    }
+                            else if (lxi < fill_start)
+                            {
+                                ADD_SATURATE_8 (ap + lxi, N_X_FRAC (8),
+                                                fill_start - lxi);
+			    }
+
+                            /* Update fill_end */
+                            if (rxi < fill_end)
+                            {
+                                ADD_SATURATE_8 (ap + rxi,
+                                                fill_size * N_X_FRAC (8),
+                                                fill_end - rxi);
+                                fill_end = rxi;
+			    }
+                            else if (fill_end < rxi)
+                            {
+                                ADD_SATURATE_8 (ap + fill_end,
+                                                N_X_FRAC (8),
+                                                rxi - fill_end);
+			    }
+                            fill_size++;
+			}
+		    }
+		}
+                else
+                {
+                    ADD_SATURATE_8 (ap + lxi, N_X_FRAC (8), rxi - lxi);
+		}
+
+                WRITE (image, ap + rxi, clip255 (READ (image, ap + rxi) + rxs));
+	    }
+	}
+
+        if (y == b)
+        {
+            /* We're done, make sure we clean up any remaining fill. */
+            if (fill_start != fill_end)
+            {
+                if (fill_size == N_Y_FRAC (8))
+                {
+                    MEMSET_WRAPPED (image, ap + fill_start,
+				    0xff, fill_end - fill_start);
+		}
+                else
+                {
+                    ADD_SATURATE_8 (ap + fill_start, fill_size * N_X_FRAC (8),
+                                    fill_end - fill_start);
+		}
+	    }
+            break;
+	}
+
+        if (pixman_fixed_frac (y) != Y_FRAC_LAST (8))
+        {
+            RENDER_EDGE_STEP_SMALL (l);
+            RENDER_EDGE_STEP_SMALL (r);
+            y += STEP_Y_SMALL (8);
+	}
+        else
+        {
+            RENDER_EDGE_STEP_BIG (l);
+            RENDER_EDGE_STEP_BIG (r);
+            y += STEP_Y_BIG (8);
+            if (fill_start != fill_end)
+            {
+                if (fill_size == N_Y_FRAC (8))
+                {
+                    MEMSET_WRAPPED (image, ap + fill_start,
+				    0xff, fill_end - fill_start);
+		}
+                else
+                {
+                    ADD_SATURATE_8 (ap + fill_start, fill_size * N_X_FRAC (8),
+                                    fill_end - fill_start);
+		}
+		
+                fill_start = fill_end = -1;
+                fill_size = 0;
+	    }
+	    
+            line += stride;
+	}
+    }
+}
+
+#ifndef PIXMAN_FB_ACCESSORS
+static
+#endif
+void
+PIXMAN_RASTERIZE_EDGES (pixman_image_t *image,
+                        pixman_edge_t * l,
+                        pixman_edge_t * r,
+                        pixman_fixed_t  t,
+                        pixman_fixed_t  b)
+{
+    switch (PIXMAN_FORMAT_BPP (image->bits.format))
+    {
+    case 1:
+	rasterize_edges_1 (image, l, r, t, b);
+	break;
+
+    case 4:
+	rasterize_edges_4 (image, l, r, t, b);
+	break;
+
+    case 8:
+	rasterize_edges_8 (image, l, r, t, b);
+	break;
+
+    default:
+        break;
+    }
+}
+
+#ifndef PIXMAN_FB_ACCESSORS
+
+PIXMAN_EXPORT void
+pixman_rasterize_edges (pixman_image_t *image,
+                        pixman_edge_t * l,
+                        pixman_edge_t * r,
+                        pixman_fixed_t  t,
+                        pixman_fixed_t  b)
+{
+    return_if_fail (image->type == BITS);
+    
+    if (image->bits.read_func || image->bits.write_func)
+	pixman_rasterize_edges_accessors (image, l, r, t, b);
+    else
+	pixman_rasterize_edges_no_accessors (image, l, r, t, b);
+}
+
+#endif
diff --git a/pixman/pixman/pixman-matrix.c b/pixman/pixman/pixman-matrix.c
index 0b3ae78b3..8d0d97325 100644
--- a/pixman/pixman/pixman-matrix.c
+++ b/pixman/pixman/pixman-matrix.c
@@ -1,766 +1,766 @@
-/*
- * Copyright © 2008 Keith Packard
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that copyright
- * notice and this permission notice appear in supporting documentation, and
- * that the name of the copyright holders not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission.  The copyright holders make no representations
- * about the suitability of this software for any purpose.  It is provided "as
- * is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
- * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
- * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
- * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
- * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
- * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
- * OF THIS SOFTWARE.
- */
-
-/*
- * Matrix interfaces
- */
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include <math.h>
-#include <string.h>
-#include "pixman-private.h"
-
-#define F(x)    pixman_int_to_fixed (x)
-
-PIXMAN_EXPORT void
-pixman_transform_init_identity (struct pixman_transform *matrix)
-{
-    int i;
-
-    memset (matrix, '\0', sizeof (struct pixman_transform));
-    for (i = 0; i < 3; i++)
-	matrix->matrix[i][i] = F (1);
-}
-
-typedef pixman_fixed_32_32_t pixman_fixed_34_30_t;
-
-PIXMAN_EXPORT pixman_bool_t
-pixman_transform_point_3d (const struct pixman_transform *transform,
-                           struct pixman_vector *         vector)
-{
-    struct pixman_vector result;
-    pixman_fixed_32_32_t partial;
-    pixman_fixed_48_16_t v;
-    int i, j;
-
-    for (j = 0; j < 3; j++)
-    {
-	v = 0;
-	for (i = 0; i < 3; i++)
-	{
-	    partial = ((pixman_fixed_48_16_t) transform->matrix[j][i] *
-	               (pixman_fixed_48_16_t) vector->vector[i]);
-	    v += partial >> 16;
-	}
-	
-	if (v > pixman_max_fixed_48_16 || v < pixman_min_fixed_48_16)
-	    return FALSE;
-	
-	result.vector[j] = (pixman_fixed_t) v;
-    }
-    
-    *vector = result;
-
-    if (!result.vector[2])
-	return FALSE;
-
-    return TRUE;
-}
-
-PIXMAN_EXPORT pixman_bool_t
-pixman_transform_point (const struct pixman_transform *transform,
-                        struct pixman_vector *         vector)
-{
-    pixman_fixed_32_32_t partial;
-    pixman_fixed_34_30_t v[3];
-    pixman_fixed_48_16_t quo;
-    int i, j;
-
-    for (j = 0; j < 3; j++)
-    {
-	v[j] = 0;
-	
-	for (i = 0; i < 3; i++)
-	{
-	    partial = ((pixman_fixed_32_32_t) transform->matrix[j][i] *
-	               (pixman_fixed_32_32_t) vector->vector[i]);
-	    v[j] += partial >> 2;
-	}
-    }
-    
-    if (!(v[2] >> 16))
-	return FALSE;
-
-    for (j = 0; j < 2; j++)
-    {
-	quo = v[j] / (v[2] >> 16);
-	if (quo > pixman_max_fixed_48_16 || quo < pixman_min_fixed_48_16)
-	    return FALSE;
-	vector->vector[j] = (pixman_fixed_t) quo;
-    }
-    
-    vector->vector[2] = pixman_fixed_1;
-    return TRUE;
-}
-
-PIXMAN_EXPORT pixman_bool_t
-pixman_transform_multiply (struct pixman_transform *      dst,
-                           const struct pixman_transform *l,
-                           const struct pixman_transform *r)
-{
-    struct pixman_transform d;
-    int dx, dy;
-    int o;
-
-    for (dy = 0; dy < 3; dy++)
-    {
-	for (dx = 0; dx < 3; dx++)
-	{
-	    pixman_fixed_48_16_t v;
-	    pixman_fixed_32_32_t partial;
-	    
-	    v = 0;
-	    for (o = 0; o < 3; o++)
-	    {
-		partial =
-		    (pixman_fixed_32_32_t) l->matrix[dy][o] *
-		    (pixman_fixed_32_32_t) r->matrix[o][dx];
-
-		v += partial >> 16;
-	    }
-
-	    if (v > pixman_max_fixed_48_16 || v < pixman_min_fixed_48_16)
-		return FALSE;
-	    
-	    d.matrix[dy][dx] = (pixman_fixed_t) v;
-	}
-    }
-
-    *dst = d;
-    return TRUE;
-}
-
-PIXMAN_EXPORT void
-pixman_transform_init_scale (struct pixman_transform *t,
-                             pixman_fixed_t           sx,
-                             pixman_fixed_t           sy)
-{
-    memset (t, '\0', sizeof (struct pixman_transform));
-
-    t->matrix[0][0] = sx;
-    t->matrix[1][1] = sy;
-    t->matrix[2][2] = F (1);
-}
-
-static pixman_fixed_t
-fixed_inverse (pixman_fixed_t x)
-{
-    return (pixman_fixed_t) ((((pixman_fixed_48_16_t) F (1)) * F (1)) / x);
-}
-
-PIXMAN_EXPORT pixman_bool_t
-pixman_transform_scale (struct pixman_transform *forward,
-                        struct pixman_transform *reverse,
-                        pixman_fixed_t           sx,
-                        pixman_fixed_t           sy)
-{
-    struct pixman_transform t;
-
-    if (sx == 0 || sy == 0)
-	return FALSE;
-
-    if (forward)
-    {
-	pixman_transform_init_scale (&t, sx, sy);
-	if (!pixman_transform_multiply (forward, &t, forward))
-	    return FALSE;
-    }
-    
-    if (reverse)
-    {
-	pixman_transform_init_scale (&t, fixed_inverse (sx),
-	                             fixed_inverse (sy));
-	if (!pixman_transform_multiply (reverse, reverse, &t))
-	    return FALSE;
-    }
-    
-    return TRUE;
-}
-
-PIXMAN_EXPORT void
-pixman_transform_init_rotate (struct pixman_transform *t,
-                              pixman_fixed_t           c,
-                              pixman_fixed_t           s)
-{
-    memset (t, '\0', sizeof (struct pixman_transform));
-
-    t->matrix[0][0] = c;
-    t->matrix[0][1] = -s;
-    t->matrix[1][0] = s;
-    t->matrix[1][1] = c;
-    t->matrix[2][2] = F (1);
-}
-
-PIXMAN_EXPORT pixman_bool_t
-pixman_transform_rotate (struct pixman_transform *forward,
-                         struct pixman_transform *reverse,
-                         pixman_fixed_t           c,
-                         pixman_fixed_t           s)
-{
-    struct pixman_transform t;
-
-    if (forward)
-    {
-	pixman_transform_init_rotate (&t, c, s);
-	if (!pixman_transform_multiply (forward, &t, forward))
-	    return FALSE;
-    }
-
-    if (reverse)
-    {
-	pixman_transform_init_rotate (&t, c, -s);
-	if (!pixman_transform_multiply (reverse, reverse, &t))
-	    return FALSE;
-    }
-    
-    return TRUE;
-}
-
-PIXMAN_EXPORT void
-pixman_transform_init_translate (struct pixman_transform *t,
-                                 pixman_fixed_t           tx,
-                                 pixman_fixed_t           ty)
-{
-    memset (t, '\0', sizeof (struct pixman_transform));
-
-    t->matrix[0][0] = F (1);
-    t->matrix[0][2] = tx;
-    t->matrix[1][1] = F (1);
-    t->matrix[1][2] = ty;
-    t->matrix[2][2] = F (1);
-}
-
-PIXMAN_EXPORT pixman_bool_t
-pixman_transform_translate (struct pixman_transform *forward,
-                            struct pixman_transform *reverse,
-                            pixman_fixed_t           tx,
-                            pixman_fixed_t           ty)
-{
-    struct pixman_transform t;
-
-    if (forward)
-    {
-	pixman_transform_init_translate (&t, tx, ty);
-
-	if (!pixman_transform_multiply (forward, &t, forward))
-	    return FALSE;
-    }
-
-    if (reverse)
-    {
-	pixman_transform_init_translate (&t, -tx, -ty);
-
-	if (!pixman_transform_multiply (reverse, reverse, &t))
-	    return FALSE;
-    }
-    return TRUE;
-}
-
-PIXMAN_EXPORT pixman_bool_t
-pixman_transform_bounds (const struct pixman_transform *matrix,
-                         struct pixman_box16 *          b)
-
-{
-    struct pixman_vector v[4];
-    int i;
-    int x1, y1, x2, y2;
-
-    v[0].vector[0] = F (b->x1);
-    v[0].vector[1] = F (b->y1);
-    v[0].vector[2] = F (1);
-
-    v[1].vector[0] = F (b->x2);
-    v[1].vector[1] = F (b->y1);
-    v[1].vector[2] = F (1);
-
-    v[2].vector[0] = F (b->x2);
-    v[2].vector[1] = F (b->y2);
-    v[2].vector[2] = F (1);
-
-    v[3].vector[0] = F (b->x1);
-    v[3].vector[1] = F (b->y2);
-    v[3].vector[2] = F (1);
-
-    for (i = 0; i < 4; i++)
-    {
-	if (!pixman_transform_point (matrix, &v[i]))
-	    return FALSE;
-
-	x1 = pixman_fixed_to_int (v[i].vector[0]);
-	y1 = pixman_fixed_to_int (v[i].vector[1]);
-	x2 = pixman_fixed_to_int (pixman_fixed_ceil (v[i].vector[0]));
-	y2 = pixman_fixed_to_int (pixman_fixed_ceil (v[i].vector[1]));
-
-	if (i == 0)
-	{
-	    b->x1 = x1;
-	    b->y1 = y1;
-	    b->x2 = x2;
-	    b->y2 = y2;
-	}
-	else
-	{
-	    if (x1 < b->x1) b->x1 = x1;
-	    if (y1 < b->y1) b->y1 = y1;
-	    if (x2 > b->x2) b->x2 = x2;
-	    if (y2 > b->y2) b->y2 = y2;
-	}
-    }
-
-    return TRUE;
-}
-
-PIXMAN_EXPORT pixman_bool_t
-pixman_transform_invert (struct pixman_transform *      dst,
-                         const struct pixman_transform *src)
-{
-    struct pixman_f_transform m, r;
-
-    pixman_f_transform_from_pixman_transform (&m, src);
-
-    if (!pixman_f_transform_invert (&r, &m))
-	return FALSE;
-
-    if (!pixman_transform_from_pixman_f_transform (dst, &r))
-	return FALSE;
-
-    return TRUE;
-}
-
-static pixman_bool_t
-within_epsilon (pixman_fixed_t a,
-                pixman_fixed_t b,
-                pixman_fixed_t epsilon)
-{
-    pixman_fixed_t t = a - b;
-
-    if (t < 0)
-	t = -t;
-
-    return t <= epsilon;
-}
-
-#define EPSILON (pixman_fixed_t) (2)
-
-#define IS_SAME(a, b) (within_epsilon (a, b, EPSILON))
-#define IS_ZERO(a)    (within_epsilon (a, 0, EPSILON))
-#define IS_ONE(a)     (within_epsilon (a, F (1), EPSILON))
-#define IS_UNIT(a)			    \
-    (within_epsilon (a, F (1), EPSILON) ||  \
-     within_epsilon (a, F (-1), EPSILON) || \
-     IS_ZERO (a))
-#define IS_INT(a)    (IS_ZERO (pixman_fixed_frac (a)))
-
-PIXMAN_EXPORT pixman_bool_t
-pixman_transform_is_identity (const struct pixman_transform *t)
-{
-    return (IS_SAME (t->matrix[0][0], t->matrix[1][1]) &&
-	    IS_SAME (t->matrix[0][0], t->matrix[2][2]) &&
-	    !IS_ZERO (t->matrix[0][0]) &&
-	    IS_ZERO (t->matrix[0][1]) &&
-	    IS_ZERO (t->matrix[0][2]) &&
-	    IS_ZERO (t->matrix[1][0]) &&
-	    IS_ZERO (t->matrix[1][2]) &&
-	    IS_ZERO (t->matrix[2][0]) &&
-	    IS_ZERO (t->matrix[2][1]));
-}
-
-PIXMAN_EXPORT pixman_bool_t
-pixman_transform_is_scale (const struct pixman_transform *t)
-{
-    return (!IS_ZERO (t->matrix[0][0]) &&
-            IS_ZERO (t->matrix[0][1]) &&
-            IS_ZERO (t->matrix[0][2]) &&
-
-            IS_ZERO (t->matrix[1][0]) &&
-            !IS_ZERO (t->matrix[1][1]) &&
-            IS_ZERO (t->matrix[1][2]) &&
-
-            IS_ZERO (t->matrix[2][0]) &&
-            IS_ZERO (t->matrix[2][1]) &&
-            !IS_ZERO (t->matrix[2][2]));
-}
-
-PIXMAN_EXPORT pixman_bool_t
-pixman_transform_is_int_translate (const struct pixman_transform *t)
-{
-    return (IS_ONE (t->matrix[0][0]) &&
-            IS_ZERO (t->matrix[0][1]) &&
-            IS_INT (t->matrix[0][2]) &&
-
-            IS_ZERO (t->matrix[1][0]) &&
-            IS_ONE (t->matrix[1][1]) &&
-            IS_INT (t->matrix[1][2]) &&
-
-            IS_ZERO (t->matrix[2][0]) &&
-            IS_ZERO (t->matrix[2][1]) &&
-            IS_ONE (t->matrix[2][2]));
-}
-
-PIXMAN_EXPORT pixman_bool_t
-pixman_transform_is_inverse (const struct pixman_transform *a,
-                             const struct pixman_transform *b)
-{
-    struct pixman_transform t;
-
-    if (!pixman_transform_multiply (&t, a, b))
-	return FALSE;
-
-    return pixman_transform_is_identity (&t);
-}
-
-PIXMAN_EXPORT void
-pixman_f_transform_from_pixman_transform (struct pixman_f_transform *    ft,
-                                          const struct pixman_transform *t)
-{
-    int i, j;
-
-    for (j = 0; j < 3; j++)
-    {
-	for (i = 0; i < 3; i++)
-	    ft->m[j][i] = pixman_fixed_to_double (t->matrix[j][i]);
-    }
-}
-
-PIXMAN_EXPORT pixman_bool_t
-pixman_transform_from_pixman_f_transform (struct pixman_transform *        t,
-                                          const struct pixman_f_transform *ft)
-{
-    int i, j;
-
-    for (j = 0; j < 3; j++)
-    {
-	for (i = 0; i < 3; i++)
-	{
-	    double d = ft->m[j][i];
-	    if (d < -32767.0 || d > 32767.0)
-		return FALSE;
-	    d = d * 65536.0 + 0.5;
-	    t->matrix[j][i] = (pixman_fixed_t) floor (d);
-	}
-    }
-    
-    return TRUE;
-}
-
-PIXMAN_EXPORT pixman_bool_t
-pixman_f_transform_invert (struct pixman_f_transform *      dst,
-                           const struct pixman_f_transform *src)
-{
-    double det;
-    int i, j;
-    static int a[3] = { 2, 2, 1 };
-    static int b[3] = { 1, 0, 0 };
-
-    det = 0;
-    for (i = 0; i < 3; i++)
-    {
-	double p;
-	int ai = a[i];
-	int bi = b[i];
-	p = src->m[i][0] * (src->m[ai][2] * src->m[bi][1] -
-	                    src->m[ai][1] * src->m[bi][2]);
-	if (i == 1)
-	    p = -p;
-	det += p;
-    }
-    
-    if (det == 0)
-	return FALSE;
-    
-    det = 1 / det;
-    for (j = 0; j < 3; j++)
-    {
-	for (i = 0; i < 3; i++)
-	{
-	    double p;
-	    int ai = a[i];
-	    int aj = a[j];
-	    int bi = b[i];
-	    int bj = b[j];
-
-	    p = (src->m[ai][aj] * src->m[bi][bj] -
-	         src->m[ai][bj] * src->m[bi][aj]);
-	    
-	    if (((i + j) & 1) != 0)
-		p = -p;
-	    
-	    dst->m[j][i] = det * p;
-	}
-    }
-
-    return TRUE;
-}
-
-PIXMAN_EXPORT pixman_bool_t
-pixman_f_transform_point (const struct pixman_f_transform *t,
-                          struct pixman_f_vector *         v)
-{
-    struct pixman_f_vector result;
-    int i, j;
-    double a;
-
-    for (j = 0; j < 3; j++)
-    {
-	a = 0;
-	for (i = 0; i < 3; i++)
-	    a += t->m[j][i] * v->v[i];
-	result.v[j] = a;
-    }
-    
-    if (!result.v[2])
-	return FALSE;
-
-    for (j = 0; j < 2; j++)
-	v->v[j] = result.v[j] / result.v[2];
-
-    v->v[2] = 1;
-
-    return TRUE;
-}
-
-PIXMAN_EXPORT void
-pixman_f_transform_point_3d (const struct pixman_f_transform *t,
-                             struct pixman_f_vector *         v)
-{
-    struct pixman_f_vector result;
-    int i, j;
-    double a;
-
-    for (j = 0; j < 3; j++)
-    {
-	a = 0;
-	for (i = 0; i < 3; i++)
-	    a += t->m[j][i] * v->v[i];
-	result.v[j] = a;
-    }
-    
-    *v = result;
-}
-
-PIXMAN_EXPORT void
-pixman_f_transform_multiply (struct pixman_f_transform *      dst,
-                             const struct pixman_f_transform *l,
-                             const struct pixman_f_transform *r)
-{
-    struct pixman_f_transform d;
-    int dx, dy;
-    int o;
-
-    for (dy = 0; dy < 3; dy++)
-    {
-	for (dx = 0; dx < 3; dx++)
-	{
-	    double v = 0;
-	    for (o = 0; o < 3; o++)
-		v += l->m[dy][o] * r->m[o][dx];
-	    d.m[dy][dx] = v;
-	}
-    }
-    
-    *dst = d;
-}
-
-PIXMAN_EXPORT void
-pixman_f_transform_init_scale (struct pixman_f_transform *t,
-                               double                     sx,
-                               double                     sy)
-{
-    t->m[0][0] = sx;
-    t->m[0][1] = 0;
-    t->m[0][2] = 0;
-    t->m[1][0] = 0;
-    t->m[1][1] = sy;
-    t->m[1][2] = 0;
-    t->m[2][0] = 0;
-    t->m[2][1] = 0;
-    t->m[2][2] = 1;
-}
-
-PIXMAN_EXPORT pixman_bool_t
-pixman_f_transform_scale (struct pixman_f_transform *forward,
-                          struct pixman_f_transform *reverse,
-                          double                     sx,
-                          double                     sy)
-{
-    struct pixman_f_transform t;
-
-    if (sx == 0 || sy == 0)
-	return FALSE;
-
-    if (forward)
-    {
-	pixman_f_transform_init_scale (&t, sx, sy);
-	pixman_f_transform_multiply (forward, &t, forward);
-    }
-    
-    if (reverse)
-    {
-	pixman_f_transform_init_scale (&t, 1 / sx, 1 / sy);
-	pixman_f_transform_multiply (reverse, reverse, &t);
-    }
-    
-    return TRUE;
-}
-
-PIXMAN_EXPORT void
-pixman_f_transform_init_rotate (struct pixman_f_transform *t,
-                                double                     c,
-                                double                     s)
-{
-    t->m[0][0] = c;
-    t->m[0][1] = -s;
-    t->m[0][2] = 0;
-    t->m[1][0] = s;
-    t->m[1][1] = c;
-    t->m[1][2] = 0;
-    t->m[2][0] = 0;
-    t->m[2][1] = 0;
-    t->m[2][2] = 1;
-}
-
-PIXMAN_EXPORT pixman_bool_t
-pixman_f_transform_rotate (struct pixman_f_transform *forward,
-                           struct pixman_f_transform *reverse,
-                           double                     c,
-                           double                     s)
-{
-    struct pixman_f_transform t;
-
-    if (forward)
-    {
-	pixman_f_transform_init_rotate (&t, c, s);
-	pixman_f_transform_multiply (forward, &t, forward);
-    }
-    
-    if (reverse)
-    {
-	pixman_f_transform_init_rotate (&t, c, -s);
-	pixman_f_transform_multiply (reverse, reverse, &t);
-    }
-
-    return TRUE;
-}
-
-PIXMAN_EXPORT void
-pixman_f_transform_init_translate (struct pixman_f_transform *t,
-                                   double                     tx,
-                                   double                     ty)
-{
-    t->m[0][0] = 1;
-    t->m[0][1] = 0;
-    t->m[0][2] = tx;
-    t->m[1][0] = 0;
-    t->m[1][1] = 1;
-    t->m[1][2] = ty;
-    t->m[2][0] = 0;
-    t->m[2][1] = 0;
-    t->m[2][2] = 1;
-}
-
-PIXMAN_EXPORT pixman_bool_t
-pixman_f_transform_translate (struct pixman_f_transform *forward,
-                              struct pixman_f_transform *reverse,
-                              double                     tx,
-                              double                     ty)
-{
-    struct pixman_f_transform t;
-
-    if (forward)
-    {
-	pixman_f_transform_init_translate (&t, tx, ty);
-	pixman_f_transform_multiply (forward, &t, forward);
-    }
-
-    if (reverse)
-    {
-	pixman_f_transform_init_translate (&t, -tx, -ty);
-	pixman_f_transform_multiply (reverse, reverse, &t);
-    }
-
-    return TRUE;
-}
-
-PIXMAN_EXPORT pixman_bool_t
-pixman_f_transform_bounds (const struct pixman_f_transform *t,
-                           struct pixman_box16 *            b)
-{
-    struct pixman_f_vector v[4];
-    int i;
-    int x1, y1, x2, y2;
-
-    v[0].v[0] = b->x1;
-    v[0].v[1] = b->y1;
-    v[0].v[2] = 1;
-    v[1].v[0] = b->x2;
-    v[1].v[1] = b->y1;
-    v[1].v[2] = 1;
-    v[2].v[0] = b->x2;
-    v[2].v[1] = b->y2;
-    v[2].v[2] = 1;
-    v[3].v[0] = b->x1;
-    v[3].v[1] = b->y2;
-    v[3].v[2] = 1;
-
-    for (i = 0; i < 4; i++)
-    {
-	if (!pixman_f_transform_point (t, &v[i]))
-	    return FALSE;
-
-	x1 = floor (v[i].v[0]);
-	y1 = floor (v[i].v[1]);
-	x2 = ceil (v[i].v[0]);
-	y2 = ceil (v[i].v[1]);
-
-	if (i == 0)
-	{
-	    b->x1 = x1;
-	    b->y1 = y1;
-	    b->x2 = x2;
-	    b->y2 = y2;
-	}
-	else
-	{
-	    if (x1 < b->x1) b->x1 = x1;
-	    if (y1 < b->y1) b->y1 = y1;
-	    if (x2 > b->x2) b->x2 = x2;
-	    if (y2 > b->y2) b->y2 = y2;
-	}
-    }
-
-    return TRUE;
-}
-
-PIXMAN_EXPORT void
-pixman_f_transform_init_identity (struct pixman_f_transform *t)
-{
-    int i, j;
-
-    for (j = 0; j < 3; j++)
-    {
-	for (i = 0; i < 3; i++)
-	    t->m[j][i] = i == j ? 1 : 0;
-    }
-}
+/*
+ * Copyright © 2008 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  The copyright holders make no representations
+ * about the suitability of this software for any purpose.  It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
+
+/*
+ * Matrix interfaces
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <math.h>
+#include <string.h>
+#include "pixman-private.h"
+
+#define F(x)    pixman_int_to_fixed (x)
+
+PIXMAN_EXPORT void
+pixman_transform_init_identity (struct pixman_transform *matrix)
+{
+    int i;
+
+    memset (matrix, '\0', sizeof (struct pixman_transform));
+    for (i = 0; i < 3; i++)
+	matrix->matrix[i][i] = F (1);
+}
+
+typedef pixman_fixed_32_32_t pixman_fixed_34_30_t;
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_point_3d (const struct pixman_transform *transform,
+                           struct pixman_vector *         vector)
+{
+    struct pixman_vector result;
+    pixman_fixed_32_32_t partial;
+    pixman_fixed_48_16_t v;
+    int i, j;
+
+    for (j = 0; j < 3; j++)
+    {
+	v = 0;
+	for (i = 0; i < 3; i++)
+	{
+	    partial = ((pixman_fixed_48_16_t) transform->matrix[j][i] *
+	               (pixman_fixed_48_16_t) vector->vector[i]);
+	    v += partial >> 16;
+	}
+	
+	if (v > pixman_max_fixed_48_16 || v < pixman_min_fixed_48_16)
+	    return FALSE;
+	
+	result.vector[j] = (pixman_fixed_t) v;
+    }
+    
+    *vector = result;
+
+    if (!result.vector[2])
+	return FALSE;
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_point (const struct pixman_transform *transform,
+                        struct pixman_vector *         vector)
+{
+    pixman_fixed_32_32_t partial;
+    pixman_fixed_34_30_t v[3];
+    pixman_fixed_48_16_t quo;
+    int i, j;
+
+    for (j = 0; j < 3; j++)
+    {
+	v[j] = 0;
+	
+	for (i = 0; i < 3; i++)
+	{
+	    partial = ((pixman_fixed_32_32_t) transform->matrix[j][i] *
+	               (pixman_fixed_32_32_t) vector->vector[i]);
+	    v[j] += partial >> 2;
+	}
+    }
+    
+    if (!(v[2] >> 16))
+	return FALSE;
+
+    for (j = 0; j < 2; j++)
+    {
+	quo = v[j] / (v[2] >> 16);
+	if (quo > pixman_max_fixed_48_16 || quo < pixman_min_fixed_48_16)
+	    return FALSE;
+	vector->vector[j] = (pixman_fixed_t) quo;
+    }
+    
+    vector->vector[2] = pixman_fixed_1;
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_multiply (struct pixman_transform *      dst,
+                           const struct pixman_transform *l,
+                           const struct pixman_transform *r)
+{
+    struct pixman_transform d;
+    int dx, dy;
+    int o;
+
+    for (dy = 0; dy < 3; dy++)
+    {
+	for (dx = 0; dx < 3; dx++)
+	{
+	    pixman_fixed_48_16_t v;
+	    pixman_fixed_32_32_t partial;
+	    
+	    v = 0;
+	    for (o = 0; o < 3; o++)
+	    {
+		partial =
+		    (pixman_fixed_32_32_t) l->matrix[dy][o] *
+		    (pixman_fixed_32_32_t) r->matrix[o][dx];
+
+		v += partial >> 16;
+	    }
+
+	    if (v > pixman_max_fixed_48_16 || v < pixman_min_fixed_48_16)
+		return FALSE;
+	    
+	    d.matrix[dy][dx] = (pixman_fixed_t) v;
+	}
+    }
+
+    *dst = d;
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_transform_init_scale (struct pixman_transform *t,
+                             pixman_fixed_t           sx,
+                             pixman_fixed_t           sy)
+{
+    memset (t, '\0', sizeof (struct pixman_transform));
+
+    t->matrix[0][0] = sx;
+    t->matrix[1][1] = sy;
+    t->matrix[2][2] = F (1);
+}
+
+static pixman_fixed_t
+fixed_inverse (pixman_fixed_t x)
+{
+    return (pixman_fixed_t) ((((pixman_fixed_48_16_t) F (1)) * F (1)) / x);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_scale (struct pixman_transform *forward,
+                        struct pixman_transform *reverse,
+                        pixman_fixed_t           sx,
+                        pixman_fixed_t           sy)
+{
+    struct pixman_transform t;
+
+    if (sx == 0 || sy == 0)
+	return FALSE;
+
+    if (forward)
+    {
+	pixman_transform_init_scale (&t, sx, sy);
+	if (!pixman_transform_multiply (forward, &t, forward))
+	    return FALSE;
+    }
+    
+    if (reverse)
+    {
+	pixman_transform_init_scale (&t, fixed_inverse (sx),
+	                             fixed_inverse (sy));
+	if (!pixman_transform_multiply (reverse, reverse, &t))
+	    return FALSE;
+    }
+    
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_transform_init_rotate (struct pixman_transform *t,
+                              pixman_fixed_t           c,
+                              pixman_fixed_t           s)
+{
+    memset (t, '\0', sizeof (struct pixman_transform));
+
+    t->matrix[0][0] = c;
+    t->matrix[0][1] = -s;
+    t->matrix[1][0] = s;
+    t->matrix[1][1] = c;
+    t->matrix[2][2] = F (1);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_rotate (struct pixman_transform *forward,
+                         struct pixman_transform *reverse,
+                         pixman_fixed_t           c,
+                         pixman_fixed_t           s)
+{
+    struct pixman_transform t;
+
+    if (forward)
+    {
+	pixman_transform_init_rotate (&t, c, s);
+	if (!pixman_transform_multiply (forward, &t, forward))
+	    return FALSE;
+    }
+
+    if (reverse)
+    {
+	pixman_transform_init_rotate (&t, c, -s);
+	if (!pixman_transform_multiply (reverse, reverse, &t))
+	    return FALSE;
+    }
+    
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_transform_init_translate (struct pixman_transform *t,
+                                 pixman_fixed_t           tx,
+                                 pixman_fixed_t           ty)
+{
+    memset (t, '\0', sizeof (struct pixman_transform));
+
+    t->matrix[0][0] = F (1);
+    t->matrix[0][2] = tx;
+    t->matrix[1][1] = F (1);
+    t->matrix[1][2] = ty;
+    t->matrix[2][2] = F (1);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_translate (struct pixman_transform *forward,
+                            struct pixman_transform *reverse,
+                            pixman_fixed_t           tx,
+                            pixman_fixed_t           ty)
+{
+    struct pixman_transform t;
+
+    if (forward)
+    {
+	pixman_transform_init_translate (&t, tx, ty);
+
+	if (!pixman_transform_multiply (forward, &t, forward))
+	    return FALSE;
+    }
+
+    if (reverse)
+    {
+	pixman_transform_init_translate (&t, -tx, -ty);
+
+	if (!pixman_transform_multiply (reverse, reverse, &t))
+	    return FALSE;
+    }
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_bounds (const struct pixman_transform *matrix,
+                         struct pixman_box16 *          b)
+
+{
+    struct pixman_vector v[4];
+    int i;
+    int x1, y1, x2, y2;
+
+    v[0].vector[0] = F (b->x1);
+    v[0].vector[1] = F (b->y1);
+    v[0].vector[2] = F (1);
+
+    v[1].vector[0] = F (b->x2);
+    v[1].vector[1] = F (b->y1);
+    v[1].vector[2] = F (1);
+
+    v[2].vector[0] = F (b->x2);
+    v[2].vector[1] = F (b->y2);
+    v[2].vector[2] = F (1);
+
+    v[3].vector[0] = F (b->x1);
+    v[3].vector[1] = F (b->y2);
+    v[3].vector[2] = F (1);
+
+    for (i = 0; i < 4; i++)
+    {
+	if (!pixman_transform_point (matrix, &v[i]))
+	    return FALSE;
+
+	x1 = pixman_fixed_to_int (v[i].vector[0]);
+	y1 = pixman_fixed_to_int (v[i].vector[1]);
+	x2 = pixman_fixed_to_int (pixman_fixed_ceil (v[i].vector[0]));
+	y2 = pixman_fixed_to_int (pixman_fixed_ceil (v[i].vector[1]));
+
+	if (i == 0)
+	{
+	    b->x1 = x1;
+	    b->y1 = y1;
+	    b->x2 = x2;
+	    b->y2 = y2;
+	}
+	else
+	{
+	    if (x1 < b->x1) b->x1 = x1;
+	    if (y1 < b->y1) b->y1 = y1;
+	    if (x2 > b->x2) b->x2 = x2;
+	    if (y2 > b->y2) b->y2 = y2;
+	}
+    }
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_invert (struct pixman_transform *      dst,
+                         const struct pixman_transform *src)
+{
+    struct pixman_f_transform m, r;
+
+    pixman_f_transform_from_pixman_transform (&m, src);
+
+    if (!pixman_f_transform_invert (&r, &m))
+	return FALSE;
+
+    if (!pixman_transform_from_pixman_f_transform (dst, &r))
+	return FALSE;
+
+    return TRUE;
+}
+
+static pixman_bool_t
+within_epsilon (pixman_fixed_t a,
+                pixman_fixed_t b,
+                pixman_fixed_t epsilon)
+{
+    pixman_fixed_t t = a - b;
+
+    if (t < 0)
+	t = -t;
+
+    return t <= epsilon;
+}
+
+#define EPSILON (pixman_fixed_t) (2)
+
+#define IS_SAME(a, b) (within_epsilon (a, b, EPSILON))
+#define IS_ZERO(a)    (within_epsilon (a, 0, EPSILON))
+#define IS_ONE(a)     (within_epsilon (a, F (1), EPSILON))
+#define IS_UNIT(a)			    \
+    (within_epsilon (a, F (1), EPSILON) ||  \
+     within_epsilon (a, F (-1), EPSILON) || \
+     IS_ZERO (a))
+#define IS_INT(a)    (IS_ZERO (pixman_fixed_frac (a)))
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_is_identity (const struct pixman_transform *t)
+{
+    return (IS_SAME (t->matrix[0][0], t->matrix[1][1]) &&
+	    IS_SAME (t->matrix[0][0], t->matrix[2][2]) &&
+	    !IS_ZERO (t->matrix[0][0]) &&
+	    IS_ZERO (t->matrix[0][1]) &&
+	    IS_ZERO (t->matrix[0][2]) &&
+	    IS_ZERO (t->matrix[1][0]) &&
+	    IS_ZERO (t->matrix[1][2]) &&
+	    IS_ZERO (t->matrix[2][0]) &&
+	    IS_ZERO (t->matrix[2][1]));
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_is_scale (const struct pixman_transform *t)
+{
+    return (!IS_ZERO (t->matrix[0][0]) &&
+            IS_ZERO (t->matrix[0][1]) &&
+            IS_ZERO (t->matrix[0][2]) &&
+
+            IS_ZERO (t->matrix[1][0]) &&
+            !IS_ZERO (t->matrix[1][1]) &&
+            IS_ZERO (t->matrix[1][2]) &&
+
+            IS_ZERO (t->matrix[2][0]) &&
+            IS_ZERO (t->matrix[2][1]) &&
+            !IS_ZERO (t->matrix[2][2]));
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_is_int_translate (const struct pixman_transform *t)
+{
+    return (IS_ONE (t->matrix[0][0]) &&
+            IS_ZERO (t->matrix[0][1]) &&
+            IS_INT (t->matrix[0][2]) &&
+
+            IS_ZERO (t->matrix[1][0]) &&
+            IS_ONE (t->matrix[1][1]) &&
+            IS_INT (t->matrix[1][2]) &&
+
+            IS_ZERO (t->matrix[2][0]) &&
+            IS_ZERO (t->matrix[2][1]) &&
+            IS_ONE (t->matrix[2][2]));
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_is_inverse (const struct pixman_transform *a,
+                             const struct pixman_transform *b)
+{
+    struct pixman_transform t;
+
+    if (!pixman_transform_multiply (&t, a, b))
+	return FALSE;
+
+    return pixman_transform_is_identity (&t);
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_from_pixman_transform (struct pixman_f_transform *    ft,
+                                          const struct pixman_transform *t)
+{
+    int i, j;
+
+    for (j = 0; j < 3; j++)
+    {
+	for (i = 0; i < 3; i++)
+	    ft->m[j][i] = pixman_fixed_to_double (t->matrix[j][i]);
+    }
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_from_pixman_f_transform (struct pixman_transform *        t,
+                                          const struct pixman_f_transform *ft)
+{
+    int i, j;
+
+    for (j = 0; j < 3; j++)
+    {
+	for (i = 0; i < 3; i++)
+	{
+	    double d = ft->m[j][i];
+	    if (d < -32767.0 || d > 32767.0)
+		return FALSE;
+	    d = d * 65536.0 + 0.5;
+	    t->matrix[j][i] = (pixman_fixed_t) floor (d);
+	}
+    }
+    
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_f_transform_invert (struct pixman_f_transform *      dst,
+                           const struct pixman_f_transform *src)
+{
+    double det;
+    int i, j;
+    static int a[3] = { 2, 2, 1 };
+    static int b[3] = { 1, 0, 0 };
+
+    det = 0;
+    for (i = 0; i < 3; i++)
+    {
+	double p;
+	int ai = a[i];
+	int bi = b[i];
+	p = src->m[i][0] * (src->m[ai][2] * src->m[bi][1] -
+	                    src->m[ai][1] * src->m[bi][2]);
+	if (i == 1)
+	    p = -p;
+	det += p;
+    }
+    
+    if (det == 0)
+	return FALSE;
+    
+    det = 1 / det;
+    for (j = 0; j < 3; j++)
+    {
+	for (i = 0; i < 3; i++)
+	{
+	    double p;
+	    int ai = a[i];
+	    int aj = a[j];
+	    int bi = b[i];
+	    int bj = b[j];
+
+	    p = (src->m[ai][aj] * src->m[bi][bj] -
+	         src->m[ai][bj] * src->m[bi][aj]);
+	    
+	    if (((i + j) & 1) != 0)
+		p = -p;
+	    
+	    dst->m[j][i] = det * p;
+	}
+    }
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_f_transform_point (const struct pixman_f_transform *t,
+                          struct pixman_f_vector *         v)
+{
+    struct pixman_f_vector result;
+    int i, j;
+    double a;
+
+    for (j = 0; j < 3; j++)
+    {
+	a = 0;
+	for (i = 0; i < 3; i++)
+	    a += t->m[j][i] * v->v[i];
+	result.v[j] = a;
+    }
+    
+    if (!result.v[2])
+	return FALSE;
+
+    for (j = 0; j < 2; j++)
+	v->v[j] = result.v[j] / result.v[2];
+
+    v->v[2] = 1;
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_point_3d (const struct pixman_f_transform *t,
+                             struct pixman_f_vector *         v)
+{
+    struct pixman_f_vector result;
+    int i, j;
+    double a;
+
+    for (j = 0; j < 3; j++)
+    {
+	a = 0;
+	for (i = 0; i < 3; i++)
+	    a += t->m[j][i] * v->v[i];
+	result.v[j] = a;
+    }
+    
+    *v = result;
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_multiply (struct pixman_f_transform *      dst,
+                             const struct pixman_f_transform *l,
+                             const struct pixman_f_transform *r)
+{
+    struct pixman_f_transform d;
+    int dx, dy;
+    int o;
+
+    for (dy = 0; dy < 3; dy++)
+    {
+	for (dx = 0; dx < 3; dx++)
+	{
+	    double v = 0;
+	    for (o = 0; o < 3; o++)
+		v += l->m[dy][o] * r->m[o][dx];
+	    d.m[dy][dx] = v;
+	}
+    }
+    
+    *dst = d;
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_init_scale (struct pixman_f_transform *t,
+                               double                     sx,
+                               double                     sy)
+{
+    t->m[0][0] = sx;
+    t->m[0][1] = 0;
+    t->m[0][2] = 0;
+    t->m[1][0] = 0;
+    t->m[1][1] = sy;
+    t->m[1][2] = 0;
+    t->m[2][0] = 0;
+    t->m[2][1] = 0;
+    t->m[2][2] = 1;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_f_transform_scale (struct pixman_f_transform *forward,
+                          struct pixman_f_transform *reverse,
+                          double                     sx,
+                          double                     sy)
+{
+    struct pixman_f_transform t;
+
+    if (sx == 0 || sy == 0)
+	return FALSE;
+
+    if (forward)
+    {
+	pixman_f_transform_init_scale (&t, sx, sy);
+	pixman_f_transform_multiply (forward, &t, forward);
+    }
+    
+    if (reverse)
+    {
+	pixman_f_transform_init_scale (&t, 1 / sx, 1 / sy);
+	pixman_f_transform_multiply (reverse, reverse, &t);
+    }
+    
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_init_rotate (struct pixman_f_transform *t,
+                                double                     c,
+                                double                     s)
+{
+    t->m[0][0] = c;
+    t->m[0][1] = -s;
+    t->m[0][2] = 0;
+    t->m[1][0] = s;
+    t->m[1][1] = c;
+    t->m[1][2] = 0;
+    t->m[2][0] = 0;
+    t->m[2][1] = 0;
+    t->m[2][2] = 1;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_f_transform_rotate (struct pixman_f_transform *forward,
+                           struct pixman_f_transform *reverse,
+                           double                     c,
+                           double                     s)
+{
+    struct pixman_f_transform t;
+
+    if (forward)
+    {
+	pixman_f_transform_init_rotate (&t, c, s);
+	pixman_f_transform_multiply (forward, &t, forward);
+    }
+    
+    if (reverse)
+    {
+	pixman_f_transform_init_rotate (&t, c, -s);
+	pixman_f_transform_multiply (reverse, reverse, &t);
+    }
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_init_translate (struct pixman_f_transform *t,
+                                   double                     tx,
+                                   double                     ty)
+{
+    t->m[0][0] = 1;
+    t->m[0][1] = 0;
+    t->m[0][2] = tx;
+    t->m[1][0] = 0;
+    t->m[1][1] = 1;
+    t->m[1][2] = ty;
+    t->m[2][0] = 0;
+    t->m[2][1] = 0;
+    t->m[2][2] = 1;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_f_transform_translate (struct pixman_f_transform *forward,
+                              struct pixman_f_transform *reverse,
+                              double                     tx,
+                              double                     ty)
+{
+    struct pixman_f_transform t;
+
+    if (forward)
+    {
+	pixman_f_transform_init_translate (&t, tx, ty);
+	pixman_f_transform_multiply (forward, &t, forward);
+    }
+
+    if (reverse)
+    {
+	pixman_f_transform_init_translate (&t, -tx, -ty);
+	pixman_f_transform_multiply (reverse, reverse, &t);
+    }
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_f_transform_bounds (const struct pixman_f_transform *t,
+                           struct pixman_box16 *            b)
+{
+    struct pixman_f_vector v[4];
+    int i;
+    int x1, y1, x2, y2;
+
+    v[0].v[0] = b->x1;
+    v[0].v[1] = b->y1;
+    v[0].v[2] = 1;
+    v[1].v[0] = b->x2;
+    v[1].v[1] = b->y1;
+    v[1].v[2] = 1;
+    v[2].v[0] = b->x2;
+    v[2].v[1] = b->y2;
+    v[2].v[2] = 1;
+    v[3].v[0] = b->x1;
+    v[3].v[1] = b->y2;
+    v[3].v[2] = 1;
+
+    for (i = 0; i < 4; i++)
+    {
+	if (!pixman_f_transform_point (t, &v[i]))
+	    return FALSE;
+
+	x1 = floor (v[i].v[0]);
+	y1 = floor (v[i].v[1]);
+	x2 = ceil (v[i].v[0]);
+	y2 = ceil (v[i].v[1]);
+
+	if (i == 0)
+	{
+	    b->x1 = x1;
+	    b->y1 = y1;
+	    b->x2 = x2;
+	    b->y2 = y2;
+	}
+	else
+	{
+	    if (x1 < b->x1) b->x1 = x1;
+	    if (y1 < b->y1) b->y1 = y1;
+	    if (x2 > b->x2) b->x2 = x2;
+	    if (y2 > b->y2) b->y2 = y2;
+	}
+    }
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_init_identity (struct pixman_f_transform *t)
+{
+    int i, j;
+
+    for (j = 0; j < 3; j++)
+    {
+	for (i = 0; i < 3; i++)
+	    t->m[j][i] = i == j ? 1 : 0;
+    }
+}
diff --git a/pixman/pixman/pixman-timer.c b/pixman/pixman/pixman-timer.c
index f5ae18e89..c45d7b4fa 100644
--- a/pixman/pixman/pixman-timer.c
+++ b/pixman/pixman/pixman-timer.c
@@ -1,66 +1,66 @@
-/*
- * Copyright © 2007 Red Hat, Inc.
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Red Hat not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission.  Red Hat makes no representations about the
- * suitability of this software for any purpose.  It is provided "as is"
- * without express or implied warranty.
- *
- * RED HAT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL RED HAT
- * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#include <stdlib.h>
-#include <stdio.h>
-#include "pixman-private.h"
-
-#ifdef PIXMAN_TIMERS
-
-static pixman_timer_t *timers;
-
-static void
-dump_timers (void)
-{
-    pixman_timer_t *timer;
-
-    for (timer = timers; timer != NULL; timer = timer->next)
-    {
-	printf ("%s:   total: %llu     n: %llu      avg: %f\n",
-	        timer->name,
-	        timer->total,
-	        timer->n_times,
-	        timer->total / (double)timer->n_times);
-    }
-}
-
-void
-pixman_timer_register (pixman_timer_t *timer)
-{
-    static int initialized;
-
-    int atexit (void (*function)(void));
-
-    if (!initialized)
-    {
-	atexit (dump_timers);
-	initialized = 1;
-    }
-
-    timer->next = timers;
-    timers = timer;
-}
-
-#endif
+/*
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  Red Hat makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * RED HAT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL RED HAT
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "pixman-private.h"
+
+#ifdef PIXMAN_TIMERS
+
+static pixman_timer_t *timers;
+
+static void
+dump_timers (void)
+{
+    pixman_timer_t *timer;
+
+    for (timer = timers; timer != NULL; timer = timer->next)
+    {
+	printf ("%s:   total: %llu     n: %llu      avg: %f\n",
+	        timer->name,
+	        timer->total,
+	        timer->n_times,
+	        timer->total / (double)timer->n_times);
+    }
+}
+
+void
+pixman_timer_register (pixman_timer_t *timer)
+{
+    static int initialized;
+
+    int atexit (void (*function)(void));
+
+    if (!initialized)
+    {
+	atexit (dump_timers);
+	initialized = 1;
+    }
+
+    timer->next = timers;
+    timers = timer;
+}
+
+#endif
diff --git a/pixman/pixman/pixman-version.h.in b/pixman/pixman/pixman-version.h.in
index 256b2e6f1..022bf1a3c 100644
--- a/pixman/pixman/pixman-version.h.in
+++ b/pixman/pixman/pixman-version.h.in
@@ -1,50 +1,50 @@
-/*
- * Copyright © 2008 Red Hat, Inc.
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy,
- * modify, merge, publish, distribute, sublicense, and/or sell copies
- * of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Author: Carl D. Worth <cworth@cworth.org>
- */
-
-#ifndef PIXMAN_VERSION_H__
-#define PIXMAN_VERSION_H__
-
-#ifndef PIXMAN_H__
-#  error pixman-version.h should only be included by pixman.h
-#endif
-
-#define PIXMAN_VERSION_MAJOR @PIXMAN_VERSION_MAJOR@
-#define PIXMAN_VERSION_MINOR @PIXMAN_VERSION_MINOR@
-#define PIXMAN_VERSION_MICRO @PIXMAN_VERSION_MICRO@
-
-#define PIXMAN_VERSION_STRING "@PIXMAN_VERSION_MAJOR@.@PIXMAN_VERSION_MINOR@.@PIXMAN_VERSION_MICRO@"
-
-#define PIXMAN_VERSION_ENCODE(major, minor, micro) (	\
-	  ((major) * 10000)				\
-	+ ((minor) *   100)				\
-	+ ((micro) *     1))
-
-#define PIXMAN_VERSION PIXMAN_VERSION_ENCODE(	\
-	PIXMAN_VERSION_MAJOR,			\
-	PIXMAN_VERSION_MINOR,			\
-	PIXMAN_VERSION_MICRO)
-
-#endif /* PIXMAN_VERSION_H__ */
+/*
+ * Copyright © 2008 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Author: Carl D. Worth <cworth@cworth.org>
+ */
+
+#ifndef PIXMAN_VERSION_H__
+#define PIXMAN_VERSION_H__
+
+#ifndef PIXMAN_H__
+#  error pixman-version.h should only be included by pixman.h
+#endif
+
+#define PIXMAN_VERSION_MAJOR @PIXMAN_VERSION_MAJOR@
+#define PIXMAN_VERSION_MINOR @PIXMAN_VERSION_MINOR@
+#define PIXMAN_VERSION_MICRO @PIXMAN_VERSION_MICRO@
+
+#define PIXMAN_VERSION_STRING "@PIXMAN_VERSION_MAJOR@.@PIXMAN_VERSION_MINOR@.@PIXMAN_VERSION_MICRO@"
+
+#define PIXMAN_VERSION_ENCODE(major, minor, micro) (	\
+	  ((major) * 10000)				\
+	+ ((minor) *   100)				\
+	+ ((micro) *     1))
+
+#define PIXMAN_VERSION PIXMAN_VERSION_ENCODE(	\
+	PIXMAN_VERSION_MAJOR,			\
+	PIXMAN_VERSION_MINOR,			\
+	PIXMAN_VERSION_MICRO)
+
+#endif /* PIXMAN_VERSION_H__ */
diff --git a/pixman/test/fuzzer-find-diff.pl b/pixman/test/fuzzer-find-diff.pl
index a43f6fb83..53d9b8de1 100644
--- a/pixman/test/fuzzer-find-diff.pl
+++ b/pixman/test/fuzzer-find-diff.pl
@@ -1,68 +1,68 @@
-#!/usr/bin/env perl
-
-$usage = "Usage:
-  fuzzer-find-diff.pl reference_binary new_binary [number_of_tests_to_run]
-
-The first two input arguments are the commands to run the test programs
-based on fuzzer_test_main() function from 'util.c' (preferably they should
-be statically compiled, this can be achieved via '--disable-shared' pixman
-configure option). The third optional argument is the number of test rounds
-to run (if not specified, then testing runs infinitely or until some problem
-is detected).
-
-Usage examples:
-  fuzzer-find-diff.pl ./blitters-test-with-sse-disabled ./blitters-test 9000000
-  fuzzer-find-diff.pl ./blitters-test \"ssh ppc64_host /path/to/blitters-test\"
-";
-
-$#ARGV >= 1 or die $usage;
-
-$batch_size = 10000;
-
-if ($#ARGV >= 2) {
-    $number_of_tests = int($ARGV[2]);
-} else {
-    $number_of_tests = -1
-}
-
-sub test_range {
-    my $min = shift;
-    my $max = shift;
-
-    if (`$ARGV[0] $min $max 2>/dev/null` eq `$ARGV[1] $min $max 2>/dev/null`) {
-        return;
-    }
-
-    while ($max != $min + 1) {
-        my $avg = int(($min + $max) / 2);
-        my $res1 = `$ARGV[0] $min $avg 2>/dev/null`;
-        my $res2 = `$ARGV[1] $min $avg 2>/dev/null`;
-        if ($res1 ne $res2) {
-            $max = $avg;
-        } else {
-            $min = $avg;
-        }
-    }
-    return $max;
-}
-
-$base = 1;
-while ($number_of_tests <= 0 || $base <= $number_of_tests) {
-    printf("testing %-12d\r", $base + $batch_size - 1);
-    my $res = test_range($base, $base + $batch_size - 1);
-    if ($res) {
-        printf("Failure: results are different for test %d:\n", $res);
-
-        printf("\n-- ref --\n");
-        print `$ARGV[0] $res`;
-        printf("-- new --\n");
-        print `$ARGV[1] $res`;
-
-        printf("The problematic conditions can be reproduced by running:\n");
-        printf("$ARGV[1] %d\n", $res);
-
-        exit(1);
-    }
-    $base += $batch_size;
-}
-printf("Success: %d tests finished\n", $base - 1);
+#!/usr/bin/env perl
+
+$usage = "Usage:
+  fuzzer-find-diff.pl reference_binary new_binary [number_of_tests_to_run]
+
+The first two input arguments are the commands to run the test programs
+based on fuzzer_test_main() function from 'util.c' (preferably they should
+be statically compiled, this can be achieved via '--disable-shared' pixman
+configure option). The third optional argument is the number of test rounds
+to run (if not specified, then testing runs infinitely or until some problem
+is detected).
+
+Usage examples:
+  fuzzer-find-diff.pl ./blitters-test-with-sse-disabled ./blitters-test 9000000
+  fuzzer-find-diff.pl ./blitters-test \"ssh ppc64_host /path/to/blitters-test\"
+";
+
+$#ARGV >= 1 or die $usage;
+
+$batch_size = 10000;
+
+if ($#ARGV >= 2) {
+    $number_of_tests = int($ARGV[2]);
+} else {
+    $number_of_tests = -1
+}
+
+sub test_range {
+    my $min = shift;
+    my $max = shift;
+
+    if (`$ARGV[0] $min $max 2>/dev/null` eq `$ARGV[1] $min $max 2>/dev/null`) {
+        return;
+    }
+
+    while ($max != $min + 1) {
+        my $avg = int(($min + $max) / 2);
+        my $res1 = `$ARGV[0] $min $avg 2>/dev/null`;
+        my $res2 = `$ARGV[1] $min $avg 2>/dev/null`;
+        if ($res1 ne $res2) {
+            $max = $avg;
+        } else {
+            $min = $avg;
+        }
+    }
+    return $max;
+}
+
+$base = 1;
+while ($number_of_tests <= 0 || $base <= $number_of_tests) {
+    printf("testing %-12d\r", $base + $batch_size - 1);
+    my $res = test_range($base, $base + $batch_size - 1);
+    if ($res) {
+        printf("Failure: results are different for test %d:\n", $res);
+
+        printf("\n-- ref --\n");
+        print `$ARGV[0] $res`;
+        printf("-- new --\n");
+        print `$ARGV[1] $res`;
+
+        printf("The problematic conditions can be reproduced by running:\n");
+        printf("$ARGV[1] %d\n", $res);
+
+        exit(1);
+    }
+    $base += $batch_size;
+}
+printf("Success: %d tests finished\n", $base - 1);
diff --git a/pixman/test/region-test.c b/pixman/test/region-test.c
index 9d5a41eb9..a1fc4a837 100644
--- a/pixman/test/region-test.c
+++ b/pixman/test/region-test.c
@@ -1,123 +1,123 @@
-#include <assert.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include "utils.h"
-
-int
-main ()
-{
-    pixman_region32_t r1;
-    pixman_region32_t r2;
-    pixman_region32_t r3;
-    pixman_box32_t boxes[] = {
-	{ 10, 10, 20, 20 },
-	{ 30, 30, 30, 40 },
-	{ 50, 45, 60, 44 },
-    };
-    pixman_box32_t boxes2[] = {
-	{ 2, 6, 7, 6 },
-	{ 4, 1, 6, 7 },
-    };
-    pixman_box32_t boxes3[] = {
-	{ 2, 6, 7, 6 },
-	{ 4, 1, 6, 1 },
-    };
-    int i, j;
-    pixman_box32_t *b;
-    pixman_image_t *image, *fill;
-    pixman_color_t white = {
-	0xffff,
-	0xffff,
-	0xffff,
-	0xffff
-    };
-
-    /* This used to go into an infinite loop before pixman-region.c
-     * was fixed to not use explict "short" variables
-     */
-    pixman_region32_init_rect (&r1, 0, 0, 20, 64000);
-    pixman_region32_init_rect (&r2, 0, 0, 20, 64000);
-    pixman_region32_init_rect (&r3, 0, 0, 20, 64000);
-
-    pixman_region32_subtract (&r1, &r2, &r3);
-
-
-    /* This would produce a region containing an empty
-     * rectangle in it. Such regions are considered malformed,
-     * but using an empty rectangle for initialization should
-     * work.
-     */
-    pixman_region32_init_rects (&r1, boxes, 3);
-
-    b = pixman_region32_rectangles (&r1, &i);
-
-    assert (i == 1);
-    
-    while (i--)
-    {
-	assert (b[i].x1 < b[i].x2);
-	assert (b[i].y1 < b[i].y2);
-    }
-
-    /* This would produce a rectangle containing the bounding box
-     * of the two rectangles. The correct result is to eliminate
-     * the broken rectangle.
-     */
-    pixman_region32_init_rects (&r1, boxes2, 2);
-
-    b = pixman_region32_rectangles (&r1, &i);
-
-    assert (i == 1);
-
-    assert (b[0].x1 == 4);
-    assert (b[0].y1 == 1);
-    assert (b[0].x2 == 6);
-    assert (b[0].y2 == 7);
-
-    /* This should produce an empty region */
-    pixman_region32_init_rects (&r1, boxes3, 2);
-
-    b = pixman_region32_rectangles (&r1, &i);
-
-    assert (i == 0);
-
-    fill = pixman_image_create_solid_fill (&white);
-    for (i = 0; i < 100; i++)
-    {
-	int image_size = 128;
-
-	pixman_region32_init (&r1);
-
-	/* Add some random rectangles */
-	for (j = 0; j < 64; j++)
-	    pixman_region32_union_rect (&r1, &r1,
-					lcg_rand_n (image_size),
-					lcg_rand_n (image_size),
-					lcg_rand_n (25),
-					lcg_rand_n (25));
-
-	/* Clip to image size */
-	pixman_region32_init_rect (&r2, 0, 0, image_size, image_size);
-	pixman_region32_intersect (&r1, &r1, &r2);
-	pixman_region32_fini (&r2);
-
-	/* render region to a1 mask */
-	image = pixman_image_create_bits (PIXMAN_a1, image_size, image_size, NULL, 0);
-	pixman_image_set_clip_region32 (image, &r1);
-	pixman_image_composite32 (PIXMAN_OP_SRC,
-				  fill, NULL, image,
-				  0, 0, 0, 0, 0, 0,
-				  image_size, image_size);
-	pixman_region32_init_from_image (&r2, image);
-
-	pixman_image_unref (image);
-
-	assert (pixman_region32_equal (&r1, &r2));
-	pixman_region32_fini (&r1);
-	pixman_region32_fini (&r2);
-
-    }
-    pixman_image_unref (fill);
-
-    return 0;
-}
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+int
+main ()
+{
+    pixman_region32_t r1;
+    pixman_region32_t r2;
+    pixman_region32_t r3;
+    pixman_box32_t boxes[] = {
+	{ 10, 10, 20, 20 },
+	{ 30, 30, 30, 40 },
+	{ 50, 45, 60, 44 },
+    };
+    pixman_box32_t boxes2[] = {
+	{ 2, 6, 7, 6 },
+	{ 4, 1, 6, 7 },
+    };
+    pixman_box32_t boxes3[] = {
+	{ 2, 6, 7, 6 },
+	{ 4, 1, 6, 1 },
+    };
+    int i, j;
+    pixman_box32_t *b;
+    pixman_image_t *image, *fill;
+    pixman_color_t white = {
+	0xffff,
+	0xffff,
+	0xffff,
+	0xffff
+    };
+
+    /* This used to go into an infinite loop before pixman-region.c
+     * was fixed to not use explict "short" variables
+     */
+    pixman_region32_init_rect (&r1, 0, 0, 20, 64000);
+    pixman_region32_init_rect (&r2, 0, 0, 20, 64000);
+    pixman_region32_init_rect (&r3, 0, 0, 20, 64000);
+
+    pixman_region32_subtract (&r1, &r2, &r3);
+
+
+    /* This would produce a region containing an empty
+     * rectangle in it. Such regions are considered malformed,
+     * but using an empty rectangle for initialization should
+     * work.
+     */
+    pixman_region32_init_rects (&r1, boxes, 3);
+
+    b = pixman_region32_rectangles (&r1, &i);
+
+    assert (i == 1);
+    
+    while (i--)
+    {
+	assert (b[i].x1 < b[i].x2);
+	assert (b[i].y1 < b[i].y2);
+    }
+
+    /* This would produce a rectangle containing the bounding box
+     * of the two rectangles. The correct result is to eliminate
+     * the broken rectangle.
+     */
+    pixman_region32_init_rects (&r1, boxes2, 2);
+
+    b = pixman_region32_rectangles (&r1, &i);
+
+    assert (i == 1);
+
+    assert (b[0].x1 == 4);
+    assert (b[0].y1 == 1);
+    assert (b[0].x2 == 6);
+    assert (b[0].y2 == 7);
+
+    /* This should produce an empty region */
+    pixman_region32_init_rects (&r1, boxes3, 2);
+
+    b = pixman_region32_rectangles (&r1, &i);
+
+    assert (i == 0);
+
+    fill = pixman_image_create_solid_fill (&white);
+    for (i = 0; i < 100; i++)
+    {
+	int image_size = 128;
+
+	pixman_region32_init (&r1);
+
+	/* Add some random rectangles */
+	for (j = 0; j < 64; j++)
+	    pixman_region32_union_rect (&r1, &r1,
+					lcg_rand_n (image_size),
+					lcg_rand_n (image_size),
+					lcg_rand_n (25),
+					lcg_rand_n (25));
+
+	/* Clip to image size */
+	pixman_region32_init_rect (&r2, 0, 0, image_size, image_size);
+	pixman_region32_intersect (&r1, &r1, &r2);
+	pixman_region32_fini (&r2);
+
+	/* render region to a1 mask */
+	image = pixman_image_create_bits (PIXMAN_a1, image_size, image_size, NULL, 0);
+	pixman_image_set_clip_region32 (image, &r1);
+	pixman_image_composite32 (PIXMAN_OP_SRC,
+				  fill, NULL, image,
+				  0, 0, 0, 0, 0, 0,
+				  image_size, image_size);
+	pixman_region32_init_from_image (&r2, image);
+
+	pixman_image_unref (image);
+
+	assert (pixman_region32_equal (&r1, &r2));
+	pixman_region32_fini (&r1);
+	pixman_region32_fini (&r2);
+
+    }
+    pixman_image_unref (fill);
+
+    return 0;
+}