From 150771e7aabf4c864b0b970c5b8d773634793abe Mon Sep 17 00:00:00 2001
From: marha <marha@users.sourceforge.net>
Date: Tue, 4 Jun 2013 09:07:26 +0200
Subject: xwininfo fontconfig libX11 libXau libXdmcp libXext mesa libXinerama
 libxcb libxcb/xcb-proto libfontenc pixman xkbcomp mkfontscale
 xkeyboard-config git update 4 Jun 2013

xserver          commit c21344add2fc589df83b29be5831c36a372201bd
libxcb           commit 9ae84ad187e2ba440c40f44b8eb21c82c2fdbf12
libxcb/xcb-proto commit bdfedfa57a13ff805580cfacafc70f9cc55df363
xkeyboard-config commit dad9ade4e83d1ef5a517fcc4cc9ad3a79b47acce
libX11           commit 8496122eb00ce6cd5d2308ee54f64b68c378e455
libXdmcp         commit 0b443c1b769b9c9a3b45b4252afe07e18b709ff4
libXext          commit d8366afbb0d2e4fbb1e419b1187f490522270bea
libfontenc       commit 3acba630d8b57084f7e92c15732408711ed5137a
libXinerama      commit 6e1d1dc328ba8162bba2f4694e7f3c706a1491ff
libXau           commit 899790011304c4029e15abf410e49ce7cec17e0a
xkbcomp          commit ed582f4fccd4e23abcfba8b3b03649fea6414f44
pixman           commit 2acfac5f8e097ee2ae225d986f981b55d65dd152
mkfontscale      commit 19e2cb7c6a3ec2c5b1bc0d24866fa685eef0ee13
xwininfo         commit ba0d1b0da21d2dbdd81098ed5778f3792b472e13
fontconfig       commit cd9b1033a68816a7acfbba1718ba0aa5888f6ec7
mesa             commit 7bafd88c153e395274b632e7eae4bc9fc3aec1d2
---
 pixman/configure.ac                   |  13 +-
 pixman/pixman/pixman-arm-neon-asm.h   |   2 +-
 pixman/pixman/pixman-fast-path.c      |  99 +----
 pixman/pixman/pixman-filter.c         |   2 +
 pixman/pixman/pixman-general.c        |  92 +++--
 pixman/pixman/pixman-image.c          |  10 +-
 pixman/pixman/pixman-implementation.c |  89 +++--
 pixman/pixman/pixman-mips-dspr2-asm.S | 723 ++++++++++++++++++++++++----------
 pixman/pixman/pixman-mips-dspr2-asm.h |  51 ++-
 pixman/pixman/pixman-mips-dspr2.c     |  31 ++
 pixman/pixman/pixman-mips-dspr2.h     |  42 ++
 pixman/pixman/pixman-mmx.c            |  82 ++--
 pixman/pixman/pixman-noop.c           | 183 ++++-----
 pixman/pixman/pixman-private.h        |  70 ++--
 pixman/pixman/pixman-region.c         |   6 +-
 pixman/pixman/pixman-sse2.c           |  93 ++---
 pixman/pixman/pixman-utils.c          |  11 +
 pixman/pixman/pixman-vmx.c            |   2 +
 pixman/pixman/pixman.c                |   2 +-
 pixman/pixman/refactor                | 478 ----------------------
 pixman/test/blitters-test.c           |  13 +-
 pixman/test/lowlevel-blt-bench.c      |  13 +-
 pixman/test/prng-test.c               |   5 +-
 pixman/test/utils-prng.c              |  58 ++-
 pixman/test/utils-prng.h              |   5 +-
 25 files changed, 1063 insertions(+), 1112 deletions(-)
 delete mode 100644 pixman/pixman/refactor

(limited to 'pixman')

diff --git a/pixman/configure.ac b/pixman/configure.ac
index 38f89b31e..221179ff1 100644
--- a/pixman/configure.ac
+++ b/pixman/configure.ac
@@ -53,8 +53,8 @@ AC_PREREQ([2.57])
 #
 
 m4_define([pixman_major], 0)
-m4_define([pixman_minor], 29)
-m4_define([pixman_micro], 3)
+m4_define([pixman_minor], 31)
+m4_define([pixman_micro], 1)
 
 m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
 
@@ -279,7 +279,7 @@ AC_MSG_CHECKING(whether to use Loongson MMI assembler)
 
 xserver_save_CFLAGS=$CFLAGS
 CFLAGS=" $LS_CFLAGS $CFLAGS -I$srcdir"
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+AC_LINK_IFELSE([AC_LANG_SOURCE([[
 #ifndef __mips_loongson_vector_rev
 #error "Loongson Multimedia Instructions are only available on Loongson"
 #endif
@@ -844,6 +844,13 @@ if test x$have_gettimeofday = xyes && test x$have_sys_time_h = xyes; then
    AC_DEFINE(HAVE_GETTIMEOFDAY, 1, [Whether we have gettimeofday()])
 fi
 
+dnl =====================================
+dnl Check for missing sqrtf() as, e.g., for Solaris 9
+
+AC_SEARCH_LIBS([sqrtf], [m], [],
+               [AC_DEFINE([sqrtf], [sqrt],
+                          [Define to sqrt if you do not have the `sqrtf' function.])])
+
 dnl =====================================
 dnl Thread local storage
 
diff --git a/pixman/pixman/pixman-arm-neon-asm.h b/pixman/pixman/pixman-arm-neon-asm.h
index 1673b080f..d0d92d74c 100644
--- a/pixman/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman/pixman-arm-neon-asm.h
@@ -385,7 +385,7 @@
  * execute simultaneously with NEON and be completely shadowed by it. Thus
  * we get no performance overhead at all (*). This looks like a very nice
  * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
- * but still can implement some rather advanced prefetch logic in sofware
+ * but still can implement some rather advanced prefetch logic in software
  * for almost zero cost!
  *
  * (*) The overhead of the prefetcher is visible when running some trivial
diff --git a/pixman/pixman/pixman-fast-path.c b/pixman/pixman/pixman-fast-path.c
index 247aea645..3982dce8b 100644
--- a/pixman/pixman/pixman-fast-path.c
+++ b/pixman/pixman/pixman-fast-path.c
@@ -2261,89 +2261,27 @@ fast_write_back_r5g6b5 (pixman_iter_t *iter)
     }
 }
 
-typedef struct
-{
-    pixman_format_code_t	format;
-    pixman_iter_get_scanline_t	get_scanline;
-    pixman_iter_write_back_t	write_back;
-} fetcher_info_t;
-
-static const fetcher_info_t fetchers[] =
-{
-    { PIXMAN_r5g6b5, fast_fetch_r5g6b5, fast_write_back_r5g6b5 },
-    { PIXMAN_null }
-};
-
-static pixman_bool_t
-fast_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
-{
-    pixman_image_t *image = iter->image;
-
-#define FLAGS								\
+#define IMAGE_FLAGS							\
     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
      FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
 
-    if ((iter->iter_flags & ITER_NARROW)			&&
-	(iter->image_flags & FLAGS) == FLAGS)
-    {
-	const fetcher_info_t *f;
-
-	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
-	{
-	    if (image->common.extended_format_code == f->format)
-	    {
-		uint8_t *b = (uint8_t *)image->bits.bits;
-		int s = image->bits.rowstride * 4;
-
-		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
-		iter->stride = s;
-
-		iter->get_scanline = f->get_scanline;
-		return TRUE;
-	    }
-	}
-    }
-
-    return FALSE;
-}
-
-static pixman_bool_t
-fast_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+static const pixman_iter_info_t fast_iters[] = 
 {
-    pixman_image_t *image = iter->image;
-
-    if ((iter->iter_flags & ITER_NARROW)		&&
-	(iter->image_flags & FAST_PATH_STD_DEST_FLAGS) == FAST_PATH_STD_DEST_FLAGS)
-    {
-	const fetcher_info_t *f;
-
-	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
-	{
-	    if (image->common.extended_format_code == f->format)
-	    {
-		uint8_t *b = (uint8_t *)image->bits.bits;
-		int s = image->bits.rowstride * 4;
-
-		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
-		iter->stride = s;
-
-		if ((iter->iter_flags & (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) ==
-		    (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA))
-		{
-		    iter->get_scanline = fast_dest_fetch_noop;
-		}
-		else
-		{
-		    iter->get_scanline = f->get_scanline;
-		}
-		iter->write_back = f->write_back;
-		return TRUE;
-	    }
-	}
-    }
-    return FALSE;
-}
-
+    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW | ITER_SRC,
+      _pixman_iter_init_bits_stride, fast_fetch_r5g6b5, NULL },
+
+    { PIXMAN_r5g6b5, FAST_PATH_STD_DEST_FLAGS,
+      ITER_NARROW | ITER_DEST,
+      _pixman_iter_init_bits_stride,
+      fast_fetch_r5g6b5, fast_write_back_r5g6b5 },
+    
+    { PIXMAN_r5g6b5, FAST_PATH_STD_DEST_FLAGS,
+      ITER_NARROW | ITER_DEST | ITER_IGNORE_RGB | ITER_IGNORE_ALPHA,
+      _pixman_iter_init_bits_stride,
+      fast_dest_fetch_noop, fast_write_back_r5g6b5 },
+
+    { PIXMAN_null },
+};
 
 pixman_implementation_t *
 _pixman_implementation_create_fast_path (pixman_implementation_t *fallback)
@@ -2351,8 +2289,7 @@ _pixman_implementation_create_fast_path (pixman_implementation_t *fallback)
     pixman_implementation_t *imp = _pixman_implementation_create (fallback, c_fast_paths);
 
     imp->fill = fast_path_fill;
-    imp->src_iter_init = fast_src_iter_init;
-    imp->dest_iter_init = fast_dest_iter_init;
+    imp->iter_info = fast_iters;
 
     return imp;
 }
diff --git a/pixman/pixman/pixman-filter.c b/pixman/pixman/pixman-filter.c
index 26b39d571..5ff7b6eaa 100644
--- a/pixman/pixman/pixman-filter.c
+++ b/pixman/pixman/pixman-filter.c
@@ -28,7 +28,9 @@
 #include <stdio.h>
 #include <math.h>
 #include <assert.h>
+#ifdef HAVE_CONFIG_H
 #include <config.h>
+#endif
 #include "pixman-private.h"
 
 typedef double (* kernel_func_t) (double x);
diff --git a/pixman/pixman/pixman-general.c b/pixman/pixman/pixman-general.c
index 93a1b9acf..4da5da5e2 100644
--- a/pixman/pixman/pixman-general.c
+++ b/pixman/pixman/pixman-general.c
@@ -37,43 +37,47 @@
 #include <string.h>
 #include "pixman-private.h"
 
-static pixman_bool_t
-general_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+static void
+general_iter_init (pixman_iter_t *iter, const pixman_iter_info_t *info)
 {
     pixman_image_t *image = iter->image;
 
-    if (image->type == LINEAR)
-	_pixman_linear_gradient_iter_init (image, iter);
-    else if (image->type == RADIAL)
+    switch (image->type)
+    {
+    case BITS:
+        if ((iter->iter_flags & ITER_SRC) == ITER_SRC)
+            _pixman_bits_image_src_iter_init (image, iter);
+        else
+            _pixman_bits_image_dest_iter_init (image, iter);
+        break;
+
+    case LINEAR:
+        _pixman_linear_gradient_iter_init (image, iter);
+        break;
+
+    case RADIAL:
 	_pixman_radial_gradient_iter_init (image, iter);
-    else if (image->type == CONICAL)
+        break;
+
+    case CONICAL:
 	_pixman_conical_gradient_iter_init (image, iter);
-    else if (image->type == BITS)
-	_pixman_bits_image_src_iter_init (image, iter);
-    else if (image->type == SOLID)
+        break;
+
+    case SOLID:
         _pixman_log_error (FUNC, "Solid image not handled by noop");
-    else         
-	_pixman_log_error (FUNC, "Pixman bug: unknown image type\n");
+        break;
 
-    return TRUE;
+    default:
+	_pixman_log_error (FUNC, "Pixman bug: unknown image type\n");
+        break;
+    }
 }
 
-static pixman_bool_t
-general_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+static const pixman_iter_info_t general_iters[] =
 {
-    if (iter->image->type == BITS)
-    {
-	_pixman_bits_image_dest_iter_init (iter->image, iter);
-
-	return TRUE;
-    }
-    else
-    {
-	_pixman_log_error (FUNC, "Trying to write to a non-writable image");
-
-	return FALSE;
-    }
-}
+    { PIXMAN_any, 0, 0, general_iter_init, NULL, NULL },
+    { PIXMAN_null },
+};
 
 typedef struct op_info_t op_info_t;
 struct op_info_t
@@ -116,7 +120,7 @@ general_composite_rect  (pixman_implementation_t *imp,
     pixman_iter_t src_iter, mask_iter, dest_iter;
     pixman_combine_32_func_t compose;
     pixman_bool_t component_alpha;
-    iter_flags_t narrow, src_iter_flags;
+    iter_flags_t width_flag, src_iter_flags;
     int Bpp;
     int i;
 
@@ -124,12 +128,12 @@ general_composite_rect  (pixman_implementation_t *imp,
 	(!mask_image || mask_image->common.flags & FAST_PATH_NARROW_FORMAT) &&
 	(dest_image->common.flags & FAST_PATH_NARROW_FORMAT))
     {
-	narrow = ITER_NARROW;
+	width_flag = ITER_NARROW;
 	Bpp = 4;
     }
     else
     {
-	narrow = 0;
+	width_flag = ITER_WIDE;
 	Bpp = 16;
     }
 
@@ -145,7 +149,7 @@ general_composite_rect  (pixman_implementation_t *imp,
     mask_buffer = src_buffer + width * Bpp;
     dest_buffer = mask_buffer + width * Bpp;
 
-    if (!narrow)
+    if (width_flag == ITER_WIDE)
     {
 	/* To make sure there aren't any NANs in the buffers */
 	memset (src_buffer, 0, width * Bpp);
@@ -154,11 +158,12 @@ general_composite_rect  (pixman_implementation_t *imp,
     }
     
     /* src iter */
-    src_iter_flags = narrow | op_flags[op].src;
+    src_iter_flags = width_flag | op_flags[op].src | ITER_SRC;
 
-    _pixman_implementation_src_iter_init (imp->toplevel, &src_iter, src_image,
-					  src_x, src_y, width, height,
-					  src_buffer, src_iter_flags, info->src_flags);
+    _pixman_implementation_iter_init (imp->toplevel, &src_iter, src_image,
+                                      src_x, src_y, width, height,
+                                      src_buffer, src_iter_flags,
+                                      info->src_flags);
 
     /* mask iter */
     if ((src_iter_flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) ==
@@ -176,17 +181,19 @@ general_composite_rect  (pixman_implementation_t *imp,
         mask_image->common.component_alpha    &&
         PIXMAN_FORMAT_RGB (mask_image->bits.format);
 
-    _pixman_implementation_src_iter_init (
-	imp->toplevel, &mask_iter, mask_image, mask_x, mask_y, width, height,
-	mask_buffer, narrow | (component_alpha? 0 : ITER_IGNORE_RGB), info->mask_flags);
+    _pixman_implementation_iter_init (
+	imp->toplevel, &mask_iter,
+	mask_image, mask_x, mask_y, width, height, mask_buffer,
+	ITER_SRC | width_flag | (component_alpha? 0 : ITER_IGNORE_RGB),
+	info->mask_flags);
 
     /* dest iter */
-    _pixman_implementation_dest_iter_init (
+    _pixman_implementation_iter_init (
 	imp->toplevel, &dest_iter, dest_image, dest_x, dest_y, width, height,
-	dest_buffer, narrow | op_flags[op].dst, info->dest_flags);
+	dest_buffer, ITER_DEST | width_flag | op_flags[op].dst, info->dest_flags);
 
     compose = _pixman_implementation_lookup_combiner (
-	imp->toplevel, op, component_alpha, narrow);
+	imp->toplevel, op, component_alpha, width_flag != ITER_WIDE);
 
     for (i = 0; i < height; ++i)
     {
@@ -219,8 +226,7 @@ _pixman_implementation_create_general (void)
     _pixman_setup_combiner_functions_32 (imp);
     _pixman_setup_combiner_functions_float (imp);
 
-    imp->src_iter_init = general_src_iter_init;
-    imp->dest_iter_init = general_dest_iter_init;
+    imp->iter_info = general_iters;
 
     return imp;
 }
diff --git a/pixman/pixman/pixman-image.c b/pixman/pixman/pixman-image.c
index 65041b43b..4f9c2f966 100644
--- a/pixman/pixman/pixman-image.c
+++ b/pixman/pixman/pixman-image.c
@@ -502,8 +502,10 @@ compute_image_info (pixman_image_t *image)
 	break;
     }
 
-    /* Alpha map */
-    if (!image->common.alpha_map)
+    /* Alpha maps are only supported for BITS images, so it's always
+     * safe to ignore their presence for non-BITS images
+     */
+    if (!image->common.alpha_map || image->type != BITS)
     {
 	flags |= FAST_PATH_NO_ALPHA_MAP;
     }
@@ -918,10 +920,10 @@ _pixman_image_get_solid (pixman_implementation_t *imp,
 	pixman_iter_t iter;
 
     otherwise:
-	_pixman_implementation_src_iter_init (
+	_pixman_implementation_iter_init (
 	    imp, &iter, image, 0, 0, 1, 1,
 	    (uint8_t *)&result,
-	    ITER_NARROW, image->common.flags);
+	    ITER_NARROW | ITER_SRC, image->common.flags);
 	
 	result = *iter.get_scanline (&iter, NULL);
     }
diff --git a/pixman/pixman/pixman-implementation.c b/pixman/pixman/pixman-implementation.c
index cfb82bb1f..160847ad0 100644
--- a/pixman/pixman/pixman-implementation.c
+++ b/pixman/pixman/pixman-implementation.c
@@ -285,18 +285,26 @@ _pixman_implementation_fill (pixman_implementation_t *imp,
     return FALSE;
 }
 
-pixman_bool_t
-_pixman_implementation_src_iter_init (pixman_implementation_t	*imp,
-				      pixman_iter_t             *iter,
-				      pixman_image_t		*image,
-				      int			 x,
-				      int			 y,
-				      int			 width,
-				      int			 height,
-				      uint8_t			*buffer,
-				      iter_flags_t		 iter_flags,
-				      uint32_t                   image_flags)
+static uint32_t *
+get_scanline_null (pixman_iter_t *iter, const uint32_t *mask)
 {
+    return NULL;
+}
+
+void
+_pixman_implementation_iter_init (pixman_implementation_t *imp,
+                                  pixman_iter_t           *iter,
+                                  pixman_image_t          *image,
+                                  int                      x,
+                                  int                      y,
+                                  int                      width,
+                                  int                      height,
+                                  uint8_t                 *buffer,
+                                  iter_flags_t             iter_flags,
+                                  uint32_t                 image_flags)
+{
+    pixman_format_code_t format;
+
     iter->image = image;
     iter->buffer = (uint32_t *)buffer;
     iter->x = x;
@@ -306,47 +314,38 @@ _pixman_implementation_src_iter_init (pixman_implementation_t	*imp,
     iter->iter_flags = iter_flags;
     iter->image_flags = image_flags;
 
-    while (imp)
+    if (!iter->image)
     {
-	if (imp->src_iter_init && (*imp->src_iter_init) (imp, iter))
-	    return TRUE;
-
-	imp = imp->fallback;
+	iter->get_scanline = get_scanline_null;
+	return;
     }
 
-    return FALSE;
-}
-
-pixman_bool_t
-_pixman_implementation_dest_iter_init (pixman_implementation_t	*imp,
-				       pixman_iter_t            *iter,
-				       pixman_image_t		*image,
-				       int			 x,
-				       int			 y,
-				       int			 width,
-				       int			 height,
-				       uint8_t			*buffer,
-				       iter_flags_t		 iter_flags,
-				       uint32_t                  image_flags)
-{
-    iter->image = image;
-    iter->buffer = (uint32_t *)buffer;
-    iter->x = x;
-    iter->y = y;
-    iter->width = width;
-    iter->height = height;
-    iter->iter_flags = iter_flags;
-    iter->image_flags = image_flags;
+    format = iter->image->common.extended_format_code;
 
     while (imp)
     {
-	if (imp->dest_iter_init && (*imp->dest_iter_init) (imp, iter))
-	    return TRUE;
-
-	imp = imp->fallback;
+        if (imp->iter_info)
+        {
+            const pixman_iter_info_t *info;
+
+            for (info = imp->iter_info; info->format != PIXMAN_null; ++info)
+            {
+                if ((info->format == PIXMAN_any || info->format == format) &&
+                    (info->image_flags & image_flags) == info->image_flags &&
+                    (info->iter_flags & iter_flags) == info->iter_flags)
+                {
+                    iter->get_scanline = info->get_scanline;
+                    iter->write_back = info->write_back;
+
+                    if (info->initializer)
+                        info->initializer (iter, info);
+                    return;
+                }
+            }
+        }
+
+        imp = imp->fallback;
     }
-
-    return FALSE;
 }
 
 pixman_bool_t
diff --git a/pixman/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman/pixman-mips-dspr2-asm.S
index 3adbb2afe..866e93e58 100644
--- a/pixman/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman/pixman-mips-dspr2-asm.S
@@ -699,6 +699,127 @@ LEAF_MIPS_DSPR2(pixman_composite_src_0888_0565_rev_asm_mips)
 END(pixman_composite_src_0888_0565_rev_asm_mips)
 #endif
 
+LEAF_MIPS_DSPR2(pixman_composite_src_pixbuf_8888_asm_mips)
+/*
+ * a0 - dst  (a8b8g8r8)
+ * a1 - src  (a8r8g8b8)
+ * a2 - w
+ */
+
+    SAVE_REGS_ON_STACK 0, v0
+    li       v0, 0x00ff00ff
+
+    beqz     a2, 3f
+     nop
+    addiu    t1, a2, -1
+    beqz     t1, 2f
+     nop
+1:
+    lw       t0, 0(a1)
+    lw       t1, 4(a1)
+    addiu    a1, a1, 8
+    addiu    a2, a2, -2
+    srl      t2, t0, 24
+    srl      t3, t1, 24
+
+    MIPS_2xUN8x4_MUL_2xUN8 t0, t1, t2, t3, t0, t1, v0, t4, t5, t6, t7, t8, t9
+
+    sll      t0, t0, 8
+    sll      t1, t1, 8
+    andi     t2, t2, 0xff
+    andi     t3, t3, 0xff
+    or       t0, t0, t2
+    or       t1, t1, t3
+    wsbh     t0, t0
+    wsbh     t1, t1
+    rotr     t0, t0, 16
+    rotr     t1, t1, 16
+    sw       t0, 0(a0)
+    sw       t1, 4(a0)
+
+    addiu    t2, a2, -1
+    bgtz     t2, 1b
+     addiu   a0, a0, 8
+2:
+    beqz     a2, 3f
+     nop
+    lw       t0, 0(a1)
+    srl      t1, t0, 24
+
+    MIPS_UN8x4_MUL_UN8 t0, t1, t0, v0, t3, t4, t5
+
+    sll      t0, t0, 8
+    andi     t1, t1, 0xff
+    or       t0, t0, t1
+    wsbh     t0, t0
+    rotr     t0, t0, 16
+    sw       t0, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 0, v0
+    j        ra
+     nop
+
+END(pixman_composite_src_pixbuf_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_src_rpixbuf_8888_asm_mips)
+/*
+ * a0 - dst  (a8r8g8b8)
+ * a1 - src  (a8r8g8b8)
+ * a2 - w
+ */
+
+    SAVE_REGS_ON_STACK 0, v0
+    li       v0, 0x00ff00ff
+
+    beqz     a2, 3f
+     nop
+    addiu    t1, a2, -1
+    beqz     t1, 2f
+     nop
+1:
+    lw       t0, 0(a1)
+    lw       t1, 4(a1)
+    addiu    a1, a1, 8
+    addiu    a2, a2, -2
+    srl      t2, t0, 24
+    srl      t3, t1, 24
+
+    MIPS_2xUN8x4_MUL_2xUN8 t0, t1, t2, t3, t0, t1, v0, t4, t5, t6, t7, t8, t9
+
+    sll      t0, t0, 8
+    sll      t1, t1, 8
+    andi     t2, t2, 0xff
+    andi     t3, t3, 0xff
+    or       t0, t0, t2
+    or       t1, t1, t3
+    rotr     t0, t0, 8
+    rotr     t1, t1, 8
+    sw       t0, 0(a0)
+    sw       t1, 4(a0)
+
+    addiu    t2, a2, -1
+    bgtz     t2, 1b
+     addiu   a0, a0, 8
+2:
+    beqz     a2, 3f
+     nop
+    lw       t0, 0(a1)
+    srl      t1, t0, 24
+
+    MIPS_UN8x4_MUL_UN8 t0, t1, t0, v0, t3, t4, t5
+
+    sll      t0, t0, 8
+    andi     t1, t1, 0xff
+    or       t0, t0, t1
+    rotr     t0, t0, 8
+    sw       t0, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 0, v0
+    j        ra
+     nop
+
+END(pixman_composite_src_rpixbuf_8888_asm_mips)
+
 LEAF_MIPS_DSPR2(pixman_composite_src_n_8_8888_asm_mips)
 /*
  * a0 - dst  (a8r8g8b8)
@@ -840,34 +961,35 @@ LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_8888_ca_asm_mips)
  * a3 - w
  */
 
-    SAVE_REGS_ON_STACK 8, s0, s1, s2, s3, s4, s5
-    beqz         a3, 4f
+    beqz         a3, 8f
      nop
+    SAVE_REGS_ON_STACK 8, s0, s1, s2, s3, s4, s5
+
     li           t6, 0xff
     addiu        t7, zero, -1 /* t7 = 0xffffffff */
     srl          t8, a1, 24   /* t8 = srca */
     li           t9, 0x00ff00ff
+
     addiu        t1, a3, -1
-    beqz         t1, 3f       /* last pixel */
+    beqz         t1, 4f       /* last pixel */
      nop
-    beq          t8, t6, 2f   /* if (srca == 0xff) */
-     nop
-1:
-                              /* a1 = src */
+
+0:
     lw           t0, 0(a2)    /* t0 = mask */
     lw           t1, 4(a2)    /* t1 = mask */
+    addiu        a3, a3, -2   /* w = w - 2 */
     or           t2, t0, t1
-    beqz         t2, 12f      /* if (t0 == 0) && (t1 == 0) */
+    beqz         t2, 3f      /* if (t0 == 0) && (t1 == 0) */
      addiu       a2, a2, 8
-    and          t3, t0, t1
-    move         t4, a1       /* t4 = src */
-    move         t5, a1       /* t5 = src */
+    and          t2, t0, t1
+    beq          t2, t7, 1f  /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
+     nop
+
+//if(ma)
     lw           t2, 0(a0)    /* t2 = dst */
-    beq          t3, t7, 11f  /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
-     lw          t3, 4(a0)    /* t3 = dst */
+    lw           t3, 4(a0)    /* t3 = dst */
     MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, t4, t5, t9, s0, s1, s2, s3, s4, s5
     MIPS_2xUN8x4_MUL_2xUN8   t0, t1, t8, t8, t0, t1, t9, s0, s1, s2, s3, s4, s5
-11:
     not          t0, t0
     not          t1, t1
     MIPS_2xUN8x4_MUL_2xUN8x4 t2, t3, t0, t1, t2, t3, t9, s0, s1, s2, s3, s4, s5
@@ -875,62 +997,79 @@ LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_8888_ca_asm_mips)
     addu_s.qb    t3, t5, t3
     sw           t2, 0(a0)
     sw           t3, 4(a0)
-12:
-    addiu        a3, a3, -2
     addiu        t1, a3, -1
-    bgtz         t1, 1b
+    bgtz         t1, 0b
      addiu       a0, a0, 8
-    b            3f
+    b            4f
+     nop
+1:
+//if (t0 == 0xffffffff) && (t1 == 0xffffffff):
+    beq          t8, t6, 2f   /* if (srca == 0xff) */
      nop
-2:
-                              /* a1 = src */
-    lw           t0, 0(a2)    /* t0 = mask */
-    lw           t1, 4(a2)    /* t1 = mask */
-    or           t2, t0, t1
-    beqz         t2, 22f      /* if (t0 == 0) & (t1 == 0) */
-     addiu       a2, a2, 8
-    and          t2, t0, t1
-    move         t4, a1
-    beq          t2, t7, 21f  /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
-     move        t5, a1
     lw           t2, 0(a0)    /* t2 = dst */
     lw           t3, 4(a0)    /* t3 = dst */
-    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, t4, t5, t9, s0, s1, s2, s3, s4, s5
-    not          t0, t0
-    not          t1, t1
-    MIPS_2xUN8x4_MUL_2xUN8x4 t2, t3, t0, t1, t2, t3, t9, s0, s1, s2, s3, s4, s5
-    addu_s.qb    t4, t4, t2
-    addu_s.qb    t5, t5, t3
-21:
-    sw           t4, 0(a0)
-    sw           t5, 4(a0)
-22:
-    addiu        a3, a3, -2
+    not          t0, a1
+    not          t1, a1
+    srl          t0, t0, 24
+    srl          t1, t1, 24
+    MIPS_2xUN8x4_MUL_2xUN8 t2, t3, t0, t1, t2, t3, t9, s0, s1, s2, s3, s4, s5
+    addu_s.qb    t2, a1, t2
+    addu_s.qb    t3, a1, t3
+    sw           t2, 0(a0)
+    sw           t3, 4(a0)
     addiu        t1, a3, -1
-    bgtz         t1, 2b
+    bgtz         t1, 0b
      addiu       a0, a0, 8
+    b            4f
+     nop
+2:
+    sw           a1, 0(a0)
+    sw           a1, 4(a0)
 3:
-    blez         a3, 4f
+    addiu        t1, a3, -1
+    bgtz         t1, 0b
+     addiu       a0, a0, 8
+
+4:
+    beqz         a3, 7f
      nop
                               /* a1 = src */
-    lw           t1, 0(a2)    /* t1 = mask */
-    beqz         t1, 4f
+    lw           t0, 0(a2)    /* t0 = mask */
+    beqz         t0, 7f       /* if (t0 == 0) */
      nop
-    move         t2, a1       /* t2 = src */
-    beq          t1, t7, 31f
-     lw          t0, 0(a0)    /* t0 = dst */
-
-    MIPS_UN8x4_MUL_UN8x4  a1, t1, t2, t9, t3, t4, t5, t6
-    MIPS_UN8x4_MUL_UN8    t1, t8, t1, t9, t3, t4, t5
-31:
-    not          t1, t1
-    MIPS_UN8x4_MUL_UN8x4  t0, t1, t0, t9, t3, t4, t5, t6
-    addu_s.qb    t0, t2, t0
-    sw           t0, 0(a0)
-4:
+    beq          t0, t7, 5f  /* if (t0 == 0xffffffff) */
+     nop
+//if(ma)
+    lw           t1, 0(a0)    /* t1 = dst */
+    MIPS_UN8x4_MUL_UN8x4  a1, t0, t2, t9, t3, t4, t5, s0
+    MIPS_UN8x4_MUL_UN8    t0, t8, t0, t9, t3, t4, t5
+    not          t0, t0
+    MIPS_UN8x4_MUL_UN8x4  t1, t0, t1, t9, t3, t4, t5, s0
+    addu_s.qb    t1, t2, t1
+    sw           t1, 0(a0)
     RESTORE_REGS_FROM_STACK 8, s0, s1, s2, s3, s4, s5
     j            ra
      nop
+5:
+//if (t0 == 0xffffffff)
+    beq          t8, t6, 6f   /* if (srca == 0xff) */
+     nop
+    lw           t1, 0(a0)    /* t1 = dst */
+    not          t0, a1
+    srl          t0, t0, 24
+    MIPS_UN8x4_MUL_UN8 t1, t0, t1, t9, t2, t3, t4
+    addu_s.qb    t1, a1, t1
+    sw           t1, 0(a0)
+    RESTORE_REGS_FROM_STACK 8, s0, s1, s2, s3, s4, s5
+    j            ra
+     nop
+6:
+    sw           a1, 0(a0)
+7:
+    RESTORE_REGS_FROM_STACK 8, s0, s1, s2, s3, s4, s5
+8:
+    j            ra
+     nop
 
 END(pixman_composite_over_n_8888_8888_ca_asm_mips)
 
@@ -942,106 +1081,126 @@ LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_0565_ca_asm_mips)
  * a3 - w
  */
 
-    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8
-    beqz         a3, 4f
+    beqz         a3, 8f
      nop
-    li           t5, 0xf800f800
-    li           t6, 0x07e007e0
-    li           t7, 0x001F001F
-    li           t9, 0x00ff00ff
+    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8
 
+    li           t6, 0xff
+    addiu        t7, zero, -1 /* t7 = 0xffffffff */
     srl          t8, a1, 24   /* t8 = srca */
+    li           t9, 0x00ff00ff
+    li           s6, 0xf800f800
+    li           s7, 0x07e007e0
+    li           s8, 0x001F001F
+
     addiu        t1, a3, -1
-    beqz         t1, 3f       /* last pixel */
+    beqz         t1, 4f       /* last pixel */
      nop
-    li           s0, 0xff     /* s0 = 0xff */
-    addiu        s1, zero, -1 /* s1 = 0xffffffff */
 
-    beq          t8, s0, 2f   /* if (srca == 0xff) */
-     nop
-1:
-                              /* a1 = src */
+0:
     lw           t0, 0(a2)    /* t0 = mask */
     lw           t1, 4(a2)    /* t1 = mask */
+    addiu        a3, a3, -2   /* w = w - 2 */
     or           t2, t0, t1
-    beqz         t2, 12f      /* if (t0 == 0) && (t1 == 0) */
+    beqz         t2, 3f      /* if (t0 == 0) && (t1 == 0) */
      addiu       a2, a2, 8
-    and          t3, t0, t1
-    move         s2, a1       /* s2 = src */
-    move         s3, a1       /* s3 = src */
+    and          t2, t0, t1
+    beq          t2, t7, 1f  /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
+     nop
+
+//if(ma)
     lhu          t2, 0(a0)    /* t2 = dst */
-    beq          t3, s1, 11f  /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
-     lhu         t3, 2(a0)    /* t3 = dst */
-    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, s2, s3, t9, t4, s4, s5, s6, s7, s8
-    MIPS_2xUN8x4_MUL_2xUN8   t0, t1, t8, t8, t0, t1, t9, t4, s4, s5, s6, s7, s8
-11:
+    lhu          t3, 2(a0)    /* t3 = dst */
+    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, t4, t5, t9, s0, s1, s2, s3, s4, s5
+    MIPS_2xUN8x4_MUL_2xUN8   t0, t1, t8, t8, t0, t1, t9, s0, s1, s2, s3, s4, s5
     not          t0, t0
     not          t1, t1
-    CONVERT_2x0565_TO_2x8888 t2, t3, s4, s5, t6, t7, t4, s6, s7, s8
-    MIPS_2xUN8x4_MUL_2xUN8x4 s4, s5, t0, t1, s4, s5, t9, t4, s6, s7, s8, t0, t1
-    addu_s.qb    s2, s2, s4
-    addu_s.qb    s3, s3, s5
-    CONVERT_2x8888_TO_2x0565 s2, s3, t2, t3, t5, t6, t7, s4, s5
+    CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, s7, s8, s0, s1, s2, s3
+    MIPS_2xUN8x4_MUL_2xUN8x4 t2, t3, t0, t1, t2, t3, t9, s0, s1, s2, s3, s4, s5
+    addu_s.qb    t2, t4, t2
+    addu_s.qb    t3, t5, t3
+    CONVERT_2x8888_TO_2x0565 t2, t3, t2, t3, s6, s7, s8, s0, s1
     sh           t2, 0(a0)
     sh           t3, 2(a0)
-12:
-    addiu        a3, a3, -2
     addiu        t1, a3, -1
-    bgtz         t1, 1b
+    bgtz         t1, 0b
      addiu       a0, a0, 4
-    b            3f
+    b            4f
+     nop
+1:
+//if (t0 == 0xffffffff) && (t1 == 0xffffffff):
+    beq          t8, t6, 2f   /* if (srca == 0xff) */
      nop
-2:
-                              /* a1 = src */
-    lw           t0, 0(a2)    /* t0 = mask */
-    lw           t1, 4(a2)    /* t1 = mask */
-    or           t2, t0, t1
-    beqz         t2, 22f      /* if (t0 == 0) & (t1 == 0) */
-     addiu       a2, a2, 8
-    and          t3, t0, t1
-    move         t2, a1
-    beq          t3, s1, 21f  /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
-     move        t3, a1
     lhu          t2, 0(a0)    /* t2 = dst */
     lhu          t3, 2(a0)    /* t3 = dst */
-    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, s2, s3, t9, t4, s4, s5, s6, s7, s8
-    not          t0, t0
-    not          t1, t1
-    CONVERT_2x0565_TO_2x8888 t2, t3, s4, s5, t6, t7, t4, s6, s7, s8
-    MIPS_2xUN8x4_MUL_2xUN8x4 s4, s5, t0, t1, s4, s5, t9, t4, s6, s7, s8, t2, t3
-    addu_s.qb    t2, s2, s4
-    addu_s.qb    t3, s3, s5
-21:
-    CONVERT_2x8888_TO_2x0565 t2, t3, t0, t1, t5, t6, t7, s2, s3
-    sh           t0, 0(a0)
-    sh           t1, 2(a0)
-22:
-    addiu        a3, a3, -2
+    not          t0, a1
+    not          t1, a1
+    srl          t0, t0, 24
+    srl          t1, t1, 24
+    CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, s7, s8, s0, s1, s2, s3
+    MIPS_2xUN8x4_MUL_2xUN8   t2, t3, t0, t1, t2, t3, t9, s0, s1, s2, s3, s4, s5
+    addu_s.qb    t2, a1, t2
+    addu_s.qb    t3, a1, t3
+    CONVERT_2x8888_TO_2x0565 t2, t3, t2, t3, s6, s7, s8, s0, s1
+    sh           t2, 0(a0)
+    sh           t3, 2(a0)
     addiu        t1, a3, -1
-    bgtz         t1, 2b
+    bgtz         t1, 0b
      addiu       a0, a0, 4
+    b            4f
+     nop
+2:
+    CONVERT_1x8888_TO_1x0565 a1, t2, s0, s1
+    sh           t2, 0(a0)
+    sh           t2, 2(a0)
 3:
-    blez         a3, 4f
+    addiu        t1, a3, -1
+    bgtz         t1, 0b
+     addiu       a0, a0, 4
+
+4:
+    beqz         a3, 7f
      nop
                               /* a1 = src */
-    lw           t1, 0(a2)    /* t1 = mask */
-    beqz         t1, 4f
+    lw           t0, 0(a2)    /* t0 = mask */
+    beqz         t0, 7f       /* if (t0 == 0) */
      nop
-    move         t2, a1       /* t2 = src */
-    beq          t1, t7, 31f
-     lhu         t0, 0(a0)    /* t0 = dst */
-
-    MIPS_UN8x4_MUL_UN8x4     a1, t1, t2, t9, t3, t4, t5, t6
-    MIPS_UN8x4_MUL_UN8       t1, t8, t1, t9, t3, t4, t5
-31:
-    not          t1, t1
-    CONVERT_1x0565_TO_1x8888 t0, s1, s2, s3
-    MIPS_UN8x4_MUL_UN8x4     s1, t1, t3, t9, t4, t5, t6, t7
-    addu_s.qb    t0, t2, t3
-    CONVERT_1x8888_TO_1x0565 t0, s1, s2, s3
-    sh           s1, 0(a0)
-4:
-    RESTORE_REGS_FROM_STACK  20, s0, s1, s2, s3, s4, s5, s6, s7, s8
+    beq          t0, t7, 5f  /* if (t0 == 0xffffffff) */
+     nop
+//if(ma)
+    lhu          t1, 0(a0)    /* t1 = dst */
+    MIPS_UN8x4_MUL_UN8x4     a1, t0, t2, t9, t3, t4, t5, s0
+    MIPS_UN8x4_MUL_UN8       t0, t8, t0, t9, t3, t4, t5
+    not          t0, t0
+    CONVERT_1x0565_TO_1x8888 t1, s1, s2, s3
+    MIPS_UN8x4_MUL_UN8x4     s1, t0, s1, t9, t3, t4, t5, s0
+    addu_s.qb    s1, t2, s1
+    CONVERT_1x8888_TO_1x0565 s1, t1, s0, s2
+    sh           t1, 0(a0)
+    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8
+    j            ra
+     nop
+5:
+//if (t0 == 0xffffffff)
+    beq          t8, t6, 6f   /* if (srca == 0xff) */
+     nop
+    lhu          t1, 0(a0)    /* t1 = dst */
+    not          t0, a1
+    srl          t0, t0, 24
+    CONVERT_1x0565_TO_1x8888 t1, s1, s2, s3
+    MIPS_UN8x4_MUL_UN8       s1, t0, s1, t9, t2, t3, t4
+    addu_s.qb    s1, a1, s1
+    CONVERT_1x8888_TO_1x0565 s1, t1, s0, s2
+    sh           t1, 0(a0)
+    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8
+    j            ra
+     nop
+6:
+    CONVERT_1x8888_TO_1x0565 a1, t1, s0, s2
+    sh           t1, 0(a0)
+7:
+    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8
+8:
     j            ra
      nop
 
@@ -2936,101 +3095,265 @@ END(pixman_composite_over_reverse_n_8888_asm_mips)
 LEAF_MIPS_DSPR2(pixman_composite_in_n_8_asm_mips)
 /*
  * a0 - dst  (a8)
- * a1 - src  (a8r8g8b8)
+ * a1 - src  (32bit constant; only bits 31..24 are used)
  * a2 - w
  */
 
-    beqz              a2, 5f
+    li                t9, 0x00ff00ff /* t9 = low-byte mask for each halfword */
+    beqz              a2, 3f
      nop
-
-    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
-    move              t7, a1
-    srl               t5, t7, 24
-    replv.ph          t5, t5
-    srl               t9, a2, 2   /* t1 = how many multiples of 4 src pixels */
-    beqz              t9, 2f      /* branch if less than 4 src pixels */
+    srl               t7, a2, 2   /* t7 = how many multiples of 4 dst pixels */
+    beqz              t7, 1f      /* branch if less than 4 dst pixels */
      nop
 
-1:
-    addiu             t9, t9, -1
-    addiu             a2, a2, -4
+    srl               t8, a1, 24  /* t8 = top byte of src */
+    replv.ph          t8, t8      /* replicate it into both halfwords of t8 */
+
+0:
+    beqz              t7, 1f      /* no groups of 4 left: go to the tail */
+     addiu            t7, t7, -1
     lbu               t0, 0(a0)
     lbu               t1, 1(a0)
     lbu               t2, 2(a0)
     lbu               t3, 3(a0)
 
-    muleu_s.ph.qbl    s0, t0, t5
-    muleu_s.ph.qbr    s1, t0, t5
-    muleu_s.ph.qbl    s2, t1, t5
-    muleu_s.ph.qbr    s3, t1, t5
-    muleu_s.ph.qbl    s4, t2, t5
-    muleu_s.ph.qbr    s5, t2, t5
-    muleu_s.ph.qbl    s6, t3, t5
-    muleu_s.ph.qbr    s7, t3, t5
-
-    shrl.ph           t4, s0, 8
-    shrl.ph           t6, s1, 8
-    shrl.ph           t7, s2, 8
-    shrl.ph           t8, s3, 8
-    addq.ph           t0, s0, t4
-    addq.ph           t1, s1, t6
-    addq.ph           t2, s2, t7
-    addq.ph           t3, s3, t8
-    shra_r.ph         t0, t0, 8
-    shra_r.ph         t1, t1, 8
+    precr_sra.ph.w    t1, t0, 0
+    precr_sra.ph.w    t3, t2, 0
+    precr.qb.ph       t0, t3, t1  /* t0 = the four dst bytes packed in one word */
+
+    muleu_s.ph.qbl    t2, t0, t8  /* upper two bytes of t0 times t8 */
+    muleu_s.ph.qbr    t3, t0, t8  /* lower two bytes of t0 times t8 */
+    shra_r.ph         t4, t2, 8
+    shra_r.ph         t5, t3, 8
+    and               t4, t4, t9
+    and               t5, t5, t9
+    addq.ph           t2, t2, t4
+    addq.ph           t3, t3, t5
     shra_r.ph         t2, t2, 8
     shra_r.ph         t3, t3, 8
-    shrl.ph           t4, s4, 8
-    shrl.ph           t6, s5, 8
-    shrl.ph           t7, s6, 8
-    shrl.ph           t8, s7, 8
-    addq.ph           s0, s4, t4
-    addq.ph           s1, s5, t6
-    addq.ph           s2, s6, t7
-    addq.ph           s3, s7, t8
-    shra_r.ph         t4, s0, 8
-    shra_r.ph         t6, s1, 8
-    shra_r.ph         t7, s2, 8
-    shra_r.ph         t8, s3, 8
-
-    precr.qb.ph       s0, t0, t1
-    precr.qb.ph       s1, t2, t3
-    precr.qb.ph       s2, t4, t6
-    precr.qb.ph       s3, t7, t8
+    precr.qb.ph       t2, t2, t3  /* repack the four results into one word */
 
-    sb                s0, 0(a0)
-    sb                s1, 1(a0)
-    sb                s2, 2(a0)
-    sb                s3, 3(a0)
-    bgtz              t9, 1b
+    sb                t2, 0(a0)
+    srl               t2, t2, 8
+    sb                t2, 1(a0)
+    srl               t2, t2, 8
+    sb                t2, 2(a0)
+    srl               t2, t2, 8
+    sb                t2, 3(a0)
+    addiu             a2, a2, -4  /* four pixels consumed */
+    b                 0b
      addiu            a0, a0, 4
-2:
-    beqz              a2, 4f
+
+1:
+    beqz              a2, 3f      /* no leftover pixels */
      nop
-3:
-    lbu               t1, 0(a0)
+    srl               t8, a1, 24  /* t8 = top byte of src (not replicated) */
+2:
+    lbu               t0, 0(a0)
+
+    mul               t2, t0, t8
+    shra_r.ph         t3, t2, 8
+    andi              t3, t3, 0x00ff
+    addq.ph           t2, t2, t3
+    shra_r.ph         t2, t2, 8
 
-    muleu_s.ph.qbl    t4, t1, t5
-    muleu_s.ph.qbr    t7, t1, t5
-    shrl.ph           t6, t4, 8
-    shrl.ph           t0, t7, 8
-    addq.ph           t8, t4, t6
-    addq.ph           t9, t7, t0
-    shra_r.ph         t8, t8, 8
-    shra_r.ph         t9, t9, 8
-    precr.qb.ph       t2, t8, t9
     sb                t2, 0(a0)
     addiu             a2, a2, -1
-    bnez              a2, 3b
+    bnez              a2, 2b
      addiu            a0, a0, 1
-4:
-    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
-5:
+
+3:
     j                 ra
      nop
 
 END(pixman_composite_in_n_8_asm_mips)
 
+LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_8888_OVER_asm_mips)
+/*
+ * a0     - dst  (a8r8g8b8)
+ * a1     - src  (a8r8g8b8)
+ * a2     - w
+ * a3     - vx
+ * 16(sp) - unit_x
+ */
+
+    SAVE_REGS_ON_STACK 0, s0, s1, s2, s3
+    lw       t8, 16(sp) /* t8 = unit_x */
+    li       t6, 0x00ff00ff /* t6 = rounding mask (maskLSR) for the OVER macros */
+    beqz     a2, 3f
+     nop
+    addiu    t1, a2, -1
+    beqz     t1, 2f     /* w == 1: only the single-pixel tail */
+     nop
+1:
+    sra      t0, a3, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 2  /* t0 = t0 * 4 (a8r8g8b8) */
+    addu     t0, a1, t0
+    lw       t0, 0(t0)  /* t0 = source      (a8r8g8b8) */
+    addu     a3, a3, t8 /* a3 = vx + unit_x */
+
+    sra      t1, a3, 16 /* t1 = vx >> 16 */
+    sll      t1, t1, 2  /* t1 = t1 * 4 (a8r8g8b8) */
+    addu     t1, a1, t1
+    lw       t1, 0(t1)  /* t1 = source      (a8r8g8b8) */
+    addu     a3, a3, t8 /* a3 = vx + unit_x */
+
+    lw       t2, 0(a0)  /* t2 = destination (a8r8g8b8) */
+    lw       t3, 4(a0)  /* t3 = destination (a8r8g8b8) */
+
+    OVER_2x8888_2x8888 t0, t1, t2, t3, t4, t5, t6, t7, t9, s0, s1, s2, s3
+
+    sw       t4, 0(a0)
+    sw       t5, 4(a0)
+    addiu    a2, a2, -2 /* two pixels consumed */
+    addiu    t1, a2, -1
+    bgtz     t1, 1b     /* loop while at least two pixels remain */
+     addiu   a0, a0, 8
+2:
+    beqz     a2, 3f     /* no odd pixel left */
+     nop
+    sra      t0, a3, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 2  /* t0 = t0 * 4 (a8r8g8b8) */
+    addu     t0, a1, t0
+    lw       t0, 0(t0)  /* t0 = source      (a8r8g8b8) */
+    lw       t1, 0(a0)  /* t1 = destination (a8r8g8b8) */
+    addu     a3, a3, t8 /* a3 = vx + unit_x */
+
+    OVER_8888_8888 t0, t1, t2, t6, t4, t5, t3, t7
+
+    sw       t2, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 0, s0, s1, s2, s3
+    j        ra
+     nop
+
+END(pixman_scaled_nearest_scanline_8888_8888_OVER_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_0565_OVER_asm_mips)
+/*
+ * a0     - dst  (r5g6b5)
+ * a1     - src  (a8r8g8b8)
+ * a2     - w
+ * a3     - vx
+ * 16(sp) - unit_x
+ */
+
+    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, v0, v1
+    lw       t8, 40(sp) /* t8 = unit_x (16(sp) on entry + 24 stack offset) */
+    li       t4, 0x00ff00ff /* t4 = rounding mask (maskLSR) for the OVER macros */
+    li       t5, 0xf800f800 /* t5 = 565 red mask,   both halfwords */
+    li       t6, 0x07e007e0 /* t6 = 565 green mask, both halfwords */
+    li       t7, 0x001F001F /* t7 = 565 blue mask,  both halfwords */
+    beqz     a2, 3f
+     nop
+    addiu    t1, a2, -1
+    beqz     t1, 2f     /* w == 1: only the single-pixel tail */
+     nop
+1:
+    sra      t0, a3, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 2  /* t0 = t0 * 4 (a8r8g8b8) */
+    addu     t0, a1, t0
+    lw       t0, 0(t0)  /* t0 = source      (a8r8g8b8) */
+    addu     a3, a3, t8 /* a3 = vx + unit_x */
+    sra      t1, a3, 16 /* t1 = vx >> 16 */
+    sll      t1, t1, 2  /* t1 = t1 * 4 (a8r8g8b8) */
+    addu     t1, a1, t1
+    lw       t1, 0(t1)  /* t1 = source      (a8r8g8b8) */
+    addu     a3, a3, t8 /* a3 = vx + unit_x */
+    lhu      t2, 0(a0)  /* t2 = destination (r5g6b5) */
+    lhu      t3, 2(a0)  /* t3 = destination (r5g6b5) */
+
+    CONVERT_2x0565_TO_2x8888 t2, t3, v0, v1, t6, t7, s0, s1, s2, s3
+    OVER_2x8888_2x8888       t0, t1, v0, v1, t2, t3, t4, t9, s0, s1, s2, s3, s4
+    CONVERT_2x8888_TO_2x0565 t2, t3, v0, v1, t5, t6, t7, t9, s2
+
+    sh       v0, 0(a0)
+    sh       v1, 2(a0)
+    addiu    a2, a2, -2 /* two pixels consumed */
+    addiu    t1, a2, -1
+    bgtz     t1, 1b     /* loop while at least two pixels remain */
+     addiu   a0, a0, 4
+2:
+    beqz     a2, 3f     /* no odd pixel left */
+     nop
+    sra      t0, a3, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 2  /* t0 = t0 * 4 (a8r8g8b8) */
+    addu     t0, a1, t0
+    lw       t0, 0(t0)  /* t0 = source      (a8r8g8b8) */
+    lhu      t1, 0(a0)  /* t1 = destination (r5g6b5) */
+    addu     a3, a3, t8 /* a3 = vx + unit_x */
+
+    CONVERT_1x0565_TO_1x8888 t1, t2, t5, t6
+    OVER_8888_8888           t0, t2, t1, t4, t3, t5, t6, t7
+    CONVERT_1x8888_TO_1x0565 t1, t2, t5, t6
+
+    sh       t2, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, v0, v1
+    j        ra
+     nop
+
+END(pixman_scaled_nearest_scanline_8888_0565_OVER_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_0565_8888_SRC_asm_mips)
+/*
+ * a0     - dst (a8r8g8b8)
+ * a1     - src (r5g6b5)
+ * a2     - w
+ * a3     - vx
+ * 16(sp) - unit_x
+ */
+
+    SAVE_REGS_ON_STACK 0, v0
+    beqz     a2, 3f
+     nop
+
+    lw       v0, 16(sp) /* v0 = unit_x */
+    addiu    t1, a2, -1
+    beqz     t1, 2f     /* w == 1: only the single-pixel tail */
+     nop
+
+    li       t4, 0x07e007e0 /* t4 = 565 green mask, both halfwords */
+    li       t5, 0x001F001F /* t5 = 565 blue mask,  both halfwords */
+1:
+    sra      t0, a3, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 1  /* t0 = t0 * 2 (r5g6b5) */
+    addu     t0, a1, t0
+    lhu      t0, 0(t0)  /* t0 = source (r5g6b5) */
+    addu     a3, a3, v0 /* a3 = vx + unit_x */
+    sra      t1, a3, 16 /* t1 = vx >> 16 */
+    sll      t1, t1, 1  /* t1 = t1 * 2 (r5g6b5) */
+    addu     t1, a1, t1
+    lhu      t1, 0(t1)  /* t1 = source (r5g6b5) */
+    addu     a3, a3, v0 /* a3 = vx + unit_x */
+    addiu    a2, a2, -2 /* two pixels consumed */
+
+    CONVERT_2x0565_TO_2x8888 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9
+
+    sw       t2, 0(a0)
+    sw       t3, 4(a0)
+
+    addiu    t2, a2, -1
+    bgtz     t2, 1b     /* loop while at least two pixels remain */
+     addiu   a0, a0, 8
+2:
+    beqz     a2, 3f     /* no odd pixel left */
+     nop
+    sra      t0, a3, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 1  /* t0 = t0 * 2 (r5g6b5) */
+    addu     t0, a1, t0
+    lhu      t0, 0(t0)  /* t0 = source (r5g6b5) */
+
+    CONVERT_1x0565_TO_1x8888 t0, t1, t2, t3
+
+    sw       t1, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 0, v0
+    j        ra
+     nop
+
+END(pixman_scaled_nearest_scanline_0565_8888_SRC_asm_mips)
+
 LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_mips)
 /*
  * a0     - dst  (r5g6b5)
diff --git a/pixman/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman/pixman-mips-dspr2-asm.h
index b330c0f0d..cab122d80 100644
--- a/pixman/pixman/pixman-mips-dspr2-asm.h
+++ b/pixman/pixman/pixman-mips-dspr2-asm.h
@@ -354,17 +354,16 @@ LEAF_MIPS32R2(symbol)                                   \
                                 out1_565, out2_565,  \
                                 maskR, maskG, maskB, \
                                 scratch1, scratch2
-    precrq.ph.w       \scratch1, \in2_8888, \in1_8888
-    precr_sra.ph.w    \in2_8888, \in1_8888, 0
-    shll.ph           \scratch1, \scratch1, 8
-    srl               \in2_8888, \in2_8888, 3
-    and               \scratch2, \in2_8888, \maskB
-    and               \scratch1, \scratch1, \maskR
-    srl               \in2_8888, \in2_8888, 2
-    and               \out2_565, \in2_8888, \maskG
-    or                \out2_565, \out2_565, \scratch2
-    or                \out1_565, \out2_565, \scratch1
-    srl               \out2_565, \out1_565, 16
+    precr.qb.ph    \scratch1, \in2_8888, \in1_8888
+    precrq.qb.ph   \in2_8888, \in2_8888, \in1_8888
+    and            \out1_565, \scratch1, \maskR
+    shrl.ph        \scratch1, \scratch1, 3
+    shll.ph        \in2_8888, \in2_8888, 3
+    and            \scratch1, \scratch1, \maskB
+    or             \out1_565, \out1_565, \scratch1
+    and            \in2_8888, \in2_8888, \maskG
+    or             \out1_565, \out1_565, \in2_8888
+    srl            \out2_565, \out1_565, 16
 .endm
 
 /*
@@ -587,6 +586,36 @@ LEAF_MIPS32R2(symbol)                                   \
     addu_s.qb          \out_8888, \out_8888, \s_8888
 .endm
 
+/*
+ * OVER operation on two a8r8g8b8 source pixels (s1_8888 and s2_8888) and two
+ * a8r8g8b8 destination pixels (d1_8888 and d2_8888); results are left in
+ * out1_8888/out2_8888. maskLSR (rounding) must hold: li maskLSR, 0x00ff00ff
+ * NOTE(review): d1/d2 are reused as trailing (scratch?) MUL operands - verify.
+ */
+.macro OVER_2x8888_2x8888 s1_8888,   \
+                          s2_8888,   \
+                          d1_8888,   \
+                          d2_8888,   \
+                          out1_8888, \
+                          out2_8888, \
+                          maskLSR,   \
+                          scratch1, scratch2, scratch3, \
+                          scratch4, scratch5, scratch6
+    not                    \scratch1,  \s1_8888
+    srl                    \scratch1,  \scratch1,  24
+    not                    \scratch2,  \s2_8888
+    srl                    \scratch2,  \scratch2,  24
+    MIPS_2xUN8x4_MUL_2xUN8 \d1_8888,   \d2_8888, \
+                           \scratch1,  \scratch2,  \
+                           \out1_8888, \out2_8888, \
+                           \maskLSR, \
+                           \scratch3,  \scratch4, \scratch5, \
+                           \scratch6,  \d1_8888,  \d2_8888
+
+    addu_s.qb              \out1_8888, \out1_8888, \s1_8888
+    addu_s.qb              \out2_8888, \out2_8888, \s2_8888
+.endm
+
 .macro MIPS_UN8x4_MUL_UN8_ADD_UN8x4 s_8888,   \
                                     m_8,      \
                                     d_8888,   \
diff --git a/pixman/pixman/pixman-mips-dspr2.c b/pixman/pixman/pixman-mips-dspr2.c
index 1ea244576..e10c9df0a 100644
--- a/pixman/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman/pixman-mips-dspr2.c
@@ -54,6 +54,10 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_0888_8888_rev,
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_0888_0565_rev,
                                     uint8_t, 3, uint16_t, 1)
 #endif
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_pixbuf_8888,
+                                    uint32_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_rpixbuf_8888,
+                                    uint32_t, 1, uint32_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_8888,
                                     uint32_t, 1, uint32_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_0565,
@@ -121,6 +125,13 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_0565_8_0565, uint16_t, 1,
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8888_8888, uint32_t, 1,
                                          uint32_t, 1, uint32_t, 1)
 
+PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST (8888_8888, OVER,
+                                         uint32_t, uint32_t)
+PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST (8888_0565, OVER,
+                                         uint32_t, uint16_t)
+PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST (0565_8888, SRC,
+                                         uint16_t, uint32_t)
+
 PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (0, 8888_8888, SRC,
                                           uint32_t, uint32_t)
 PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (0, 8888_0565, SRC,
@@ -292,6 +303,10 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
     PIXMAN_STD_FAST_PATH (SRC, b8g8r8,   null, x8r8g8b8, mips_composite_src_0888_8888_rev),
     PIXMAN_STD_FAST_PATH (SRC, b8g8r8,   null, r5g6b5,   mips_composite_src_0888_0565_rev),
 #endif
+    PIXMAN_STD_FAST_PATH (SRC, pixbuf,   pixbuf,  a8r8g8b8, mips_composite_src_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC, pixbuf,   pixbuf,  a8b8g8r8, mips_composite_src_rpixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC, rpixbuf,  rpixbuf, a8r8g8b8, mips_composite_src_rpixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC, rpixbuf,  rpixbuf, a8b8g8r8, mips_composite_src_pixbuf_8888),
     PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   a8r8g8b8, mips_composite_src_n_8_8888),
     PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   x8r8g8b8, mips_composite_src_n_8_8888),
     PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   a8b8g8r8, mips_composite_src_n_8_8888),
@@ -357,6 +372,22 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, mips_composite_over_reverse_n_8888),
     PIXMAN_STD_FAST_PATH (IN,           solid, null, a8,       mips_composite_in_n_8),
 
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mips_8888_8888),
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mips_8888_8888),
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mips_8888_8888),
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mips_8888_8888),
+
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, mips_8888_0565),
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, b5g6r5, mips_8888_0565),
+
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, x8b8g8r8, mips_0565_8888),
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, x8r8g8b8, mips_0565_8888),
+    /* Note: NONE repeat is not supported yet */
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, r5g6b5, a8r8g8b8, mips_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, b5g6r5, a8b8g8r8, mips_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, r5g6b5, a8r8g8b8, mips_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, b5g6r5, a8b8g8r8, mips_0565_8888),
+
     PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, mips_8888_8_0565),
     PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, mips_8888_8_0565),
 
diff --git a/pixman/pixman/pixman-mips-dspr2.h b/pixman/pixman/pixman-mips-dspr2.h
index 4ac9ff95d..955ed70b8 100644
--- a/pixman/pixman/pixman-mips-dspr2.h
+++ b/pixman/pixman/pixman-mips-dspr2.h
@@ -246,6 +246,48 @@ mips_composite_##name (pixman_implementation_t *imp,                     \
     }                                                                    \
 }
 
+/****************************************************************************/
+/* Declare the asm scanline routine, wrap it (max_vx/zero_src unused), build COVER/NONE/PAD loops. */
+#define PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST(name, op,                    \
+                                                src_type, dst_type)          \
+void                                                                         \
+pixman_scaled_nearest_scanline_##name##_##op##_asm_mips (                    \
+                                                   dst_type *       dst,     \
+                                                   const src_type * src,     \
+                                                   int32_t          w,       \
+                                                   pixman_fixed_t   vx,      \
+                                                   pixman_fixed_t   unit_x); \
+                                                                             \
+static force_inline void                                                     \
+scaled_nearest_scanline_mips_##name##_##op (dst_type *       pd,             \
+                                            const src_type * ps,             \
+                                            int32_t          w,              \
+                                            pixman_fixed_t   vx,             \
+                                            pixman_fixed_t   unit_x,         \
+                                            pixman_fixed_t   max_vx,         \
+                                            pixman_bool_t    zero_src)       \
+{                                                                            \
+    pixman_scaled_nearest_scanline_##name##_##op##_asm_mips (pd, ps, w,      \
+                                                             vx, unit_x);    \
+}                                                                            \
+                                                                             \
+FAST_NEAREST_MAINLOOP (mips_##name##_cover_##op,                             \
+                       scaled_nearest_scanline_mips_##name##_##op,           \
+                       src_type, dst_type, COVER)                            \
+FAST_NEAREST_MAINLOOP (mips_##name##_none_##op,                              \
+                       scaled_nearest_scanline_mips_##name##_##op,           \
+                       src_type, dst_type, NONE)                             \
+FAST_NEAREST_MAINLOOP (mips_##name##_pad_##op,                               \
+                       scaled_nearest_scanline_mips_##name##_##op,           \
+                       src_type, dst_type, PAD)
+
+/* Expands to the COVER, NONE and PAD fast path table entries for func */
+#define PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH(op,s,d,func)                    \
+    SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),                            \
+    SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),                             \
+    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func)
+
+
 /*****************************************************************************/
 
 #define PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_A8_DST(flags, name, op,           \
diff --git a/pixman/pixman/pixman-mmx.c b/pixman/pixman/pixman-mmx.c
index 14790c029..c94d282a9 100644
--- a/pixman/pixman/pixman-mmx.c
+++ b/pixman/pixman/pixman-mmx.c
@@ -301,6 +301,29 @@ negate (__m64 mask)
     return _mm_xor_si64 (mask, MC (4x00ff));
 }
 
+/* Computes the product of two unsigned fixed-point 8-bit values from 0 to 1
+ * and maps its result to the same range.
+ *
+ * Jim Blinn gives multiple ways to compute this in "Jim Blinn's Corner:
+ * Notation, Notation, Notation", the first of which is
+ *
+ *   prod(a, b) = (a * b + 128) / 255.
+ *
+ * By approximating the division by 255 as 257/65536 it can be replaced by a
+ * multiply and a right shift. This is the implementation that we use in
+ * pix_multiply(), but we _mm_mulhi_pu16() by 257 (part of SSE1 or Extended
+ * 3DNow!, and unavailable at the time of the book's publication) to perform
+ * the multiplication and right shift in a single operation.
+ *
+ *   prod(a, b) = ((a * b + 128) * 257) >> 16.
+ *
+ * A third way (how pix_multiply() was implemented prior to 14208344) exists
+ * also that performs the multiplication by 257 with adds and shifts.
+ *
+ * Where temp = a * b + 128
+ *
+ *   prod(a, b) = (temp + (temp >> 8)) >> 8.
+ */
 static force_inline __m64
 pix_multiply (__m64 a, __m64 b)
 {
@@ -3899,52 +3922,23 @@ mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
     return iter->buffer;
 }
 
-typedef struct
-{
-    pixman_format_code_t	format;
-    pixman_iter_get_scanline_t	get_scanline;
-} fetcher_info_t;
-
-static const fetcher_info_t fetchers[] =
-{
-    { PIXMAN_x8r8g8b8,		mmx_fetch_x8r8g8b8 },
-    { PIXMAN_r5g6b5,		mmx_fetch_r5g6b5 },
-    { PIXMAN_a8,		mmx_fetch_a8 },
-    { PIXMAN_null }
-};
-
-static pixman_bool_t
-mmx_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
-{
-    pixman_image_t *image = iter->image;
-
-#define FLAGS								\
+#define IMAGE_FLAGS							\
     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
      FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
 
-    if ((iter->iter_flags & ITER_NARROW)			&&
-	(iter->image_flags & FLAGS) == FLAGS)
-    {
-	const fetcher_info_t *f;
-
-	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
-	{
-	    if (image->common.extended_format_code == f->format)
-	    {
-		uint8_t *b = (uint8_t *)image->bits.bits;
-		int s = image->bits.rowstride * 4;
-
-		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
-		iter->stride = s;
-
-		iter->get_scanline = f->get_scanline;
-		return TRUE;
-	    }
-	}
-    }
-
-    return FALSE;
-}
+static const pixman_iter_info_t mmx_iters[] = 
+{ /* per entry: format, image flags, iter flags, initializer, get_scanline, write_back */
+    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, mmx_fetch_x8r8g8b8, NULL
+    },
+    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, mmx_fetch_r5g6b5, NULL
+    },
+    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, mmx_fetch_a8, NULL
+    },
+    { PIXMAN_null }, /* presumably the end-of-table marker - confirm against the table walker */
+};
 
 static const pixman_fast_path_t mmx_fast_paths[] =
 {
@@ -4074,7 +4068,7 @@ _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
     imp->blt = mmx_blt;
     imp->fill = mmx_fill;
 
-    imp->src_iter_init = mmx_src_iter_init;
+    imp->iter_info = mmx_iters;
 
     return imp;
 }
diff --git a/pixman/pixman/pixman-noop.c b/pixman/pixman/pixman-noop.c
index e39996d9d..e59890492 100644
--- a/pixman/pixman/pixman-noop.c
+++ b/pixman/pixman/pixman-noop.c
@@ -37,12 +37,6 @@ noop_composite (pixman_implementation_t *imp,
     return;
 }
 
-static void
-dest_write_back_direct (pixman_iter_t *iter)
-{
-    iter->buffer += iter->image->bits.rowstride;
-}
-
 static uint32_t *
 noop_get_scanline (pixman_iter_t *iter, const uint32_t *mask)
 {
@@ -53,110 +47,102 @@ noop_get_scanline (pixman_iter_t *iter, const uint32_t *mask)
     return result;
 }
 
-static uint32_t *
-get_scanline_null (pixman_iter_t *iter, const uint32_t *mask)
-{
-    return NULL;
+static void
+noop_init_solid_narrow (pixman_iter_t *iter,
+			const pixman_iter_info_t *info)
+{ /* Fill iter->buffer with iter->width copies of one 32-bit color; info is unused. */
+    pixman_image_t *image = iter->image;
+    uint32_t *buffer = iter->buffer;
+    uint32_t *end = buffer + iter->width;
+    uint32_t color;
+
+    if (iter->image->type == SOLID)
+	color = image->solid.color_32;
+    else
+	color = image->bits.fetch_pixel_32 (&image->bits, 0, 0); /* non-SOLID image: use pixel (0, 0) */
+
+    while (buffer < end)
+	*(buffer++) = color;
 }
 
-static pixman_bool_t
-noop_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+static void
+noop_init_solid_wide (pixman_iter_t *iter,
+		      const pixman_iter_info_t *info) /* fills iter->buffer with one argb_t color; info is unused */
 {
     pixman_image_t *image = iter->image;
+    argb_t *buffer = (argb_t *)iter->buffer;
+    argb_t *end = buffer + iter->width;
+    argb_t color;
 
-#define FLAGS						\
-    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
-
-    if (!image)
-    {
-	iter->get_scanline = get_scanline_null;
-    }
-    else if ((iter->iter_flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) ==
-	     (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB))
-    {
-	iter->get_scanline = _pixman_iter_get_scanline_noop;
-    }
-    else if (image->common.extended_format_code == PIXMAN_solid		&&
-	     (iter->image->type == SOLID ||
-	      (iter->image_flags & FAST_PATH_NO_ALPHA_MAP)))
-    {
-	if (iter->iter_flags & ITER_NARROW)
-	{
-	    uint32_t *buffer = iter->buffer;
-	    uint32_t *end = buffer + iter->width;
-	    uint32_t color;
-
-	    if (image->type == SOLID)
-		color = image->solid.color_32;
-	    else
-		color = image->bits.fetch_pixel_32 (&image->bits, 0, 0);
-
-	    while (buffer < end)
-		*(buffer++) = color;
-	}
-	else
-	{
-	    argb_t *buffer = (argb_t *)iter->buffer;
-	    argb_t *end = buffer + iter->width;
-	    argb_t color;
-
-	    if (image->type == SOLID)
-		color = image->solid.color_float;
-	    else
-		color = image->bits.fetch_pixel_float (&image->bits, 0, 0);
-
-	    while (buffer < end)
-		*(buffer++) = color;
-	}
-
-	iter->get_scanline = _pixman_iter_get_scanline_noop;
-    }
-    else if (image->common.extended_format_code == PIXMAN_a8r8g8b8	&&
-	     (iter->iter_flags & ITER_NARROW)				&&
-	     (iter->image_flags & FLAGS) == FLAGS			&&
-	     iter->x >= 0 && iter->y >= 0				&&
-	     iter->x + iter->width <= image->bits.width			&&
-	     iter->y + iter->height <= image->bits.height)
-    {
-	iter->buffer =
-	    image->bits.bits + iter->y * image->bits.rowstride + iter->x;
-
-	iter->get_scanline = noop_get_scanline;
-    }
+    if (iter->image->type == SOLID)
+	color = image->solid.color_float;
     else
-    {
-	return FALSE;
-    }
+	color = image->bits.fetch_pixel_float (&image->bits, 0, 0); /* non-SOLID image: use pixel (0, 0) */
 
-    return TRUE;
+    while (buffer < end)
+	*(buffer++) = color;
 }
 
-static pixman_bool_t
-noop_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+static void
+noop_init_direct_buffer (pixman_iter_t *iter, const pixman_iter_info_t *info) /* info is unused */
 {
     pixman_image_t *image = iter->image;
-    uint32_t image_flags = iter->image_flags;
-    uint32_t iter_flags = iter->iter_flags;
-    
-    if ((image_flags & FAST_PATH_STD_DEST_FLAGS) == FAST_PATH_STD_DEST_FLAGS	&&
-	(iter_flags & ITER_NARROW) == ITER_NARROW				&&
-	((image->common.extended_format_code == PIXMAN_a8r8g8b8)	||
-	 (image->common.extended_format_code == PIXMAN_x8r8g8b8 &&
-	  (iter_flags & (ITER_LOCALIZED_ALPHA)))))
-    {
-	iter->buffer = image->bits.bits + iter->y * image->bits.rowstride + iter->x;
-
-	iter->get_scanline = _pixman_iter_get_scanline_noop;
-	iter->write_back = dest_write_back_direct;
-
-	return TRUE;
-    }
-    else
-    {
-	return FALSE;
-    }
+
+    iter->buffer =
+	image->bits.bits + iter->y * image->bits.rowstride + iter->x; /* point buffer at pixel (iter->x, iter->y) inside the image itself */
 }
 
+static void
+dest_write_back_direct (pixman_iter_t *iter)
+{
+    iter->buffer += iter->image->bits.rowstride; /* step the buffer pointer by one rowstride */
+}
+
+static const pixman_iter_info_t noop_iters[] =
+{
+    /* Source iters */
+    { PIXMAN_any, /* any source whose alpha and RGB are both ignored */
+      0, ITER_IGNORE_ALPHA | ITER_IGNORE_RGB | ITER_SRC,
+      NULL,
+      _pixman_iter_get_scanline_noop,
+      NULL
+    },
+    { PIXMAN_solid, /* solid color, uint32_t (narrow) buffer */
+      FAST_PATH_NO_ALPHA_MAP, ITER_NARROW | ITER_SRC,
+      noop_init_solid_narrow,
+      _pixman_iter_get_scanline_noop,
+      NULL,
+    },
+    { PIXMAN_solid, /* solid color, argb_t (wide) buffer */
+      FAST_PATH_NO_ALPHA_MAP, ITER_WIDE | ITER_SRC,
+      noop_init_solid_wide,
+      _pixman_iter_get_scanline_noop,
+      NULL
+    },
+    { PIXMAN_a8r8g8b8, /* identity-transform a8r8g8b8: buffer points into the image bits */
+      FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |
+          FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST,
+      ITER_NARROW | ITER_SRC,
+      noop_init_direct_buffer,
+      noop_get_scanline,
+      NULL
+    },
+    /* Dest iters */
+    { PIXMAN_a8r8g8b8, /* buffer points into the destination bits */
+      FAST_PATH_STD_DEST_FLAGS, ITER_NARROW | ITER_DEST,
+      noop_init_direct_buffer,
+      _pixman_iter_get_scanline_noop,
+      dest_write_back_direct
+    },
+    { PIXMAN_x8r8g8b8, /* as above, but listed with ITER_LOCALIZED_ALPHA */
+      FAST_PATH_STD_DEST_FLAGS, ITER_NARROW | ITER_DEST | ITER_LOCALIZED_ALPHA,
+      noop_init_direct_buffer,
+      _pixman_iter_get_scanline_noop,
+      dest_write_back_direct
+    },
+    { PIXMAN_null }, /* presumably the end-of-table marker - confirm against the table walker */
+};
+
 static const pixman_fast_path_t noop_fast_paths[] =
 {
     { PIXMAN_OP_DST, PIXMAN_any, 0, PIXMAN_any, 0, PIXMAN_any, 0, noop_composite },
@@ -169,8 +155,7 @@ _pixman_implementation_create_noop (pixman_implementation_t *fallback)
     pixman_implementation_t *imp =
 	_pixman_implementation_create (fallback, noop_fast_paths);
  
-    imp->src_iter_init = noop_src_iter_init;
-    imp->dest_iter_init = noop_dest_iter_init;
+    imp->iter_info = noop_iters;
 
     return imp;
 }
diff --git a/pixman/pixman/pixman-private.h b/pixman/pixman/pixman-private.h
index 6d9c05321..af4a0b6e0 100644
--- a/pixman/pixman/pixman-private.h
+++ b/pixman/pixman/pixman-private.h
@@ -212,7 +212,8 @@ typedef void      (* pixman_iter_write_back_t)   (pixman_iter_t *iter);
 
 typedef enum
 {
-    ITER_NARROW =		(1 << 0),
+    ITER_NARROW =               (1 << 0),
+    ITER_WIDE =                 (1 << 1),
 
     /* "Localized alpha" is when the alpha channel is used only to compute
      * the alpha value of the destination. This means that the computation
@@ -229,9 +230,15 @@ typedef enum
      * we can treat it as if it were ARGB, which means in some cases we can
      * avoid copying it to a temporary buffer.
      */
-    ITER_LOCALIZED_ALPHA =	(1 << 1),
-    ITER_IGNORE_ALPHA =		(1 << 2),
-    ITER_IGNORE_RGB =		(1 << 3)
+    ITER_LOCALIZED_ALPHA =	(1 << 2),
+    ITER_IGNORE_ALPHA =		(1 << 3),
+    ITER_IGNORE_RGB =		(1 << 4),
+
+    /* These indicate whether the iterator is for a source
+     * or a destination image
+     */
+    ITER_SRC =			(1 << 5),
+    ITER_DEST =			(1 << 6)
 } iter_flags_t;
 
 struct pixman_iter_t
@@ -255,6 +262,19 @@ struct pixman_iter_t
     int				stride;
 };
 
+typedef struct pixman_iter_info_t pixman_iter_info_t;
+typedef void (* pixman_iter_initializer_t) (pixman_iter_t *iter,
+                                            const pixman_iter_info_t *info);
+struct pixman_iter_info_t
+{
+    pixman_format_code_t	format;		/* image format of this entry (tables also use PIXMAN_any / PIXMAN_null) */
+    uint32_t			image_flags;	/* FAST_PATH_* flags; presumably all required on the image - confirm */
+    iter_flags_t		iter_flags;	/* ITER_* flags; presumably all required on the iterator - confirm */
+    pixman_iter_initializer_t	initializer;	/* setup hook; NULL in some table entries */
+    pixman_iter_get_scanline_t	get_scanline;
+    pixman_iter_write_back_t	write_back;	/* NULL in the source-iterator entries */
+};
+
 void
 _pixman_bits_image_setup_accessors (bits_image_t *image);
 
@@ -454,8 +474,6 @@ typedef pixman_bool_t (*pixman_fill_func_t) (pixman_implementation_t *imp,
 					     int                      width,
 					     int                      height,
 					     uint32_t                 filler);
-typedef pixman_bool_t (*pixman_iter_init_func_t) (pixman_implementation_t *imp,
-						  pixman_iter_t           *iter);
 
 void _pixman_setup_combiner_functions_32 (pixman_implementation_t *imp);
 void _pixman_setup_combiner_functions_float (pixman_implementation_t *imp);
@@ -477,11 +495,10 @@ struct pixman_implementation_t
     pixman_implementation_t *	toplevel;
     pixman_implementation_t *	fallback;
     const pixman_fast_path_t *	fast_paths;
+    const pixman_iter_info_t *  iter_info;
 
     pixman_blt_func_t		blt;
     pixman_fill_func_t		fill;
-    pixman_iter_init_func_t     src_iter_init;
-    pixman_iter_init_func_t     dest_iter_init;
 
     pixman_combine_32_func_t	combine_32[PIXMAN_N_OPERATORS];
     pixman_combine_32_func_t	combine_32_ca[PIXMAN_N_OPERATORS];
@@ -542,29 +559,17 @@ _pixman_implementation_fill (pixman_implementation_t *imp,
                              int                      height,
                              uint32_t                 filler);
 
-pixman_bool_t
-_pixman_implementation_src_iter_init (pixman_implementation_t       *imp,
-				      pixman_iter_t                 *iter,
-				      pixman_image_t                *image,
-				      int                            x,
-				      int                            y,
-				      int                            width,
-				      int                            height,
-				      uint8_t                       *buffer,
-				      iter_flags_t                   flags,
-				      uint32_t                       image_flags);
-
-pixman_bool_t
-_pixman_implementation_dest_iter_init (pixman_implementation_t       *imp,
-				       pixman_iter_t                 *iter,
-				       pixman_image_t                *image,
-				       int                            x,
-				       int                            y,
-				       int                            width,
-				       int                            height,
-				       uint8_t                       *buffer,
-				       iter_flags_t                   flags,
-				       uint32_t                       image_flags);
+void
+_pixman_implementation_iter_init (pixman_implementation_t       *imp,
+                                  pixman_iter_t                 *iter,
+                                  pixman_image_t                *image,
+                                  int                            x,
+                                  int                            y,
+                                  int                            width,
+                                  int                            height,
+                                  uint8_t                       *buffer,
+                                  iter_flags_t                   flags,
+                                  uint32_t                       image_flags);
 
 /* Specific implementations */
 pixman_implementation_t *
@@ -647,6 +652,9 @@ _pixman_compute_composite_region32 (pixman_region32_t * region,
 uint32_t *
 _pixman_iter_get_scanline_noop (pixman_iter_t *iter, const uint32_t *mask);
 
+void
+_pixman_iter_init_bits_stride (pixman_iter_t *iter, const pixman_iter_info_t *info);
+
 /* These "formats" all have depth 0, so they
  * will never clash with any real ones
  */
diff --git a/pixman/pixman/pixman-region.c b/pixman/pixman/pixman-region.c
index 2d6f1571c..59bc9c797 100644
--- a/pixman/pixman/pixman-region.c
+++ b/pixman/pixman/pixman-region.c
@@ -1858,7 +1858,7 @@ pixman_region_subtract_o (region_type_t * region,
         else if (r2->x1 <= x1)
         {
             /*
-	     * Subtrahend preceeds minuend: nuke left edge of minuend.
+	     * Subtrahend precedes minuend: nuke left edge of minuend.
 	     */
             x1 = r2->x2;
             if (x1 >= r1->x2)
@@ -1982,7 +1982,7 @@ PREFIX (_subtract) (region_type_t *reg_d,
     }
 
     /* Add those rectangles in region 1 that aren't in region 2,
-       do yucky substraction for overlaps, and
+       do yucky subtraction for overlaps, and
        just throw away rectangles in region 2 that aren't in region 1 */
     if (!pixman_op (reg_d, reg_m, reg_s, pixman_region_subtract_o, TRUE, FALSE))
 	return FALSE;
@@ -2042,7 +2042,7 @@ PREFIX (_inverse) (region_type_t *new_reg,  /* Destination region */
     }
 
     /* Add those rectangles in region 1 that aren't in region 2,
-     * do yucky substraction for overlaps, and
+     * do yucky subtraction for overlaps, and
      * just throw away rectangles in region 2 that aren't in region 1
      */
     inv_reg.extents = *inv_rect;
diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c
index c7e9a4bb2..dde923524 100644
--- a/pixman/pixman/pixman-sse2.c
+++ b/pixman/pixman/pixman-sse2.c
@@ -5554,19 +5554,27 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
 			      uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
 
-#define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1)
-
-#define BILINEAR_DECLARE_VARIABLES						\
+#if BILINEAR_INTERPOLATION_BITS < 8
+# define BILINEAR_DECLARE_VARIABLES						\
     const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
     const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
-    const __m128i xmm_xorc8 = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, BMSK);\
-    const __m128i xmm_addc8 = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);		\
-    const __m128i xmm_xorc7 = _mm_set_epi16 (0, BMSK, 0, BMSK, 0, BMSK, 0, BMSK);\
-    const __m128i xmm_addc7 = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
+    const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
+    const __m128i xmm_ux = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,	\
+					  unit_x, -unit_x, unit_x, -unit_x);	\
+    const __m128i xmm_zero = _mm_setzero_si128 ();				\
+    __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1),		\
+				   vx, -(vx + 1), vx, -(vx + 1))
+#else
+# define BILINEAR_DECLARE_VARIABLES						\
+    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
+    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
+    const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);		\
     const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x,	\
-					  unit_x, unit_x, unit_x, unit_x);	\
+					  -unit_x, -unit_x, -unit_x, -unit_x);	\
     const __m128i xmm_zero = _mm_setzero_si128 ();				\
-    __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx)
+    __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx,				\
+				   -(vx + 1), -(vx + 1), -(vx + 1), -(vx + 1))
+#endif
 
 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
 do {										\
@@ -5585,8 +5593,8 @@ do {										\
     if (BILINEAR_INTERPOLATION_BITS < 8)					\
     {										\
 	/* calculate horizontal weights */					\
-	xmm_wh = _mm_add_epi16 (xmm_addc7, _mm_xor_si128 (xmm_xorc7,		\
-		   _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS)));	\
+	xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,		\
+					16 - BILINEAR_INTERPOLATION_BITS));	\
 	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
 	/* horizontal interpolation */						\
 	a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (		\
@@ -5595,8 +5603,8 @@ do {										\
     else									\
     {										\
 	/* calculate horizontal weights */					\
-	xmm_wh = _mm_add_epi16 (xmm_addc8, _mm_xor_si128 (xmm_xorc8,		\
-		_mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS)));	\
+	xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,		\
+					16 - BILINEAR_INTERPOLATION_BITS));	\
 	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
 	/* horizontal interpolation */						\
 	xmm_lo = _mm_mullo_epi16 (a, xmm_wh);					\
@@ -6332,52 +6340,23 @@ sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
     return iter->buffer;
 }
 
-typedef struct
-{
-    pixman_format_code_t	format;
-    pixman_iter_get_scanline_t	get_scanline;
-} fetcher_info_t;
-
-static const fetcher_info_t fetchers[] =
-{
-    { PIXMAN_x8r8g8b8,		sse2_fetch_x8r8g8b8 },
-    { PIXMAN_r5g6b5,		sse2_fetch_r5g6b5 },
-    { PIXMAN_a8,		sse2_fetch_a8 },
-    { PIXMAN_null }
-};
-
-static pixman_bool_t
-sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
-{
-    pixman_image_t *image = iter->image;
-
-#define FLAGS								\
+#define IMAGE_FLAGS							\
     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
      FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
 
-    if ((iter->iter_flags & ITER_NARROW)			&&
-	(iter->image_flags & FLAGS) == FLAGS)
-    {
-	const fetcher_info_t *f;
-
-	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
-	{
-	    if (image->common.extended_format_code == f->format)
-	    {
-		uint8_t *b = (uint8_t *)image->bits.bits;
-		int s = image->bits.rowstride * 4;
-
-		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
-		iter->stride = s;
-
-		iter->get_scanline = f->get_scanline;
-		return TRUE;
-	    }
-	}
-    }
-
-    return FALSE;
-}
+static const pixman_iter_info_t sse2_iters[] =
+{   /* columns: format, image_flags, iter_flags, initializer, get_scanline, write_back */
+    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, sse2_fetch_x8r8g8b8, NULL
+    },
+    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, sse2_fetch_r5g6b5, NULL
+    },
+    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, sse2_fetch_a8, NULL
+    },
+    { PIXMAN_null }, /* PIXMAN_null entry; presumably the end-of-table marker */
+};
 
 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
 __attribute__((__force_align_arg_pointer__))
@@ -6435,7 +6414,7 @@ _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
     imp->blt = sse2_blt;
     imp->fill = sse2_fill;
 
-    imp->src_iter_init = sse2_src_iter_init;
+    imp->iter_info = sse2_iters;
 
     return imp;
 }
diff --git a/pixman/pixman/pixman-utils.c b/pixman/pixman/pixman-utils.c
index f31171f6d..98723a800 100644
--- a/pixman/pixman/pixman-utils.c
+++ b/pixman/pixman/pixman-utils.c
@@ -214,6 +214,17 @@ _pixman_iter_get_scanline_noop (pixman_iter_t *iter, const uint32_t *mask)
     return iter->buffer;
 }
 
+void
+_pixman_iter_init_bits_stride (pixman_iter_t *iter, const pixman_iter_info_t *info)
+{   /* Sets iter->bits to the address of pixel (iter->x, iter->y) and iter->stride to the byte stride. */
+    pixman_image_t *image = iter->image;
+    uint8_t *b = (uint8_t *)image->bits.bits;
+    int s = image->bits.rowstride * 4; /* used as a byte stride; presumably rowstride counts 32-bit units */
+
+    iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (info->format) / 8;
+    iter->stride = s;
+}
+
 #define N_TMP_BOXES (16)
 
 pixman_bool_t
diff --git a/pixman/pixman/pixman-vmx.c b/pixman/pixman/pixman-vmx.c
index 6868704a8..f629003ab 100644
--- a/pixman/pixman/pixman-vmx.c
+++ b/pixman/pixman/pixman-vmx.c
@@ -25,7 +25,9 @@
  * Based on fbmmx.c by Owen Taylor, Søren Sandmann and Nicholas Miell
  */
 
+#ifdef HAVE_CONFIG_H
 #include <config.h>
+#endif
 #include "pixman-private.h"
 #include "pixman-combine32.h"
 #include <altivec.h>
diff --git a/pixman/pixman/pixman.c b/pixman/pixman/pixman.c
index 184f0c4e6..9555ceaaf 100644
--- a/pixman/pixman/pixman.c
+++ b/pixman/pixman/pixman.c
@@ -605,7 +605,7 @@ pixman_image_composite32 (pixman_op_t      op,
     else
     {
 	mask_format = PIXMAN_null;
-	info.mask_flags = FAST_PATH_IS_OPAQUE;
+	info.mask_flags = FAST_PATH_IS_OPAQUE | FAST_PATH_NO_ALPHA_MAP;
     }
 
     dest_format = dest->common.extended_format_code;
diff --git a/pixman/pixman/refactor b/pixman/pixman/refactor
deleted file mode 100644
index 52fceab17..000000000
--- a/pixman/pixman/refactor
+++ /dev/null
@@ -1,478 +0,0 @@
-Roadmap
-
-- Move all the fetchers etc. into pixman-image to make pixman-compose.c
-  less intimidating.
-
-  DONE
-
-- Make combiners for unified alpha take a mask argument. That way
-  we won't need two separate paths for unified vs component in the
-  general compositing code.
-
-  DONE, except that the Altivec code needs to be updated. Luca is
-  looking into that.
-
-- Delete separate 'unified alpha' path
- 
-  DONE
-
-- Split images into their own files
-
-  DONE
-
-- Split the gradient walker code out into its own file
-
-  DONE
-
-- Add scanline getters per image
-
-  DONE
-
-- Generic 64 bit fetcher 
-
-  DONE
-
-- Split fast path tables into their respective architecture dependent
-  files.
-
-See "Render Algorithm" below for rationale
-
-Images will eventually have these virtual functions:
-
-       get_scanline()
-       get_scanline_wide()
-       get_pixel()
-       get_pixel_wide()
-       get_untransformed_pixel()
-       get_untransformed_pixel_wide()
-       get_unfiltered_pixel()
-       get_unfiltered_pixel_wide()
-
-       store_scanline()
-       store_scanline_wide()
-
-1.
-
-Initially we will just have get_scanline() and get_scanline_wide();
-these will be based on the ones in pixman-compose. Hopefully this will
-reduce the complexity in pixman_composite_rect_general().
-
-Note that there is access considerations - the compose function is
-being compiled twice.
-
-
-2.
-
-Split image types into their own source files. Export noop virtual
-reinit() call.  Call this whenever a property of the image changes.
-
-
-3. 
-
-Split the get_scanline() call into smaller functions that are
-initialized by the reinit() call.
-
-The Render Algorithm:
-	(first repeat, then filter, then transform, then clip)
-
-Starting from a destination pixel (x, y), do
-
-	1 x = x - xDst + xSrc
-	  y = y - yDst + ySrc
-
-	2 reject pixel that is outside the clip
-
-	This treats clipping as something that happens after
-	transformation, which I think is correct for client clips. For
-	hierarchy clips it is wrong, but who really cares? Without
-	GraphicsExposes hierarchy clips are basically irrelevant. Yes,
-	you could imagine cases where the pixels of a subwindow of a
-	redirected, transformed window should be treated as
-	transparent. I don't really care
-
-	Basically, I think the render spec should say that pixels that
-	are unavailable due to the hierarcy have undefined content,
-	and that GraphicsExposes are not generated. Ie., basically
-	that using non-redirected windows as sources is fail. This is
-	at least consistent with the current implementation and we can
-	update the spec later if someone makes it work.
-
-	The implication for render is that it should stop passing the
-	hierarchy clip to pixman. In pixman, if a souce image has a
-	clip it should be used in computing the composite region and
-	nowhere else, regardless of what "has_client_clip" says. The
-	default should be for there to not be any clip.
-
-	I would really like to get rid of the client clip as well for
-	source images, but unfortunately there is at least one
-	application in the wild that uses them.
-
-	3 Transform pixel: (x, y) = T(x, y)
-
-	4 Call p = GetUntransformedPixel (x, y)
-
-	5 If the image has an alpha map, then
-
-		Call GetUntransformedPixel (x, y) on the alpha map
-		
-		add resulting alpha channel to p
-
-	   return p
-
-	Where GetUnTransformedPixel is:
-
-	6 switch (filter)
-	  {
-	  case NEAREST:
-		return GetUnfilteredPixel (x, y);
-		break;
-
-	  case BILINEAR:
-		return GetUnfilteredPixel (...) // 4 times 
-		break;
-
-	  case CONVOLUTION:
-		return GetUnfilteredPixel (...) // as many times as necessary.
-		break;
-	  }
-
-	Where GetUnfilteredPixel (x, y) is
-
-	7 switch (repeat)
-	   {
-	   case REPEAT_NORMAL:
-	   case REPEAT_PAD:
-	   case REPEAT_REFLECT:
-		// adjust x, y as appropriate
-		break;
-
-	   case REPEAT_NONE:
-	        if (x, y) is outside image bounds
-		     return 0;
-		break;
-	   }
-
-	   return GetRawPixel(x, y)
-
-	Where GetRawPixel (x, y) is
-
-	8 Compute the pixel in question, depending on image type.
-
-For gradients, repeat has a totally different meaning, so
-UnfilteredPixel() and RawPixel() must be the same function so that
-gradients can do their own repeat algorithm.
-
-So, the GetRawPixel
-
-	for bits must deal with repeats
-	for gradients must deal with repeats (differently)
-	for solids, should ignore repeats.
-
-	for polygons, when we add them, either ignore repeats or do
-	something similar to bits (in which case, we may want an extra
-	layer of indirection to modify the coordinates).
-
-It is then possible to build things like "get scanline" or "get tile" on
-top of this. In the simplest case, just repeatedly calling GetPixel()
-would work, but specialized get_scanline()s or get_tile()s could be
-plugged in for common cases. 
-
-By not plugging anything in for images with access functions, we only
-have to compile the pixel functions twice, not the scanline functions.
-
-And we can get rid of fetchers for the bizarre formats that no one
-uses. Such as b2g3r3 etc. r1g2b1? Seriously? It is also worth
-considering a generic format based pixel fetcher for these edge cases.
-
-Since the actual routines depend on the image attributes, the images
-must be notified when those change and update their function pointers
-appropriately. So there should probably be a virtual function called
-(* reinit) or something like that.
-
-There will also be wide fetchers for both pixels and lines. The line
-fetcher will just call the wide pixel fetcher. The wide pixel fetcher
-will just call expand, except for 10 bit formats.
-
-Rendering pipeline:
-
-Drawable:
-	0. if (picture has alpha map)
-		0.1. Position alpha map according to the alpha_x/alpha_y
-	        0.2. Where the two drawables intersect, the alpha channel
-		     Replace the alpha channel of source with the one
-		     from the alpha map. Replacement only takes place
-		     in the intersection of the two drawables' geometries.
-	1. Repeat the drawable according to the repeat attribute
-	2. Reconstruct a continuous image according to the filter
-	3. Transform according to the transform attribute
-	4. Position image such that src_x, src_y is over dst_x, dst_y
-	5. Sample once per destination pixel 
-	6. Clip. If a pixel is not within the source clip, then no
-	   compositing takes place at that pixel. (Ie., it's *not*
-	   treated as 0).
-
-	Sampling a drawable: 
-
-	- If the channel does not have an alpha channel, the pixels in it
-	  are treated as opaque.
-
-	Note on reconstruction:
-
-	- The top left pixel has coordinates (0.5, 0.5) and pixels are
-	  spaced 1 apart.
-
-Gradient:
-	1. Unless gradient type is conical, repeat the underlying (0, 1)
-		gradient according to the repeat attribute
-	2. Integrate the gradient across the plane according to type.
-	3. Transform according to transform attribute
-	4. Position gradient 
-	5. Sample once per destination pixel.
- 	6. Clip
-
-Solid Fill:
-	1. Repeat has no effect
-	2. Image is already continuous and defined for the entire plane
-	3. Transform has no effect
-	4. Positioning has no effect
-	5. Sample once per destination pixel.
-	6. Clip
-
-Polygon:
-	1. Repeat has no effect
-	2. Image is already continuous and defined on the whole plane
-	3. Transform according to transform attribute
-	4. Position image
-	5. Supersample 15x17 per destination pixel.
-	6. Clip
-
-Possibly interesting additions:
-	- More general transformations, such as warping, or general
-	  shading.
-
-	- Shader image where a function is called to generate the
-          pixel (ie., uploading assembly code).
-
-	- Resampling kernels
-
-	  In principle the polygon image uses a 15x17 box filter for
-	  resampling. If we allow general resampling filters, then we
-	  get all the various antialiasing types for free. 
-
-	  Bilinear downsampling looks terrible and could be much 
-	  improved by a resampling filter. NEAREST reconstruction
-	  combined with a box resampling filter is what GdkPixbuf
-	  does, I believe.
-
-	  Useful for high frequency gradients as well.
-
-	  (Note that the difference between a reconstruction and a
-	  resampling filter is mainly where in the pipeline they
-	  occur. High quality resampling should use a correctly
-	  oriented kernel so it should happen after transformation.
-
-	  An implementation can transform the resampling kernel and
-	  convolve it with the reconstruction if it so desires, but it
-	  will need to deal with the fact that the resampling kernel
-	  will not necessarily be pixel aligned.
-
-	  "Output kernels"
-
-	  One could imagine doing the resampling after compositing,
-	  ie., for each destination pixel sample each source image 16
-	  times, then composite those subpixels individually, then
-	  finally apply a kernel.
-
-	  However, this is effectively the same as full screen
-	  antialiasing, which is a simpler way to think about it. So
-	  resampling kernels may make sense for individual images, but
-	  not as a post-compositing step.
-	  
-	  Fullscreen AA is inefficient without chained compositing
-	  though. Consider an (image scaled up to oversample size IN
-	  some polygon) scaled down to screen size. With the current
-	  implementation, there will be a huge temporary. With chained
-	  compositing, the whole thing ends up being equivalent to the
-	  output kernel from above.
-
-	- Color space conversion
-
-	  The complete model here is that each surface has a color
-	  space associated with it and that the compositing operation
-	  also has one associated with it. Note also that gradients
-	  should have associcated colorspaces.
-
-	- Dithering
-
-	  If people dither something that is already dithered, it will
-	  look terrible, but don't do that, then. (Dithering happens
-	  after resampling if at all - what is the relationship
-	  with color spaces? Presumably dithering should happen in linear
-	  intensity space).
-
-	- Floating point surfaces, 16, 32 and possibly 64 bit per
-	  channel.
-
-	Maybe crack:
-
-	- Glyph polygons
-
-	  If glyphs could be given as polygons, they could be
-	  positioned and rasterized more accurately. The glyph
-	  structure would need subpixel positioning though.
-
-	- Luminance vs. coverage for the alpha channel
-
-	  Whether the alpha channel should be interpreted as luminance
-          modulation or as coverage (intensity modulation). This is a
-          bit of a departure from the rendering model though. It could
-	  also be considered whether it should be possible to have 
-	  both channels in the same drawable.
-
-	- Alternative for component alpha
-
-	  - Set component-alpha on the output image.
-
-	    - This means each of the components are sampled
-	      independently and composited in the corresponding
-	      channel only.
-
-	  - Have 3 x oversampled mask
-
-	  - Scale it down by 3 horizontally, with [ 1/3, 1/3, 1/3 ]
-            resampling filter. 
-
-	    Is this equivalent to just using a component alpha mask?
-
-	Incompatible changes:
-
-	- Gradients could be specified with premultiplied colors. (You
-	  can use a mask to get things like gradients from solid red to
-	  transparent red.
-
-Refactoring pixman
-
-The pixman code is not particularly nice to put it mildly. Among the
-issues are
-
-- inconsistent naming style (fb vs Fb, camelCase vs
-  underscore_naming). Sometimes there is even inconsistency *within*
-  one name.
-
-      fetchProc32 ACCESS(pixman_fetchProcForPicture32)
-
-  may be one of the uglies names ever created.
-
-  coding style: 
-  	 use the one from cairo except that pixman uses this brace style:
-	 
-		while (blah)
-		{
-		}
-
-	Format do while like this:
-
-	       do 
-	       {
-
-	       } 
-	       while (...);
-
-- PIXMAN_COMPOSITE_RECT_GENERAL() is horribly complex
-
-- switch case logic in pixman-access.c
-
-  Instead it would be better to just store function pointers in the
-  image objects themselves,
-
-  	get_pixel()
-	get_scanline()
-
-- Much of the scanline fetching code is for formats that no one 
-  ever uses. a2r2g2b2 anyone?
-
-  It would probably be worthwhile having a generic fetcher for any
-  pixman format whatsoever.
-
-- Code related to particular image types should be split into individual
-  files.
-
-	pixman-bits-image.c
-	pixman-linear-gradient-image.c
-	pixman-radial-gradient-image.c
-	pixman-solid-image.c
-
-- Fast path code should be split into files based on architecture:
-
-       pixman-mmx-fastpath.c
-       pixman-sse2-fastpath.c
-       pixman-c-fastpath.c
-
-       etc.
-
-  Each of these files should then export a fastpath table, which would
-  be declared in pixman-private.h. This should allow us to get rid
-  of the pixman-mmx.h files.
-
-  The fast path table should describe each fast path. Ie there should
-  be bitfields indicating what things the fast path can handle, rather than
-  like now where it is only allowed to take one format per src/mask/dest. Ie., 
-
-  { 
-    FAST_a8r8g8b8 | FAST_x8r8g8b8,
-    FAST_null,
-    FAST_x8r8g8b8,
-    FAST_repeat_normal | FAST_repeat_none,
-    the_fast_path
-  }
-
-There should then be *one* file that implements pixman_image_composite(). 
-This should do this:
-
-     optimize_operator();
-
-     convert 1x1 repeat to solid (actually this should be done at
-     image creation time).
-     
-     is there a useful fastpath?
-
-There should be a file called pixman-cpu.c that contains all the
-architecture specific stuff to detect what CPU features we have.
-
-Issues that must be kept in mind:
-
-       - we need accessor code to be preserved
-
-       - maybe there should be a "store_scanline" too?
-
-         Is this sufficient?
-
-	 We should preserve the optimization where the
-	 compositing happens directly in the destination
-	 whenever possible.
-
-	- It should be possible to create GPU samplers from the
-	  images.
-
-The "horizontal" classification should be a bit in the image, the
-"vertical" classification should just happen inside the gradient
-file. Note though that
-
-      (a) these will change if the tranformation/repeat changes.
-
-      (b) at the moment the optimization for linear gradients
-          takes the source rectangle into account. Presumably
-	  this is to also optimize the case where the gradient
-	  is close enough to horizontal?
-
-Who is responsible for repeats? In principle it should be the scanline
-fetch. Right now NORMAL repeats are handled by walk_composite_region()
-while other repeats are handled by the scanline code.
-
-
-(Random note on filtering: do you filter before or after
-transformation?  Hardware is going to filter after transformation;
-this is also what pixman does currently). It's not completely clear
-what filtering *after* transformation means. One thing that might look
-good would be to do *supersampling*, ie., compute multiple subpixels
-per destination pixel, then average them together.
diff --git a/pixman/test/blitters-test.c b/pixman/test/blitters-test.c
index 8766fa800..a2c6ff4d8 100644
--- a/pixman/test/blitters-test.c
+++ b/pixman/test/blitters-test.c
@@ -46,7 +46,16 @@ create_random_image (pixman_format_code_t *allowed_formats,
     /* do the allocation */
     buf = aligned_malloc (64, stride * height);
 
-    prng_randmemset (buf, stride * height, RANDMEMSET_MORE_00_AND_FF);
+    if (prng_rand_n (4) == 0)
+    {
+	/* uniform distribution */
+	prng_randmemset (buf, stride * height, 0);
+    }
+    else
+    {
+	/* significantly increased probability for 0x00 and 0xFF */
+	prng_randmemset (buf, stride * height, RANDMEMSET_MORE_00_AND_FF);
+    }
 
     img = pixman_image_create_bits (fmt, width, height, buf, stride);
 
@@ -393,6 +402,6 @@ main (int argc, const char *argv[])
     }
 
     return fuzzer_test_main("blitters", 2000000,
-			    0xD8265D5E,
+			    0x0CF3283B,
 			    test_composite, argc, argv);
 }
diff --git a/pixman/test/lowlevel-blt-bench.c b/pixman/test/lowlevel-blt-bench.c
index 4e16f7ba1..1049e21e7 100644
--- a/pixman/test/lowlevel-blt-bench.c
+++ b/pixman/test/lowlevel-blt-bench.c
@@ -385,6 +385,7 @@ bench_composite (char * testname,
     double                          t1, t2, t3, pix_cnt;
     int64_t                         n, l1test_width, nlines;
     double                             bytes_per_pix = 0;
+    pixman_bool_t                   bench_pixbuf = FALSE;
 
     pixman_composite_func_t func = pixman_image_composite_wrapper;
 
@@ -422,16 +423,20 @@ bench_composite (char * testname,
 
     mask_img = NULL;
     xmask_img = NULL;
+    if (strcmp (testname, "pixbuf") == 0 || strcmp (testname, "rpixbuf") == 0)
+    {
+        bench_pixbuf = TRUE;
+    }
     if (!(mask_flags & SOLID_FLAG) && mask_fmt != PIXMAN_null)
     {
         bytes_per_pix += (mask_fmt >> 24) / ((op == PIXMAN_OP_SRC) ? 8.0 : 4.0);
         mask_img = pixman_image_create_bits (mask_fmt,
                                              WIDTH, HEIGHT,
-                                             mask,
+                                             bench_pixbuf ? src : mask,
                                              WIDTH * 4);
         xmask_img = pixman_image_create_bits (mask_fmt,
                                              XWIDTH, XHEIGHT,
-                                             mask,
+                                             bench_pixbuf ? src : mask,
                                              XWIDTH * 4);
     }
     else if (mask_fmt != PIXMAN_null)
@@ -643,6 +648,8 @@ tests_tbl[] =
     { "src_0888_0565",         PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
     { "src_0888_8888",         PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
     { "src_0888_x888",         PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "src_0888_8888_rev",     PIXMAN_b8g8r8,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "src_0888_0565_rev",     PIXMAN_b8g8r8,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
     { "src_x888_x888",         PIXMAN_x8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
     { "src_x888_8888",         PIXMAN_x8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
     { "src_8888_8888",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
@@ -707,6 +714,8 @@ tests_tbl[] =
     { "outrev_n_8888_x888_ca", PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8r8g8b8, 2, PIXMAN_x8r8g8b8 },
     { "outrev_n_8888_8888_ca", PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8r8g8b8, 2, PIXMAN_a8r8g8b8 },
     { "over_reverse_n_8888",   PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER_REVERSE, PIXMAN_null, 0, PIXMAN_a8r8g8b8 },
+    { "pixbuf",                PIXMAN_x8b8g8r8,    0, PIXMAN_OP_SRC,     PIXMAN_a8b8g8r8, 0, PIXMAN_a8r8g8b8 },
+    { "rpixbuf",               PIXMAN_x8b8g8r8,    0, PIXMAN_OP_SRC,     PIXMAN_a8b8g8r8, 0, PIXMAN_a8b8g8r8 },
 };
 
 int
diff --git a/pixman/test/prng-test.c b/pixman/test/prng-test.c
index 0a3ad5e8f..c1d9320cc 100644
--- a/pixman/test/prng-test.c
+++ b/pixman/test/prng-test.c
@@ -106,7 +106,10 @@ int main (int argc, char *argv[])
 {
     const uint32_t ref_crc[RANDMEMSET_MORE_00_AND_FF + 1] =
     {
-        0xBA06763D, 0x103FC550, 0x8B59ABA5, 0xD82A0F39
+        0xBA06763D, 0x103FC550, 0x8B59ABA5, 0xD82A0F39,
+        0xD2321099, 0xFD8C5420, 0xD3B7C42A, 0xFC098093,
+        0x85E01DE0, 0x6680F8F7, 0x4D32DD3C, 0xAE52382B,
+        0x149E6CB5, 0x8B336987, 0x15DCB2B3, 0x8A71B781
     };
     uint32_t crc1, crc2;
     uint32_t ref, seed, seed0, seed1, seed2, seed3;
diff --git a/pixman/test/utils-prng.c b/pixman/test/utils-prng.c
index 967b8989a..7b32e3531 100644
--- a/pixman/test/utils-prng.c
+++ b/pixman/test/utils-prng.c
@@ -107,6 +107,7 @@ randmemset_internal (prng_t                  *prng,
 {
     prng_t local_prng = *prng;
     prng_rand_128_data_t randdata;
+    size_t i;
 
     while (size >= 16)
     {
@@ -138,6 +139,22 @@ randmemset_internal (prng_t                  *prng,
                 };
                 randdata.vb &= (t.vb >= const_40);
             }
+            if (flags & RANDMEMSET_MORE_FFFFFFFF)
+            {
+                const uint32x4 const_C0000000 =
+                {
+                    0xC0000000, 0xC0000000, 0xC0000000, 0xC0000000
+                };
+                randdata.vw |= ((t.vw << 30) >= const_C0000000);
+            }
+            if (flags & RANDMEMSET_MORE_00000000)
+            {
+                const uint32x4 const_40000000 =
+                {
+                    0x40000000, 0x40000000, 0x40000000, 0x40000000
+                };
+                randdata.vw &= ((t.vw << 30) >= const_40000000);
+            }
 #else
             #define PROCESS_ONE_LANE(i)                                       \
                 if (flags & RANDMEMSET_MORE_FF)                               \
@@ -155,6 +172,18 @@ randmemset_internal (prng_t                  *prng,
                     mask_00 |= mask_00 >> 2;                                  \
                     mask_00 |= mask_00 >> 4;                                  \
                     randdata.w[i] &= mask_00;                                 \
+                }                                                             \
+                if (flags & RANDMEMSET_MORE_FFFFFFFF)                         \
+                {                                                             \
+                    int32_t mask_ff = ((t.w[i] << 30) & (t.w[i] << 31)) &     \
+                                       0x80000000;                            \
+                    randdata.w[i] |= mask_ff >> 31;                           \
+                }                                                             \
+                if (flags & RANDMEMSET_MORE_00000000)                         \
+                {                                                             \
+                    int32_t mask_00 = ((t.w[i] << 30) | (t.w[i] << 31)) &     \
+                                       0x80000000;                            \
+                    randdata.w[i] &= mask_00 >> 31;                           \
                 }
 
             PROCESS_ONE_LANE (0)
@@ -198,7 +227,8 @@ randmemset_internal (prng_t                  *prng,
         }
         size -= 16;
     }
-    while (size > 0)
+    i = 0;
+    while (i < size)
     {
         uint8_t randbyte = prng_rand_r (&local_prng) & 0xFF;
         if (flags != 0)
@@ -208,9 +238,25 @@ randmemset_internal (prng_t                  *prng,
                 randbyte = 0xFF;
             if ((flags & RANDMEMSET_MORE_00) && (t < 0x40))
                 randbyte = 0x00;
+            if (i % 4 == 0 && i + 4 <= size)
+            {
+                t = prng_rand_r (&local_prng) & 0xFF;
+                if ((flags & RANDMEMSET_MORE_FFFFFFFF) && (t >= 0xC0))
+                {
+                    memset(&buf[i], 0xFF, 4);
+                    i += 4;
+                    continue;
+                }
+                if ((flags & RANDMEMSET_MORE_00000000) && (t < 0x40))
+                {
+                    memset(&buf[i], 0x00, 4);
+                    i += 4;
+                    continue;
+                }
+            }
         }
-        *buf++ = randbyte;
-        size--;
+        buf[i] = randbyte;
+        i++;
     }
     *prng = local_prng;
 }
@@ -218,8 +264,10 @@ randmemset_internal (prng_t                  *prng,
 /*
  * Fill memory buffer with random data. Flags argument may be used
  * to tweak some statistics properties:
- *    RANDMEMSET_MORE_00 - set ~25% of bytes to 0x00
- *    RANDMEMSET_MORE_FF - set ~25% of bytes to 0xFF
+ *    RANDMEMSET_MORE_00        - set ~25% of bytes to 0x00
+ *    RANDMEMSET_MORE_FF        - set ~25% of bytes to 0xFF
+ *    RANDMEMSET_MORE_00000000  - ~25% chance for 00000000 4-byte clusters
+ *    RANDMEMSET_MORE_FFFFFFFF  - ~25% chance for FFFFFFFF 4-byte clusters
  */
 void prng_randmemset_r (prng_t                  *prng,
                         void                    *voidbuf,
diff --git a/pixman/test/utils-prng.h b/pixman/test/utils-prng.h
index 285107f08..564ffcef1 100644
--- a/pixman/test/utils-prng.h
+++ b/pixman/test/utils-prng.h
@@ -153,7 +153,10 @@ typedef enum
 {
     RANDMEMSET_MORE_00        = 1, /* ~25% chance for 0x00 bytes */
     RANDMEMSET_MORE_FF        = 2, /* ~25% chance for 0xFF bytes */
-    RANDMEMSET_MORE_00_AND_FF = (RANDMEMSET_MORE_00 | RANDMEMSET_MORE_FF)
+    RANDMEMSET_MORE_00000000  = 4, /* ~25% chance for 0x00000000 clusters */
+    RANDMEMSET_MORE_FFFFFFFF  = 8, /* ~25% chance for 0xFFFFFFFF clusters */
+    RANDMEMSET_MORE_00_AND_FF = (RANDMEMSET_MORE_00 | RANDMEMSET_MORE_00000000 |
+                                 RANDMEMSET_MORE_FF | RANDMEMSET_MORE_FFFFFFFF)
 } prng_randmemset_flags_t;
 
 /* Set the 32-bit seed for PRNG */
-- 
cgit v1.2.3