25 files changed, 1063 insertions, 1112 deletions
diff --git a/pixman/configure.ac b/pixman/configure.ac
index 38f89b31e..221179ff1 100644
--- a/pixman/configure.ac
+++ b/pixman/configure.ac
@@ -53,8 +53,8 @@ AC_PREREQ([2.57])
 #
 
 m4_define([pixman_major], 0)
-m4_define([pixman_minor], 29)
-m4_define([pixman_micro], 3)
+m4_define([pixman_minor], 31)
+m4_define([pixman_micro], 1)
 
 m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
 
@@ -279,7 +279,7 @@ AC_MSG_CHECKING(whether to use Loongson MMI assembler)
 
 xserver_save_CFLAGS=$CFLAGS
 CFLAGS=" $LS_CFLAGS $CFLAGS -I$srcdir"
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+AC_LINK_IFELSE([AC_LANG_SOURCE([[
 #ifndef __mips_loongson_vector_rev
 #error "Loongson Multimedia Instructions are only available on Loongson"
 #endif
@@ -845,6 +845,13 @@ if test x$have_gettimeofday = xyes && test x$have_sys_time_h = xyes; then
 fi
 
 dnl =====================================
+dnl Check for missing sqrtf() as, e.g., for Solaris 9
+
+AC_SEARCH_LIBS([sqrtf], [m], [],
+               [AC_DEFINE([sqrtf], [sqrt],
+                          [Define to sqrt if you do not have the `sqrtf' function.])])
+
+dnl =====================================
 dnl Thread local storage
 
 AC_MSG_CHECKING(for thread local storage (TLS) support)
diff --git a/pixman/pixman/pixman-arm-neon-asm.h b/pixman/pixman/pixman-arm-neon-asm.h
index 1673b080f..d0d92d74c 100644
--- a/pixman/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman/pixman-arm-neon-asm.h
@@ -385,7 +385,7 @@
  * execute simultaneously with NEON and be completely shadowed by it. Thus
  * we get no performance overhead at all (*). This looks like a very nice
  * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
- * but still can implement some rather advanced prefetch logic in sofware
+ * but still can implement some rather advanced prefetch logic in software
  * for almost zero cost!
  *
  * (*) The overhead of the prefetcher is visible when running some trivial
diff --git a/pixman/pixman/pixman-fast-path.c b/pixman/pixman/pixman-fast-path.c
index 247aea645..3982dce8b 100644
--- a/pixman/pixman/pixman-fast-path.c
+++ b/pixman/pixman/pixman-fast-path.c
@@ -2261,89 +2261,27 @@ fast_write_back_r5g6b5 (pixman_iter_t *iter)
     }
 }
 
-typedef struct
-{
-    pixman_format_code_t	format;
-    pixman_iter_get_scanline_t	get_scanline;
-    pixman_iter_write_back_t	write_back;
-} fetcher_info_t;
-
-static const fetcher_info_t fetchers[] =
-{
-    { PIXMAN_r5g6b5, fast_fetch_r5g6b5, fast_write_back_r5g6b5 },
-    { PIXMAN_null }
-};
-
-static pixman_bool_t
-fast_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
-{
-    pixman_image_t *image = iter->image;
-
-#define FLAGS								\
+#define IMAGE_FLAGS							\
     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
      FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
 
-    if ((iter->iter_flags & ITER_NARROW)			&&
-	(iter->image_flags & FLAGS) == FLAGS)
-    {
-	const fetcher_info_t *f;
-
-	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
-	{
-	    if (image->common.extended_format_code == f->format)
-	    {
-		uint8_t *b = (uint8_t *)image->bits.bits;
-		int s = image->bits.rowstride * 4;
-
-		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
-		iter->stride = s;
-
-		iter->get_scanline = f->get_scanline;
-		return TRUE;
-	    }
-	}
-    }
-
-    return FALSE;
-}
-
-static pixman_bool_t
-fast_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+static const pixman_iter_info_t fast_iters[] = 
 {
-    pixman_image_t *image = iter->image;
-
-    if ((iter->iter_flags & ITER_NARROW)		&&
-	(iter->image_flags & FAST_PATH_STD_DEST_FLAGS) == FAST_PATH_STD_DEST_FLAGS)
-    {
-	const fetcher_info_t *f;
-
-	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
-	{
-	    if (image->common.extended_format_code == f->format)
-	    {
-		uint8_t *b = (uint8_t *)image->bits.bits;
-		int s = image->bits.rowstride * 4;
-
-		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
-		iter->stride = s;
-
-		if ((iter->iter_flags & (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) ==
-		    (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA))
-		{
-		    iter->get_scanline = fast_dest_fetch_noop;
-		}
-		else
-		{
-		    iter->get_scanline = f->get_scanline;
-		}
-		iter->write_back = f->write_back;
-		return TRUE;
-	    }
-	}
-    }
-    return FALSE;
-}
-
+    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW | ITER_SRC,
+      _pixman_iter_init_bits_stride, fast_fetch_r5g6b5, NULL },
+
+    { PIXMAN_r5g6b5, FAST_PATH_STD_DEST_FLAGS,
+      ITER_NARROW | ITER_DEST,
+      _pixman_iter_init_bits_stride,
+      fast_fetch_r5g6b5, fast_write_back_r5g6b5 },
+    
+    { PIXMAN_r5g6b5, FAST_PATH_STD_DEST_FLAGS,
+      ITER_NARROW | ITER_DEST | ITER_IGNORE_RGB | ITER_IGNORE_ALPHA,
+      _pixman_iter_init_bits_stride,
+      fast_dest_fetch_noop, fast_write_back_r5g6b5 },
+
+    { PIXMAN_null },
+};
 
 pixman_implementation_t *
 _pixman_implementation_create_fast_path (pixman_implementation_t *fallback)
@@ -2351,8 +2289,7 @@ _pixman_implementation_create_fast_path (pixman_implementation_t *fallback)
     pixman_implementation_t *imp = _pixman_implementation_create (fallback, c_fast_paths);
 
     imp->fill = fast_path_fill;
-    imp->src_iter_init = fast_src_iter_init;
-    imp->dest_iter_init = fast_dest_iter_init;
+    imp->iter_info = fast_iters;
 
     return imp;
 }
diff --git a/pixman/pixman/pixman-filter.c b/pixman/pixman/pixman-filter.c
index 26b39d571..5ff7b6eaa 100644
--- a/pixman/pixman/pixman-filter.c
+++ b/pixman/pixman/pixman-filter.c
@@ -28,7 +28,9 @@
 #include <stdio.h>
 #include <math.h>
 #include <assert.h>
+#ifdef HAVE_CONFIG_H
 #include <config.h>
+#endif
 #include "pixman-private.h"
 
 typedef double (* kernel_func_t) (double x);
diff --git a/pixman/pixman/pixman-general.c b/pixman/pixman/pixman-general.c
index 93a1b9acf..4da5da5e2 100644
--- a/pixman/pixman/pixman-general.c
+++ b/pixman/pixman/pixman-general.c
@@ -37,43 +37,47 @@
 #include <string.h>
 #include "pixman-private.h"
 
-static pixman_bool_t
-general_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+static void
+general_iter_init (pixman_iter_t *iter, const pixman_iter_info_t *info)
 {
     pixman_image_t *image = iter->image;
 
-    if (image->type == LINEAR)
-	_pixman_linear_gradient_iter_init (image, iter);
-    else if (image->type == RADIAL)
+    switch (image->type)
+    {
+    case BITS:
+        if ((iter->iter_flags & ITER_SRC) == ITER_SRC)
+            _pixman_bits_image_src_iter_init (image, iter);
+        else
+            _pixman_bits_image_dest_iter_init (image, iter);
+        break;
+
+    case LINEAR:
+        _pixman_linear_gradient_iter_init (image, iter);
+        break;
+
+    case RADIAL:
 	_pixman_radial_gradient_iter_init (image, iter);
-    else if (image->type == CONICAL)
+        break;
+
+    case CONICAL:
 	_pixman_conical_gradient_iter_init (image, iter);
-    else if (image->type == BITS)
-	_pixman_bits_image_src_iter_init (image, iter);
-    else if (image->type == SOLID)
+        break;
+
+    case SOLID:
         _pixman_log_error (FUNC, "Solid image not handled by noop");
-    else         
-	_pixman_log_error (FUNC, "Pixman bug: unknown image type\n");
+        break;
 
-    return TRUE;
+    default:
+	_pixman_log_error (FUNC, "Pixman bug: unknown image type\n");
+        break;
+    }
 }
 
-static pixman_bool_t
-general_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+static const pixman_iter_info_t general_iters[] =
 {
-    if (iter->image->type == BITS)
-    {
-	_pixman_bits_image_dest_iter_init (iter->image, iter);
-
-	return TRUE;
-    }
-    else
-    {
-	_pixman_log_error (FUNC, "Trying to write to a non-writable image");
-
-	return FALSE;
-    }
-}
+    { PIXMAN_any, 0, 0, general_iter_init, NULL, NULL },
+    { PIXMAN_null },
+};
 
 typedef struct op_info_t op_info_t;
 struct op_info_t
@@ -116,7 +120,7 @@ general_composite_rect  (pixman_implementation_t *imp,
     pixman_iter_t src_iter, mask_iter, dest_iter;
     pixman_combine_32_func_t compose;
     pixman_bool_t component_alpha;
-    iter_flags_t narrow, src_iter_flags;
+    iter_flags_t width_flag, src_iter_flags;
     int Bpp;
     int i;
 
@@ -124,12 +128,12 @@ general_composite_rect  (pixman_implementation_t *imp,
 	(!mask_image || mask_image->common.flags & FAST_PATH_NARROW_FORMAT) &&
 	(dest_image->common.flags & FAST_PATH_NARROW_FORMAT))
     {
-	narrow = ITER_NARROW;
+	width_flag = ITER_NARROW;
 	Bpp = 4;
     }
     else
     {
-	narrow = 0;
+	width_flag = ITER_WIDE;
 	Bpp = 16;
     }
 
@@ -145,7 +149,7 @@ general_composite_rect  (pixman_implementation_t *imp,
     mask_buffer = src_buffer + width * Bpp;
     dest_buffer = mask_buffer + width * Bpp;
 
-    if (!narrow)
+    if (width_flag == ITER_WIDE)
     {
 	/* To make sure there aren't any NANs in the buffers */
 	memset (src_buffer, 0, width * Bpp);
@@ -154,11 +158,12 @@ general_composite_rect  (pixman_implementation_t *imp,
     }
     
     /* src iter */
-    src_iter_flags = narrow | op_flags[op].src;
+    src_iter_flags = width_flag | op_flags[op].src | ITER_SRC;
 
-    _pixman_implementation_src_iter_init (imp->toplevel, &src_iter, src_image,
-					  src_x, src_y, width, height,
-					  src_buffer, src_iter_flags, info->src_flags);
+    _pixman_implementation_iter_init (imp->toplevel, &src_iter, src_image,
+                                      src_x, src_y, width, height,
+                                      src_buffer, src_iter_flags,
+                                      info->src_flags);
 
     /* mask iter */
     if ((src_iter_flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) ==
@@ -176,17 +181,19 @@ general_composite_rect  (pixman_implementation_t *imp,
         mask_image->common.component_alpha    &&
         PIXMAN_FORMAT_RGB (mask_image->bits.format);
 
-    _pixman_implementation_src_iter_init (
-	imp->toplevel, &mask_iter, mask_image, mask_x, mask_y, width, height,
-	mask_buffer, narrow | (component_alpha? 0 : ITER_IGNORE_RGB), info->mask_flags);
+    _pixman_implementation_iter_init (
+	imp->toplevel, &mask_iter,
+	mask_image, mask_x, mask_y, width, height, mask_buffer,
+	ITER_SRC | width_flag | (component_alpha? 0 : ITER_IGNORE_RGB),
+	info->mask_flags);
 
     /* dest iter */
-    _pixman_implementation_dest_iter_init (
+    _pixman_implementation_iter_init (
 	imp->toplevel, &dest_iter, dest_image, dest_x, dest_y, width, height,
-	dest_buffer, narrow | op_flags[op].dst, info->dest_flags);
+	dest_buffer, ITER_DEST | width_flag | op_flags[op].dst, info->dest_flags);
 
     compose = _pixman_implementation_lookup_combiner (
-	imp->toplevel, op, component_alpha, narrow);
+	imp->toplevel, op, component_alpha, width_flag != ITER_WIDE);
 
     for (i = 0; i < height; ++i)
     {
@@ -219,8 +226,7 @@ _pixman_implementation_create_general (void)
     _pixman_setup_combiner_functions_32 (imp);
     _pixman_setup_combiner_functions_float (imp);
 
-    imp->src_iter_init = general_src_iter_init;
-    imp->dest_iter_init = general_dest_iter_init;
+    imp->iter_info = general_iters;
 
     return imp;
 }
diff --git a/pixman/pixman/pixman-image.c b/pixman/pixman/pixman-image.c
index 65041b43b..4f9c2f966 100644
--- a/pixman/pixman/pixman-image.c
+++ b/pixman/pixman/pixman-image.c
@@ -502,8 +502,10 @@ compute_image_info (pixman_image_t *image)
 	break;
     }
 
-    /* Alpha map */
-    if (!image->common.alpha_map)
+    /* Alpha maps are only supported for BITS images, so it's always
+     * safe to ignore their presense for non-BITS images
+     */
+    if (!image->common.alpha_map || image->type != BITS)
     {
 	flags |= FAST_PATH_NO_ALPHA_MAP;
     }
@@ -918,10 +920,10 @@ _pixman_image_get_solid (pixman_implementation_t *imp,
 	pixman_iter_t iter;
 
     otherwise:
-	_pixman_implementation_src_iter_init (
+	_pixman_implementation_iter_init (
 	    imp, &iter, image, 0, 0, 1, 1,
 	    (uint8_t *)&result,
-	    ITER_NARROW, image->common.flags);
+	    ITER_NARROW | ITER_SRC, image->common.flags);
 	
 	result = *iter.get_scanline (&iter, NULL);
     }
diff --git a/pixman/pixman/pixman-implementation.c b/pixman/pixman/pixman-implementation.c
index cfb82bb1f..160847ad0 100644
--- a/pixman/pixman/pixman-implementation.c
+++ b/pixman/pixman/pixman-implementation.c
@@ -285,18 +285,26 @@ _pixman_implementation_fill (pixman_implementation_t *imp,
     return FALSE;
 }
 
-pixman_bool_t
-_pixman_implementation_src_iter_init (pixman_implementation_t	*imp,
-				      pixman_iter_t             *iter,
-				      pixman_image_t		*image,
-				      int			 x,
-				      int			 y,
-				      int			 width,
-				      int			 height,
-				      uint8_t			*buffer,
-				      iter_flags_t		 iter_flags,
-				      uint32_t                   image_flags)
+static uint32_t *
+get_scanline_null (pixman_iter_t *iter, const uint32_t *mask)
 {
+    return NULL;
+}
+
+void
+_pixman_implementation_iter_init (pixman_implementation_t *imp,
+                                  pixman_iter_t           *iter,
+                                  pixman_image_t          *image,
+                                  int                      x,
+                                  int                      y,
+                                  int                      width,
+                                  int                      height,
+                                  uint8_t                 *buffer,
+                                  iter_flags_t             iter_flags,
+                                  uint32_t                 image_flags)
+{
+    pixman_format_code_t format;
+
     iter->image = image;
     iter->buffer = (uint32_t *)buffer;
     iter->x = x;
@@ -306,47 +314,38 @@ _pixman_implementation_src_iter_init (pixman_implementation_t	*imp,
     iter->iter_flags = iter_flags;
     iter->image_flags = image_flags;
 
-    while (imp)
+    if (!iter->image)
     {
-	if (imp->src_iter_init && (*imp->src_iter_init) (imp, iter))
-	    return TRUE;
-
-	imp = imp->fallback;
+	iter->get_scanline = get_scanline_null;
+	return;
     }
 
-    return FALSE;
-}
-
-pixman_bool_t
-_pixman_implementation_dest_iter_init (pixman_implementation_t	*imp,
-				       pixman_iter_t            *iter,
-				       pixman_image_t		*image,
-				       int			 x,
-				       int			 y,
-				       int			 width,
-				       int			 height,
-				       uint8_t			*buffer,
-				       iter_flags_t		 iter_flags,
-				       uint32_t                  image_flags)
-{
-    iter->image = image;
-    iter->buffer = (uint32_t *)buffer;
-    iter->x = x;
-    iter->y = y;
-    iter->width = width;
-    iter->height = height;
-    iter->iter_flags = iter_flags;
-    iter->image_flags = image_flags;
+    format = iter->image->common.extended_format_code;
 
     while (imp)
     {
-	if (imp->dest_iter_init && (*imp->dest_iter_init) (imp, iter))
-	    return TRUE;
-
-	imp = imp->fallback;
+        if (imp->iter_info)
+        {
+            const pixman_iter_info_t *info;
+
+            for (info = imp->iter_info; info->format != PIXMAN_null; ++info)
+            {
+                if ((info->format == PIXMAN_any || info->format == format) &&
+                    (info->image_flags & image_flags) == info->image_flags &&
+                    (info->iter_flags & iter_flags) == info->iter_flags)
+                {
+                    iter->get_scanline = info->get_scanline;
+                    iter->write_back = info->write_back;
+
+                    if (info->initializer)
+                        info->initializer (iter, info);
+                    return;
+                }
+            }
+        }
+
+        imp = imp->fallback;
     }
-
-    return FALSE;
 }
 
 pixman_bool_t
diff --git a/pixman/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman/pixman-mips-dspr2-asm.S
index 3adbb2afe..866e93e58 100644
--- a/pixman/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman/pixman-mips-dspr2-asm.S
@@ -699,6 +699,127 @@ LEAF_MIPS_DSPR2(pixman_composite_src_0888_0565_rev_asm_mips)
 END(pixman_composite_src_0888_0565_rev_asm_mips)
 #endif
 
+LEAF_MIPS_DSPR2(pixman_composite_src_pixbuf_8888_asm_mips)
+/*
+ * a0 - dst  (a8b8g8r8)
+ * a1 - src  (a8r8g8b8)
+ * a2 - w
+ */
+
+    SAVE_REGS_ON_STACK 0, v0
+    li       v0, 0x00ff00ff
+
+    beqz     a2, 3f
+     nop
+    addiu    t1, a2, -1
+    beqz     t1, 2f
+     nop
+1:
+    lw       t0, 0(a1)
+    lw       t1, 4(a1)
+    addiu    a1, a1, 8
+    addiu    a2, a2, -2
+    srl      t2, t0, 24
+    srl      t3, t1, 24
+
+    MIPS_2xUN8x4_MUL_2xUN8 t0, t1, t2, t3, t0, t1, v0, t4, t5, t6, t7, t8, t9
+
+    sll      t0, t0, 8
+    sll      t1, t1, 8
+    andi     t2, t2, 0xff
+    andi     t3, t3, 0xff
+    or       t0, t0, t2
+    or       t1, t1, t3
+    wsbh     t0, t0
+    wsbh     t1, t1
+    rotr     t0, t0, 16
+    rotr     t1, t1, 16
+    sw       t0, 0(a0)
+    sw       t1, 4(a0)
+
+    addiu    t2, a2, -1
+    bgtz     t2, 1b
+     addiu   a0, a0, 8
+2:
+    beqz     a2, 3f
+     nop
+    lw       t0, 0(a1)
+    srl      t1, t0, 24
+
+    MIPS_UN8x4_MUL_UN8 t0, t1, t0, v0, t3, t4, t5
+
+    sll      t0, t0, 8
+    andi     t1, t1, 0xff
+    or       t0, t0, t1
+    wsbh     t0, t0
+    rotr     t0, t0, 16
+    sw       t0, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 0, v0
+    j        ra
+     nop
+
+END(pixman_composite_src_pixbuf_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_src_rpixbuf_8888_asm_mips)
+/*
+ * a0 - dst  (a8r8g8b8)
+ * a1 - src  (a8r8g8b8)
+ * a2 - w
+ */
+
+    SAVE_REGS_ON_STACK 0, v0
+    li       v0, 0x00ff00ff
+
+    beqz     a2, 3f
+     nop
+    addiu    t1, a2, -1
+    beqz     t1, 2f
+     nop
+1:
+    lw       t0, 0(a1)
+    lw       t1, 4(a1)
+    addiu    a1, a1, 8
+    addiu    a2, a2, -2
+    srl      t2, t0, 24
+    srl      t3, t1, 24
+
+    MIPS_2xUN8x4_MUL_2xUN8 t0, t1, t2, t3, t0, t1, v0, t4, t5, t6, t7, t8, t9
+
+    sll      t0, t0, 8
+    sll      t1, t1, 8
+    andi     t2, t2, 0xff
+    andi     t3, t3, 0xff
+    or       t0, t0, t2
+    or       t1, t1, t3
+    rotr     t0, t0, 8
+    rotr     t1, t1, 8
+    sw       t0, 0(a0)
+    sw       t1, 4(a0)
+
+    addiu    t2, a2, -1
+    bgtz     t2, 1b
+     addiu   a0, a0, 8
+2:
+    beqz     a2, 3f
+     nop
+    lw       t0, 0(a1)
+    srl      t1, t0, 24
+
+    MIPS_UN8x4_MUL_UN8 t0, t1, t0, v0, t3, t4, t5
+
+    sll      t0, t0, 8
+    andi     t1, t1, 0xff
+    or       t0, t0, t1
+    rotr     t0, t0, 8
+    sw       t0, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 0, v0
+    j        ra
+     nop
+
+END(pixman_composite_src_rpixbuf_8888_asm_mips)
+
 LEAF_MIPS_DSPR2(pixman_composite_src_n_8_8888_asm_mips)
 /*
  * a0 - dst  (a8r8g8b8)
@@ -840,34 +961,35 @@ LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_8888_ca_asm_mips)
  * a3 - w
  */
 
-    SAVE_REGS_ON_STACK 8, s0, s1, s2, s3, s4, s5
-    beqz         a3, 4f
+    beqz         a3, 8f
      nop
+    SAVE_REGS_ON_STACK 8, s0, s1, s2, s3, s4, s5
+
     li           t6, 0xff
     addiu        t7, zero, -1 /* t7 = 0xffffffff */
     srl          t8, a1, 24   /* t8 = srca */
     li           t9, 0x00ff00ff
+
     addiu        t1, a3, -1
-    beqz         t1, 3f       /* last pixel */
+    beqz         t1, 4f       /* last pixel */
      nop
-    beq          t8, t6, 2f   /* if (srca == 0xff) */
-     nop
-1:
-                              /* a1 = src */
+
+0:
     lw           t0, 0(a2)    /* t0 = mask */
     lw           t1, 4(a2)    /* t1 = mask */
+    addiu        a3, a3, -2   /* w = w - 2 */
     or           t2, t0, t1
-    beqz         t2, 12f      /* if (t0 == 0) && (t1 == 0) */
+    beqz         t2, 3f      /* if (t0 == 0) && (t1 == 0) */
      addiu       a2, a2, 8
-    and          t3, t0, t1
-    move         t4, a1       /* t4 = src */
-    move         t5, a1       /* t5 = src */
+    and          t2, t0, t1
+    beq          t2, t7, 1f  /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
+     nop
+
+//if(ma)
     lw           t2, 0(a0)    /* t2 = dst */
-    beq          t3, t7, 11f  /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
-     lw          t3, 4(a0)    /* t3 = dst */
+    lw           t3, 4(a0)    /* t3 = dst */
     MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, t4, t5, t9, s0, s1, s2, s3, s4, s5
     MIPS_2xUN8x4_MUL_2xUN8   t0, t1, t8, t8, t0, t1, t9, s0, s1, s2, s3, s4, s5
-11:
     not          t0, t0
     not          t1, t1
     MIPS_2xUN8x4_MUL_2xUN8x4 t2, t3, t0, t1, t2, t3, t9, s0, s1, s2, s3, s4, s5
@@ -875,62 +997,79 @@ LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_8888_ca_asm_mips)
     addu_s.qb    t3, t5, t3
     sw           t2, 0(a0)
     sw           t3, 4(a0)
-12:
-    addiu        a3, a3, -2
     addiu        t1, a3, -1
-    bgtz         t1, 1b
+    bgtz         t1, 0b
      addiu       a0, a0, 8
-    b            3f
+    b            4f
+     nop
+1:
+//if (t0 == 0xffffffff) && (t1 == 0xffffffff):
+    beq          t8, t6, 2f   /* if (srca == 0xff) */
      nop
-2:
-                              /* a1 = src */
-    lw           t0, 0(a2)    /* t0 = mask */
-    lw           t1, 4(a2)    /* t1 = mask */
-    or           t2, t0, t1
-    beqz         t2, 22f      /* if (t0 == 0) & (t1 == 0) */
-     addiu       a2, a2, 8
-    and          t2, t0, t1
-    move         t4, a1
-    beq          t2, t7, 21f  /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
-     move        t5, a1
     lw           t2, 0(a0)    /* t2 = dst */
     lw           t3, 4(a0)    /* t3 = dst */
-    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, t4, t5, t9, s0, s1, s2, s3, s4, s5
-    not          t0, t0
-    not          t1, t1
-    MIPS_2xUN8x4_MUL_2xUN8x4 t2, t3, t0, t1, t2, t3, t9, s0, s1, s2, s3, s4, s5
-    addu_s.qb    t4, t4, t2
-    addu_s.qb    t5, t5, t3
-21:
-    sw           t4, 0(a0)
-    sw           t5, 4(a0)
-22:
-    addiu        a3, a3, -2
+    not          t0, a1
+    not          t1, a1
+    srl          t0, t0, 24
+    srl          t1, t1, 24
+    MIPS_2xUN8x4_MUL_2xUN8 t2, t3, t0, t1, t2, t3, t9, s0, s1, s2, s3, s4, s5
+    addu_s.qb    t2, a1, t2
+    addu_s.qb    t3, a1, t3
+    sw           t2, 0(a0)
+    sw           t3, 4(a0)
     addiu        t1, a3, -1
-    bgtz         t1, 2b
+    bgtz         t1, 0b
      addiu       a0, a0, 8
+    b            4f
+     nop
+2:
+    sw           a1, 0(a0)
+    sw           a1, 4(a0)
 3:
-    blez         a3, 4f
+    addiu        t1, a3, -1
+    bgtz         t1, 0b
+     addiu       a0, a0, 8
+
+4:
+    beqz         a3, 7f
      nop
                               /* a1 = src */
-    lw           t1, 0(a2)    /* t1 = mask */
-    beqz         t1, 4f
+    lw           t0, 0(a2)    /* t0 = mask */
+    beqz         t0, 7f       /* if (t0 == 0) */
      nop
-    move         t2, a1       /* t2 = src */
-    beq          t1, t7, 31f
-     lw          t0, 0(a0)    /* t0 = dst */
-
-    MIPS_UN8x4_MUL_UN8x4  a1, t1, t2, t9, t3, t4, t5, t6
-    MIPS_UN8x4_MUL_UN8    t1, t8, t1, t9, t3, t4, t5
-31:
-    not          t1, t1
-    MIPS_UN8x4_MUL_UN8x4  t0, t1, t0, t9, t3, t4, t5, t6
-    addu_s.qb    t0, t2, t0
-    sw           t0, 0(a0)
-4:
+    beq          t0, t7, 5f  /* if (t0 == 0xffffffff) */
+     nop
+//if(ma)
+    lw           t1, 0(a0)    /* t1 = dst */
+    MIPS_UN8x4_MUL_UN8x4  a1, t0, t2, t9, t3, t4, t5, s0
+    MIPS_UN8x4_MUL_UN8    t0, t8, t0, t9, t3, t4, t5
+    not          t0, t0
+    MIPS_UN8x4_MUL_UN8x4  t1, t0, t1, t9, t3, t4, t5, s0
+    addu_s.qb    t1, t2, t1
+    sw           t1, 0(a0)
     RESTORE_REGS_FROM_STACK 8, s0, s1, s2, s3, s4, s5
     j            ra
      nop
+5:
+//if (t0 == 0xffffffff)
+    beq          t8, t6, 6f   /* if (srca == 0xff) */
+     nop
+    lw           t1, 0(a0)    /* t1 = dst */
+    not          t0, a1
+    srl          t0, t0, 24
+    MIPS_UN8x4_MUL_UN8 t1, t0, t1, t9, t2, t3, t4
+    addu_s.qb    t1, a1, t1
+    sw           t1, 0(a0)
+    RESTORE_REGS_FROM_STACK 8, s0, s1, s2, s3, s4, s5
+    j            ra
+     nop
+6:
+    sw           a1, 0(a0)
+7:
+    RESTORE_REGS_FROM_STACK 8, s0, s1, s2, s3, s4, s5
+8:
+    j            ra
+     nop
 
 END(pixman_composite_over_n_8888_8888_ca_asm_mips)
 
@@ -942,106 +1081,126 @@ LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_0565_ca_asm_mips)
  * a3 - w
  */
 
-    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8
-    beqz         a3, 4f
+    beqz         a3, 8f
      nop
-    li           t5, 0xf800f800
-    li           t6, 0x07e007e0
-    li           t7, 0x001F001F
-    li           t9, 0x00ff00ff
+    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8
 
+    li           t6, 0xff
+    addiu        t7, zero, -1 /* t7 = 0xffffffff */
     srl          t8, a1, 24   /* t8 = srca */
+    li           t9, 0x00ff00ff
+    li           s6, 0xf800f800
+    li           s7, 0x07e007e0
+    li           s8, 0x001F001F
+
     addiu        t1, a3, -1
-    beqz         t1, 3f       /* last pixel */
+    beqz         t1, 4f       /* last pixel */
      nop
-    li           s0, 0xff     /* s0 = 0xff */
-    addiu        s1, zero, -1 /* s1 = 0xffffffff */
 
-    beq          t8, s0, 2f   /* if (srca == 0xff) */
-     nop
-1:
-                              /* a1 = src */
+0:
     lw           t0, 0(a2)    /* t0 = mask */
     lw           t1, 4(a2)    /* t1 = mask */
+    addiu        a3, a3, -2   /* w = w - 2 */
     or           t2, t0, t1
-    beqz         t2, 12f      /* if (t0 == 0) && (t1 == 0) */
+    beqz         t2, 3f      /* if (t0 == 0) && (t1 == 0) */
      addiu       a2, a2, 8
-    and          t3, t0, t1
-    move         s2, a1       /* s2 = src */
-    move         s3, a1       /* s3 = src */
+    and          t2, t0, t1
+    beq          t2, t7, 1f  /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
+     nop
+
+//if(ma)
     lhu          t2, 0(a0)    /* t2 = dst */
-    beq          t3, s1, 11f  /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
-     lhu         t3, 2(a0)    /* t3 = dst */
-    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, s2, s3, t9, t4, s4, s5, s6, s7, s8
-    MIPS_2xUN8x4_MUL_2xUN8   t0, t1, t8, t8, t0, t1, t9, t4, s4, s5, s6, s7, s8
-11:
+    lhu          t3, 2(a0)    /* t3 = dst */
+    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, t4, t5, t9, s0, s1, s2, s3, s4, s5
+    MIPS_2xUN8x4_MUL_2xUN8   t0, t1, t8, t8, t0, t1, t9, s0, s1, s2, s3, s4, s5
     not          t0, t0
     not          t1, t1
-    CONVERT_2x0565_TO_2x8888 t2, t3, s4, s5, t6, t7, t4, s6, s7, s8
-    MIPS_2xUN8x4_MUL_2xUN8x4 s4, s5, t0, t1, s4, s5, t9, t4, s6, s7, s8, t0, t1
-    addu_s.qb    s2, s2, s4
-    addu_s.qb    s3, s3, s5
-    CONVERT_2x8888_TO_2x0565 s2, s3, t2, t3, t5, t6, t7, s4, s5
+    CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, s7, s8, s0, s1, s2, s3
+    MIPS_2xUN8x4_MUL_2xUN8x4 t2, t3, t0, t1, t2, t3, t9, s0, s1, s2, s3, s4, s5
+    addu_s.qb    t2, t4, t2
+    addu_s.qb    t3, t5, t3
+    CONVERT_2x8888_TO_2x0565 t2, t3, t2, t3, s6, s7, s8, s0, s1
     sh           t2, 0(a0)
     sh           t3, 2(a0)
-12:
-    addiu        a3, a3, -2
     addiu        t1, a3, -1
-    bgtz         t1, 1b
+    bgtz         t1, 0b
      addiu       a0, a0, 4
-    b            3f
+    b            4f
+     nop
+1:
+//if (t0 == 0xffffffff) && (t1 == 0xffffffff):
+    beq          t8, t6, 2f   /* if (srca == 0xff) */
      nop
-2:
-                              /* a1 = src */
-    lw           t0, 0(a2)    /* t0 = mask */
-    lw           t1, 4(a2)    /* t1 = mask */
-    or           t2, t0, t1
-    beqz         t2, 22f      /* if (t0 == 0) & (t1 == 0) */
-     addiu       a2, a2, 8
-    and          t3, t0, t1
-    move         t2, a1
-    beq          t3, s1, 21f  /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
-     move        t3, a1
     lhu          t2, 0(a0)    /* t2 = dst */
     lhu          t3, 2(a0)    /* t3 = dst */
-    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, s2, s3, t9, t4, s4, s5, s6, s7, s8
-    not          t0, t0
-    not          t1, t1
-    CONVERT_2x0565_TO_2x8888 t2, t3, s4, s5, t6, t7, t4, s6, s7, s8
-    MIPS_2xUN8x4_MUL_2xUN8x4 s4, s5, t0, t1, s4, s5, t9, t4, s6, s7, s8, t2, t3
-    addu_s.qb    t2, s2, s4
-    addu_s.qb    t3, s3, s5
-21:
-    CONVERT_2x8888_TO_2x0565 t2, t3, t0, t1, t5, t6, t7, s2, s3
-    sh           t0, 0(a0)
-    sh           t1, 2(a0)
-22:
-    addiu        a3, a3, -2
+    not          t0, a1
+    not          t1, a1
+    srl          t0, t0, 24
+    srl          t1, t1, 24
+    CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, s7, s8, s0, s1, s2, s3
+    MIPS_2xUN8x4_MUL_2xUN8   t2, t3, t0, t1, t2, t3, t9, s0, s1, s2, s3, s4, s5
+    addu_s.qb    t2, a1, t2
+    addu_s.qb    t3, a1, t3
+    CONVERT_2x8888_TO_2x0565 t2, t3, t2, t3, s6, s7, s8, s0, s1
+    sh           t2, 0(a0)
+    sh           t3, 2(a0)
     addiu        t1, a3, -1
-    bgtz         t1, 2b
+    bgtz         t1, 0b
      addiu       a0, a0, 4
+    b            4f
+     nop
+2:
+    CONVERT_1x8888_TO_1x0565 a1, t2, s0, s1
+    sh           t2, 0(a0)
+    sh           t2, 2(a0)
 3:
-    blez         a3, 4f
+    addiu        t1, a3, -1
+    bgtz         t1, 0b
+     addiu       a0, a0, 4
+
+4:
+    beqz         a3, 7f
      nop
                               /* a1 = src */
-    lw           t1, 0(a2)    /* t1 = mask */
-    beqz         t1, 4f
+    lw           t0, 0(a2)    /* t0 = mask */
+    beqz         t0, 7f       /* if (t0 == 0) */
      nop
-    move         t2, a1       /* t2 = src */
-    beq          t1, t7, 31f
-     lhu         t0, 0(a0)    /* t0 = dst */
-
-    MIPS_UN8x4_MUL_UN8x4     a1, t1, t2, t9, t3, t4, t5, t6
-    MIPS_UN8x4_MUL_UN8       t1, t8, t1, t9, t3, t4, t5
-31:
-    not          t1, t1
-    CONVERT_1x0565_TO_1x8888 t0, s1, s2, s3
-    MIPS_UN8x4_MUL_UN8x4     s1, t1, t3, t9, t4, t5, t6, t7
-    addu_s.qb    t0, t2, t3
-    CONVERT_1x8888_TO_1x0565 t0, s1, s2, s3
-    sh           s1, 0(a0)
-4:
-    RESTORE_REGS_FROM_STACK  20, s0, s1, s2, s3, s4, s5, s6, s7, s8
+    beq          t0, t7, 5f  /* if (t0 == 0xffffffff) */
+     nop
+//if(ma)
+    lhu          t1, 0(a0)    /* t1 = dst */
+    MIPS_UN8x4_MUL_UN8x4     a1, t0, t2, t9, t3, t4, t5, s0
+    MIPS_UN8x4_MUL_UN8       t0, t8, t0, t9, t3, t4, t5
+    not          t0, t0
+    CONVERT_1x0565_TO_1x8888 t1, s1, s2, s3
+    MIPS_UN8x4_MUL_UN8x4     s1, t0, s1, t9, t3, t4, t5, s0
+    addu_s.qb    s1, t2, s1
+    CONVERT_1x8888_TO_1x0565 s1, t1, s0, s2
+    sh           t1, 0(a0)
+    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8
+    j            ra
+     nop
+5:
+//if (t0 == 0xffffffff)
+    beq          t8, t6, 6f   /* if (srca == 0xff) */
+     nop
+    lhu          t1, 0(a0)    /* t1 = dst */
+    not          t0, a1
+    srl          t0, t0, 24
+    CONVERT_1x0565_TO_1x8888 t1, s1, s2, s3
+    MIPS_UN8x4_MUL_UN8       s1, t0, s1, t9, t2, t3, t4
+    addu_s.qb    s1, a1, s1
+    CONVERT_1x8888_TO_1x0565 s1, t1, s0, s2
+    sh           t1, 0(a0)
+    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8
+    j            ra
+     nop
+6:
+    CONVERT_1x8888_TO_1x0565 a1, t1, s0, s2
+    sh           t1, 0(a0)
+7:
+    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8
+8:
     j            ra
      nop
 
@@ -2936,101 +3095,265 @@ END(pixman_composite_over_reverse_n_8888_asm_mips)
 LEAF_MIPS_DSPR2(pixman_composite_in_n_8_asm_mips)
 /*
  * a0 - dst  (a8)
- * a1 - src  (a8r8g8b8)
+ * a1 - src  (32bit constant)
  * a2 - w
  */
 
-    beqz              a2, 5f
+    li                t9, 0x00ff00ff
+    beqz              a2, 3f
      nop
-
-    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
-    move              t7, a1
-    srl               t5, t7, 24
-    replv.ph          t5, t5
-    srl               t9, a2, 2   /* t1 = how many multiples of 4 src pixels */
-    beqz              t9, 2f      /* branch if less than 4 src pixels */
+    srl               t7, a2, 2   /* t7 = how many multiples of 4 dst pixels */
+    beqz              t7, 1f      /* branch if less than 4 src pixels */
      nop
 
-1:
-    addiu             t9, t9, -1
-    addiu             a2, a2, -4
+    srl               t8, a1, 24
+    replv.ph          t8, t8
+
+0:
+    beqz              t7, 1f
+     addiu            t7, t7, -1
     lbu               t0, 0(a0)
     lbu               t1, 1(a0)
     lbu               t2, 2(a0)
     lbu               t3, 3(a0)
 
-    muleu_s.ph.qbl    s0, t0, t5
-    muleu_s.ph.qbr    s1, t0, t5
-    muleu_s.ph.qbl    s2, t1, t5
-    muleu_s.ph.qbr    s3, t1, t5
-    muleu_s.ph.qbl    s4, t2, t5
-    muleu_s.ph.qbr    s5, t2, t5
-    muleu_s.ph.qbl    s6, t3, t5
-    muleu_s.ph.qbr    s7, t3, t5
-
-    shrl.ph           t4, s0, 8
-    shrl.ph           t6, s1, 8
-    shrl.ph           t7, s2, 8
-    shrl.ph           t8, s3, 8
-    addq.ph           t0, s0, t4
-    addq.ph           t1, s1, t6
-    addq.ph           t2, s2, t7
-    addq.ph           t3, s3, t8
-    shra_r.ph         t0, t0, 8
-    shra_r.ph         t1, t1, 8
+    precr_sra.ph.w    t1, t0, 0
+    precr_sra.ph.w    t3, t2, 0
+    precr.qb.ph       t0, t3, t1
+
+    muleu_s.ph.qbl    t2, t0, t8
+    muleu_s.ph.qbr    t3, t0, t8
+    shra_r.ph         t4, t2, 8
+    shra_r.ph         t5, t3, 8
+    and               t4, t4, t9
+    and               t5, t5, t9
+    addq.ph           t2, t2, t4
+    addq.ph           t3, t3, t5
     shra_r.ph         t2, t2, 8
     shra_r.ph         t3, t3, 8
-    shrl.ph           t4, s4, 8
-    shrl.ph           t6, s5, 8
-    shrl.ph           t7, s6, 8
-    shrl.ph           t8, s7, 8
-    addq.ph           s0, s4, t4
-    addq.ph           s1, s5, t6
-    addq.ph           s2, s6, t7
-    addq.ph           s3, s7, t8
-    shra_r.ph         t4, s0, 8
-    shra_r.ph         t6, s1, 8
-    shra_r.ph         t7, s2, 8
-    shra_r.ph         t8, s3, 8
-
-    precr.qb.ph       s0, t0, t1
-    precr.qb.ph       s1, t2, t3
-    precr.qb.ph       s2, t4, t6
-    precr.qb.ph       s3, t7, t8
+    precr.qb.ph       t2, t2, t3
 
-    sb                s0, 0(a0)
-    sb                s1, 1(a0)
-    sb                s2, 2(a0)
-    sb                s3, 3(a0)
-    bgtz              t9, 1b
+    sb                t2, 0(a0)
+    srl               t2, t2, 8
+    sb                t2, 1(a0)
+    srl               t2, t2, 8
+    sb                t2, 2(a0)
+    srl               t2, t2, 8
+    sb                t2, 3(a0)
+    addiu             a2, a2, -4
+    b                 0b
      addiu            a0, a0, 4
-2:
-    beqz              a2, 4f
+
+1:
+    beqz              a2, 3f
      nop
-3:
-    lbu               t1, 0(a0)
+    srl               t8, a1, 24
+2:
+    lbu               t0, 0(a0)
+
+    mul               t2, t0, t8
+    shra_r.ph         t3, t2, 8
+    andi              t3, t3, 0x00ff
+    addq.ph           t2, t2, t3
+    shra_r.ph         t2, t2, 8
 
-    muleu_s.ph.qbl    t4, t1, t5
-    muleu_s.ph.qbr    t7, t1, t5
-    shrl.ph           t6, t4, 8
-    shrl.ph           t0, t7, 8
-    addq.ph           t8, t4, t6
-    addq.ph           t9, t7, t0
-    shra_r.ph         t8, t8, 8
-    shra_r.ph         t9, t9, 8
-    precr.qb.ph       t2, t8, t9
     sb                t2, 0(a0)
     addiu             a2, a2, -1
-    bnez              a2, 3b
+    bnez              a2, 2b
      addiu            a0, a0, 1
-4:
-    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
-5:
+
+3:
     j                 ra
      nop
 
 END(pixman_composite_in_n_8_asm_mips)
 
+LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_8888_OVER_asm_mips)
+/*
+ * a0     - dst  (a8r8g8b8)
+ * a1     - src  (a8r8g8b8)
+ * a2     - w
+ * a3     - vx
+ * 16(sp) - unit_x
+ */
+
+    SAVE_REGS_ON_STACK 0, s0, s1, s2, s3
+    lw       t8, 16(sp) /* t8 = unit_x */
+    li       t6, 0x00ff00ff
+    beqz     a2, 3f
+     nop
+    addiu    t1, a2, -1
+    beqz     t1, 2f
+     nop
+1:
+    sra      t0, a3, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 2  /* t0 = t0 * 4 (a8r8g8b8) */
+    addu     t0, a1, t0
+    lw       t0, 0(t0)  /* t0 = source      (a8r8g8b8) */
+    addu     a3, a3, t8 /* a3 = vx + unit_x */
+
+    sra      t1, a3, 16 /* t0 = vx >> 16 */
+    sll      t1, t1, 2  /* t0 = t0 * 4 (a8r8g8b8) */
+    addu     t1, a1, t1
+    lw       t1, 0(t1)  /* t1 = source      (a8r8g8b8) */
+    addu     a3, a3, t8 /* a3 = vx + unit_x */
+
+    lw       t2, 0(a0)  /* t2 = destination (a8r8g8b8) */
+    lw       t3, 4(a0)  /* t3 = destination (a8r8g8b8) */
+
+    OVER_2x8888_2x8888 t0, t1, t2, t3, t4, t5, t6, t7, t9, s0, s1, s2, s3
+
+    sw       t4, 0(a0)
+    sw       t5, 4(a0)
+    addiu    a2, a2, -2
+    addiu    t1, a2, -1
+    bgtz     t1, 1b
+     addiu   a0, a0, 8
+2:
+    beqz     a2, 3f
+     nop
+    sra      t0, a3, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 2  /* t0 = t0 * 4 (a8r8g8b8) */
+    addu     t0, a1, t0
+    lw       t0, 0(t0)  /* t0 = source      (a8r8g8b8) */
+    lw       t1, 0(a0)  /* t1 = destination (a8r8g8b8) */
+    addu     a3, a3, t8 /* a3 = vx + unit_x */
+
+    OVER_8888_8888 t0, t1, t2, t6, t4, t5, t3, t7
+
+    sw       t2, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 0, s0, s1, s2, s3
+    j        ra
+     nop
+
+END(pixman_scaled_nearest_scanline_8888_8888_OVER_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_0565_OVER_asm_mips)
+/*
+ * a0     - dst  (r5g6b5)
+ * a1     - src  (a8r8g8b8)
+ * a2     - w
+ * a3     - vx
+ * 16(sp) - unit_x
+ */
+
+    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, v0, v1
+    lw       t8, 40(sp) /* t8 = unit_x */
+    li       t4, 0x00ff00ff
+    li       t5, 0xf800f800
+    li       t6, 0x07e007e0
+    li       t7, 0x001F001F
+    beqz     a2, 3f
+     nop
+    addiu    t1, a2, -1
+    beqz     t1, 2f
+     nop
+1:
+    sra      t0, a3, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 2  /* t0 = t0 * 4 (a8r8g8b8) */
+    addu     t0, a1, t0
+    lw       t0, 0(t0)  /* t0 = source      (a8r8g8b8) */
+    addu     a3, a3, t8 /* a3 = vx + unit_x */
+    sra      t1, a3, 16 /* t0 = vx >> 16 */
+    sll      t1, t1, 2  /* t0 = t0 * 4 (a8r8g8b8) */
+    addu     t1, a1, t1
+    lw       t1, 0(t1)  /* t1 = source      (a8r8g8b8) */
+    addu     a3, a3, t8 /* a3 = vx + unit_x */
+    lhu      t2, 0(a0)  /* t2 = destination (r5g6b5) */
+    lhu      t3, 2(a0)  /* t3 = destination (r5g6b5) */
+
+    CONVERT_2x0565_TO_2x8888 t2, t3, v0, v1, t6, t7, s0, s1, s2, s3
+    OVER_2x8888_2x8888       t0, t1, v0, v1, t2, t3, t4, t9, s0, s1, s2, s3, s4
+    CONVERT_2x8888_TO_2x0565 t2, t3, v0, v1, t5, t6, t7, t9, s2
+
+    sh       v0, 0(a0)
+    sh       v1, 2(a0)
+    addiu    a2, a2, -2
+    addiu    t1, a2, -1
+    bgtz     t1, 1b
+     addiu   a0, a0, 4
+2:
+    beqz     a2, 3f
+     nop
+    sra      t0, a3, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 2  /* t0 = t0 * 4 (a8r8g8b8) */
+    addu     t0, a1, t0
+    lw       t0, 0(t0)  /* t0 = source      (a8r8g8b8) */
+    lhu      t1, 0(a0)  /* t1 = destination (r5g6b5) */
+    addu     a3, a3, t8 /* a3 = vx + unit_x */
+
+    CONVERT_1x0565_TO_1x8888 t1, t2, t5, t6
+    OVER_8888_8888           t0, t2, t1, t4, t3, t5, t6, t7
+    CONVERT_1x8888_TO_1x0565 t1, t2, t5, t6
+
+    sh       t2, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, v0, v1
+    j        ra
+     nop
+
+END(pixman_scaled_nearest_scanline_8888_0565_OVER_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_0565_8888_SRC_asm_mips)
+/*
+ * a0     - dst (a8r8g8b8)
+ * a1     - src (r5g6b5)
+ * a2     - w
+ * a3     - vx
+ * 16(sp) - unit_x
+ */
+
+    SAVE_REGS_ON_STACK 0, v0
+    beqz     a2, 3f
+     nop
+
+    lw       v0, 16(sp) /* v0 = unit_x */
+    addiu    t1, a2, -1
+    beqz     t1, 2f
+     nop
+
+    li       t4, 0x07e007e0
+    li       t5, 0x001F001F
+1:
+    sra      t0, a3, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 1  /* t0 = t0 * 2 ((r5g6b5)) */
+    addu     t0, a1, t0
+    lhu      t0, 0(t0)  /* t0 = source ((r5g6b5)) */
+    addu     a3, a3, v0 /* a3 = vx + unit_x */
+    sra      t1, a3, 16 /* t1 = vx >> 16 */
+    sll      t1, t1, 1  /* t1 = t1 * 2 ((r5g6b5)) */
+    addu     t1, a1, t1
+    lhu      t1, 0(t1)  /* t1 = source ((r5g6b5)) */
+    addu     a3, a3, v0 /* a3 = vx + unit_x */
+    addiu    a2, a2, -2
+
+    CONVERT_2x0565_TO_2x8888 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9
+
+    sw       t2, 0(a0)
+    sw       t3, 4(a0)
+
+    addiu    t2, a2, -1
+    bgtz     t2, 1b
+     addiu   a0, a0, 8
+2:
+    beqz     a2, 3f
+     nop
+    sra      t0, a3, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 1  /* t0 = t0 * 2 ((r5g6b5)) */
+    addu     t0, a1, t0
+    lhu      t0, 0(t0)  /* t0 = source ((r5g6b5)) */
+
+    CONVERT_1x0565_TO_1x8888 t0, t1, t2, t3
+
+    sw       t1, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 0, v0
+    j        ra
+     nop
+
+END(pixman_scaled_nearest_scanline_0565_8888_SRC_asm_mips)
+
 LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_mips)
 /*
  * a0     - dst  (r5g6b5)
diff --git a/pixman/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman/pixman-mips-dspr2-asm.h
index b330c0f0d..cab122d80 100644
--- a/pixman/pixman/pixman-mips-dspr2-asm.h
+++ b/pixman/pixman/pixman-mips-dspr2-asm.h
@@ -354,17 +354,16 @@ LEAF_MIPS32R2(symbol)                                   \
                                 out1_565, out2_565,  \
                                 maskR, maskG, maskB, \
                                 scratch1, scratch2
-    precrq.ph.w       \scratch1, \in2_8888, \in1_8888
-    precr_sra.ph.w    \in2_8888, \in1_8888, 0
-    shll.ph           \scratch1, \scratch1, 8
-    srl               \in2_8888, \in2_8888, 3
-    and               \scratch2, \in2_8888, \maskB
-    and               \scratch1, \scratch1, \maskR
-    srl               \in2_8888, \in2_8888, 2
-    and               \out2_565, \in2_8888, \maskG
-    or                \out2_565, \out2_565, \scratch2
-    or                \out1_565, \out2_565, \scratch1
-    srl               \out2_565, \out1_565, 16
+    precr.qb.ph    \scratch1, \in2_8888, \in1_8888
+    precrq.qb.ph   \in2_8888, \in2_8888, \in1_8888
+    and            \out1_565, \scratch1, \maskR
+    shrl.ph        \scratch1, \scratch1, 3
+    shll.ph        \in2_8888, \in2_8888, 3
+    and            \scratch1, \scratch1, \maskB
+    or             \out1_565, \out1_565, \scratch1
+    and            \in2_8888, \in2_8888, \maskG
+    or             \out1_565, \out1_565, \in2_8888
+    srl            \out2_565, \out1_565, 16
 .endm
 
 /*
@@ -587,6 +586,36 @@ LEAF_MIPS32R2(symbol)                                   \
     addu_s.qb          \out_8888, \out_8888, \s_8888
 .endm
 
+/*
+ * OVER operation on two a8r8g8b8 source pixels (s1_8888 and s2_8888) and two
+ * a8r8g8b8 destination pixels (d1_8888 and d2_8888). It also requires maskLSR
+ * needed for rounding process. maskLSR must have following value:
+ *   li       maskLSR, 0x00ff00ff
+ */
+.macro OVER_2x8888_2x8888 s1_8888,   \
+                          s2_8888,   \
+                          d1_8888,   \
+                          d2_8888,   \
+                          out1_8888, \
+                          out2_8888, \
+                          maskLSR,   \
+                          scratch1, scratch2, scratch3, \
+                          scratch4, scratch5, scratch6
+    not                    \scratch1,  \s1_8888
+    srl                    \scratch1,  \scratch1,  24
+    not                    \scratch2,  \s2_8888
+    srl                    \scratch2,  \scratch2,  24
+    MIPS_2xUN8x4_MUL_2xUN8 \d1_8888,   \d2_8888, \
+                           \scratch1,  \scratch2,  \
+                           \out1_8888, \out2_8888, \
+                           \maskLSR, \
+                           \scratch3,  \scratch4, \scratch5, \
+                           \scratch6,  \d1_8888,  \d2_8888
+
+    addu_s.qb              \out1_8888, \out1_8888, \s1_8888
+    addu_s.qb              \out2_8888, \out2_8888, \s2_8888
+.endm
+
 .macro MIPS_UN8x4_MUL_UN8_ADD_UN8x4 s_8888,   \
                                     m_8,      \
                                     d_8888,   \
diff --git a/pixman/pixman/pixman-mips-dspr2.c b/pixman/pixman/pixman-mips-dspr2.c
index 1ea244576..e10c9df0a 100644
--- a/pixman/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman/pixman-mips-dspr2.c
@@ -54,6 +54,10 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_0888_8888_rev,
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_0888_0565_rev,
                                     uint8_t, 3, uint16_t, 1)
 #endif
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_pixbuf_8888,
+                                    uint32_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_rpixbuf_8888,
+                                    uint32_t, 1, uint32_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_8888,
                                     uint32_t, 1, uint32_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_0565,
@@ -121,6 +125,13 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_0565_8_0565, uint16_t, 1,
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8888_8888, uint32_t, 1,
                                          uint32_t, 1, uint32_t, 1)
 
+PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST (8888_8888, OVER,
+                                         uint32_t, uint32_t)
+PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST (8888_0565, OVER,
+                                         uint32_t, uint16_t)
+PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST (0565_8888, SRC,
+                                         uint16_t, uint32_t)
+
 PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (0, 8888_8888, SRC,
                                           uint32_t, uint32_t)
 PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (0, 8888_0565, SRC,
@@ -292,6 +303,10 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
     PIXMAN_STD_FAST_PATH (SRC, b8g8r8,   null, x8r8g8b8, mips_composite_src_0888_8888_rev),
     PIXMAN_STD_FAST_PATH (SRC, b8g8r8,   null, r5g6b5,   mips_composite_src_0888_0565_rev),
 #endif
+    PIXMAN_STD_FAST_PATH (SRC, pixbuf,   pixbuf,  a8r8g8b8, mips_composite_src_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC, pixbuf,   pixbuf,  a8b8g8r8, mips_composite_src_rpixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC, rpixbuf,  rpixbuf, a8r8g8b8, mips_composite_src_rpixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC, rpixbuf,  rpixbuf, a8b8g8r8, mips_composite_src_pixbuf_8888),
     PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   a8r8g8b8, mips_composite_src_n_8_8888),
     PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   x8r8g8b8, mips_composite_src_n_8_8888),
     PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   a8b8g8r8, mips_composite_src_n_8_8888),
@@ -357,6 +372,22 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, mips_composite_over_reverse_n_8888),
     PIXMAN_STD_FAST_PATH (IN,           solid, null, a8,       mips_composite_in_n_8),
 
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mips_8888_8888),
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mips_8888_8888),
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mips_8888_8888),
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mips_8888_8888),
+
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, mips_8888_0565),
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, b5g6r5, mips_8888_0565),
+
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, x8b8g8r8, mips_0565_8888),
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, x8r8g8b8, mips_0565_8888),
+    /* Note: NONE repeat is not supported yet */
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, r5g6b5, a8r8g8b8, mips_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, b5g6r5, a8b8g8r8, mips_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, r5g6b5, a8r8g8b8, mips_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, b5g6r5, a8b8g8r8, mips_0565_8888),
+
     PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, mips_8888_8_0565),
     PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, mips_8888_8_0565),
 
diff --git a/pixman/pixman/pixman-mips-dspr2.h b/pixman/pixman/pixman-mips-dspr2.h
index 4ac9ff95d..955ed70b8 100644
--- a/pixman/pixman/pixman-mips-dspr2.h
+++ b/pixman/pixman/pixman-mips-dspr2.h
@@ -246,6 +246,48 @@ mips_composite_##name (pixman_implementation_t *imp,                     \
     }                                                                    \
 }
 
+/****************************************************************************/
+
+#define PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST(name, op,                    \
+                                                src_type, dst_type)          \
+void                                                                         \
+pixman_scaled_nearest_scanline_##name##_##op##_asm_mips (                    \
+                                                   dst_type *       dst,     \
+                                                   const src_type * src,     \
+                                                   int32_t          w,       \
+                                                   pixman_fixed_t   vx,      \
+                                                   pixman_fixed_t   unit_x); \
+                                                                             \
+static force_inline void                                                     \
+scaled_nearest_scanline_mips_##name##_##op (dst_type *       pd,             \
+                                            const src_type * ps,             \
+                                            int32_t          w,              \
+                                            pixman_fixed_t   vx,             \
+                                            pixman_fixed_t   unit_x,         \
+                                            pixman_fixed_t   max_vx,         \
+                                            pixman_bool_t    zero_src)       \
+{                                                                            \
+    pixman_scaled_nearest_scanline_##name##_##op##_asm_mips (pd, ps, w,      \
+                                                             vx, unit_x);    \
+}                                                                            \
+                                                                             \
+FAST_NEAREST_MAINLOOP (mips_##name##_cover_##op,                             \
+                       scaled_nearest_scanline_mips_##name##_##op,           \
+                       src_type, dst_type, COVER)                            \
+FAST_NEAREST_MAINLOOP (mips_##name##_none_##op,                              \
+                       scaled_nearest_scanline_mips_##name##_##op,           \
+                       src_type, dst_type, NONE)                             \
+FAST_NEAREST_MAINLOOP (mips_##name##_pad_##op,                               \
+                       scaled_nearest_scanline_mips_##name##_##op,           \
+                       src_type, dst_type, PAD)
+
+/* Provide entries for the fast path table */
+#define PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH(op,s,d,func)                    \
+    SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),                            \
+    SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),                             \
+    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func)
+
+
 /*****************************************************************************/
 
 #define PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_A8_DST(flags, name, op,           \
diff --git a/pixman/pixman/pixman-mmx.c b/pixman/pixman/pixman-mmx.c
index 14790c029..c94d282a9 100644
--- a/pixman/pixman/pixman-mmx.c
+++ b/pixman/pixman/pixman-mmx.c
@@ -301,6 +301,29 @@ negate (__m64 mask)
     return _mm_xor_si64 (mask, MC (4x00ff));
 }
 
+/* Computes the product of two unsigned fixed-point 8-bit values from 0 to 1
+ * and maps its result to the same range.
+ *
+ * Jim Blinn gives multiple ways to compute this in "Jim Blinn's Corner:
+ * Notation, Notation, Notation", the first of which is
+ *
+ *   prod(a, b) = (a * b + 128) / 255.
+ *
+ * By approximating the division by 255 as 257/65536 it can be replaced by a
+ * multiply and a right shift. This is the implementation that we use in
+ * pix_multiply(), but we _mm_mulhi_pu16() by 257 (part of SSE1 or Extended
+ * 3DNow!, and unavailable at the time of the book's publication) to perform
+ * the multiplication and right shift in a single operation.
+ *
+ *   prod(a, b) = ((a * b + 128) * 257) >> 16.
+ *
+ * A third way (how pix_multiply() was implemented prior to 14208344) exists
+ * also that performs the multiplication by 257 with adds and shifts.
+ *
+ * Where temp = a * b + 128
+ *
+ *   prod(a, b) = (temp + (temp >> 8)) >> 8.
+ */
 static force_inline __m64
 pix_multiply (__m64 a, __m64 b)
 {
@@ -3899,52 +3922,23 @@ mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
     return iter->buffer;
 }
 
-typedef struct
-{
-    pixman_format_code_t	format;
-    pixman_iter_get_scanline_t	get_scanline;
-} fetcher_info_t;
-
-static const fetcher_info_t fetchers[] =
-{
-    { PIXMAN_x8r8g8b8,		mmx_fetch_x8r8g8b8 },
-    { PIXMAN_r5g6b5,		mmx_fetch_r5g6b5 },
-    { PIXMAN_a8,		mmx_fetch_a8 },
-    { PIXMAN_null }
-};
-
-static pixman_bool_t
-mmx_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
-{
-    pixman_image_t *image = iter->image;
-
-#define FLAGS								\
+#define IMAGE_FLAGS							\
     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
      FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
 
-    if ((iter->iter_flags & ITER_NARROW)			&&
-	(iter->image_flags & FLAGS) == FLAGS)
-    {
-	const fetcher_info_t *f;
-
-	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
-	{
-	    if (image->common.extended_format_code == f->format)
-	    {
-		uint8_t *b = (uint8_t *)image->bits.bits;
-		int s = image->bits.rowstride * 4;
-
-		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
-		iter->stride = s;
-
-		iter->get_scanline = f->get_scanline;
-		return TRUE;
-	    }
-	}
-    }
-
-    return FALSE;
-}
+static const pixman_iter_info_t mmx_iters[] = 
+{
+    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, mmx_fetch_x8r8g8b8, NULL
+    },
+    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, mmx_fetch_r5g6b5, NULL
+    },
+    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, mmx_fetch_a8, NULL
+    },
+    { PIXMAN_null },
+};
 
 static const pixman_fast_path_t mmx_fast_paths[] =
 {
@@ -4074,7 +4068,7 @@ _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
     imp->blt = mmx_blt;
     imp->fill = mmx_fill;
 
-    imp->src_iter_init = mmx_src_iter_init;
+    imp->iter_info = mmx_iters;
 
     return imp;
 }
diff --git a/pixman/pixman/pixman-noop.c b/pixman/pixman/pixman-noop.c
index e39996d9d..e59890492 100644
--- a/pixman/pixman/pixman-noop.c
+++ b/pixman/pixman/pixman-noop.c
@@ -37,12 +37,6 @@ noop_composite (pixman_implementation_t *imp,
     return;
 }
 
-static void
-dest_write_back_direct (pixman_iter_t *iter)
-{
-    iter->buffer += iter->image->bits.rowstride;
-}
-
 static uint32_t *
 noop_get_scanline (pixman_iter_t *iter, const uint32_t *mask)
 {
@@ -53,110 +47,102 @@ noop_get_scanline (pixman_iter_t *iter, const uint32_t *mask)
     return result;
 }
 
-static uint32_t *
-get_scanline_null (pixman_iter_t *iter, const uint32_t *mask)
-{
-    return NULL;
+static void
+noop_init_solid_narrow (pixman_iter_t *iter,
+			const pixman_iter_info_t *info)
+{ 
+    pixman_image_t *image = iter->image;
+    uint32_t *buffer = iter->buffer;
+    uint32_t *end = buffer + iter->width;
+    uint32_t color;
+
+    if (iter->image->type == SOLID)
+	color = image->solid.color_32;
+    else
+	color = image->bits.fetch_pixel_32 (&image->bits, 0, 0);
+
+    while (buffer < end)
+	*(buffer++) = color;
 }
 
-static pixman_bool_t
-noop_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+static void
+noop_init_solid_wide (pixman_iter_t *iter,
+		      const pixman_iter_info_t *info)
 {
     pixman_image_t *image = iter->image;
+    argb_t *buffer = (argb_t *)iter->buffer;
+    argb_t *end = buffer + iter->width;
+    argb_t color;
 
-#define FLAGS						\
-    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
-
-    if (!image)
-    {
-	iter->get_scanline = get_scanline_null;
-    }
-    else if ((iter->iter_flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) ==
-	     (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB))
-    {
-	iter->get_scanline = _pixman_iter_get_scanline_noop;
-    }
-    else if (image->common.extended_format_code == PIXMAN_solid		&&
-	     (iter->image->type == SOLID ||
-	      (iter->image_flags & FAST_PATH_NO_ALPHA_MAP)))
-    {
-	if (iter->iter_flags & ITER_NARROW)
-	{
-	    uint32_t *buffer = iter->buffer;
-	    uint32_t *end = buffer + iter->width;
-	    uint32_t color;
-
-	    if (image->type == SOLID)
-		color = image->solid.color_32;
-	    else
-		color = image->bits.fetch_pixel_32 (&image->bits, 0, 0);
-
-	    while (buffer < end)
-		*(buffer++) = color;
-	}
-	else
-	{
-	    argb_t *buffer = (argb_t *)iter->buffer;
-	    argb_t *end = buffer + iter->width;
-	    argb_t color;
-
-	    if (image->type == SOLID)
-		color = image->solid.color_float;
-	    else
-		color = image->bits.fetch_pixel_float (&image->bits, 0, 0);
-
-	    while (buffer < end)
-		*(buffer++) = color;
-	}
-
-	iter->get_scanline = _pixman_iter_get_scanline_noop;
-    }
-    else if (image->common.extended_format_code == PIXMAN_a8r8g8b8	&&
-	     (iter->iter_flags & ITER_NARROW)				&&
-	     (iter->image_flags & FLAGS) == FLAGS			&&
-	     iter->x >= 0 && iter->y >= 0				&&
-	     iter->x + iter->width <= image->bits.width			&&
-	     iter->y + iter->height <= image->bits.height)
-    {
-	iter->buffer =
-	    image->bits.bits + iter->y * image->bits.rowstride + iter->x;
-
-	iter->get_scanline = noop_get_scanline;
-    }
+    if (iter->image->type == SOLID)
+	color = image->solid.color_float;
     else
-    {
-	return FALSE;
-    }
+	color = image->bits.fetch_pixel_float (&image->bits, 0, 0);
 
-    return TRUE;
+    while (buffer < end)
+	*(buffer++) = color;
 }
 
-static pixman_bool_t
-noop_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+static void
+noop_init_direct_buffer (pixman_iter_t *iter, const pixman_iter_info_t *info)
 {
     pixman_image_t *image = iter->image;
-    uint32_t image_flags = iter->image_flags;
-    uint32_t iter_flags = iter->iter_flags;
-    
-    if ((image_flags & FAST_PATH_STD_DEST_FLAGS) == FAST_PATH_STD_DEST_FLAGS	&&
-	(iter_flags & ITER_NARROW) == ITER_NARROW				&&
-	((image->common.extended_format_code == PIXMAN_a8r8g8b8)	||
-	 (image->common.extended_format_code == PIXMAN_x8r8g8b8 &&
-	  (iter_flags & (ITER_LOCALIZED_ALPHA)))))
-    {
-	iter->buffer = image->bits.bits + iter->y * image->bits.rowstride + iter->x;
-
-	iter->get_scanline = _pixman_iter_get_scanline_noop;
-	iter->write_back = dest_write_back_direct;
-
-	return TRUE;
-    }
-    else
-    {
-	return FALSE;
-    }
+
+    iter->buffer =
+	image->bits.bits + iter->y * image->bits.rowstride + iter->x;
 }
 
+static void
+dest_write_back_direct (pixman_iter_t *iter)
+{
+    iter->buffer += iter->image->bits.rowstride;
+}
+
+static const pixman_iter_info_t noop_iters[] =
+{
+    /* Source iters */
+    { PIXMAN_any,
+      0, ITER_IGNORE_ALPHA | ITER_IGNORE_RGB | ITER_SRC,
+      NULL,
+      _pixman_iter_get_scanline_noop,
+      NULL
+    },
+    { PIXMAN_solid,
+      FAST_PATH_NO_ALPHA_MAP, ITER_NARROW | ITER_SRC,
+      noop_init_solid_narrow,
+      _pixman_iter_get_scanline_noop,
+      NULL,
+    },
+    { PIXMAN_solid,
+      FAST_PATH_NO_ALPHA_MAP, ITER_WIDE | ITER_SRC,
+      noop_init_solid_wide,
+      _pixman_iter_get_scanline_noop,
+      NULL
+    },
+    { PIXMAN_a8r8g8b8,
+      FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |
+          FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST,
+      ITER_NARROW | ITER_SRC,
+      noop_init_direct_buffer,
+      noop_get_scanline,
+      NULL
+    },
+    /* Dest iters */
+    { PIXMAN_a8r8g8b8,
+      FAST_PATH_STD_DEST_FLAGS, ITER_NARROW | ITER_DEST,
+      noop_init_direct_buffer,
+      _pixman_iter_get_scanline_noop,
+      dest_write_back_direct
+    },
+    { PIXMAN_x8r8g8b8,
+      FAST_PATH_STD_DEST_FLAGS, ITER_NARROW | ITER_DEST | ITER_LOCALIZED_ALPHA,
+      noop_init_direct_buffer,
+      _pixman_iter_get_scanline_noop,
+      dest_write_back_direct
+    },
+    { PIXMAN_null },
+};
+
 static const pixman_fast_path_t noop_fast_paths[] =
 {
     { PIXMAN_OP_DST, PIXMAN_any, 0, PIXMAN_any, 0, PIXMAN_any, 0, noop_composite },
@@ -169,8 +155,7 @@ _pixman_implementation_create_noop (pixman_implementation_t *fallback)
     pixman_implementation_t *imp =
 	_pixman_implementation_create (fallback, noop_fast_paths);
  
-    imp->src_iter_init = noop_src_iter_init;
-    imp->dest_iter_init = noop_dest_iter_init;
+    imp->iter_info = noop_iters;
 
     return imp;
 }
diff --git a/pixman/pixman/pixman-private.h b/pixman/pixman/pixman-private.h
index 6d9c05321..af4a0b6e0 100644
--- a/pixman/pixman/pixman-private.h
+++ b/pixman/pixman/pixman-private.h
@@ -212,7 +212,8 @@ typedef void      (* pixman_iter_write_back_t)   (pixman_iter_t *iter);
 
 typedef enum
 {
-    ITER_NARROW =		(1 << 0),
+    ITER_NARROW =               (1 << 0),
+    ITER_WIDE =                 (1 << 1),
 
     /* "Localized alpha" is when the alpha channel is used only to compute
      * the alpha value of the destination. This means that the computation
@@ -229,9 +230,15 @@ typedef enum
      * we can treat it as if it were ARGB, which means in some cases we can
      * avoid copying it to a temporary buffer.
      */
-    ITER_LOCALIZED_ALPHA =	(1 << 1),
-    ITER_IGNORE_ALPHA =		(1 << 2),
-    ITER_IGNORE_RGB =		(1 << 3)
+    ITER_LOCALIZED_ALPHA =	(1 << 2),
+    ITER_IGNORE_ALPHA =		(1 << 3),
+    ITER_IGNORE_RGB =		(1 << 4),
+
+    /* These indicate whether the iterator is for a source
+     * or a destination image
+     */
+    ITER_SRC =			(1 << 5),
+    ITER_DEST =			(1 << 6)
 } iter_flags_t;
 
 struct pixman_iter_t
@@ -255,6 +262,19 @@ struct pixman_iter_t
     int				stride;
 };
 
+typedef struct pixman_iter_info_t pixman_iter_info_t;
+typedef void (* pixman_iter_initializer_t) (pixman_iter_t *iter,
+                                            const pixman_iter_info_t *info);
+struct pixman_iter_info_t
+{
+    pixman_format_code_t	format;
+    uint32_t			image_flags;
+    iter_flags_t		iter_flags;
+    pixman_iter_initializer_t	initializer;
+    pixman_iter_get_scanline_t	get_scanline;
+    pixman_iter_write_back_t	write_back;
+};
+
 void
 _pixman_bits_image_setup_accessors (bits_image_t *image);
 
@@ -454,8 +474,6 @@ typedef pixman_bool_t (*pixman_fill_func_t) (pixman_implementation_t *imp,
 					     int                      width,
 					     int                      height,
 					     uint32_t                 filler);
-typedef pixman_bool_t (*pixman_iter_init_func_t) (pixman_implementation_t *imp,
-						  pixman_iter_t           *iter);
 
 void _pixman_setup_combiner_functions_32 (pixman_implementation_t *imp);
 void _pixman_setup_combiner_functions_float (pixman_implementation_t *imp);
@@ -477,11 +495,10 @@ struct pixman_implementation_t
     pixman_implementation_t *	toplevel;
     pixman_implementation_t *	fallback;
     const pixman_fast_path_t *	fast_paths;
+    const pixman_iter_info_t *  iter_info;
 
     pixman_blt_func_t		blt;
     pixman_fill_func_t		fill;
-    pixman_iter_init_func_t     src_iter_init;
-    pixman_iter_init_func_t     dest_iter_init;
 
     pixman_combine_32_func_t	combine_32[PIXMAN_N_OPERATORS];
     pixman_combine_32_func_t	combine_32_ca[PIXMAN_N_OPERATORS];
@@ -542,29 +559,17 @@ _pixman_implementation_fill (pixman_implementation_t *imp,
                              int                      height,
                              uint32_t                 filler);
 
-pixman_bool_t
-_pixman_implementation_src_iter_init (pixman_implementation_t       *imp,
-				      pixman_iter_t                 *iter,
-				      pixman_image_t                *image,
-				      int                            x,
-				      int                            y,
-				      int                            width,
-				      int                            height,
-				      uint8_t                       *buffer,
-				      iter_flags_t                   flags,
-				      uint32_t                       image_flags);
-
-pixman_bool_t
-_pixman_implementation_dest_iter_init (pixman_implementation_t       *imp,
-				       pixman_iter_t                 *iter,
-				       pixman_image_t                *image,
-				       int                            x,
-				       int                            y,
-				       int                            width,
-				       int                            height,
-				       uint8_t                       *buffer,
-				       iter_flags_t                   flags,
-				       uint32_t                       image_flags);
+void
+_pixman_implementation_iter_init (pixman_implementation_t       *imp,
+                                  pixman_iter_t                 *iter,
+                                  pixman_image_t                *image,
+                                  int                            x,
+                                  int                            y,
+                                  int                            width,
+                                  int                            height,
+                                  uint8_t                       *buffer,
+                                  iter_flags_t                   flags,
+                                  uint32_t                       image_flags);
 
 /* Specific implementations */
 pixman_implementation_t *
@@ -647,6 +652,9 @@ _pixman_compute_composite_region32 (pixman_region32_t * region,
 uint32_t *
 _pixman_iter_get_scanline_noop (pixman_iter_t *iter, const uint32_t *mask);
 
+void
+_pixman_iter_init_bits_stride (pixman_iter_t *iter, const pixman_iter_info_t *info);
+
 /* These "formats" all have depth 0, so they
  * will never clash with any real ones
  */
diff --git a/pixman/pixman/pixman-region.c b/pixman/pixman/pixman-region.c
index 2d6f1571c..59bc9c797 100644
--- a/pixman/pixman/pixman-region.c
+++ b/pixman/pixman/pixman-region.c
@@ -1858,7 +1858,7 @@ pixman_region_subtract_o (region_type_t * region,
         else if (r2->x1 <= x1)
         {
             /*
-	     * Subtrahend preceeds minuend: nuke left edge of minuend.
+	     * Subtrahend precedes minuend: nuke left edge of minuend.
 	     */
             x1 = r2->x2;
             if (x1 >= r1->x2)
@@ -1982,7 +1982,7 @@ PREFIX (_subtract) (region_type_t *reg_d,
     }
 
     /* Add those rectangles in region 1 that aren't in region 2,
-       do yucky substraction for overlaps, and
+       do yucky subtraction for overlaps, and
        just throw away rectangles in region 2 that aren't in region 1 */
     if (!pixman_op (reg_d, reg_m, reg_s, pixman_region_subtract_o, TRUE, FALSE))
 	return FALSE;
@@ -2042,7 +2042,7 @@ PREFIX (_inverse) (region_type_t *new_reg,  /* Destination region */
     }
 
     /* Add those rectangles in region 1 that aren't in region 2,
-     * do yucky substraction for overlaps, and
+     * do yucky subtraction for overlaps, and
      * just throw away rectangles in region 2 that aren't in region 1
      */
     inv_reg.extents = *inv_rect;
diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c
index c7e9a4bb2..dde923524 100644
--- a/pixman/pixman/pixman-sse2.c
+++ b/pixman/pixman/pixman-sse2.c
@@ -5554,19 +5554,27 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
 			      uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
 
-#define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1)
-
-#define BILINEAR_DECLARE_VARIABLES						\
+#if BILINEAR_INTERPOLATION_BITS < 8
+# define BILINEAR_DECLARE_VARIABLES						\
     const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
     const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
-    const __m128i xmm_xorc8 = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, BMSK);\
-    const __m128i xmm_addc8 = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);		\
-    const __m128i xmm_xorc7 = _mm_set_epi16 (0, BMSK, 0, BMSK, 0, BMSK, 0, BMSK);\
-    const __m128i xmm_addc7 = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
+    const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
+    const __m128i xmm_ux = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,	\
+					  unit_x, -unit_x, unit_x, -unit_x);	\
+    const __m128i xmm_zero = _mm_setzero_si128 ();				\
+    __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1),		\
+				   vx, -(vx + 1), vx, -(vx + 1))
+#else
+# define BILINEAR_DECLARE_VARIABLES						\
+    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
+    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
+    const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);		\
     const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x,	\
-					  unit_x, unit_x, unit_x, unit_x);	\
+					  -unit_x, -unit_x, -unit_x, -unit_x);	\
     const __m128i xmm_zero = _mm_setzero_si128 ();				\
-    __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx)
+    __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx,				\
+				   -(vx + 1), -(vx + 1), -(vx + 1), -(vx + 1))
+#endif
 
 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
 do {										\
@@ -5585,8 +5593,8 @@ do {										\
     if (BILINEAR_INTERPOLATION_BITS < 8)					\
     {										\
 	/* calculate horizontal weights */					\
-	xmm_wh = _mm_add_epi16 (xmm_addc7, _mm_xor_si128 (xmm_xorc7,		\
-		   _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS)));	\
+	xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,		\
+					16 - BILINEAR_INTERPOLATION_BITS));	\
 	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
 	/* horizontal interpolation */						\
 	a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (		\
@@ -5595,8 +5603,8 @@ do {										\
     else									\
     {										\
 	/* calculate horizontal weights */					\
-	xmm_wh = _mm_add_epi16 (xmm_addc8, _mm_xor_si128 (xmm_xorc8,		\
-		_mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS)));	\
+	xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,		\
+					16 - BILINEAR_INTERPOLATION_BITS));	\
 	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
 	/* horizontal interpolation */						\
 	xmm_lo = _mm_mullo_epi16 (a, xmm_wh);					\
@@ -6332,52 +6340,23 @@ sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
     return iter->buffer;
 }
 
-typedef struct
-{
-    pixman_format_code_t	format;
-    pixman_iter_get_scanline_t	get_scanline;
-} fetcher_info_t;
-
-static const fetcher_info_t fetchers[] =
-{
-    { PIXMAN_x8r8g8b8,		sse2_fetch_x8r8g8b8 },
-    { PIXMAN_r5g6b5,		sse2_fetch_r5g6b5 },
-    { PIXMAN_a8,		sse2_fetch_a8 },
-    { PIXMAN_null }
-};
-
-static pixman_bool_t
-sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
-{
-    pixman_image_t *image = iter->image;
-
-#define FLAGS								\
+#define IMAGE_FLAGS							\
     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
      FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
 
-    if ((iter->iter_flags & ITER_NARROW)			&&
-	(iter->image_flags & FLAGS) == FLAGS)
-    {
-	const fetcher_info_t *f;
-
-	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
-	{
-	    if (image->common.extended_format_code == f->format)
-	    {
-		uint8_t *b = (uint8_t *)image->bits.bits;
-		int s = image->bits.rowstride * 4;
-
-		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
-		iter->stride = s;
-
-		iter->get_scanline = f->get_scanline;
-		return TRUE;
-	    }
-	}
-    }
-
-    return FALSE;
-}
+static const pixman_iter_info_t sse2_iters[] = 
+{
+    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, sse2_fetch_x8r8g8b8, NULL
+    },
+    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, sse2_fetch_r5g6b5, NULL
+    },
+    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, sse2_fetch_a8, NULL
+    },
+    { PIXMAN_null },
+};
 
 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
 __attribute__((__force_align_arg_pointer__))
@@ -6435,7 +6414,7 @@ _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
     imp->blt = sse2_blt;
     imp->fill = sse2_fill;
 
-    imp->src_iter_init = sse2_src_iter_init;
+    imp->iter_info = sse2_iters;
 
     return imp;
 }
diff --git a/pixman/pixman/pixman-utils.c b/pixman/pixman/pixman-utils.c
index f31171f6d..98723a800 100644
--- a/pixman/pixman/pixman-utils.c
+++ b/pixman/pixman/pixman-utils.c
@@ -214,6 +214,17 @@ _pixman_iter_get_scanline_noop (pixman_iter_t *iter, const uint32_t *mask)
     return iter->buffer;
 }
 
+void
+_pixman_iter_init_bits_stride (pixman_iter_t *iter, const pixman_iter_info_t *info)
+{
+    pixman_image_t *image = iter->image;
+    uint8_t *b = (uint8_t *)image->bits.bits;
+    int s = image->bits.rowstride * 4;
+
+    iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (info->format) / 8;
+    iter->stride = s;
+}
+
 #define N_TMP_BOXES (16)
 
 pixman_bool_t
diff --git a/pixman/pixman/pixman-vmx.c b/pixman/pixman/pixman-vmx.c
index 6868704a8..f629003ab 100644
--- a/pixman/pixman/pixman-vmx.c
+++ b/pixman/pixman/pixman-vmx.c
@@ -25,7 +25,9 @@
  * Based on fbmmx.c by Owen Taylor, Søren Sandmann and Nicholas Miell
  */
 
+#ifdef HAVE_CONFIG_H
 #include <config.h>
+#endif
 #include "pixman-private.h"
 #include "pixman-combine32.h"
 #include <altivec.h>
diff --git a/pixman/pixman/pixman.c b/pixman/pixman/pixman.c
index 184f0c4e6..9555ceaaf 100644
--- a/pixman/pixman/pixman.c
+++ b/pixman/pixman/pixman.c
@@ -605,7 +605,7 @@ pixman_image_composite32 (pixman_op_t      op,
     else
     {
 	mask_format = PIXMAN_null;
-	info.mask_flags = FAST_PATH_IS_OPAQUE;
+	info.mask_flags = FAST_PATH_IS_OPAQUE | FAST_PATH_NO_ALPHA_MAP;
     }
 
     dest_format = dest->common.extended_format_code;
diff --git a/pixman/pixman/refactor b/pixman/pixman/refactor
deleted file mode 100644
index 52fceab17..000000000
--- a/pixman/pixman/refactor
+++ /dev/null
@@ -1,478 +0,0 @@
-Roadmap
-
-- Move all the fetchers etc. into pixman-image to make pixman-compose.c
-  less intimidating.
-
-  DONE
-
-- Make combiners for unified alpha take a mask argument. That way
-  we won't need two separate paths for unified vs component in the
-  general compositing code.
-
-  DONE, except that the Altivec code needs to be updated. Luca is
-  looking into that.
-
-- Delete separate 'unified alpha' path
- 
-  DONE
-
-- Split images into their own files
-
-  DONE
-
-- Split the gradient walker code out into its own file
-
-  DONE
-
-- Add scanline getters per image
-
-  DONE
-
-- Generic 64 bit fetcher 
-
-  DONE
-
-- Split fast path tables into their respective architecture dependent
-  files.
-
-See "Render Algorithm" below for rationale
-
-Images will eventually have these virtual functions:
-
-       get_scanline()
-       get_scanline_wide()
-       get_pixel()
-       get_pixel_wide()
-       get_untransformed_pixel()
-       get_untransformed_pixel_wide()
-       get_unfiltered_pixel()
-       get_unfiltered_pixel_wide()
-
-       store_scanline()
-       store_scanline_wide()
-
-1.
-
-Initially we will just have get_scanline() and get_scanline_wide();
-these will be based on the ones in pixman-compose. Hopefully this will
-reduce the complexity in pixman_composite_rect_general().
-
-Note that there is access considerations - the compose function is
-being compiled twice.
-
-
-2.
-
-Split image types into their own source files. Export noop virtual
-reinit() call.  Call this whenever a property of the image changes.
-
-
-3. 
-
-Split the get_scanline() call into smaller functions that are
-initialized by the reinit() call.
-
-The Render Algorithm:
-	(first repeat, then filter, then transform, then clip)
-
-Starting from a destination pixel (x, y), do
-
-	1 x = x - xDst + xSrc
-	  y = y - yDst + ySrc
-
-	2 reject pixel that is outside the clip
-
-	This treats clipping as something that happens after
-	transformation, which I think is correct for client clips. For
-	hierarchy clips it is wrong, but who really cares? Without
-	GraphicsExposes hierarchy clips are basically irrelevant. Yes,
-	you could imagine cases where the pixels of a subwindow of a
-	redirected, transformed window should be treated as
-	transparent. I don't really care
-
-	Basically, I think the render spec should say that pixels that
-	are unavailable due to the hierarcy have undefined content,
-	and that GraphicsExposes are not generated. Ie., basically
-	that using non-redirected windows as sources is fail. This is
-	at least consistent with the current implementation and we can
-	update the spec later if someone makes it work.
-
-	The implication for render is that it should stop passing the
-	hierarchy clip to pixman. In pixman, if a souce image has a
-	clip it should be used in computing the composite region and
-	nowhere else, regardless of what "has_client_clip" says. The
-	default should be for there to not be any clip.
-
-	I would really like to get rid of the client clip as well for
-	source images, but unfortunately there is at least one
-	application in the wild that uses them.
-
-	3 Transform pixel: (x, y) = T(x, y)
-
-	4 Call p = GetUntransformedPixel (x, y)
-
-	5 If the image has an alpha map, then
-
-		Call GetUntransformedPixel (x, y) on the alpha map
-		
-		add resulting alpha channel to p
-
-	   return p
-
-	Where GetUnTransformedPixel is:
-
-	6 switch (filter)
-	  {
-	  case NEAREST:
-		return GetUnfilteredPixel (x, y);
-		break;
-
-	  case BILINEAR:
-		return GetUnfilteredPixel (...) // 4 times 
-		break;
-
-	  case CONVOLUTION:
-		return GetUnfilteredPixel (...) // as many times as necessary.
-		break;
-	  }
-
-	Where GetUnfilteredPixel (x, y) is
-
-	7 switch (repeat)
-	   {
-	   case REPEAT_NORMAL:
-	   case REPEAT_PAD:
-	   case REPEAT_REFLECT:
-		// adjust x, y as appropriate
-		break;
-
-	   case REPEAT_NONE:
-	        if (x, y) is outside image bounds
-		     return 0;
-		break;
-	   }
-
-	   return GetRawPixel(x, y)
-
-	Where GetRawPixel (x, y) is
-
-	8 Compute the pixel in question, depending on image type.
-
-For gradients, repeat has a totally different meaning, so
-UnfilteredPixel() and RawPixel() must be the same function so that
-gradients can do their own repeat algorithm.
-
-So, the GetRawPixel
-
-	for bits must deal with repeats
-	for gradients must deal with repeats (differently)
-	for solids, should ignore repeats.
-
-	for polygons, when we add them, either ignore repeats or do
-	something similar to bits (in which case, we may want an extra
-	layer of indirection to modify the coordinates).
-
-It is then possible to build things like "get scanline" or "get tile" on
-top of this. In the simplest case, just repeatedly calling GetPixel()
-would work, but specialized get_scanline()s or get_tile()s could be
-plugged in for common cases. 
-
-By not plugging anything in for images with access functions, we only
-have to compile the pixel functions twice, not the scanline functions.
-
-And we can get rid of fetchers for the bizarre formats that no one
-uses. Such as b2g3r3 etc. r1g2b1? Seriously? It is also worth
-considering a generic format based pixel fetcher for these edge cases.
-
-Since the actual routines depend on the image attributes, the images
-must be notified when those change and update their function pointers
-appropriately. So there should probably be a virtual function called
-(* reinit) or something like that.
-
-There will also be wide fetchers for both pixels and lines. The line
-fetcher will just call the wide pixel fetcher. The wide pixel fetcher
-will just call expand, except for 10 bit formats.
-
-Rendering pipeline:
-
-Drawable:
-	0. if (picture has alpha map)
-		0.1. Position alpha map according to the alpha_x/alpha_y
-	        0.2. Where the two drawables intersect, the alpha channel
-		     Replace the alpha channel of source with the one
-		     from the alpha map. Replacement only takes place
-		     in the intersection of the two drawables' geometries.
-	1. Repeat the drawable according to the repeat attribute
-	2. Reconstruct a continuous image according to the filter
-	3. Transform according to the transform attribute
-	4. Position image such that src_x, src_y is over dst_x, dst_y
-	5. Sample once per destination pixel 
-	6. Clip. If a pixel is not within the source clip, then no
-	   compositing takes place at that pixel. (Ie., it's *not*
-	   treated as 0).
-
-	Sampling a drawable: 
-
-	- If the channel does not have an alpha channel, the pixels in it
-	  are treated as opaque.
-
-	Note on reconstruction:
-
-	- The top left pixel has coordinates (0.5, 0.5) and pixels are
-	  spaced 1 apart.
-
-Gradient:
-	1. Unless gradient type is conical, repeat the underlying (0, 1)
-		gradient according to the repeat attribute
-	2. Integrate the gradient across the plane according to type.
-	3. Transform according to transform attribute
-	4. Position gradient 
-	5. Sample once per destination pixel.
- 	6. Clip
-
-Solid Fill:
-	1. Repeat has no effect
-	2. Image is already continuous and defined for the entire plane
-	3. Transform has no effect
-	4. Positioning has no effect
-	5. Sample once per destination pixel.
-	6. Clip
-
-Polygon:
-	1. Repeat has no effect
-	2. Image is already continuous and defined on the whole plane
-	3. Transform according to transform attribute
-	4. Position image
-	5. Supersample 15x17 per destination pixel.
-	6. Clip
-
-Possibly interesting additions:
-	- More general transformations, such as warping, or general
-	  shading.
-
-	- Shader image where a function is called to generate the
-          pixel (ie., uploading assembly code).
-
-	- Resampling kernels
-
-	  In principle the polygon image uses a 15x17 box filter for
-	  resampling. If we allow general resampling filters, then we
-	  get all the various antialiasing types for free. 
-
-	  Bilinear downsampling looks terrible and could be much 
-	  improved by a resampling filter. NEAREST reconstruction
-	  combined with a box resampling filter is what GdkPixbuf
-	  does, I believe.
-
-	  Useful for high frequency gradients as well.
-
-	  (Note that the difference between a reconstruction and a
-	  resampling filter is mainly where in the pipeline they
-	  occur. High quality resampling should use a correctly
-	  oriented kernel so it should happen after transformation.
-
-	  An implementation can transform the resampling kernel and
-	  convolve it with the reconstruction if it so desires, but it
-	  will need to deal with the fact that the resampling kernel
-	  will not necessarily be pixel aligned.
-
-	  "Output kernels"
-
-	  One could imagine doing the resampling after compositing,
-	  ie., for each destination pixel sample each source image 16
-	  times, then composite those subpixels individually, then
-	  finally apply a kernel.
-
-	  However, this is effectively the same as full screen
-	  antialiasing, which is a simpler way to think about it. So
-	  resampling kernels may make sense for individual images, but
-	  not as a post-compositing step.
-	  
-	  Fullscreen AA is inefficient without chained compositing
-	  though. Consider an (image scaled up to oversample size IN
-	  some polygon) scaled down to screen size. With the current
-	  implementation, there will be a huge temporary. With chained
-	  compositing, the whole thing ends up being equivalent to the
-	  output kernel from above.
-
-	- Color space conversion
-
-	  The complete model here is that each surface has a color
-	  space associated with it and that the compositing operation
-	  also has one associated with it. Note also that gradients
-	  should have associcated colorspaces.
-
-	- Dithering
-
-	  If people dither something that is already dithered, it will
-	  look terrible, but don't do that, then. (Dithering happens
-	  after resampling if at all - what is the relationship
-	  with color spaces? Presumably dithering should happen in linear
-	  intensity space).
-
-	- Floating point surfaces, 16, 32 and possibly 64 bit per
-	  channel.
-
-	Maybe crack:
-
-	- Glyph polygons
-
-	  If glyphs could be given as polygons, they could be
-	  positioned and rasterized more accurately. The glyph
-	  structure would need subpixel positioning though.
-
-	- Luminance vs. coverage for the alpha channel
-
-	  Whether the alpha channel should be interpreted as luminance
-          modulation or as coverage (intensity modulation). This is a
-          bit of a departure from the rendering model though. It could
-	  also be considered whether it should be possible to have 
-	  both channels in the same drawable.
-
-	- Alternative for component alpha
-
-	  - Set component-alpha on the output image.
-
-	    - This means each of the components are sampled
-	      independently and composited in the corresponding
-	      channel only.
-
-	  - Have 3 x oversampled mask
-
-	  - Scale it down by 3 horizontally, with [ 1/3, 1/3, 1/3 ]
-            resampling filter. 
-
-	    Is this equivalent to just using a component alpha mask?
-
-	Incompatible changes:
-
-	- Gradients could be specified with premultiplied colors. (You
-	  can use a mask to get things like gradients from solid red to
-	  transparent red.
-
-Refactoring pixman
-
-The pixman code is not particularly nice to put it mildly. Among the
-issues are
-
-- inconsistent naming style (fb vs Fb, camelCase vs
-  underscore_naming). Sometimes there is even inconsistency *within*
-  one name.
-
-      fetchProc32 ACCESS(pixman_fetchProcForPicture32)
-
-  may be one of the uglies names ever created.
-
-  coding style: 
-  	 use the one from cairo except that pixman uses this brace style:
-	 
-		while (blah)
-		{
-		}
-
-	Format do while like this:
-
-	       do 
-	       {
-
-	       } 
-	       while (...);
-
-- PIXMAN_COMPOSITE_RECT_GENERAL() is horribly complex
-
-- switch case logic in pixman-access.c
-
-  Instead it would be better to just store function pointers in the
-  image objects themselves,
-
-  	get_pixel()
-	get_scanline()
-
-- Much of the scanline fetching code is for formats that no one 
-  ever uses. a2r2g2b2 anyone?
-
-  It would probably be worthwhile having a generic fetcher for any
-  pixman format whatsoever.
-
-- Code related to particular image types should be split into individual
-  files.
-
-	pixman-bits-image.c
-	pixman-linear-gradient-image.c
-	pixman-radial-gradient-image.c
-	pixman-solid-image.c
-
-- Fast path code should be split into files based on architecture:
-
-       pixman-mmx-fastpath.c
-       pixman-sse2-fastpath.c
-       pixman-c-fastpath.c
-
-       etc.
-
-  Each of these files should then export a fastpath table, which would
-  be declared in pixman-private.h. This should allow us to get rid
-  of the pixman-mmx.h files.
-
-  The fast path table should describe each fast path. Ie there should
-  be bitfields indicating what things the fast path can handle, rather than
-  like now where it is only allowed to take one format per src/mask/dest. Ie., 
-
-  { 
-    FAST_a8r8g8b8 | FAST_x8r8g8b8,
-    FAST_null,
-    FAST_x8r8g8b8,
-    FAST_repeat_normal | FAST_repeat_none,
-    the_fast_path
-  }
-
-There should then be *one* file that implements pixman_image_composite(). 
-This should do this:
-
-     optimize_operator();
-
-     convert 1x1 repeat to solid (actually this should be done at
-     image creation time).
-     
-     is there a useful fastpath?
-
-There should be a file called pixman-cpu.c that contains all the
-architecture specific stuff to detect what CPU features we have.
-
-Issues that must be kept in mind:
-
-       - we need accessor code to be preserved
-
-       - maybe there should be a "store_scanline" too?
-
-         Is this sufficient?
-
-	 We should preserve the optimization where the
-	 compositing happens directly in the destination
-	 whenever possible.
-
-	- It should be possible to create GPU samplers from the
-	  images.
-
-The "horizontal" classification should be a bit in the image, the
-"vertical" classification should just happen inside the gradient
-file. Note though that
-
-      (a) these will change if the tranformation/repeat changes.
-
-      (b) at the moment the optimization for linear gradients
-          takes the source rectangle into account. Presumably
-	  this is to also optimize the case where the gradient
-	  is close enough to horizontal?
-
-Who is responsible for repeats? In principle it should be the scanline
-fetch. Right now NORMAL repeats are handled by walk_composite_region()
-while other repeats are handled by the scanline code.
-
-
-(Random note on filtering: do you filter before or after
-transformation?  Hardware is going to filter after transformation;
-this is also what pixman does currently). It's not completely clear
-what filtering *after* transformation means. One thing that might look
-good would be to do *supersampling*, ie., compute multiple subpixels
-per destination pixel, then average them together.
diff --git a/pixman/test/blitters-test.c b/pixman/test/blitters-test.c
index 8766fa800..a2c6ff4d8 100644
--- a/pixman/test/blitters-test.c
+++ b/pixman/test/blitters-test.c
@@ -46,7 +46,16 @@ create_random_image (pixman_format_code_t *allowed_formats,
     /* do the allocation */
     buf = aligned_malloc (64, stride * height);
 
-    prng_randmemset (buf, stride * height, RANDMEMSET_MORE_00_AND_FF);
+    if (prng_rand_n (4) == 0)
+    {
+	/* uniform distribution */
+	prng_randmemset (buf, stride * height, 0);
+    }
+    else
+    {
+	/* significantly increased probability for 0x00 and 0xFF */
+	prng_randmemset (buf, stride * height, RANDMEMSET_MORE_00_AND_FF);
+    }
 
     img = pixman_image_create_bits (fmt, width, height, buf, stride);
 
@@ -393,6 +402,6 @@ main (int argc, const char *argv[])
     }
 
     return fuzzer_test_main("blitters", 2000000,
-			    0xD8265D5E,
+			    0x0CF3283B,
 			    test_composite, argc, argv);
 }
diff --git a/pixman/test/lowlevel-blt-bench.c b/pixman/test/lowlevel-blt-bench.c
index 4e16f7ba1..1049e21e7 100644
--- a/pixman/test/lowlevel-blt-bench.c
+++ b/pixman/test/lowlevel-blt-bench.c
@@ -385,6 +385,7 @@ bench_composite (char * testname,
     double                          t1, t2, t3, pix_cnt;
     int64_t                         n, l1test_width, nlines;
     double                             bytes_per_pix = 0;
+    pixman_bool_t                   bench_pixbuf = FALSE;
 
     pixman_composite_func_t func = pixman_image_composite_wrapper;
 
@@ -422,16 +423,20 @@ bench_composite (char * testname,
 
     mask_img = NULL;
     xmask_img = NULL;
+    if (strcmp (testname, "pixbuf") == 0 || strcmp (testname, "rpixbuf") == 0)
+    {
+        bench_pixbuf = TRUE;
+    }
     if (!(mask_flags & SOLID_FLAG) && mask_fmt != PIXMAN_null)
     {
         bytes_per_pix += (mask_fmt >> 24) / ((op == PIXMAN_OP_SRC) ? 8.0 : 4.0);
         mask_img = pixman_image_create_bits (mask_fmt,
                                              WIDTH, HEIGHT,
-                                             mask,
+                                             bench_pixbuf ? src : mask,
                                              WIDTH * 4);
         xmask_img = pixman_image_create_bits (mask_fmt,
                                              XWIDTH, XHEIGHT,
-                                             mask,
+                                             bench_pixbuf ? src : mask,
                                              XWIDTH * 4);
     }
     else if (mask_fmt != PIXMAN_null)
@@ -643,6 +648,8 @@ tests_tbl[] =
     { "src_0888_0565",         PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
     { "src_0888_8888",         PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
     { "src_0888_x888",         PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "src_0888_8888_rev",     PIXMAN_b8g8r8,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "src_0888_0565_rev",     PIXMAN_b8g8r8,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
     { "src_x888_x888",         PIXMAN_x8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
     { "src_x888_8888",         PIXMAN_x8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
     { "src_8888_8888",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
@@ -707,6 +714,8 @@ tests_tbl[] =
     { "outrev_n_8888_x888_ca", PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8r8g8b8, 2, PIXMAN_x8r8g8b8 },
     { "outrev_n_8888_8888_ca", PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8r8g8b8, 2, PIXMAN_a8r8g8b8 },
     { "over_reverse_n_8888",   PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER_REVERSE, PIXMAN_null, 0, PIXMAN_a8r8g8b8 },
+    { "pixbuf",                PIXMAN_x8b8g8r8,    0, PIXMAN_OP_SRC,     PIXMAN_a8b8g8r8, 0, PIXMAN_a8r8g8b8 },
+    { "rpixbuf",               PIXMAN_x8b8g8r8,    0, PIXMAN_OP_SRC,     PIXMAN_a8b8g8r8, 0, PIXMAN_a8b8g8r8 },
 };
 
 int
diff --git a/pixman/test/prng-test.c b/pixman/test/prng-test.c
index 0a3ad5e8f..c1d9320cc 100644
--- a/pixman/test/prng-test.c
+++ b/pixman/test/prng-test.c
@@ -106,7 +106,10 @@ int main (int argc, char *argv[])
 {
     const uint32_t ref_crc[RANDMEMSET_MORE_00_AND_FF + 1] =
     {
-        0xBA06763D, 0x103FC550, 0x8B59ABA5, 0xD82A0F39
+        0xBA06763D, 0x103FC550, 0x8B59ABA5, 0xD82A0F39,
+        0xD2321099, 0xFD8C5420, 0xD3B7C42A, 0xFC098093,
+        0x85E01DE0, 0x6680F8F7, 0x4D32DD3C, 0xAE52382B,
+        0x149E6CB5, 0x8B336987, 0x15DCB2B3, 0x8A71B781
     };
     uint32_t crc1, crc2;
     uint32_t ref, seed, seed0, seed1, seed2, seed3;
diff --git a/pixman/test/utils-prng.c b/pixman/test/utils-prng.c
index 967b8989a..7b32e3531 100644
--- a/pixman/test/utils-prng.c
+++ b/pixman/test/utils-prng.c
@@ -107,6 +107,7 @@ randmemset_internal (prng_t                  *prng,
 {
     prng_t local_prng = *prng;
     prng_rand_128_data_t randdata;
+    size_t i;
 
     while (size >= 16)
     {
@@ -138,6 +139,22 @@ randmemset_internal (prng_t                  *prng,
                 };
                 randdata.vb &= (t.vb >= const_40);
             }
+            if (flags & RANDMEMSET_MORE_FFFFFFFF)
+            {
+                const uint32x4 const_C0000000 =
+                {
+                    0xC0000000, 0xC0000000, 0xC0000000, 0xC0000000
+                };
+                randdata.vw |= ((t.vw << 30) >= const_C0000000);
+            }
+            if (flags & RANDMEMSET_MORE_00000000)
+            {
+                const uint32x4 const_40000000 =
+                {
+                    0x40000000, 0x40000000, 0x40000000, 0x40000000
+                };
+                randdata.vw &= ((t.vw << 30) >= const_40000000);
+            }
 #else
             #define PROCESS_ONE_LANE(i)                                       \
                 if (flags & RANDMEMSET_MORE_FF)                               \
@@ -155,6 +172,18 @@ randmemset_internal (prng_t                  *prng,
                     mask_00 |= mask_00 >> 2;                                  \
                     mask_00 |= mask_00 >> 4;                                  \
                     randdata.w[i] &= mask_00;                                 \
+                }                                                             \
+                if (flags & RANDMEMSET_MORE_FFFFFFFF)                         \
+                {                                                             \
+                    int32_t mask_ff = ((t.w[i] << 30) & (t.w[i] << 31)) &     \
+                                       0x80000000;                            \
+                    randdata.w[i] |= mask_ff >> 31;                           \
+                }                                                             \
+                if (flags & RANDMEMSET_MORE_00000000)                         \
+                {                                                             \
+                    int32_t mask_00 = ((t.w[i] << 30) | (t.w[i] << 31)) &     \
+                                       0x80000000;                            \
+                    randdata.w[i] &= mask_00 >> 31;                           \
                 }
 
             PROCESS_ONE_LANE (0)
@@ -198,7 +227,8 @@ randmemset_internal (prng_t                  *prng,
         }
         size -= 16;
     }
-    while (size > 0)
+    i = 0;
+    while (i < size)
     {
         uint8_t randbyte = prng_rand_r (&local_prng) & 0xFF;
         if (flags != 0)
@@ -208,9 +238,25 @@ randmemset_internal (prng_t                  *prng,
                 randbyte = 0xFF;
             if ((flags & RANDMEMSET_MORE_00) && (t < 0x40))
                 randbyte = 0x00;
+            if (i % 4 == 0 && i + 4 <= size)
+            {
+                t = prng_rand_r (&local_prng) & 0xFF;
+                if ((flags & RANDMEMSET_MORE_FFFFFFFF) && (t >= 0xC0))
+                {
+                    memset(&buf[i], 0xFF, 4);
+                    i += 4;
+                    continue;
+                }
+                if ((flags & RANDMEMSET_MORE_00000000) && (t < 0x40))
+                {
+                    memset(&buf[i], 0x00, 4);
+                    i += 4;
+                    continue;
+                }
+            }
         }
-        *buf++ = randbyte;
-        size--;
+        buf[i] = randbyte;
+        i++;
     }
     *prng = local_prng;
 }
@@ -218,8 +264,10 @@ randmemset_internal (prng_t                  *prng,
 /*
  * Fill memory buffer with random data. Flags argument may be used
  * to tweak some statistics properties:
- *    RANDMEMSET_MORE_00 - set ~25% of bytes to 0x00
- *    RANDMEMSET_MORE_FF - set ~25% of bytes to 0xFF
+ *    RANDMEMSET_MORE_00        - set ~25% of bytes to 0x00
+ *    RANDMEMSET_MORE_FF        - set ~25% of bytes to 0xFF
+ *    RANDMEMSET_MORE_00000000  - ~25% chance for 00000000 4-byte clusters
+ *    RANDMEMSET_MORE_FFFFFFFF  - ~25% chance for FFFFFFFF 4-byte clusters
  */
 void prng_randmemset_r (prng_t                  *prng,
                         void                    *voidbuf,
diff --git a/pixman/test/utils-prng.h b/pixman/test/utils-prng.h
index 285107f08..564ffcef1 100644
--- a/pixman/test/utils-prng.h
+++ b/pixman/test/utils-prng.h
@@ -153,7 +153,10 @@ typedef enum
 {
     RANDMEMSET_MORE_00        = 1, /* ~25% chance for 0x00 bytes */
     RANDMEMSET_MORE_FF        = 2, /* ~25% chance for 0xFF bytes */
-    RANDMEMSET_MORE_00_AND_FF = (RANDMEMSET_MORE_00 | RANDMEMSET_MORE_FF)
+    RANDMEMSET_MORE_00000000  = 4, /* ~25% chance for 0x00000000 clusters */
+    RANDMEMSET_MORE_FFFFFFFF  = 8, /* ~25% chance for 0xFFFFFFFF clusters */
+    RANDMEMSET_MORE_00_AND_FF = (RANDMEMSET_MORE_00 | RANDMEMSET_MORE_00000000 |
+                                 RANDMEMSET_MORE_FF | RANDMEMSET_MORE_FFFFFFFF)
 } prng_randmemset_flags_t;
 
 /* Set the 32-bit seed for PRNG */