16 files changed, 1308 insertions, 268 deletions
diff --git a/pixman/configure.ac b/pixman/configure.ac
index 515e31218..a93e2905b 100644
--- a/pixman/configure.ac
+++ b/pixman/configure.ac
@@ -968,6 +968,22 @@ fi
 AC_MSG_RESULT($support_for_attribute_constructor)
 AC_SUBST(TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR)
 
+dnl =====================================
+dnl __float128
+
+support_for_float128=no
+
+AC_MSG_CHECKING(for __float128)
+AC_LINK_IFELSE([AC_LANG_SOURCE([[
+__float128 a = 1.0Q, b = 2.0Q; int main (void) { return a + b; }
+]])], support_for_float128=yes)
+
+if test x$support_for_float128 = xyes; then
+   AC_DEFINE([HAVE_FLOAT128], [], [Whether the tool chain supports __float128])
+fi
+
+AC_MSG_RESULT($support_for_float128)
+
 dnl ==================
 dnl libpng
 
diff --git a/pixman/demos/scale.c b/pixman/demos/scale.c
index 9100ff72a..869ada12b 100644
--- a/pixman/demos/scale.c
+++ b/pixman/demos/scale.c
@@ -39,6 +39,7 @@ typedef struct
     GtkAdjustment *     scale_x_adjustment;
     GtkAdjustment *     scale_y_adjustment;
     GtkAdjustment *     rotate_adjustment;
+    GtkAdjustment *	subsample_adjustment;
     int                 scaled_width;
     int                 scaled_height;
 } app_t;
@@ -236,7 +237,8 @@ rescale (GtkWidget *may_be_null, app_t *app)
 	get_value (app, filters, "reconstruct_y_combo_box"),
 	get_value (app, filters, "sample_x_combo_box"),
 	get_value (app, filters, "sample_y_combo_box"),
-        4, 4);
+	gtk_adjustment_get_value (app->subsample_adjustment),
+	gtk_adjustment_get_value (app->subsample_adjustment));
 
     pixman_image_set_filter (app->original, PIXMAN_FILTER_SEPARABLE_CONVOLUTION, params, n_params);
 
@@ -360,10 +362,13 @@ app_new (pixman_image_t *original)
         GTK_ADJUSTMENT (gtk_builder_get_object (app->builder, "scale_y_adjustment"));
     app->rotate_adjustment =
         GTK_ADJUSTMENT (gtk_builder_get_object (app->builder, "rotate_adjustment"));
+    app->subsample_adjustment =
+	GTK_ADJUSTMENT (gtk_builder_get_object (app->builder, "subsample_adjustment"));
 
     g_signal_connect (app->scale_x_adjustment, "value_changed", G_CALLBACK (rescale), app);
     g_signal_connect (app->scale_y_adjustment, "value_changed", G_CALLBACK (rescale), app);
     g_signal_connect (app->rotate_adjustment, "value_changed", G_CALLBACK (rescale), app);
+    g_signal_connect (app->subsample_adjustment, "value_changed", G_CALLBACK (rescale), app);
     
     widget = get_widget (app, "scale_x_scale");
     gtk_scale_add_mark (GTK_SCALE (widget), 0.0, GTK_POS_LEFT, NULL);
diff --git a/pixman/demos/scale.ui b/pixman/demos/scale.ui
index f7c0c805f..b3450d34d 100644
--- a/pixman/demos/scale.ui
+++ b/pixman/demos/scale.ui
@@ -23,6 +23,14 @@
     <property name="page_increment">10</property>
     <property name="page_size">10</property>
   </object>
+  <object class="GtkAdjustment" id="subsample_adjustment">
+    <property name="lower">1</property>
+    <property name="upper">12</property>
+    <property name="step_increment">1</property>
+    <property name="page_increment">1</property>
+    <property name="page_size">0</property>
+    <property name="value">4</property>
+  </object>
   <object class="GtkWindow" id="main">
     <child>
       <object class="GtkHBox" id="u">
@@ -51,6 +59,7 @@
         <child>
           <object class="GtkVBox" id="box1">
             <property name="visible">True</property>
+	    <property name="spacing">12</property>
             <child>
               <object class="GtkHBox" id="box2">
                 <property name="visible">True</property>
@@ -234,6 +243,17 @@
                       </packing>
                     </child>
                     <child>
+                      <object class="GtkLabel" id="label9">
+                        <property name="visible">True</property>
+                        <property name="xalign">1</property>
+                        <property name="label" translatable="yes">&lt;b&gt;Subsample:&lt;/b&gt;</property>
+                        <property name="use_markup">True</property>
+                      </object>
+                      <packing>
+                        <property name="top_attach">5</property>
+                      </packing>
+                    </child>
+                    <child>
                       <object class="GtkComboBox" id="reconstruct_x_combo_box">
                         <property name="visible">True</property>
                       </object>
@@ -277,6 +297,16 @@
                         <property name="top_attach">4</property>
                       </packing>
                     </child>
+                    <child>
+                      <object class="GtkSpinButton" id="subsample_spin_button">
+                        <property name="visible">True</property>
+			<property name="adjustment">subsample_adjustment</property>
+                      </object>
+                      <packing>
+                        <property name="left_attach">1</property>
+                        <property name="top_attach">5</property>
+                      </packing>
+                    </child>
                   </object>
                   <packing>
                     <property name="expand">False</property>
diff --git a/pixman/pixman/pixman-fast-path.c b/pixman/pixman/pixman-fast-path.c
index c625e0c4a..247aea645 100644
--- a/pixman/pixman/pixman-fast-path.c
+++ b/pixman/pixman/pixman-fast-path.c
@@ -739,36 +739,6 @@ fast_composite_over_8888_0565 (pixman_implementation_t *imp,
 }
 
 static void
-fast_composite_src_x888_0565 (pixman_implementation_t *imp,
-                              pixman_composite_info_t *info)
-{
-    PIXMAN_COMPOSITE_ARGS (info);
-    uint16_t    *dst_line, *dst;
-    uint32_t    *src_line, *src, s;
-    int dst_stride, src_stride;
-    int32_t w;
-
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
-
-	while (w--)
-	{
-	    s = *src++;
-	    *dst = convert_8888_to_0565 (s);
-	    dst++;
-	}
-    }
-}
-
-static void
 fast_composite_add_8_8 (pixman_implementation_t *imp,
 			pixman_composite_info_t *info)
 {
@@ -1243,6 +1213,18 @@ fast_composite_tiled_repeat (pixman_implementation_t *imp,
     pixman_composite_func_t func;
     pixman_format_code_t mask_format;
     uint32_t src_flags, mask_flags;
+    int32_t sx, sy;
+    int32_t width_remain;
+    int32_t num_pixels;
+    int32_t src_width;
+    int32_t i, j;
+    pixman_image_t extended_src_image;
+    uint32_t extended_src[REPEAT_MIN_WIDTH * 2];
+    pixman_bool_t need_src_extension;
+    uint32_t *src_line;
+    int32_t src_stride;
+    int32_t src_bpp;
+    pixman_composite_info_t info2 = *info;
 
     src_flags = (info->src_flags & ~FAST_PATH_NORMAL_REPEAT) |
 		    FAST_PATH_SAMPLES_COVER_CLIP_NEAREST;
@@ -1258,149 +1240,131 @@ fast_composite_tiled_repeat (pixman_implementation_t *imp,
 	mask_flags = FAST_PATH_IS_OPAQUE;
     }
 
-    if (_pixman_implementation_lookup_composite (
-	    imp->toplevel, info->op,
-	    src_image->common.extended_format_code, src_flags,
-	    mask_format, mask_flags,
-	    dest_image->common.extended_format_code, info->dest_flags,
-	    &imp, &func))
+    _pixman_implementation_lookup_composite (
+	imp->toplevel, info->op,
+	src_image->common.extended_format_code, src_flags,
+	mask_format, mask_flags,
+	dest_image->common.extended_format_code, info->dest_flags,
+	&imp, &func);
+
+    src_bpp = PIXMAN_FORMAT_BPP (src_image->bits.format);
+
+    if (src_image->bits.width < REPEAT_MIN_WIDTH		&&
+	(src_bpp == 32 || src_bpp == 16 || src_bpp == 8)	&&
+	!src_image->bits.indexed)
     {
-	int32_t sx, sy;
-	int32_t width_remain;
-	int32_t num_pixels;
-	int32_t src_width;
-	int32_t i, j;
-	pixman_image_t extended_src_image;
-	uint32_t extended_src[REPEAT_MIN_WIDTH * 2];
-	pixman_bool_t need_src_extension;
-	uint32_t *src_line;
-	int32_t src_stride;
-	int32_t src_bpp;
-	pixman_composite_info_t info2 = *info;
-
-	src_bpp = PIXMAN_FORMAT_BPP (src_image->bits.format);
-
-	if (src_image->bits.width < REPEAT_MIN_WIDTH		&&
-	    (src_bpp == 32 || src_bpp == 16 || src_bpp == 8)	&&
-	    !src_image->bits.indexed)
-	{
-	    sx = src_x;
-	    sx = MOD (sx, src_image->bits.width);
-	    sx += width;
-	    src_width = 0;
+	sx = src_x;
+	sx = MOD (sx, src_image->bits.width);
+	sx += width;
+	src_width = 0;
 
-	    while (src_width < REPEAT_MIN_WIDTH && src_width <= sx)
-		src_width += src_image->bits.width;
+	while (src_width < REPEAT_MIN_WIDTH && src_width <= sx)
+	    src_width += src_image->bits.width;
 
-	    src_stride = (src_width * (src_bpp >> 3) + 3) / (int) sizeof (uint32_t);
+	src_stride = (src_width * (src_bpp >> 3) + 3) / (int) sizeof (uint32_t);
 
-	    /* Initialize/validate stack-allocated temporary image */
-	    _pixman_bits_image_init (&extended_src_image, src_image->bits.format,
-				     src_width, 1, &extended_src[0], src_stride,
-				     FALSE);
-	    _pixman_image_validate (&extended_src_image);
+	/* Initialize/validate stack-allocated temporary image */
+	_pixman_bits_image_init (&extended_src_image, src_image->bits.format,
+				 src_width, 1, &extended_src[0], src_stride,
+				 FALSE);
+	_pixman_image_validate (&extended_src_image);
 
-	    info2.src_image = &extended_src_image;
-	    need_src_extension = TRUE;
-	}
-	else
-	{
-	    src_width = src_image->bits.width;
-	    need_src_extension = FALSE;
-	}
+	info2.src_image = &extended_src_image;
+	need_src_extension = TRUE;
+    }
+    else
+    {
+	src_width = src_image->bits.width;
+	need_src_extension = FALSE;
+    }
 
-	sx = src_x;
-	sy = src_y;
+    sx = src_x;
+    sy = src_y;
 
-	while (--height >= 0)
-	{
-	    sx = MOD (sx, src_width);
-	    sy = MOD (sy, src_image->bits.height);
+    while (--height >= 0)
+    {
+	sx = MOD (sx, src_width);
+	sy = MOD (sy, src_image->bits.height);
 
-	    if (need_src_extension)
+	if (need_src_extension)
+	{
+	    if (src_bpp == 32)
 	    {
-		if (src_bpp == 32)
-		{
-		    PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint32_t, src_stride, src_line, 1);
+		PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint32_t, src_stride, src_line, 1);
 
-		    for (i = 0; i < src_width; )
-		    {
-			for (j = 0; j < src_image->bits.width; j++, i++)
-			    extended_src[i] = src_line[j];
-		    }
-		}
-		else if (src_bpp == 16)
+		for (i = 0; i < src_width; )
 		{
-		    uint16_t *src_line_16;
-
-		    PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint16_t, src_stride,
-					   src_line_16, 1);
-		    src_line = (uint32_t*)src_line_16;
-
-		    for (i = 0; i < src_width; )
-		    {
-			for (j = 0; j < src_image->bits.width; j++, i++)
-			    ((uint16_t*)extended_src)[i] = ((uint16_t*)src_line)[j];
-		    }
+		    for (j = 0; j < src_image->bits.width; j++, i++)
+			extended_src[i] = src_line[j];
 		}
-		else if (src_bpp == 8)
-		{
-		    uint8_t *src_line_8;
+	    }
+	    else if (src_bpp == 16)
+	    {
+		uint16_t *src_line_16;
 
-		    PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint8_t, src_stride,
-					   src_line_8, 1);
-		    src_line = (uint32_t*)src_line_8;
+		PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint16_t, src_stride,
+				       src_line_16, 1);
+		src_line = (uint32_t*)src_line_16;
 
-		    for (i = 0; i < src_width; )
-		    {
-			for (j = 0; j < src_image->bits.width; j++, i++)
-			    ((uint8_t*)extended_src)[i] = ((uint8_t*)src_line)[j];
-		    }
+		for (i = 0; i < src_width; )
+		{
+		    for (j = 0; j < src_image->bits.width; j++, i++)
+			((uint16_t*)extended_src)[i] = ((uint16_t*)src_line)[j];
 		}
-
-		info2.src_y = 0;
 	    }
-	    else
+	    else if (src_bpp == 8)
 	    {
-		info2.src_y = sy;
+		uint8_t *src_line_8;
+
+		PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint8_t, src_stride,
+				       src_line_8, 1);
+		src_line = (uint32_t*)src_line_8;
+
+		for (i = 0; i < src_width; )
+		{
+		    for (j = 0; j < src_image->bits.width; j++, i++)
+			((uint8_t*)extended_src)[i] = ((uint8_t*)src_line)[j];
+		}
 	    }
 
-	    width_remain = width;
+	    info2.src_y = 0;
+	}
+	else
+	{
+	    info2.src_y = sy;
+	}
 
-	    while (width_remain > 0)
-	    {
-		num_pixels = src_width - sx;
+	width_remain = width;
 
-		if (num_pixels > width_remain)
-		    num_pixels = width_remain;
+	while (width_remain > 0)
+	{
+	    num_pixels = src_width - sx;
 
-		info2.src_x = sx;
-		info2.width = num_pixels;
-		info2.height = 1;
+	    if (num_pixels > width_remain)
+		num_pixels = width_remain;
 
-		func (imp, &info2);
+	    info2.src_x = sx;
+	    info2.width = num_pixels;
+	    info2.height = 1;
 
-		width_remain -= num_pixels;
-		info2.mask_x += num_pixels;
-		info2.dest_x += num_pixels;
-		sx = 0;
-	    }
+	    func (imp, &info2);
 
-	    sx = src_x;
-	    sy++;
-	    info2.mask_x = info->mask_x;
-	    info2.mask_y++;
-	    info2.dest_x = info->dest_x;
-	    info2.dest_y++;
+	    width_remain -= num_pixels;
+	    info2.mask_x += num_pixels;
+	    info2.dest_x += num_pixels;
+	    sx = 0;
 	}
 
-	if (need_src_extension)
-	    _pixman_image_fini (&extended_src_image);
-    }
-    else
-    {
-	_pixman_log_error (FUNC, "Didn't find a suitable function ");
+	sx = src_x;
+	sy++;
+	info2.mask_x = info->mask_x;
+	info2.mask_y++;
+	info2.dest_x = info->dest_x;
+	info2.dest_y++;
     }
+
+    if (need_src_extension)
+	_pixman_image_fini (&extended_src_image);
 }
 
 /* Use more unrolling for src_0565_0565 because it is typically CPU bound */
@@ -1913,10 +1877,6 @@ static const pixman_fast_path_t c_fast_paths[] =
     PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
     PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
     PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy),
-    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
-    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
-    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
-    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
     PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8),
     PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8),
 
@@ -2199,12 +2159,200 @@ fast_path_fill (pixman_implementation_t *imp,
     return TRUE;
 }
 
+/*****************************************************************************/
+
+static uint32_t *
+fast_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int32_t w = iter->width;
+    uint32_t *dst = iter->buffer;
+    const uint16_t *src = (const uint16_t *)iter->bits;
+
+    iter->bits += iter->stride;
+
+    /* Align the source buffer at 4 bytes boundary */
+    if (w > 0 && ((uintptr_t)src & 3))
+    {
+	*dst++ = convert_0565_to_8888 (*src++);
+	w--;
+    }
+    /* Process two pixels per iteration */
+    while ((w -= 2) >= 0)
+    {
+	uint32_t sr, sb, sg, t0, t1;
+	uint32_t s = *(const uint32_t *)src;
+	src += 2;
+	sr = (s >> 8) & 0x00F800F8;
+	sb = (s << 3) & 0x00F800F8;
+	sg = (s >> 3) & 0x00FC00FC;
+	sr |= sr >> 5;
+	sb |= sb >> 5;
+	sg |= sg >> 6;
+	t0 = ((sr << 16) & 0x00FF0000) | ((sg << 8) & 0x0000FF00) |
+	     (sb & 0xFF) | 0xFF000000;
+	t1 = (sr & 0x00FF0000) | ((sg >> 8) & 0x0000FF00) |
+	     (sb >> 16) | 0xFF000000;
+#ifdef WORDS_BIGENDIAN
+	*dst++ = t1;
+	*dst++ = t0;
+#else
+	*dst++ = t0;
+	*dst++ = t1;
+#endif
+    }
+    if (w & 1)
+    {
+	*dst = convert_0565_to_8888 (*src);
+    }
+
+    return iter->buffer;
+}
+
+static uint32_t *
+fast_dest_fetch_noop (pixman_iter_t *iter, const uint32_t *mask)
+{
+    iter->bits += iter->stride;
+    return iter->buffer;
+}
+
+/* Helper function for a workaround, which tries to ensure that 0x1F001F
+ * constant is always allocated in a register on RISC architectures.
+ */
+static force_inline uint32_t
+convert_8888_to_0565_workaround (uint32_t s, uint32_t x1F001F)
+{
+    uint32_t a, b;
+    a = (s >> 3) & x1F001F;
+    b = s & 0xFC00;
+    a |= a >> 5;
+    a |= b >> 5;
+    return a;
+}
+
+static void
+fast_write_back_r5g6b5 (pixman_iter_t *iter)
+{
+    int32_t w = iter->width;
+    uint16_t *dst = (uint16_t *)(iter->bits - iter->stride);
+    const uint32_t *src = iter->buffer;
+    /* Workaround to ensure that x1F001F variable is allocated in a register */
+    static volatile uint32_t volatile_x1F001F = 0x1F001F;
+    uint32_t x1F001F = volatile_x1F001F;
+
+    while ((w -= 4) >= 0)
+    {
+	uint32_t s1 = *src++;
+	uint32_t s2 = *src++;
+	uint32_t s3 = *src++;
+	uint32_t s4 = *src++;
+	*dst++ = convert_8888_to_0565_workaround (s1, x1F001F);
+	*dst++ = convert_8888_to_0565_workaround (s2, x1F001F);
+	*dst++ = convert_8888_to_0565_workaround (s3, x1F001F);
+	*dst++ = convert_8888_to_0565_workaround (s4, x1F001F);
+    }
+    if (w & 2)
+    {
+	*dst++ = convert_8888_to_0565_workaround (*src++, x1F001F);
+	*dst++ = convert_8888_to_0565_workaround (*src++, x1F001F);
+    }
+    if (w & 1)
+    {
+	*dst = convert_8888_to_0565_workaround (*src, x1F001F);
+    }
+}
+
+typedef struct
+{
+    pixman_format_code_t	format;
+    pixman_iter_get_scanline_t	get_scanline;
+    pixman_iter_write_back_t	write_back;
+} fetcher_info_t;
+
+static const fetcher_info_t fetchers[] =
+{
+    { PIXMAN_r5g6b5, fast_fetch_r5g6b5, fast_write_back_r5g6b5 },
+    { PIXMAN_null }
+};
+
+static pixman_bool_t
+fast_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{
+    pixman_image_t *image = iter->image;
+
+#define FLAGS								\
+    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
+     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
+
+    if ((iter->iter_flags & ITER_NARROW)			&&
+	(iter->image_flags & FLAGS) == FLAGS)
+    {
+	const fetcher_info_t *f;
+
+	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
+	{
+	    if (image->common.extended_format_code == f->format)
+	    {
+		uint8_t *b = (uint8_t *)image->bits.bits;
+		int s = image->bits.rowstride * 4;
+
+		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
+		iter->stride = s;
+
+		iter->get_scanline = f->get_scanline;
+		return TRUE;
+	    }
+	}
+    }
+
+    return FALSE;
+}
+
+static pixman_bool_t
+fast_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{
+    pixman_image_t *image = iter->image;
+
+    if ((iter->iter_flags & ITER_NARROW)		&&
+	(iter->image_flags & FAST_PATH_STD_DEST_FLAGS) == FAST_PATH_STD_DEST_FLAGS)
+    {
+	const fetcher_info_t *f;
+
+	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
+	{
+	    if (image->common.extended_format_code == f->format)
+	    {
+		uint8_t *b = (uint8_t *)image->bits.bits;
+		int s = image->bits.rowstride * 4;
+
+		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
+		iter->stride = s;
+
+		if ((iter->iter_flags & (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) ==
+		    (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA))
+		{
+		    iter->get_scanline = fast_dest_fetch_noop;
+		}
+		else
+		{
+		    iter->get_scanline = f->get_scanline;
+		}
+		iter->write_back = f->write_back;
+		return TRUE;
+	    }
+	}
+    }
+    return FALSE;
+}
+
+
 pixman_implementation_t *
 _pixman_implementation_create_fast_path (pixman_implementation_t *fallback)
 {
     pixman_implementation_t *imp = _pixman_implementation_create (fallback, c_fast_paths);
 
     imp->fill = fast_path_fill;
+    imp->src_iter_init = fast_src_iter_init;
+    imp->dest_iter_init = fast_dest_iter_init;
 
     return imp;
 }
diff --git a/pixman/pixman/pixman-general.c b/pixman/pixman/pixman-general.c
index f175d771e..93a1b9acf 100644
--- a/pixman/pixman/pixman-general.c
+++ b/pixman/pixman/pixman-general.c
@@ -188,9 +188,6 @@ general_composite_rect  (pixman_implementation_t *imp,
     compose = _pixman_implementation_lookup_combiner (
 	imp->toplevel, op, component_alpha, narrow);
 
-    if (!compose)
-	return;
-
     for (i = 0; i < height; ++i)
     {
 	uint32_t *s, *m, *d;
diff --git a/pixman/pixman/pixman-glyph.c b/pixman/pixman/pixman-glyph.c
index 6d2c8bbb7..5a271b64b 100644
--- a/pixman/pixman/pixman-glyph.c
+++ b/pixman/pixman/pixman-glyph.c
@@ -463,16 +463,13 @@ pixman_composite_glyphs_no_mask (pixman_op_t            op,
 		{
 		    glyph_format = glyph_img->common.extended_format_code;
 		    glyph_flags = glyph_img->common.flags;
-		    
+
 		    _pixman_implementation_lookup_composite (
 			get_implementation(), op,
 			src->common.extended_format_code, src->common.flags,
 			glyph_format, glyph_flags | extra,
 			dest_format, dest_flags,
 			&implementation, &func);
-
-		    if (!func)
-			goto out;
 		}
 
 		info.src_x = src_x + composite_box.x1 - dest_x;
@@ -582,9 +579,6 @@ add_glyphs (pixman_glyph_cache_t *cache,
 		mask_format, info.mask_flags,
 		dest_format, dest_flags,
 		&implementation, &func);
-
-	    if (!func)
-		goto out;
 	}
 
 	glyph_box.x1 = glyphs[i].x - glyph->origin_x + off_x;
diff --git a/pixman/pixman/pixman-implementation.c b/pixman/pixman/pixman-implementation.c
index ec467a619..c0a643633 100644
--- a/pixman/pixman/pixman-implementation.c
+++ b/pixman/pixman/pixman-implementation.c
@@ -65,7 +65,13 @@ typedef struct
 
 PIXMAN_DEFINE_THREAD_LOCAL (cache_t, fast_path_cache);
 
-pixman_bool_t
+static void
+dummy_composite_rect (pixman_implementation_t *imp,
+		      pixman_composite_info_t *info)
+{
+}
+
+void
 _pixman_implementation_lookup_composite (pixman_implementation_t  *toplevel,
 					 pixman_op_t               op,
 					 pixman_format_code_t      src_format,
@@ -142,7 +148,11 @@ _pixman_implementation_lookup_composite (pixman_implementation_t  *toplevel,
 	    ++info;
 	}
     }
-    return FALSE;
+
+    /* We should never reach this point */
+    _pixman_log_error (FUNC, "No known composite function\n");
+    *out_imp = NULL;
+    *out_func = dummy_composite_rect;
 
 update_cache:
     if (i)
@@ -160,8 +170,16 @@ update_cache:
 	cache->cache[0].fast_path.dest_flags = dest_flags;
 	cache->cache[0].fast_path.func = *out_func;
     }
+}
 
-    return TRUE;
+static void
+dummy_combine (pixman_implementation_t *imp,
+	       pixman_op_t              op,
+	       uint32_t *               pd,
+	       const uint32_t *         ps,
+	       const uint32_t *         pm,
+	       int                      w)
+{
 }
 
 pixman_combine_32_func_t
@@ -199,7 +217,9 @@ _pixman_implementation_lookup_combiner (pixman_implementation_t *imp,
 	imp = imp->fallback;
     }
 
-    return NULL;
+    /* We should never reach this point */
+    _pixman_log_error (FUNC, "No known combine function\n");
+    return dummy_combine;
 }
 
 pixman_bool_t
diff --git a/pixman/pixman/pixman-inlines.h b/pixman/pixman/pixman-inlines.h
index ab4def0dc..dd1c2f17f 100644
--- a/pixman/pixman/pixman-inlines.h
+++ b/pixman/pixman/pixman-inlines.h
@@ -88,6 +88,42 @@ pixman_fixed_to_bilinear_weight (pixman_fixed_t x)
 	   ((1 << BILINEAR_INTERPOLATION_BITS) - 1);
 }
 
+#if BILINEAR_INTERPOLATION_BITS <= 4
+/* Inspired by Filter_32_opaque from Skia */
+static force_inline uint32_t
+bilinear_interpolation (uint32_t tl, uint32_t tr,
+			uint32_t bl, uint32_t br,
+			int distx, int disty)
+{
+    int distxy, distxiy, distixy, distixiy;
+    uint32_t lo, hi;
+
+    distx <<= (4 - BILINEAR_INTERPOLATION_BITS);
+    disty <<= (4 - BILINEAR_INTERPOLATION_BITS);
+
+    distxy = distx * disty;
+    distxiy = (distx << 4) - distxy;	/* distx * (16 - disty) */
+    distixy = (disty << 4) - distxy;	/* disty * (16 - distx) */
+    distixiy =
+	16 * 16 - (disty << 4) -
+	(distx << 4) + distxy; /* (16 - distx) * (16 - disty) */
+
+    lo = (tl & 0xff00ff) * distixiy;
+    hi = ((tl >> 8) & 0xff00ff) * distixiy;
+
+    lo += (tr & 0xff00ff) * distxiy;
+    hi += ((tr >> 8) & 0xff00ff) * distxiy;
+
+    lo += (bl & 0xff00ff) * distixy;
+    hi += ((bl >> 8) & 0xff00ff) * distixy;
+
+    lo += (br & 0xff00ff) * distxy;
+    hi += ((br >> 8) & 0xff00ff) * distxy;
+
+    return ((lo >> 8) & 0xff00ff) | (hi & ~0xff00ff);
+}
+
+#else
 #if SIZEOF_LONG > 4
 
 static force_inline uint32_t
@@ -184,6 +220,7 @@ bilinear_interpolation (uint32_t tl, uint32_t tr,
 }
 
 #endif
+#endif // BILINEAR_INTERPOLATION_BITS <= 4
 
 /*
  * For each scanline fetched from source image with PAD repeat:
diff --git a/pixman/pixman/pixman-matrix.c b/pixman/pixman/pixman-matrix.c
index cd2f1b5b8..89b96826b 100644
--- a/pixman/pixman/pixman-matrix.c
+++ b/pixman/pixman/pixman-matrix.c
@@ -34,6 +34,338 @@
 
 #define F(x)    pixman_int_to_fixed (x)
 
+static force_inline int
+count_leading_zeros (uint32_t x)
+{
+#ifdef __GNUC__
+    return __builtin_clz (x);
+#else
+    int n = 0;
+    while (x)
+    {
+        n++;
+        x >>= 1;
+    }
+    return 32 - n;
+#endif
+}
+
+/*
+ * Large signed/unsigned integer division with rounding for the platforms with
+ * only 64-bit integer data type supported (no 128-bit data type).
+ *
+ * Arguments:
+ *     hi, lo - high and low 64-bit parts of the dividend
+ *     div    - 48-bit divisor
+ *
+ * Returns: lowest 64 bits of the result as a return value and highest 64
+ *          bits of the result to "result_hi" pointer
+ */
+
+/* grade-school unsigned division (128-bit by 48-bit) with rounding to nearest */
+static force_inline uint64_t
+rounded_udiv_128_by_48 (uint64_t  hi,
+                        uint64_t  lo,
+                        uint64_t  div,
+                        uint64_t *result_hi)
+{
+    uint64_t tmp, remainder, result_lo;
+    assert(div < ((uint64_t)1 << 48));
+
+    remainder = hi % div;
+    *result_hi = hi / div;
+
+    tmp = (remainder << 16) + (lo >> 48);
+    result_lo = tmp / div;
+    remainder = tmp % div;
+
+    tmp = (remainder << 16) + ((lo >> 32) & 0xFFFF);
+    result_lo = (result_lo << 16) + (tmp / div);
+    remainder = tmp % div;
+
+    tmp = (remainder << 16) + ((lo >> 16) & 0xFFFF);
+    result_lo = (result_lo << 16) + (tmp / div);
+    remainder = tmp % div;
+
+    tmp = (remainder << 16) + (lo & 0xFFFF);
+    result_lo = (result_lo << 16) + (tmp / div);
+    remainder = tmp % div;
+
+    /* round to nearest */
+    if (remainder * 2 >= div && ++result_lo == 0)
+        *result_hi += 1;
+
+    return result_lo;
+}
+
+/* signed division (128-bit by 49-bit) with rounding to nearest */
+static inline int64_t
+rounded_sdiv_128_by_49 (int64_t   hi,
+                        uint64_t  lo,
+                        int64_t   div,
+                        int64_t  *signed_result_hi)
+{
+    uint64_t result_lo, result_hi;
+    int sign = 0;
+    if (div < 0)
+    {
+        div = -div;
+        sign ^= 1;
+    }
+    if (hi < 0)
+    {
+        if (lo != 0)
+            hi++;
+        hi = -hi;
+        lo = -lo;
+        sign ^= 1;
+    }
+    result_lo = rounded_udiv_128_by_48 (hi, lo, div, &result_hi);
+    if (sign)
+    {
+        if (result_lo != 0)
+            result_hi++;
+        result_hi = -result_hi;
+        result_lo = -result_lo;
+    }
+    if (signed_result_hi)
+    {
+        *signed_result_hi = result_hi;
+    }
+    return result_lo;
+}
+
+/*
+ * Multiply 64.16 fixed point value by (2^scalebits) and convert
+ * to 128-bit integer.
+ */
+static force_inline void
+fixed_64_16_to_int128 (int64_t  hi,
+                       int64_t  lo,
+                       int64_t *rhi,
+                       int64_t *rlo,
+                       int      scalebits)
+{
+    /* separate integer and fractional parts */
+    hi += lo >> 16;
+    lo &= 0xFFFF;
+
+    if (scalebits <= 0)
+    {
+        *rlo = hi >> (-scalebits);
+        *rhi = *rlo >> 63;
+    }
+    else
+    {
+        *rhi = hi >> (64 - scalebits);
+        *rlo = (uint64_t)hi << scalebits;
+        if (scalebits < 16)
+            *rlo += lo >> (16 - scalebits);
+        else
+            *rlo += lo << (scalebits - 16);
+    }
+}
+
+/*
+ * Convert 112.16 fixed point value to 48.16 with clamping for the out
+ * of range values.
+ */
+static force_inline pixman_fixed_48_16_t
+fixed_112_16_to_fixed_48_16 (int64_t hi, int64_t lo, pixman_bool_t *clampflag)
+{
+    if ((lo >> 63) != hi)
+    {
+        *clampflag = TRUE;
+        return hi >= 0 ? INT64_MAX : INT64_MIN;
+    }
+    else
+    {
+        return lo;
+    }
+}
+
+/*
+ * Transform a point with 31.16 fixed point coordinates from the destination
+ * space to a point with 48.16 fixed point coordinates in the source space.
+ * No overflows are possible for affine transformations and the results are
+ * accurate including the least significant bit. Projective transformations
+ * may overflow, in this case the results are just clamped to return maximum
+ * or minimum 48.16 values (so that the caller can at least handle the NONE
+ * and PAD repeats correctly) and the return value is FALSE to indicate that
+ * such clamping has happened.
+ */
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_point_31_16 (const pixman_transform_t    *t,
+                              const pixman_vector_48_16_t *v,
+                              pixman_vector_48_16_t       *result)
+{
+    pixman_bool_t clampflag = FALSE;
+    int i;
+    int64_t tmp[3][2], divint;
+    uint16_t divfrac;
+
+    /* input vector values must have no more than 31 bits (including sign)
+     * in the integer part */
+    assert (v->v[0] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[1] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[2] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[2] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+
+    for (i = 0; i < 3; i++)
+    {
+        tmp[i][0] = (int64_t)t->matrix[i][0] * (v->v[0] >> 16);
+        tmp[i][1] = (int64_t)t->matrix[i][0] * (v->v[0] & 0xFFFF);
+        tmp[i][0] += (int64_t)t->matrix[i][1] * (v->v[1] >> 16);
+        tmp[i][1] += (int64_t)t->matrix[i][1] * (v->v[1] & 0xFFFF);
+        tmp[i][0] += (int64_t)t->matrix[i][2] * (v->v[2] >> 16);
+        tmp[i][1] += (int64_t)t->matrix[i][2] * (v->v[2] & 0xFFFF);
+    }
+
+    /*
+     * separate 64-bit integer and 16-bit fractional parts for the divisor,
+     * which is also scaled by 65536 after fixed point multiplication.
+     */
+    divint  = tmp[2][0] + (tmp[2][1] >> 16);
+    divfrac = tmp[2][1] & 0xFFFF;
+
+    if (divint == pixman_fixed_1 && divfrac == 0)
+    {
+        /*
+         * this is a simple affine transformation
+         */
+        result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16);
+        result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16);
+        result->v[2] = pixman_fixed_1;
+    }
+    else if (divint == 0 && divfrac == 0)
+    {
+        /*
+         * handle zero divisor (if the values are non-zero, set the
+         * results to maximum positive or minimum negative)
+         */
+        clampflag = TRUE;
+
+        result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16);
+        result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16);
+
+        if (result->v[0] > 0)
+            result->v[0] = INT64_MAX;
+        else if (result->v[0] < 0)
+            result->v[0] = INT64_MIN;
+
+        if (result->v[1] > 0)
+            result->v[1] = INT64_MAX;
+        else if (result->v[1] < 0)
+            result->v[1] = INT64_MIN;
+    }
+    else
+    {
+        /*
+         * projective transformation, analyze the top 32 bits of the divisor
+         */
+        int32_t hi32divbits = divint >> 32;
+        if (hi32divbits < 0)
+            hi32divbits = ~hi32divbits;
+
+        if (hi32divbits == 0)
+        {
+            /* the divisor is small, we can actually keep all the bits */
+            int64_t hi, rhi, lo, rlo;
+            int64_t div = (divint << 16) + divfrac;
+
+            fixed_64_16_to_int128 (tmp[0][0], tmp[0][1], &hi, &lo, 32);
+            rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi);
+            result->v[0] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag);
+
+            fixed_64_16_to_int128 (tmp[1][0], tmp[1][1], &hi, &lo, 32);
+            rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi);
+            result->v[1] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag);
+        }
+        else
+        {
+            /* the divisor needs to be reduced to 48 bits */
+            int64_t hi, rhi, lo, rlo, div;
+            int shift = 32 - count_leading_zeros (hi32divbits);
+            fixed_64_16_to_int128 (divint, divfrac, &hi, &div, 16 - shift);
+
+            fixed_64_16_to_int128 (tmp[0][0], tmp[0][1], &hi, &lo, 32 - shift);
+            rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi);
+            result->v[0] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag);
+
+            fixed_64_16_to_int128 (tmp[1][0], tmp[1][1], &hi, &lo, 32 - shift);
+            rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi);
+            result->v[1] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag);
+        }
+    }
+    result->v[2] = pixman_fixed_1;
+    return !clampflag;
+}
+
+PIXMAN_EXPORT void
+pixman_transform_point_31_16_affine (const pixman_transform_t    *t,
+                                     const pixman_vector_48_16_t *v,
+                                     pixman_vector_48_16_t       *result)
+{
+    int64_t hi0, lo0, hi1, lo1;
+
+    /* input vector values must have no more than 31 bits (including sign)
+     * in the integer part */
+    assert (v->v[0] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[1] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+
+    hi0  = (int64_t)t->matrix[0][0] * (v->v[0] >> 16);
+    lo0  = (int64_t)t->matrix[0][0] * (v->v[0] & 0xFFFF);
+    hi0 += (int64_t)t->matrix[0][1] * (v->v[1] >> 16);
+    lo0 += (int64_t)t->matrix[0][1] * (v->v[1] & 0xFFFF);
+    hi0 += (int64_t)t->matrix[0][2];
+
+    hi1  = (int64_t)t->matrix[1][0] * (v->v[0] >> 16);
+    lo1  = (int64_t)t->matrix[1][0] * (v->v[0] & 0xFFFF);
+    hi1 += (int64_t)t->matrix[1][1] * (v->v[1] >> 16);
+    lo1 += (int64_t)t->matrix[1][1] * (v->v[1] & 0xFFFF);
+    hi1 += (int64_t)t->matrix[1][2];
+
+    result->v[0] = hi0 + ((lo0 + 0x8000) >> 16);
+    result->v[1] = hi1 + ((lo1 + 0x8000) >> 16);
+    result->v[2] = pixman_fixed_1;
+}
+
+PIXMAN_EXPORT void
+pixman_transform_point_31_16_3d (const pixman_transform_t    *t,
+                                 const pixman_vector_48_16_t *v,
+                                 pixman_vector_48_16_t       *result)
+{
+    int i;
+    int64_t tmp[3][2];
+
+    /* input vector values must have no more than 31 bits (including sign)
+     * in the integer part */
+    assert (v->v[0] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[1] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[2] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[2] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+
+    for (i = 0; i < 3; i++)
+    {
+        tmp[i][0] = (int64_t)t->matrix[i][0] * (v->v[0] >> 16);
+        tmp[i][1] = (int64_t)t->matrix[i][0] * (v->v[0] & 0xFFFF);
+        tmp[i][0] += (int64_t)t->matrix[i][1] * (v->v[1] >> 16);
+        tmp[i][1] += (int64_t)t->matrix[i][1] * (v->v[1] & 0xFFFF);
+        tmp[i][0] += (int64_t)t->matrix[i][2] * (v->v[2] >> 16);
+        tmp[i][1] += (int64_t)t->matrix[i][2] * (v->v[2] & 0xFFFF);
+    }
+
+    result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16);
+    result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16);
+    result->v[2] = tmp[2][0] + ((tmp[2][1] + 0x8000) >> 16);
+}
+
 PIXMAN_EXPORT void
 pixman_transform_init_identity (struct pixman_transform *matrix)
 {
@@ -50,69 +382,41 @@ PIXMAN_EXPORT pixman_bool_t
 pixman_transform_point_3d (const struct pixman_transform *transform,
                            struct pixman_vector *         vector)
 {
-    struct pixman_vector result;
-    pixman_fixed_32_32_t partial;
-    pixman_fixed_48_16_t v;
-    int i, j;
+    pixman_vector_48_16_t tmp;
+    tmp.v[0] = vector->vector[0];
+    tmp.v[1] = vector->vector[1];
+    tmp.v[2] = vector->vector[2];
 
-    for (j = 0; j < 3; j++)
-    {
-	v = 0;
-	for (i = 0; i < 3; i++)
-	{
-	    partial = ((pixman_fixed_48_16_t) transform->matrix[j][i] *
-	               (pixman_fixed_48_16_t) vector->vector[i]);
-	    v += (partial + 0x8000) >> 16;
-	}
-	
-	if (v > pixman_max_fixed_48_16 || v < pixman_min_fixed_48_16)
-	    return FALSE;
-	
-	result.vector[j] = (pixman_fixed_t) v;
-    }
-    
-    *vector = result;
+    pixman_transform_point_31_16_3d (transform, &tmp, &tmp);
 
-    if (!result.vector[2])
-	return FALSE;
+    vector->vector[0] = tmp.v[0];
+    vector->vector[1] = tmp.v[1];
+    vector->vector[2] = tmp.v[2];
 
-    return TRUE;
+    return vector->vector[0] == tmp.v[0] &&
+           vector->vector[1] == tmp.v[1] &&
+           vector->vector[2] == tmp.v[2];
 }
 
 PIXMAN_EXPORT pixman_bool_t
 pixman_transform_point (const struct pixman_transform *transform,
                         struct pixman_vector *         vector)
 {
-    pixman_fixed_32_32_t partial;
-    pixman_fixed_34_30_t v[3];
-    pixman_fixed_48_16_t quo;
-    int i, j;
+    pixman_vector_48_16_t tmp;
+    tmp.v[0] = vector->vector[0];
+    tmp.v[1] = vector->vector[1];
+    tmp.v[2] = vector->vector[2];
 
-    for (j = 0; j < 3; j++)
-    {
-	v[j] = 0;
-	
-	for (i = 0; i < 3; i++)
-	{
-	    partial = ((pixman_fixed_32_32_t) transform->matrix[j][i] *
-	               (pixman_fixed_32_32_t) vector->vector[i]);
-	    v[j] += (partial + 2) >> 2;
-	}
-    }
-    
-    if (!((v[2] + 0x8000) >> 16))
-	return FALSE;
+    if (!pixman_transform_point_31_16 (transform, &tmp, &tmp))
+        return FALSE;
 
-    for (j = 0; j < 2; j++)
-    {
-	quo = v[j] / ((v[2] + 0x8000) >> 16);
-	if (quo > pixman_max_fixed_48_16 || quo < pixman_min_fixed_48_16)
-	    return FALSE;
-	vector->vector[j] = (pixman_fixed_t) quo;
-    }
-    
-    vector->vector[2] = pixman_fixed_1;
-    return TRUE;
+    vector->vector[0] = tmp.v[0];
+    vector->vector[1] = tmp.v[1];
+    vector->vector[2] = tmp.v[2];
+
+    return vector->vector[0] == tmp.v[0] &&
+           vector->vector[1] == tmp.v[1] &&
+           vector->vector[2] == tmp.v[2];
 }
 
 PIXMAN_EXPORT pixman_bool_t
diff --git a/pixman/pixman/pixman-private.h b/pixman/pixman/pixman-private.h
index e5ab873ed..cb78a2ed8 100644
--- a/pixman/pixman/pixman-private.h
+++ b/pixman/pixman/pixman-private.h
@@ -497,7 +497,7 @@ pixman_implementation_t *
 _pixman_implementation_create (pixman_implementation_t *fallback,
 			       const pixman_fast_path_t *fast_paths);
 
-pixman_bool_t
+void
 _pixman_implementation_lookup_composite (pixman_implementation_t  *toplevel,
 					 pixman_op_t               op,
 					 pixman_format_code_t      src_format,
@@ -1052,7 +1052,7 @@ _pixman_log_error (const char *function, const char *message);
 
 #else
 
-#define _pixman_log_error(f,m) do { } while (0)				\
+#define _pixman_log_error(f,m) do { } while (0)
 
 #define return_if_fail(expr)						\
     do                                                                  \
@@ -1078,6 +1078,27 @@ _pixman_log_error (const char *function, const char *message);
 #endif
 
 /*
+ * Matrix
+ */
+
+typedef struct { pixman_fixed_48_16_t v[3]; } pixman_vector_48_16_t;
+
+pixman_bool_t
+pixman_transform_point_31_16 (const pixman_transform_t    *t,
+                              const pixman_vector_48_16_t *v,
+                              pixman_vector_48_16_t       *result);
+
+void
+pixman_transform_point_31_16_3d (const pixman_transform_t    *t,
+                                 const pixman_vector_48_16_t *v,
+                                 pixman_vector_48_16_t       *result);
+
+void
+pixman_transform_point_31_16_affine (const pixman_transform_t    *t,
+                                     const pixman_vector_48_16_t *v,
+                                     pixman_vector_48_16_t       *result);
+
+/*
  * Timers
  */
 
diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c
index 5a0e0626a..fc873cc96 100644
--- a/pixman/pixman/pixman-sse2.c
+++ b/pixman/pixman/pixman-sse2.c
@@ -4523,7 +4523,163 @@ sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
 
 	sse2_combine_add_u (imp, op, dst, src, NULL, width);
     }
+}
+
+static void
+sse2_composite_add_n_8888 (pixman_implementation_t *imp,
+			   pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *dst_line, *dst, src;
+    int dst_stride;
+
+    __m128i xmm_src;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    if (src == 0)
+	return;
+
+    if (src == ~0)
+    {
+	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
+		     dest_x, dest_y, width, height, ~0);
+
+	return;
+    }
+
+    xmm_src = _mm_set_epi32 (src, src, src, src);
+    while (height--)
+    {
+	int w = width;
+	uint32_t d;
+
+	dst = dst_line;
+	dst_line += dst_stride;
+
+	while (w && (unsigned long)dst & 15)
+	{
+	    d = *dst;
+	    *dst++ =
+		_mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
+	    w--;
+	}
+
+	while (w >= 4)
+	{
+	    save_128_aligned
+		((__m128i*)dst,
+		 _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
+
+	    dst += 4;
+	    w -= 4;
+	}
+
+	while (w--)
+	{
+	    d = *dst;
+	    *dst++ =
+		_mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
+						  _mm_cvtsi32_si128 (d)));
+	}
+    }
+}
+
+static void
+sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
+			     pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t     *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t src;
+
+    __m128i xmm_src;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    if (src == 0)
+	return;
+    xmm_src = expand_pixel_32_1x128 (src);
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w && ((unsigned long)dst & 15))
+	{
+	    uint8_t m = *mask++;
+	    if (m)
+	    {
+		*dst = pack_1x128_32
+		    (_mm_adds_epu16
+		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
+		      unpack_32_1x128 (*dst)));
+	    }
+	    dst++;
+	    w--;
+	}
+
+	while (w >= 4)
+	{
+	    uint32_t m = *(uint32_t*)mask;
+	    if (m)
+	    {
+		__m128i xmm_mask_lo, xmm_mask_hi;
+		__m128i xmm_dst_lo, xmm_dst_hi;
+
+		__m128i xmm_dst = load_128_aligned ((__m128i*)dst);
+		__m128i xmm_mask =
+		    _mm_unpacklo_epi8 (unpack_32_1x128(m),
+				       _mm_setzero_si128 ());
+
+		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+					&xmm_mask_lo, &xmm_mask_hi);
+
+		pix_multiply_2x128 (&xmm_src, &xmm_src,
+				    &xmm_mask_lo, &xmm_mask_hi,
+				    &xmm_mask_lo, &xmm_mask_hi);
+
+		xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
+		xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
+
+		save_128_aligned (
+		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	    }
 
+	    w -= 4;
+	    dst += 4;
+	    mask += 4;
+	}
+
+	while (w)
+	{
+	    uint8_t m = *mask++;
+	    if (m)
+	    {
+		*dst = pack_1x128_32
+		    (_mm_adds_epu16
+		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
+		      unpack_32_1x128 (*dst)));
+	    }
+	    dst++;
+	    w--;
+	}
+    }
 }
 
 static pixman_bool_t
@@ -5786,6 +5942,121 @@ FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
 			       uint32_t, uint8_t, uint32_t,
 			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
 
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst,
+						const uint32_t * mask,
+						const uint32_t * src_top,
+						const uint32_t * src_bottom,
+						int32_t          w,
+						int              wt,
+						int              wb,
+						pixman_fixed_t   vx,
+						pixman_fixed_t   unit_x,
+						pixman_fixed_t   max_vx,
+						pixman_bool_t    zero_src)
+{
+    BILINEAR_DECLARE_VARIABLES;
+    uint32_t pix1, pix2, pix3, pix4;
+    __m128i xmm_mask;
+
+    if (zero_src || (*mask >> 24) == 0)
+	return;
+
+    xmm_mask = create_mask_16_128 (*mask >> 24);
+
+    while (w && ((uintptr_t)dst & 15))
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	if (pix1)
+	{
+		uint32_t d = *dst;
+
+		__m128i ms = unpack_32_1x128 (pix1);
+		__m128i alpha     = expand_alpha_1x128 (ms);
+		__m128i dest      = xmm_mask;
+		__m128i alpha_dst = unpack_32_1x128 (d);
+
+		*dst = pack_1x128_32
+			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
+	}
+
+	dst++;
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+
+	if (pix1 | pix2 | pix3 | pix4)
+	{
+	    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+	    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+	    __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+	    xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
+
+	    xmm_dst = load_128_aligned ((__m128i*)dst);
+
+	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+				&xmm_alpha_lo, &xmm_alpha_hi);
+
+	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+			   &xmm_alpha_lo, &xmm_alpha_hi,
+			   &xmm_mask, &xmm_mask,
+			   &xmm_dst_lo, &xmm_dst_hi);
+
+	    save_128_aligned
+		((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	}
+
+	dst += 4;
+	w -= 4;
+    }
+
+    while (w)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	if (pix1)
+	{
+		uint32_t d = *dst;
+
+		__m128i ms = unpack_32_1x128 (pix1);
+		__m128i alpha     = expand_alpha_1x128 (ms);
+		__m128i dest      = xmm_mask;
+		__m128i alpha_dst = unpack_32_1x128 (d);
+
+		*dst = pack_1x128_32
+			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
+	}
+
+	dst++;
+	w--;
+    }
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
+			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       COVER, FLAG_HAVE_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
+			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       PAD, FLAG_HAVE_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
+			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       NONE, FLAG_HAVE_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
+			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       NORMAL, FLAG_HAVE_SOLID_MASK)
+
 static const pixman_fast_path_t sse2_fast_paths[] =
 {
     /* PIXMAN_OP_OVER */
@@ -5848,6 +6119,14 @@ static const pixman_fast_path_t sse2_fast_paths[] =
     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
+    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),
 
     /* PIXMAN_OP_SRC */
     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
@@ -5912,6 +6191,11 @@ static const pixman_fast_path_t sse2_fast_paths[] =
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
 
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
+
     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
diff --git a/pixman/pixman/pixman.c b/pixman/pixman/pixman.c
index 3fabed161..184f0c4e6 100644
--- a/pixman/pixman/pixman.c
+++ b/pixman/pixman/pixman.c
@@ -581,11 +581,13 @@ pixman_image_composite32 (pixman_op_t      op,
                           int32_t          height)
 {
     pixman_format_code_t src_format, mask_format, dest_format;
-    uint32_t src_flags, mask_flags, dest_flags;
     pixman_region32_t region;
     pixman_box32_t extents;
     pixman_implementation_t *imp;
     pixman_composite_func_t func;
+    pixman_composite_info_t info;
+    const pixman_box32_t *pbox;
+    int n;
 
     _pixman_image_validate (src);
     if (mask)
@@ -593,27 +595,27 @@ pixman_image_composite32 (pixman_op_t      op,
     _pixman_image_validate (dest);
 
     src_format = src->common.extended_format_code;
-    src_flags = src->common.flags;
+    info.src_flags = src->common.flags;
 
     if (mask && !(mask->common.flags & FAST_PATH_IS_OPAQUE))
     {
 	mask_format = mask->common.extended_format_code;
-	mask_flags = mask->common.flags;
+	info.mask_flags = mask->common.flags;
     }
     else
     {
 	mask_format = PIXMAN_null;
-	mask_flags = FAST_PATH_IS_OPAQUE;
+	info.mask_flags = FAST_PATH_IS_OPAQUE;
     }
 
     dest_format = dest->common.extended_format_code;
-    dest_flags = dest->common.flags;
+    info.dest_flags = dest->common.flags;
 
     /* Check for pixbufs */
     if ((mask_format == PIXMAN_a8r8g8b8 || mask_format == PIXMAN_a8b8g8r8) &&
 	(src->type == BITS && src->bits.bits == mask->bits.bits)	   &&
 	(src->common.repeat == mask->common.repeat)			   &&
-	(src_flags & mask_flags & FAST_PATH_ID_TRANSFORM)		   &&
+	(info.src_flags & info.mask_flags & FAST_PATH_ID_TRANSFORM)	   &&
 	(src_x == mask_x && src_y == mask_y))
     {
 	if (src_format == PIXMAN_x8b8g8r8)
@@ -638,7 +640,7 @@ pixman_image_composite32 (pixman_op_t      op,
     extents.x2 -= dest_x - src_x;
     extents.y2 -= dest_y - src_y;
 
-    if (!analyze_extent (src, &extents, &src_flags))
+    if (!analyze_extent (src, &extents, &info.src_flags))
 	goto out;
 
     extents.x1 -= src_x - mask_x;
@@ -646,7 +648,7 @@ pixman_image_composite32 (pixman_op_t      op,
     extents.x2 -= src_x - mask_x;
     extents.y2 -= src_y - mask_y;
 
-    if (!analyze_extent (mask, &extents, &mask_flags))
+    if (!analyze_extent (mask, &extents, &info.mask_flags))
 	goto out;
 
     /* If the clip is within the source samples, and the samples are
@@ -659,16 +661,16 @@ pixman_image_composite32 (pixman_op_t      op,
 			 FAST_PATH_BILINEAR_FILTER |			\
 			 FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR)
 
-    if ((src_flags & NEAREST_OPAQUE) == NEAREST_OPAQUE ||
-	(src_flags & BILINEAR_OPAQUE) == BILINEAR_OPAQUE)
+    if ((info.src_flags & NEAREST_OPAQUE) == NEAREST_OPAQUE ||
+	(info.src_flags & BILINEAR_OPAQUE) == BILINEAR_OPAQUE)
     {
-	src_flags |= FAST_PATH_IS_OPAQUE;
+	info.src_flags |= FAST_PATH_IS_OPAQUE;
     }
 
-    if ((mask_flags & NEAREST_OPAQUE) == NEAREST_OPAQUE ||
-	(mask_flags & BILINEAR_OPAQUE) == BILINEAR_OPAQUE)
+    if ((info.mask_flags & NEAREST_OPAQUE) == NEAREST_OPAQUE ||
+	(info.mask_flags & BILINEAR_OPAQUE) == BILINEAR_OPAQUE)
     {
-	mask_flags |= FAST_PATH_IS_OPAQUE;
+	info.mask_flags |= FAST_PATH_IS_OPAQUE;
     }
 
     /*
@@ -676,42 +678,35 @@ pixman_image_composite32 (pixman_op_t      op,
      * if the src or dest are opaque. The output operator should be
      * mathematically equivalent to the source.
      */
-    op = optimize_operator (op, src_flags, mask_flags, dest_flags);
+    info.op = optimize_operator (op, info.src_flags, info.mask_flags, info.dest_flags);
 
-    if (_pixman_implementation_lookup_composite (
-	    get_implementation (), op,
-	    src_format, src_flags, mask_format, mask_flags, dest_format, dest_flags,
-	    &imp, &func))
-    {
-	pixman_composite_info_t info;
-	const pixman_box32_t *pbox;
-	int n;
+    _pixman_implementation_lookup_composite (
+	get_implementation (), info.op,
+	src_format, info.src_flags,
+	mask_format, info.mask_flags,
+	dest_format, info.dest_flags,
+	&imp, &func);
 
-	info.op = op;
-	info.src_image = src;
-	info.mask_image = mask;
-	info.dest_image = dest;
-	info.src_flags = src_flags;
-	info.mask_flags = mask_flags;
-	info.dest_flags = dest_flags;
+    info.src_image = src;
+    info.mask_image = mask;
+    info.dest_image = dest;
 
-	pbox = pixman_region32_rectangles (&region, &n);
+    pbox = pixman_region32_rectangles (&region, &n);
 
-	while (n--)
-	{
-	    info.src_x = pbox->x1 + src_x - dest_x;
-	    info.src_y = pbox->y1 + src_y - dest_y;
-	    info.mask_x = pbox->x1 + mask_x - dest_x;
-	    info.mask_y = pbox->y1 + mask_y - dest_y;
-	    info.dest_x = pbox->x1;
-	    info.dest_y = pbox->y1;
-	    info.width = pbox->x2 - pbox->x1;
-	    info.height = pbox->y2 - pbox->y1;
-
-	    func (imp, &info);
-
-	    pbox++;
-	}
+    while (n--)
+    {
+	info.src_x = pbox->x1 + src_x - dest_x;
+	info.src_y = pbox->y1 + src_y - dest_y;
+	info.mask_x = pbox->x1 + mask_x - dest_x;
+	info.mask_y = pbox->y1 + mask_y - dest_y;
+	info.dest_x = pbox->x1;
+	info.dest_y = pbox->y1;
+	info.width = pbox->x2 - pbox->x1;
+	info.height = pbox->y2 - pbox->y1;
+
+	func (imp, &info);
+
+	pbox++;
     }
 
 out:
diff --git a/pixman/test/Makefile.sources b/pixman/test/Makefile.sources
index 8c0b505df..e323a8e8c 100644
--- a/pixman/test/Makefile.sources
+++ b/pixman/test/Makefile.sources
@@ -17,6 +17,7 @@ TESTPROGRAMS =			\
 	gradient-crash-test	\
 	region-contains-test	\
 	alphamap		\
+	matrix-test		\
 	stress-test		\
 	composite-traps-test	\
 	blitters-test		\
diff --git a/pixman/test/affine-test.c b/pixman/test/affine-test.c
index 678fbe844..2506250db 100644
--- a/pixman/test/affine-test.c
+++ b/pixman/test/affine-test.c
@@ -307,11 +307,11 @@ test_composite (int      testnum,
 }
 
 #if BILINEAR_INTERPOLATION_BITS == 8
-#define CHECKSUM 0x97097336
+#define CHECKSUM 0x2CDF1F07
 #elif BILINEAR_INTERPOLATION_BITS == 7
-#define CHECKSUM 0x31D2DC21
+#define CHECKSUM 0xBC00B1DF
 #elif BILINEAR_INTERPOLATION_BITS == 4
-#define CHECKSUM 0x8B925154
+#define CHECKSUM 0xA227306B
 #else
 #define CHECKSUM 0x00000000
 #endif
diff --git a/pixman/test/lowlevel-blt-bench.c b/pixman/test/lowlevel-blt-bench.c
index 2f97b7b24..7336fa0d5 100644
--- a/pixman/test/lowlevel-blt-bench.c
+++ b/pixman/test/lowlevel-blt-bench.c
@@ -630,6 +630,8 @@ tests_tbl[] =
     { "src_0565_0565",         PIXMAN_r5g6b5,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
     { "src_1555_0565",         PIXMAN_a1r5g5b5,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
     { "src_0565_1555",         PIXMAN_r5g6b5,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a1r5g5b5 },
+    { "src_8_8",               PIXMAN_a8,          0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8 },
+    { "src_n_8",               PIXMAN_a8,          1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8 },
     { "src_n_8_0565",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_r5g6b5 },
     { "src_n_8_1555",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a1r5g5b5 },
     { "src_n_8_4444",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a4r4g4b4 },
@@ -772,7 +774,7 @@ main (int argc, char *argv[])
 
     for (i = 0; i < ARRAY_LENGTH (tests_tbl); i++)
     {
-	if (strcmp (pattern, "all") == 0 || strstr (tests_tbl[i].testname, pattern))
+	if (strcmp (pattern, "all") == 0 || strcmp (tests_tbl[i].testname, pattern) == 0)
 	{
 	    bench_composite (tests_tbl[i].testname,
 			     tests_tbl[i].src_fmt,
diff --git a/pixman/test/matrix-test.c b/pixman/test/matrix-test.c
new file mode 100644
index 000000000..8437dd291
--- /dev/null
+++ b/pixman/test/matrix-test.c
@@ -0,0 +1,186 @@
+/*
+ * Copyright © 2012 Siarhei Siamashka <siarhei.siamashka@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "utils.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef HAVE_FLOAT128
+
+#define pixman_fixed_to_float128(x) (((__float128)(x)) / 65536.0Q)
+
+typedef struct { __float128 v[3]; } pixman_vector_f128_t;
+typedef struct { __float128 m[3][3]; } pixman_transform_f128_t;
+
+pixman_bool_t
+pixman_transform_point_f128 (const pixman_transform_f128_t *t,
+                             const pixman_vector_f128_t    *v,
+                             pixman_vector_f128_t          *result)
+{
+    int i;
+    for (i = 0; i < 3; i++)
+    {
+        result->v[i] = t->m[i][0] * v->v[0] +
+                       t->m[i][1] * v->v[1] +
+                       t->m[i][2] * v->v[2];
+    }
+    if (result->v[2] != 0)
+    {
+        result->v[0] /= result->v[2];
+        result->v[1] /= result->v[2];
+        result->v[2] = 1;
+        return TRUE;
+    }
+    else
+    {
+        return FALSE;
+    }
+}
+
+pixman_bool_t does_it_fit_fixed_48_16 (__float128 x)
+{
+    if (x >= 65536.0Q * 65536.0Q * 32768.0Q)
+        return FALSE;
+    if (x <= -65536.0Q * 65536.0Q * 32768.0Q)
+        return FALSE;
+    return TRUE;
+}
+
+#endif
+
+uint32_t
+test_matrix (int testnum, int verbose)
+{
+    uint32_t crc32 = 0;
+    int i, j, k;
+    pixman_bool_t is_affine;
+
+    prng_srand (testnum);
+
+    for (i = 0; i < 100; i++)
+    {
+        pixman_bool_t           transform_ok;
+        pixman_transform_t      ti;
+        pixman_vector_48_16_t   vi, result_i;
+#ifdef HAVE_FLOAT128
+        pixman_transform_f128_t tf;
+        pixman_vector_f128_t    vf, result_f;
+#endif
+        prng_randmemset (&ti, sizeof(ti), 0);
+        prng_randmemset (&vi, sizeof(vi), 0);
+
+        for (j = 0; j < 3; j++)
+        {
+            /* make sure that "vi" contains 31.16 fixed point data */
+            vi.v[j] >>= 17;
+            /* and apply random shift */
+            if (prng_rand_n (3) == 0)
+                vi.v[j] >>= prng_rand_n (46);
+        }
+
+        if (prng_rand_n (2))
+        {
+            /* random shift for the matrix */
+            for (j = 0; j < 3; j++)
+                for (k = 0; k < 3; k++)
+                    ti.matrix[j][k] >>= prng_rand_n (30);
+        }
+
+        if (prng_rand_n (2))
+        {
+            /* affine matrix */
+            ti.matrix[2][0] = 0;
+            ti.matrix[2][1] = 0;
+            ti.matrix[2][2] = pixman_fixed_1;
+        }
+
+        if (prng_rand_n (2))
+        {
+            /* cartesian coordinates */
+            vi.v[2] = pixman_fixed_1;
+        }
+
+        is_affine = (ti.matrix[2][0] == 0 && ti.matrix[2][1] == 0 &&
+                     ti.matrix[2][2] == pixman_fixed_1 &&
+                     vi.v[2] == pixman_fixed_1);
+
+        transform_ok = TRUE;
+        if (is_affine && prng_rand_n (2))
+            pixman_transform_point_31_16_affine (&ti, &vi, &result_i);
+        else
+            transform_ok = pixman_transform_point_31_16 (&ti, &vi, &result_i);
+
+        crc32 = compute_crc32 (crc32, &result_i, sizeof(result_i));
+
+#ifdef HAVE_FLOAT128
+        /* compare with a reference 128-bit floating point implementation */
+        for (j = 0; j < 3; j++)
+        {
+            vf.v[j] = pixman_fixed_to_float128 (vi.v[j]);
+            for (k = 0; k < 3; k++)
+            {
+                tf.m[j][k] = pixman_fixed_to_float128 (ti.matrix[j][k]);
+            }
+        }
+
+        if (pixman_transform_point_f128 (&tf, &vf, &result_f))
+        {
+            if (transform_ok ||
+                (does_it_fit_fixed_48_16 (result_f.v[0]) &&
+                 does_it_fit_fixed_48_16 (result_f.v[1]) &&
+                 does_it_fit_fixed_48_16 (result_f.v[2])))
+            {
+                for (j = 0; j < 3; j++)
+                {
+                    double diff = fabs (result_f.v[j] -
+                                        pixman_fixed_to_float128 (result_i.v[j]));
+
+                    if (is_affine && diff > (0.51 / 65536.0))
+                    {
+                        printf ("%d:%d: bad precision for affine (%.12f)\n",
+                               testnum, i, diff);
+                        abort ();
+                    }
+                    else if (diff > (0.71 / 65536.0))
+                    {
+                        printf ("%d:%d: bad precision for projective (%.12f)\n",
+                               testnum, i, diff);
+                        abort ();
+                    }
+                }
+            }
+        }
+#endif
+    }
+    return crc32;
+}
+
+int
+main (int argc, const char *argv[])
+{
+    return fuzzer_test_main ("matrix", 20000,
+			     0xBEBF98C3,
+			     test_matrix, argc, argv);
+}