xserver libX11 libxcb pixman mesa git update 20 Mar 2011

author: marha <marha@users.sourceforge.net> 2011-03-20 16:32:44 +0000
committer: marha <marha@users.sourceforge.net> 2011-03-20 16:32:44 +0000
commit: eca5dee9e7a8dea1edba4d10b60444ac0e884139 (patch)
tree: 67c0e6552d06cb59b33ef79ece38d6581b2c8976 /pixman
parent: d7f1bd4112420f1d4b41c5409074eca6b34bf507 (diff)
download: vcxsrv-eca5dee9e7a8dea1edba4d10b60444ac0e884139.tar.gz
vcxsrv-eca5dee9e7a8dea1edba4d10b60444ac0e884139.tar.bz2
vcxsrv-eca5dee9e7a8dea1edba4d10b60444ac0e884139.zip
9 files changed, 9319 insertions, 9398 deletions
diff --git a/pixman/pixman/pixman-bits-image.c b/pixman/pixman/pixman-bits-image.c
index a865d719a..88c2f0eea 100644
--- a/pixman/pixman/pixman-bits-image.c
+++ b/pixman/pixman/pixman-bits-image.c
@@ -1,1608 +1,1602 @@
-/*
- * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
- *             2005 Lars Knoll & Zack Rusin, Trolltech
- *             2008 Aaron Plattner, NVIDIA Corporation
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007, 2009 Red Hat, Inc.
- * Copyright © 2008 André Tupinambá <andrelrt@gmail.com>
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Keith Packard not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission.  Keith Packard makes no
- * representations about the suitability of this software for any purpose.  It
- * is provided "as is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "pixman-private.h"
-#include "pixman-combine32.h"
-
-/*
- * By default, just evaluate the image at 32bpp and expand.  Individual image
- * types can plug in a better scanline getter if they want to. For example
- * we  could produce smoother gradients by evaluating them at higher color
- * depth, but that's a project for the future.
- */
-static void
-_pixman_image_get_scanline_generic_64 (pixman_image_t * image,
-                                       int              x,
-                                       int              y,
-                                       int              width,
-                                       uint32_t *       buffer,
-                                       const uint32_t * mask)
-{
-    uint32_t *mask8 = NULL;
-
-    /* Contract the mask image, if one exists, so that the 32-bit fetch
-     * function can use it.
-     */
-    if (mask)
-    {
-	mask8 = pixman_malloc_ab (width, sizeof(uint32_t));
-	if (!mask8)
-	    return;
-
-	pixman_contract (mask8, (uint64_t *)mask, width);
-    }
-
-    /* Fetch the source image into the first half of buffer. */
-    image->bits.get_scanline_32 (image, x, y, width, (uint32_t*)buffer, mask8);
-
-    /* Expand from 32bpp to 64bpp in place. */
-    pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, width);
-
-    free (mask8);
-}
-
-/* Fetch functions */
-
-static force_inline uint32_t
-fetch_pixel_no_alpha (bits_image_t *image,
-		      int x, int y, pixman_bool_t check_bounds)
-{
-    if (check_bounds &&
-	(x < 0 || x >= image->width || y < 0 || y >= image->height))
-    {
-	return 0;
-    }
-
-    return image->fetch_pixel_32 (image, x, y);
-}
-
-typedef uint32_t (* get_pixel_t) (bits_image_t *image,
-				  int x, int y, pixman_bool_t check_bounds);
-
-static force_inline void
-repeat (pixman_repeat_t repeat, int size, int *coord)
-{
-    switch (repeat)
-    {
-    case PIXMAN_REPEAT_NORMAL:
-	*coord = MOD (*coord, size);
-	break;
-
-    case PIXMAN_REPEAT_PAD:
-	*coord = CLIP (*coord, 0, size - 1);
-	break;
-
-    case PIXMAN_REPEAT_REFLECT:
-	*coord = MOD (*coord, size * 2);
-
-	if (*coord >= size)
-	    *coord = size * 2 - *coord - 1;
-	break;
-
-    case PIXMAN_REPEAT_NONE:
-	break;
-
-    default:
-        break;
-    }
-}
-
-static force_inline uint32_t
-bits_image_fetch_pixel_nearest (bits_image_t   *image,
-				pixman_fixed_t  x,
-				pixman_fixed_t  y,
-				get_pixel_t	get_pixel)
-{
-    int x0 = pixman_fixed_to_int (x - pixman_fixed_e);
-    int y0 = pixman_fixed_to_int (y - pixman_fixed_e);
-
-    if (image->common.repeat != PIXMAN_REPEAT_NONE)
-    {
-	repeat (image->common.repeat, image->width, &x0);
-	repeat (image->common.repeat, image->height, &y0);
-
-	return get_pixel (image, x0, y0, FALSE);
-    }
-    else
-    {
-	return get_pixel (image, x0, y0, TRUE);
-    }
-}
-
-#if SIZEOF_LONG > 4
-
-static force_inline uint32_t
-bilinear_interpolation (uint32_t tl, uint32_t tr,
-			uint32_t bl, uint32_t br,
-			int distx, int disty)
-{
-    uint64_t distxy, distxiy, distixy, distixiy;
-    uint64_t tl64, tr64, bl64, br64;
-    uint64_t f, r;
-
-    distxy = distx * disty;
-    distxiy = distx * (256 - disty);
-    distixy = (256 - distx) * disty;
-    distixiy = (256 - distx) * (256 - disty);
-
-    /* Alpha and Blue */
-    tl64 = tl & 0xff0000ff;
-    tr64 = tr & 0xff0000ff;
-    bl64 = bl & 0xff0000ff;
-    br64 = br & 0xff0000ff;
-
-    f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
-    r = f & 0x0000ff0000ff0000ull;
-
-    /* Red and Green */
-    tl64 = tl;
-    tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull);
-
-    tr64 = tr;
-    tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull);
-
-    bl64 = bl;
-    bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull);
-
-    br64 = br;
-    br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull);
-
-    f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
-    r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull);
-
-    return (uint32_t)(r >> 16);
-}
-
-#else
-
-static force_inline uint32_t
-bilinear_interpolation (uint32_t tl, uint32_t tr,
-			uint32_t bl, uint32_t br,
-			int distx, int disty)
-{
-    int distxy, distxiy, distixy, distixiy;
-    uint32_t f, r;
-
-    distxy = distx * disty;
-    distxiy = (distx << 8) - distxy;	/* distx * (256 - disty) */
-    distixy = (disty << 8) - distxy;	/* disty * (256 - distx) */
-    distixiy =
-	256 * 256 - (disty << 8) -
-	(distx << 8) + distxy;		/* (256 - distx) * (256 - disty) */
-
-    /* Blue */
-    r = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
-      + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy;
-
-    /* Green */
-    f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
-      + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy;
-    r |= f & 0xff000000;
-
-    tl >>= 16;
-    tr >>= 16;
-    bl >>= 16;
-    br >>= 16;
-    r >>= 16;
-
-    /* Red */
-    f = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
-      + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy;
-    r |= f & 0x00ff0000;
-
-    /* Alpha */
-    f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
-      + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy;
-    r |= f & 0xff000000;
-
-    return r;
-}
-
-#endif
-
-static force_inline uint32_t
-bits_image_fetch_pixel_bilinear (bits_image_t   *image,
-				 pixman_fixed_t  x,
-				 pixman_fixed_t  y,
-				 get_pixel_t	 get_pixel)
-{
-    pixman_repeat_t repeat_mode = image->common.repeat;
-    int width = image->width;
-    int height = image->height;
-    int x1, y1, x2, y2;
-    uint32_t tl, tr, bl, br;
-    int32_t distx, disty;
-
-    x1 = x - pixman_fixed_1 / 2;
-    y1 = y - pixman_fixed_1 / 2;
-
-    distx = (x1 >> 8) & 0xff;
-    disty = (y1 >> 8) & 0xff;
-
-    x1 = pixman_fixed_to_int (x1);
-    y1 = pixman_fixed_to_int (y1);
-    x2 = x1 + 1;
-    y2 = y1 + 1;
-
-    if (repeat_mode != PIXMAN_REPEAT_NONE)
-    {
-	repeat (repeat_mode, width, &x1);
-	repeat (repeat_mode, height, &y1);
-	repeat (repeat_mode, width, &x2);
-	repeat (repeat_mode, height, &y2);
-
-	tl = get_pixel (image, x1, y1, FALSE);
-	bl = get_pixel (image, x1, y2, FALSE);
-	tr = get_pixel (image, x2, y1, FALSE);
-	br = get_pixel (image, x2, y2, FALSE);
-    }
-    else
-    {
-	tl = get_pixel (image, x1, y1, TRUE);
-	tr = get_pixel (image, x2, y1, TRUE);
-	bl = get_pixel (image, x1, y2, TRUE);
-	br = get_pixel (image, x2, y2, TRUE);
-    }
-
-    return bilinear_interpolation (tl, tr, bl, br, distx, disty);
-}
-
-static void
-bits_image_fetch_bilinear_no_repeat_8888 (pixman_image_t * ima,
-					  int              offset,
-					  int              line,
-					  int              width,
-					  uint32_t *       buffer,
-					  const uint32_t * mask)
-{
-    bits_image_t *bits = &ima->bits;
-    pixman_fixed_t x_top, x_bottom, x;
-    pixman_fixed_t ux_top, ux_bottom, ux;
-    pixman_vector_t v;
-    uint32_t top_mask, bottom_mask;
-    uint32_t *top_row;
-    uint32_t *bottom_row;
-    uint32_t *end;
-    uint32_t zero[2] = { 0, 0 };
-    uint32_t one = 1;
-    int y, y1, y2;
-    int disty;
-    int mask_inc;
-    int w;
-
-    /* reference point is the center of the pixel */
-    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    if (!pixman_transform_point_3d (bits->common.transform, &v))
-	return;
-
-    ux = ux_top = ux_bottom = bits->common.transform->matrix[0][0];
-    x = x_top = x_bottom = v.vector[0] - pixman_fixed_1/2;
-
-    y = v.vector[1] - pixman_fixed_1/2;
-    disty = (y >> 8) & 0xff;
-
-    /* Load the pointers to the first and second lines from the source
-     * image that bilinear code must read.
-     *
-     * The main trick in this code is about the check if any line are
-     * outside of the image;
-     *
-     * When I realize that a line (any one) is outside, I change
-     * the pointer to a dummy area with zeros. Once I change this, I
-     * must be sure the pointer will not change, so I set the
-     * variables to each pointer increments inside the loop.
-     */
-    y1 = pixman_fixed_to_int (y);
-    y2 = y1 + 1;
-
-    if (y1 < 0 || y1 >= bits->height)
-    {
-	top_row = zero;
-	x_top = 0;
-	ux_top = 0;
-    }
-    else
-    {
-	top_row = bits->bits + y1 * bits->rowstride;
-	x_top = x;
-	ux_top = ux;
-    }
-
-    if (y2 < 0 || y2 >= bits->height)
-    {
-	bottom_row = zero;
-	x_bottom = 0;
-	ux_bottom = 0;
-    }
-    else
-    {
-	bottom_row = bits->bits + y2 * bits->rowstride;
-	x_bottom = x;
-	ux_bottom = ux;
-    }
-
-    /* Instead of checking whether the operation uses the mast in
-     * each loop iteration, verify this only once and prepare the
-     * variables to make the code smaller inside the loop.
-     */
-    if (!mask)
-    {
-        mask_inc = 0;
-        mask = &one;
-    }
-    else
-    {
-        /* If have a mask, prepare the variables to check it */
-        mask_inc = 1;
-    }
-
-    /* If both are zero, then the whole thing is zero */
-    if (top_row == zero && bottom_row == zero)
-    {
-	memset (buffer, 0, width * sizeof (uint32_t));
-	return;
-    }
-    else if (bits->format == PIXMAN_x8r8g8b8)
-    {
-	if (top_row == zero)
-	{
-	    top_mask = 0;
-	    bottom_mask = 0xff000000;
-	}
-	else if (bottom_row == zero)
-	{
-	    top_mask = 0xff000000;
-	    bottom_mask = 0;
-	}
-	else
-	{
-	    top_mask = 0xff000000;
-	    bottom_mask = 0xff000000;
-	}
-    }
-    else
-    {
-	top_mask = 0;
-	bottom_mask = 0;
-    }
-
-    end = buffer + width;
-
-    /* Zero fill to the left of the image */
-    while (buffer < end && x < pixman_fixed_minus_1)
-    {
-	*buffer++ = 0;
-	x += ux;
-	x_top += ux_top;
-	x_bottom += ux_bottom;
-	mask += mask_inc;
-    }
-
-    /* Left edge
-     */
-    while (buffer < end && x < 0)
-    {
-	uint32_t tr, br;
-	int32_t distx;
-
-	tr = top_row[pixman_fixed_to_int (x_top) + 1] | top_mask;
-	br = bottom_row[pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
-
-	distx = (x >> 8) & 0xff;
-
-	*buffer++ = bilinear_interpolation (0, tr, 0, br, distx, disty);
-
-	x += ux;
-	x_top += ux_top;
-	x_bottom += ux_bottom;
-	mask += mask_inc;
-    }
-
-    /* Main part */
-    w = pixman_int_to_fixed (bits->width - 1);
-
-    while (buffer < end  &&  x < w)
-    {
-	if (*mask)
-	{
-	    uint32_t tl, tr, bl, br;
-	    int32_t distx;
-
-	    tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
-	    tr = top_row [pixman_fixed_to_int (x_top) + 1] | top_mask;
-	    bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
-	    br = bottom_row [pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
-
-	    distx = (x >> 8) & 0xff;
-
-	    *buffer = bilinear_interpolation (tl, tr, bl, br, distx, disty);
-	}
-
-	buffer++;
-	x += ux;
-	x_top += ux_top;
-	x_bottom += ux_bottom;
-	mask += mask_inc;
-    }
-
-    /* Right Edge */
-    w = pixman_int_to_fixed (bits->width);
-    while (buffer < end  &&  x < w)
-    {
-	if (*mask)
-	{
-	    uint32_t tl, bl;
-	    int32_t distx;
-
-	    tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
-	    bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
-
-	    distx = (x >> 8) & 0xff;
-
-	    *buffer = bilinear_interpolation (tl, 0, bl, 0, distx, disty);
-	}
-
-	buffer++;
-	x += ux;
-	x_top += ux_top;
-	x_bottom += ux_bottom;
-	mask += mask_inc;
-    }
-
-    /* Zero fill to the left of the image */
-    while (buffer < end)
-	*buffer++ = 0;
-}
-
-static force_inline uint32_t
-bits_image_fetch_pixel_convolution (bits_image_t   *image,
-				    pixman_fixed_t  x,
-				    pixman_fixed_t  y,
-				    get_pixel_t     get_pixel)
-{
-    pixman_fixed_t *params = image->common.filter_params;
-    int x_off = (params[0] - pixman_fixed_1) >> 1;
-    int y_off = (params[1] - pixman_fixed_1) >> 1;
-    int32_t cwidth = pixman_fixed_to_int (params[0]);
-    int32_t cheight = pixman_fixed_to_int (params[1]);
-    int32_t srtot, sgtot, sbtot, satot;
-    int32_t i, j, x1, x2, y1, y2;
-    pixman_repeat_t repeat_mode = image->common.repeat;
-    int width = image->width;
-    int height = image->height;
-
-    params += 2;
-
-    x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off);
-    y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off);
-    x2 = x1 + cwidth;
-    y2 = y1 + cheight;
-
-    srtot = sgtot = sbtot = satot = 0;
-
-    for (i = y1; i < y2; ++i)
-    {
-	for (j = x1; j < x2; ++j)
-	{
-	    int rx = j;
-	    int ry = i;
-
-	    pixman_fixed_t f = *params;
-
-	    if (f)
-	    {
-		uint32_t pixel;
-
-		if (repeat_mode != PIXMAN_REPEAT_NONE)
-		{
-		    repeat (repeat_mode, width, &rx);
-		    repeat (repeat_mode, height, &ry);
-
-		    pixel = get_pixel (image, rx, ry, FALSE);
-		}
-		else
-		{
-		    pixel = get_pixel (image, rx, ry, TRUE);
-		}
-
-		srtot += RED_8 (pixel) * f;
-		sgtot += GREEN_8 (pixel) * f;
-		sbtot += BLUE_8 (pixel) * f;
-		satot += ALPHA_8 (pixel) * f;
-	    }
-
-	    params++;
-	}
-    }
-
-    satot >>= 16;
-    srtot >>= 16;
-    sgtot >>= 16;
-    sbtot >>= 16;
-
-    satot = CLIP (satot, 0, 0xff);
-    srtot = CLIP (srtot, 0, 0xff);
-    sgtot = CLIP (sgtot, 0, 0xff);
-    sbtot = CLIP (sbtot, 0, 0xff);
-
-    return ((satot << 24) | (srtot << 16) | (sgtot <<  8) | (sbtot));
-}
-
-static force_inline uint32_t
-bits_image_fetch_pixel_filtered (bits_image_t *image,
-				 pixman_fixed_t x,
-				 pixman_fixed_t y,
-				 get_pixel_t    get_pixel)
-{
-    switch (image->common.filter)
-    {
-    case PIXMAN_FILTER_NEAREST:
-    case PIXMAN_FILTER_FAST:
-	return bits_image_fetch_pixel_nearest (image, x, y, get_pixel);
-	break;
-
-    case PIXMAN_FILTER_BILINEAR:
-    case PIXMAN_FILTER_GOOD:
-    case PIXMAN_FILTER_BEST:
-	return bits_image_fetch_pixel_bilinear (image, x, y, get_pixel);
-	break;
-
-    case PIXMAN_FILTER_CONVOLUTION:
-	return bits_image_fetch_pixel_convolution (image, x, y, get_pixel);
-	break;
-
-    default:
-        break;
-    }
-
-    return 0;
-}
-
-static void
-bits_image_fetch_affine_no_alpha (pixman_image_t * image,
-				  int              offset,
-				  int              line,
-				  int              width,
-				  uint32_t *       buffer,
-				  const uint32_t * mask)
-{
-    pixman_fixed_t x, y;
-    pixman_fixed_t ux, uy;
-    pixman_vector_t v;
-    int i;
-
-    /* reference point is the center of the pixel */
-    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    if (image->common.transform)
-    {
-	if (!pixman_transform_point_3d (image->common.transform, &v))
-	    return;
-
-	ux = image->common.transform->matrix[0][0];
-	uy = image->common.transform->matrix[1][0];
-    }
-    else
-    {
-	ux = pixman_fixed_1;
-	uy = 0;
-    }
-
-    x = v.vector[0];
-    y = v.vector[1];
-
-    for (i = 0; i < width; ++i)
-    {
-	if (!mask || mask[i])
-	{
-	    buffer[i] = bits_image_fetch_pixel_filtered (
-		&image->bits, x, y, fetch_pixel_no_alpha);
-	}
-
-	x += ux;
-	y += uy;
-    }
-}
-
-/* General fetcher */
-static force_inline uint32_t
-fetch_pixel_general (bits_image_t *image, int x, int y, pixman_bool_t check_bounds)
-{
-    uint32_t pixel;
-
-    if (check_bounds &&
-	(x < 0 || x >= image->width || y < 0 || y >= image->height))
-    {
-	return 0;
-    }
-
-    pixel = image->fetch_pixel_32 (image, x, y);
-
-    if (image->common.alpha_map)
-    {
-	uint32_t pixel_a;
-
-	x -= image->common.alpha_origin_x;
-	y -= image->common.alpha_origin_y;
-
-	if (x < 0 || x >= image->common.alpha_map->width ||
-	    y < 0 || y >= image->common.alpha_map->height)
-	{
-	    pixel_a = 0;
-	}
-	else
-	{
-	    pixel_a = image->common.alpha_map->fetch_pixel_32 (
-		image->common.alpha_map, x, y);
-
-	    pixel_a = ALPHA_8 (pixel_a);
-	}
-
-	pixel &= 0x00ffffff;
-	pixel |= (pixel_a << 24);
-    }
-
-    return pixel;
-}
-
-static void
-bits_image_fetch_general (pixman_image_t * image,
-			  int              offset,
-			  int              line,
-			  int              width,
-			  uint32_t *       buffer,
-			  const uint32_t * mask)
-{
-    pixman_fixed_t x, y, w;
-    pixman_fixed_t ux, uy, uw;
-    pixman_vector_t v;
-    int i;
-
-    /* reference point is the center of the pixel */
-    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    if (image->common.transform)
-    {
-	if (!pixman_transform_point_3d (image->common.transform, &v))
-	    return;
-
-	ux = image->common.transform->matrix[0][0];
-	uy = image->common.transform->matrix[1][0];
-	uw = image->common.transform->matrix[2][0];
-    }
-    else
-    {
-	ux = pixman_fixed_1;
-	uy = 0;
-	uw = 0;
-    }
-
-    x = v.vector[0];
-    y = v.vector[1];
-    w = v.vector[2];
-
-    for (i = 0; i < width; ++i)
-    {
-	pixman_fixed_t x0, y0;
-
-	if (!mask || mask[i])
-	{
-	    if (w != 0)
-	    {
-		x0 = ((pixman_fixed_48_16_t)x << 16) / w;
-		y0 = ((pixman_fixed_48_16_t)y << 16) / w;
-	    }
-	    else
-	    {
-		x0 = 0;
-		y0 = 0;
-	    }
-
-	    buffer[i] = bits_image_fetch_pixel_filtered (
-		&image->bits, x0, y0, fetch_pixel_general);
-	}
-
-	x += ux;
-	y += uy;
-	w += uw;
-    }
-}
-
-static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
-
-typedef uint32_t (* convert_pixel_t) (const uint8_t *row, int x);
-
-static force_inline void
-bits_image_fetch_bilinear_affine (pixman_image_t * image,
-				  int              offset,
-				  int              line,
-				  int              width,
-				  uint32_t *       buffer,
-				  const uint32_t * mask,
-
-				  convert_pixel_t	convert_pixel,
-				  pixman_format_code_t	format,
-				  pixman_repeat_t	repeat_mode)
-{
-    pixman_fixed_t x, y;
-    pixman_fixed_t ux, uy;
-    pixman_vector_t v;
-    bits_image_t *bits = &image->bits;
-    int i;
-
-    /* reference point is the center of the pixel */
-    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    if (!pixman_transform_point_3d (image->common.transform, &v))
-	return;
-
-    ux = image->common.transform->matrix[0][0];
-    uy = image->common.transform->matrix[1][0];
-
-    x = v.vector[0];
-    y = v.vector[1];
-
-    for (i = 0; i < width; ++i)
-    {
-	int x1, y1, x2, y2;
-	uint32_t tl, tr, bl, br;
-	int32_t distx, disty;
-	int width = image->bits.width;
-	int height = image->bits.height;
-	const uint8_t *row1;
-	const uint8_t *row2;
-
-	if (mask && !mask[i])
-	    goto next;
-
-	x1 = x - pixman_fixed_1 / 2;
-	y1 = y - pixman_fixed_1 / 2;
-
-	distx = (x1 >> 8) & 0xff;
-	disty = (y1 >> 8) & 0xff;
-
-	y1 = pixman_fixed_to_int (y1);
-	y2 = y1 + 1;
-	x1 = pixman_fixed_to_int (x1);
-	x2 = x1 + 1;
-
-	if (repeat_mode != PIXMAN_REPEAT_NONE)
-	{
-	    uint32_t mask;
-
-	    mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
-
-	    repeat (repeat_mode, width, &x1);
-	    repeat (repeat_mode, height, &y1);
-	    repeat (repeat_mode, width, &x2);
-	    repeat (repeat_mode, height, &y2);
-
-	    row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
-	    row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
-
-	    tl = convert_pixel (row1, x1) | mask;
-	    tr = convert_pixel (row1, x2) | mask;
-	    bl = convert_pixel (row2, x1) | mask;
-	    br = convert_pixel (row2, x2) | mask;
-	}
-	else
-	{
-	    uint32_t mask1, mask2;
-	    int bpp;
-
-	    /* Note: PIXMAN_FORMAT_BPP() returns an unsigned value,
-	     * which means if you use it in expressions, those
-	     * expressions become unsigned themselves. Since
-	     * the variables below can be negative in some cases,
-	     * that will lead to crashes on 64 bit architectures.
-	     *
-	     * So this line makes sure bpp is signed
-	     */
-	    bpp = PIXMAN_FORMAT_BPP (format);
-
-	    if (x1 >= width || x2 < 0 || y1 >= height || y2 < 0)
-	    {
-		buffer[i] = 0;
-		goto next;
-	    }
-
-	    if (y2 == 0)
-	    {
-		row1 = zero;
-		mask1 = 0;
-	    }
-	    else
-	    {
-		row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
-		row1 += bpp / 8 * x1;
-
-		mask1 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
-	    }
-
-	    if (y1 == height - 1)
-	    {
-		row2 = zero;
-		mask2 = 0;
-	    }
-	    else
-	    {
-		row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
-		row2 += bpp / 8 * x1;
-
-		mask2 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
-	    }
-
-	    if (x2 == 0)
-	    {
-		tl = 0;
-		bl = 0;
-	    }
-	    else
-	    {
-		tl = convert_pixel (row1, 0) | mask1;
-		bl = convert_pixel (row2, 0) | mask2;
-	    }
-
-	    if (x1 == width - 1)
-	    {
-		tr = 0;
-		br = 0;
-	    }
-	    else
-	    {
-		tr = convert_pixel (row1, 1) | mask1;
-		br = convert_pixel (row2, 1) | mask2;
-	    }
-	}
-
-	buffer[i] = bilinear_interpolation (
-	    tl, tr, bl, br, distx, disty);
-
-    next:
-	x += ux;
-	y += uy;
-    }
-}
-
-static force_inline void
-bits_image_fetch_nearest_affine (pixman_image_t * image,
-				 int              offset,
-				 int              line,
-				 int              width,
-				 uint32_t *       buffer,
-				 const uint32_t * mask,
-				 
-				 convert_pixel_t	convert_pixel,
-				 pixman_format_code_t	format,
-				 pixman_repeat_t	repeat_mode)
-{
-    pixman_fixed_t x, y;
-    pixman_fixed_t ux, uy;
-    pixman_vector_t v;
-    bits_image_t *bits = &image->bits;
-    int i;
-
-    /* reference point is the center of the pixel */
-    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    if (!pixman_transform_point_3d (image->common.transform, &v))
-	return;
-
-    ux = image->common.transform->matrix[0][0];
-    uy = image->common.transform->matrix[1][0];
-
-    x = v.vector[0];
-    y = v.vector[1];
-
-    for (i = 0; i < width; ++i)
-    {
-	int width, height, x0, y0;
-	const uint8_t *row;
-
-	if (mask && !mask[i])
-	    goto next;
-	
-	width = image->bits.width;
-	height = image->bits.height;
-	x0 = pixman_fixed_to_int (x - pixman_fixed_e);
-	y0 = pixman_fixed_to_int (y - pixman_fixed_e);
-
-	if (repeat_mode == PIXMAN_REPEAT_NONE &&
-	    (y0 < 0 || y0 >= height || x0 < 0 || x0 >= width))
-	{
-	    buffer[i] = 0;
-	}
-	else
-	{
-	    uint32_t mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
-
-	    if (repeat_mode != PIXMAN_REPEAT_NONE)
-	    {
-		repeat (repeat_mode, width, &x0);
-		repeat (repeat_mode, height, &y0);
-	    }
-
-	    row = (uint8_t *)bits->bits + bits->rowstride * 4 * y0;
-
-	    buffer[i] = convert_pixel (row, x0) | mask;
-	}
-
-    next:
-	x += ux;
-	y += uy;
-    }
-}
-
-static force_inline uint32_t
-convert_a8r8g8b8 (const uint8_t *row, int x)
-{
-    return *(((uint32_t *)row) + x);
-}
-
-static force_inline uint32_t
-convert_x8r8g8b8 (const uint8_t *row, int x)
-{
-    return *(((uint32_t *)row) + x);
-}
-
-static force_inline uint32_t
-convert_a8 (const uint8_t *row, int x)
-{
-    return *(row + x) << 24;
-}
-
-static force_inline uint32_t
-convert_r5g6b5 (const uint8_t *row, int x)
-{
-    return CONVERT_0565_TO_0888 (*((uint16_t *)row + x));
-}
-
-#define MAKE_BILINEAR_FETCHER(name, format, repeat_mode)		\
-    static void								\
-    bits_image_fetch_bilinear_affine_ ## name (pixman_image_t *image,	\
-					       int              offset,	\
-					       int              line,	\
-					       int              width,	\
-					       uint32_t *       buffer,	\
-					       const uint32_t * mask)	\
-    {									\
-	bits_image_fetch_bilinear_affine (image, offset, line,		\
-					  width, buffer, mask,		\
-					  convert_ ## format,		\
-					  PIXMAN_ ## format,		\
-					  repeat_mode);			\
-    }
-
-#define MAKE_NEAREST_FETCHER(name, format, repeat_mode)			\
-    static void								\
-    bits_image_fetch_nearest_affine_ ## name (pixman_image_t *image,	\
-					      int              offset,	\
-					      int              line,	\
-					      int              width,	\
-					      uint32_t *       buffer,	\
-					      const uint32_t * mask)	\
-    {									\
-	bits_image_fetch_nearest_affine (image, offset, line,		\
-					 width, buffer, mask,		\
-					 convert_ ## format,		\
-					 PIXMAN_ ## format,		\
-					 repeat_mode);			\
-    }
-
-#define MAKE_FETCHERS(name, format, repeat_mode)			\
-    MAKE_NEAREST_FETCHER (name, format, repeat_mode)			\
-    MAKE_BILINEAR_FETCHER (name, format, repeat_mode)
-
-MAKE_FETCHERS (pad_a8r8g8b8,     a8r8g8b8, PIXMAN_REPEAT_PAD)
-MAKE_FETCHERS (none_a8r8g8b8,    a8r8g8b8, PIXMAN_REPEAT_NONE)
-MAKE_FETCHERS (reflect_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_REFLECT)
-MAKE_FETCHERS (normal_a8r8g8b8,  a8r8g8b8, PIXMAN_REPEAT_NORMAL)
-MAKE_FETCHERS (pad_x8r8g8b8,     x8r8g8b8, PIXMAN_REPEAT_PAD)
-MAKE_FETCHERS (none_x8r8g8b8,    x8r8g8b8, PIXMAN_REPEAT_NONE)
-MAKE_FETCHERS (reflect_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_REFLECT)
-MAKE_FETCHERS (normal_x8r8g8b8,  x8r8g8b8, PIXMAN_REPEAT_NORMAL)
-MAKE_FETCHERS (pad_a8,           a8,       PIXMAN_REPEAT_PAD)
-MAKE_FETCHERS (none_a8,          a8,       PIXMAN_REPEAT_NONE)
-MAKE_FETCHERS (reflect_a8,	 a8,       PIXMAN_REPEAT_REFLECT)
-MAKE_FETCHERS (normal_a8,	 a8,       PIXMAN_REPEAT_NORMAL)
-MAKE_FETCHERS (pad_r5g6b5,       r5g6b5,   PIXMAN_REPEAT_PAD)
-MAKE_FETCHERS (none_r5g6b5,      r5g6b5,   PIXMAN_REPEAT_NONE)
-MAKE_FETCHERS (reflect_r5g6b5,   r5g6b5,   PIXMAN_REPEAT_REFLECT)
-MAKE_FETCHERS (normal_r5g6b5,    r5g6b5,   PIXMAN_REPEAT_NORMAL)
-
-static void
-bits_image_fetch_solid_32 (pixman_image_t * image,
-                           int              x,
-                           int              y,
-                           int              width,
-                           uint32_t *       buffer,
-                           const uint32_t * mask)
-{
-    uint32_t color;
-    uint32_t *end;
-
-    color = image->bits.fetch_pixel_32 (&image->bits, 0, 0);
-
-    end = buffer + width;
-    while (buffer < end)
-	*(buffer++) = color;
-}
-
-static void
-bits_image_fetch_solid_64 (pixman_image_t * image,
-                           int              x,
-                           int              y,
-                           int              width,
-                           uint32_t *       b,
-                           const uint32_t * unused)
-{
-    uint64_t color;
-    uint64_t *buffer = (uint64_t *)b;
-    uint64_t *end;
-
-    color = image->bits.fetch_pixel_64 (&image->bits, 0, 0);
-
-    end = buffer + width;
-    while (buffer < end)
-	*(buffer++) = color;
-}
-
-static void
-bits_image_fetch_untransformed_repeat_none (bits_image_t *image,
-                                            pixman_bool_t wide,
-                                            int           x,
-                                            int           y,
-                                            int           width,
-                                            uint32_t *    buffer)
-{
-    uint32_t w;
-
-    if (y < 0 || y >= image->height)
-    {
-	memset (buffer, 0, width * (wide? 8 : 4));
-	return;
-    }
-
-    if (x < 0)
-    {
-	w = MIN (width, -x);
-
-	memset (buffer, 0, w * (wide ? 8 : 4));
-
-	width -= w;
-	buffer += w * (wide? 2 : 1);
-	x += w;
-    }
-
-    if (x < image->width)
-    {
-	w = MIN (width, image->width - x);
-
-	if (wide)
-	    image->fetch_scanline_64 ((pixman_image_t *)image, x, y, w, buffer, NULL);
-	else
-	    image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL);
-
-	width -= w;
-	buffer += w * (wide? 2 : 1);
-	x += w;
-    }
-
-    memset (buffer, 0, width * (wide ? 8 : 4));
-}
-
-static void
-bits_image_fetch_untransformed_repeat_normal (bits_image_t *image,
-                                              pixman_bool_t wide,
-                                              int           x,
-                                              int           y,
-                                              int           width,
-                                              uint32_t *    buffer)
-{
-    uint32_t w;
-
-    while (y < 0)
-	y += image->height;
-
-    while (y >= image->height)
-	y -= image->height;
-
-    while (width)
-    {
-	while (x < 0)
-	    x += image->width;
-	while (x >= image->width)
-	    x -= image->width;
-
-	w = MIN (width, image->width - x);
-
-	if (wide)
-	    image->fetch_scanline_64 ((pixman_image_t *)image, x, y, w, buffer, NULL);
-	else
-	    image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL);
-
-	buffer += w * (wide? 2 : 1);
-	x += w;
-	width -= w;
-    }
-}
-
-static void
-bits_image_fetch_untransformed_32 (pixman_image_t * image,
-                                   int              x,
-                                   int              y,
-                                   int              width,
-                                   uint32_t *       buffer,
-                                   const uint32_t * mask)
-{
-    if (image->common.repeat == PIXMAN_REPEAT_NONE)
-    {
-	bits_image_fetch_untransformed_repeat_none (
-	    &image->bits, FALSE, x, y, width, buffer);
-    }
-    else
-    {
-	bits_image_fetch_untransformed_repeat_normal (
-	    &image->bits, FALSE, x, y, width, buffer);
-    }
-}
-
-static void
-bits_image_fetch_untransformed_64 (pixman_image_t * image,
-                                   int              x,
-                                   int              y,
-                                   int              width,
-                                   uint32_t *       buffer,
-                                   const uint32_t * unused)
-{
-    if (image->common.repeat == PIXMAN_REPEAT_NONE)
-    {
-	bits_image_fetch_untransformed_repeat_none (
-	    &image->bits, TRUE, x, y, width, buffer);
-    }
-    else
-    {
-	bits_image_fetch_untransformed_repeat_normal (
-	    &image->bits, TRUE, x, y, width, buffer);
-    }
-}
-
-typedef struct
-{
-    pixman_format_code_t	format;
-    uint32_t			flags;
-    fetch_scanline_t		fetch_32;
-    fetch_scanline_t		fetch_64;
-} fetcher_info_t;
-
-static const fetcher_info_t fetcher_info[] =
-{
-    { PIXMAN_solid,
-      FAST_PATH_NO_ALPHA_MAP,
-      bits_image_fetch_solid_32,
-      bits_image_fetch_solid_64
-    },
-
-    { PIXMAN_any,
-      (FAST_PATH_NO_ALPHA_MAP			|
-       FAST_PATH_ID_TRANSFORM			|
-       FAST_PATH_NO_CONVOLUTION_FILTER		|
-       FAST_PATH_NO_PAD_REPEAT			|
-       FAST_PATH_NO_REFLECT_REPEAT),
-      bits_image_fetch_untransformed_32,
-      bits_image_fetch_untransformed_64
-    },
-
-#define FAST_BILINEAR_FLAGS						\
-    (FAST_PATH_NO_ALPHA_MAP		|				\
-     FAST_PATH_NO_ACCESSORS		|				\
-     FAST_PATH_HAS_TRANSFORM		|				\
-     FAST_PATH_AFFINE_TRANSFORM		|				\
-     FAST_PATH_X_UNIT_POSITIVE		|				\
-     FAST_PATH_Y_UNIT_ZERO		|				\
-     FAST_PATH_NONE_REPEAT		|				\
-     FAST_PATH_BILINEAR_FILTER)
-
-    { PIXMAN_a8r8g8b8,
-      FAST_BILINEAR_FLAGS,
-      bits_image_fetch_bilinear_no_repeat_8888,
-      _pixman_image_get_scanline_generic_64
-    },
-
-    { PIXMAN_x8r8g8b8,
-      FAST_BILINEAR_FLAGS,
-      bits_image_fetch_bilinear_no_repeat_8888,
-      _pixman_image_get_scanline_generic_64
-    },
-
-#define GENERAL_BILINEAR_FLAGS						\
-    (FAST_PATH_NO_ALPHA_MAP		|				\
-     FAST_PATH_NO_ACCESSORS		|				\
-     FAST_PATH_HAS_TRANSFORM		|				\
-     FAST_PATH_AFFINE_TRANSFORM		|				\
-     FAST_PATH_BILINEAR_FILTER)
-
-#define GENERAL_NEAREST_FLAGS						\
-    (FAST_PATH_NO_ALPHA_MAP		|				\
-     FAST_PATH_NO_ACCESSORS		|				\
-     FAST_PATH_HAS_TRANSFORM		|				\
-     FAST_PATH_AFFINE_TRANSFORM		|				\
-     FAST_PATH_NEAREST_FILTER)
-
-#define BILINEAR_AFFINE_FAST_PATH(name, format, repeat)			\
-    { PIXMAN_ ## format,						\
-      GENERAL_BILINEAR_FLAGS | FAST_PATH_ ## repeat ## _REPEAT,		\
-      bits_image_fetch_bilinear_affine_ ## name,			\
-      _pixman_image_get_scanline_generic_64				\
-    },
-
-#define NEAREST_AFFINE_FAST_PATH(name, format, repeat)			\
-    { PIXMAN_ ## format,						\
-      GENERAL_NEAREST_FLAGS | FAST_PATH_ ## repeat ## _REPEAT,		\
-      bits_image_fetch_nearest_affine_ ## name,			\
-      _pixman_image_get_scanline_generic_64				\
-    },
-
-#define AFFINE_FAST_PATHS(name, format, repeat)				\
-    BILINEAR_AFFINE_FAST_PATH(name, format, repeat)			\
-    NEAREST_AFFINE_FAST_PATH(name, format, repeat)
-    
-    AFFINE_FAST_PATHS (pad_a8r8g8b8, a8r8g8b8, PAD)
-    AFFINE_FAST_PATHS (none_a8r8g8b8, a8r8g8b8, NONE)
-    AFFINE_FAST_PATHS (reflect_a8r8g8b8, a8r8g8b8, REFLECT)
-    AFFINE_FAST_PATHS (normal_a8r8g8b8, a8r8g8b8, NORMAL)
-    AFFINE_FAST_PATHS (pad_x8r8g8b8, x8r8g8b8, PAD)
-    AFFINE_FAST_PATHS (none_x8r8g8b8, x8r8g8b8, NONE)
-    AFFINE_FAST_PATHS (reflect_x8r8g8b8, x8r8g8b8, REFLECT)
-    AFFINE_FAST_PATHS (normal_x8r8g8b8, x8r8g8b8, NORMAL)
-    AFFINE_FAST_PATHS (pad_a8, a8, PAD)
-    AFFINE_FAST_PATHS (none_a8, a8, NONE)
-    AFFINE_FAST_PATHS (reflect_a8, a8, REFLECT)
-    AFFINE_FAST_PATHS (normal_a8, a8, NORMAL)
-    AFFINE_FAST_PATHS (pad_r5g6b5, r5g6b5, PAD)
-    AFFINE_FAST_PATHS (none_r5g6b5, r5g6b5, NONE)
-    AFFINE_FAST_PATHS (reflect_r5g6b5, r5g6b5, REFLECT)
-    AFFINE_FAST_PATHS (normal_r5g6b5, r5g6b5, NORMAL)
-
-    /* Affine, no alpha */
-    { PIXMAN_any,
-      (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_HAS_TRANSFORM | FAST_PATH_AFFINE_TRANSFORM),
-      bits_image_fetch_affine_no_alpha,
-      _pixman_image_get_scanline_generic_64
-    },
-
-    /* General */
-    { PIXMAN_any, 0, bits_image_fetch_general, _pixman_image_get_scanline_generic_64 },
-
-    { PIXMAN_null },
-};
-
-static void
-bits_image_property_changed (pixman_image_t *image)
-{
-    uint32_t flags = image->common.flags;
-    pixman_format_code_t format = image->common.extended_format_code;
-    const fetcher_info_t *info;
-
-    _pixman_bits_image_setup_accessors (&image->bits);
-
-    info = fetcher_info;
-    while (info->format != PIXMAN_null)
-    {
-	if ((info->format == format || info->format == PIXMAN_any)	&&
-	    (info->flags & flags) == info->flags)
-	{
-	    image->bits.get_scanline_32 = info->fetch_32;
-	    image->bits.get_scanline_64 = info->fetch_64;
-	    break;
-	}
-
-	info++;
-    }
-}
-
-static uint32_t *
-src_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
-{
-    iter->image->bits.get_scanline_32 (
-	iter->image, iter->x, iter->y++, iter->width, iter->buffer, mask);
-
-    return iter->buffer;
-}
-
-static uint32_t *
-src_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
-{
-    iter->image->bits.get_scanline_64 (
-	iter->image, iter->x, iter->y++, iter->width, iter->buffer, mask);
-
-    return iter->buffer;
-}
-
-void
-_pixman_bits_image_src_iter_init (pixman_image_t *image,
-				  pixman_iter_t *iter,
-				  int x, int y, int width, int height,
-				  uint8_t *buffer, iter_flags_t flags)
-{
-    if (flags & ITER_NARROW)
-	iter->get_scanline = src_get_scanline_narrow;
-    else
-	iter->get_scanline = src_get_scanline_wide;
-}
-
-static uint32_t *
-dest_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
-{
-    pixman_image_t *image  = iter->image;
-    int             x      = iter->x;
-    int             y      = iter->y;
-    int             width  = iter->width;
-    uint32_t *	    buffer = iter->buffer;
-
-    image->bits.fetch_scanline_32 (image, x, y, width, buffer, mask);
-    if (image->common.alpha_map)
-    {
-	x -= image->common.alpha_origin_x;
-	y -= image->common.alpha_origin_y;
-
-	image->common.alpha_map->fetch_scanline_32 (
-	    (pixman_image_t *)image->common.alpha_map,
-	    x, y, width, buffer, mask);
-    }
-
-    return iter->buffer;
-}
-
-static uint32_t *
-dest_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
-{
-    bits_image_t *  image  = &iter->image->bits;
-    int             x      = iter->x;
-    int             y      = iter->y;
-    int             width  = iter->width;
-    uint32_t *	    buffer = iter->buffer;
-
-    image->fetch_scanline_64 (
-	(pixman_image_t *)image, x, y, width, buffer, mask);
-    if (image->common.alpha_map)
-    {
-	x -= image->common.alpha_origin_x;
-	y -= image->common.alpha_origin_y;
-
-	image->common.alpha_map->fetch_scanline_64 (
-	    (pixman_image_t *)image->common.alpha_map, x, y, width, buffer, mask);
-    }
-
-    return iter->buffer;
-}
-
-static void
-dest_write_back_narrow (pixman_iter_t *iter)
-{
-    bits_image_t *  image  = &iter->image->bits;
-    int             x      = iter->x;
-    int             y      = iter->y;
-    int             width  = iter->width;
-    const uint32_t *buffer = iter->buffer;
-
-    image->store_scanline_32 (image, x, y, width, buffer);
-
-    if (image->common.alpha_map)
-    {
-	x -= image->common.alpha_origin_x;
-	y -= image->common.alpha_origin_y;
-
-	image->common.alpha_map->store_scanline_32 (
-	    image->common.alpha_map, x, y, width, buffer);
-    }
-
-    iter->y++;
-}
-
-static void
-dest_write_back_wide (pixman_iter_t *iter)
-{
-    bits_image_t *  image  = &iter->image->bits;
-    int             x      = iter->x;
-    int             y      = iter->y;
-    int             width  = iter->width;
-    const uint32_t *buffer = iter->buffer;
-
-    image->store_scanline_64 (image, x, y, width, buffer);
-
-    if (image->common.alpha_map)
-    {
-	x -= image->common.alpha_origin_x;
-	y -= image->common.alpha_origin_y;
-
-	image->common.alpha_map->store_scanline_64 (
-	    image->common.alpha_map, x, y, width, buffer);
-    }
-
-    iter->y++;
-}
-
-static void
-dest_write_back_direct (pixman_iter_t *iter)
-{
-    iter->buffer += iter->image->bits.rowstride;
-}
-
-void
-_pixman_bits_image_dest_iter_init (pixman_image_t *image,
-				   pixman_iter_t *iter,
-				   int x, int y, int width, int height,
-				   uint8_t *buffer, iter_flags_t flags)
-{
-    if (flags & ITER_NARROW)
-    {
-	if (((image->common.flags &
-	      (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_NO_ACCESSORS)) ==
-	     (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_NO_ACCESSORS)) &&
-	    (image->bits.format == PIXMAN_a8r8g8b8	||
-	     (image->bits.format == PIXMAN_x8r8g8b8	&&
-	      (flags & ITER_LOCALIZED_ALPHA))))
-	{
-	    iter->buffer = image->bits.bits + y * image->bits.rowstride + x;
-
-	    iter->get_scanline = _pixman_iter_get_scanline_noop;
-	    iter->write_back = dest_write_back_direct;
-	}
-	else
-	{
-	    if ((flags & (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) ==
-		(ITER_IGNORE_RGB | ITER_IGNORE_ALPHA))
-	    {
-		iter->get_scanline = _pixman_iter_get_scanline_noop;
-	    }
-	    else
-	    {
-		iter->get_scanline = dest_get_scanline_narrow;
-	    }
-
-	    iter->write_back = dest_write_back_narrow;
-	}
-    }
-    else
-    {
-	iter->get_scanline = dest_get_scanline_wide;
-	iter->write_back = dest_write_back_wide;
-    }
-}
-
-static uint32_t *
-create_bits (pixman_format_code_t format,
-             int                  width,
-             int                  height,
-             int *                rowstride_bytes)
-{
-    int stride;
-    int buf_size;
-    int bpp;
-
-    /* what follows is a long-winded way, avoiding any possibility of integer
-     * overflows, of saying:
-     * stride = ((width * bpp + 0x1f) >> 5) * sizeof (uint32_t);
-     */
-
-    bpp = PIXMAN_FORMAT_BPP (format);
-    if (pixman_multiply_overflows_int (width, bpp))
-	return NULL;
-
-    stride = width * bpp;
-    if (pixman_addition_overflows_int (stride, 0x1f))
-	return NULL;
-
-    stride += 0x1f;
-    stride >>= 5;
-
-    stride *= sizeof (uint32_t);
-
-    if (pixman_multiply_overflows_int (height, stride))
-	return NULL;
-
-    buf_size = height * stride;
-
-    if (rowstride_bytes)
-	*rowstride_bytes = stride;
-
-    return calloc (buf_size, 1);
-}
-
-PIXMAN_EXPORT pixman_image_t *
-pixman_image_create_bits (pixman_format_code_t format,
-                          int                  width,
-                          int                  height,
-                          uint32_t *           bits,
-                          int                  rowstride_bytes)
-{
-    pixman_image_t *image;
-    uint32_t *free_me = NULL;
-
-    /* must be a whole number of uint32_t's
-     */
-    return_val_if_fail (
-	bits == NULL || (rowstride_bytes % sizeof (uint32_t)) == 0, NULL);
-
-    return_val_if_fail (PIXMAN_FORMAT_BPP (format) >= PIXMAN_FORMAT_DEPTH (format), NULL);
-
-    if (!bits && width && height)
-    {
-	free_me = bits = create_bits (format, width, height, &rowstride_bytes);
-	if (!bits)
-	    return NULL;
-    }
-
-    image = _pixman_image_allocate ();
-
-    if (!image)
-    {
-	if (free_me)
-	    free (free_me);
-
-	return NULL;
-    }
-
-    image->type = BITS;
-    image->bits.format = format;
-    image->bits.width = width;
-    image->bits.height = height;
-    image->bits.bits = bits;
-    image->bits.free_me = free_me;
-    image->bits.read_func = NULL;
-    image->bits.write_func = NULL;
-
-    /* The rowstride is stored in number of uint32_t */
-    image->bits.rowstride = rowstride_bytes / (int) sizeof (uint32_t);
-
-    image->bits.indexed = NULL;
-
-    image->common.property_changed = bits_image_property_changed;
-
-    _pixman_image_reset_clip_region (image);
-
-    return image;
-}
+/*
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ *             2008 Aaron Plattner, NVIDIA Corporation
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007, 2009 Red Hat, Inc.
+ * Copyright © 2008 André Tupinambá <andrelrt@gmail.com>
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+
+/*
+ * By default, just evaluate the image at 32bpp and expand.  Individual image
+ * types can plug in a better scanline getter if they want to. For example
+ * we could produce smoother gradients by evaluating them at higher color
+ * depth, but that's a project for the future.
+ */
+static void
+_pixman_image_get_scanline_generic_64 (pixman_image_t * image,
+                                       int              x,
+                                       int              y,
+                                       int              width,
+                                       uint32_t *       buffer,
+                                       const uint32_t * mask)
+{
+    uint32_t *mask8 = NULL;
+
+    /* Contract the mask image, if one exists, so that the 32-bit fetch
+     * function can use it.
+     */
+    if (mask)
+    {
+	mask8 = pixman_malloc_ab (width, sizeof(uint32_t));
+	if (!mask8)
+	    return;
+
+	pixman_contract (mask8, (uint64_t *)mask, width);
+    }
+
+    /* Fetch the source image into the first half of buffer. */
+    image->bits.get_scanline_32 (image, x, y, width, (uint32_t*)buffer, mask8);
+
+    /* Expand from 32bpp to 64bpp in place. */
+    pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, width);
+
+    free (mask8);
+}
+
+/* Fetch functions */
+
+static force_inline uint32_t
+fetch_pixel_no_alpha (bits_image_t *image,
+		      int x, int y, pixman_bool_t check_bounds)
+{
+    if (check_bounds &&
+	(x < 0 || x >= image->width || y < 0 || y >= image->height))
+    {
+	return 0;
+    }
+
+    return image->fetch_pixel_32 (image, x, y);
+}
+
+typedef uint32_t (* get_pixel_t) (bits_image_t *image,
+				  int x, int y, pixman_bool_t check_bounds);
+
+static force_inline void
+repeat (pixman_repeat_t repeat, int size, int *coord)
+{
+    switch (repeat)
+    {
+    case PIXMAN_REPEAT_NORMAL:
+	*coord = MOD (*coord, size);
+	break;
+
+    case PIXMAN_REPEAT_PAD:
+	*coord = CLIP (*coord, 0, size - 1);
+	break;
+
+    case PIXMAN_REPEAT_REFLECT:
+	*coord = MOD (*coord, size * 2);
+
+	if (*coord >= size)
+	    *coord = size * 2 - *coord - 1;
+	break;
+
+    case PIXMAN_REPEAT_NONE:
+	break;
+
+    default:
+        break;
+    }
+}
+
+static force_inline uint32_t
+bits_image_fetch_pixel_nearest (bits_image_t   *image,
+				pixman_fixed_t  x,
+				pixman_fixed_t  y,
+				get_pixel_t	get_pixel)
+{
+    int x0 = pixman_fixed_to_int (x - pixman_fixed_e);
+    int y0 = pixman_fixed_to_int (y - pixman_fixed_e);
+
+    if (image->common.repeat != PIXMAN_REPEAT_NONE)
+    {
+	repeat (image->common.repeat, image->width, &x0);
+	repeat (image->common.repeat, image->height, &y0);
+
+	return get_pixel (image, x0, y0, FALSE);
+    }
+    else
+    {
+	return get_pixel (image, x0, y0, TRUE);
+    }
+}
+
+#if SIZEOF_LONG > 4
+
+static force_inline uint32_t
+bilinear_interpolation (uint32_t tl, uint32_t tr,
+			uint32_t bl, uint32_t br,
+			int distx, int disty)
+{
+    uint64_t distxy, distxiy, distixy, distixiy;
+    uint64_t tl64, tr64, bl64, br64;
+    uint64_t f, r;
+
+    distxy = distx * disty;
+    distxiy = distx * (256 - disty);
+    distixy = (256 - distx) * disty;
+    distixiy = (256 - distx) * (256 - disty);
+
+    /* Alpha and Blue */
+    tl64 = tl & 0xff0000ff;
+    tr64 = tr & 0xff0000ff;
+    bl64 = bl & 0xff0000ff;
+    br64 = br & 0xff0000ff;
+
+    f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
+    r = f & 0x0000ff0000ff0000ull;
+
+    /* Red and Green */
+    tl64 = tl;
+    tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull);
+
+    tr64 = tr;
+    tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull);
+
+    bl64 = bl;
+    bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull);
+
+    br64 = br;
+    br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull);
+
+    f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
+    r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull);
+
+    return (uint32_t)(r >> 16);
+}
+
+#else
+
+static force_inline uint32_t
+bilinear_interpolation (uint32_t tl, uint32_t tr,
+			uint32_t bl, uint32_t br,
+			int distx, int disty)
+{
+    int distxy, distxiy, distixy, distixiy;
+    uint32_t f, r;
+
+    distxy = distx * disty;
+    distxiy = (distx << 8) - distxy;	/* distx * (256 - disty) */
+    distixy = (disty << 8) - distxy;	/* disty * (256 - distx) */
+    distixiy =
+	256 * 256 - (disty << 8) -
+	(distx << 8) + distxy;		/* (256 - distx) * (256 - disty) */
+
+    /* Blue */
+    r = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
+      + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy;
+
+    /* Green */
+    f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
+      + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy;
+    r |= f & 0xff000000;
+
+    tl >>= 16;
+    tr >>= 16;
+    bl >>= 16;
+    br >>= 16;
+    r >>= 16;
+
+    /* Red */
+    f = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
+      + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy;
+    r |= f & 0x00ff0000;
+
+    /* Alpha */
+    f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
+      + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy;
+    r |= f & 0xff000000;
+
+    return r;
+}
+
+#endif
+
+static force_inline uint32_t
+bits_image_fetch_pixel_bilinear (bits_image_t   *image,
+				 pixman_fixed_t  x,
+				 pixman_fixed_t  y,
+				 get_pixel_t	 get_pixel)
+{
+    pixman_repeat_t repeat_mode = image->common.repeat;
+    int width = image->width;
+    int height = image->height;
+    int x1, y1, x2, y2;
+    uint32_t tl, tr, bl, br;
+    int32_t distx, disty;
+
+    x1 = x - pixman_fixed_1 / 2;
+    y1 = y - pixman_fixed_1 / 2;
+
+    distx = (x1 >> 8) & 0xff;
+    disty = (y1 >> 8) & 0xff;
+
+    x1 = pixman_fixed_to_int (x1);
+    y1 = pixman_fixed_to_int (y1);
+    x2 = x1 + 1;
+    y2 = y1 + 1;
+
+    if (repeat_mode != PIXMAN_REPEAT_NONE)
+    {
+	repeat (repeat_mode, width, &x1);
+	repeat (repeat_mode, height, &y1);
+	repeat (repeat_mode, width, &x2);
+	repeat (repeat_mode, height, &y2);
+
+	tl = get_pixel (image, x1, y1, FALSE);
+	bl = get_pixel (image, x1, y2, FALSE);
+	tr = get_pixel (image, x2, y1, FALSE);
+	br = get_pixel (image, x2, y2, FALSE);
+    }
+    else
+    {
+	tl = get_pixel (image, x1, y1, TRUE);
+	tr = get_pixel (image, x2, y1, TRUE);
+	bl = get_pixel (image, x1, y2, TRUE);
+	br = get_pixel (image, x2, y2, TRUE);
+    }
+
+    return bilinear_interpolation (tl, tr, bl, br, distx, disty);
+}
+
+static void
+bits_image_fetch_bilinear_no_repeat_8888 (pixman_image_t * ima,
+					  int              offset,
+					  int              line,
+					  int              width,
+					  uint32_t *       buffer,
+					  const uint32_t * mask)
+{
+    bits_image_t *bits = &ima->bits;
+    pixman_fixed_t x_top, x_bottom, x;
+    pixman_fixed_t ux_top, ux_bottom, ux;
+    pixman_vector_t v;
+    uint32_t top_mask, bottom_mask;
+    uint32_t *top_row;
+    uint32_t *bottom_row;
+    uint32_t *end;
+    uint32_t zero[2] = { 0, 0 };
+    uint32_t one = 1;
+    int y, y1, y2;
+    int disty;
+    int mask_inc;
+    int w;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (bits->common.transform, &v))
+	return;
+
+    ux = ux_top = ux_bottom = bits->common.transform->matrix[0][0];
+    x = x_top = x_bottom = v.vector[0] - pixman_fixed_1/2;
+
+    y = v.vector[1] - pixman_fixed_1/2;
+    disty = (y >> 8) & 0xff;
+
+    /* Load the pointers to the first and second lines from the source
+     * image that bilinear code must read.
+     *
+     * The main trick in this code is the check for whether either line
+     * is outside of the image.
+     *
+     * When a line (either one) is outside, its pointer is redirected
+     * to a dummy area of zeros. Once redirected, the pointer must keep
+     * reading that dummy area, so the matching x position and per-pixel
+     * x increment are set to zero as well.
+     */
+    y1 = pixman_fixed_to_int (y);
+    y2 = y1 + 1;
+
+    if (y1 < 0 || y1 >= bits->height)
+    {
+	top_row = zero;
+	x_top = 0;
+	ux_top = 0;
+    }
+    else
+    {
+	top_row = bits->bits + y1 * bits->rowstride;
+	x_top = x;
+	ux_top = ux;
+    }
+
+    if (y2 < 0 || y2 >= bits->height)
+    {
+	bottom_row = zero;
+	x_bottom = 0;
+	ux_bottom = 0;
+    }
+    else
+    {
+	bottom_row = bits->bits + y2 * bits->rowstride;
+	x_bottom = x;
+	ux_bottom = ux;
+    }
+
+    /* Instead of checking whether the operation uses the mask in
+     * each loop iteration, verify this only once and prepare the
+     * variables to make the code smaller inside the loop.
+     */
+    if (!mask)
+    {
+        mask_inc = 0;
+        mask = &one;
+    }
+    else
+    {
+        /* If there is a mask, prepare the variables to check it */
+        mask_inc = 1;
+    }
+
+    /* If both are zero, then the whole thing is zero */
+    if (top_row == zero && bottom_row == zero)
+    {
+	memset (buffer, 0, width * sizeof (uint32_t));
+	return;
+    }
+    else if (bits->format == PIXMAN_x8r8g8b8)
+    {
+	if (top_row == zero)
+	{
+	    top_mask = 0;
+	    bottom_mask = 0xff000000;
+	}
+	else if (bottom_row == zero)
+	{
+	    top_mask = 0xff000000;
+	    bottom_mask = 0;
+	}
+	else
+	{
+	    top_mask = 0xff000000;
+	    bottom_mask = 0xff000000;
+	}
+    }
+    else
+    {
+	top_mask = 0;
+	bottom_mask = 0;
+    }
+
+    end = buffer + width;
+
+    /* Zero fill to the left of the image */
+    while (buffer < end && x < pixman_fixed_minus_1)
+    {
+	*buffer++ = 0;
+	x += ux;
+	x_top += ux_top;
+	x_bottom += ux_bottom;
+	mask += mask_inc;
+    }
+
+    /* Left edge
+     */
+    while (buffer < end && x < 0)
+    {
+	uint32_t tr, br;
+	int32_t distx;
+
+	tr = top_row[pixman_fixed_to_int (x_top) + 1] | top_mask;
+	br = bottom_row[pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
+
+	distx = (x >> 8) & 0xff;
+
+	*buffer++ = bilinear_interpolation (0, tr, 0, br, distx, disty);
+
+	x += ux;
+	x_top += ux_top;
+	x_bottom += ux_bottom;
+	mask += mask_inc;
+    }
+
+    /* Main part */
+    w = pixman_int_to_fixed (bits->width - 1);
+
+    while (buffer < end  &&  x < w)
+    {
+	if (*mask)
+	{
+	    uint32_t tl, tr, bl, br;
+	    int32_t distx;
+
+	    tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
+	    tr = top_row [pixman_fixed_to_int (x_top) + 1] | top_mask;
+	    bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
+	    br = bottom_row [pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
+
+	    distx = (x >> 8) & 0xff;
+
+	    *buffer = bilinear_interpolation (tl, tr, bl, br, distx, disty);
+	}
+
+	buffer++;
+	x += ux;
+	x_top += ux_top;
+	x_bottom += ux_bottom;
+	mask += mask_inc;
+    }
+
+    /* Right Edge */
+    w = pixman_int_to_fixed (bits->width);
+    while (buffer < end  &&  x < w)
+    {
+	if (*mask)
+	{
+	    uint32_t tl, bl;
+	    int32_t distx;
+
+	    tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
+	    bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
+
+	    distx = (x >> 8) & 0xff;
+
+	    *buffer = bilinear_interpolation (tl, 0, bl, 0, distx, disty);
+	}
+
+	buffer++;
+	x += ux;
+	x_top += ux_top;
+	x_bottom += ux_bottom;
+	mask += mask_inc;
+    }
+
+    /* Zero fill to the right of the image */
+    while (buffer < end)
+	*buffer++ = 0;
+}
+
+static force_inline uint32_t
+bits_image_fetch_pixel_convolution (bits_image_t   *image,
+				    pixman_fixed_t  x,
+				    pixman_fixed_t  y,
+				    get_pixel_t     get_pixel)
+{
+    pixman_fixed_t *params = image->common.filter_params;
+    int x_off = (params[0] - pixman_fixed_1) >> 1;
+    int y_off = (params[1] - pixman_fixed_1) >> 1;
+    int32_t cwidth = pixman_fixed_to_int (params[0]);
+    int32_t cheight = pixman_fixed_to_int (params[1]);
+    int32_t srtot, sgtot, sbtot, satot;
+    int32_t i, j, x1, x2, y1, y2;
+    pixman_repeat_t repeat_mode = image->common.repeat;
+    int width = image->width;
+    int height = image->height;
+
+    params += 2;
+
+    x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off);
+    y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off);
+    x2 = x1 + cwidth;
+    y2 = y1 + cheight;
+
+    srtot = sgtot = sbtot = satot = 0;
+
+    for (i = y1; i < y2; ++i)
+    {
+	for (j = x1; j < x2; ++j)
+	{
+	    int rx = j;
+	    int ry = i;
+
+	    pixman_fixed_t f = *params;
+
+	    if (f)
+	    {
+		uint32_t pixel;
+
+		if (repeat_mode != PIXMAN_REPEAT_NONE)
+		{
+		    repeat (repeat_mode, width, &rx);
+		    repeat (repeat_mode, height, &ry);
+
+		    pixel = get_pixel (image, rx, ry, FALSE);
+		}
+		else
+		{
+		    pixel = get_pixel (image, rx, ry, TRUE);
+		}
+
+		srtot += RED_8 (pixel) * f;
+		sgtot += GREEN_8 (pixel) * f;
+		sbtot += BLUE_8 (pixel) * f;
+		satot += ALPHA_8 (pixel) * f;
+	    }
+
+	    params++;
+	}
+    }
+
+    satot >>= 16;
+    srtot >>= 16;
+    sgtot >>= 16;
+    sbtot >>= 16;
+
+    satot = CLIP (satot, 0, 0xff);
+    srtot = CLIP (srtot, 0, 0xff);
+    sgtot = CLIP (sgtot, 0, 0xff);
+    sbtot = CLIP (sbtot, 0, 0xff);
+
+    return ((satot << 24) | (srtot << 16) | (sgtot <<  8) | (sbtot));
+}
+
+static force_inline uint32_t
+bits_image_fetch_pixel_filtered (bits_image_t *image,
+				 pixman_fixed_t x,
+				 pixman_fixed_t y,
+				 get_pixel_t    get_pixel)
+{
+    switch (image->common.filter)
+    {
+    case PIXMAN_FILTER_NEAREST:
+    case PIXMAN_FILTER_FAST:
+	return bits_image_fetch_pixel_nearest (image, x, y, get_pixel);
+	break;
+
+    case PIXMAN_FILTER_BILINEAR:
+    case PIXMAN_FILTER_GOOD:
+    case PIXMAN_FILTER_BEST:
+	return bits_image_fetch_pixel_bilinear (image, x, y, get_pixel);
+	break;
+
+    case PIXMAN_FILTER_CONVOLUTION:
+	return bits_image_fetch_pixel_convolution (image, x, y, get_pixel);
+	break;
+
+    default:
+        break;
+    }
+
+    return 0;
+}
+
+static void
+bits_image_fetch_affine_no_alpha (pixman_image_t * image,
+				  int              offset,
+				  int              line,
+				  int              width,
+				  uint32_t *       buffer,
+				  const uint32_t * mask)
+{
+    pixman_fixed_t x, y;
+    pixman_fixed_t ux, uy;
+    pixman_vector_t v;
+    int i;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (image->common.transform)
+    {
+	if (!pixman_transform_point_3d (image->common.transform, &v))
+	    return;
+
+	ux = image->common.transform->matrix[0][0];
+	uy = image->common.transform->matrix[1][0];
+    }
+    else
+    {
+	ux = pixman_fixed_1;
+	uy = 0;
+    }
+
+    x = v.vector[0];
+    y = v.vector[1];
+
+    for (i = 0; i < width; ++i)
+    {
+	if (!mask || mask[i])
+	{
+	    buffer[i] = bits_image_fetch_pixel_filtered (
+		&image->bits, x, y, fetch_pixel_no_alpha);
+	}
+
+	x += ux;
+	y += uy;
+    }
+}
+
+/* General fetcher */
+static force_inline uint32_t
+fetch_pixel_general (bits_image_t *image, int x, int y, pixman_bool_t check_bounds)
+{
+    uint32_t pixel;
+
+    if (check_bounds &&
+	(x < 0 || x >= image->width || y < 0 || y >= image->height))
+    {
+	return 0;
+    }
+
+    pixel = image->fetch_pixel_32 (image, x, y);
+
+    if (image->common.alpha_map)
+    {
+	uint32_t pixel_a;
+
+	x -= image->common.alpha_origin_x;
+	y -= image->common.alpha_origin_y;
+
+	if (x < 0 || x >= image->common.alpha_map->width ||
+	    y < 0 || y >= image->common.alpha_map->height)
+	{
+	    pixel_a = 0;
+	}
+	else
+	{
+	    pixel_a = image->common.alpha_map->fetch_pixel_32 (
+		image->common.alpha_map, x, y);
+
+	    pixel_a = ALPHA_8 (pixel_a);
+	}
+
+	pixel &= 0x00ffffff;
+	pixel |= (pixel_a << 24);
+    }
+
+    return pixel;
+}
+
+static void
+bits_image_fetch_general (pixman_image_t * image,
+			  int              offset,
+			  int              line,
+			  int              width,
+			  uint32_t *       buffer,
+			  const uint32_t * mask)
+{
+    pixman_fixed_t x, y, w;
+    pixman_fixed_t ux, uy, uw;
+    pixman_vector_t v;
+    int i;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (image->common.transform)
+    {
+	if (!pixman_transform_point_3d (image->common.transform, &v))
+	    return;
+
+	ux = image->common.transform->matrix[0][0];
+	uy = image->common.transform->matrix[1][0];
+	uw = image->common.transform->matrix[2][0];
+    }
+    else
+    {
+	ux = pixman_fixed_1;
+	uy = 0;
+	uw = 0;
+    }
+
+    x = v.vector[0];
+    y = v.vector[1];
+    w = v.vector[2];
+
+    for (i = 0; i < width; ++i)
+    {
+	pixman_fixed_t x0, y0;
+
+	if (!mask || mask[i])
+	{
+	    if (w != 0)
+	    {
+		x0 = ((pixman_fixed_48_16_t)x << 16) / w;
+		y0 = ((pixman_fixed_48_16_t)y << 16) / w;
+	    }
+	    else
+	    {
+		x0 = 0;
+		y0 = 0;
+	    }
+
+	    buffer[i] = bits_image_fetch_pixel_filtered (
+		&image->bits, x0, y0, fetch_pixel_general);
+	}
+
+	x += ux;
+	y += uy;
+	w += uw;
+    }
+}
+
+static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+typedef uint32_t (* convert_pixel_t) (const uint8_t *row, int x);
+
+/* Fetch one scanline of bilinearly filtered pixels through the image's
+ * transform, stepping the source position by the first matrix column
+ * (matrix[0][0], matrix[1][0]) for every destination pixel.
+ *
+ * offset, line:  destination coordinates of the first pixel
+ * width:         number of pixels to produce
+ * buffer:        receives the result, one uint32_t per pixel
+ * mask:          optional; pixels whose mask entry is 0 are skipped and
+ *                their buffer entries are left untouched
+ * convert_pixel: reads pixel x of a row and returns it as a uint32_t
+ * format:        source format; when it has no alpha bits, 0xff000000
+ *                is ORed into every sample taken from the image
+ * repeat_mode:   how coordinates outside the image are handled
+ *
+ * Nothing is written if transforming the start point fails.
+ */
+static force_inline void
+bits_image_fetch_bilinear_affine (pixman_image_t * image,
+				  int              offset,
+				  int              line,
+				  int              width,
+				  uint32_t *       buffer,
+				  const uint32_t * mask,
+
+				  convert_pixel_t	convert_pixel,
+				  pixman_format_code_t	format,
+				  pixman_repeat_t	repeat_mode)
+{
+    pixman_fixed_t x, y;
+    pixman_fixed_t ux, uy;
+    pixman_vector_t v;
+    bits_image_t *bits = &image->bits;
+    int i;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (image->common.transform, &v))
+	return;
+
+    /* Per-pixel step in source space */
+    ux = image->common.transform->matrix[0][0];
+    uy = image->common.transform->matrix[1][0];
+
+    x = v.vector[0];
+    y = v.vector[1];
+
+    for (i = 0; i < width; ++i)
+    {
+	int x1, y1, x2, y2;
+	uint32_t tl, tr, bl, br;
+	int32_t distx, disty;
+	/* Note: these two shadow the 'width' parameter inside the loop
+	 * body only; the loop condition above still tests the parameter.
+	 */
+	int width = image->bits.width;
+	int height = image->bits.height;
+	const uint8_t *row1;
+	const uint8_t *row2;
+
+	if (mask && !mask[i])
+	    goto next;
+
+	/* Shift by half a pixel so that (x1, y1) is the top-left sample */
+	x1 = x - pixman_fixed_1 / 2;
+	y1 = y - pixman_fixed_1 / 2;
+
+	/* Bits 8..15 of the fixed-point position are the blend weights */
+	distx = (x1 >> 8) & 0xff;
+	disty = (y1 >> 8) & 0xff;
+
+	y1 = pixman_fixed_to_int (y1);
+	y2 = y1 + 1;
+	x1 = pixman_fixed_to_int (x1);
+	x2 = x1 + 1;
+
+	if (repeat_mode != PIXMAN_REPEAT_NONE)
+	{
+	    /* Shadows the 'mask' parameter: here it is the alpha bits
+	     * ORed into each sample.
+	     */
+	    uint32_t mask;
+
+	    mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+
+	    /* Map all four coordinates back into the image */
+	    repeat (repeat_mode, width, &x1);
+	    repeat (repeat_mode, height, &y1);
+	    repeat (repeat_mode, width, &x2);
+	    repeat (repeat_mode, height, &y2);
+
+	    row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
+	    row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
+
+	    tl = convert_pixel (row1, x1) | mask;
+	    tr = convert_pixel (row1, x2) | mask;
+	    bl = convert_pixel (row2, x1) | mask;
+	    br = convert_pixel (row2, x2) | mask;
+	}
+	else
+	{
+	    uint32_t mask1, mask2;
+	    int bpp;
+
+	    /* Note: PIXMAN_FORMAT_BPP() returns an unsigned value,
+	     * which means if you use it in expressions, those
+	     * expressions become unsigned themselves. Since
+	     * the variables below can be negative in some cases,
+	     * that will lead to crashes on 64 bit architectures.
+	     *
+	     * So this line makes sure bpp is signed
+	     */
+	    bpp = PIXMAN_FORMAT_BPP (format);
+
+	    /* All four samples are outside the image: the result is 0 */
+	    if (x1 >= width || x2 < 0 || y1 >= height || y2 < 0)
+	    {
+		buffer[i] = 0;
+		goto next;
+	    }
+
+	    /* y2 == 0 means the top row (y1 == -1) is above the image */
+	    if (y2 == 0)
+	    {
+		row1 = zero;
+		mask1 = 0;
+	    }
+	    else
+	    {
+		/* Pre-offset the row to x1 so the samples are at 0 and 1 */
+		row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
+		row1 += bpp / 8 * x1;
+
+		mask1 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+	    }
+
+	    /* y1 on the last row means the bottom row is below the image */
+	    if (y1 == height - 1)
+	    {
+		row2 = zero;
+		mask2 = 0;
+	    }
+	    else
+	    {
+		row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
+		row2 += bpp / 8 * x1;
+
+		mask2 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+	    }
+
+	    /* x2 == 0 means the left column (x1 == -1) is outside */
+	    if (x2 == 0)
+	    {
+		tl = 0;
+		bl = 0;
+	    }
+	    else
+	    {
+		tl = convert_pixel (row1, 0) | mask1;
+		bl = convert_pixel (row2, 0) | mask2;
+	    }
+
+	    /* x1 on the last column means the right column is outside */
+	    if (x1 == width - 1)
+	    {
+		tr = 0;
+		br = 0;
+	    }
+	    else
+	    {
+		tr = convert_pixel (row1, 1) | mask1;
+		br = convert_pixel (row2, 1) | mask2;
+	    }
+	}
+
+	buffer[i] = bilinear_interpolation (
+	    tl, tr, bl, br, distx, disty);
+
+    next:
+	/* Advance the source position even for skipped pixels */
+	x += ux;
+	y += uy;
+    }
+}
+
+/* Fetch one scanline of nearest-neighbour samples through the image's
+ * transform.  The source position starts at the transformed centre of
+ * pixel (offset, line) and advances by the first matrix column for each
+ * destination pixel.  Pixels whose mask entry is 0 are left untouched;
+ * nothing is written if transforming the start point fails.
+ */
+static force_inline void
+bits_image_fetch_nearest_affine (pixman_image_t * image,
+				 int              offset,
+				 int              line,
+				 int              width,
+				 uint32_t *       buffer,
+				 const uint32_t * mask,
+
+				 convert_pixel_t	convert_pixel,
+				 pixman_format_code_t	format,
+				 pixman_repeat_t	repeat_mode)
+{
+    bits_image_t *src = &image->bits;
+    /* Alpha bits forced on when the format carries no alpha channel */
+    const uint32_t opaque_bits = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+    pixman_vector_t start;
+    pixman_fixed_t sx, sy;
+    pixman_fixed_t step_x, step_y;
+    int src_w, src_h;
+    int n;
+
+    /* reference point is the center of the pixel */
+    start.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    start.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    start.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (image->common.transform, &start))
+	return;
+
+    step_x = image->common.transform->matrix[0][0];
+    step_y = image->common.transform->matrix[1][0];
+    sx = start.vector[0];
+    sy = start.vector[1];
+
+    src_w = image->bits.width;
+    src_h = image->bits.height;
+
+    /* The increment clause also runs for pixels skipped via 'continue' */
+    for (n = 0; n < width; ++n, sx += step_x, sy += step_y)
+    {
+	const uint8_t *scanline;
+	int px, py;
+
+	if (mask && !mask[n])
+	    continue;
+
+	px = pixman_fixed_to_int (sx - pixman_fixed_e);
+	py = pixman_fixed_to_int (sy - pixman_fixed_e);
+
+	if (repeat_mode == PIXMAN_REPEAT_NONE)
+	{
+	    /* Without repeat, anything outside the image reads as 0 */
+	    if (py < 0 || py >= src_h || px < 0 || px >= src_w)
+	    {
+		buffer[n] = 0;
+		continue;
+	    }
+	}
+	else
+	{
+	    repeat (repeat_mode, src_w, &px);
+	    repeat (repeat_mode, src_h, &py);
+	}
+
+	scanline = (uint8_t *)src->bits + src->rowstride * 4 * py;
+
+	buffer[n] = convert_pixel (scanline, px) | opaque_bits;
+    }
+}
+
+/* Return the 32-bit pixel at index x of an a8r8g8b8 scanline unchanged. */
+static force_inline uint32_t
+convert_a8r8g8b8 (const uint8_t *row, int x)
+{
+    const uint32_t *pixels = (const uint32_t *)row;
+
+    return pixels[x];
+}
+
+/* Return the 32-bit pixel at index x of an x8r8g8b8 scanline unchanged;
+ * the callers OR in the alpha bits themselves.
+ */
+static force_inline uint32_t
+convert_x8r8g8b8 (const uint8_t *row, int x)
+{
+    const uint32_t *pixels = (const uint32_t *)row;
+
+    return pixels[x];
+}
+
+/* Return the byte at index x of an a8 scanline moved into bits 24..31
+ * of the result, with all lower bits zero.
+ */
+static force_inline uint32_t
+convert_a8 (const uint8_t *row, int x)
+{
+    /* Widen before shifting: a uint8_t promotes to signed int, and
+     * shifting a value >= 0x80 left by 24 would overflow it.
+     */
+    return (uint32_t)row[x] << 24;
+}
+
+/* Read the 16-bit pixel at index x of an r5g6b5 scanline and expand it
+ * with CONVERT_0565_TO_0888.
+ */
+static force_inline uint32_t
+convert_r5g6b5 (const uint8_t *row, int x)
+{
+    const uint16_t *pixels = (const uint16_t *)row;
+    uint16_t packed = pixels[x];
+
+    return CONVERT_0565_TO_0888 (packed);
+}
+
+/* Define bits_image_fetch_bilinear_affine_<name>: a scanline fetcher
+ * with the plain six-argument signature that forwards to
+ * bits_image_fetch_bilinear_affine() with convert_<format>,
+ * PIXMAN_<format> and repeat_mode passed as constants.
+ */
+#define MAKE_BILINEAR_FETCHER(name, format, repeat_mode)		\
+    static void								\
+    bits_image_fetch_bilinear_affine_ ## name (pixman_image_t *image,	\
+					       int              offset,	\
+					       int              line,	\
+					       int              width,	\
+					       uint32_t *       buffer,	\
+					       const uint32_t * mask)	\
+    {									\
+	bits_image_fetch_bilinear_affine (image, offset, line,		\
+					  width, buffer, mask,		\
+					  convert_ ## format,		\
+					  PIXMAN_ ## format,		\
+					  repeat_mode);			\
+    }
+
+/* Same as MAKE_BILINEAR_FETCHER, but defines
+ * bits_image_fetch_nearest_affine_<name> on top of
+ * bits_image_fetch_nearest_affine().
+ */
+#define MAKE_NEAREST_FETCHER(name, format, repeat_mode)			\
+    static void								\
+    bits_image_fetch_nearest_affine_ ## name (pixman_image_t *image,	\
+					      int              offset,	\
+					      int              line,	\
+					      int              width,	\
+					      uint32_t *       buffer,	\
+					      const uint32_t * mask)	\
+    {									\
+	bits_image_fetch_nearest_affine (image, offset, line,		\
+					 width, buffer, mask,		\
+					 convert_ ## format,		\
+					 PIXMAN_ ## format,		\
+					 repeat_mode);			\
+    }
+
+/* Define both the nearest and the bilinear fetcher for one combination */
+#define MAKE_FETCHERS(name, format, repeat_mode)			\
+    MAKE_NEAREST_FETCHER (name, format, repeat_mode)			\
+    MAKE_BILINEAR_FETCHER (name, format, repeat_mode)
+
+/* One pair of fetchers per (format, repeat mode) combination; the names
+ * generated here are referenced from the fetcher_info table.
+ */
+MAKE_FETCHERS (pad_a8r8g8b8,     a8r8g8b8, PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_a8r8g8b8,    a8r8g8b8, PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_a8r8g8b8,  a8r8g8b8, PIXMAN_REPEAT_NORMAL)
+MAKE_FETCHERS (pad_x8r8g8b8,     x8r8g8b8, PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_x8r8g8b8,    x8r8g8b8, PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_x8r8g8b8,  x8r8g8b8, PIXMAN_REPEAT_NORMAL)
+MAKE_FETCHERS (pad_a8,           a8,       PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_a8,          a8,       PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_a8,	 a8,       PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_a8,	 a8,       PIXMAN_REPEAT_NORMAL)
+MAKE_FETCHERS (pad_r5g6b5,       r5g6b5,   PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_r5g6b5,      r5g6b5,   PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_r5g6b5,   r5g6b5,   PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_r5g6b5,    r5g6b5,   PIXMAN_REPEAT_NORMAL)
+
+/* Fill 'width' entries of buffer with the 32-bit value of the image's
+ * pixel (0, 0).  x, y and mask are not used.
+ */
+static void
+bits_image_fetch_solid_32 (pixman_image_t * image,
+                           int              x,
+                           int              y,
+                           int              width,
+                           uint32_t *       buffer,
+                           const uint32_t * mask)
+{
+    const uint32_t fill = image->bits.fetch_pixel_32 (&image->bits, 0, 0);
+    int i;
+
+    for (i = 0; i < width; ++i)
+	buffer[i] = fill;
+}
+
+/* Fill 'width' 64-bit entries of b with the 64-bit value of the image's
+ * pixel (0, 0).  b is treated as an array of uint64_t; x, y and the last
+ * argument are not used.
+ */
+static void
+bits_image_fetch_solid_64 (pixman_image_t * image,
+                           int              x,
+                           int              y,
+                           int              width,
+                           uint32_t *       b,
+                           const uint32_t * unused)
+{
+    const uint64_t fill = image->bits.fetch_pixel_64 (&image->bits, 0, 0);
+    uint64_t *out = (uint64_t *)b;
+    int i;
+
+    for (i = 0; i < width; ++i)
+	out[i] = fill;
+}
+
+/* Fetch 'width' pixels of row y starting at column x without any
+ * repeat: the part of the span that overlaps the image is read with the
+ * raw scanline fetcher, everything else is zero-filled.  'wide' selects
+ * 64-bit pixels (two uint32_t slots each) instead of 32-bit ones.
+ */
+static void
+bits_image_fetch_untransformed_repeat_none (bits_image_t *image,
+                                            pixman_bool_t wide,
+                                            int           x,
+                                            int           y,
+                                            int           width,
+                                            uint32_t *    buffer)
+{
+    const int px_bytes = wide ? 8 : 4;	/* bytes per pixel in buffer */
+    const int px_slots = wide ? 2 : 1;	/* uint32_t slots per pixel  */
+    uint32_t run;
+
+    /* Row entirely outside the image */
+    if (y < 0 || y >= image->height)
+    {
+	memset (buffer, 0, width * px_bytes);
+	return;
+    }
+
+    /* Leading part to the left of the image */
+    if (x < 0)
+    {
+	run = MIN (width, -x);
+
+	memset (buffer, 0, run * px_bytes);
+
+	buffer += run * px_slots;
+	width -= run;
+	x += run;
+    }
+
+    /* Part that overlaps the image */
+    if (x < image->width)
+    {
+	run = MIN (width, image->width - x);
+
+	if (wide)
+	    image->fetch_scanline_64 ((pixman_image_t *)image, x, y, run, buffer, NULL);
+	else
+	    image->fetch_scanline_32 ((pixman_image_t *)image, x, y, run, buffer, NULL);
+
+	buffer += run * px_slots;
+	width -= run;
+	x += run;
+    }
+
+    /* Whatever remains lies to the right of the image */
+    memset (buffer, 0, width * px_bytes);
+}
+
+/* Fetch 'width' pixels of row y starting at column x, treating the
+ * image as tiled in both directions: y and x are wrapped into the image
+ * and the span is read tile by tile with the raw scanline fetcher.
+ * 'wide' selects 64-bit pixels (two uint32_t slots each).
+ */
+static void
+bits_image_fetch_untransformed_repeat_normal (bits_image_t *image,
+                                              pixman_bool_t wide,
+                                              int           x,
+                                              int           y,
+                                              int           width,
+                                              uint32_t *    buffer)
+{
+    const int px_slots = wide ? 2 : 1;
+    uint32_t run;
+
+    /* Wrap the row into [0, height) */
+    while (y < 0)
+	y += image->height;
+    while (y >= image->height)
+	y -= image->height;
+
+    while (width)
+    {
+	/* Wrap the column into [0, width of image) */
+	while (x < 0)
+	    x += image->width;
+	while (x >= image->width)
+	    x -= image->width;
+
+	/* Read up to the right edge of the current tile */
+	run = MIN (width, image->width - x);
+
+	if (wide)
+	    image->fetch_scanline_64 ((pixman_image_t *)image, x, y, run, buffer, NULL);
+	else
+	    image->fetch_scanline_32 ((pixman_image_t *)image, x, y, run, buffer, NULL);
+
+	width -= run;
+	x += run;
+	buffer += run * px_slots;
+    }
+}
+
+/* 32-bit untransformed scanline fetch: uses the zero-filling helper for
+ * PIXMAN_REPEAT_NONE and the tiling helper for every other repeat mode.
+ * mask is not used.
+ */
+static void
+bits_image_fetch_untransformed_32 (pixman_image_t * image,
+                                   int              x,
+                                   int              y,
+                                   int              width,
+                                   uint32_t *       buffer,
+                                   const uint32_t * mask)
+{
+    bits_image_t *src = &image->bits;
+
+    if (image->common.repeat != PIXMAN_REPEAT_NONE)
+    {
+	bits_image_fetch_untransformed_repeat_normal (
+	    src, FALSE, x, y, width, buffer);
+	return;
+    }
+
+    bits_image_fetch_untransformed_repeat_none (
+	src, FALSE, x, y, width, buffer);
+}
+
+/* 64-bit untransformed scanline fetch: uses the zero-filling helper for
+ * PIXMAN_REPEAT_NONE and the tiling helper for every other repeat mode.
+ * The last argument is not used.
+ */
+static void
+bits_image_fetch_untransformed_64 (pixman_image_t * image,
+                                   int              x,
+                                   int              y,
+                                   int              width,
+                                   uint32_t *       buffer,
+                                   const uint32_t * unused)
+{
+    bits_image_t *src = &image->bits;
+
+    if (image->common.repeat != PIXMAN_REPEAT_NONE)
+    {
+	bits_image_fetch_untransformed_repeat_normal (
+	    src, TRUE, x, y, width, buffer);
+	return;
+    }
+
+    bits_image_fetch_untransformed_repeat_none (
+	src, TRUE, x, y, width, buffer);
+}
+
+/* One entry of the fetcher selection table: the scanline fetchers to
+ * install for an image whose format matches 'format' (or any format if
+ * it is PIXMAN_any) and whose flags include every bit set in 'flags'.
+ */
+typedef struct
+{
+    pixman_format_code_t	format;
+    uint32_t			flags;
+    /* Installed as get_scanline_32 / get_scanline_64 respectively */
+    fetch_scanline_t		fetch_32;
+    fetch_scanline_t		fetch_64;
+} fetcher_info_t;
+
+/* Fetcher selection table, scanned front to back by
+ * bits_image_property_changed(): the first entry that matches wins, so
+ * more specific entries must precede more general ones.  The final
+ * PIXMAN_null entry terminates the scan.
+ */
+static const fetcher_info_t fetcher_info[] =
+{
+    /* Solid images */
+    { PIXMAN_solid,
+      FAST_PATH_NO_ALPHA_MAP,
+      bits_image_fetch_solid_32,
+      bits_image_fetch_solid_64
+    },
+
+    /* Identity transform, no pad or reflect repeat */
+    { PIXMAN_any,
+      (FAST_PATH_NO_ALPHA_MAP			|
+       FAST_PATH_ID_TRANSFORM			|
+       FAST_PATH_NO_CONVOLUTION_FILTER		|
+       FAST_PATH_NO_PAD_REPEAT			|
+       FAST_PATH_NO_REFLECT_REPEAT),
+      bits_image_fetch_untransformed_32,
+      bits_image_fetch_untransformed_64
+    },
+
+/* Flags required by the specialised no-repeat bilinear 8888 fetcher */
+#define FAST_BILINEAR_FLAGS						\
+    (FAST_PATH_NO_ALPHA_MAP		|				\
+     FAST_PATH_NO_ACCESSORS		|				\
+     FAST_PATH_HAS_TRANSFORM		|				\
+     FAST_PATH_AFFINE_TRANSFORM		|				\
+     FAST_PATH_X_UNIT_POSITIVE		|				\
+     FAST_PATH_Y_UNIT_ZERO		|				\
+     FAST_PATH_NONE_REPEAT		|				\
+     FAST_PATH_BILINEAR_FILTER)
+
+    { PIXMAN_a8r8g8b8,
+      FAST_BILINEAR_FLAGS,
+      bits_image_fetch_bilinear_no_repeat_8888,
+      _pixman_image_get_scanline_generic_64
+    },
+
+    { PIXMAN_x8r8g8b8,
+      FAST_BILINEAR_FLAGS,
+      bits_image_fetch_bilinear_no_repeat_8888,
+      _pixman_image_get_scanline_generic_64
+    },
+
+/* Flags shared by all macro-generated affine bilinear fetchers */
+#define GENERAL_BILINEAR_FLAGS						\
+    (FAST_PATH_NO_ALPHA_MAP		|				\
+     FAST_PATH_NO_ACCESSORS		|				\
+     FAST_PATH_HAS_TRANSFORM		|				\
+     FAST_PATH_AFFINE_TRANSFORM		|				\
+     FAST_PATH_BILINEAR_FILTER)
+
+/* Flags shared by all macro-generated affine nearest fetchers */
+#define GENERAL_NEAREST_FLAGS						\
+    (FAST_PATH_NO_ALPHA_MAP		|				\
+     FAST_PATH_NO_ACCESSORS		|				\
+     FAST_PATH_HAS_TRANSFORM		|				\
+     FAST_PATH_AFFINE_TRANSFORM		|				\
+     FAST_PATH_NEAREST_FILTER)
+
+/* Table entry for bits_image_fetch_bilinear_affine_<name> */
+#define BILINEAR_AFFINE_FAST_PATH(name, format, repeat)			\
+    { PIXMAN_ ## format,						\
+      GENERAL_BILINEAR_FLAGS | FAST_PATH_ ## repeat ## _REPEAT,		\
+      bits_image_fetch_bilinear_affine_ ## name,			\
+      _pixman_image_get_scanline_generic_64				\
+    },
+
+/* Table entry for bits_image_fetch_nearest_affine_<name> */
+#define NEAREST_AFFINE_FAST_PATH(name, format, repeat)			\
+    { PIXMAN_ ## format,						\
+      GENERAL_NEAREST_FLAGS | FAST_PATH_ ## repeat ## _REPEAT,		\
+      bits_image_fetch_nearest_affine_ ## name,			\
+      _pixman_image_get_scanline_generic_64				\
+    },
+
+/* Both entries (bilinear first, then nearest) for one combination */
+#define AFFINE_FAST_PATHS(name, format, repeat)				\
+    BILINEAR_AFFINE_FAST_PATH(name, format, repeat)			\
+    NEAREST_AFFINE_FAST_PATH(name, format, repeat)
+    
+    AFFINE_FAST_PATHS (pad_a8r8g8b8, a8r8g8b8, PAD)
+    AFFINE_FAST_PATHS (none_a8r8g8b8, a8r8g8b8, NONE)
+    AFFINE_FAST_PATHS (reflect_a8r8g8b8, a8r8g8b8, REFLECT)
+    AFFINE_FAST_PATHS (normal_a8r8g8b8, a8r8g8b8, NORMAL)
+    AFFINE_FAST_PATHS (pad_x8r8g8b8, x8r8g8b8, PAD)
+    AFFINE_FAST_PATHS (none_x8r8g8b8, x8r8g8b8, NONE)
+    AFFINE_FAST_PATHS (reflect_x8r8g8b8, x8r8g8b8, REFLECT)
+    AFFINE_FAST_PATHS (normal_x8r8g8b8, x8r8g8b8, NORMAL)
+    AFFINE_FAST_PATHS (pad_a8, a8, PAD)
+    AFFINE_FAST_PATHS (none_a8, a8, NONE)
+    AFFINE_FAST_PATHS (reflect_a8, a8, REFLECT)
+    AFFINE_FAST_PATHS (normal_a8, a8, NORMAL)
+    AFFINE_FAST_PATHS (pad_r5g6b5, r5g6b5, PAD)
+    AFFINE_FAST_PATHS (none_r5g6b5, r5g6b5, NONE)
+    AFFINE_FAST_PATHS (reflect_r5g6b5, r5g6b5, REFLECT)
+    AFFINE_FAST_PATHS (normal_r5g6b5, r5g6b5, NORMAL)
+
+    /* Affine, no alpha */
+    { PIXMAN_any,
+      (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_HAS_TRANSFORM | FAST_PATH_AFFINE_TRANSFORM),
+      bits_image_fetch_affine_no_alpha,
+      _pixman_image_get_scanline_generic_64
+    },
+
+    /* General: requires no flags, so it matches every image */
+    { PIXMAN_any, 0, bits_image_fetch_general, _pixman_image_get_scanline_generic_64 },
+
+    /* Terminator */
+    { PIXMAN_null },
+};
+
+/* Re-select the image's scanline fetchers: refresh the accessors, then
+ * install the fetchers of the first fetcher_info entry whose format
+ * matches (or is PIXMAN_any) and whose flags are all set on the image.
+ */
+static void
+bits_image_property_changed (pixman_image_t *image)
+{
+    const uint32_t image_flags = image->common.flags;
+    const pixman_format_code_t image_format =
+	image->common.extended_format_code;
+    const fetcher_info_t *entry;
+
+    _pixman_bits_image_setup_accessors (&image->bits);
+
+    for (entry = fetcher_info; entry->format != PIXMAN_null; ++entry)
+    {
+	if (entry->format != image_format && entry->format != PIXMAN_any)
+	    continue;
+
+	/* Every flag the entry requires must be present */
+	if ((entry->flags & image_flags) != entry->flags)
+	    continue;
+
+	image->bits.get_scanline_32 = entry->fetch_32;
+	image->bits.get_scanline_64 = entry->fetch_64;
+	break;
+    }
+}
+
+/* Source iterator, 32-bit path: fetch the current row into iter->buffer
+ * with the image's get_scanline_32, advance iter->y, return the buffer.
+ */
+static uint32_t *
+src_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
+{
+    pixman_image_t *image = iter->image;
+    int row = iter->y;
+
+    /* Advance before the call, as the post-increment argument did */
+    iter->y = row + 1;
+
+    image->bits.get_scanline_32 (
+	image, iter->x, row, iter->width, iter->buffer, mask);
+
+    return iter->buffer;
+}
+
+/* Source iterator, 64-bit path: fetch the current row into iter->buffer
+ * with the image's get_scanline_64, advance iter->y, return the buffer.
+ */
+static uint32_t *
+src_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+    pixman_image_t *image = iter->image;
+    int row = iter->y;
+
+    /* Advance before the call, as the post-increment argument did */
+    iter->y = row + 1;
+
+    image->bits.get_scanline_64 (
+	image, iter->x, row, iter->width, iter->buffer, mask);
+
+    return iter->buffer;
+}
+
+/* Install the source scanline getter on iter: the 32-bit one when
+ * ITER_NARROW is set in iter->flags, the 64-bit one otherwise.
+ */
+void
+_pixman_bits_image_src_iter_init (pixman_image_t *image, pixman_iter_t *iter)
+{
+    if (!(iter->flags & ITER_NARROW))
+    {
+	iter->get_scanline = src_get_scanline_wide;
+	return;
+    }
+
+    iter->get_scanline = src_get_scanline_narrow;
+}
+
+/* Destination iterator, 32-bit path: fetch the current row of the
+ * destination into iter->buffer and return the buffer.
+ *
+ * If the image has an alpha map, the alpha bits (0xff000000) of every
+ * pixel are replaced with those of the corresponding alpha-map pixel,
+ * located at (x - alpha_origin_x, y - alpha_origin_y).  The alpha map is
+ * fetched into a scratch buffer so that the colour channels read from
+ * the image itself are kept; fetching it straight into iter->buffer
+ * would overwrite them.
+ *
+ * If the scratch buffer cannot be allocated, the alpha map is skipped
+ * and the image's own pixels are returned unchanged.
+ */
+static uint32_t *
+dest_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
+{
+    pixman_image_t *image  = iter->image;
+    int             x      = iter->x;
+    int             y      = iter->y;
+    int             width  = iter->width;
+    uint32_t *	    buffer = iter->buffer;
+
+    image->bits.fetch_scanline_32 (image, x, y, width, buffer, mask);
+    if (image->common.alpha_map && width > 0)
+    {
+	/* Zero-initialised so that entries a fetcher might leave alone
+	 * still hold a defined value.
+	 */
+	uint32_t *alpha = calloc (width, sizeof (uint32_t));
+
+	if (alpha)
+	{
+	    int i;
+
+	    x -= image->common.alpha_origin_x;
+	    y -= image->common.alpha_origin_y;
+
+	    image->common.alpha_map->fetch_scanline_32 (
+		(pixman_image_t *)image->common.alpha_map,
+		x, y, width, alpha, mask);
+
+	    /* Keep the image's colour bits, take alpha from the map */
+	    for (i = 0; i < width; ++i)
+	    {
+		buffer[i] &= ~0xff000000;
+		buffer[i] |= (alpha[i] & 0xff000000);
+	    }
+
+	    free (alpha);
+	}
+    }
+
+    return iter->buffer;
+}
+
+/* Destination iterator, 64-bit path: fetch the current row of the
+ * destination into iter->buffer (treated as uint64_t pixels) and return
+ * the buffer.
+ *
+ * If the image has an alpha map, the alpha bits of every pixel are
+ * replaced with those of the corresponding alpha-map pixel, located at
+ * (x - alpha_origin_x, y - alpha_origin_y).  The alpha map is fetched
+ * into a scratch buffer so that the colour channels read from the image
+ * itself are kept; fetching it straight into iter->buffer would
+ * overwrite them.
+ *
+ * NOTE(review): assumes the wide pixel layout keeps alpha in the top 16
+ * bits of each 64-bit pixel -- confirm against the wide fetchers.
+ *
+ * If the scratch buffer cannot be allocated, the alpha map is skipped
+ * and the image's own pixels are returned unchanged.
+ */
+static uint32_t *
+dest_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+    bits_image_t *  image  = &iter->image->bits;
+    int             x      = iter->x;
+    int             y      = iter->y;
+    int             width  = iter->width;
+    uint64_t *	    buffer = (uint64_t *)iter->buffer;
+
+    image->fetch_scanline_64 (
+	(pixman_image_t *)image, x, y, width, (uint32_t *)buffer, mask);
+    if (image->common.alpha_map && width > 0)
+    {
+	/* Zero-initialised so that entries a fetcher might leave alone
+	 * still hold a defined value.
+	 */
+	uint64_t *alpha = calloc (width, sizeof (uint64_t));
+
+	if (alpha)
+	{
+	    int i;
+
+	    x -= image->common.alpha_origin_x;
+	    y -= image->common.alpha_origin_y;
+
+	    image->common.alpha_map->fetch_scanline_64 (
+		(pixman_image_t *)image->common.alpha_map,
+		x, y, width, (uint32_t *)alpha, mask);
+
+	    /* Keep the image's colour bits, take alpha from the map */
+	    for (i = 0; i < width; ++i)
+	    {
+		buffer[i] &= ~0xffff000000000000ULL;
+		buffer[i] |= (alpha[i] & 0xffff000000000000ULL);
+	    }
+
+	    free (alpha);
+	}
+    }
+
+    return iter->buffer;
+}
+
+/* Destination iterator, 32-bit path: store iter->buffer into the current
+ * row of the image and, if present, into its alpha map at the position
+ * shifted by the alpha origin; then advance iter->y.
+ */
+static void
+dest_write_back_narrow (pixman_iter_t *iter)
+{
+    bits_image_t *dst = &iter->image->bits;
+    const uint32_t *pixels = iter->buffer;
+    const int n = iter->width;
+    int col = iter->x;
+    int row = iter->y;
+
+    dst->store_scanline_32 (dst, col, row, n, pixels);
+
+    if (dst->common.alpha_map)
+    {
+	col -= dst->common.alpha_origin_x;
+	row -= dst->common.alpha_origin_y;
+
+	dst->common.alpha_map->store_scanline_32 (
+	    dst->common.alpha_map, col, row, n, pixels);
+    }
+
+    iter->y++;
+}
+
+/* Destination iterator, 64-bit path: store iter->buffer into the current
+ * row of the image and, if present, into its alpha map at the position
+ * shifted by the alpha origin; then advance iter->y.
+ */
+static void
+dest_write_back_wide (pixman_iter_t *iter)
+{
+    bits_image_t *dst = &iter->image->bits;
+    const uint32_t *pixels = iter->buffer;
+    const int n = iter->width;
+    int col = iter->x;
+    int row = iter->y;
+
+    dst->store_scanline_64 (dst, col, row, n, pixels);
+
+    if (dst->common.alpha_map)
+    {
+	col -= dst->common.alpha_origin_x;
+	row -= dst->common.alpha_origin_y;
+
+	dst->common.alpha_map->store_scanline_64 (
+	    dst->common.alpha_map, col, row, n, pixels);
+    }
+
+    iter->y++;
+}
+
+/* Write-back for the direct path, where iter->buffer points into the
+ * image itself: nothing is copied, the buffer pointer just moves on by
+ * one rowstride.
+ */
+static void
+dest_write_back_direct (pixman_iter_t *iter)
+{
+    const int stride = iter->image->bits.rowstride;
+
+    iter->buffer = iter->buffer + stride;
+}
+
+/* Install the destination getter and write-back callbacks on iter.
+ *
+ * Wide iterators always use the 64-bit pair.  Narrow iterators operate
+ * directly on the image bits when the image has neither alpha map nor
+ * accessors and its format is a8r8g8b8 (or x8r8g8b8 with
+ * ITER_LOCALIZED_ALPHA set); otherwise they go through the 32-bit
+ * fetch/store pair, skipping the fetch when both ITER_IGNORE_RGB and
+ * ITER_IGNORE_ALPHA are set.
+ */
+void
+_pixman_bits_image_dest_iter_init (pixman_image_t *image, pixman_iter_t *iter)
+{
+    const uint32_t direct_flags =
+	FAST_PATH_NO_ALPHA_MAP | FAST_PATH_NO_ACCESSORS;
+
+    if (!(iter->flags & ITER_NARROW))
+    {
+	iter->get_scanline = dest_get_scanline_wide;
+	iter->write_back = dest_write_back_wide;
+	return;
+    }
+
+    if ((image->common.flags & direct_flags) == direct_flags	&&
+	(image->bits.format == PIXMAN_a8r8g8b8			||
+	 (image->bits.format == PIXMAN_x8r8g8b8			&&
+	  (iter->flags & ITER_LOCALIZED_ALPHA))))
+    {
+	/* Point the iterator straight at the first pixel of the row */
+	iter->buffer = image->bits.bits + iter->y * image->bits.rowstride + iter->x;
+
+	iter->get_scanline = _pixman_iter_get_scanline_noop;
+	iter->write_back = dest_write_back_direct;
+	return;
+    }
+
+    if ((iter->flags & (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) ==
+	(ITER_IGNORE_RGB | ITER_IGNORE_ALPHA))
+    {
+	/* The existing destination contents are not needed */
+	iter->get_scanline = _pixman_iter_get_scanline_noop;
+    }
+    else
+    {
+	iter->get_scanline = dest_get_scanline_narrow;
+    }
+
+    iter->write_back = dest_write_back_narrow;
+}
+
+/* Allocate a zero-filled pixel buffer for a width x height image of the
+ * given format.  Each row is padded to a whole number of uint32_t; the
+ * resulting row size in bytes is stored in *rowstride_bytes when that
+ * pointer is non-NULL.  Returns NULL if a size computation would
+ * overflow an int or if the allocation fails.
+ */
+static uint32_t *
+create_bits (pixman_format_code_t format,
+             int                  width,
+             int                  height,
+             int *                rowstride_bytes)
+{
+    int bits_per_pixel = PIXMAN_FORMAT_BPP (format);
+    int row_bits;
+    int row_bytes;
+    int total_bytes;
+
+    /* what follows is a long-winded way, avoiding any possibility of integer
+     * overflows, of saying:
+     * stride = ((width * bpp + 0x1f) >> 5) * sizeof (uint32_t);
+     */
+
+    if (pixman_multiply_overflows_int (width, bits_per_pixel))
+	return NULL;
+
+    row_bits = width * bits_per_pixel;
+    if (pixman_addition_overflows_int (row_bits, 0x1f))
+	return NULL;
+
+    /* Round up to whole 32-bit words, then convert words to bytes */
+    row_bytes = ((row_bits + 0x1f) >> 5) * sizeof (uint32_t);
+
+    if (pixman_multiply_overflows_int (height, row_bytes))
+	return NULL;
+
+    total_bytes = height * row_bytes;
+
+    if (rowstride_bytes)
+	*rowstride_bytes = row_bytes;
+
+    return calloc (total_bytes, 1);
+}
+
+/* Create a bits image of the given format and size.
+ *
+ * bits:            pixel storage supplied by the caller, or NULL to have
+ *                  a zero-filled buffer allocated (only when both width
+ *                  and height are non-zero); an allocated buffer is
+ *                  recorded in free_me
+ * rowstride_bytes: row size in bytes of the supplied storage; must be a
+ *                  multiple of sizeof (uint32_t).  Overwritten when the
+ *                  buffer is allocated here.
+ *
+ * Returns NULL if the arguments are rejected or an allocation fails.
+ */
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_bits (pixman_format_code_t format,
+                          int                  width,
+                          int                  height,
+                          uint32_t *           bits,
+                          int                  rowstride_bytes)
+{
+    pixman_image_t *image;
+    bits_image_t *b;
+    uint32_t *owned = NULL;
+
+    /* must be a whole number of uint32_t's
+     */
+    return_val_if_fail (
+	bits == NULL || (rowstride_bytes % sizeof (uint32_t)) == 0, NULL);
+
+    return_val_if_fail (PIXMAN_FORMAT_BPP (format) >= PIXMAN_FORMAT_DEPTH (format), NULL);
+
+    if (!bits && width && height)
+    {
+	owned = create_bits (format, width, height, &rowstride_bytes);
+	if (!owned)
+	    return NULL;
+
+	bits = owned;
+    }
+
+    image = _pixman_image_allocate ();
+    if (!image)
+    {
+	/* free (NULL) is a no-op, so no guard is needed */
+	free (owned);
+	return NULL;
+    }
+
+    image->type = BITS;
+
+    b = &image->bits;
+    b->format = format;
+    b->width = width;
+    b->height = height;
+    b->bits = bits;
+    b->free_me = owned;
+    b->read_func = NULL;
+    b->write_func = NULL;
+
+    /* The rowstride is stored in number of uint32_t */
+    b->rowstride = rowstride_bytes / (int) sizeof (uint32_t);
+
+    b->indexed = NULL;
+
+    image->common.property_changed = bits_image_property_changed;
+
+    _pixman_image_reset_clip_region (image);
+
+    return image;
+}
diff --git a/pixman/pixman/pixman-conical-gradient.c b/pixman/pixman/pixman-conical-gradient.c
index 9d7d2e8b5..e3f230262 100644
--- a/pixman/pixman/pixman-conical-gradient.c
+++ b/pixman/pixman/pixman-conical-gradient.c
@@ -1,214 +1,211 @@
-/*
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007 Red Hat, Inc.
- * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
- *             2005 Lars Knoll & Zack Rusin, Trolltech
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Keith Packard not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission.  Keith Packard makes no
- * representations about the suitability of this software for any purpose.  It
- * is provided "as is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#include <stdlib.h>
-#include <math.h>
-#include "pixman-private.h"
-
-static force_inline double
-coordinates_to_parameter (double x, double y, double angle)
-{
-    double t;
-
-    t = atan2 (y, x) + angle;
-
-    while (t < 0)
-	t += 2 * M_PI;
-
-    while (t >= 2 * M_PI)
-	t -= 2 * M_PI;
-
-    return 1 - t * (1 / (2 * M_PI)); /* Scale t to [0, 1] and
-				      * make rotation CCW
-				      */
-}
-
-static uint32_t *
-conical_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
-{
-    pixman_image_t *image = iter->image;
-    int x = iter->x;
-    int y = iter->y;
-    int width = iter->width;
-    uint32_t *buffer = iter->buffer;
-
-    gradient_t *gradient = (gradient_t *)image;
-    conical_gradient_t *conical = (conical_gradient_t *)image;
-    uint32_t       *end = buffer + width;
-    pixman_gradient_walker_t walker;
-    pixman_bool_t affine = TRUE;
-    double cx = 1.;
-    double cy = 0.;
-    double cz = 0.;
-    double rx = x + 0.5;
-    double ry = y + 0.5;
-    double rz = 1.;
-
-    _pixman_gradient_walker_init (&walker, gradient, image->common.repeat);
-
-    if (image->common.transform)
-    {
-	pixman_vector_t v;
-
-	/* reference point is the center of the pixel */
-	v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
-	v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
-	v.vector[2] = pixman_fixed_1;
-
-	if (!pixman_transform_point_3d (image->common.transform, &v))
-	    return iter->buffer;
-
-	cx = image->common.transform->matrix[0][0] / 65536.;
-	cy = image->common.transform->matrix[1][0] / 65536.;
-	cz = image->common.transform->matrix[2][0] / 65536.;
-
-	rx = v.vector[0] / 65536.;
-	ry = v.vector[1] / 65536.;
-	rz = v.vector[2] / 65536.;
-
-	affine =
-	    image->common.transform->matrix[2][0] == 0 &&
-	    v.vector[2] == pixman_fixed_1;
-    }
-
-    if (affine)
-    {
-	rx -= conical->center.x / 65536.;
-	ry -= conical->center.y / 65536.;
-
-	while (buffer < end)
-	{
-	    if (!mask || *mask++)
-	    {
-		double t = coordinates_to_parameter (rx, ry, conical->angle);
-
-		*buffer = _pixman_gradient_walker_pixel (
-		    &walker, (pixman_fixed_48_16_t)pixman_double_to_fixed (t));
-	    }
-
-	    ++buffer;
-
-	    rx += cx;
-	    ry += cy;
-	}
-    }
-    else
-    {
-	while (buffer < end)
-	{
-	    double x, y;
-
-	    if (!mask || *mask++)
-	    {
-		double t;
-
-		if (rz != 0)
-		{
-		    x = rx / rz;
-		    y = ry / rz;
-		}
-		else
-		{
-		    x = y = 0.;
-		}
-
-		x -= conical->center.x / 65536.;
-		y -= conical->center.y / 65536.;
-
-		t = coordinates_to_parameter (x, y, conical->angle);
-
-		*buffer = _pixman_gradient_walker_pixel (
-		    &walker, (pixman_fixed_48_16_t)pixman_double_to_fixed (t));
-	    }
-
-	    ++buffer;
-
-	    rx += cx;
-	    ry += cy;
-	    rz += cz;
-	}
-    }
-
-    iter->y++;
-    return iter->buffer;
-}
-
-static uint32_t *
-conical_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
-{
-    uint32_t *buffer = conical_get_scanline_narrow (iter, NULL);
-
-    pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
-
-    return buffer;
-}
-
-void
-_pixman_conical_gradient_iter_init (pixman_image_t *image,
-				    pixman_iter_t *iter,
-				    int x, int y, int width, int height,
-				    uint8_t *buffer, iter_flags_t flags)
-{
-    if (flags & ITER_NARROW)
-	iter->get_scanline = conical_get_scanline_narrow;
-    else
-	iter->get_scanline = conical_get_scanline_wide;
-}
-
-PIXMAN_EXPORT pixman_image_t *
-pixman_image_create_conical_gradient (pixman_point_fixed_t *        center,
-                                      pixman_fixed_t                angle,
-                                      const pixman_gradient_stop_t *stops,
-                                      int                           n_stops)
-{
-    pixman_image_t *image = _pixman_image_allocate ();
-    conical_gradient_t *conical;
-
-    if (!image)
-	return NULL;
-
-    conical = &image->conical;
-
-    if (!_pixman_init_gradient (&conical->common, stops, n_stops))
-    {
-	free (image);
-	return NULL;
-    }
-
-    angle = MOD (angle, pixman_int_to_fixed (360));
-
-    image->type = CONICAL;
-
-    conical->center = *center;
-    conical->angle = (pixman_fixed_to_double (angle) / 180.0) * M_PI;
-
-    return image;
-}
-
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <math.h>
+#include "pixman-private.h"
+
+/* Map the point (x, y) to a gradient parameter: take its angle around
+ * the origin, add 'angle' (radians), normalise the sum into [0, 2*pi)
+ * and return 1 minus that fraction of a full turn.
+ */
+static force_inline double
+coordinates_to_parameter (double x, double y, double angle)
+{
+    const double full_turn = 2 * M_PI;
+    double theta = atan2 (y, x) + angle;
+
+    while (theta < 0)
+	theta += full_turn;
+
+    while (theta >= full_turn)
+	theta -= full_turn;
+
+    /* Scale to [0, 1] and make rotation CCW */
+    return 1 - theta * (1 / full_turn);
+}
+
+/* Produce one scanline of the conical gradient in iter->buffer: for
+ * each of the iter->width pixels of row iter->y, starting at column
+ * iter->x, compute the gradient parameter of the (transformed) pixel
+ * centre relative to the gradient centre and look up its colour with
+ * the gradient walker.  Pixels whose mask entry is 0 are left untouched.
+ *
+ * Returns iter->buffer.  iter->y is advanced by one, except when
+ * transforming the start point fails, in which case the function returns
+ * early without writing anything and without advancing.
+ */
+static uint32_t *
+conical_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
+{
+    pixman_image_t *image = iter->image;
+    int x = iter->x;
+    int y = iter->y;
+    int width = iter->width;
+    uint32_t *buffer = iter->buffer;
+
+    gradient_t *gradient = (gradient_t *)image;
+    conical_gradient_t *conical = (conical_gradient_t *)image;
+    uint32_t       *end = buffer + width;
+    pixman_gradient_walker_t walker;
+    pixman_bool_t affine = TRUE;
+    /* (cx, cy, cz): per-pixel step; (rx, ry, rz): current position.
+     * The defaults below are used when the image has no transform.
+     */
+    double cx = 1.;
+    double cy = 0.;
+    double cz = 0.;
+    double rx = x + 0.5;
+    double ry = y + 0.5;
+    double rz = 1.;
+
+    _pixman_gradient_walker_init (&walker, gradient, image->common.repeat);
+
+    if (image->common.transform)
+    {
+	pixman_vector_t v;
+
+	/* reference point is the center of the pixel */
+	v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
+	v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
+	v.vector[2] = pixman_fixed_1;
+
+	if (!pixman_transform_point_3d (image->common.transform, &v))
+	    return iter->buffer;
+
+	/* Convert the fixed-point step and start position to doubles */
+	cx = image->common.transform->matrix[0][0] / 65536.;
+	cy = image->common.transform->matrix[1][0] / 65536.;
+	cz = image->common.transform->matrix[2][0] / 65536.;
+
+	rx = v.vector[0] / 65536.;
+	ry = v.vector[1] / 65536.;
+	rz = v.vector[2] / 65536.;
+
+	/* The third coordinate stays at 1 along the row, so no
+	 * per-pixel division is needed.
+	 */
+	affine =
+	    image->common.transform->matrix[2][0] == 0 &&
+	    v.vector[2] == pixman_fixed_1;
+    }
+
+    if (affine)
+    {
+	/* Make the position relative to the gradient centre once */
+	rx -= conical->center.x / 65536.;
+	ry -= conical->center.y / 65536.;
+
+	while (buffer < end)
+	{
+	    if (!mask || *mask++)
+	    {
+		double t = coordinates_to_parameter (rx, ry, conical->angle);
+
+		*buffer = _pixman_gradient_walker_pixel (
+		    &walker, (pixman_fixed_48_16_t)pixman_double_to_fixed (t));
+	    }
+
+	    ++buffer;
+
+	    rx += cx;
+	    ry += cy;
+	}
+    }
+    else
+    {
+	while (buffer < end)
+	{
+	    /* These shadow the integer x and y declared above */
+	    double x, y;
+
+	    if (!mask || *mask++)
+	    {
+		double t;
+
+		/* Divide by the third coordinate; use (0, 0) if it is 0 */
+		if (rz != 0)
+		{
+		    x = rx / rz;
+		    y = ry / rz;
+		}
+		else
+		{
+		    x = y = 0.;
+		}
+
+		x -= conical->center.x / 65536.;
+		y -= conical->center.y / 65536.;
+
+		t = coordinates_to_parameter (x, y, conical->angle);
+
+		*buffer = _pixman_gradient_walker_pixel (
+		    &walker, (pixman_fixed_48_16_t)pixman_double_to_fixed (t));
+	    }
+
+	    ++buffer;
+
+	    rx += cx;
+	    ry += cy;
+	    rz += cz;
+	}
+    }
+
+    iter->y++;
+    return iter->buffer;
+}
+
+/* 64-bit variant: produce the 32-bit scanline (ignoring the mask) and
+ * expand it in place to 64-bit pixels with pixman_expand.
+ */
+static uint32_t *
+conical_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+    uint32_t *narrow = conical_get_scanline_narrow (iter, NULL);
+    uint64_t *wide = (uint64_t *)narrow;
+
+    pixman_expand (wide, narrow, PIXMAN_a8r8g8b8, iter->width);
+
+    return narrow;
+}
+
+/* Install the conical-gradient scanline getter on iter: the 32-bit one
+ * when ITER_NARROW is set in iter->flags, the 64-bit one otherwise.
+ */
+void
+_pixman_conical_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter)
+{
+    if (!(iter->flags & ITER_NARROW))
+    {
+	iter->get_scanline = conical_get_scanline_wide;
+	return;
+    }
+
+    iter->get_scanline = conical_get_scanline_narrow;
+}
+
+/* Create a conical gradient image.
+ *
+ * center:  centre point of the gradient (copied)
+ * angle:   start angle in degrees as a fixed-point value; it is reduced
+ *          modulo 360 and stored in radians
+ * stops:   colour stops, n_stops of them
+ *
+ * Returns NULL if the image cannot be allocated or the gradient cannot
+ * be initialised.
+ */
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_conical_gradient (pixman_point_fixed_t *        center,
+                                      pixman_fixed_t                angle,
+                                      const pixman_gradient_stop_t *stops,
+                                      int                           n_stops)
+{
+    pixman_image_t *image;
+    conical_gradient_t *conical;
+    double degrees;
+
+    image = _pixman_image_allocate ();
+    if (image == NULL)
+	return NULL;
+
+    conical = &image->conical;
+
+    if (!_pixman_init_gradient (&conical->common, stops, n_stops))
+    {
+	free (image);
+	return NULL;
+    }
+
+    angle = MOD (angle, pixman_int_to_fixed (360));
+    degrees = pixman_fixed_to_double (angle);
+
+    image->type = CONICAL;
+
+    conical->center = *center;
+    conical->angle = (degrees / 180.0) * M_PI;
+
+    return image;
+}
+
diff --git a/pixman/pixman/pixman-general.c b/pixman/pixman/pixman-general.c
index 872fb7e9f..5bac6c65a 100644
--- a/pixman/pixman/pixman-general.c
+++ b/pixman/pixman/pixman-general.c
@@ -1,311 +1,275 @@
-/*
- * Copyright © 2009 Red Hat, Inc.
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007 Red Hat, Inc.
- * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
- *             2005 Lars Knoll & Zack Rusin, Trolltech
- *             2008 Aaron Plattner, NVIDIA Corporation
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Red Hat not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission.  Red Hat makes no representations about the
- * suitability of this software for any purpose.  It is provided "as is"
- * without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- */
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <limits.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "pixman-private.h"
-
-static void
-general_src_iter_init (pixman_implementation_t *imp,
-		       pixman_iter_t *iter,
-		       pixman_image_t *image,
-		       int x, int y, int width, int height,
-		       uint8_t *buffer, iter_flags_t flags)
-{
-    iter->image = image;
-    iter->x = x;
-    iter->y = y;
-    iter->width = width;
-    iter->buffer = (uint32_t *)buffer;
-
-    if (image->type == SOLID)
-    {
-	_pixman_solid_fill_iter_init (
-	    image, iter, x, y, width, height, buffer, flags);
-    }
-    else if (image->type == LINEAR)
-    {
-	_pixman_linear_gradient_iter_init (
-	    image, iter, x, y, width, height, buffer, flags);
-    }
-    else if (image->type == RADIAL)
-    {
-	_pixman_radial_gradient_iter_init (
-	    image, iter, x, y, width, height, buffer, flags);
-    }
-    else if (image->type == CONICAL)
-    {
-	_pixman_conical_gradient_iter_init (
-	    image, iter, x, y, width, height, buffer, flags);
-    }
-    else if (image->type == BITS)
-    {
-	_pixman_bits_image_src_iter_init (
-	    image, iter, x, y, width, height, buffer, flags);
-    }
-    else
-    {
-	_pixman_log_error (FUNC, "Pixman bug: unknown image type\n");
-    }
-}
-
-static void
-general_dest_iter_init (pixman_implementation_t *imp,
-			pixman_iter_t *iter,
-			pixman_image_t *image,
-			int x, int y, int width, int height,
-			uint8_t *buffer, iter_flags_t flags)
-{
-    iter->image = image;
-    iter->x = x;
-    iter->y = y;
-    iter->width = width;
-    iter->buffer = (uint32_t *)buffer;
-
-    if (image->type == BITS)
-    {
-	_pixman_bits_image_dest_iter_init (
-	    image, iter, x, y, width, height, buffer, flags);
-    }
-    else
-    {
-	_pixman_log_error (FUNC, "Trying to write to a non-writable image");
-    }
-}
-
-typedef struct op_info_t op_info_t;
-struct op_info_t
-{
-    uint8_t src, dst;
-};
-
-#define ITER_IGNORE_BOTH						\
-    (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB | ITER_LOCALIZED_ALPHA)
-
-static const op_info_t op_flags[PIXMAN_N_OPERATORS] =
-{
-    /* Src                   Dst                   */
-    { ITER_IGNORE_BOTH,      ITER_IGNORE_BOTH      }, /* CLEAR */
-    { ITER_LOCALIZED_ALPHA,  ITER_IGNORE_BOTH      }, /* SRC */
-    { ITER_IGNORE_BOTH,      ITER_LOCALIZED_ALPHA  }, /* DST */
-    { 0,                     ITER_LOCALIZED_ALPHA  }, /* OVER */
-    { ITER_LOCALIZED_ALPHA,  0                     }, /* OVER_REVERSE */
-    { ITER_LOCALIZED_ALPHA,  ITER_IGNORE_RGB       }, /* IN */
-    { ITER_IGNORE_RGB,       ITER_LOCALIZED_ALPHA  }, /* IN_REVERSE */
-    { ITER_LOCALIZED_ALPHA,  ITER_IGNORE_RGB       }, /* OUT */
-    { ITER_IGNORE_RGB,       ITER_LOCALIZED_ALPHA  }, /* OUT_REVERSE */
-    { 0,                     0                     }, /* ATOP */
-    { 0,                     0                     }, /* ATOP_REVERSE */
-    { 0,                     0                     }, /* XOR */
-    { ITER_LOCALIZED_ALPHA,  ITER_LOCALIZED_ALPHA  }, /* ADD */
-    { 0,                     0                     }, /* SATURATE */
-};
-
-#define SCANLINE_BUFFER_LENGTH 8192
-
-static void
-general_composite_rect  (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         pixman_image_t *         src,
-                         pixman_image_t *         mask,
-                         pixman_image_t *         dest,
-                         int32_t                  src_x,
-                         int32_t                  src_y,
-                         int32_t                  mask_x,
-                         int32_t                  mask_y,
-                         int32_t                  dest_x,
-                         int32_t                  dest_y,
-                         int32_t                  width,
-                         int32_t                  height)
-{
-    uint64_t stack_scanline_buffer[(SCANLINE_BUFFER_LENGTH * 3 + 7) / 8];
-    uint8_t *scanline_buffer = (uint8_t *) stack_scanline_buffer;
-    uint8_t *src_buffer, *mask_buffer, *dest_buffer;
-    pixman_iter_t src_iter, mask_iter, dest_iter;
-    pixman_combine_32_func_t compose;
-    pixman_bool_t component_alpha;
-    iter_flags_t narrow, src_flags;
-    int Bpp;
-    int i;
-
-    if ((src->common.flags & FAST_PATH_NARROW_FORMAT)		&&
-	(!mask || mask->common.flags & FAST_PATH_NARROW_FORMAT)	&&
-	(dest->common.flags & FAST_PATH_NARROW_FORMAT))
-    {
-	narrow = ITER_NARROW;
-	Bpp = 4;
-    }
-    else
-    {
-	narrow = 0;
-	Bpp = 8;
-    }
-
-    if (width * Bpp > SCANLINE_BUFFER_LENGTH)
-    {
-	scanline_buffer = pixman_malloc_abc (width, 3, Bpp);
-
-	if (!scanline_buffer)
-	    return;
-    }
-
-    src_buffer = scanline_buffer;
-    mask_buffer = src_buffer + width * Bpp;
-    dest_buffer = mask_buffer + width * Bpp;
-
-    /* src iter */
-    src_flags = narrow | op_flags[op].src;
-
-    _pixman_implementation_src_iter_init (imp->toplevel, &src_iter, src,
-					  src_x, src_y, width, height,
-					  src_buffer, src_flags);
-
-    /* mask iter */
-    if ((src_flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) ==
-	(ITER_IGNORE_ALPHA | ITER_IGNORE_RGB))
-    {
-	/* If it doesn't matter what the source is, then it doesn't matter
-	 * what the mask is
-	 */
-	mask = NULL;
-    }
-
-    component_alpha =
-        mask                            &&
-        mask->common.type == BITS       &&
-        mask->common.component_alpha    &&
-        PIXMAN_FORMAT_RGB (mask->bits.format);
-
-    _pixman_implementation_src_iter_init (
-	imp->toplevel, &mask_iter, mask, mask_x, mask_y, width, height,
-	mask_buffer, narrow | (component_alpha? 0 : ITER_IGNORE_RGB));
-
-    /* dest iter */
-    _pixman_implementation_dest_iter_init (imp->toplevel, &dest_iter, dest,
-					   dest_x, dest_y, width, height,
-					   dest_buffer,
-					   narrow | op_flags[op].dst);
-
-    if (narrow)
-    {
-	if (component_alpha)
-	    compose = _pixman_implementation_combine_32_ca;
-	else
-	    compose = _pixman_implementation_combine_32;
-    }
-    else
-    {
-	if (component_alpha)
-	    compose = (pixman_combine_32_func_t)_pixman_implementation_combine_64_ca;
-	else
-	    compose = (pixman_combine_32_func_t)_pixman_implementation_combine_64;
-    }
-
-    if (!compose)
-	return;
-
-    for (i = 0; i < height; ++i)
-    {
-	uint32_t *s, *m, *d;
-
-	m = mask_iter.get_scanline (&mask_iter, NULL);
-	s = src_iter.get_scanline (&src_iter, m);
-	d = dest_iter.get_scanline (&dest_iter, NULL);
-
-	compose (imp->toplevel, op, d, s, m, width);
-
-	dest_iter.write_back (&dest_iter);
-    }
-
-    if (scanline_buffer != (uint8_t *) stack_scanline_buffer)
-	free (scanline_buffer);
-}
-
-static const pixman_fast_path_t general_fast_path[] =
-{
-    { PIXMAN_OP_any, PIXMAN_any, 0, PIXMAN_any,	0, PIXMAN_any, 0, general_composite_rect },
-    { PIXMAN_OP_NONE }
-};
-
-static pixman_bool_t
-general_blt (pixman_implementation_t *imp,
-             uint32_t *               src_bits,
-             uint32_t *               dst_bits,
-             int                      src_stride,
-             int                      dst_stride,
-             int                      src_bpp,
-             int                      dst_bpp,
-             int                      src_x,
-             int                      src_y,
-             int                      dst_x,
-             int                      dst_y,
-             int                      width,
-             int                      height)
-{
-    /* We can't blit unless we have sse2 or mmx */
-
-    return FALSE;
-}
-
-static pixman_bool_t
-general_fill (pixman_implementation_t *imp,
-              uint32_t *               bits,
-              int                      stride,
-              int                      bpp,
-              int                      x,
-              int                      y,
-              int                      width,
-              int                      height,
-              uint32_t xor)
-{
-    return FALSE;
-}
-
-pixman_implementation_t *
-_pixman_implementation_create_general (void)
-{
-    pixman_implementation_t *imp = _pixman_implementation_create (NULL, general_fast_path);
-
-    _pixman_setup_combiner_functions_32 (imp);
-    _pixman_setup_combiner_functions_64 (imp);
-
-    imp->blt = general_blt;
-    imp->fill = general_fill;
-    imp->src_iter_init = general_src_iter_init;
-    imp->dest_iter_init = general_dest_iter_init;
-
-    return imp;
-}
-
+/*
+ * Copyright © 2009 Red Hat, Inc.
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ *             2008 Aaron Plattner, NVIDIA Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  Red Hat makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pixman-private.h"
+
+static void
+general_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{
+    pixman_image_t *source = iter->image;
+
+    /* Hand the iterator to the initializer that matches the image type. */
+    switch (source->type)
+    {
+    case SOLID:   _pixman_solid_fill_iter_init (source, iter);       break;
+    case LINEAR:  _pixman_linear_gradient_iter_init (source, iter);  break;
+    case RADIAL:  _pixman_radial_gradient_iter_init (source, iter);  break;
+    case CONICAL: _pixman_conical_gradient_iter_init (source, iter); break;
+    case BITS:    _pixman_bits_image_src_iter_init (source, iter);   break;
+    default:
+	_pixman_log_error (FUNC, "Pixman bug: unknown image type\n");
+	break;
+    }
+}
+
+static void
+general_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{
+    pixman_image_t *target = iter->image;
+    /* Only BITS images get a destination iterator; anything else is just logged. */
+    if (target->type != BITS)
+    {
+	_pixman_log_error (FUNC, "Trying to write to a non-writable image");
+	return;
+    }
+    _pixman_bits_image_dest_iter_init (target, iter);
+}
+
+typedef struct op_info_t op_info_t;
+struct op_info_t
+{
+    uint8_t src, dst;	/* iterator flags OR-ed into the source / destination iterator setup */
+};
+
+#define ITER_IGNORE_BOTH						\
+    (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB | ITER_LOCALIZED_ALPHA)
+/* Iterator flags per operator, looked up as op_flags[op] when the source and destination iterators are set up. */
+static const op_info_t op_flags[PIXMAN_N_OPERATORS] =
+{
+    /* Src                   Dst                   */
+    { ITER_IGNORE_BOTH,      ITER_IGNORE_BOTH      }, /* CLEAR */
+    { ITER_LOCALIZED_ALPHA,  ITER_IGNORE_BOTH      }, /* SRC */
+    { ITER_IGNORE_BOTH,      ITER_LOCALIZED_ALPHA  }, /* DST */
+    { 0,                     ITER_LOCALIZED_ALPHA  }, /* OVER */
+    { ITER_LOCALIZED_ALPHA,  0                     }, /* OVER_REVERSE */
+    { ITER_LOCALIZED_ALPHA,  ITER_IGNORE_RGB       }, /* IN */
+    { ITER_IGNORE_RGB,       ITER_LOCALIZED_ALPHA  }, /* IN_REVERSE */
+    { ITER_LOCALIZED_ALPHA,  ITER_IGNORE_RGB       }, /* OUT */
+    { ITER_IGNORE_RGB,       ITER_LOCALIZED_ALPHA  }, /* OUT_REVERSE */
+    { 0,                     0                     }, /* ATOP */
+    { 0,                     0                     }, /* ATOP_REVERSE */
+    { 0,                     0                     }, /* XOR */
+    { ITER_LOCALIZED_ALPHA,  ITER_LOCALIZED_ALPHA  }, /* ADD */
+    { 0,                     0                     }, /* SATURATE */
+};
+
+#define SCANLINE_BUFFER_LENGTH 8192
+
+/* Composite a width x height rectangle one scanline at a time: fetch the mask
+ * (which may be NULL), source and destination rows into temporary buffers,
+ * combine them with 'op', and write the destination row back. */
+static void
+general_composite_rect  (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         pixman_image_t *         src,
+                         pixman_image_t *         mask,
+                         pixman_image_t *         dest,
+                         int32_t                  src_x,
+                         int32_t                  src_y,
+                         int32_t                  mask_x,
+                         int32_t                  mask_y,
+                         int32_t                  dest_x,
+                         int32_t                  dest_y,
+                         int32_t                  width,
+                         int32_t                  height)
+{
+    uint64_t stack_scanline_buffer[(SCANLINE_BUFFER_LENGTH * 3 + 7) / 8];
+    uint8_t *scanline_buffer = (uint8_t *) stack_scanline_buffer;
+    uint8_t *src_buffer, *mask_buffer, *dest_buffer;
+    pixman_iter_t src_iter, mask_iter, dest_iter;
+    pixman_combine_32_func_t compose;
+    pixman_bool_t component_alpha;
+    iter_flags_t narrow, src_flags;
+    int Bpp;
+    int i;
+
+    if ((src->common.flags & FAST_PATH_NARROW_FORMAT)		&&
+	(!mask || mask->common.flags & FAST_PATH_NARROW_FORMAT)	&&
+	(dest->common.flags & FAST_PATH_NARROW_FORMAT))
+    {
+	narrow = ITER_NARROW;
+	Bpp = 4;
+    }
+    else
+    {
+	narrow = 0;
+	Bpp = 8;
+    }
+    /* Divide instead of multiplying: width * Bpp could overflow for a huge width and wrongly select the stack buffer. */
+    if (width > SCANLINE_BUFFER_LENGTH / Bpp)
+    {
+	scanline_buffer = pixman_malloc_abc (width, 3, Bpp);
+	if (!scanline_buffer)
+	    return;
+    }
+
+    src_buffer = scanline_buffer;
+    mask_buffer = src_buffer + width * Bpp;
+    dest_buffer = mask_buffer + width * Bpp;
+
+    /* src iter */
+    src_flags = narrow | op_flags[op].src;
+
+    _pixman_implementation_src_iter_init (imp->toplevel, &src_iter, src,
+					  src_x, src_y, width, height,
+					  src_buffer, src_flags);
+
+    /* mask iter */
+    if ((src_flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) ==
+	(ITER_IGNORE_ALPHA | ITER_IGNORE_RGB))
+    {
+	/* If it doesn't matter what the source is, then it doesn't matter
+	 * what the mask is
+	 */
+	mask = NULL;
+    }
+
+    component_alpha =
+        mask                            &&
+        mask->common.type == BITS       &&
+        mask->common.component_alpha    &&
+        PIXMAN_FORMAT_RGB (mask->bits.format);
+
+    _pixman_implementation_src_iter_init (
+	imp->toplevel, &mask_iter, mask, mask_x, mask_y, width, height,
+	mask_buffer, narrow | (component_alpha? 0 : ITER_IGNORE_RGB));
+
+    /* dest iter */
+    _pixman_implementation_dest_iter_init (imp->toplevel, &dest_iter, dest,
+					   dest_x, dest_y, width, height,
+					   dest_buffer,
+					   narrow | op_flags[op].dst);
+
+    if (narrow)
+    {
+	if (component_alpha)
+	    compose = _pixman_implementation_combine_32_ca;
+	else
+	    compose = _pixman_implementation_combine_32;
+    }
+    else
+    {
+	if (component_alpha)
+	    compose = (pixman_combine_32_func_t)_pixman_implementation_combine_64_ca;
+	else
+	    compose = (pixman_combine_32_func_t)_pixman_implementation_combine_64;
+    }
+
+    /* No early return from here on: a heap-allocated scanline buffer must
+     * reach the free () below, so a missing combiner just skips the loop.
+     */
+    for (i = 0; compose && i < height; ++i)
+    {
+	uint32_t *s, *m, *d;
+
+	m = mask_iter.get_scanline (&mask_iter, NULL);
+	s = src_iter.get_scanline (&src_iter, m);
+	d = dest_iter.get_scanline (&dest_iter, NULL);
+	compose (imp->toplevel, op, d, s, m, width);
+	dest_iter.write_back (&dest_iter);
+    }
+
+    if (scanline_buffer != (uint8_t *) stack_scanline_buffer)
+	free (scanline_buffer);
+}
+
+static const pixman_fast_path_t general_fast_path[] =
+{
+    { PIXMAN_OP_any, PIXMAN_any, 0, PIXMAN_any,	0, PIXMAN_any, 0, general_composite_rect }, /* any operator, any formats */
+    { PIXMAN_OP_NONE } /* presumably the end-of-table sentinel -- TODO confirm against the table walker */
+};
+
+static pixman_bool_t
+general_blt (pixman_implementation_t *imp,
+             uint32_t *               src_bits,
+             uint32_t *               dst_bits,
+             int                      src_stride,
+             int                      dst_stride,
+             int                      src_bpp,
+             int                      dst_bpp,
+             int                      src_x,
+             int                      src_y,
+             int                      dst_x,
+             int                      dst_y,
+             int                      width,
+             int                      height)
+{
+    /* We can't blit unless we have sse2 or mmx */
+    /* So this always returns FALSE: no argument is examined and nothing is copied. */
+    return FALSE;
+}
+
+static pixman_bool_t
+general_fill (pixman_implementation_t *imp,
+              uint32_t *               bits,
+              int                      stride,
+              int                      bpp,
+              int                      x,
+              int                      y,
+              int                      width,
+              int                      height,
+              uint32_t xor)
+{
+    return FALSE;	/* always FALSE: every argument is ignored and nothing is filled */
+}
+
+pixman_implementation_t *
+_pixman_implementation_create_general (void)
+{   /* Create the general implementation (NULL delegate, general_fast_path) and install its hooks. */
+    pixman_implementation_t *imp = _pixman_implementation_create (NULL, general_fast_path);
+    /* _pixman_implementation_create returns NULL when its allocation fails; pass that on instead of dereferencing it. */
+    if (!imp)
+	return NULL;
+    _pixman_setup_combiner_functions_32 (imp);
+    _pixman_setup_combiner_functions_64 (imp);
+    imp->blt = general_blt;
+    imp->fill = general_fill;
+    imp->src_iter_init = general_src_iter_init;
+    imp->dest_iter_init = general_dest_iter_init;
+    return imp;
+}
+
diff --git a/pixman/pixman/pixman-implementation.c b/pixman/pixman/pixman-implementation.c
index adaf9c61e..caade9332 100644
--- a/pixman/pixman/pixman-implementation.c
+++ b/pixman/pixman/pixman-implementation.c
@@ -1,306 +1,304 @@
-/*
- * Copyright © 2009 Red Hat, Inc.
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Red Hat not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission.  Red Hat makes no representations about the
- * suitability of this software for any purpose.  It is provided "as is"
- * without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include <stdlib.h>
-#include "pixman-private.h"
-
-static void
-delegate_combine_32 (pixman_implementation_t * imp,
-                     pixman_op_t               op,
-                     uint32_t *                dest,
-                     const uint32_t *          src,
-                     const uint32_t *          mask,
-                     int                       width)
-{
-    _pixman_implementation_combine_32 (imp->delegate,
-                                       op, dest, src, mask, width);
-}
-
-static void
-delegate_combine_64 (pixman_implementation_t * imp,
-                     pixman_op_t               op,
-                     uint64_t *                dest,
-                     const uint64_t *          src,
-                     const uint64_t *          mask,
-                     int                       width)
-{
-    _pixman_implementation_combine_64 (imp->delegate,
-                                       op, dest, src, mask, width);
-}
-
-static void
-delegate_combine_32_ca (pixman_implementation_t * imp,
-                        pixman_op_t               op,
-                        uint32_t *                dest,
-                        const uint32_t *          src,
-                        const uint32_t *          mask,
-                        int                       width)
-{
-    _pixman_implementation_combine_32_ca (imp->delegate,
-                                          op, dest, src, mask, width);
-}
-
-static void
-delegate_combine_64_ca (pixman_implementation_t * imp,
-                        pixman_op_t               op,
-                        uint64_t *                dest,
-                        const uint64_t *          src,
-                        const uint64_t *          mask,
-                        int                       width)
-{
-    _pixman_implementation_combine_64_ca (imp->delegate,
-                                          op, dest, src, mask, width);
-}
-
-static pixman_bool_t
-delegate_blt (pixman_implementation_t * imp,
-              uint32_t *                src_bits,
-              uint32_t *                dst_bits,
-              int                       src_stride,
-              int                       dst_stride,
-              int                       src_bpp,
-              int                       dst_bpp,
-              int                       src_x,
-              int                       src_y,
-              int                       dst_x,
-              int                       dst_y,
-              int                       width,
-              int                       height)
-{
-    return _pixman_implementation_blt (
-	imp->delegate, src_bits, dst_bits, src_stride, dst_stride,
-	src_bpp, dst_bpp, src_x, src_y, dst_x, dst_y,
-	width, height);
-}
-
-static pixman_bool_t
-delegate_fill (pixman_implementation_t *imp,
-               uint32_t *               bits,
-               int                      stride,
-               int                      bpp,
-               int                      x,
-               int                      y,
-               int                      width,
-               int                      height,
-               uint32_t                 xor)
-{
-    return _pixman_implementation_fill (
-	imp->delegate, bits, stride, bpp, x, y, width, height, xor);
-}
-
-static void
-delegate_src_iter_init (pixman_implementation_t *imp,
-			pixman_iter_t *	         iter,
-			pixman_image_t *         image,
-			int                      x,
-			int                      y,
-			int                      width,
-			int                      height,
-			uint8_t *		 buffer,
-			iter_flags_t             flags)
-{
-    _pixman_implementation_src_iter_init (
-	imp->delegate, iter, image, x, y, width, height, buffer, flags);
-}
-
-static void
-delegate_dest_iter_init (pixman_implementation_t *imp,
-			 pixman_iter_t *	  iter,
-			 pixman_image_t *         image,
-			 int                      x,
-			 int                      y,
-			 int                      width,
-			 int                      height,
-			 uint8_t *		  buffer,
-			 iter_flags_t             flags)
-{
-    _pixman_implementation_dest_iter_init (
-	imp->delegate, iter, image, x, y, width, height, buffer, flags);
-}
-
-pixman_implementation_t *
-_pixman_implementation_create (pixman_implementation_t *delegate,
-			       const pixman_fast_path_t *fast_paths)
-{
-    pixman_implementation_t *imp = malloc (sizeof (pixman_implementation_t));
-    pixman_implementation_t *d;
-    int i;
-
-    if (!imp)
-	return NULL;
-
-    assert (fast_paths);
-
-    /* Make sure the whole delegate chain has the right toplevel */
-    imp->delegate = delegate;
-    for (d = imp; d != NULL; d = d->delegate)
-	d->toplevel = imp;
-
-    /* Fill out function pointers with ones that just delegate
-     */
-    imp->blt = delegate_blt;
-    imp->fill = delegate_fill;
-    imp->src_iter_init = delegate_src_iter_init;
-    imp->dest_iter_init = delegate_dest_iter_init;
-
-    for (i = 0; i < PIXMAN_N_OPERATORS; ++i)
-    {
-	imp->combine_32[i] = delegate_combine_32;
-	imp->combine_64[i] = delegate_combine_64;
-	imp->combine_32_ca[i] = delegate_combine_32_ca;
-	imp->combine_64_ca[i] = delegate_combine_64_ca;
-    }
-
-    imp->fast_paths = fast_paths;
-
-    return imp;
-}
-
-void
-_pixman_implementation_combine_32 (pixman_implementation_t * imp,
-                                   pixman_op_t               op,
-                                   uint32_t *                dest,
-                                   const uint32_t *          src,
-                                   const uint32_t *          mask,
-                                   int                       width)
-{
-    (*imp->combine_32[op]) (imp, op, dest, src, mask, width);
-}
-
-void
-_pixman_implementation_combine_64 (pixman_implementation_t * imp,
-                                   pixman_op_t               op,
-                                   uint64_t *                dest,
-                                   const uint64_t *          src,
-                                   const uint64_t *          mask,
-                                   int                       width)
-{
-    (*imp->combine_64[op]) (imp, op, dest, src, mask, width);
-}
-
-void
-_pixman_implementation_combine_32_ca (pixman_implementation_t * imp,
-                                      pixman_op_t               op,
-                                      uint32_t *                dest,
-                                      const uint32_t *          src,
-                                      const uint32_t *          mask,
-                                      int                       width)
-{
-    (*imp->combine_32_ca[op]) (imp, op, dest, src, mask, width);
-}
-
-void
-_pixman_implementation_combine_64_ca (pixman_implementation_t * imp,
-                                      pixman_op_t               op,
-                                      uint64_t *                dest,
-                                      const uint64_t *          src,
-                                      const uint64_t *          mask,
-                                      int                       width)
-{
-    (*imp->combine_64_ca[op]) (imp, op, dest, src, mask, width);
-}
-
-pixman_bool_t
-_pixman_implementation_blt (pixman_implementation_t * imp,
-                            uint32_t *                src_bits,
-                            uint32_t *                dst_bits,
-                            int                       src_stride,
-                            int                       dst_stride,
-                            int                       src_bpp,
-                            int                       dst_bpp,
-                            int                       src_x,
-                            int                       src_y,
-                            int                       dst_x,
-                            int                       dst_y,
-                            int                       width,
-                            int                       height)
-{
-    return (*imp->blt) (imp, src_bits, dst_bits, src_stride, dst_stride,
-			src_bpp, dst_bpp, src_x, src_y, dst_x, dst_y,
-			width, height);
-}
-
-pixman_bool_t
-_pixman_implementation_fill (pixman_implementation_t *imp,
-                             uint32_t *               bits,
-                             int                      stride,
-                             int                      bpp,
-                             int                      x,
-                             int                      y,
-                             int                      width,
-                             int                      height,
-                             uint32_t                 xor)
-{
-    return (*imp->fill) (imp, bits, stride, bpp, x, y, width, height, xor);
-}
-
-static uint32_t *
-get_scanline_null (pixman_iter_t *iter, const uint32_t *mask)
-{
-    return NULL;
-}
-
-void
-_pixman_implementation_src_iter_init (pixman_implementation_t	*imp,
-				      pixman_iter_t             *iter,
-				      pixman_image_t		*image,
-				      int			 x,
-				      int			 y,
-				      int			 width,
-				      int			 height,
-				      uint8_t			*buffer,
-				      iter_flags_t		 flags)
-{
-    if (!image)
-    {
-	iter->get_scanline = get_scanline_null;
-    }
-    else if ((flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) ==
-	     (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB))
-    {
-	iter->get_scanline = _pixman_iter_get_scanline_noop;
-    }
-    else
-    {
-	(*imp->src_iter_init) (
-	    imp, iter, image, x, y, width, height, buffer, flags);
-    }
-}
-
-void
-_pixman_implementation_dest_iter_init (pixman_implementation_t	*imp,
-				       pixman_iter_t            *iter,
-				       pixman_image_t		*image,
-				       int			 x,
-				       int			 y,
-				       int			 width,
-				       int			 height,
-				       uint8_t			*buffer,
-				       iter_flags_t		 flags)
-{
-    (*imp->dest_iter_init) (
-	imp, iter, image, x, y, width, height, buffer, flags);
-}
+/*
+ * Copyright © 2009 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  Red Hat makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdlib.h>
+#include "pixman-private.h"
+
+static void
+delegate_combine_32 (pixman_implementation_t * imp,
+                     pixman_op_t               op,
+                     uint32_t *                dest,
+                     const uint32_t *          src,
+                     const uint32_t *          mask,
+                     int                       width)
+{
+    _pixman_implementation_combine_32 (imp->delegate,
+                                       op, dest, src, mask, width);
+}
+
+static void
+delegate_combine_64 (pixman_implementation_t * imp,
+                     pixman_op_t               op,
+                     uint64_t *                dest,
+                     const uint64_t *          src,
+                     const uint64_t *          mask,
+                     int                       width)
+{
+    _pixman_implementation_combine_64 (imp->delegate,
+                                       op, dest, src, mask, width);
+}
+
+static void
+delegate_combine_32_ca (pixman_implementation_t * imp,
+                        pixman_op_t               op,
+                        uint32_t *                dest,
+                        const uint32_t *          src,
+                        const uint32_t *          mask,
+                        int                       width)
+{
+    _pixman_implementation_combine_32_ca (imp->delegate,
+                                          op, dest, src, mask, width);
+}
+
+static void
+delegate_combine_64_ca (pixman_implementation_t * imp,
+                        pixman_op_t               op,
+                        uint64_t *                dest,
+                        const uint64_t *          src,
+                        const uint64_t *          mask,
+                        int                       width)
+{
+    _pixman_implementation_combine_64_ca (imp->delegate,
+                                          op, dest, src, mask, width);
+}
+
+static pixman_bool_t
+delegate_blt (pixman_implementation_t * imp,
+              uint32_t *                src_bits,
+              uint32_t *                dst_bits,
+              int                       src_stride,
+              int                       dst_stride,
+              int                       src_bpp,
+              int                       dst_bpp,
+              int                       src_x,
+              int                       src_y,
+              int                       dst_x,
+              int                       dst_y,
+              int                       width,
+              int                       height)
+{
+    return _pixman_implementation_blt (
+	imp->delegate, src_bits, dst_bits, src_stride, dst_stride,
+	src_bpp, dst_bpp, src_x, src_y, dst_x, dst_y,
+	width, height);
+}
+
+static pixman_bool_t
+delegate_fill (pixman_implementation_t *imp,
+               uint32_t *               bits,
+               int                      stride,
+               int                      bpp,
+               int                      x,
+               int                      y,
+               int                      width,
+               int                      height,
+               uint32_t                 xor)
+{
+    return _pixman_implementation_fill (
+	imp->delegate, bits, stride, bpp, x, y, width, height, xor);
+}
+
+static void
+delegate_src_iter_init (pixman_implementation_t *imp,
+			pixman_iter_t *	         iter)
+{
+    imp->delegate->src_iter_init (imp->delegate, iter);
+}
+
+static void
+delegate_dest_iter_init (pixman_implementation_t *imp,
+			 pixman_iter_t *	  iter)
+{
+    imp->delegate->dest_iter_init (imp->delegate, iter);
+}
+
+pixman_implementation_t *
+_pixman_implementation_create (pixman_implementation_t *delegate,
+			       const pixman_fast_path_t *fast_paths)
+{
+    pixman_implementation_t *imp = malloc (sizeof (pixman_implementation_t));
+    pixman_implementation_t *d;
+    int i;
+
+    if (!imp)
+	return NULL;
+
+    assert (fast_paths);
+
+    /* Make sure the whole delegate chain has the right toplevel */
+    imp->delegate = delegate;
+    for (d = imp; d != NULL; d = d->delegate)
+	d->toplevel = imp;
+
+    /* Fill out function pointers with ones that just delegate
+     */
+    imp->blt = delegate_blt;
+    imp->fill = delegate_fill;
+    imp->src_iter_init = delegate_src_iter_init;
+    imp->dest_iter_init = delegate_dest_iter_init;
+
+    for (i = 0; i < PIXMAN_N_OPERATORS; ++i)
+    {
+	imp->combine_32[i] = delegate_combine_32;
+	imp->combine_64[i] = delegate_combine_64;
+	imp->combine_32_ca[i] = delegate_combine_32_ca;
+	imp->combine_64_ca[i] = delegate_combine_64_ca;
+    }
+
+    imp->fast_paths = fast_paths;
+
+    return imp;
+}
+
+void
+_pixman_implementation_combine_32 (pixman_implementation_t * imp,
+                                   pixman_op_t               op,
+                                   uint32_t *                dest,
+                                   const uint32_t *          src,
+                                   const uint32_t *          mask,
+                                   int                       width)
+{
+    (*imp->combine_32[op]) (imp, op, dest, src, mask, width);
+}
+
+void
+_pixman_implementation_combine_64 (pixman_implementation_t * imp,
+                                   pixman_op_t               op,
+                                   uint64_t *                dest,
+                                   const uint64_t *          src,
+                                   const uint64_t *          mask,
+                                   int                       width)
+{
+    (*imp->combine_64[op]) (imp, op, dest, src, mask, width);
+}
+
+void
+_pixman_implementation_combine_32_ca (pixman_implementation_t * imp,
+                                      pixman_op_t               op,
+                                      uint32_t *                dest,
+                                      const uint32_t *          src,
+                                      const uint32_t *          mask,
+                                      int                       width)
+{
+    (*imp->combine_32_ca[op]) (imp, op, dest, src, mask, width);
+}
+
+void
+_pixman_implementation_combine_64_ca (pixman_implementation_t * imp,
+                                      pixman_op_t               op,
+                                      uint64_t *                dest,
+                                      const uint64_t *          src,
+                                      const uint64_t *          mask,
+                                      int                       width)
+{
+    (*imp->combine_64_ca[op]) (imp, op, dest, src, mask, width);
+}
+
+pixman_bool_t
+_pixman_implementation_blt (pixman_implementation_t * imp,
+                            uint32_t *                src_bits,
+                            uint32_t *                dst_bits,
+                            int                       src_stride,
+                            int                       dst_stride,
+                            int                       src_bpp,
+                            int                       dst_bpp,
+                            int                       src_x,
+                            int                       src_y,
+                            int                       dst_x,
+                            int                       dst_y,
+                            int                       width,
+                            int                       height)
+{
+    return (*imp->blt) (imp, src_bits, dst_bits, src_stride, dst_stride,
+			src_bpp, dst_bpp, src_x, src_y, dst_x, dst_y,
+			width, height);
+}
+
+pixman_bool_t
+_pixman_implementation_fill (pixman_implementation_t *imp,
+                             uint32_t *               bits,
+                             int                      stride,
+                             int                      bpp,
+                             int                      x,
+                             int                      y,
+                             int                      width,
+                             int                      height,
+                             uint32_t                 xor)
+{
+    return (*imp->fill) (imp, bits, stride, bpp, x, y, width, height, xor);
+}
+
+static uint32_t *
+get_scanline_null (pixman_iter_t *iter, const uint32_t *mask)
+{
+    return NULL;
+}
+
+void
+_pixman_implementation_src_iter_init (pixman_implementation_t	*imp,
+				      pixman_iter_t             *iter,
+				      pixman_image_t		*image,
+				      int			 x,
+				      int			 y,
+				      int			 width,
+				      int			 height,
+				      uint8_t			*buffer,
+				      iter_flags_t		 flags)
+{
+    iter->image = image;
+    iter->buffer = (uint32_t *)buffer;
+    iter->x = x;
+    iter->y = y;
+    iter->width = width;
+    iter->height = height;
+    iter->flags = flags;
+
+    if (!image)
+    {
+	iter->get_scanline = get_scanline_null;
+    }
+    else if ((flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) ==
+	     (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB))
+    {
+	iter->get_scanline = _pixman_iter_get_scanline_noop;
+    }
+    else
+    {
+	(*imp->src_iter_init) (imp, iter);
+    }
+}
+
+void
+_pixman_implementation_dest_iter_init (pixman_implementation_t	*imp,
+				       pixman_iter_t            *iter,
+				       pixman_image_t		*image,
+				       int			 x,
+				       int			 y,
+				       int			 width,
+				       int			 height,
+				       uint8_t			*buffer,
+				       iter_flags_t		 flags)
+{
+    iter->image = image;
+    iter->buffer = (uint32_t *)buffer;
+    iter->x = x;
+    iter->y = y;
+    iter->width = width;
+    iter->height = height;
+    iter->flags = flags;
+
+    (*imp->dest_iter_init) (imp, iter);
+}
diff --git a/pixman/pixman/pixman-linear-gradient.c b/pixman/pixman/pixman-linear-gradient.c
index 07303fc03..3d5bbf63d 100644
--- a/pixman/pixman/pixman-linear-gradient.c
+++ b/pixman/pixman/pixman-linear-gradient.c
@@ -1,292 +1,286 @@
-/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
-/*
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007 Red Hat, Inc.
- * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
- *             2005 Lars Knoll & Zack Rusin, Trolltech
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Keith Packard not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission.  Keith Packard makes no
- * representations about the suitability of this software for any purpose.  It
- * is provided "as is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include <stdlib.h>
-#include "pixman-private.h"
-
-static pixman_bool_t
-linear_gradient_is_horizontal (pixman_image_t *image,
-			       int             x,
-			       int             y,
-			       int             width,
-			       int             height)
-{
-    linear_gradient_t *linear = (linear_gradient_t *)image;
-    pixman_vector_t v;
-    pixman_fixed_32_32_t l;
-    pixman_fixed_48_16_t dx, dy;
-    double inc;
-
-    if (image->common.transform)
-    {
-	/* projective transformation */
-	if (image->common.transform->matrix[2][0] != 0 ||
-	    image->common.transform->matrix[2][1] != 0 ||
-	    image->common.transform->matrix[2][2] == 0)
-	{
-	    return FALSE;
-	}
-
-	v.vector[0] = image->common.transform->matrix[0][1];
-	v.vector[1] = image->common.transform->matrix[1][1];
-	v.vector[2] = image->common.transform->matrix[2][2];
-    }
-    else
-    {
-	v.vector[0] = 0;
-	v.vector[1] = pixman_fixed_1;
-	v.vector[2] = pixman_fixed_1;
-    }
-
-    dx = linear->p2.x - linear->p1.x;
-    dy = linear->p2.y - linear->p1.y;
-
-    l = dx * dx + dy * dy;
-
-    if (l == 0)
-	return FALSE;
-
-    /*
-     * compute how much the input of the gradient walked changes
-     * when moving vertically through the whole image
-     */
-    inc = height * (double) pixman_fixed_1 * pixman_fixed_1 *
-	(dx * v.vector[0] + dy * v.vector[1]) /
-	(v.vector[2] * (double) l);
-
-    /* check that casting to integer would result in 0 */
-    if (-1 < inc && inc < 1)
-	return TRUE;
-
-    return FALSE;
-}
-
-static uint32_t *
-linear_get_scanline_narrow (pixman_iter_t  *iter,
-			    const uint32_t *mask)
-{
-    pixman_image_t *image  = iter->image;
-    int             x      = iter->x;
-    int             y      = iter->y;
-    int             width  = iter->width;
-    uint32_t *      buffer = iter->buffer;
-
-    pixman_vector_t v, unit;
-    pixman_fixed_32_32_t l;
-    pixman_fixed_48_16_t dx, dy;
-    gradient_t *gradient = (gradient_t *)image;
-    linear_gradient_t *linear = (linear_gradient_t *)image;
-    uint32_t *end = buffer + width;
-    pixman_gradient_walker_t walker;
-
-    _pixman_gradient_walker_init (&walker, gradient, image->common.repeat);
-
-    /* reference point is the center of the pixel */
-    v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    if (image->common.transform)
-    {
-	if (!pixman_transform_point_3d (image->common.transform, &v))
-	    return iter->buffer;
-
-	unit.vector[0] = image->common.transform->matrix[0][0];
-	unit.vector[1] = image->common.transform->matrix[1][0];
-	unit.vector[2] = image->common.transform->matrix[2][0];
-    }
-    else
-    {
-	unit.vector[0] = pixman_fixed_1;
-	unit.vector[1] = 0;
-	unit.vector[2] = 0;
-    }
-
-    dx = linear->p2.x - linear->p1.x;
-    dy = linear->p2.y - linear->p1.y;
-
-    l = dx * dx + dy * dy;
-
-    if (l == 0 || unit.vector[2] == 0)
-    {
-	/* affine transformation only */
-        pixman_fixed_32_32_t t, next_inc;
-	double inc;
-
-	if (l == 0 || v.vector[2] == 0)
-	{
-	    t = 0;
-	    inc = 0;
-	}
-	else
-	{
-	    double invden, v2;
-
-	    invden = pixman_fixed_1 * (double) pixman_fixed_1 /
-		(l * (double) v.vector[2]);
-	    v2 = v.vector[2] * (1. / pixman_fixed_1);
-	    t = ((dx * v.vector[0] + dy * v.vector[1]) - 
-		 (dx * linear->p1.x + dy * linear->p1.y) * v2) * invden;
-	    inc = (dx * unit.vector[0] + dy * unit.vector[1]) * invden;
-	}
-	next_inc = 0;
-
-	if (((pixman_fixed_32_32_t )(inc * width)) == 0)
-	{
-	    register uint32_t color;
-
-	    color = _pixman_gradient_walker_pixel (&walker, t);
-	    while (buffer < end)
-		*buffer++ = color;
-	}
-	else
-	{
-	    int i;
-
-	    i = 0;
-	    while (buffer < end)
-	    {
-		if (!mask || *mask++)
-		{
-		    *buffer = _pixman_gradient_walker_pixel (&walker,
-							     t + next_inc);
-		}
-		i++;
-		next_inc = inc * i;
-		buffer++;
-	    }
-	}
-    }
-    else
-    {
-	/* projective transformation */
-        double t;
-
-	t = 0;
-
-	while (buffer < end)
-	{
-	    if (!mask || *mask++)
-	    {
-	        if (v.vector[2] != 0)
-		{
-		    double invden, v2;
-
-		    invden = pixman_fixed_1 * (double) pixman_fixed_1 /
-			(l * (double) v.vector[2]);
-		    v2 = v.vector[2] * (1. / pixman_fixed_1);
-		    t = ((dx * v.vector[0] + dy * v.vector[1]) - 
-			 (dx * linear->p1.x + dy * linear->p1.y) * v2) * invden;
-		}
-
-		*buffer = _pixman_gradient_walker_pixel (&walker, t);
-	    }
-
-	    ++buffer;
-
-	    v.vector[0] += unit.vector[0];
-	    v.vector[1] += unit.vector[1];
-	    v.vector[2] += unit.vector[2];
-	}
-    }
-
-    iter->y++;
-
-    return iter->buffer;
-}
-
-static uint32_t *
-linear_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
-{
-    uint32_t *buffer = linear_get_scanline_narrow (iter, NULL);
-
-    pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
-
-    return buffer;
-}
-
-void
-_pixman_linear_gradient_iter_init (pixman_image_t *image,
-				   pixman_iter_t  *iter,
-				   int             x,
-				   int             y,
-				   int             width,
-				   int             height,
-				   uint8_t        *buffer,
-				   iter_flags_t    flags)
-{
-    if (linear_gradient_is_horizontal (image, x, y, width, height))
-    {
-	if (flags & ITER_NARROW)
-	    linear_get_scanline_narrow (iter, NULL);
-	else
-	    linear_get_scanline_wide (iter, NULL);
-
-	iter->get_scanline = _pixman_iter_get_scanline_noop;
-    }
-    else
-    {
-	if (flags & ITER_NARROW)
-	    iter->get_scanline = linear_get_scanline_narrow;
-	else
-	    iter->get_scanline = linear_get_scanline_wide;
-    }
-}
-
-PIXMAN_EXPORT pixman_image_t *
-pixman_image_create_linear_gradient (pixman_point_fixed_t *        p1,
-                                     pixman_point_fixed_t *        p2,
-                                     const pixman_gradient_stop_t *stops,
-                                     int                           n_stops)
-{
-    pixman_image_t *image;
-    linear_gradient_t *linear;
-
-    image = _pixman_image_allocate ();
-
-    if (!image)
-	return NULL;
-
-    linear = &image->linear;
-
-    if (!_pixman_init_gradient (&linear->common, stops, n_stops))
-    {
-	free (image);
-	return NULL;
-    }
-
-    linear->p1 = *p1;
-    linear->p2 = *p2;
-
-    image->type = LINEAR;
-
-    return image;
-}
-
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdlib.h>
+#include "pixman-private.h"
+
+static pixman_bool_t
+linear_gradient_is_horizontal (pixman_image_t *image,
+			       int             x,
+			       int             y,
+			       int             width,
+			       int             height)
+{
+    linear_gradient_t *linear = (linear_gradient_t *)image;
+    pixman_vector_t v;
+    pixman_fixed_32_32_t l;
+    pixman_fixed_48_16_t dx, dy;
+    double inc;
+
+    if (image->common.transform)
+    {
+	/* projective transformation */
+	if (image->common.transform->matrix[2][0] != 0 ||
+	    image->common.transform->matrix[2][1] != 0 ||
+	    image->common.transform->matrix[2][2] == 0)
+	{
+	    return FALSE;
+	}
+
+	v.vector[0] = image->common.transform->matrix[0][1];
+	v.vector[1] = image->common.transform->matrix[1][1];
+	v.vector[2] = image->common.transform->matrix[2][2];
+    }
+    else
+    {
+	v.vector[0] = 0;
+	v.vector[1] = pixman_fixed_1;
+	v.vector[2] = pixman_fixed_1;
+    }
+
+    dx = linear->p2.x - linear->p1.x;
+    dy = linear->p2.y - linear->p1.y;
+
+    l = dx * dx + dy * dy;
+
+    if (l == 0)
+	return FALSE;
+
+    /*
+     * compute how much the input of the gradient walker changes
+     * when moving vertically through the whole image
+     */
+    inc = height * (double) pixman_fixed_1 * pixman_fixed_1 *
+	(dx * v.vector[0] + dy * v.vector[1]) /
+	(v.vector[2] * (double) l);
+
+    /* check that casting to integer would result in 0 */
+    if (-1 < inc && inc < 1)
+	return TRUE;
+
+    return FALSE;
+}
+
+static uint32_t *
+linear_get_scanline_narrow (pixman_iter_t  *iter,
+			    const uint32_t *mask)
+{
+    pixman_image_t *image  = iter->image;
+    int             x      = iter->x;
+    int             y      = iter->y;
+    int             width  = iter->width;
+    uint32_t *      buffer = iter->buffer;
+
+    pixman_vector_t v, unit;
+    pixman_fixed_32_32_t l;
+    pixman_fixed_48_16_t dx, dy;
+    gradient_t *gradient = (gradient_t *)image;
+    linear_gradient_t *linear = (linear_gradient_t *)image;
+    uint32_t *end = buffer + width;
+    pixman_gradient_walker_t walker;
+
+    _pixman_gradient_walker_init (&walker, gradient, image->common.repeat);
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (image->common.transform)
+    {
+	if (!pixman_transform_point_3d (image->common.transform, &v))
+	    return iter->buffer;
+
+	unit.vector[0] = image->common.transform->matrix[0][0];
+	unit.vector[1] = image->common.transform->matrix[1][0];
+	unit.vector[2] = image->common.transform->matrix[2][0];
+    }
+    else
+    {
+	unit.vector[0] = pixman_fixed_1;
+	unit.vector[1] = 0;
+	unit.vector[2] = 0;
+    }
+
+    dx = linear->p2.x - linear->p1.x;
+    dy = linear->p2.y - linear->p1.y;
+
+    l = dx * dx + dy * dy;
+
+    if (l == 0 || unit.vector[2] == 0)
+    {
+	/* affine transformation only */
+        pixman_fixed_32_32_t t, next_inc;
+	double inc;
+
+	if (l == 0 || v.vector[2] == 0)
+	{
+	    t = 0;
+	    inc = 0;
+	}
+	else
+	{
+	    double invden, v2;
+
+	    invden = pixman_fixed_1 * (double) pixman_fixed_1 /
+		(l * (double) v.vector[2]);
+	    v2 = v.vector[2] * (1. / pixman_fixed_1);
+	    t = ((dx * v.vector[0] + dy * v.vector[1]) - 
+		 (dx * linear->p1.x + dy * linear->p1.y) * v2) * invden;
+	    inc = (dx * unit.vector[0] + dy * unit.vector[1]) * invden;
+	}
+	next_inc = 0;
+
+	if (((pixman_fixed_32_32_t )(inc * width)) == 0)
+	{
+	    register uint32_t color;
+
+	    color = _pixman_gradient_walker_pixel (&walker, t);
+	    while (buffer < end)
+		*buffer++ = color;
+	}
+	else
+	{
+	    int i;
+
+	    i = 0;
+	    while (buffer < end)
+	    {
+		if (!mask || *mask++)
+		{
+		    *buffer = _pixman_gradient_walker_pixel (&walker,
+							     t + next_inc);
+		}
+		i++;
+		next_inc = inc * i;
+		buffer++;
+	    }
+	}
+    }
+    else
+    {
+	/* projective transformation */
+        double t;
+
+	t = 0;
+
+	while (buffer < end)
+	{
+	    if (!mask || *mask++)
+	    {
+	        if (v.vector[2] != 0)
+		{
+		    double invden, v2;
+
+		    invden = pixman_fixed_1 * (double) pixman_fixed_1 /
+			(l * (double) v.vector[2]);
+		    v2 = v.vector[2] * (1. / pixman_fixed_1);
+		    t = ((dx * v.vector[0] + dy * v.vector[1]) - 
+			 (dx * linear->p1.x + dy * linear->p1.y) * v2) * invden;
+		}
+
+		*buffer = _pixman_gradient_walker_pixel (&walker, t);
+	    }
+
+	    ++buffer;
+
+	    v.vector[0] += unit.vector[0];
+	    v.vector[1] += unit.vector[1];
+	    v.vector[2] += unit.vector[2];
+	}
+    }
+
+    iter->y++;
+
+    return iter->buffer;
+}
+
+static uint32_t *
+linear_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+    uint32_t *buffer = linear_get_scanline_narrow (iter, NULL);
+
+    pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
+
+    return buffer;
+}
+
+void
+_pixman_linear_gradient_iter_init (pixman_image_t *image, pixman_iter_t  *iter)
+{
+    if (linear_gradient_is_horizontal (
+	    iter->image, iter->x, iter->y, iter->width, iter->height))
+    {
+	if (iter->flags & ITER_NARROW)
+	    linear_get_scanline_narrow (iter, NULL);
+	else
+	    linear_get_scanline_wide (iter, NULL);
+
+	iter->get_scanline = _pixman_iter_get_scanline_noop;
+    }
+    else
+    {
+	if (iter->flags & ITER_NARROW)
+	    iter->get_scanline = linear_get_scanline_narrow;
+	else
+	    iter->get_scanline = linear_get_scanline_wide;
+    }
+}
+
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_linear_gradient (pixman_point_fixed_t *        p1,
+                                     pixman_point_fixed_t *        p2,
+                                     const pixman_gradient_stop_t *stops,
+                                     int                           n_stops)
+{
+    pixman_image_t *image;
+    linear_gradient_t *linear;
+
+    image = _pixman_image_allocate ();
+
+    if (!image)
+	return NULL;
+
+    linear = &image->linear;
+
+    if (!_pixman_init_gradient (&linear->common, stops, n_stops))
+    {
+	free (image);
+	return NULL;
+    }
+
+    linear->p1 = *p1;
+    linear->p2 = *p2;
+
+    image->type = LINEAR;
+
+    return image;
+}
+
diff --git a/pixman/pixman/pixman-private.h b/pixman/pixman/pixman-private.h
index ee7f4d676..658aeea8a 100644
--- a/pixman/pixman/pixman-private.h
+++ b/pixman/pixman/pixman-private.h
@@ -212,14 +212,19 @@ typedef enum
 
 struct pixman_iter_t
 {
-    pixman_iter_get_scanline_t	get_scanline;
-    pixman_iter_write_back_t	write_back;
-
+    /* These are initialized by _pixman_implementation_{src,dest}_iter_init */
     pixman_image_t *		image;
     uint32_t *			buffer;
     int				x, y;
     int				width;
+    int				height;
+    iter_flags_t		flags;
 
+    /* These function pointers are initialized by the implementation */
+    pixman_iter_get_scanline_t	get_scanline;
+    pixman_iter_write_back_t	write_back;
+
+    /* These fields are scratch data that implementations can use */
     uint8_t *			bits;
     int				stride;
 };
@@ -228,39 +233,22 @@ void
 _pixman_bits_image_setup_accessors (bits_image_t *image);
 
 void
-_pixman_bits_image_src_iter_init (pixman_image_t *image,
-				  pixman_iter_t *iter,
-				  int x, int y, int width, int height,
-				  uint8_t *buffer, iter_flags_t flags);
+_pixman_bits_image_src_iter_init (pixman_image_t *image, pixman_iter_t *iter);
+
 void
-_pixman_bits_image_dest_iter_init (pixman_image_t *image,
-				   pixman_iter_t *iter,
-				   int x, int y, int width, int height,
-				   uint8_t *buffer, iter_flags_t flags);
+_pixman_bits_image_dest_iter_init (pixman_image_t *image, pixman_iter_t *iter);
 
 void
-_pixman_solid_fill_iter_init (pixman_image_t *image,
-			      pixman_iter_t  *iter,
-			      int x, int y, int width, int height,
-			      uint8_t *buffer, iter_flags_t flags);
+_pixman_solid_fill_iter_init (pixman_image_t *image, pixman_iter_t  *iter);
 
 void
-_pixman_linear_gradient_iter_init (pixman_image_t *image,
-				   pixman_iter_t  *iter,
-				   int x, int y, int width, int height,
-				   uint8_t *buffer, iter_flags_t flags);
+_pixman_linear_gradient_iter_init (pixman_image_t *image, pixman_iter_t  *iter);
 
 void
-_pixman_radial_gradient_iter_init (pixman_image_t *image,
-				   pixman_iter_t *iter,
-				   int x, int y, int width, int height,
-				   uint8_t *buffer, iter_flags_t flags);
+_pixman_radial_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter);
 
 void
-_pixman_conical_gradient_iter_init (pixman_image_t *image,
-				    pixman_iter_t *iter,
-				    int x, int y, int width, int height,
-				    uint8_t *buffer, iter_flags_t flags);
+_pixman_conical_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter);
 
 pixman_image_t *
 _pixman_image_allocate (void);
@@ -408,14 +396,7 @@ typedef pixman_bool_t (*pixman_fill_func_t) (pixman_implementation_t *imp,
 					     int                      height,
 					     uint32_t                 xor);
 typedef void (*pixman_iter_init_func_t) (pixman_implementation_t *imp,
-                                         pixman_iter_t           *iter,
-                                         pixman_image_t          *image,
-                                         int                      x,
-                                         int                      y,
-                                         int                      width,
-                                         int                      height,
-                                         uint8_t                 *buffer,
-                                         iter_flags_t             flags);
+                                         pixman_iter_t           *iter);
 
 void _pixman_setup_combiner_functions_32 (pixman_implementation_t *imp);
 void _pixman_setup_combiner_functions_64 (pixman_implementation_t *imp);
diff --git a/pixman/pixman/pixman-radial-gradient.c b/pixman/pixman/pixman-radial-gradient.c
index 6523b8259..63c712cc2 100644
--- a/pixman/pixman/pixman-radial-gradient.c
+++ b/pixman/pixman/pixman-radial-gradient.c
@@ -1,463 +1,460 @@
-/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
-/*
- *
- * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
- * Copyright © 2000 SuSE, Inc.
- *             2005 Lars Knoll & Zack Rusin, Trolltech
- * Copyright © 2007 Red Hat, Inc.
- *
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Keith Packard not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission.  Keith Packard makes no
- * representations about the suitability of this software for any purpose.  It
- * is provided "as is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include <stdlib.h>
-#include <math.h>
-#include "pixman-private.h"
-
-static inline pixman_fixed_32_32_t
-dot (pixman_fixed_48_16_t x1,
-     pixman_fixed_48_16_t y1,
-     pixman_fixed_48_16_t z1,
-     pixman_fixed_48_16_t x2,
-     pixman_fixed_48_16_t y2,
-     pixman_fixed_48_16_t z2)
-{
-    /*
-     * Exact computation, assuming that the input values can
-     * be represented as pixman_fixed_16_16_t
-     */
-    return x1 * x2 + y1 * y2 + z1 * z2;
-}
-
-static inline double
-fdot (double x1,
-      double y1,
-      double z1,
-      double x2,
-      double y2,
-      double z2)
-{
-    /*
-     * Error can be unbound in some special cases.
-     * Using clever dot product algorithms (for example compensated
-     * dot product) would improve this but make the code much less
-     * obvious
-     */
-    return x1 * x2 + y1 * y2 + z1 * z2;
-}
-
-static uint32_t
-radial_compute_color (double                    a,
-		      double                    b,
-		      double                    c,
-		      double                    inva,
-		      double                    dr,
-		      double                    mindr,
-		      pixman_gradient_walker_t *walker,
-		      pixman_repeat_t           repeat)
-{
-    /*
-     * In this function error propagation can lead to bad results:
-     *  - det can have an unbound error (if b*b-a*c is very small),
-     *    potentially making it the opposite sign of what it should have been
-     *    (thus clearing a pixel that would have been colored or vice-versa)
-     *    or propagating the error to sqrtdet;
-     *    if det has the wrong sign or b is very small, this can lead to bad
-     *    results
-     *
-     *  - the algorithm used to compute the solutions of the quadratic
-     *    equation is not numerically stable (but saves one division compared
-     *    to the numerically stable one);
-     *    this can be a problem if a*c is much smaller than b*b
-     *
-     *  - the above problems are worse if a is small (as inva becomes bigger)
-     */
-    double det;
-
-    if (a == 0)
-    {
-	double t;
-
-	if (b == 0)
-	    return 0;
-
-	t = pixman_fixed_1 / 2 * c / b;
-	if (repeat == PIXMAN_REPEAT_NONE)
-	{
-	    if (0 <= t && t <= pixman_fixed_1)
-		return _pixman_gradient_walker_pixel (walker, t);
-	}
-	else
-	{
-	    if (t * dr > mindr)
-		return _pixman_gradient_walker_pixel (walker, t);
-	}
-
-	return 0;
-    }
-
-    det = fdot (b, a, 0, b, -c, 0);
-    if (det >= 0)
-    {
-	double sqrtdet, t0, t1;
-
-	sqrtdet = sqrt (det);
-	t0 = (b + sqrtdet) * inva;
-	t1 = (b - sqrtdet) * inva;
-
-	if (repeat == PIXMAN_REPEAT_NONE)
-	{
-	    if (0 <= t0 && t0 <= pixman_fixed_1)
-		return _pixman_gradient_walker_pixel (walker, t0);
-	    else if (0 <= t1 && t1 <= pixman_fixed_1)
-		return _pixman_gradient_walker_pixel (walker, t1);
-	}
-	else
-	{
-	    if (t0 * dr > mindr)
-		return _pixman_gradient_walker_pixel (walker, t0);
-	    else if (t1 * dr > mindr)
-		return _pixman_gradient_walker_pixel (walker, t1);
-	}
-    }
-
-    return 0;
-}
-
-static uint32_t *
-radial_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
-{
-    /*
-     * Implementation of radial gradients following the PDF specification.
-     * See section 8.7.4.5.4 Type 3 (Radial) Shadings of the PDF Reference
-     * Manual (PDF 32000-1:2008 at the time of this writing).
-     * 
-     * In the radial gradient problem we are given two circles (c₁,r₁) and
-     * (c₂,r₂) that define the gradient itself.
-     *
-     * Mathematically the gradient can be defined as the family of circles
-     *
-     *     ((1-t)·c₁ + t·(c₂), (1-t)·r₁ + t·r₂)
-     *
-     * excluding those circles whose radius would be < 0. When a point
-     * belongs to more than one circle, the one with a bigger t is the only
-     * one that contributes to its color. When a point does not belong
-     * to any of the circles, it is transparent black, i.e. RGBA (0, 0, 0, 0).
-     * Further limitations on the range of values for t are imposed when
-     * the gradient is not repeated, namely t must belong to [0,1].
-     *
-     * The graphical result is the same as drawing the valid (radius > 0)
-     * circles with increasing t in [-inf, +inf] (or in [0,1] if the gradient
-     * is not repeated) using SOURCE operatior composition.
-     *
-     * It looks like a cone pointing towards the viewer if the ending circle
-     * is smaller than the starting one, a cone pointing inside the page if
-     * the starting circle is the smaller one and like a cylinder if they
-     * have the same radius.
-     *
-     * What we actually do is, given the point whose color we are interested
-     * in, compute the t values for that point, solving for t in:
-     *
-     *     length((1-t)·c₁ + t·(c₂) - p) = (1-t)·r₁ + t·r₂
-     * 
-     * Let's rewrite it in a simpler way, by defining some auxiliary
-     * variables:
-     *
-     *     cd = c₂ - c₁
-     *     pd = p - c₁
-     *     dr = r₂ - r₁
-     *     lenght(t·cd - pd) = r₁ + t·dr
-     *
-     * which actually means
-     *
-     *     hypot(t·cdx - pdx, t·cdy - pdy) = r₁ + t·dr
-     *
-     * or
-     *
-     *     ⎷((t·cdx - pdx)² + (t·cdy - pdy)²) = r₁ + t·dr.
-     *
-     * If we impose (as stated earlier) that r₁ + t·dr >= 0, it becomes:
-     *
-     *     (t·cdx - pdx)² + (t·cdy - pdy)² = (r₁ + t·dr)²
-     *
-     * where we can actually expand the squares and solve for t:
-     *
-     *     t²cdx² - 2t·cdx·pdx + pdx² + t²cdy² - 2t·cdy·pdy + pdy² =
-     *       = r₁² + 2·r₁·t·dr + t²·dr²
-     *
-     *     (cdx² + cdy² - dr²)t² - 2(cdx·pdx + cdy·pdy + r₁·dr)t +
-     *         (pdx² + pdy² - r₁²) = 0
-     *
-     *     A = cdx² + cdy² - dr²
-     *     B = pdx·cdx + pdy·cdy + r₁·dr
-     *     C = pdx² + pdy² - r₁²
-     *     At² - 2Bt + C = 0
-     * 
-     * The solutions (unless the equation degenerates because of A = 0) are:
-     *
-     *     t = (B ± ⎷(B² - A·C)) / A
-     *
-     * The solution we are going to prefer is the bigger one, unless the
-     * radius associated to it is negative (or it falls outside the valid t
-     * range).
-     *
-     * Additional observations (useful for optimizations):
-     * A does not depend on p
-     *
-     * A < 0 <=> one of the two circles completely contains the other one
-     *   <=> for every p, the radiuses associated with the two t solutions
-     *       have opposite sign
-     */
-    pixman_image_t *image = iter->image;
-    int x = iter->x;
-    int y = iter->y;
-    int width = iter->width;
-    uint32_t *buffer = iter->buffer;
-
-    gradient_t *gradient = (gradient_t *)image;
-    radial_gradient_t *radial = (radial_gradient_t *)image;
-    uint32_t *end = buffer + width;
-    pixman_gradient_walker_t walker;
-    pixman_vector_t v, unit;
-
-    /* reference point is the center of the pixel */
-    v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    _pixman_gradient_walker_init (&walker, gradient, image->common.repeat);
-
-    if (image->common.transform)
-    {
-	if (!pixman_transform_point_3d (image->common.transform, &v))
-	    return iter->buffer;
-	
-	unit.vector[0] = image->common.transform->matrix[0][0];
-	unit.vector[1] = image->common.transform->matrix[1][0];
-	unit.vector[2] = image->common.transform->matrix[2][0];
-    }
-    else
-    {
-	unit.vector[0] = pixman_fixed_1;
-	unit.vector[1] = 0;
-	unit.vector[2] = 0;
-    }
-
-    if (unit.vector[2] == 0 && v.vector[2] == pixman_fixed_1)
-    {
-	/*
-	 * Given:
-	 *
-	 * t = (B ± ⎷(B² - A·C)) / A
-	 *
-	 * where
-	 *
-	 * A = cdx² + cdy² - dr²
-	 * B = pdx·cdx + pdy·cdy + r₁·dr
-	 * C = pdx² + pdy² - r₁²
-	 * det = B² - A·C
-	 *
-	 * Since we have an affine transformation, we know that (pdx, pdy)
-	 * increase linearly with each pixel,
-	 *
-	 * pdx = pdx₀ + n·ux,
-	 * pdy = pdy₀ + n·uy,
-	 *
-	 * we can then express B, C and det through multiple differentiation.
-	 */
-	pixman_fixed_32_32_t b, db, c, dc, ddc;
-
-	/* warning: this computation may overflow */
-	v.vector[0] -= radial->c1.x;
-	v.vector[1] -= radial->c1.y;
-
-	/*
-	 * B and C are computed and updated exactly.
-	 * If fdot was used instead of dot, in the worst case it would
-	 * lose 11 bits of precision in each of the multiplication and
-	 * summing up would zero out all the bit that were preserved,
-	 * thus making the result 0 instead of the correct one.
-	 * This would mean a worst case of unbound relative error or
-	 * about 2^10 absolute error
-	 */
-	b = dot (v.vector[0], v.vector[1], radial->c1.radius,
-		 radial->delta.x, radial->delta.y, radial->delta.radius);
-	db = dot (unit.vector[0], unit.vector[1], 0,
-		  radial->delta.x, radial->delta.y, 0);
-
-	c = dot (v.vector[0], v.vector[1],
-		 -((pixman_fixed_48_16_t) radial->c1.radius),
-		 v.vector[0], v.vector[1], radial->c1.radius);
-	dc = dot (2 * (pixman_fixed_48_16_t) v.vector[0] + unit.vector[0],
-		  2 * (pixman_fixed_48_16_t) v.vector[1] + unit.vector[1],
-		  0,
-		  unit.vector[0], unit.vector[1], 0);
-	ddc = 2 * dot (unit.vector[0], unit.vector[1], 0,
-		       unit.vector[0], unit.vector[1], 0);
-
-	while (buffer < end)
-	{
-	    if (!mask || *mask++)
-	    {
-		*buffer = radial_compute_color (radial->a, b, c,
-						radial->inva,
-						radial->delta.radius,
-						radial->mindr,
-						&walker,
-						image->common.repeat);
-	    }
-
-	    b += db;
-	    c += dc;
-	    dc += ddc;
-	    ++buffer;
-	}
-    }
-    else
-    {
-	/* projective */
-	/* Warning:
-	 * error propagation guarantees are much looser than in the affine case
-	 */
-	while (buffer < end)
-	{
-	    if (!mask || *mask++)
-	    {
-		if (v.vector[2] != 0)
-		{
-		    double pdx, pdy, invv2, b, c;
-
-		    invv2 = 1. * pixman_fixed_1 / v.vector[2];
-
-		    pdx = v.vector[0] * invv2 - radial->c1.x;
-		    /*    / pixman_fixed_1 */
-
-		    pdy = v.vector[1] * invv2 - radial->c1.y;
-		    /*    / pixman_fixed_1 */
-
-		    b = fdot (pdx, pdy, radial->c1.radius,
-			      radial->delta.x, radial->delta.y,
-			      radial->delta.radius);
-		    /*  / pixman_fixed_1 / pixman_fixed_1 */
-
-		    c = fdot (pdx, pdy, -radial->c1.radius,
-			      pdx, pdy, radial->c1.radius);
-		    /*  / pixman_fixed_1 / pixman_fixed_1 */
-
-		    *buffer = radial_compute_color (radial->a, b, c,
-						    radial->inva,
-						    radial->delta.radius,
-						    radial->mindr,
-						    &walker,
-						    image->common.repeat);
-		}
-		else
-		{
-		    *buffer = 0;
-		}
-	    }
-
-	    ++buffer;
-
-	    v.vector[0] += unit.vector[0];
-	    v.vector[1] += unit.vector[1];
-	    v.vector[2] += unit.vector[2];
-	}
-    }
-
-    iter->y++;
-    return iter->buffer;
-}
-
-static uint32_t *
-radial_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
-{
-    uint32_t *buffer = radial_get_scanline_narrow (iter, NULL);
-
-    pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
-
-    return buffer;
-}
-
-void
-_pixman_radial_gradient_iter_init (pixman_image_t *image,
-				   pixman_iter_t *iter,
-				   int x, int y, int width, int height,
-				   uint8_t *buffer, iter_flags_t flags)
-{
-    if (flags & ITER_NARROW)
-	iter->get_scanline = radial_get_scanline_narrow;
-    else
-	iter->get_scanline = radial_get_scanline_wide;
-}
-
-PIXMAN_EXPORT pixman_image_t *
-pixman_image_create_radial_gradient (pixman_point_fixed_t *        inner,
-                                     pixman_point_fixed_t *        outer,
-                                     pixman_fixed_t                inner_radius,
-                                     pixman_fixed_t                outer_radius,
-                                     const pixman_gradient_stop_t *stops,
-                                     int                           n_stops)
-{
-    pixman_image_t *image;
-    radial_gradient_t *radial;
-
-    image = _pixman_image_allocate ();
-
-    if (!image)
-	return NULL;
-
-    radial = &image->radial;
-
-    if (!_pixman_init_gradient (&radial->common, stops, n_stops))
-    {
-	free (image);
-	return NULL;
-    }
-
-    image->type = RADIAL;
-
-    radial->c1.x = inner->x;
-    radial->c1.y = inner->y;
-    radial->c1.radius = inner_radius;
-    radial->c2.x = outer->x;
-    radial->c2.y = outer->y;
-    radial->c2.radius = outer_radius;
-
-    /* warning: this computations may overflow */
-    radial->delta.x = radial->c2.x - radial->c1.x;
-    radial->delta.y = radial->c2.y - radial->c1.y;
-    radial->delta.radius = radial->c2.radius - radial->c1.radius;
-
-    /* computed exactly, then cast to double -> every bit of the double
-       representation is correct (53 bits) */
-    radial->a = dot (radial->delta.x, radial->delta.y, -radial->delta.radius,
-		     radial->delta.x, radial->delta.y, radial->delta.radius);
-    if (radial->a != 0)
-	radial->inva = 1. * pixman_fixed_1 / radial->a;
-
-    radial->mindr = -1. * pixman_fixed_1 * radial->c1.radius;
-
-    return image;
-}
-
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ *
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ * Copyright © 2000 SuSE, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdlib.h>
+#include <math.h>
+#include "pixman-private.h"
+
+static inline pixman_fixed_32_32_t
+dot (pixman_fixed_48_16_t x1,
+     pixman_fixed_48_16_t y1,
+     pixman_fixed_48_16_t z1,
+     pixman_fixed_48_16_t x2,
+     pixman_fixed_48_16_t y2,
+     pixman_fixed_48_16_t z2)
+{
+    /*
+     * Exact computation, assuming that the input values can
+     * be represented as pixman_fixed_16_16_t
+     */
+    return x1 * x2 + y1 * y2 + z1 * z2;
+}
+
+static inline double
+fdot (double x1,
+      double y1,
+      double z1,
+      double x2,
+      double y2,
+      double z2)
+{
+    /*
+     * Error can be unbounded in some special cases.
+     * Using clever dot product algorithms (for example compensated
+     * dot product) would improve this but make the code much less
+     * obvious
+     */
+    return x1 * x2 + y1 * y2 + z1 * z2;
+}
+
+static uint32_t
+radial_compute_color (double                    a,
+		      double                    b,
+		      double                    c,
+		      double                    inva,
+		      double                    dr,
+		      double                    mindr,
+		      pixman_gradient_walker_t *walker,
+		      pixman_repeat_t           repeat)
+{
+    /*
+     * In this function error propagation can lead to bad results:
+     *  - det can have an unbounded error (if b*b-a*c is very small),
+     *    potentially making it the opposite sign of what it should have been
+     *    (thus clearing a pixel that would have been colored or vice-versa)
+     *    or propagating the error to sqrtdet;
+     *    if det has the wrong sign or b is very small, this can lead to bad
+     *    results
+     *
+     *  - the algorithm used to compute the solutions of the quadratic
+     *    equation is not numerically stable (but saves one division compared
+     *    to the numerically stable one);
+     *    this can be a problem if a*c is much smaller than b*b
+     *
+     *  - the above problems are worse if a is small (as inva becomes bigger)
+     */
+    double det;
+
+    if (a == 0)
+    {
+	double t;
+
+	if (b == 0)
+	    return 0;
+
+	t = pixman_fixed_1 / 2 * c / b;
+	if (repeat == PIXMAN_REPEAT_NONE)
+	{
+	    if (0 <= t && t <= pixman_fixed_1)
+		return _pixman_gradient_walker_pixel (walker, t);
+	}
+	else
+	{
+	    if (t * dr > mindr)
+		return _pixman_gradient_walker_pixel (walker, t);
+	}
+
+	return 0;
+    }
+
+    det = fdot (b, a, 0, b, -c, 0);
+    if (det >= 0)
+    {
+	double sqrtdet, t0, t1;
+
+	sqrtdet = sqrt (det);
+	t0 = (b + sqrtdet) * inva;
+	t1 = (b - sqrtdet) * inva;
+
+	if (repeat == PIXMAN_REPEAT_NONE)
+	{
+	    if (0 <= t0 && t0 <= pixman_fixed_1)
+		return _pixman_gradient_walker_pixel (walker, t0);
+	    else if (0 <= t1 && t1 <= pixman_fixed_1)
+		return _pixman_gradient_walker_pixel (walker, t1);
+	}
+	else
+	{
+	    if (t0 * dr > mindr)
+		return _pixman_gradient_walker_pixel (walker, t0);
+	    else if (t1 * dr > mindr)
+		return _pixman_gradient_walker_pixel (walker, t1);
+	}
+    }
+
+    return 0;
+}
+
+static uint32_t *
+radial_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
+{
+    /*
+     * Implementation of radial gradients following the PDF specification.
+     * See section 8.7.4.5.4 Type 3 (Radial) Shadings of the PDF Reference
+     * Manual (PDF 32000-1:2008 at the time of this writing).
+     * 
+     * In the radial gradient problem we are given two circles (c₁,r₁) and
+     * (c₂,r₂) that define the gradient itself.
+     *
+     * Mathematically the gradient can be defined as the family of circles
+     *
+     *     ((1-t)·c₁ + t·(c₂), (1-t)·r₁ + t·r₂)
+     *
+     * excluding those circles whose radius would be < 0. When a point
+     * belongs to more than one circle, the one with a bigger t is the only
+     * one that contributes to its color. When a point does not belong
+     * to any of the circles, it is transparent black, i.e. RGBA (0, 0, 0, 0).
+     * Further limitations on the range of values for t are imposed when
+     * the gradient is not repeated, namely t must belong to [0,1].
+     *
+     * The graphical result is the same as drawing the valid (radius > 0)
+     * circles with increasing t in [-inf, +inf] (or in [0,1] if the gradient
+     * is not repeated) using SOURCE operator composition.
+     *
+     * It looks like a cone pointing towards the viewer if the ending circle
+     * is smaller than the starting one, a cone pointing inside the page if
+     * the starting circle is the smaller one and like a cylinder if they
+     * have the same radius.
+     *
+     * What we actually do is, given the point whose color we are interested
+     * in, compute the t values for that point, solving for t in:
+     *
+     *     length((1-t)·c₁ + t·(c₂) - p) = (1-t)·r₁ + t·r₂
+     * 
+     * Let's rewrite it in a simpler way, by defining some auxiliary
+     * variables:
+     *
+     *     cd = c₂ - c₁
+     *     pd = p - c₁
+     *     dr = r₂ - r₁
+     *     length(t·cd - pd) = r₁ + t·dr
+     *
+     * which actually means
+     *
+     *     hypot(t·cdx - pdx, t·cdy - pdy) = r₁ + t·dr
+     *
+     * or
+     *
+     *     ⎷((t·cdx - pdx)² + (t·cdy - pdy)²) = r₁ + t·dr.
+     *
+     * If we impose (as stated earlier) that r₁ + t·dr >= 0, it becomes:
+     *
+     *     (t·cdx - pdx)² + (t·cdy - pdy)² = (r₁ + t·dr)²
+     *
+     * where we can actually expand the squares and solve for t:
+     *
+     *     t²cdx² - 2t·cdx·pdx + pdx² + t²cdy² - 2t·cdy·pdy + pdy² =
+     *       = r₁² + 2·r₁·t·dr + t²·dr²
+     *
+     *     (cdx² + cdy² - dr²)t² - 2(cdx·pdx + cdy·pdy + r₁·dr)t +
+     *         (pdx² + pdy² - r₁²) = 0
+     *
+     *     A = cdx² + cdy² - dr²
+     *     B = pdx·cdx + pdy·cdy + r₁·dr
+     *     C = pdx² + pdy² - r₁²
+     *     At² - 2Bt + C = 0
+     * 
+     * The solutions (unless the equation degenerates because of A = 0) are:
+     *
+     *     t = (B ± ⎷(B² - A·C)) / A
+     *
+     * The solution we are going to prefer is the bigger one, unless the
+     * radius associated to it is negative (or it falls outside the valid t
+     * range).
+     *
+     * Additional observations (useful for optimizations):
+     * A does not depend on p
+     *
+     * A < 0 <=> one of the two circles completely contains the other one
+     *   <=> for every p, the radiuses associated with the two t solutions
+     *       have opposite sign
+     */
+    pixman_image_t *image = iter->image;
+    int x = iter->x;
+    int y = iter->y;
+    int width = iter->width;
+    uint32_t *buffer = iter->buffer;
+
+    gradient_t *gradient = (gradient_t *)image;
+    radial_gradient_t *radial = (radial_gradient_t *)image;
+    uint32_t *end = buffer + width;
+    pixman_gradient_walker_t walker;
+    pixman_vector_t v, unit;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    _pixman_gradient_walker_init (&walker, gradient, image->common.repeat);
+
+    if (image->common.transform)
+    {
+	if (!pixman_transform_point_3d (image->common.transform, &v))
+	    return iter->buffer;
+	
+	unit.vector[0] = image->common.transform->matrix[0][0];
+	unit.vector[1] = image->common.transform->matrix[1][0];
+	unit.vector[2] = image->common.transform->matrix[2][0];
+    }
+    else
+    {
+	unit.vector[0] = pixman_fixed_1;
+	unit.vector[1] = 0;
+	unit.vector[2] = 0;
+    }
+
+    if (unit.vector[2] == 0 && v.vector[2] == pixman_fixed_1)
+    {
+	/*
+	 * Given:
+	 *
+	 * t = (B ± ⎷(B² - A·C)) / A
+	 *
+	 * where
+	 *
+	 * A = cdx² + cdy² - dr²
+	 * B = pdx·cdx + pdy·cdy + r₁·dr
+	 * C = pdx² + pdy² - r₁²
+	 * det = B² - A·C
+	 *
+	 * Since we have an affine transformation, we know that (pdx, pdy)
+	 * increase linearly with each pixel,
+	 *
+	 * pdx = pdx₀ + n·ux,
+	 * pdy = pdy₀ + n·uy,
+	 *
+	 * we can then express B, C and det through multiple differentiation.
+	 */
+	pixman_fixed_32_32_t b, db, c, dc, ddc;
+
+	/* warning: this computation may overflow */
+	v.vector[0] -= radial->c1.x;
+	v.vector[1] -= radial->c1.y;
+
+	/*
+	 * B and C are computed and updated exactly.
+	 * If fdot was used instead of dot, in the worst case it would
+	 * lose 11 bits of precision in each of the multiplications and
+	 * summing up would zero out all the bits that were preserved,
+	 * thus making the result 0 instead of the correct one.
+	 * This would mean a worst case of unbounded relative error or
+	 * about 2^10 absolute error
+	 */
+	b = dot (v.vector[0], v.vector[1], radial->c1.radius,
+		 radial->delta.x, radial->delta.y, radial->delta.radius);
+	db = dot (unit.vector[0], unit.vector[1], 0,
+		  radial->delta.x, radial->delta.y, 0);
+
+	c = dot (v.vector[0], v.vector[1],
+		 -((pixman_fixed_48_16_t) radial->c1.radius),
+		 v.vector[0], v.vector[1], radial->c1.radius);
+	dc = dot (2 * (pixman_fixed_48_16_t) v.vector[0] + unit.vector[0],
+		  2 * (pixman_fixed_48_16_t) v.vector[1] + unit.vector[1],
+		  0,
+		  unit.vector[0], unit.vector[1], 0);
+	ddc = 2 * dot (unit.vector[0], unit.vector[1], 0,
+		       unit.vector[0], unit.vector[1], 0);
+
+	while (buffer < end)
+	{
+	    if (!mask || *mask++)
+	    {
+		*buffer = radial_compute_color (radial->a, b, c,
+						radial->inva,
+						radial->delta.radius,
+						radial->mindr,
+						&walker,
+						image->common.repeat);
+	    }
+
+	    b += db;
+	    c += dc;
+	    dc += ddc;
+	    ++buffer;
+	}
+    }
+    else
+    {
+	/* projective */
+	/* Warning:
+	 * error propagation guarantees are much looser than in the affine case
+	 */
+	while (buffer < end)
+	{
+	    if (!mask || *mask++)
+	    {
+		if (v.vector[2] != 0)
+		{
+		    double pdx, pdy, invv2, b, c;
+
+		    invv2 = 1. * pixman_fixed_1 / v.vector[2];
+
+		    pdx = v.vector[0] * invv2 - radial->c1.x;
+		    /*    / pixman_fixed_1 */
+
+		    pdy = v.vector[1] * invv2 - radial->c1.y;
+		    /*    / pixman_fixed_1 */
+
+		    b = fdot (pdx, pdy, radial->c1.radius,
+			      radial->delta.x, radial->delta.y,
+			      radial->delta.radius);
+		    /*  / pixman_fixed_1 / pixman_fixed_1 */
+
+		    c = fdot (pdx, pdy, -radial->c1.radius,
+			      pdx, pdy, radial->c1.radius);
+		    /*  / pixman_fixed_1 / pixman_fixed_1 */
+
+		    *buffer = radial_compute_color (radial->a, b, c,
+						    radial->inva,
+						    radial->delta.radius,
+						    radial->mindr,
+						    &walker,
+						    image->common.repeat);
+		}
+		else
+		{
+		    *buffer = 0;
+		}
+	    }
+
+	    ++buffer;
+
+	    v.vector[0] += unit.vector[0];
+	    v.vector[1] += unit.vector[1];
+	    v.vector[2] += unit.vector[2];
+	}
+    }
+
+    iter->y++;
+    return iter->buffer;
+}
+
+static uint32_t *
+radial_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+    uint32_t *buffer = radial_get_scanline_narrow (iter, NULL);
+
+    pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
+
+    return buffer;
+}
+
+void
+_pixman_radial_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter)
+{
+    if (iter->flags & ITER_NARROW)
+	iter->get_scanline = radial_get_scanline_narrow;
+    else
+	iter->get_scanline = radial_get_scanline_wide;
+}
+
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_radial_gradient (pixman_point_fixed_t *        inner,
+                                     pixman_point_fixed_t *        outer,
+                                     pixman_fixed_t                inner_radius,
+                                     pixman_fixed_t                outer_radius,
+                                     const pixman_gradient_stop_t *stops,
+                                     int                           n_stops)
+{
+    pixman_image_t *image;
+    radial_gradient_t *radial;
+
+    image = _pixman_image_allocate ();
+
+    if (!image)
+	return NULL;
+
+    radial = &image->radial;
+
+    if (!_pixman_init_gradient (&radial->common, stops, n_stops))
+    {
+	free (image);
+	return NULL;
+    }
+
+    image->type = RADIAL;
+
+    radial->c1.x = inner->x;
+    radial->c1.y = inner->y;
+    radial->c1.radius = inner_radius;
+    radial->c2.x = outer->x;
+    radial->c2.y = outer->y;
+    radial->c2.radius = outer_radius;
+
+    /* warning: these computations may overflow */
+    radial->delta.x = radial->c2.x - radial->c1.x;
+    radial->delta.y = radial->c2.y - radial->c1.y;
+    radial->delta.radius = radial->c2.radius - radial->c1.radius;
+
+    /* computed exactly, then cast to double -> every bit of the double
+       representation is correct (53 bits) */
+    radial->a = dot (radial->delta.x, radial->delta.y, -radial->delta.radius,
+		     radial->delta.x, radial->delta.y, radial->delta.radius);
+    if (radial->a != 0)
+	radial->inva = 1. * pixman_fixed_1 / radial->a;
+
+    radial->mindr = -1. * pixman_fixed_1 * radial->c1.radius;
+
+    return image;
+}
+
diff --git a/pixman/pixman/pixman-solid-fill.c b/pixman/pixman/pixman-solid-fill.c
index 67681f2c0..fcda3abb5 100644
--- a/pixman/pixman/pixman-solid-fill.c
+++ b/pixman/pixman/pixman-solid-fill.c
@@ -1,92 +1,89 @@
-/*
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007, 2009 Red Hat, Inc.
- * Copyright © 2009 Soren Sandmann
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of SuSE not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission.  SuSE makes no representations about the
- * suitability of this software for any purpose.  It is provided "as is"
- * without express or implied warranty.
- *
- * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
- * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include "pixman-private.h"
-
-void
-_pixman_solid_fill_iter_init (pixman_image_t *image,
-			      pixman_iter_t  *iter,
-			      int x, int y, int width, int height,
-			      uint8_t *buffer, iter_flags_t flags)
-{
-    if (flags & ITER_NARROW)
-    {
-	uint32_t *b = (uint32_t *)buffer;
-	uint32_t *e = b + width;
-	uint32_t color = image->solid.color_32;
-
-	while (b < e)
-	    *(b++) = color;
-    }
-    else
-    {
-	uint64_t *b = (uint64_t *)buffer;
-	uint64_t *e = b + width;
-	uint64_t color = image->solid.color_64;
-
-	while (b < e)
-	    *(b++) = color;
-    }
-
-    iter->get_scanline = _pixman_iter_get_scanline_noop;
-}
-
-static uint32_t
-color_to_uint32 (const pixman_color_t *color)
-{
-    return
-        (color->alpha >> 8 << 24) |
-        (color->red >> 8 << 16) |
-        (color->green & 0xff00) |
-        (color->blue >> 8);
-}
-
-static uint64_t
-color_to_uint64 (const pixman_color_t *color)
-{
-    return
-        ((uint64_t)color->alpha << 48) |
-        ((uint64_t)color->red << 32) |
-        ((uint64_t)color->green << 16) |
-        ((uint64_t)color->blue);
-}
-
-PIXMAN_EXPORT pixman_image_t *
-pixman_image_create_solid_fill (pixman_color_t *color)
-{
-    pixman_image_t *img = _pixman_image_allocate ();
-
-    if (!img)
-	return NULL;
-
-    img->type = SOLID;
-    img->solid.color = *color;
-    img->solid.color_32 = color_to_uint32 (color);
-    img->solid.color_64 = color_to_uint64 (color);
-
-    return img;
-}
-
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007, 2009 Red Hat, Inc.
+ * Copyright © 2009 Soren Sandmann
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include "pixman-private.h"
+
+void
+_pixman_solid_fill_iter_init (pixman_image_t *image, pixman_iter_t  *iter)
+{
+    if (iter->flags & ITER_NARROW)
+    {
+	uint32_t *b = (uint32_t *)iter->buffer;
+	uint32_t *e = b + iter->width;
+	uint32_t color = iter->image->solid.color_32;
+
+	while (b < e)
+	    *(b++) = color;
+    }
+    else
+    {
+	uint64_t *b = (uint64_t *)iter->buffer;
+	uint64_t *e = b + iter->width;
+	uint64_t color = image->solid.color_64;
+
+	while (b < e)
+	    *(b++) = color;
+    }
+
+    iter->get_scanline = _pixman_iter_get_scanline_noop;
+}
+
+static uint32_t
+color_to_uint32 (const pixman_color_t *color)
+{
+    return
+        (color->alpha >> 8 << 24) |
+        (color->red >> 8 << 16) |
+        (color->green & 0xff00) |
+        (color->blue >> 8);
+}
+
+static uint64_t
+color_to_uint64 (const pixman_color_t *color)
+{
+    return
+        ((uint64_t)color->alpha << 48) |
+        ((uint64_t)color->red << 32) |
+        ((uint64_t)color->green << 16) |
+        ((uint64_t)color->blue);
+}
+
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_solid_fill (pixman_color_t *color)
+{
+    pixman_image_t *img = _pixman_image_allocate ();
+
+    if (!img)
+	return NULL;
+
+    img->type = SOLID;
+    img->solid.color = *color;
+    img->solid.color_32 = color_to_uint32 (color);
+    img->solid.color_64 = color_to_uint64 (color);
+
+    return img;
+}
+
diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c
index 696005f75..a52a959f5 100644
--- a/pixman/pixman/pixman-sse2.c
+++ b/pixman/pixman/pixman-sse2.c
@@ -1,6077 +1,6076 @@
-/*
- * Copyright © 2008 Rodrigo Kumpera
- * Copyright © 2008 André Tupinambá
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Red Hat not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission.  Red Hat makes no representations about the
- * suitability of this software for any purpose.  It is provided "as is"
- * without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- *
- * Author:  Rodrigo Kumpera (kumpera@gmail.com)
- *          André Tupinambá (andrelrt@gmail.com)
- *
- * Based on work by Owen Taylor and Søren Sandmann
- */
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
-#include <emmintrin.h> /* for SSE2 intrinsics */
-#include "pixman-private.h"
-#include "pixman-combine32.h"
-#include "pixman-fast-path.h"
-
-static __m128i mask_0080;
-static __m128i mask_00ff;
-static __m128i mask_0101;
-static __m128i mask_ffff;
-static __m128i mask_ff000000;
-static __m128i mask_alpha;
-
-static __m128i mask_565_r;
-static __m128i mask_565_g1, mask_565_g2;
-static __m128i mask_565_b;
-static __m128i mask_red;
-static __m128i mask_green;
-static __m128i mask_blue;
-
-static __m128i mask_565_fix_rb;
-static __m128i mask_565_fix_g;
-
-static force_inline __m128i
-unpack_32_1x128 (uint32_t data)
-{
-    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
-}
-
-static force_inline void
-unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
-{
-    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
-    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
-}
-
-static force_inline __m128i
-unpack_565_to_8888 (__m128i lo)
-{
-    __m128i r, g, b, rb, t;
-
-    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
-    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
-    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
-
-    rb = _mm_or_si128 (r, b);
-    t  = _mm_and_si128 (rb, mask_565_fix_rb);
-    t  = _mm_srli_epi32 (t, 5);
-    rb = _mm_or_si128 (rb, t);
-
-    t  = _mm_and_si128 (g, mask_565_fix_g);
-    t  = _mm_srli_epi32 (t, 6);
-    g  = _mm_or_si128 (g, t);
-
-    return _mm_or_si128 (rb, g);
-}
-
-static force_inline void
-unpack_565_128_4x128 (__m128i  data,
-                      __m128i* data0,
-                      __m128i* data1,
-                      __m128i* data2,
-                      __m128i* data3)
-{
-    __m128i lo, hi;
-
-    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
-    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
-
-    lo = unpack_565_to_8888 (lo);
-    hi = unpack_565_to_8888 (hi);
-
-    unpack_128_2x128 (lo, data0, data1);
-    unpack_128_2x128 (hi, data2, data3);
-}
-
-static force_inline uint16_t
-pack_565_32_16 (uint32_t pixel)
-{
-    return (uint16_t) (((pixel >> 8) & 0xf800) |
-		       ((pixel >> 5) & 0x07e0) |
-		       ((pixel >> 3) & 0x001f));
-}
-
-static force_inline __m128i
-pack_2x128_128 (__m128i lo, __m128i hi)
-{
-    return _mm_packus_epi16 (lo, hi);
-}
-
-static force_inline __m128i
-pack_565_2x128_128 (__m128i lo, __m128i hi)
-{
-    __m128i data;
-    __m128i r, g1, g2, b;
-
-    data = pack_2x128_128 (lo, hi);
-
-    r  = _mm_and_si128 (data, mask_565_r);
-    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
-    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
-    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
-
-    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
-}
-
-static force_inline __m128i
-pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
-{
-    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
-			     pack_565_2x128_128 (*xmm2, *xmm3));
-}
-
-static force_inline int
-is_opaque (__m128i x)
-{
-    __m128i ffs = _mm_cmpeq_epi8 (x, x);
-
-    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
-}
-
-static force_inline int
-is_zero (__m128i x)
-{
-    return _mm_movemask_epi8 (
-	_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
-}
-
-static force_inline int
-is_transparent (__m128i x)
-{
-    return (_mm_movemask_epi8 (
-		_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
-}
-
-static force_inline __m128i
-expand_pixel_32_1x128 (uint32_t data)
-{
-    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
-}
-
-static force_inline __m128i
-expand_alpha_1x128 (__m128i data)
-{
-    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
-						     _MM_SHUFFLE (3, 3, 3, 3)),
-				_MM_SHUFFLE (3, 3, 3, 3));
-}
-
-static force_inline void
-expand_alpha_2x128 (__m128i  data_lo,
-                    __m128i  data_hi,
-                    __m128i* alpha_lo,
-                    __m128i* alpha_hi)
-{
-    __m128i lo, hi;
-
-    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
-    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
-
-    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
-    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
-}
-
-static force_inline void
-expand_alpha_rev_2x128 (__m128i  data_lo,
-                        __m128i  data_hi,
-                        __m128i* alpha_lo,
-                        __m128i* alpha_hi)
-{
-    __m128i lo, hi;
-
-    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
-    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
-    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
-    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
-}
-
-static force_inline void
-pix_multiply_2x128 (__m128i* data_lo,
-                    __m128i* data_hi,
-                    __m128i* alpha_lo,
-                    __m128i* alpha_hi,
-                    __m128i* ret_lo,
-                    __m128i* ret_hi)
-{
-    __m128i lo, hi;
-
-    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
-    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
-    lo = _mm_adds_epu16 (lo, mask_0080);
-    hi = _mm_adds_epu16 (hi, mask_0080);
-    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
-    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
-}
-
-static force_inline void
-pix_add_multiply_2x128 (__m128i* src_lo,
-                        __m128i* src_hi,
-                        __m128i* alpha_dst_lo,
-                        __m128i* alpha_dst_hi,
-                        __m128i* dst_lo,
-                        __m128i* dst_hi,
-                        __m128i* alpha_src_lo,
-                        __m128i* alpha_src_hi,
-                        __m128i* ret_lo,
-                        __m128i* ret_hi)
-{
-    __m128i t1_lo, t1_hi;
-    __m128i t2_lo, t2_hi;
-
-    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
-    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
-
-    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
-    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
-}
-
-static force_inline void
-negate_2x128 (__m128i  data_lo,
-              __m128i  data_hi,
-              __m128i* neg_lo,
-              __m128i* neg_hi)
-{
-    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
-    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
-}
-
-static force_inline void
-invert_colors_2x128 (__m128i  data_lo,
-                     __m128i  data_hi,
-                     __m128i* inv_lo,
-                     __m128i* inv_hi)
-{
-    __m128i lo, hi;
-
-    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
-    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
-    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
-    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
-}
-
-static force_inline void
-over_2x128 (__m128i* src_lo,
-            __m128i* src_hi,
-            __m128i* alpha_lo,
-            __m128i* alpha_hi,
-            __m128i* dst_lo,
-            __m128i* dst_hi)
-{
-    __m128i t1, t2;
-
-    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
-
-    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
-
-    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
-    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
-}
-
-static force_inline void
-over_rev_non_pre_2x128 (__m128i  src_lo,
-                        __m128i  src_hi,
-                        __m128i* dst_lo,
-                        __m128i* dst_hi)
-{
-    __m128i lo, hi;
-    __m128i alpha_lo, alpha_hi;
-
-    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
-
-    lo = _mm_or_si128 (alpha_lo, mask_alpha);
-    hi = _mm_or_si128 (alpha_hi, mask_alpha);
-
-    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
-
-    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
-
-    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
-}
-
-static force_inline void
-in_over_2x128 (__m128i* src_lo,
-               __m128i* src_hi,
-               __m128i* alpha_lo,
-               __m128i* alpha_hi,
-               __m128i* mask_lo,
-               __m128i* mask_hi,
-               __m128i* dst_lo,
-               __m128i* dst_hi)
-{
-    __m128i s_lo, s_hi;
-    __m128i a_lo, a_hi;
-
-    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
-    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
-
-    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
-}
-
-/* load 4 pixels from a 16-byte boundary aligned address */
-static force_inline __m128i
-load_128_aligned (__m128i* src)
-{
-    return _mm_load_si128 (src);
-}
-
-/* load 4 pixels from a unaligned address */
-static force_inline __m128i
-load_128_unaligned (const __m128i* src)
-{
-    return _mm_loadu_si128 (src);
-}
-
-/* save 4 pixels using Write Combining memory on a 16-byte
- * boundary aligned address
- */
-static force_inline void
-save_128_write_combining (__m128i* dst,
-                          __m128i  data)
-{
-    _mm_stream_si128 (dst, data);
-}
-
-/* save 4 pixels on a 16-byte boundary aligned address */
-static force_inline void
-save_128_aligned (__m128i* dst,
-                  __m128i  data)
-{
-    _mm_store_si128 (dst, data);
-}
-
-/* save 4 pixels on a unaligned address */
-static force_inline void
-save_128_unaligned (__m128i* dst,
-                    __m128i  data)
-{
-    _mm_storeu_si128 (dst, data);
-}
-
-static force_inline __m128i
-load_32_1x128 (uint32_t data)
-{
-    return _mm_cvtsi32_si128 (data);
-}
-
-static force_inline __m128i
-expand_alpha_rev_1x128 (__m128i data)
-{
-    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
-}
-
-static force_inline __m128i
-expand_pixel_8_1x128 (uint8_t data)
-{
-    return _mm_shufflelo_epi16 (
-	unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
-}
-
-static force_inline __m128i
-pix_multiply_1x128 (__m128i data,
-		    __m128i alpha)
-{
-    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
-					    mask_0080),
-			    mask_0101);
-}
-
-static force_inline __m128i
-pix_add_multiply_1x128 (__m128i* src,
-			__m128i* alpha_dst,
-			__m128i* dst,
-			__m128i* alpha_src)
-{
-    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
-    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
-
-    return _mm_adds_epu8 (t1, t2);
-}
-
-static force_inline __m128i
-negate_1x128 (__m128i data)
-{
-    return _mm_xor_si128 (data, mask_00ff);
-}
-
-static force_inline __m128i
-invert_colors_1x128 (__m128i data)
-{
-    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
-}
-
-static force_inline __m128i
-over_1x128 (__m128i src, __m128i alpha, __m128i dst)
-{
-    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
-}
-
-static force_inline __m128i
-in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
-{
-    return over_1x128 (pix_multiply_1x128 (*src, *mask),
-		       pix_multiply_1x128 (*alpha, *mask),
-		       *dst);
-}
-
-static force_inline __m128i
-over_rev_non_pre_1x128 (__m128i src, __m128i dst)
-{
-    __m128i alpha = expand_alpha_1x128 (src);
-
-    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
-					   _mm_or_si128 (alpha, mask_alpha)),
-		       alpha,
-		       dst);
-}
-
-static force_inline uint32_t
-pack_1x128_32 (__m128i data)
-{
-    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
-}
-
-static force_inline __m128i
-expand565_16_1x128 (uint16_t pixel)
-{
-    __m128i m = _mm_cvtsi32_si128 (pixel);
-
-    m = unpack_565_to_8888 (m);
-
-    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
-}
-
-static force_inline uint32_t
-core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
-{
-    uint8_t a;
-    __m128i xmms;
-
-    a = src >> 24;
-
-    if (a == 0xff)
-    {
-	return src;
-    }
-    else if (src)
-    {
-	xmms = unpack_32_1x128 (src);
-	return pack_1x128_32 (
-	    over_1x128 (xmms, expand_alpha_1x128 (xmms),
-			unpack_32_1x128 (dst)));
-    }
-
-    return dst;
-}
-
-static force_inline uint32_t
-combine1 (const uint32_t *ps, const uint32_t *pm)
-{
-    uint32_t s = *ps;
-
-    if (pm)
-    {
-	__m128i ms, mm;
-
-	mm = unpack_32_1x128 (*pm);
-	mm = expand_alpha_1x128 (mm);
-
-	ms = unpack_32_1x128 (s);
-	ms = pix_multiply_1x128 (ms, mm);
-
-	s = pack_1x128_32 (ms);
-    }
-
-    return s;
-}
-
-static force_inline __m128i
-combine4 (const __m128i *ps, const __m128i *pm)
-{
-    __m128i xmm_src_lo, xmm_src_hi;
-    __m128i xmm_msk_lo, xmm_msk_hi;
-    __m128i s;
-
-    if (pm)
-    {
-	xmm_msk_lo = load_128_unaligned (pm);
-
-	if (is_transparent (xmm_msk_lo))
-	    return _mm_setzero_si128 ();
-    }
-
-    s = load_128_unaligned (ps);
-
-    if (pm)
-    {
-	unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
-	unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
-
-	expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
-
-	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
-			    &xmm_msk_lo, &xmm_msk_hi,
-			    &xmm_src_lo, &xmm_src_hi);
-
-	s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
-    }
-
-    return s;
-}
-
-static force_inline void
-core_combine_over_u_sse2_mask (uint32_t *	  pd,
-			       const uint32_t*    ps,
-			       const uint32_t*    pm,
-			       int                w)
-{
-    uint32_t s, d;
-
-    /* Align dst on a 16-byte boundary */
-    while (w && ((unsigned long)pd & 15))
-    {
-	d = *pd;
-	s = combine1 (ps, pm);
-
-	if (s)
-	    *pd = core_combine_over_u_pixel_sse2 (s, d);
-	pd++;
-	ps++;
-	pm++;
-	w--;
-    }
-
-    while (w >= 4)
-    {
-	__m128i mask = load_128_unaligned ((__m128i *)pm);
-
-	if (!is_zero (mask))
-	{
-	    __m128i src;
-	    __m128i src_hi, src_lo;
-	    __m128i mask_hi, mask_lo;
-	    __m128i alpha_hi, alpha_lo;
-
-	    src = load_128_unaligned ((__m128i *)ps);
-
-	    if (is_opaque (_mm_and_si128 (src, mask)))
-	    {
-		save_128_aligned ((__m128i *)pd, src);
-	    }
-	    else
-	    {
-		__m128i dst = load_128_aligned ((__m128i *)pd);
-		__m128i dst_hi, dst_lo;
-
-		unpack_128_2x128 (mask, &mask_lo, &mask_hi);
-		unpack_128_2x128 (src, &src_lo, &src_hi);
-
-		expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
-		pix_multiply_2x128 (&src_lo, &src_hi,
-				    &mask_lo, &mask_hi,
-				    &src_lo, &src_hi);
-
-		unpack_128_2x128 (dst, &dst_lo, &dst_hi);
-
-		expand_alpha_2x128 (src_lo, src_hi,
-				    &alpha_lo, &alpha_hi);
-
-		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
-			    &dst_lo, &dst_hi);
-
-		save_128_aligned (
-		    (__m128i *)pd,
-		    pack_2x128_128 (dst_lo, dst_hi));
-	    }
-	}
-
-	pm += 4;
-	ps += 4;
-	pd += 4;
-	w -= 4;
-    }
-    while (w)
-    {
-	d = *pd;
-	s = combine1 (ps, pm);
-
-	if (s)
-	    *pd = core_combine_over_u_pixel_sse2 (s, d);
-	pd++;
-	ps++;
-	pm++;
-
-	w--;
-    }
-}
-
-static force_inline void
-core_combine_over_u_sse2_no_mask (uint32_t *	  pd,
-				  const uint32_t*    ps,
-				  int                w)
-{
-    uint32_t s, d;
-
-    /* Align dst on a 16-byte boundary */
-    while (w && ((unsigned long)pd & 15))
-    {
-	d = *pd;
-	s = *ps;
-
-	if (s)
-	    *pd = core_combine_over_u_pixel_sse2 (s, d);
-	pd++;
-	ps++;
-	w--;
-    }
-
-    while (w >= 4)
-    {
-	__m128i src;
-	__m128i src_hi, src_lo, dst_hi, dst_lo;
-	__m128i alpha_hi, alpha_lo;
-
-	src = load_128_unaligned ((__m128i *)ps);
-
-	if (!is_zero (src))
-	{
-	    if (is_opaque (src))
-	    {
-		save_128_aligned ((__m128i *)pd, src);
-	    }
-	    else
-	    {
-		__m128i dst = load_128_aligned ((__m128i *)pd);
-
-		unpack_128_2x128 (src, &src_lo, &src_hi);
-		unpack_128_2x128 (dst, &dst_lo, &dst_hi);
-
-		expand_alpha_2x128 (src_lo, src_hi,
-				    &alpha_lo, &alpha_hi);
-		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
-			    &dst_lo, &dst_hi);
-
-		save_128_aligned (
-		    (__m128i *)pd,
-		    pack_2x128_128 (dst_lo, dst_hi));
-	    }
-	}
-
-	ps += 4;
-	pd += 4;
-	w -= 4;
-    }
-    while (w)
-    {
-	d = *pd;
-	s = *ps;
-
-	if (s)
-	    *pd = core_combine_over_u_pixel_sse2 (s, d);
-	pd++;
-	ps++;
-
-	w--;
-    }
-}
-
-static force_inline void
-sse2_combine_over_u (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     uint32_t *               pd,
-                     const uint32_t *         ps,
-                     const uint32_t *         pm,
-                     int                      w)
-{
-    if (pm)
-	core_combine_over_u_sse2_mask (pd, ps, pm, w);
-    else
-	core_combine_over_u_sse2_no_mask (pd, ps, w);
-}
-
-static void
-sse2_combine_over_reverse_u (pixman_implementation_t *imp,
-                             pixman_op_t              op,
-                             uint32_t *               pd,
-                             const uint32_t *         ps,
-                             const uint32_t *         pm,
-                             int                      w)
-{
-    uint32_t s, d;
-
-    __m128i xmm_dst_lo, xmm_dst_hi;
-    __m128i xmm_src_lo, xmm_src_hi;
-    __m128i xmm_alpha_lo, xmm_alpha_hi;
-
-    /* Align dst on a 16-byte boundary */
-    while (w &&
-           ((unsigned long)pd & 15))
-    {
-	d = *pd;
-	s = combine1 (ps, pm);
-
-	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
-	w--;
-	ps++;
-	if (pm)
-	    pm++;
-    }
-
-    while (w >= 4)
-    {
-	/* I'm loading unaligned because I'm not sure
-	 * about the address alignment.
-	 */
-	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
-	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
-	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
-	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
-			    &xmm_alpha_lo, &xmm_alpha_hi);
-
-	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
-		    &xmm_alpha_lo, &xmm_alpha_hi,
-		    &xmm_src_lo, &xmm_src_hi);
-
-	/* rebuid the 4 pixel data and save*/
-	save_128_aligned ((__m128i*)pd,
-			  pack_2x128_128 (xmm_src_lo, xmm_src_hi));
-
-	w -= 4;
-	ps += 4;
-	pd += 4;
-
-	if (pm)
-	    pm += 4;
-    }
-
-    while (w)
-    {
-	d = *pd;
-	s = combine1 (ps, pm);
-
-	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
-	ps++;
-	w--;
-	if (pm)
-	    pm++;
-    }
-}
-
-static force_inline uint32_t
-core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
-{
-    uint32_t maska = src >> 24;
-
-    if (maska == 0)
-    {
-	return 0;
-    }
-    else if (maska != 0xff)
-    {
-	return pack_1x128_32 (
-	    pix_multiply_1x128 (unpack_32_1x128 (dst),
-				expand_alpha_1x128 (unpack_32_1x128 (src))));
-    }
-
-    return dst;
-}
-
-static void
-sse2_combine_in_u (pixman_implementation_t *imp,
-                   pixman_op_t              op,
-                   uint32_t *               pd,
-                   const uint32_t *         ps,
-                   const uint32_t *         pm,
-                   int                      w)
-{
-    uint32_t s, d;
-
-    __m128i xmm_src_lo, xmm_src_hi;
-    __m128i xmm_dst_lo, xmm_dst_hi;
-
-    while (w && ((unsigned long) pd & 15))
-    {
-	s = combine1 (ps, pm);
-	d = *pd;
-
-	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
-	w--;
-	ps++;
-	if (pm)
-	    pm++;
-    }
-
-    while (w >= 4)
-    {
-	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
-
-	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
-	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
-			    &xmm_dst_lo, &xmm_dst_hi,
-			    &xmm_dst_lo, &xmm_dst_hi);
-
-	save_128_aligned ((__m128i*)pd,
-			  pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-	ps += 4;
-	pd += 4;
-	w -= 4;
-	if (pm)
-	    pm += 4;
-    }
-
-    while (w)
-    {
-	s = combine1 (ps, pm);
-	d = *pd;
-
-	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
-	w--;
-	ps++;
-	if (pm)
-	    pm++;
-    }
-}
-
-static void
-sse2_combine_in_reverse_u (pixman_implementation_t *imp,
-                           pixman_op_t              op,
-                           uint32_t *               pd,
-                           const uint32_t *         ps,
-                           const uint32_t *         pm,
-                           int                      w)
-{
-    uint32_t s, d;
-
-    __m128i xmm_src_lo, xmm_src_hi;
-    __m128i xmm_dst_lo, xmm_dst_hi;
-
-    while (w && ((unsigned long) pd & 15))
-    {
-	s = combine1 (ps, pm);
-	d = *pd;
-
-	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
-	ps++;
-	w--;
-	if (pm)
-	    pm++;
-    }
-
-    while (w >= 4)
-    {
-	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
-
-	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-
-	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
-			    &xmm_src_lo, &xmm_src_hi,
-			    &xmm_dst_lo, &xmm_dst_hi);
-
-	save_128_aligned (
-	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-	ps += 4;
-	pd += 4;
-	w -= 4;
-	if (pm)
-	    pm += 4;
-    }
-
-    while (w)
-    {
-	s = combine1 (ps, pm);
-	d = *pd;
-
-	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
-	w--;
-	ps++;
-	if (pm)
-	    pm++;
-    }
-}
-
-static void
-sse2_combine_out_reverse_u (pixman_implementation_t *imp,
-                            pixman_op_t              op,
-                            uint32_t *               pd,
-                            const uint32_t *         ps,
-                            const uint32_t *         pm,
-                            int                      w)
-{
-    while (w && ((unsigned long) pd & 15))
-    {
-	uint32_t s = combine1 (ps, pm);
-	uint32_t d = *pd;
-
-	*pd++ = pack_1x128_32 (
-	    pix_multiply_1x128 (
-		unpack_32_1x128 (d), negate_1x128 (
-		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
-
-	if (pm)
-	    pm++;
-	ps++;
-	w--;
-    }
-
-    while (w >= 4)
-    {
-	__m128i xmm_src_lo, xmm_src_hi;
-	__m128i xmm_dst_lo, xmm_dst_hi;
-
-	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
-	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
-	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
-	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-	negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-
-	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
-			    &xmm_src_lo, &xmm_src_hi,
-			    &xmm_dst_lo, &xmm_dst_hi);
-
-	save_128_aligned (
-	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-	ps += 4;
-	pd += 4;
-	if (pm)
-	    pm += 4;
-
-	w -= 4;
-    }
-
-    while (w)
-    {
-	uint32_t s = combine1 (ps, pm);
-	uint32_t d = *pd;
-
-	*pd++ = pack_1x128_32 (
-	    pix_multiply_1x128 (
-		unpack_32_1x128 (d), negate_1x128 (
-		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
-	ps++;
-	if (pm)
-	    pm++;
-	w--;
-    }
-}
-
-static void
-sse2_combine_out_u (pixman_implementation_t *imp,
-                    pixman_op_t              op,
-                    uint32_t *               pd,
-                    const uint32_t *         ps,
-                    const uint32_t *         pm,
-                    int                      w)
-{
-    while (w && ((unsigned long) pd & 15))
-    {
-	uint32_t s = combine1 (ps, pm);
-	uint32_t d = *pd;
-
-	*pd++ = pack_1x128_32 (
-	    pix_multiply_1x128 (
-		unpack_32_1x128 (s), negate_1x128 (
-		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
-	w--;
-	ps++;
-	if (pm)
-	    pm++;
-    }
-
-    while (w >= 4)
-    {
-	__m128i xmm_src_lo, xmm_src_hi;
-	__m128i xmm_dst_lo, xmm_dst_hi;
-
-	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
-	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
-	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
-	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-	negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
-	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
-			    &xmm_dst_lo, &xmm_dst_hi,
-			    &xmm_dst_lo, &xmm_dst_hi);
-
-	save_128_aligned (
-	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-	ps += 4;
-	pd += 4;
-	w -= 4;
-	if (pm)
-	    pm += 4;
-    }
-
-    while (w)
-    {
-	uint32_t s = combine1 (ps, pm);
-	uint32_t d = *pd;
-
-	*pd++ = pack_1x128_32 (
-	    pix_multiply_1x128 (
-		unpack_32_1x128 (s), negate_1x128 (
-		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
-	w--;
-	ps++;
-	if (pm)
-	    pm++;
-    }
-}
-
-static force_inline uint32_t
-core_combine_atop_u_pixel_sse2 (uint32_t src,
-                                uint32_t dst)
-{
-    __m128i s = unpack_32_1x128 (src);
-    __m128i d = unpack_32_1x128 (dst);
-
-    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
-    __m128i da = expand_alpha_1x128 (d);
-
-    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
-}
-
-static void
-sse2_combine_atop_u (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     uint32_t *               pd,
-                     const uint32_t *         ps,
-                     const uint32_t *         pm,
-                     int                      w)
-{
-    uint32_t s, d;
-
-    __m128i xmm_src_lo, xmm_src_hi;
-    __m128i xmm_dst_lo, xmm_dst_hi;
-    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
-    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
-
-    while (w && ((unsigned long) pd & 15))
-    {
-	s = combine1 (ps, pm);
-	d = *pd;
-
-	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
-	w--;
-	ps++;
-	if (pm)
-	    pm++;
-    }
-
-    while (w >= 4)
-    {
-	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
-	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
-	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
-	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
-			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
-	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
-			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
-	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
-		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
-
-	pix_add_multiply_2x128 (
-	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
-	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
-	    &xmm_dst_lo, &xmm_dst_hi);
-
-	save_128_aligned (
-	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-	ps += 4;
-	pd += 4;
-	w -= 4;
-	if (pm)
-	    pm += 4;
-    }
-
-    while (w)
-    {
-	s = combine1 (ps, pm);
-	d = *pd;
-
-	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
-	w--;
-	ps++;
-	if (pm)
-	    pm++;
-    }
-}
-
-static force_inline uint32_t
-core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
-                                        uint32_t dst)
-{
-    __m128i s = unpack_32_1x128 (src);
-    __m128i d = unpack_32_1x128 (dst);
-
-    __m128i sa = expand_alpha_1x128 (s);
-    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
-
-    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
-}
-
-static void
-sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
-                             pixman_op_t              op,
-                             uint32_t *               pd,
-                             const uint32_t *         ps,
-                             const uint32_t *         pm,
-                             int                      w)
-{
-    uint32_t s, d;
-
-    __m128i xmm_src_lo, xmm_src_hi;
-    __m128i xmm_dst_lo, xmm_dst_hi;
-    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
-    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
-
-    while (w && ((unsigned long) pd & 15))
-    {
-	s = combine1 (ps, pm);
-	d = *pd;
-
-	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
-	ps++;
-	w--;
-	if (pm)
-	    pm++;
-    }
-
-    while (w >= 4)
-    {
-	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
-	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
-	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
-	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
-			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
-	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
-			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
-	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
-		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
-	pix_add_multiply_2x128 (
-	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
-	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
-	    &xmm_dst_lo, &xmm_dst_hi);
-
-	save_128_aligned (
-	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-	ps += 4;
-	pd += 4;
-	w -= 4;
-	if (pm)
-	    pm += 4;
-    }
-
-    while (w)
-    {
-	s = combine1 (ps, pm);
-	d = *pd;
-
-	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
-	ps++;
-	w--;
-	if (pm)
-	    pm++;
-    }
-}
-
-static force_inline uint32_t
-core_combine_xor_u_pixel_sse2 (uint32_t src,
-                               uint32_t dst)
-{
-    __m128i s = unpack_32_1x128 (src);
-    __m128i d = unpack_32_1x128 (dst);
-
-    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
-    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
-
-    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
-}
-
-static void
-sse2_combine_xor_u (pixman_implementation_t *imp,
-                    pixman_op_t              op,
-                    uint32_t *               dst,
-                    const uint32_t *         src,
-                    const uint32_t *         mask,
-                    int                      width)
-{
-    int w = width;
-    uint32_t s, d;
-    uint32_t* pd = dst;
-    const uint32_t* ps = src;
-    const uint32_t* pm = mask;
-
-    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
-    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
-    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
-
-    while (w && ((unsigned long) pd & 15))
-    {
-	s = combine1 (ps, pm);
-	d = *pd;
-
-	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
-	w--;
-	ps++;
-	if (pm)
-	    pm++;
-    }
-
-    while (w >= 4)
-    {
-	xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
-	xmm_dst = load_128_aligned ((__m128i*) pd);
-
-	unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-	unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
-	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
-			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
-	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
-			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
-	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
-		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
-	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
-		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
-	pix_add_multiply_2x128 (
-	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
-	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
-	    &xmm_dst_lo, &xmm_dst_hi);
-
-	save_128_aligned (
-	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-	ps += 4;
-	pd += 4;
-	w -= 4;
-	if (pm)
-	    pm += 4;
-    }
-
-    while (w)
-    {
-	s = combine1 (ps, pm);
-	d = *pd;
-
-	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
-	w--;
-	ps++;
-	if (pm)
-	    pm++;
-    }
-}
-
-static force_inline void
-sse2_combine_add_u (pixman_implementation_t *imp,
-                    pixman_op_t              op,
-                    uint32_t *               dst,
-                    const uint32_t *         src,
-                    const uint32_t *         mask,
-                    int                      width)
-{
-    int w = width;
-    uint32_t s, d;
-    uint32_t* pd = dst;
-    const uint32_t* ps = src;
-    const uint32_t* pm = mask;
-
-    while (w && (unsigned long)pd & 15)
-    {
-	s = combine1 (ps, pm);
-	d = *pd;
-
-	ps++;
-	if (pm)
-	    pm++;
-	*pd++ = _mm_cvtsi128_si32 (
-	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
-	w--;
-    }
-
-    while (w >= 4)
-    {
-	__m128i s;
-
-	s = combine4 ((__m128i*)ps, (__m128i*)pm);
-
-	save_128_aligned (
-	    (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));
-
-	pd += 4;
-	ps += 4;
-	if (pm)
-	    pm += 4;
-	w -= 4;
-    }
-
-    while (w--)
-    {
-	s = combine1 (ps, pm);
-	d = *pd;
-
-	ps++;
-	*pd++ = _mm_cvtsi128_si32 (
-	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
-	if (pm)
-	    pm++;
-    }
-}
-
-static force_inline uint32_t
-core_combine_saturate_u_pixel_sse2 (uint32_t src,
-                                    uint32_t dst)
-{
-    __m128i ms = unpack_32_1x128 (src);
-    __m128i md = unpack_32_1x128 (dst);
-    uint32_t sa = src >> 24;
-    uint32_t da = ~dst >> 24;
-
-    if (sa > da)
-    {
-	ms = pix_multiply_1x128 (
-	    ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
-    }
-
-    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
-}
-
-static void
-sse2_combine_saturate_u (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         uint32_t *               pd,
-                         const uint32_t *         ps,
-                         const uint32_t *         pm,
-                         int                      w)
-{
-    uint32_t s, d;
-
-    uint32_t pack_cmp;
-    __m128i xmm_src, xmm_dst;
-
-    while (w && (unsigned long)pd & 15)
-    {
-	s = combine1 (ps, pm);
-	d = *pd;
-
-	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
-	w--;
-	ps++;
-	if (pm)
-	    pm++;
-    }
-
-    while (w >= 4)
-    {
-	xmm_dst = load_128_aligned  ((__m128i*)pd);
-	xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
-
-	pack_cmp = _mm_movemask_epi8 (
-	    _mm_cmpgt_epi32 (
-		_mm_srli_epi32 (xmm_src, 24),
-		_mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
-
-	/* if some alpha src is grater than respective ~alpha dst */
-	if (pack_cmp)
-	{
-	    s = combine1 (ps++, pm);
-	    d = *pd;
-	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
-	    if (pm)
-		pm++;
-
-	    s = combine1 (ps++, pm);
-	    d = *pd;
-	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
-	    if (pm)
-		pm++;
-
-	    s = combine1 (ps++, pm);
-	    d = *pd;
-	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
-	    if (pm)
-		pm++;
-
-	    s = combine1 (ps++, pm);
-	    d = *pd;
-	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
-	    if (pm)
-		pm++;
-	}
-	else
-	{
-	    save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
-
-	    pd += 4;
-	    ps += 4;
-	    if (pm)
-		pm += 4;
-	}
-
-	w -= 4;
-    }
-
-    while (w--)
-    {
-	s = combine1 (ps, pm);
-	d = *pd;
-
-	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
-	ps++;
-	if (pm)
-	    pm++;
-    }
-}
-
-static void
-sse2_combine_src_ca (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     uint32_t *               pd,
-                     const uint32_t *         ps,
-                     const uint32_t *         pm,
-                     int                      w)
-{
-    uint32_t s, m;
-
-    __m128i xmm_src_lo, xmm_src_hi;
-    __m128i xmm_mask_lo, xmm_mask_hi;
-    __m128i xmm_dst_lo, xmm_dst_hi;
-
-    while (w && (unsigned long)pd & 15)
-    {
-	s = *ps++;
-	m = *pm++;
-	*pd++ = pack_1x128_32 (
-	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
-	w--;
-    }
-
-    while (w >= 4)
-    {
-	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
-	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
-	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
-	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
-			    &xmm_mask_lo, &xmm_mask_hi,
-			    &xmm_dst_lo, &xmm_dst_hi);
-
-	save_128_aligned (
-	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-	ps += 4;
-	pd += 4;
-	pm += 4;
-	w -= 4;
-    }
-
-    while (w)
-    {
-	s = *ps++;
-	m = *pm++;
-	*pd++ = pack_1x128_32 (
-	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
-	w--;
-    }
-}
-
-static force_inline uint32_t
-core_combine_over_ca_pixel_sse2 (uint32_t src,
-                                 uint32_t mask,
-                                 uint32_t dst)
-{
-    __m128i s = unpack_32_1x128 (src);
-    __m128i expAlpha = expand_alpha_1x128 (s);
-    __m128i unpk_mask = unpack_32_1x128 (mask);
-    __m128i unpk_dst  = unpack_32_1x128 (dst);
-
-    return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
-}
-
-static void
-sse2_combine_over_ca (pixman_implementation_t *imp,
-                      pixman_op_t              op,
-                      uint32_t *               pd,
-                      const uint32_t *         ps,
-                      const uint32_t *         pm,
-                      int                      w)
-{
-    uint32_t s, m, d;
-
-    __m128i xmm_alpha_lo, xmm_alpha_hi;
-    __m128i xmm_src_lo, xmm_src_hi;
-    __m128i xmm_dst_lo, xmm_dst_hi;
-    __m128i xmm_mask_lo, xmm_mask_hi;
-
-    while (w && (unsigned long)pd & 15)
-    {
-	s = *ps++;
-	m = *pm++;
-	d = *pd;
-
-	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
-	w--;
-    }
-
-    while (w >= 4)
-    {
-	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
-	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
-	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
-	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
-	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
-			    &xmm_alpha_lo, &xmm_alpha_hi);
-
-	in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
-		       &xmm_alpha_lo, &xmm_alpha_hi,
-		       &xmm_mask_lo, &xmm_mask_hi,
-		       &xmm_dst_lo, &xmm_dst_hi);
-
-	save_128_aligned (
-	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-	ps += 4;
-	pd += 4;
-	pm += 4;
-	w -= 4;
-    }
-
-    while (w)
-    {
-	s = *ps++;
-	m = *pm++;
-	d = *pd;
-
-	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
-	w--;
-    }
-}
-
-static force_inline uint32_t
-core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
-                                         uint32_t mask,
-                                         uint32_t dst)
-{
-    __m128i d = unpack_32_1x128 (dst);
-
-    return pack_1x128_32 (
-	over_1x128 (d, expand_alpha_1x128 (d),
-		    pix_multiply_1x128 (unpack_32_1x128 (src),
-					unpack_32_1x128 (mask))));
-}
-
-static void
-sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
-                              pixman_op_t              op,
-                              uint32_t *               pd,
-                              const uint32_t *         ps,
-                              const uint32_t *         pm,
-                              int                      w)
-{
-    uint32_t s, m, d;
-
-    __m128i xmm_alpha_lo, xmm_alpha_hi;
-    __m128i xmm_src_lo, xmm_src_hi;
-    __m128i xmm_dst_lo, xmm_dst_hi;
-    __m128i xmm_mask_lo, xmm_mask_hi;
-
-    while (w && (unsigned long)pd & 15)
-    {
-	s = *ps++;
-	m = *pm++;
-	d = *pd;
-
-	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
-	w--;
-    }
-
-    while (w >= 4)
-    {
-	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
-	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
-	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
-	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
-	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
-			    &xmm_alpha_lo, &xmm_alpha_hi);
-	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
-			    &xmm_mask_lo, &xmm_mask_hi,
-			    &xmm_mask_lo, &xmm_mask_hi);
-
-	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
-		    &xmm_alpha_lo, &xmm_alpha_hi,
-		    &xmm_mask_lo, &xmm_mask_hi);
-
-	save_128_aligned (
-	    (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
-
-	ps += 4;
-	pd += 4;
-	pm += 4;
-	w -= 4;
-    }
-
-    while (w)
-    {
-	s = *ps++;
-	m = *pm++;
-	d = *pd;
-
-	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
-	w--;
-    }
-}
-
-static void
-sse2_combine_in_ca (pixman_implementation_t *imp,
-                    pixman_op_t              op,
-                    uint32_t *               pd,
-                    const uint32_t *         ps,
-                    const uint32_t *         pm,
-                    int                      w)
-{
-    uint32_t s, m, d;
-
-    __m128i xmm_alpha_lo, xmm_alpha_hi;
-    __m128i xmm_src_lo, xmm_src_hi;
-    __m128i xmm_dst_lo, xmm_dst_hi;
-    __m128i xmm_mask_lo, xmm_mask_hi;
-
-    while (w && (unsigned long)pd & 15)
-    {
-	s = *ps++;
-	m = *pm++;
-	d = *pd;
-
-	*pd++ = pack_1x128_32 (
-	    pix_multiply_1x128 (
-		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
-		expand_alpha_1x128 (unpack_32_1x128 (d))));
-
-	w--;
-    }
-
-    while (w >= 4)
-    {
-	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
-	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
-	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
-	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
-	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
-			    &xmm_alpha_lo, &xmm_alpha_hi);
-
-	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
-			    &xmm_mask_lo, &xmm_mask_hi,
-			    &xmm_dst_lo, &xmm_dst_hi);
-
-	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
-			    &xmm_alpha_lo, &xmm_alpha_hi,
-			    &xmm_dst_lo, &xmm_dst_hi);
-
-	save_128_aligned (
-	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-	ps += 4;
-	pd += 4;
-	pm += 4;
-	w -= 4;
-    }
-
-    while (w)
-    {
-	s = *ps++;
-	m = *pm++;
-	d = *pd;
-
-	*pd++ = pack_1x128_32 (
-	    pix_multiply_1x128 (
-		pix_multiply_1x128 (
-		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
-		expand_alpha_1x128 (unpack_32_1x128 (d))));
-
-	w--;
-    }
-}
-
-static void
-sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
-                            pixman_op_t              op,
-                            uint32_t *               pd,
-                            const uint32_t *         ps,
-                            const uint32_t *         pm,
-                            int                      w)
-{
-    uint32_t s, m, d;
-
-    __m128i xmm_alpha_lo, xmm_alpha_hi;
-    __m128i xmm_src_lo, xmm_src_hi;
-    __m128i xmm_dst_lo, xmm_dst_hi;
-    __m128i xmm_mask_lo, xmm_mask_hi;
-
-    while (w && (unsigned long)pd & 15)
-    {
-	s = *ps++;
-	m = *pm++;
-	d = *pd;
-
-	*pd++ = pack_1x128_32 (
-	    pix_multiply_1x128 (
-		unpack_32_1x128 (d),
-		pix_multiply_1x128 (unpack_32_1x128 (m),
-				   expand_alpha_1x128 (unpack_32_1x128 (s)))));
-	w--;
-    }
-
-    while (w >= 4)
-    {
-	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
-	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
-	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
-	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
-	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
-			    &xmm_alpha_lo, &xmm_alpha_hi);
-	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
-			    &xmm_alpha_lo, &xmm_alpha_hi,
-			    &xmm_alpha_lo, &xmm_alpha_hi);
-
-	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
-			    &xmm_alpha_lo, &xmm_alpha_hi,
-			    &xmm_dst_lo, &xmm_dst_hi);
-
-	save_128_aligned (
-	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-	ps += 4;
-	pd += 4;
-	pm += 4;
-	w -= 4;
-    }
-
-    while (w)
-    {
-	s = *ps++;
-	m = *pm++;
-	d = *pd;
-
-	*pd++ = pack_1x128_32 (
-	    pix_multiply_1x128 (
-		unpack_32_1x128 (d),
-		pix_multiply_1x128 (unpack_32_1x128 (m),
-				   expand_alpha_1x128 (unpack_32_1x128 (s)))));
-	w--;
-    }
-}
-
-static void
-sse2_combine_out_ca (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     uint32_t *               pd,
-                     const uint32_t *         ps,
-                     const uint32_t *         pm,
-                     int                      w)
-{
-    uint32_t s, m, d;
-
-    __m128i xmm_alpha_lo, xmm_alpha_hi;
-    __m128i xmm_src_lo, xmm_src_hi;
-    __m128i xmm_dst_lo, xmm_dst_hi;
-    __m128i xmm_mask_lo, xmm_mask_hi;
-
-    while (w && (unsigned long)pd & 15)
-    {
-	s = *ps++;
-	m = *pm++;
-	d = *pd;
-
-	*pd++ = pack_1x128_32 (
-	    pix_multiply_1x128 (
-		pix_multiply_1x128 (
-		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
-		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
-	w--;
-    }
-
-    while (w >= 4)
-    {
-	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
-	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
-	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
-	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
-	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
-			    &xmm_alpha_lo, &xmm_alpha_hi);
-	negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
-		      &xmm_alpha_lo, &xmm_alpha_hi);
-
-	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
-			    &xmm_mask_lo, &xmm_mask_hi,
-			    &xmm_dst_lo, &xmm_dst_hi);
-	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
-			    &xmm_alpha_lo, &xmm_alpha_hi,
-			    &xmm_dst_lo, &xmm_dst_hi);
-
-	save_128_aligned (
-	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-	ps += 4;
-	pd += 4;
-	pm += 4;
-	w -= 4;
-    }
-
-    while (w)
-    {
-	s = *ps++;
-	m = *pm++;
-	d = *pd;
-
-	*pd++ = pack_1x128_32 (
-	    pix_multiply_1x128 (
-		pix_multiply_1x128 (
-		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
-		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
-
-	w--;
-    }
-}
-
-static void
-sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
-                             pixman_op_t              op,
-                             uint32_t *               pd,
-                             const uint32_t *         ps,
-                             const uint32_t *         pm,
-                             int                      w)
-{
-    uint32_t s, m, d;
-
-    __m128i xmm_alpha_lo, xmm_alpha_hi;
-    __m128i xmm_src_lo, xmm_src_hi;
-    __m128i xmm_dst_lo, xmm_dst_hi;
-    __m128i xmm_mask_lo, xmm_mask_hi;
-
-    while (w && (unsigned long)pd & 15)
-    {
-	s = *ps++;
-	m = *pm++;
-	d = *pd;
-
-	*pd++ = pack_1x128_32 (
-	    pix_multiply_1x128 (
-		unpack_32_1x128 (d),
-		negate_1x128 (pix_multiply_1x128 (
-				 unpack_32_1x128 (m),
-				 expand_alpha_1x128 (unpack_32_1x128 (s))))));
-	w--;
-    }
-
-    while (w >= 4)
-    {
-	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
-	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
-	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
-	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
-	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
-			    &xmm_alpha_lo, &xmm_alpha_hi);
-
-	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
-			    &xmm_alpha_lo, &xmm_alpha_hi,
-			    &xmm_mask_lo, &xmm_mask_hi);
-
-	negate_2x128 (xmm_mask_lo, xmm_mask_hi,
-		      &xmm_mask_lo, &xmm_mask_hi);
-
-	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
-			    &xmm_mask_lo, &xmm_mask_hi,
-			    &xmm_dst_lo, &xmm_dst_hi);
-
-	save_128_aligned (
-	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-	ps += 4;
-	pd += 4;
-	pm += 4;
-	w -= 4;
-    }
-
-    while (w)
-    {
-	s = *ps++;
-	m = *pm++;
-	d = *pd;
-
-	*pd++ = pack_1x128_32 (
-	    pix_multiply_1x128 (
-		unpack_32_1x128 (d),
-		negate_1x128 (pix_multiply_1x128 (
-				 unpack_32_1x128 (m),
-				 expand_alpha_1x128 (unpack_32_1x128 (s))))));
-	w--;
-    }
-}
-
-static force_inline uint32_t
-core_combine_atop_ca_pixel_sse2 (uint32_t src,
-                                 uint32_t mask,
-                                 uint32_t dst)
-{
-    __m128i m = unpack_32_1x128 (mask);
-    __m128i s = unpack_32_1x128 (src);
-    __m128i d = unpack_32_1x128 (dst);
-    __m128i sa = expand_alpha_1x128 (s);
-    __m128i da = expand_alpha_1x128 (d);
-
-    s = pix_multiply_1x128 (s, m);
-    m = negate_1x128 (pix_multiply_1x128 (m, sa));
-
-    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
-}
-
-static void
-sse2_combine_atop_ca (pixman_implementation_t *imp,
-                      pixman_op_t              op,
-                      uint32_t *               pd,
-                      const uint32_t *         ps,
-                      const uint32_t *         pm,
-                      int                      w)
-{
-    uint32_t s, m, d;
-
-    __m128i xmm_src_lo, xmm_src_hi;
-    __m128i xmm_dst_lo, xmm_dst_hi;
-    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
-    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
-    __m128i xmm_mask_lo, xmm_mask_hi;
-
-    while (w && (unsigned long)pd & 15)
-    {
-	s = *ps++;
-	m = *pm++;
-	d = *pd;
-
-	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
-	w--;
-    }
-
-    while (w >= 4)
-    {
-	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
-	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
-	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
-	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
-	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
-			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
-	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
-			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
-	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
-			    &xmm_mask_lo, &xmm_mask_hi,
-			    &xmm_src_lo, &xmm_src_hi);
-	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
-			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
-			    &xmm_mask_lo, &xmm_mask_hi);
-
-	negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
-	pix_add_multiply_2x128 (
-	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
-	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
-	    &xmm_dst_lo, &xmm_dst_hi);
-
-	save_128_aligned (
-	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-	ps += 4;
-	pd += 4;
-	pm += 4;
-	w -= 4;
-    }
-
-    while (w)
-    {
-	s = *ps++;
-	m = *pm++;
-	d = *pd;
-
-	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
-	w--;
-    }
-}
-
-static force_inline uint32_t
-core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
-                                         uint32_t mask,
-                                         uint32_t dst)
-{
-    __m128i m = unpack_32_1x128 (mask);
-    __m128i s = unpack_32_1x128 (src);
-    __m128i d = unpack_32_1x128 (dst);
-
-    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
-    __m128i sa = expand_alpha_1x128 (s);
-
-    s = pix_multiply_1x128 (s, m);
-    m = pix_multiply_1x128 (m, sa);
-
-    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
-}
-
-static void
-sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
-                              pixman_op_t              op,
-                              uint32_t *               pd,
-                              const uint32_t *         ps,
-                              const uint32_t *         pm,
-                              int                      w)
-{
-    uint32_t s, m, d;
-
-    __m128i xmm_src_lo, xmm_src_hi;
-    __m128i xmm_dst_lo, xmm_dst_hi;
-    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
-    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
-    __m128i xmm_mask_lo, xmm_mask_hi;
-
-    while (w && (unsigned long)pd & 15)
-    {
-	s = *ps++;
-	m = *pm++;
-	d = *pd;
-
-	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
-	w--;
-    }
-
-    while (w >= 4)
-    {
-	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
-	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
-	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
-	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
-	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
-			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
-	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
-			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
-	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
-			    &xmm_mask_lo, &xmm_mask_hi,
-			    &xmm_src_lo, &xmm_src_hi);
-	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
-			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
-			    &xmm_mask_lo, &xmm_mask_hi);
-
-	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
-		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
-	pix_add_multiply_2x128 (
-	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
-	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
-	    &xmm_dst_lo, &xmm_dst_hi);
-
-	save_128_aligned (
-	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-	ps += 4;
-	pd += 4;
-	pm += 4;
-	w -= 4;
-    }
-
-    while (w)
-    {
-	s = *ps++;
-	m = *pm++;
-	d = *pd;
-
-	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
-	w--;
-    }
-}
-
-static force_inline uint32_t
-core_combine_xor_ca_pixel_sse2 (uint32_t src,
-                                uint32_t mask,
-                                uint32_t dst)
-{
-    __m128i a = unpack_32_1x128 (mask);
-    __m128i s = unpack_32_1x128 (src);
-    __m128i d = unpack_32_1x128 (dst);
-
-    __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
-				       a, expand_alpha_1x128 (s)));
-    __m128i dest      = pix_multiply_1x128 (s, a);
-    __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
-
-    return pack_1x128_32 (pix_add_multiply_1x128 (&d,
-                                                &alpha_dst,
-                                                &dest,
-                                                &alpha_src));
-}
-
-static void
-sse2_combine_xor_ca (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     uint32_t *               pd,
-                     const uint32_t *         ps,
-                     const uint32_t *         pm,
-                     int                      w)
-{
-    uint32_t s, m, d;
-
-    __m128i xmm_src_lo, xmm_src_hi;
-    __m128i xmm_dst_lo, xmm_dst_hi;
-    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
-    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
-    __m128i xmm_mask_lo, xmm_mask_hi;
-
-    while (w && (unsigned long)pd & 15)
-    {
-	s = *ps++;
-	m = *pm++;
-	d = *pd;
-
-	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
-	w--;
-    }
-
-    while (w >= 4)
-    {
-	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
-	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
-	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
-	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
-	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
-			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
-	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
-			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
-	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
-			    &xmm_mask_lo, &xmm_mask_hi,
-			    &xmm_src_lo, &xmm_src_hi);
-	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
-			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
-			    &xmm_mask_lo, &xmm_mask_hi);
-
-	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
-		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-	negate_2x128 (xmm_mask_lo, xmm_mask_hi,
-		      &xmm_mask_lo, &xmm_mask_hi);
-
-	pix_add_multiply_2x128 (
-	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
-	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
-	    &xmm_dst_lo, &xmm_dst_hi);
-
-	save_128_aligned (
-	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-	ps += 4;
-	pd += 4;
-	pm += 4;
-	w -= 4;
-    }
-
-    while (w)
-    {
-	s = *ps++;
-	m = *pm++;
-	d = *pd;
-
-	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
-	w--;
-    }
-}
-
-static void
-sse2_combine_add_ca (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     uint32_t *               pd,
-                     const uint32_t *         ps,
-                     const uint32_t *         pm,
-                     int                      w)
-{
-    uint32_t s, m, d;
-
-    __m128i xmm_src_lo, xmm_src_hi;
-    __m128i xmm_dst_lo, xmm_dst_hi;
-    __m128i xmm_mask_lo, xmm_mask_hi;
-
-    while (w && (unsigned long)pd & 15)
-    {
-	s = *ps++;
-	m = *pm++;
-	d = *pd;
-
-	*pd++ = pack_1x128_32 (
-	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
-					       unpack_32_1x128 (m)),
-			   unpack_32_1x128 (d)));
-	w--;
-    }
-
-    while (w >= 4)
-    {
-	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
-	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
-
-	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
-	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
-			    &xmm_mask_lo, &xmm_mask_hi,
-			    &xmm_src_lo, &xmm_src_hi);
-
-	save_128_aligned (
-	    (__m128i*)pd, pack_2x128_128 (
-		_mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
-		_mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
-
-	ps += 4;
-	pd += 4;
-	pm += 4;
-	w -= 4;
-    }
-
-    while (w)
-    {
-	s = *ps++;
-	m = *pm++;
-	d = *pd;
-
-	*pd++ = pack_1x128_32 (
-	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
-					       unpack_32_1x128 (m)),
-			   unpack_32_1x128 (d)));
-	w--;
-    }
-}
-
-static force_inline __m128i
-create_mask_16_128 (uint16_t mask)
-{
-    return _mm_set1_epi16 (mask);
-}
-
-/* Work around a code generation bug in Sun Studio 12. */
-#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
-# define create_mask_2x32_128(mask0, mask1)				\
-    (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
-#else
-static force_inline __m128i
-create_mask_2x32_128 (uint32_t mask0,
-                      uint32_t mask1)
-{
-    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
-}
-#endif
-
-static void
-sse2_composite_over_n_8888 (pixman_implementation_t *imp,
-                            pixman_op_t              op,
-                            pixman_image_t *         src_image,
-                            pixman_image_t *         mask_image,
-                            pixman_image_t *         dst_image,
-                            int32_t                  src_x,
-                            int32_t                  src_y,
-                            int32_t                  mask_x,
-                            int32_t                  mask_y,
-                            int32_t                  dest_x,
-                            int32_t                  dest_y,
-                            int32_t                  width,
-                            int32_t                  height)
-{
-    uint32_t src;
-    uint32_t    *dst_line, *dst, d;
-    int32_t w;
-    int dst_stride;
-    __m128i xmm_src, xmm_alpha;
-    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
-    if (src == 0)
-	return;
-
-    PIXMAN_IMAGE_GET_LINE (
-	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-
-    xmm_src = expand_pixel_32_1x128 (src);
-    xmm_alpha = expand_alpha_1x128 (xmm_src);
-
-    while (height--)
-    {
-	dst = dst_line;
-
-	dst_line += dst_stride;
-	w = width;
-
-	while (w && (unsigned long)dst & 15)
-	{
-	    d = *dst;
-	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
-						xmm_alpha,
-						unpack_32_1x128 (d)));
-	    w--;
-	}
-
-	while (w >= 4)
-	{
-	    xmm_dst = load_128_aligned ((__m128i*)dst);
-
-	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
-	    over_2x128 (&xmm_src, &xmm_src,
-			&xmm_alpha, &xmm_alpha,
-			&xmm_dst_lo, &xmm_dst_hi);
-
-	    /* rebuid the 4 pixel data and save*/
-	    save_128_aligned (
-		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-	    w -= 4;
-	    dst += 4;
-	}
-
-	while (w)
-	{
-	    d = *dst;
-	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
-						xmm_alpha,
-						unpack_32_1x128 (d)));
-	    w--;
-	}
-
-    }
-}
-
-static void
-sse2_composite_over_n_0565 (pixman_implementation_t *imp,
-                            pixman_op_t              op,
-                            pixman_image_t *         src_image,
-                            pixman_image_t *         mask_image,
-                            pixman_image_t *         dst_image,
-                            int32_t                  src_x,
-                            int32_t                  src_y,
-                            int32_t                  mask_x,
-                            int32_t                  mask_y,
-                            int32_t                  dest_x,
-                            int32_t                  dest_y,
-                            int32_t                  width,
-                            int32_t                  height)
-{
-    uint32_t src;
-    uint16_t    *dst_line, *dst, d;
-    int32_t w;
-    int dst_stride;
-    __m128i xmm_src, xmm_alpha;
-    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
-
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
-    if (src == 0)
-	return;
-
-    PIXMAN_IMAGE_GET_LINE (
-	dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-
-    xmm_src = expand_pixel_32_1x128 (src);
-    xmm_alpha = expand_alpha_1x128 (xmm_src);
-
-    while (height--)
-    {
-	dst = dst_line;
-
-	dst_line += dst_stride;
-	w = width;
-
-	while (w && (unsigned long)dst & 15)
-	{
-	    d = *dst;
-
-	    *dst++ = pack_565_32_16 (
-		pack_1x128_32 (over_1x128 (xmm_src,
-					   xmm_alpha,
-					   expand565_16_1x128 (d))));
-	    w--;
-	}
-
-	while (w >= 8)
-	{
-	    xmm_dst = load_128_aligned ((__m128i*)dst);
-
-	    unpack_565_128_4x128 (xmm_dst,
-				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
-
-	    over_2x128 (&xmm_src, &xmm_src,
-			&xmm_alpha, &xmm_alpha,
-			&xmm_dst0, &xmm_dst1);
-	    over_2x128 (&xmm_src, &xmm_src,
-			&xmm_alpha, &xmm_alpha,
-			&xmm_dst2, &xmm_dst3);
-
-	    xmm_dst = pack_565_4x128_128 (
-		&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
-
-	    save_128_aligned ((__m128i*)dst, xmm_dst);
-
-	    dst += 8;
-	    w -= 8;
-	}
-
-	while (w--)
-	{
-	    d = *dst;
-	    *dst++ = pack_565_32_16 (
-		pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
-					   expand565_16_1x128 (d))));
-	}
-    }
-
-}
-
-static void
-sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
-				   pixman_op_t              op,
-				   pixman_image_t *         src_image,
-				   pixman_image_t *         mask_image,
-				   pixman_image_t *         dst_image,
-				   int32_t                  src_x,
-				   int32_t                  src_y,
-				   int32_t                  mask_x,
-				   int32_t                  mask_y,
-				   int32_t                  dest_x,
-				   int32_t                  dest_y,
-				   int32_t                  width,
-				   int32_t                  height)
-{
-    uint32_t src, srca;
-    uint32_t    *dst_line, d;
-    uint32_t    *mask_line, m;
-    uint32_t pack_cmp;
-    int dst_stride, mask_stride;
-
-    __m128i xmm_src, xmm_alpha;
-    __m128i xmm_dst;
-    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
-    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
-
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-    srca = src >> 24;
-
-    if (src == 0)
-	return;
-
-    PIXMAN_IMAGE_GET_LINE (
-	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
-    xmm_src = _mm_unpacklo_epi8 (
-	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
-    xmm_alpha = expand_alpha_1x128 (xmm_src);
-    mmx_src   = xmm_src;
-    mmx_alpha = xmm_alpha;
-
-    while (height--)
-    {
-	int w = width;
-	const uint32_t *pm = (uint32_t *)mask_line;
-	uint32_t *pd = (uint32_t *)dst_line;
-
-	dst_line += dst_stride;
-	mask_line += mask_stride;
-
-	while (w && (unsigned long)pd & 15)
-	{
-	    m = *pm++;
-
-	    if (m)
-	    {
-		d = *pd;
-
-		mmx_mask = unpack_32_1x128 (m);
-		mmx_dest = unpack_32_1x128 (d);
-
-		*pd = pack_1x128_32 (
-		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
-				   mmx_dest));
-	    }
-
-	    pd++;
-	    w--;
-	}
-
-	while (w >= 4)
-	{
-	    xmm_mask = load_128_unaligned ((__m128i*)pm);
-
-	    pack_cmp =
-		_mm_movemask_epi8 (
-		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
-
-	    /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
-	    if (pack_cmp != 0xffff)
-	    {
-		xmm_dst = load_128_aligned ((__m128i*)pd);
-
-		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
-		pix_multiply_2x128 (&xmm_src, &xmm_src,
-				    &xmm_mask_lo, &xmm_mask_hi,
-				    &xmm_mask_lo, &xmm_mask_hi);
-		xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
-
-		save_128_aligned (
-		    (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
-	    }
-
-	    pd += 4;
-	    pm += 4;
-	    w -= 4;
-	}
-
-	while (w)
-	{
-	    m = *pm++;
-
-	    if (m)
-	    {
-		d = *pd;
-
-		mmx_mask = unpack_32_1x128 (m);
-		mmx_dest = unpack_32_1x128 (d);
-
-		*pd = pack_1x128_32 (
-		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
-				   mmx_dest));
-	    }
-
-	    pd++;
-	    w--;
-	}
-    }
-
-}
-
-static void
-sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
-                                    pixman_op_t              op,
-                                    pixman_image_t *         src_image,
-                                    pixman_image_t *         mask_image,
-                                    pixman_image_t *         dst_image,
-                                    int32_t                  src_x,
-                                    int32_t                  src_y,
-                                    int32_t                  mask_x,
-                                    int32_t                  mask_y,
-                                    int32_t                  dest_x,
-                                    int32_t                  dest_y,
-                                    int32_t                  width,
-                                    int32_t                  height)
-{
-    uint32_t src;
-    uint32_t    *dst_line, d;
-    uint32_t    *mask_line, m;
-    uint32_t pack_cmp;
-    int dst_stride, mask_stride;
-
-    __m128i xmm_src, xmm_alpha;
-    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
-    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
-
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
-    if (src == 0)
-	return;
-
-    PIXMAN_IMAGE_GET_LINE (
-	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
-    xmm_src = _mm_unpacklo_epi8 (
-	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
-    xmm_alpha = expand_alpha_1x128 (xmm_src);
-    mmx_src   = xmm_src;
-    mmx_alpha = xmm_alpha;
-
-    while (height--)
-    {
-	int w = width;
-	const uint32_t *pm = (uint32_t *)mask_line;
-	uint32_t *pd = (uint32_t *)dst_line;
-
-	dst_line += dst_stride;
-	mask_line += mask_stride;
-
-	while (w && (unsigned long)pd & 15)
-	{
-	    m = *pm++;
-
-	    if (m)
-	    {
-		d = *pd;
-		mmx_mask = unpack_32_1x128 (m);
-		mmx_dest = unpack_32_1x128 (d);
-
-		*pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
-		                                  &mmx_alpha,
-		                                  &mmx_mask,
-		                                  &mmx_dest));
-	    }
-
-	    pd++;
-	    w--;
-	}
-
-	while (w >= 4)
-	{
-	    xmm_mask = load_128_unaligned ((__m128i*)pm);
-
-	    pack_cmp =
-		_mm_movemask_epi8 (
-		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
-
-	    /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
-	    if (pack_cmp != 0xffff)
-	    {
-		xmm_dst = load_128_aligned ((__m128i*)pd);
-
-		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
-		in_over_2x128 (&xmm_src, &xmm_src,
-			       &xmm_alpha, &xmm_alpha,
-			       &xmm_mask_lo, &xmm_mask_hi,
-			       &xmm_dst_lo, &xmm_dst_hi);
-
-		save_128_aligned (
-		    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-	    }
-
-	    pd += 4;
-	    pm += 4;
-	    w -= 4;
-	}
-
-	while (w)
-	{
-	    m = *pm++;
-
-	    if (m)
-	    {
-		d = *pd;
-		mmx_mask = unpack_32_1x128 (m);
-		mmx_dest = unpack_32_1x128 (d);
-
-		*pd = pack_1x128_32 (
-		    in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
-	    }
-
-	    pd++;
-	    w--;
-	}
-    }
-
-}
-
-static void
-sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 pixman_image_t *         src_image,
-                                 pixman_image_t *         mask_image,
-                                 pixman_image_t *         dst_image,
-                                 int32_t                  src_x,
-                                 int32_t                  src_y,
-                                 int32_t                  mask_x,
-                                 int32_t                  mask_y,
-                                 int32_t                  dest_x,
-                                 int32_t                  dest_y,
-                                 int32_t                  width,
-                                 int32_t                  height)
-{
-    uint32_t    *dst_line, *dst;
-    uint32_t    *src_line, *src;
-    uint32_t mask;
-    int32_t w;
-    int dst_stride, src_stride;
-
-    __m128i xmm_mask;
-    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
-    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-    __m128i xmm_alpha_lo, xmm_alpha_hi;
-
-    PIXMAN_IMAGE_GET_LINE (
-	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
-
-    xmm_mask = create_mask_16_128 (mask >> 24);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
-
-	while (w && (unsigned long)dst & 15)
-	{
-	    uint32_t s = *src++;
-
-	    if (s)
-	    {
-		uint32_t d = *dst;
-		
-		__m128i ms = unpack_32_1x128 (s);
-		__m128i alpha    = expand_alpha_1x128 (ms);
-		__m128i dest     = xmm_mask;
-		__m128i alpha_dst = unpack_32_1x128 (d);
-		
-		*dst = pack_1x128_32 (
-		    in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
-	    }
-	    dst++;
-	    w--;
-	}
-
-	while (w >= 4)
-	{
-	    xmm_src = load_128_unaligned ((__m128i*)src);
-
-	    if (!is_zero (xmm_src))
-	    {
-		xmm_dst = load_128_aligned ((__m128i*)dst);
-		
-		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
-				    &xmm_alpha_lo, &xmm_alpha_hi);
-		
-		in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
-			       &xmm_alpha_lo, &xmm_alpha_hi,
-			       &xmm_mask, &xmm_mask,
-			       &xmm_dst_lo, &xmm_dst_hi);
-		
-		save_128_aligned (
-		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-	    }
-		
-	    dst += 4;
-	    src += 4;
-	    w -= 4;
-	}
-
-	while (w)
-	{
-	    uint32_t s = *src++;
-
-	    if (s)
-	    {
-		uint32_t d = *dst;
-		
-		__m128i ms = unpack_32_1x128 (s);
-		__m128i alpha = expand_alpha_1x128 (ms);
-		__m128i mask  = xmm_mask;
-		__m128i dest  = unpack_32_1x128 (d);
-		
-		*dst = pack_1x128_32 (
-		    in_over_1x128 (&ms, &alpha, &mask, &dest));
-	    }
-
-	    dst++;
-	    w--;
-	}
-    }
-
-}
-
-static void
-sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
-			      pixman_op_t              op,
-			      pixman_image_t *         src_image,
-			      pixman_image_t *         mask_image,
-			      pixman_image_t *         dst_image,
-			      int32_t                  src_x,
-			      int32_t                  src_y,
-			      int32_t                  mask_x,
-			      int32_t                  mask_y,
-			      int32_t                  dest_x,
-			      int32_t                  dest_y,
-			      int32_t                  width,
-			      int32_t                  height)
-{
-    uint32_t    *dst_line, *dst;
-    uint32_t    *src_line, *src;
-    int32_t w;
-    int dst_stride, src_stride;
-
-
-    PIXMAN_IMAGE_GET_LINE (
-	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
-
-	while (w && (unsigned long)dst & 15)
-	{
-	    *dst++ = *src++ | 0xff000000;
-	    w--;
-	}
-
-	while (w >= 16)
-	{
-	    __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
-	    
-	    xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
-	    xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
-	    xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
-	    xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
-	    
-	    save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
-	    save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
-	    save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
-	    save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
-	    
-	    dst += 16;
-	    src += 16;
-	    w -= 16;
-	}
-
-	while (w)
-	{
-	    *dst++ = *src++ | 0xff000000;
-	    w--;
-	}
-    }
-
-}
-
-static void
-sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 pixman_image_t *         src_image,
-                                 pixman_image_t *         mask_image,
-                                 pixman_image_t *         dst_image,
-                                 int32_t                  src_x,
-                                 int32_t                  src_y,
-                                 int32_t                  mask_x,
-                                 int32_t                  mask_y,
-                                 int32_t                  dest_x,
-                                 int32_t                  dest_y,
-                                 int32_t                  width,
-                                 int32_t                  height)
-{
-    uint32_t    *dst_line, *dst;
-    uint32_t    *src_line, *src;
-    uint32_t mask;
-    int dst_stride, src_stride;
-    int32_t w;
-
-    __m128i xmm_mask, xmm_alpha;
-    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
-    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-
-    PIXMAN_IMAGE_GET_LINE (
-	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
-
-    xmm_mask = create_mask_16_128 (mask >> 24);
-    xmm_alpha = mask_00ff;
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
-
-	while (w && (unsigned long)dst & 15)
-	{
-	    uint32_t s = (*src++) | 0xff000000;
-	    uint32_t d = *dst;
-
-	    __m128i src   = unpack_32_1x128 (s);
-	    __m128i alpha = xmm_alpha;
-	    __m128i mask  = xmm_mask;
-	    __m128i dest  = unpack_32_1x128 (d);
-
-	    *dst++ = pack_1x128_32 (
-		in_over_1x128 (&src, &alpha, &mask, &dest));
-
-	    w--;
-	}
-
-	while (w >= 4)
-	{
-	    xmm_src = _mm_or_si128 (
-		load_128_unaligned ((__m128i*)src), mask_ff000000);
-	    xmm_dst = load_128_aligned ((__m128i*)dst);
-
-	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
-	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
-			   &xmm_alpha, &xmm_alpha,
-			   &xmm_mask, &xmm_mask,
-			   &xmm_dst_lo, &xmm_dst_hi);
-
-	    save_128_aligned (
-		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-	    dst += 4;
-	    src += 4;
-	    w -= 4;
-
-	}
-
-	while (w)
-	{
-	    uint32_t s = (*src++) | 0xff000000;
-	    uint32_t d = *dst;
-
-	    __m128i src  = unpack_32_1x128 (s);
-	    __m128i alpha = xmm_alpha;
-	    __m128i mask  = xmm_mask;
-	    __m128i dest  = unpack_32_1x128 (d);
-
-	    *dst++ = pack_1x128_32 (
-		in_over_1x128 (&src, &alpha, &mask, &dest));
-
-	    w--;
-	}
-    }
-
-}
-
-static void
-sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
-                               pixman_op_t              op,
-                               pixman_image_t *         src_image,
-                               pixman_image_t *         mask_image,
-                               pixman_image_t *         dst_image,
-                               int32_t                  src_x,
-                               int32_t                  src_y,
-                               int32_t                  mask_x,
-                               int32_t                  mask_y,
-                               int32_t                  dest_x,
-                               int32_t                  dest_y,
-                               int32_t                  width,
-                               int32_t                  height)
-{
-    int dst_stride, src_stride;
-    uint32_t    *dst_line, *dst;
-    uint32_t    *src_line, *src;
-
-    PIXMAN_IMAGE_GET_LINE (
-	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-    dst = dst_line;
-    src = src_line;
-
-    while (height--)
-    {
-	sse2_combine_over_u (imp, op, dst, src, NULL, width);
-
-	dst += dst_stride;
-	src += src_stride;
-    }
-}
-
-static force_inline uint16_t
-composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
-{
-    __m128i ms;
-
-    ms = unpack_32_1x128 (src);
-    return pack_565_32_16 (
-	pack_1x128_32 (
-	    over_1x128 (
-		ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
-}
-
-static void
-sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
-                               pixman_op_t              op,
-                               pixman_image_t *         src_image,
-                               pixman_image_t *         mask_image,
-                               pixman_image_t *         dst_image,
-                               int32_t                  src_x,
-                               int32_t                  src_y,
-                               int32_t                  mask_x,
-                               int32_t                  mask_y,
-                               int32_t                  dest_x,
-                               int32_t                  dest_y,
-                               int32_t                  width,
-                               int32_t                  height)
-{
-    uint16_t    *dst_line, *dst, d;
-    uint32_t    *src_line, *src, s;
-    int dst_stride, src_stride;
-    int32_t w;
-
-    __m128i xmm_alpha_lo, xmm_alpha_hi;
-    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
-    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
-
-    PIXMAN_IMAGE_GET_LINE (
-	dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-    while (height--)
-    {
-	dst = dst_line;
-	src = src_line;
-
-	dst_line += dst_stride;
-	src_line += src_stride;
-	w = width;
-
-	/* Align dst on a 16-byte boundary */
-	while (w &&
-	       ((unsigned long)dst & 15))
-	{
-	    s = *src++;
-	    d = *dst;
-
-	    *dst++ = composite_over_8888_0565pixel (s, d);
-	    w--;
-	}
-
-	/* It's a 8 pixel loop */
-	while (w >= 8)
-	{
-	    /* I'm loading unaligned because I'm not sure
-	     * about the address alignment.
-	     */
-	    xmm_src = load_128_unaligned ((__m128i*) src);
-	    xmm_dst = load_128_aligned ((__m128i*) dst);
-
-	    /* Unpacking */
-	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-	    unpack_565_128_4x128 (xmm_dst,
-				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
-	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
-				&xmm_alpha_lo, &xmm_alpha_hi);
-
-	    /* I'm loading next 4 pixels from memory
-	     * before to optimze the memory read.
-	     */
-	    xmm_src = load_128_unaligned ((__m128i*) (src + 4));
-
-	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
-			&xmm_alpha_lo, &xmm_alpha_hi,
-			&xmm_dst0, &xmm_dst1);
-
-	    /* Unpacking */
-	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
-				&xmm_alpha_lo, &xmm_alpha_hi);
-
-	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
-			&xmm_alpha_lo, &xmm_alpha_hi,
-			&xmm_dst2, &xmm_dst3);
-
-	    save_128_aligned (
-		(__m128i*)dst, pack_565_4x128_128 (
-		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
-
-	    w -= 8;
-	    dst += 8;
-	    src += 8;
-	}
-
-	while (w--)
-	{
-	    s = *src++;
-	    d = *dst;
-
-	    *dst++ = composite_over_8888_0565pixel (s, d);
-	}
-    }
-
-}
-
-static void
-sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
-                              pixman_op_t              op,
-                              pixman_image_t *         src_image,
-                              pixman_image_t *         mask_image,
-                              pixman_image_t *         dst_image,
-                              int32_t                  src_x,
-                              int32_t                  src_y,
-                              int32_t                  mask_x,
-                              int32_t                  mask_y,
-                              int32_t                  dest_x,
-                              int32_t                  dest_y,
-                              int32_t                  width,
-                              int32_t                  height)
-{
-    uint32_t src, srca;
-    uint32_t *dst_line, *dst;
-    uint8_t *mask_line, *mask;
-    int dst_stride, mask_stride;
-    int32_t w;
-    uint32_t m, d;
-
-    __m128i xmm_src, xmm_alpha, xmm_def;
-    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
-    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
-
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
-    srca = src >> 24;
-    if (src == 0)
-	return;
-
-    PIXMAN_IMAGE_GET_LINE (
-	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
-    xmm_def = create_mask_2x32_128 (src, src);
-    xmm_src = expand_pixel_32_1x128 (src);
-    xmm_alpha = expand_alpha_1x128 (xmm_src);
-    mmx_src   = xmm_src;
-    mmx_alpha = xmm_alpha;
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	mask = mask_line;
-	mask_line += mask_stride;
-	w = width;
-
-	while (w && (unsigned long)dst & 15)
-	{
-	    uint8_t m = *mask++;
-
-	    if (m)
-	    {
-		d = *dst;
-		mmx_mask = expand_pixel_8_1x128 (m);
-		mmx_dest = unpack_32_1x128 (d);
-
-		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
-		                                   &mmx_alpha,
-		                                   &mmx_mask,
-		                                   &mmx_dest));
-	    }
-
-	    w--;
-	    dst++;
-	}
-
-	while (w >= 4)
-	{
-	    m = *((uint32_t*)mask);
-
-	    if (srca == 0xff && m == 0xffffffff)
-	    {
-		save_128_aligned ((__m128i*)dst, xmm_def);
-	    }
-	    else if (m)
-	    {
-		xmm_dst = load_128_aligned ((__m128i*) dst);
-		xmm_mask = unpack_32_1x128 (m);
-		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
-
-		/* Unpacking */
-		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
-		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
-					&xmm_mask_lo, &xmm_mask_hi);
-
-		in_over_2x128 (&xmm_src, &xmm_src,
-			       &xmm_alpha, &xmm_alpha,
-			       &xmm_mask_lo, &xmm_mask_hi,
-			       &xmm_dst_lo, &xmm_dst_hi);
-
-		save_128_aligned (
-		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-	    }
-
-	    w -= 4;
-	    dst += 4;
-	    mask += 4;
-	}
-
-	while (w)
-	{
-	    uint8_t m = *mask++;
-
-	    if (m)
-	    {
-		d = *dst;
-		mmx_mask = expand_pixel_8_1x128 (m);
-		mmx_dest = unpack_32_1x128 (d);
-
-		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
-		                                   &mmx_alpha,
-		                                   &mmx_mask,
-		                                   &mmx_dest));
-	    }
-
-	    w--;
-	    dst++;
-	}
-    }
-
-}
-
-static pixman_bool_t
-pixman_fill_sse2 (uint32_t *bits,
-                  int       stride,
-                  int       bpp,
-                  int       x,
-                  int       y,
-                  int       width,
-                  int       height,
-                  uint32_t  data)
-{
-    uint32_t byte_width;
-    uint8_t         *byte_line;
-
-    __m128i xmm_def;
-
-    if (bpp == 8)
-    {
-	uint8_t b;
-	uint16_t w;
-
-	stride = stride * (int) sizeof (uint32_t) / 1;
-	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
-	byte_width = width;
-	stride *= 1;
-
-	b = data & 0xff;
-	w = (b << 8) | b;
-	data = (w << 16) | w;
-    }
-    else if (bpp == 16)
-    {
-	stride = stride * (int) sizeof (uint32_t) / 2;
-	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
-	byte_width = 2 * width;
-	stride *= 2;
-
-        data = (data & 0xffff) * 0x00010001;
-    }
-    else if (bpp == 32)
-    {
-	stride = stride * (int) sizeof (uint32_t) / 4;
-	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
-	byte_width = 4 * width;
-	stride *= 4;
-    }
-    else
-    {
-	return FALSE;
-    }
-
-    xmm_def = create_mask_2x32_128 (data, data);
-
-    while (height--)
-    {
-	int w;
-	uint8_t *d = byte_line;
-	byte_line += stride;
-	w = byte_width;
-
-	while (w >= 1 && ((unsigned long)d & 1))
-	{
-	    *(uint8_t *)d = data;
-	    w -= 1;
-	    d += 1;
-	}
-
-	while (w >= 2 && ((unsigned long)d & 3))
-	{
-	    *(uint16_t *)d = data;
-	    w -= 2;
-	    d += 2;
-	}
-
-	while (w >= 4 && ((unsigned long)d & 15))
-	{
-	    *(uint32_t *)d = data;
-
-	    w -= 4;
-	    d += 4;
-	}
-
-	while (w >= 128)
-	{
-	    save_128_aligned ((__m128i*)(d),     xmm_def);
-	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
-	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
-	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);
-	    save_128_aligned ((__m128i*)(d + 64),  xmm_def);
-	    save_128_aligned ((__m128i*)(d + 80),  xmm_def);
-	    save_128_aligned ((__m128i*)(d + 96),  xmm_def);
-	    save_128_aligned ((__m128i*)(d + 112), xmm_def);
-
-	    d += 128;
-	    w -= 128;
-	}
-
-	if (w >= 64)
-	{
-	    save_128_aligned ((__m128i*)(d),     xmm_def);
-	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
-	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
-	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);
-
-	    d += 64;
-	    w -= 64;
-	}
-
-	if (w >= 32)
-	{
-	    save_128_aligned ((__m128i*)(d),     xmm_def);
-	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
-
-	    d += 32;
-	    w -= 32;
-	}
-
-	if (w >= 16)
-	{
-	    save_128_aligned ((__m128i*)(d),     xmm_def);
-
-	    d += 16;
-	    w -= 16;
-	}
-
-	while (w >= 4)
-	{
-	    *(uint32_t *)d = data;
-
-	    w -= 4;
-	    d += 4;
-	}
-
-	if (w >= 2)
-	{
-	    *(uint16_t *)d = data;
-	    w -= 2;
-	    d += 2;
-	}
-
-	if (w >= 1)
-	{
-	    *(uint8_t *)d = data;
-	    w -= 1;
-	    d += 1;
-	}
-    }
-
-    return TRUE;
-}
-
-static void
-sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
-                             pixman_op_t              op,
-                             pixman_image_t *         src_image,
-                             pixman_image_t *         mask_image,
-                             pixman_image_t *         dst_image,
-                             int32_t                  src_x,
-                             int32_t                  src_y,
-                             int32_t                  mask_x,
-                             int32_t                  mask_y,
-                             int32_t                  dest_x,
-                             int32_t                  dest_y,
-                             int32_t                  width,
-                             int32_t                  height)
-{
-    uint32_t src, srca;
-    uint32_t    *dst_line, *dst;
-    uint8_t     *mask_line, *mask;
-    int dst_stride, mask_stride;
-    int32_t w;
-    uint32_t m;
-
-    __m128i xmm_src, xmm_def;
-    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
-    srca = src >> 24;
-    if (src == 0)
-    {
-	pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
-	                  PIXMAN_FORMAT_BPP (dst_image->bits.format),
-	                  dest_x, dest_y, width, height, 0);
-	return;
-    }
-
-    PIXMAN_IMAGE_GET_LINE (
-	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
-    xmm_def = create_mask_2x32_128 (src, src);
-    xmm_src = expand_pixel_32_1x128 (src);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	mask = mask_line;
-	mask_line += mask_stride;
-	w = width;
-
-	while (w && (unsigned long)dst & 15)
-	{
-	    uint8_t m = *mask++;
-
-	    if (m)
-	    {
-		*dst = pack_1x128_32 (
-		    pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
-	    }
-	    else
-	    {
-		*dst = 0;
-	    }
-
-	    w--;
-	    dst++;
-	}
-
-	while (w >= 4)
-	{
-	    m = *((uint32_t*)mask);
-
-	    if (srca == 0xff && m == 0xffffffff)
-	    {
-		save_128_aligned ((__m128i*)dst, xmm_def);
-	    }
-	    else if (m)
-	    {
-		xmm_mask = unpack_32_1x128 (m);
-		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
-
-		/* Unpacking */
-		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
-		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
-					&xmm_mask_lo, &xmm_mask_hi);
-
-		pix_multiply_2x128 (&xmm_src, &xmm_src,
-				    &xmm_mask_lo, &xmm_mask_hi,
-				    &xmm_mask_lo, &xmm_mask_hi);
-
-		save_128_aligned (
-		    (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
-	    }
-	    else
-	    {
-		save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
-	    }
-
-	    w -= 4;
-	    dst += 4;
-	    mask += 4;
-	}
-
-	while (w)
-	{
-	    uint8_t m = *mask++;
-
-	    if (m)
-	    {
-		*dst = pack_1x128_32 (
-		    pix_multiply_1x128 (
-			xmm_src, expand_pixel_8_1x128 (m)));
-	    }
-	    else
-	    {
-		*dst = 0;
-	    }
-
-	    w--;
-	    dst++;
-	}
-    }
-
-}
-
-static void
-sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
-                              pixman_op_t              op,
-                              pixman_image_t *         src_image,
-                              pixman_image_t *         mask_image,
-                              pixman_image_t *         dst_image,
-                              int32_t                  src_x,
-                              int32_t                  src_y,
-                              int32_t                  mask_x,
-                              int32_t                  mask_y,
-                              int32_t                  dest_x,
-                              int32_t                  dest_y,
-                              int32_t                  width,
-                              int32_t                  height)
-{
-    uint32_t src, srca;
-    uint16_t    *dst_line, *dst, d;
-    uint8_t     *mask_line, *mask;
-    int dst_stride, mask_stride;
-    int32_t w;
-    uint32_t m;
-    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
-
-    __m128i xmm_src, xmm_alpha;
-    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
-
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
-    srca = src >> 24;
-    if (src == 0)
-	return;
-
-    PIXMAN_IMAGE_GET_LINE (
-	dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
-    xmm_src = expand_pixel_32_1x128 (src);
-    xmm_alpha = expand_alpha_1x128 (xmm_src);
-    mmx_src = xmm_src;
-    mmx_alpha = xmm_alpha;
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	mask = mask_line;
-	mask_line += mask_stride;
-	w = width;
-
-	while (w && (unsigned long)dst & 15)
-	{
-	    m = *mask++;
-
-	    if (m)
-	    {
-		d = *dst;
-		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
-		mmx_dest = expand565_16_1x128 (d);
-
-		*dst = pack_565_32_16 (
-		    pack_1x128_32 (
-			in_over_1x128 (
-			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
-	    }
-
-	    w--;
-	    dst++;
-	}
-
-	while (w >= 8)
-	{
-	    xmm_dst = load_128_aligned ((__m128i*) dst);
-	    unpack_565_128_4x128 (xmm_dst,
-				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
-
-	    m = *((uint32_t*)mask);
-	    mask += 4;
-
-	    if (m)
-	    {
-		xmm_mask = unpack_32_1x128 (m);
-		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
-
-		/* Unpacking */
-		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
-		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
-					&xmm_mask_lo, &xmm_mask_hi);
-
-		in_over_2x128 (&xmm_src, &xmm_src,
-			       &xmm_alpha, &xmm_alpha,
-			       &xmm_mask_lo, &xmm_mask_hi,
-			       &xmm_dst0, &xmm_dst1);
-	    }
-
-	    m = *((uint32_t*)mask);
-	    mask += 4;
-
-	    if (m)
-	    {
-		xmm_mask = unpack_32_1x128 (m);
-		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
-
-		/* Unpacking */
-		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
-		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
-					&xmm_mask_lo, &xmm_mask_hi);
-		in_over_2x128 (&xmm_src, &xmm_src,
-			       &xmm_alpha, &xmm_alpha,
-			       &xmm_mask_lo, &xmm_mask_hi,
-			       &xmm_dst2, &xmm_dst3);
-	    }
-
-	    save_128_aligned (
-		(__m128i*)dst, pack_565_4x128_128 (
-		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
-
-	    w -= 8;
-	    dst += 8;
-	}
-
-	while (w)
-	{
-	    m = *mask++;
-
-	    if (m)
-	    {
-		d = *dst;
-		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
-		mmx_dest = expand565_16_1x128 (d);
-
-		*dst = pack_565_32_16 (
-		    pack_1x128_32 (
-			in_over_1x128 (
-			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
-	    }
-
-	    w--;
-	    dst++;
-	}
-    }
-
-}
-
-static void
-sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 pixman_image_t *         src_image,
-                                 pixman_image_t *         mask_image,
-                                 pixman_image_t *         dst_image,
-                                 int32_t                  src_x,
-                                 int32_t                  src_y,
-                                 int32_t                  mask_x,
-                                 int32_t                  mask_y,
-                                 int32_t                  dest_x,
-                                 int32_t                  dest_y,
-                                 int32_t                  width,
-                                 int32_t                  height)
-{
-    uint16_t    *dst_line, *dst, d;
-    uint32_t    *src_line, *src, s;
-    int dst_stride, src_stride;
-    int32_t w;
-    uint32_t opaque, zero;
-
-    __m128i ms;
-    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
-    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
-
-    PIXMAN_IMAGE_GET_LINE (
-	dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
-
-	while (w && (unsigned long)dst & 15)
-	{
-	    s = *src++;
-	    d = *dst;
-
-	    ms = unpack_32_1x128 (s);
-
-	    *dst++ = pack_565_32_16 (
-		pack_1x128_32 (
-		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
-	    w--;
-	}
-
-	while (w >= 8)
-	{
-	    /* First round */
-	    xmm_src = load_128_unaligned ((__m128i*)src);
-	    xmm_dst = load_128_aligned  ((__m128i*)dst);
-
-	    opaque = is_opaque (xmm_src);
-	    zero = is_zero (xmm_src);
-
-	    unpack_565_128_4x128 (xmm_dst,
-				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
-	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-
-	    /* preload next round*/
-	    xmm_src = load_128_unaligned ((__m128i*)(src + 4));
-
-	    if (opaque)
-	    {
-		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
-				     &xmm_dst0, &xmm_dst1);
-	    }
-	    else if (!zero)
-	    {
-		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
-					&xmm_dst0, &xmm_dst1);
-	    }
-
-	    /* Second round */
-	    opaque = is_opaque (xmm_src);
-	    zero = is_zero (xmm_src);
-
-	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-
-	    if (opaque)
-	    {
-		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
-				     &xmm_dst2, &xmm_dst3);
-	    }
-	    else if (!zero)
-	    {
-		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
-					&xmm_dst2, &xmm_dst3);
-	    }
-
-	    save_128_aligned (
-		(__m128i*)dst, pack_565_4x128_128 (
-		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
-
-	    w -= 8;
-	    src += 8;
-	    dst += 8;
-	}
-
-	while (w)
-	{
-	    s = *src++;
-	    d = *dst;
-
-	    ms = unpack_32_1x128 (s);
-
-	    *dst++ = pack_565_32_16 (
-		pack_1x128_32 (
-		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
-	    w--;
-	}
-    }
-
-}
-
-static void
-sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 pixman_image_t *         src_image,
-                                 pixman_image_t *         mask_image,
-                                 pixman_image_t *         dst_image,
-                                 int32_t                  src_x,
-                                 int32_t                  src_y,
-                                 int32_t                  mask_x,
-                                 int32_t                  mask_y,
-                                 int32_t                  dest_x,
-                                 int32_t                  dest_y,
-                                 int32_t                  width,
-                                 int32_t                  height)
-{
-    uint32_t    *dst_line, *dst, d;
-    uint32_t    *src_line, *src, s;
-    int dst_stride, src_stride;
-    int32_t w;
-    uint32_t opaque, zero;
-
-    __m128i xmm_src_lo, xmm_src_hi;
-    __m128i xmm_dst_lo, xmm_dst_hi;
-
-    PIXMAN_IMAGE_GET_LINE (
-	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
-
-	while (w && (unsigned long)dst & 15)
-	{
-	    s = *src++;
-	    d = *dst;
-
-	    *dst++ = pack_1x128_32 (
-		over_rev_non_pre_1x128 (
-		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
-
-	    w--;
-	}
-
-	while (w >= 4)
-	{
-	    xmm_src_hi = load_128_unaligned ((__m128i*)src);
-
-	    opaque = is_opaque (xmm_src_hi);
-	    zero = is_zero (xmm_src_hi);
-
-	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-
-	    if (opaque)
-	    {
-		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
-				     &xmm_dst_lo, &xmm_dst_hi);
-
-		save_128_aligned (
-		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-	    }
-	    else if (!zero)
-	    {
-		xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
-
-		unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
-		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
-					&xmm_dst_lo, &xmm_dst_hi);
-
-		save_128_aligned (
-		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-	    }
-
-	    w -= 4;
-	    dst += 4;
-	    src += 4;
-	}
-
-	while (w)
-	{
-	    s = *src++;
-	    d = *dst;
-
-	    *dst++ = pack_1x128_32 (
-		over_rev_non_pre_1x128 (
-		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
-
-	    w--;
-	}
-    }
-
-}
-
-static void
-sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
-                                    pixman_op_t              op,
-                                    pixman_image_t *         src_image,
-                                    pixman_image_t *         mask_image,
-                                    pixman_image_t *         dst_image,
-                                    int32_t                  src_x,
-                                    int32_t                  src_y,
-                                    int32_t                  mask_x,
-                                    int32_t                  mask_y,
-                                    int32_t                  dest_x,
-                                    int32_t                  dest_y,
-                                    int32_t                  width,
-                                    int32_t                  height)
-{
-    uint32_t src;
-    uint16_t    *dst_line, *dst, d;
-    uint32_t    *mask_line, *mask, m;
-    int dst_stride, mask_stride;
-    int w;
-    uint32_t pack_cmp;
-
-    __m128i xmm_src, xmm_alpha;
-    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
-
-    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
-
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
-    if (src == 0)
-	return;
-
-    PIXMAN_IMAGE_GET_LINE (
-	dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
-    xmm_src = expand_pixel_32_1x128 (src);
-    xmm_alpha = expand_alpha_1x128 (xmm_src);
-    mmx_src = xmm_src;
-    mmx_alpha = xmm_alpha;
-
-    while (height--)
-    {
-	w = width;
-	mask = mask_line;
-	dst = dst_line;
-	mask_line += mask_stride;
-	dst_line += dst_stride;
-
-	while (w && ((unsigned long)dst & 15))
-	{
-	    m = *(uint32_t *) mask;
-
-	    if (m)
-	    {
-		d = *dst;
-		mmx_mask = unpack_32_1x128 (m);
-		mmx_dest = expand565_16_1x128 (d);
-
-		*dst = pack_565_32_16 (
-		    pack_1x128_32 (
-			in_over_1x128 (
-			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
-	    }
-
-	    w--;
-	    dst++;
-	    mask++;
-	}
-
-	while (w >= 8)
-	{
-	    /* First round */
-	    xmm_mask = load_128_unaligned ((__m128i*)mask);
-	    xmm_dst = load_128_aligned ((__m128i*)dst);
-
-	    pack_cmp = _mm_movemask_epi8 (
-		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
-
-	    unpack_565_128_4x128 (xmm_dst,
-				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
-	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
-	    /* preload next round */
-	    xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
-
-	    /* preload next round */
-	    if (pack_cmp != 0xffff)
-	    {
-		in_over_2x128 (&xmm_src, &xmm_src,
-			       &xmm_alpha, &xmm_alpha,
-			       &xmm_mask_lo, &xmm_mask_hi,
-			       &xmm_dst0, &xmm_dst1);
-	    }
-
-	    /* Second round */
-	    pack_cmp = _mm_movemask_epi8 (
-		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
-
-	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
-	    if (pack_cmp != 0xffff)
-	    {
-		in_over_2x128 (&xmm_src, &xmm_src,
-			       &xmm_alpha, &xmm_alpha,
-			       &xmm_mask_lo, &xmm_mask_hi,
-			       &xmm_dst2, &xmm_dst3);
-	    }
-
-	    save_128_aligned (
-		(__m128i*)dst, pack_565_4x128_128 (
-		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
-
-	    w -= 8;
-	    dst += 8;
-	    mask += 8;
-	}
-
-	while (w)
-	{
-	    m = *(uint32_t *) mask;
-
-	    if (m)
-	    {
-		d = *dst;
-		mmx_mask = unpack_32_1x128 (m);
-		mmx_dest = expand565_16_1x128 (d);
-
-		*dst = pack_565_32_16 (
-		    pack_1x128_32 (
-			in_over_1x128 (
-			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
-	    }
-
-	    w--;
-	    dst++;
-	    mask++;
-	}
-    }
-
-}
-
-static void
-sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         pixman_image_t *         src_image,
-                         pixman_image_t *         mask_image,
-                         pixman_image_t *         dst_image,
-                         int32_t                  src_x,
-                         int32_t                  src_y,
-                         int32_t                  mask_x,
-                         int32_t                  mask_y,
-                         int32_t                  dest_x,
-                         int32_t                  dest_y,
-                         int32_t                  width,
-                         int32_t                  height)
-{
-    uint8_t     *dst_line, *dst;
-    uint8_t     *mask_line, *mask;
-    int dst_stride, mask_stride;
-    uint32_t d, m;
-    uint32_t src;
-    uint8_t sa;
-    int32_t w;
-
-    __m128i xmm_alpha;
-    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-
-    PIXMAN_IMAGE_GET_LINE (
-	dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
-    sa = src >> 24;
-
-    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	mask = mask_line;
-	mask_line += mask_stride;
-	w = width;
-
-	while (w && ((unsigned long)dst & 15))
-	{
-	    m = (uint32_t) *mask++;
-	    d = (uint32_t) *dst;
-
-	    *dst++ = (uint8_t) pack_1x128_32 (
-		pix_multiply_1x128 (
-		    pix_multiply_1x128 (xmm_alpha,
-				       unpack_32_1x128 (m)),
-		    unpack_32_1x128 (d)));
-	    w--;
-	}
-
-	while (w >= 16)
-	{
-	    xmm_mask = load_128_unaligned ((__m128i*)mask);
-	    xmm_dst = load_128_aligned ((__m128i*)dst);
-
-	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
-	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
-				&xmm_mask_lo, &xmm_mask_hi,
-				&xmm_mask_lo, &xmm_mask_hi);
-
-	    pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
-				&xmm_dst_lo, &xmm_dst_hi,
-				&xmm_dst_lo, &xmm_dst_hi);
-
-	    save_128_aligned (
-		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-	    mask += 16;
-	    dst += 16;
-	    w -= 16;
-	}
-
-	while (w)
-	{
-	    m = (uint32_t) *mask++;
-	    d = (uint32_t) *dst;
-
-	    *dst++ = (uint8_t) pack_1x128_32 (
-		pix_multiply_1x128 (
-		    pix_multiply_1x128 (
-			xmm_alpha, unpack_32_1x128 (m)),
-		    unpack_32_1x128 (d)));
-	    w--;
-	}
-    }
-
-}
-
-static void
-sse2_composite_in_n_8 (pixman_implementation_t *imp,
-		       pixman_op_t              op,
-		       pixman_image_t *         src_image,
-		       pixman_image_t *         mask_image,
-		       pixman_image_t *         dst_image,
-		       int32_t                  src_x,
-		       int32_t                  src_y,
-		       int32_t                  mask_x,
-		       int32_t                  mask_y,
-		       int32_t                  dest_x,
-		       int32_t                  dest_y,
-		       int32_t                  width,
-		       int32_t                  height)
-{
-    uint8_t     *dst_line, *dst;
-    int dst_stride;
-    uint32_t d;
-    uint32_t src;
-    int32_t w;
-
-    __m128i xmm_alpha;
-    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-
-    PIXMAN_IMAGE_GET_LINE (
-	dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
-    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
-
-    src = src >> 24;
-
-    if (src == 0xff)
-	return;
-
-    if (src == 0x00)
-    {
-	pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
-		     8, dest_x, dest_y, width, height, src);
-
-	return;
-    }
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	w = width;
-
-	while (w && ((unsigned long)dst & 15))
-	{
-	    d = (uint32_t) *dst;
-
-	    *dst++ = (uint8_t) pack_1x128_32 (
-		pix_multiply_1x128 (
-		    xmm_alpha,
-		    unpack_32_1x128 (d)));
-	    w--;
-	}
-
-	while (w >= 16)
-	{
-	    xmm_dst = load_128_aligned ((__m128i*)dst);
-
-	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-	    
-	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
-				&xmm_dst_lo, &xmm_dst_hi,
-				&xmm_dst_lo, &xmm_dst_hi);
-
-	    save_128_aligned (
-		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-	    dst += 16;
-	    w -= 16;
-	}
-
-	while (w)
-	{
-	    d = (uint32_t) *dst;
-
-	    *dst++ = (uint8_t) pack_1x128_32 (
-		pix_multiply_1x128 (
-		    xmm_alpha,
-		    unpack_32_1x128 (d)));
-	    w--;
-	}
-    }
-
-}
-
-static void
-sse2_composite_in_8_8 (pixman_implementation_t *imp,
-                       pixman_op_t              op,
-                       pixman_image_t *         src_image,
-                       pixman_image_t *         mask_image,
-                       pixman_image_t *         dst_image,
-                       int32_t                  src_x,
-                       int32_t                  src_y,
-                       int32_t                  mask_x,
-                       int32_t                  mask_y,
-                       int32_t                  dest_x,
-                       int32_t                  dest_y,
-                       int32_t                  width,
-                       int32_t                  height)
-{
-    uint8_t     *dst_line, *dst;
-    uint8_t     *src_line, *src;
-    int src_stride, dst_stride;
-    int32_t w;
-    uint32_t s, d;
-
-    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
-    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-
-    PIXMAN_IMAGE_GET_LINE (
-	dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
-
-	while (w && ((unsigned long)dst & 15))
-	{
-	    s = (uint32_t) *src++;
-	    d = (uint32_t) *dst;
-
-	    *dst++ = (uint8_t) pack_1x128_32 (
-		pix_multiply_1x128 (
-		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
-	    w--;
-	}
-
-	while (w >= 16)
-	{
-	    xmm_src = load_128_unaligned ((__m128i*)src);
-	    xmm_dst = load_128_aligned ((__m128i*)dst);
-
-	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
-	    pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
-				&xmm_dst_lo, &xmm_dst_hi,
-				&xmm_dst_lo, &xmm_dst_hi);
-
-	    save_128_aligned (
-		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-	    src += 16;
-	    dst += 16;
-	    w -= 16;
-	}
-
-	while (w)
-	{
-	    s = (uint32_t) *src++;
-	    d = (uint32_t) *dst;
-
-	    *dst++ = (uint8_t) pack_1x128_32 (
-		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
-	    w--;
-	}
-    }
-
-}
-
-static void
-sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
-			  pixman_op_t              op,
-			  pixman_image_t *         src_image,
-			  pixman_image_t *         mask_image,
-			  pixman_image_t *         dst_image,
-			  int32_t                  src_x,
-			  int32_t                  src_y,
-			  int32_t                  mask_x,
-			  int32_t                  mask_y,
-			  int32_t                  dest_x,
-			  int32_t                  dest_y,
-			  int32_t                  width,
-			  int32_t                  height)
-{
-    uint8_t     *dst_line, *dst;
-    uint8_t     *mask_line, *mask;
-    int dst_stride, mask_stride;
-    int32_t w;
-    uint32_t src;
-    uint8_t sa;
-    uint32_t m, d;
-
-    __m128i xmm_alpha;
-    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-
-    PIXMAN_IMAGE_GET_LINE (
-	dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
-    sa = src >> 24;
-
-    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	mask = mask_line;
-	mask_line += mask_stride;
-	w = width;
-
-	while (w && ((unsigned long)dst & 15))
-	{
-	    m = (uint32_t) *mask++;
-	    d = (uint32_t) *dst;
-
-	    *dst++ = (uint8_t) pack_1x128_32 (
-		_mm_adds_epu16 (
-		    pix_multiply_1x128 (
-			xmm_alpha, unpack_32_1x128 (m)),
-		    unpack_32_1x128 (d)));
-	    w--;
-	}
-
-	while (w >= 16)
-	{
-	    xmm_mask = load_128_unaligned ((__m128i*)mask);
-	    xmm_dst = load_128_aligned ((__m128i*)dst);
-
-	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
-	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
-				&xmm_mask_lo, &xmm_mask_hi,
-				&xmm_mask_lo, &xmm_mask_hi);
-
-	    xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
-	    xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
-
-	    save_128_aligned (
-		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-	    mask += 16;
-	    dst += 16;
-	    w -= 16;
-	}
-
-	while (w)
-	{
-	    m = (uint32_t) *mask++;
-	    d = (uint32_t) *dst;
-
-	    *dst++ = (uint8_t) pack_1x128_32 (
-		_mm_adds_epu16 (
-		    pix_multiply_1x128 (
-			xmm_alpha, unpack_32_1x128 (m)),
-		    unpack_32_1x128 (d)));
-
-	    w--;
-	}
-    }
-
-}
-
-static void
-sse2_composite_add_n_8 (pixman_implementation_t *imp,
-			pixman_op_t              op,
-			pixman_image_t *         src_image,
-			pixman_image_t *         mask_image,
-			pixman_image_t *         dst_image,
-			int32_t                  src_x,
-			int32_t                  src_y,
-			int32_t                  mask_x,
-			int32_t                  mask_y,
-			int32_t                  dest_x,
-			int32_t                  dest_y,
-			int32_t                  width,
-			int32_t                  height)
-{
-    uint8_t     *dst_line, *dst;
-    int dst_stride;
-    int32_t w;
-    uint32_t src;
-
-    __m128i xmm_src;
-
-    PIXMAN_IMAGE_GET_LINE (
-	dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
-    src >>= 24;
-
-    if (src == 0x00)
-	return;
-
-    if (src == 0xff)
-    {
-	pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
-		     8, dest_x, dest_y, width, height, 0xff);
-
-	return;
-    }
-
-    src = (src << 24) | (src << 16) | (src << 8) | src;
-    xmm_src = _mm_set_epi32 (src, src, src, src);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	w = width;
-
-	while (w && ((unsigned long)dst & 15))
-	{
-	    *dst = (uint8_t)_mm_cvtsi128_si32 (
-		_mm_adds_epu8 (
-		    xmm_src,
-		    _mm_cvtsi32_si128 (*dst)));
-
-	    w--;
-	    dst++;
-	}
-
-	while (w >= 16)
-	{
-	    save_128_aligned (
-		(__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
-
-	    dst += 16;
-	    w -= 16;
-	}
-
-	while (w)
-	{
-	    *dst = (uint8_t)_mm_cvtsi128_si32 (
-		_mm_adds_epu8 (
-		    xmm_src,
-		    _mm_cvtsi32_si128 (*dst)));
-
-	    w--;
-	    dst++;
-	}
-    }
-
-}
-
-static void
-sse2_composite_add_8_8 (pixman_implementation_t *imp,
-			pixman_op_t              op,
-			pixman_image_t *         src_image,
-			pixman_image_t *         mask_image,
-			pixman_image_t *         dst_image,
-			int32_t                  src_x,
-			int32_t                  src_y,
-			int32_t                  mask_x,
-			int32_t                  mask_y,
-			int32_t                  dest_x,
-			int32_t                  dest_y,
-			int32_t                  width,
-			int32_t                  height)
-{
-    uint8_t     *dst_line, *dst;
-    uint8_t     *src_line, *src;
-    int dst_stride, src_stride;
-    int32_t w;
-    uint16_t t;
-
-    PIXMAN_IMAGE_GET_LINE (
-	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-	dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-
-    while (height--)
-    {
-	dst = dst_line;
-	src = src_line;
-
-	dst_line += dst_stride;
-	src_line += src_stride;
-	w = width;
-
-	/* Small head */
-	while (w && (unsigned long)dst & 3)
-	{
-	    t = (*dst) + (*src++);
-	    *dst++ = t | (0 - (t >> 8));
-	    w--;
-	}
-
-	sse2_combine_add_u (imp, op,
-			    (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
-
-	/* Small tail */
-	dst += w & 0xfffc;
-	src += w & 0xfffc;
-
-	w &= 3;
-
-	while (w)
-	{
-	    t = (*dst) + (*src++);
-	    *dst++ = t | (0 - (t >> 8));
-	    w--;
-	}
-    }
-
-}
-
-static void
-sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
-                              pixman_op_t              op,
-                              pixman_image_t *         src_image,
-                              pixman_image_t *         mask_image,
-                              pixman_image_t *         dst_image,
-                              int32_t                  src_x,
-                              int32_t                  src_y,
-                              int32_t                  mask_x,
-                              int32_t                  mask_y,
-                              int32_t                  dest_x,
-                              int32_t                  dest_y,
-                              int32_t                  width,
-                              int32_t                  height)
-{
-    uint32_t    *dst_line, *dst;
-    uint32_t    *src_line, *src;
-    int dst_stride, src_stride;
-
-    PIXMAN_IMAGE_GET_LINE (
-	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-
-	sse2_combine_add_u (imp, op, dst, src, NULL, width);
-    }
-
-}
-
-static pixman_bool_t
-pixman_blt_sse2 (uint32_t *src_bits,
-                 uint32_t *dst_bits,
-                 int       src_stride,
-                 int       dst_stride,
-                 int       src_bpp,
-                 int       dst_bpp,
-                 int       src_x,
-                 int       src_y,
-                 int       dst_x,
-                 int       dst_y,
-                 int       width,
-                 int       height)
-{
-    uint8_t *   src_bytes;
-    uint8_t *   dst_bytes;
-    int byte_width;
-
-    if (src_bpp != dst_bpp)
-	return FALSE;
-
-    if (src_bpp == 16)
-    {
-	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
-	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
-	src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
-	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
-	byte_width = 2 * width;
-	src_stride *= 2;
-	dst_stride *= 2;
-    }
-    else if (src_bpp == 32)
-    {
-	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
-	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
-	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
-	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
-	byte_width = 4 * width;
-	src_stride *= 4;
-	dst_stride *= 4;
-    }
-    else
-    {
-	return FALSE;
-    }
-
-    while (height--)
-    {
-	int w;
-	uint8_t *s = src_bytes;
-	uint8_t *d = dst_bytes;
-	src_bytes += src_stride;
-	dst_bytes += dst_stride;
-	w = byte_width;
-
-	while (w >= 2 && ((unsigned long)d & 3))
-	{
-	    *(uint16_t *)d = *(uint16_t *)s;
-	    w -= 2;
-	    s += 2;
-	    d += 2;
-	}
-
-	while (w >= 4 && ((unsigned long)d & 15))
-	{
-	    *(uint32_t *)d = *(uint32_t *)s;
-
-	    w -= 4;
-	    s += 4;
-	    d += 4;
-	}
-
-	while (w >= 64)
-	{
-	    __m128i xmm0, xmm1, xmm2, xmm3;
-
-	    xmm0 = load_128_unaligned ((__m128i*)(s));
-	    xmm1 = load_128_unaligned ((__m128i*)(s + 16));
-	    xmm2 = load_128_unaligned ((__m128i*)(s + 32));
-	    xmm3 = load_128_unaligned ((__m128i*)(s + 48));
-
-	    save_128_aligned ((__m128i*)(d),    xmm0);
-	    save_128_aligned ((__m128i*)(d + 16), xmm1);
-	    save_128_aligned ((__m128i*)(d + 32), xmm2);
-	    save_128_aligned ((__m128i*)(d + 48), xmm3);
-
-	    s += 64;
-	    d += 64;
-	    w -= 64;
-	}
-
-	while (w >= 16)
-	{
-	    save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
-
-	    w -= 16;
-	    d += 16;
-	    s += 16;
-	}
-
-	while (w >= 4)
-	{
-	    *(uint32_t *)d = *(uint32_t *)s;
-
-	    w -= 4;
-	    s += 4;
-	    d += 4;
-	}
-
-	if (w >= 2)
-	{
-	    *(uint16_t *)d = *(uint16_t *)s;
-	    w -= 2;
-	    s += 2;
-	    d += 2;
-	}
-    }
-
-
-    return TRUE;
-}
-
-static void
-sse2_composite_copy_area (pixman_implementation_t *imp,
-                          pixman_op_t              op,
-                          pixman_image_t *         src_image,
-                          pixman_image_t *         mask_image,
-                          pixman_image_t *         dst_image,
-                          int32_t                  src_x,
-                          int32_t                  src_y,
-                          int32_t                  mask_x,
-                          int32_t                  mask_y,
-                          int32_t                  dest_x,
-                          int32_t                  dest_y,
-                          int32_t                  width,
-                          int32_t                  height)
-{
-    pixman_blt_sse2 (src_image->bits.bits,
-                     dst_image->bits.bits,
-                     src_image->bits.rowstride,
-                     dst_image->bits.rowstride,
-                     PIXMAN_FORMAT_BPP (src_image->bits.format),
-                     PIXMAN_FORMAT_BPP (dst_image->bits.format),
-                     src_x, src_y, dest_x, dest_y, width, height);
-}
-
-static void
-sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 pixman_image_t *         src_image,
-                                 pixman_image_t *         mask_image,
-                                 pixman_image_t *         dst_image,
-                                 int32_t                  src_x,
-                                 int32_t                  src_y,
-                                 int32_t                  mask_x,
-                                 int32_t                  mask_y,
-                                 int32_t                  dest_x,
-                                 int32_t                  dest_y,
-                                 int32_t                  width,
-                                 int32_t                  height)
-{
-    uint32_t    *src, *src_line, s;
-    uint32_t    *dst, *dst_line, d;
-    uint8_t         *mask, *mask_line;
-    uint32_t m;
-    int src_stride, mask_stride, dst_stride;
-    int32_t w;
-    __m128i ms;
-
-    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
-    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
-    PIXMAN_IMAGE_GET_LINE (
-	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-    while (height--)
-    {
-        src = src_line;
-        src_line += src_stride;
-        dst = dst_line;
-        dst_line += dst_stride;
-        mask = mask_line;
-        mask_line += mask_stride;
-
-        w = width;
-
-        while (w && (unsigned long)dst & 15)
-        {
-            s = 0xff000000 | *src++;
-            m = (uint32_t) *mask++;
-            d = *dst;
-            ms = unpack_32_1x128 (s);
-
-            if (m != 0xff)
-            {
-		__m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
-		__m128i md = unpack_32_1x128 (d);
-
-                ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
-            }
-
-            *dst++ = pack_1x128_32 (ms);
-            w--;
-        }
-
-        while (w >= 4)
-        {
-            m = *(uint32_t*) mask;
-            xmm_src = _mm_or_si128 (
-		load_128_unaligned ((__m128i*)src), mask_ff000000);
-
-            if (m == 0xffffffff)
-            {
-                save_128_aligned ((__m128i*)dst, xmm_src);
-            }
-            else
-            {
-                xmm_dst = load_128_aligned ((__m128i*)dst);
-
-                xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
-
-                unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
-                expand_alpha_rev_2x128 (
-		    xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
-                in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
-			       &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
-			       &xmm_dst_lo, &xmm_dst_hi);
-
-                save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-            }
-
-            src += 4;
-            dst += 4;
-            mask += 4;
-            w -= 4;
-        }
-
-        while (w)
-        {
-            m = (uint32_t) *mask++;
-
-            if (m)
-            {
-                s = 0xff000000 | *src;
-
-                if (m == 0xff)
-                {
-                    *dst = s;
-                }
-                else
-                {
-		    __m128i ma, md, ms;
-
-                    d = *dst;
-
-		    ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
-		    md = unpack_32_1x128 (d);
-		    ms = unpack_32_1x128 (s);
-
-                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
-                }
-
-            }
-
-            src++;
-            dst++;
-            w--;
-        }
-    }
-
-}
-
-static void
-sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 pixman_image_t *         src_image,
-                                 pixman_image_t *         mask_image,
-                                 pixman_image_t *         dst_image,
-                                 int32_t                  src_x,
-                                 int32_t                  src_y,
-                                 int32_t                  mask_x,
-                                 int32_t                  mask_y,
-                                 int32_t                  dest_x,
-                                 int32_t                  dest_y,
-                                 int32_t                  width,
-                                 int32_t                  height)
-{
-    uint32_t    *src, *src_line, s;
-    uint32_t    *dst, *dst_line, d;
-    uint8_t         *mask, *mask_line;
-    uint32_t m;
-    int src_stride, mask_stride, dst_stride;
-    int32_t w;
-
-    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
-    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
-    PIXMAN_IMAGE_GET_LINE (
-	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-    while (height--)
-    {
-        src = src_line;
-        src_line += src_stride;
-        dst = dst_line;
-        dst_line += dst_stride;
-        mask = mask_line;
-        mask_line += mask_stride;
-
-        w = width;
-
-        while (w && (unsigned long)dst & 15)
-        {
-	    uint32_t sa;
-
-            s = *src++;
-            m = (uint32_t) *mask++;
-            d = *dst;
-
-	    sa = s >> 24;
-
-	    if (m)
-	    {
-		if (sa == 0xff && m == 0xff)
-		{
-		    *dst = s;
-		}
-		else
-		{
-		    __m128i ms, md, ma, msa;
-
-		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
-		    ms = unpack_32_1x128 (s);
-		    md = unpack_32_1x128 (d);
-
-		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
-
-		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
-		}
-	    }
-
-	    dst++;
-            w--;
-        }
-
-        while (w >= 4)
-        {
-            m = *(uint32_t *) mask;
-
-	    if (m)
-	    {
-		xmm_src = load_128_unaligned ((__m128i*)src);
-
-		if (m == 0xffffffff && is_opaque (xmm_src))
-		{
-		    save_128_aligned ((__m128i *)dst, xmm_src);
-		}
-		else
-		{
-		    xmm_dst = load_128_aligned ((__m128i *)dst);
-
-		    xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
-
-		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
-		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
-		    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
-		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
-				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
-
-		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-		}
-	    }
-
-            src += 4;
-            dst += 4;
-            mask += 4;
-            w -= 4;
-        }
-
-        while (w)
-        {
-	    uint32_t sa;
-
-            s = *src++;
-            m = (uint32_t) *mask++;
-            d = *dst;
-
-	    sa = s >> 24;
-
-	    if (m)
-	    {
-		if (sa == 0xff && m == 0xff)
-		{
-		    *dst = s;
-		}
-		else
-		{
-		    __m128i ms, md, ma, msa;
-
-		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
-		    ms = unpack_32_1x128 (s);
-		    md = unpack_32_1x128 (d);
-
-		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
-
-		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
-		}
-	    }
-
-	    dst++;
-            w--;
-        }
-    }
-
-}
-
-static void
-sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
-				    pixman_op_t              op,
-				    pixman_image_t *         src_image,
-				    pixman_image_t *         mask_image,
-				    pixman_image_t *         dst_image,
-				    int32_t                  src_x,
-				    int32_t                  src_y,
-				    int32_t                  mask_x,
-				    int32_t                  mask_y,
-				    int32_t                  dest_x,
-				    int32_t                  dest_y,
-				    int32_t                  width,
-				    int32_t                  height)
-{
-    uint32_t src;
-    uint32_t    *dst_line, *dst;
-    __m128i xmm_src;
-    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-    __m128i xmm_dsta_hi, xmm_dsta_lo;
-    int dst_stride;
-    int32_t w;
-
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
-    if (src == 0)
-	return;
-
-    PIXMAN_IMAGE_GET_LINE (
-	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-
-    xmm_src = expand_pixel_32_1x128 (src);
-
-    while (height--)
-    {
-	dst = dst_line;
-
-	dst_line += dst_stride;
-	w = width;
-
-	while (w && (unsigned long)dst & 15)
-	{
-	    __m128i vd;
-
-	    vd = unpack_32_1x128 (*dst);
-
-	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
-					      xmm_src));
-	    w--;
-	    dst++;
-	}
-
-	while (w >= 4)
-	{
-	    __m128i tmp_lo, tmp_hi;
-
-	    xmm_dst = load_128_aligned ((__m128i*)dst);
-
-	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-	    expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
-
-	    tmp_lo = xmm_src;
-	    tmp_hi = xmm_src;
-
-	    over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
-			&xmm_dsta_lo, &xmm_dsta_hi,
-			&tmp_lo, &tmp_hi);
-
-	    save_128_aligned (
-		(__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
-
-	    w -= 4;
-	    dst += 4;
-	}
-
-	while (w)
-	{
-	    __m128i vd;
-
-	    vd = unpack_32_1x128 (*dst);
-
-	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
-					      xmm_src));
-	    w--;
-	    dst++;
-	}
-
-    }
-
-}
-
-static void
-sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
-				    pixman_op_t              op,
-				    pixman_image_t *         src_image,
-				    pixman_image_t *         mask_image,
-				    pixman_image_t *         dst_image,
-				    int32_t                  src_x,
-				    int32_t                  src_y,
-				    int32_t                  mask_x,
-				    int32_t                  mask_y,
-				    int32_t                  dest_x,
-				    int32_t                  dest_y,
-				    int32_t                  width,
-				    int32_t                  height)
-{
-    uint32_t    *src, *src_line, s;
-    uint32_t    *dst, *dst_line, d;
-    uint32_t    *mask, *mask_line;
-    uint32_t    m;
-    int src_stride, mask_stride, dst_stride;
-    int32_t w;
-
-    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
-    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
-    PIXMAN_IMAGE_GET_LINE (
-	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-    while (height--)
-    {
-        src = src_line;
-        src_line += src_stride;
-        dst = dst_line;
-        dst_line += dst_stride;
-        mask = mask_line;
-        mask_line += mask_stride;
-
-        w = width;
-
-        while (w && (unsigned long)dst & 15)
-        {
-	    uint32_t sa;
-
-            s = *src++;
-            m = (*mask++) >> 24;
-            d = *dst;
-
-	    sa = s >> 24;
-
-	    if (m)
-	    {
-		if (sa == 0xff && m == 0xff)
-		{
-		    *dst = s;
-		}
-		else
-		{
-		    __m128i ms, md, ma, msa;
-
-		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
-		    ms = unpack_32_1x128 (s);
-		    md = unpack_32_1x128 (d);
-
-		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
-
-		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
-		}
-	    }
-
-	    dst++;
-            w--;
-        }
-
-        while (w >= 4)
-        {
-	    xmm_mask = load_128_unaligned ((__m128i*)mask);
-
-	    if (!is_transparent (xmm_mask))
-	    {
-		xmm_src = load_128_unaligned ((__m128i*)src);
-
-		if (is_opaque (xmm_mask) && is_opaque (xmm_src))
-		{
-		    save_128_aligned ((__m128i *)dst, xmm_src);
-		}
-		else
-		{
-		    xmm_dst = load_128_aligned ((__m128i *)dst);
-
-		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
-		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
-		    expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
-		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
-				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
-
-		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-		}
-	    }
-
-            src += 4;
-            dst += 4;
-            mask += 4;
-            w -= 4;
-        }
-
-        while (w)
-        {
-	    uint32_t sa;
-
-            s = *src++;
-            m = (*mask++) >> 24;
-            d = *dst;
-
-	    sa = s >> 24;
-
-	    if (m)
-	    {
-		if (sa == 0xff && m == 0xff)
-		{
-		    *dst = s;
-		}
-		else
-		{
-		    __m128i ms, md, ma, msa;
-
-		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
-		    ms = unpack_32_1x128 (s);
-		    md = unpack_32_1x128 (d);
-
-		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
-
-		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
-		}
-	    }
-
-	    dst++;
-            w--;
-        }
-    }
-
-}
-
-/* A variant of 'sse2_combine_over_u' with minor tweaks */
-static force_inline void
-scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
-                                             const uint32_t* ps,
-                                             int32_t         w,
-                                             pixman_fixed_t  vx,
-                                             pixman_fixed_t  unit_x,
-                                             pixman_fixed_t  max_vx,
-                                             pixman_bool_t   fully_transparent_src)
-{
-    uint32_t s, d;
-    const uint32_t* pm = NULL;
-
-    __m128i xmm_dst_lo, xmm_dst_hi;
-    __m128i xmm_src_lo, xmm_src_hi;
-    __m128i xmm_alpha_lo, xmm_alpha_hi;
-
-    if (fully_transparent_src)
-	return;
-
-    /* Align dst on a 16-byte boundary */
-    while (w && ((unsigned long)pd & 15))
-    {
-	d = *pd;
-	s = combine1 (ps + (vx >> 16), pm);
-	vx += unit_x;
-
-	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
-	if (pm)
-	    pm++;
-	w--;
-    }
-
-    while (w >= 4)
-    {
-	__m128i tmp;
-	uint32_t tmp1, tmp2, tmp3, tmp4;
-
-	tmp1 = ps[vx >> 16];
-	vx += unit_x;
-	tmp2 = ps[vx >> 16];
-	vx += unit_x;
-	tmp3 = ps[vx >> 16];
-	vx += unit_x;
-	tmp4 = ps[vx >> 16];
-	vx += unit_x;
-
-	tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
-
-	xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
-
-	if (is_opaque (xmm_src_hi))
-	{
-	    save_128_aligned ((__m128i*)pd, xmm_src_hi);
-	}
-	else if (!is_zero (xmm_src_hi))
-	{
-	    xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
-	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-	    unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
-	    expand_alpha_2x128 (
-		xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
-
-	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
-			&xmm_alpha_lo, &xmm_alpha_hi,
-			&xmm_dst_lo, &xmm_dst_hi);
-
-	    /* rebuid the 4 pixel data and save*/
-	    save_128_aligned ((__m128i*)pd,
-			      pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-	}
-
-	w -= 4;
-	pd += 4;
-	if (pm)
-	    pm += 4;
-    }
-
-    while (w)
-    {
-	d = *pd;
-	s = combine1 (ps + (vx >> 16), pm);
-	vx += unit_x;
-
-	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
-	if (pm)
-	    pm++;
-
-	w--;
-    }
-}
-
-FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
-		       scaled_nearest_scanline_sse2_8888_8888_OVER,
-		       uint32_t, uint32_t, COVER)
-FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
-		       scaled_nearest_scanline_sse2_8888_8888_OVER,
-		       uint32_t, uint32_t, NONE)
-FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
-		       scaled_nearest_scanline_sse2_8888_8888_OVER,
-		       uint32_t, uint32_t, PAD)
-
-static force_inline void
-scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
-					       uint32_t *       dst,
-					       const uint32_t * src,
-					       int32_t          w,
-					       pixman_fixed_t   vx,
-					       pixman_fixed_t   unit_x,
-					       pixman_fixed_t   max_vx,
-					       pixman_bool_t    zero_src)
-{
-    __m128i xmm_mask;
-    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
-    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-    __m128i xmm_alpha_lo, xmm_alpha_hi;
-
-    if (zero_src || (*mask >> 24) == 0)
-	return;
-
-    xmm_mask = create_mask_16_128 (*mask >> 24);
-
-    while (w && (unsigned long)dst & 15)
-    {
-	uint32_t s = src[pixman_fixed_to_int (vx)];
-	vx += unit_x;
-
-	if (s)
-	{
-	    uint32_t d = *dst;
-
-	    __m128i ms = unpack_32_1x128 (s);
-	    __m128i alpha     = expand_alpha_1x128 (ms);
-	    __m128i dest      = xmm_mask;
-	    __m128i alpha_dst = unpack_32_1x128 (d);
-
-	    *dst = pack_1x128_32 (
-		in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
-	}
-	dst++;
-	w--;
-    }
-
-    while (w >= 4)
-    {
-	uint32_t tmp1, tmp2, tmp3, tmp4;
-
-	tmp1 = src[pixman_fixed_to_int (vx)];
-	vx += unit_x;
-	tmp2 = src[pixman_fixed_to_int (vx)];
-	vx += unit_x;
-	tmp3 = src[pixman_fixed_to_int (vx)];
-	vx += unit_x;
-	tmp4 = src[pixman_fixed_to_int (vx)];
-	vx += unit_x;
-
-	xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
-
-	if (!is_zero (xmm_src))
-	{
-	    xmm_dst = load_128_aligned ((__m128i*)dst);
-
-	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
-			        &xmm_alpha_lo, &xmm_alpha_hi);
-
-	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
-			   &xmm_alpha_lo, &xmm_alpha_hi,
-			   &xmm_mask, &xmm_mask,
-			   &xmm_dst_lo, &xmm_dst_hi);
-
-	    save_128_aligned (
-		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-	}
-
-	dst += 4;
-	w -= 4;
-    }
-
-    while (w)
-    {
-	uint32_t s = src[pixman_fixed_to_int (vx)];
-	vx += unit_x;
-
-	if (s)
-	{
-	    uint32_t d = *dst;
-
-	    __m128i ms = unpack_32_1x128 (s);
-	    __m128i alpha = expand_alpha_1x128 (ms);
-	    __m128i mask  = xmm_mask;
-	    __m128i dest  = unpack_32_1x128 (d);
-
-	    *dst = pack_1x128_32 (
-		in_over_1x128 (&ms, &alpha, &mask, &dest));
-	}
-
-	dst++;
-	w--;
-    }
-
-}
-
-FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
-			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
-			      uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
-FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
-			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
-			      uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
-FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
-			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
-			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
-
-static void
-bilinear_interpolate_line_sse2 (uint32_t *       out,
-                                const uint32_t * top,
-                                const uint32_t * bottom,
-                                int              wt,
-                                int              wb,
-                                pixman_fixed_t   x,
-                                pixman_fixed_t   ux,
-                                int              width)
-{
-    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);
-    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);
-    const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);
-    const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);
-    const __m128i xmm_ux = _mm_set_epi16 (ux, ux, ux, ux, ux, ux, ux, ux);
-    const __m128i xmm_zero = _mm_setzero_si128 ();
-    __m128i xmm_x = _mm_set_epi16 (x, x, x, x, x, x, x, x);
-    uint32_t pix1, pix2, pix3, pix4;
-
-    #define INTERPOLATE_ONE_PIXEL(pix)						\
-    do {									\
-	__m128i xmm_wh, xmm_lo, xmm_hi, a;					\
-	/* fetch 2x2 pixel block into sse2 register */				\
-	uint32_t tl = top [pixman_fixed_to_int (x)];				\
-	uint32_t tr = top [pixman_fixed_to_int (x) + 1];			\
-	uint32_t bl = bottom [pixman_fixed_to_int (x)];				\
-	uint32_t br = bottom [pixman_fixed_to_int (x) + 1];			\
-	a = _mm_set_epi32 (tr, tl, br, bl);					\
-        x += ux;								\
-	/* vertical interpolation */						\
-	a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero),	\
-					    xmm_wt),				\
-			   _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero),	\
-					    xmm_wb));				\
-	/* calculate horizontal weights */					\
-	xmm_wh = _mm_add_epi16 (xmm_addc,					\
-				_mm_xor_si128 (xmm_xorc,			\
-					       _mm_srli_epi16 (xmm_x, 8)));	\
-	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
-	/* horizontal interpolation */						\
-	xmm_lo = _mm_mullo_epi16 (a, xmm_wh);					\
-	xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);					\
-	a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),			\
-			   _mm_unpackhi_epi16 (xmm_lo, xmm_hi));		\
-	/* shift and pack the result */						\
-	a = _mm_srli_epi32 (a, 16);						\
-	a = _mm_packs_epi32 (a, a);						\
-	a = _mm_packus_epi16 (a, a);						\
-	pix = _mm_cvtsi128_si32 (a);						\
-    } while (0)
-
-    while ((width -= 4) >= 0)
-    {
-	INTERPOLATE_ONE_PIXEL (pix1);
-	INTERPOLATE_ONE_PIXEL (pix2);
-	INTERPOLATE_ONE_PIXEL (pix3);
-	INTERPOLATE_ONE_PIXEL (pix4);
-	*out++ = pix1;
-	*out++ = pix2;
-	*out++ = pix3;
-	*out++ = pix4;
-    }
-    if (width & 2)
-    {
-	INTERPOLATE_ONE_PIXEL (pix1);
-	INTERPOLATE_ONE_PIXEL (pix2);
-	*out++ = pix1;
-	*out++ = pix2;
-    }
-    if (width & 1)
-    {
-	INTERPOLATE_ONE_PIXEL (pix1);
-	*out = pix1;
-    }
-
-    #undef INTERPOLATE_ONE_PIXEL
-}
-
-static force_inline void
-scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
-					     const uint32_t * mask,
-					     const uint32_t * src_top,
-					     const uint32_t * src_bottom,
-					     int32_t          w,
-					     int              wt,
-					     int              wb,
-					     pixman_fixed_t   vx,
-					     pixman_fixed_t   unit_x,
-					     pixman_fixed_t   max_vx,
-					     pixman_bool_t    zero_src)
-{
-    bilinear_interpolate_line_sse2 (dst, src_top, src_bottom,
-				    wt, wb, vx, unit_x, w);
-}
-
-FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
-			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
-			       uint32_t, uint32_t, uint32_t,
-			       COVER, FALSE, FALSE)
-FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
-			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
-			       uint32_t, uint32_t, uint32_t,
-			       PAD, FALSE, FALSE)
-FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
-			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
-			       uint32_t, uint32_t, uint32_t,
-			       NONE, FALSE, FALSE)
-
-static const pixman_fast_path_t sse2_fast_paths[] =
-{
-    /* PIXMAN_OP_OVER */
-    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
-    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
-    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
-    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
-    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
-    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
-    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
-    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
-    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
-    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
-    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
-    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
-    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
-    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
-    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
-    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
-    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
-    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
-    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
-    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
-    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
-    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
-    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
-    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
-    
-    /* PIXMAN_OP_OVER_REVERSE */
-    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
-    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
-
-    /* PIXMAN_OP_ADD */
-    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
-    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
-    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
-    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
-    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
-    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
-
-    /* PIXMAN_OP_SRC */
-    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
-    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
-    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
-    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
-    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
-    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
-    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
-    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
-    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
-    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
-    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
-    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
-    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
-    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
-
-    /* PIXMAN_OP_IN */
-    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
-    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
-    PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
-
-    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
-    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
-    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
-    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
-    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
-    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
-    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
-    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
-    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
-    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
-    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
-    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
-
-    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
-    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
-    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
-    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
-
-    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
-    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
-    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
-
-    { PIXMAN_OP_NONE },
-};
-
-static pixman_bool_t
-sse2_blt (pixman_implementation_t *imp,
-          uint32_t *               src_bits,
-          uint32_t *               dst_bits,
-          int                      src_stride,
-          int                      dst_stride,
-          int                      src_bpp,
-          int                      dst_bpp,
-          int                      src_x,
-          int                      src_y,
-          int                      dst_x,
-          int                      dst_y,
-          int                      width,
-          int                      height)
-{
-    if (!pixman_blt_sse2 (
-            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
-            src_x, src_y, dst_x, dst_y, width, height))
-
-    {
-	return _pixman_implementation_blt (
-	    imp->delegate,
-	    src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
-	    src_x, src_y, dst_x, dst_y, width, height);
-    }
-
-    return TRUE;
-}
-
-#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
-__attribute__((__force_align_arg_pointer__))
-#endif
-static pixman_bool_t
-sse2_fill (pixman_implementation_t *imp,
-           uint32_t *               bits,
-           int                      stride,
-           int                      bpp,
-           int                      x,
-           int                      y,
-           int                      width,
-           int                      height,
-           uint32_t xor)
-{
-    if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
-    {
-	return _pixman_implementation_fill (
-	    imp->delegate, bits, stride, bpp, x, y, width, height, xor);
-    }
-
-    return TRUE;
-}
-
-static uint32_t *
-sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
-{
-    int w = iter->width;
-    __m128i ff000000 = mask_ff000000;
-    uint32_t *dst = iter->buffer;
-    uint32_t *src = (uint32_t *)iter->bits;
-
-    iter->bits += iter->stride;
-
-    while (w && ((unsigned long)dst) & 0x0f)
-    {
-	*dst++ = (*src++) | 0xff000000;
-	w--;
-    }
-
-    while (w >= 4)
-    {
-	save_128_aligned (
-	    (__m128i *)dst, _mm_or_si128 (
-		load_128_unaligned ((__m128i *)src), ff000000));
-
-	dst += 4;
-	src += 4;
-	w -= 4;
-    }
-
-    while (w)
-    {
-	*dst++ = (*src++) | 0xff000000;
-	w--;
-    }
-
-    return iter->buffer;
-}
-
-static uint32_t *
-sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
-{
-    int w = iter->width;
-    uint32_t *dst = iter->buffer;
-    uint16_t *src = (uint16_t *)iter->bits;
-    __m128i ff000000 = mask_ff000000;
-
-    iter->bits += iter->stride;
-
-    while (w && ((unsigned long)dst) & 0x0f)
-    {
-	uint16_t s = *src++;
-
-	*dst++ = CONVERT_0565_TO_8888 (s);
-	w--;
-    }
-
-    while (w >= 8)
-    {
-	__m128i lo, hi, s;
-
-	s = _mm_loadu_si128 ((__m128i *)src);
-
-	lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
-	hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
-
-	save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
-	save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
-
-	dst += 8;
-	src += 8;
-	w -= 8;
-    }
-
-    while (w)
-    {
-	uint16_t s = *src++;
-
-	*dst++ = CONVERT_0565_TO_8888 (s);
-	w--;
-    }
-
-    return iter->buffer;
-}
-
-static uint32_t *
-sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
-{
-    int w = iter->width;
-    uint32_t *dst = iter->buffer;
-    uint8_t *src = iter->bits;
-    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
-
-    iter->bits += iter->stride;
-
-    while (w && (((unsigned long)dst) & 15))
-    {
-        *dst++ = *(src++) << 24;
-        w--;
-    }
-
-    while (w >= 16)
-    {
-	xmm0 = _mm_loadu_si128((__m128i *)src);
-
-	xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
-	xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
-	xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
-	xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
-	xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
-	xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
-
-	_mm_store_si128(((__m128i *)(dst +  0)), xmm3);
-	_mm_store_si128(((__m128i *)(dst +  4)), xmm4);
-	_mm_store_si128(((__m128i *)(dst +  8)), xmm5);
-	_mm_store_si128(((__m128i *)(dst + 12)), xmm6);
-
-	dst += 16;
-	src += 16;
-	w -= 16;
-    }
-
-    while (w)
-    {
-	*dst++ = *(src++) << 24;
-	w--;
-    }
-
-    return iter->buffer;
-}
-
-typedef struct
-{
-    pixman_format_code_t	format;
-    pixman_iter_get_scanline_t	get_scanline;
-} fetcher_info_t;
-
-static const fetcher_info_t fetchers[] =
-{
-    { PIXMAN_x8r8g8b8,		sse2_fetch_x8r8g8b8 },
-    { PIXMAN_r5g6b5,		sse2_fetch_r5g6b5 },
-    { PIXMAN_a8,		sse2_fetch_a8 },
-    { PIXMAN_null }
-};
-
-static void
-sse2_src_iter_init (pixman_implementation_t *imp,
-		    pixman_iter_t *iter,
-		    pixman_image_t *image,
-		    int x, int y, int width, int height,
-		    uint8_t *buffer, iter_flags_t flags)
-{
-#define FLAGS								\
-    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
-
-    if ((flags & ITER_NARROW)				&&
-	(image->common.flags & FLAGS) == FLAGS		&&
-	x >= 0 && y >= 0				&&
-	x + width <= image->bits.width			&&
-	y + height <= image->bits.height)
-    {
-	const fetcher_info_t *f;
-
-	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
-	{
-	    if (image->common.extended_format_code == f->format)
-	    {
-		uint8_t *b = (uint8_t *)image->bits.bits;
-		int s = image->bits.rowstride * 4;
-
-		iter->bits = b + s * y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
-		iter->stride = s;
-		iter->width = width;
-		iter->buffer = (uint32_t *)buffer;
-
-		iter->get_scanline = f->get_scanline;
-		return;
-	    }
-	}
-    }
-
-    _pixman_implementation_src_iter_init (
-	imp->delegate, iter, image, x, y, width, height, buffer, flags);
-}
-
-#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
-__attribute__((__force_align_arg_pointer__))
-#endif
-pixman_implementation_t *
-_pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
-{
-    pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
-
-    /* SSE2 constants */
-    mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
-    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
-    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
-    mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
-    mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
-    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
-    mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
-    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
-    mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
-    mask_0080 = create_mask_16_128 (0x0080);
-    mask_00ff = create_mask_16_128 (0x00ff);
-    mask_0101 = create_mask_16_128 (0x0101);
-    mask_ffff = create_mask_16_128 (0xffff);
-    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
-    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
-
-    /* Set up function pointers */
-    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
-    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
-    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
-    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
-    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
-    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
-    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
-    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
-    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
-    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
-
-    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
-
-    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
-    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
-    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
-    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
-    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
-    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
-    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
-    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
-    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
-    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
-    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
-
-    imp->blt = sse2_blt;
-    imp->fill = sse2_fill;
-
-    imp->src_iter_init = sse2_src_iter_init;
-
-    return imp;
-}
+/*
+ * Copyright © 2008 Rodrigo Kumpera
+ * Copyright © 2008 André Tupinambá
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  Red Hat makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Rodrigo Kumpera (kumpera@gmail.com)
+ *          André Tupinambá (andrelrt@gmail.com)
+ *
+ * Based on work by Owen Taylor and Søren Sandmann
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
+#include <emmintrin.h> /* for SSE2 intrinsics */
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include "pixman-fast-path.h"
+
+static __m128i mask_0080;
+static __m128i mask_00ff;
+static __m128i mask_0101;
+static __m128i mask_ffff;
+static __m128i mask_ff000000;
+static __m128i mask_alpha;
+
+static __m128i mask_565_r;
+static __m128i mask_565_g1, mask_565_g2;
+static __m128i mask_565_b;
+static __m128i mask_red;
+static __m128i mask_green;
+static __m128i mask_blue;
+
+static __m128i mask_565_fix_rb;
+static __m128i mask_565_fix_g;
+
+static force_inline __m128i
+unpack_32_1x128 (uint32_t data)
+{
+    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
+}
+
+static force_inline void
+unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
+{
+    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
+    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
+}
+
+static force_inline __m128i
+unpack_565_to_8888 (__m128i lo)
+{
+    __m128i r, g, b, rb, t;
+
+    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
+    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
+    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
+
+    rb = _mm_or_si128 (r, b);
+    t  = _mm_and_si128 (rb, mask_565_fix_rb);
+    t  = _mm_srli_epi32 (t, 5);
+    rb = _mm_or_si128 (rb, t);
+
+    t  = _mm_and_si128 (g, mask_565_fix_g);
+    t  = _mm_srli_epi32 (t, 6);
+    g  = _mm_or_si128 (g, t);
+
+    return _mm_or_si128 (rb, g);
+}
+
+static force_inline void
+unpack_565_128_4x128 (__m128i  data,
+                      __m128i* data0,
+                      __m128i* data1,
+                      __m128i* data2,
+                      __m128i* data3)
+{
+    __m128i lo, hi;
+
+    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
+    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
+
+    lo = unpack_565_to_8888 (lo);
+    hi = unpack_565_to_8888 (hi);
+
+    unpack_128_2x128 (lo, data0, data1);
+    unpack_128_2x128 (hi, data2, data3);
+}
+
+static force_inline uint16_t
+pack_565_32_16 (uint32_t pixel)
+{
+    return (uint16_t) (((pixel >> 8) & 0xf800) |
+		       ((pixel >> 5) & 0x07e0) |
+		       ((pixel >> 3) & 0x001f));
+}
+
+static force_inline __m128i
+pack_2x128_128 (__m128i lo, __m128i hi)
+{
+    return _mm_packus_epi16 (lo, hi);
+}
+
+static force_inline __m128i
+pack_565_2x128_128 (__m128i lo, __m128i hi)
+{
+    __m128i data;
+    __m128i r, g1, g2, b;
+
+    data = pack_2x128_128 (lo, hi);
+
+    r  = _mm_and_si128 (data, mask_565_r);
+    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
+    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
+    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
+
+    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
+}
+
+static force_inline __m128i
+pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
+{
+    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
+			     pack_565_2x128_128 (*xmm2, *xmm3));
+}
+
+static force_inline int
+is_opaque (__m128i x)
+{
+    __m128i ffs = _mm_cmpeq_epi8 (x, x);
+
+    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
+}
+
+static force_inline int
+is_zero (__m128i x)
+{
+    return _mm_movemask_epi8 (
+	_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
+}
+
+static force_inline int
+is_transparent (__m128i x)
+{
+    return (_mm_movemask_epi8 (
+		_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
+}
+
+static force_inline __m128i
+expand_pixel_32_1x128 (uint32_t data)
+{
+    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
+}
+
+static force_inline __m128i
+expand_alpha_1x128 (__m128i data)
+{
+    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
+						     _MM_SHUFFLE (3, 3, 3, 3)),
+				_MM_SHUFFLE (3, 3, 3, 3));
+}
+
+static force_inline void
+expand_alpha_2x128 (__m128i  data_lo,
+                    __m128i  data_hi,
+                    __m128i* alpha_lo,
+                    __m128i* alpha_hi)
+{
+    __m128i lo, hi;
+
+    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
+    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
+
+    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
+    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
+}
+
+static force_inline void
+expand_alpha_rev_2x128 (__m128i  data_lo,
+                        __m128i  data_hi,
+                        __m128i* alpha_lo,
+                        __m128i* alpha_hi)
+{
+    __m128i lo, hi;
+
+    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
+    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
+    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
+    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
+}
+
+static force_inline void
+pix_multiply_2x128 (__m128i* data_lo,
+                    __m128i* data_hi,
+                    __m128i* alpha_lo,
+                    __m128i* alpha_hi,
+                    __m128i* ret_lo,
+                    __m128i* ret_hi)
+{
+    __m128i lo, hi;
+
+    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
+    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
+    lo = _mm_adds_epu16 (lo, mask_0080);
+    hi = _mm_adds_epu16 (hi, mask_0080);
+    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
+    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
+}
+
+static force_inline void
+pix_add_multiply_2x128 (__m128i* src_lo,
+                        __m128i* src_hi,
+                        __m128i* alpha_dst_lo,
+                        __m128i* alpha_dst_hi,
+                        __m128i* dst_lo,
+                        __m128i* dst_hi,
+                        __m128i* alpha_src_lo,
+                        __m128i* alpha_src_hi,
+                        __m128i* ret_lo,
+                        __m128i* ret_hi)
+{
+    __m128i t1_lo, t1_hi;
+    __m128i t2_lo, t2_hi;
+
+    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
+    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
+
+    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
+    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
+}
+
+static force_inline void
+negate_2x128 (__m128i  data_lo,
+              __m128i  data_hi,
+              __m128i* neg_lo,
+              __m128i* neg_hi)
+{
+    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
+    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
+}
+
+static force_inline void
+invert_colors_2x128 (__m128i  data_lo,
+                     __m128i  data_hi,
+                     __m128i* inv_lo,
+                     __m128i* inv_hi)
+{
+    __m128i lo, hi;
+
+    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
+    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
+    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
+    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
+}
+
+static force_inline void
+over_2x128 (__m128i* src_lo,
+            __m128i* src_hi,
+            __m128i* alpha_lo,
+            __m128i* alpha_hi,
+            __m128i* dst_lo,
+            __m128i* dst_hi)
+{
+    __m128i t1, t2;
+
+    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
+
+    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
+
+    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
+    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
+}
+
+static force_inline void
+over_rev_non_pre_2x128 (__m128i  src_lo,
+                        __m128i  src_hi,
+                        __m128i* dst_lo,
+                        __m128i* dst_hi)
+{
+    __m128i lo, hi;
+    __m128i alpha_lo, alpha_hi;
+
+    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
+
+    lo = _mm_or_si128 (alpha_lo, mask_alpha);
+    hi = _mm_or_si128 (alpha_hi, mask_alpha);
+
+    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
+
+    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
+
+    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
+}
+
+static force_inline void
+in_over_2x128 (__m128i* src_lo,
+               __m128i* src_hi,
+               __m128i* alpha_lo,
+               __m128i* alpha_hi,
+               __m128i* mask_lo,
+               __m128i* mask_hi,
+               __m128i* dst_lo,
+               __m128i* dst_hi)
+{
+    __m128i s_lo, s_hi;
+    __m128i a_lo, a_hi;
+
+    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
+    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
+
+    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
+}
+
+/* load 4 pixels from a 16-byte boundary aligned address */
+static force_inline __m128i
+load_128_aligned (__m128i* src)
+{
+    return _mm_load_si128 (src);
+}
+
+/* load 4 pixels from an unaligned address */
+static force_inline __m128i
+load_128_unaligned (const __m128i* src)
+{
+    return _mm_loadu_si128 (src);
+}
+
+/* save 4 pixels using Write Combining memory on a 16-byte
+ * boundary aligned address
+ */
+static force_inline void
+save_128_write_combining (__m128i* dst,
+                          __m128i  data)
+{
+    _mm_stream_si128 (dst, data);
+}
+
+/* save 4 pixels on a 16-byte boundary aligned address */
+static force_inline void
+save_128_aligned (__m128i* dst,
+                  __m128i  data)
+{
+    _mm_store_si128 (dst, data);
+}
+
+/* save 4 pixels on an unaligned address */
+static force_inline void
+save_128_unaligned (__m128i* dst,
+                    __m128i  data)
+{
+    _mm_storeu_si128 (dst, data);
+}
+
+static force_inline __m128i
+load_32_1x128 (uint32_t data)
+{
+    return _mm_cvtsi32_si128 (data);
+}
+
+static force_inline __m128i
+expand_alpha_rev_1x128 (__m128i data)
+{
+    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
+}
+
+static force_inline __m128i
+expand_pixel_8_1x128 (uint8_t data)
+{
+    return _mm_shufflelo_epi16 (
+	unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
+}
+
+static force_inline __m128i
+pix_multiply_1x128 (__m128i data,
+		    __m128i alpha)
+{
+    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
+					    mask_0080),
+			    mask_0101);
+}
+
+static force_inline __m128i
+pix_add_multiply_1x128 (__m128i* src,
+			__m128i* alpha_dst,
+			__m128i* dst,
+			__m128i* alpha_src)
+{
+    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
+    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
+
+    return _mm_adds_epu8 (t1, t2);
+}
+
+static force_inline __m128i
+negate_1x128 (__m128i data)
+{
+    return _mm_xor_si128 (data, mask_00ff);
+}
+
+static force_inline __m128i
+invert_colors_1x128 (__m128i data)
+{
+    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
+}
+
+static force_inline __m128i
+over_1x128 (__m128i src, __m128i alpha, __m128i dst)
+{
+    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
+}
+
+static force_inline __m128i
+in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
+{
+    return over_1x128 (pix_multiply_1x128 (*src, *mask),
+		       pix_multiply_1x128 (*alpha, *mask),
+		       *dst);
+}
+
+static force_inline __m128i
+over_rev_non_pre_1x128 (__m128i src, __m128i dst)
+{
+    __m128i alpha = expand_alpha_1x128 (src);
+
+    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
+					   _mm_or_si128 (alpha, mask_alpha)),
+		       alpha,
+		       dst);
+}
+
+static force_inline uint32_t
+pack_1x128_32 (__m128i data)
+{
+    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
+}
+
+static force_inline __m128i
+expand565_16_1x128 (uint16_t pixel)
+{
+    __m128i m = _mm_cvtsi32_si128 (pixel);
+
+    m = unpack_565_to_8888 (m);
+
+    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
+}
+
+static force_inline uint32_t
+core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
+{
+    uint8_t a;
+    __m128i xmms;
+
+    a = src >> 24;
+
+    if (a == 0xff)
+    {
+	return src;
+    }
+    else if (src)
+    {
+	xmms = unpack_32_1x128 (src);
+	return pack_1x128_32 (
+	    over_1x128 (xmms, expand_alpha_1x128 (xmms),
+			unpack_32_1x128 (dst)));
+    }
+
+    return dst;
+}
+
+static force_inline uint32_t
+combine1 (const uint32_t *ps, const uint32_t *pm)
+{
+    uint32_t s = *ps;
+
+    if (pm)
+    {
+	__m128i ms, mm;
+
+	mm = unpack_32_1x128 (*pm);
+	mm = expand_alpha_1x128 (mm);
+
+	ms = unpack_32_1x128 (s);
+	ms = pix_multiply_1x128 (ms, mm);
+
+	s = pack_1x128_32 (ms);
+    }
+
+    return s;
+}
+
+static force_inline __m128i
+combine4 (const __m128i *ps, const __m128i *pm)
+{
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_msk_lo, xmm_msk_hi;
+    __m128i s;
+
+    if (pm)
+    {
+	xmm_msk_lo = load_128_unaligned (pm);
+
+	if (is_transparent (xmm_msk_lo))
+	    return _mm_setzero_si128 ();
+    }
+
+    s = load_128_unaligned (ps);
+
+    if (pm)
+    {
+	unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
+
+	expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
+
+	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+			    &xmm_msk_lo, &xmm_msk_hi,
+			    &xmm_src_lo, &xmm_src_hi);
+
+	s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
+    }
+
+    return s;
+}
+
+static force_inline void
+core_combine_over_u_sse2_mask (uint32_t *	  pd,
+			       const uint32_t*    ps,
+			       const uint32_t*    pm,
+			       int                w)
+{
+    uint32_t s, d;
+
+    /* Align dst on a 16-byte boundary */
+    while (w && ((unsigned long)pd & 15))
+    {
+	d = *pd;
+	s = combine1 (ps, pm);
+
+	if (s)
+	    *pd = core_combine_over_u_pixel_sse2 (s, d);
+	pd++;
+	ps++;
+	pm++;
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	__m128i mask = load_128_unaligned ((__m128i *)pm);
+
+	if (!is_zero (mask))
+	{
+	    __m128i src;
+	    __m128i src_hi, src_lo;
+	    __m128i mask_hi, mask_lo;
+	    __m128i alpha_hi, alpha_lo;
+
+	    src = load_128_unaligned ((__m128i *)ps);
+
+	    if (is_opaque (_mm_and_si128 (src, mask)))
+	    {
+		save_128_aligned ((__m128i *)pd, src);
+	    }
+	    else
+	    {
+		__m128i dst = load_128_aligned ((__m128i *)pd);
+		__m128i dst_hi, dst_lo;
+
+		unpack_128_2x128 (mask, &mask_lo, &mask_hi);
+		unpack_128_2x128 (src, &src_lo, &src_hi);
+
+		expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
+		pix_multiply_2x128 (&src_lo, &src_hi,
+				    &mask_lo, &mask_hi,
+				    &src_lo, &src_hi);
+
+		unpack_128_2x128 (dst, &dst_lo, &dst_hi);
+
+		expand_alpha_2x128 (src_lo, src_hi,
+				    &alpha_lo, &alpha_hi);
+
+		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
+			    &dst_lo, &dst_hi);
+
+		save_128_aligned (
+		    (__m128i *)pd,
+		    pack_2x128_128 (dst_lo, dst_hi));
+	    }
+	}
+
+	pm += 4;
+	ps += 4;
+	pd += 4;
+	w -= 4;
+    }
+    while (w)
+    {
+	d = *pd;
+	s = combine1 (ps, pm);
+
+	if (s)
+	    *pd = core_combine_over_u_pixel_sse2 (s, d);
+	pd++;
+	ps++;
+	pm++;
+
+	w--;
+    }
+}
+
+static force_inline void
+core_combine_over_u_sse2_no_mask (uint32_t *	  pd,
+				  const uint32_t*    ps,
+				  int                w)
+{
+    uint32_t s, d;
+
+    /* Align dst on a 16-byte boundary */
+    while (w && ((unsigned long)pd & 15))
+    {
+	d = *pd;
+	s = *ps;
+
+	if (s)
+	    *pd = core_combine_over_u_pixel_sse2 (s, d);
+	pd++;
+	ps++;
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	__m128i src;
+	__m128i src_hi, src_lo, dst_hi, dst_lo;
+	__m128i alpha_hi, alpha_lo;
+
+	src = load_128_unaligned ((__m128i *)ps);
+
+	if (!is_zero (src))
+	{
+	    if (is_opaque (src))
+	    {
+		save_128_aligned ((__m128i *)pd, src);
+	    }
+	    else
+	    {
+		__m128i dst = load_128_aligned ((__m128i *)pd);
+
+		unpack_128_2x128 (src, &src_lo, &src_hi);
+		unpack_128_2x128 (dst, &dst_lo, &dst_hi);
+
+		expand_alpha_2x128 (src_lo, src_hi,
+				    &alpha_lo, &alpha_hi);
+		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
+			    &dst_lo, &dst_hi);
+
+		save_128_aligned (
+		    (__m128i *)pd,
+		    pack_2x128_128 (dst_lo, dst_hi));
+	    }
+	}
+
+	ps += 4;
+	pd += 4;
+	w -= 4;
+    }
+    while (w)
+    {
+	d = *pd;
+	s = *ps;
+
+	if (s)
+	    *pd = core_combine_over_u_pixel_sse2 (s, d);
+	pd++;
+	ps++;
+
+	w--;
+    }
+}
+
+static force_inline void
+sse2_combine_over_u (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
+{
+    if (pm)
+	core_combine_over_u_sse2_mask (pd, ps, pm, w);
+    else
+	core_combine_over_u_sse2_no_mask (pd, ps, w);
+}
+
+static void
+sse2_combine_over_reverse_u (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               pd,
+                             const uint32_t *         ps,
+                             const uint32_t *         pm,
+                             int                      w)
+{
+    uint32_t s, d;
+
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+    /* Align dst on a 16-byte boundary */
+    while (w &&
+           ((unsigned long)pd & 15))
+    {
+	d = *pd;
+	s = combine1 (ps, pm);
+
+	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+
+    while (w >= 4)
+    {
+	/* I'm loading unaligned because I'm not sure
+	 * about the address alignment.
+	 */
+	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+			    &xmm_alpha_lo, &xmm_alpha_hi);
+
+	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+		    &xmm_alpha_lo, &xmm_alpha_hi,
+		    &xmm_src_lo, &xmm_src_hi);
+
+	/* rebuild the 4 pixel data and save */
+	save_128_aligned ((__m128i*)pd,
+			  pack_2x128_128 (xmm_src_lo, xmm_src_hi));
+
+	w -= 4;
+	ps += 4;
+	pd += 4;
+
+	if (pm)
+	    pm += 4;
+    }
+
+    while (w)
+    {
+	d = *pd;
+	s = combine1 (ps, pm);
+
+	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
+	ps++;
+	w--;
+	if (pm)
+	    pm++;
+    }
+}
+
+static force_inline uint32_t
+core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
+{
+    uint32_t maska = src >> 24;
+
+    if (maska == 0)
+    {
+	return 0;
+    }
+    else if (maska != 0xff)
+    {
+	return pack_1x128_32 (
+	    pix_multiply_1x128 (unpack_32_1x128 (dst),
+				expand_alpha_1x128 (unpack_32_1x128 (src))));
+    }
+
+    return dst;
+}
+
+static void
+sse2_combine_in_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               pd,
+                   const uint32_t *         ps,
+                   const uint32_t *         pm,
+                   int                      w)
+{
+    uint32_t s, d;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+
+    while (w && ((unsigned long) pd & 15))
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+
+    while (w >= 4)
+    {
+	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
+
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+			    &xmm_dst_lo, &xmm_dst_hi,
+			    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned ((__m128i*)pd,
+			  pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	w -= 4;
+	if (pm)
+	    pm += 4;
+    }
+
+    while (w)
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+}
+
+static void
+sse2_combine_in_reverse_u (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *               pd,
+                           const uint32_t *         ps,
+                           const uint32_t *         pm,
+                           int                      w)
+{
+    uint32_t s, d;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+
+    while (w && ((unsigned long) pd & 15))
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
+	ps++;
+	w--;
+	if (pm)
+	    pm++;
+    }
+
+    while (w >= 4)
+    {
+	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
+
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+			    &xmm_src_lo, &xmm_src_hi,
+			    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	w -= 4;
+	if (pm)
+	    pm += 4;
+    }
+
+    while (w)
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+}
+
+static void
+sse2_combine_out_reverse_u (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               pd,
+                            const uint32_t *         ps,
+                            const uint32_t *         pm,
+                            int                      w)
+{
+    while (w && ((unsigned long) pd & 15))
+    {
+	uint32_t s = combine1 (ps, pm);
+	uint32_t d = *pd;
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d), negate_1x128 (
+		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
+
+	if (pm)
+	    pm++;
+	ps++;
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	__m128i xmm_src_lo, xmm_src_hi;
+	__m128i xmm_dst_lo, xmm_dst_hi;
+
+	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+
+	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+			    &xmm_src_lo, &xmm_src_hi,
+			    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	if (pm)
+	    pm += 4;
+
+	w -= 4;
+    }
+
+    while (w)
+    {
+	uint32_t s = combine1 (ps, pm);
+	uint32_t d = *pd;
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d), negate_1x128 (
+		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
+	ps++;
+	if (pm)
+	    pm++;
+	w--;
+    }
+}
+
+static void
+sse2_combine_out_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               pd,
+                    const uint32_t *         ps,
+                    const uint32_t *         pm,
+                    int                      w)
+{
+    while (w && ((unsigned long) pd & 15))
+    {
+	uint32_t s = combine1 (ps, pm);
+	uint32_t d = *pd;
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (s), negate_1x128 (
+		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+
+    while (w >= 4)
+    {
+	__m128i xmm_src_lo, xmm_src_hi;
+	__m128i xmm_dst_lo, xmm_dst_hi;
+
+	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
+	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+	negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+			    &xmm_dst_lo, &xmm_dst_hi,
+			    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	w -= 4;
+	if (pm)
+	    pm += 4;
+    }
+
+    while (w)
+    {
+	uint32_t s = combine1 (ps, pm);
+	uint32_t d = *pd;
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (s), negate_1x128 (
+		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+}
+
+static force_inline uint32_t
+core_combine_atop_u_pixel_sse2 (uint32_t src,
+                                uint32_t dst)
+{
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
+
+    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
+    __m128i da = expand_alpha_1x128 (d);
+
+    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
+}
+
+/*
+ * ATOP combiner (unified alpha) over w pixels.
+ *
+ * pd is read and written in place; source pixels come from ps through
+ * combine1/combine4 together with pm, and pm is only advanced when it is
+ * non-NULL.  Pixels are handled one at a time until pd sits on a 16-byte
+ * boundary, then four at a time, then one at a time for the remainder.
+ */
+static void
+sse2_combine_atop_u (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
+{
+    /* Head: single pixels while pd is not 16-byte aligned. */
+    for (; w && ((unsigned long) pd & 15); w--)
+    {
+	uint32_t src_px = combine1 (ps, pm);
+	uint32_t dst_px = *pd;
+
+	*pd++ = core_combine_atop_u_pixel_sse2 (src_px, dst_px);
+	ps++;
+	if (pm)
+	    pm++;
+    }
+
+    /* Body: four pixels per pass. */
+    for (; w >= 4; w -= 4)
+    {
+	__m128i packed_src, packed_dst;
+	__m128i src_lo, src_hi;
+	__m128i dst_lo, dst_hi;
+	__m128i sa_lo, sa_hi;
+	__m128i da_lo, da_hi;
+
+	packed_src = combine4 ((__m128i*)ps, (__m128i*)pm);
+	packed_dst = load_128_aligned ((__m128i*) pd);
+
+	unpack_128_2x128 (packed_src, &src_lo, &src_hi);
+	unpack_128_2x128 (packed_dst, &dst_lo, &dst_hi);
+
+	expand_alpha_2x128 (src_lo, src_hi, &sa_lo, &sa_hi);
+	expand_alpha_2x128 (dst_lo, dst_hi, &da_lo, &da_hi);
+
+	/* Only the source alpha is negated for this operator. */
+	negate_2x128 (sa_lo, sa_hi, &sa_lo, &sa_hi);
+
+	pix_add_multiply_2x128 (
+	    &src_lo, &src_hi, &da_lo, &da_hi,
+	    &dst_lo, &dst_hi, &sa_lo, &sa_hi,
+	    &dst_lo, &dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (dst_lo, dst_hi));
+
+	ps += 4;
+	pd += 4;
+	if (pm)
+	    pm += 4;
+    }
+
+    /* Tail: whatever is left, one pixel at a time. */
+    for (; w; w--)
+    {
+	uint32_t src_px = combine1 (ps, pm);
+	uint32_t dst_px = *pd;
+
+	*pd++ = core_combine_atop_u_pixel_sse2 (src_px, dst_px);
+	ps++;
+	if (pm)
+	    pm++;
+    }
+}
+
+/*
+ * Single-pixel reverse ATOP (unified alpha).  pix_add_multiply_1x128 is
+ * given the pairs (src, negated dst alpha) and (dst, src alpha); the
+ * result is packed back to 32 bits.
+ */
+static force_inline uint32_t
+core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
+                                        uint32_t dst)
+{
+    __m128i src_px = unpack_32_1x128 (src);
+    __m128i dst_px = unpack_32_1x128 (dst);
+    __m128i src_alpha = expand_alpha_1x128 (src_px);
+    __m128i inv_dst_alpha = negate_1x128 (expand_alpha_1x128 (dst_px));
+    __m128i result;
+
+    result = pix_add_multiply_1x128 (&src_px, &inv_dst_alpha,
+				     &dst_px, &src_alpha);
+
+    return pack_1x128_32 (result);
+}
+
+/*
+ * Reverse ATOP combiner (unified alpha) over w pixels.
+ *
+ * Same loop structure as the other unified combiners in this file:
+ * single pixels until pd is 16-byte aligned, groups of four, then the
+ * leftover pixels.  pm is only advanced when it is non-NULL.
+ */
+static void
+sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               pd,
+                             const uint32_t *         ps,
+                             const uint32_t *         pm,
+                             int                      w)
+{
+    /* Head: single pixels while pd is not 16-byte aligned. */
+    for (; w && ((unsigned long) pd & 15); w--)
+    {
+	uint32_t src_px = combine1 (ps, pm);
+	uint32_t dst_px = *pd;
+
+	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (src_px, dst_px);
+	ps++;
+	if (pm)
+	    pm++;
+    }
+
+    /* Body: four pixels per pass. */
+    for (; w >= 4; w -= 4)
+    {
+	__m128i packed_src, packed_dst;
+	__m128i src_lo, src_hi;
+	__m128i dst_lo, dst_hi;
+	__m128i sa_lo, sa_hi;
+	__m128i da_lo, da_hi;
+
+	packed_src = combine4 ((__m128i*)ps, (__m128i*)pm);
+	packed_dst = load_128_aligned ((__m128i*) pd);
+
+	unpack_128_2x128 (packed_src, &src_lo, &src_hi);
+	unpack_128_2x128 (packed_dst, &dst_lo, &dst_hi);
+
+	expand_alpha_2x128 (src_lo, src_hi, &sa_lo, &sa_hi);
+	expand_alpha_2x128 (dst_lo, dst_hi, &da_lo, &da_hi);
+
+	/* Only the destination alpha is negated for this operator. */
+	negate_2x128 (da_lo, da_hi, &da_lo, &da_hi);
+
+	pix_add_multiply_2x128 (
+	    &src_lo, &src_hi, &da_lo, &da_hi,
+	    &dst_lo, &dst_hi, &sa_lo, &sa_hi,
+	    &dst_lo, &dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (dst_lo, dst_hi));
+
+	ps += 4;
+	pd += 4;
+	if (pm)
+	    pm += 4;
+    }
+
+    /* Tail: whatever is left, one pixel at a time. */
+    for (; w; w--)
+    {
+	uint32_t src_px = combine1 (ps, pm);
+	uint32_t dst_px = *pd;
+
+	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (src_px, dst_px);
+	ps++;
+	if (pm)
+	    pm++;
+    }
+}
+
+/*
+ * Single-pixel XOR (unified alpha).  Both alphas are negated before
+ * pix_add_multiply_1x128 receives the pairs (src, negated dst alpha)
+ * and (dst, negated src alpha).
+ */
+static force_inline uint32_t
+core_combine_xor_u_pixel_sse2 (uint32_t src,
+                               uint32_t dst)
+{
+    __m128i src_px = unpack_32_1x128 (src);
+    __m128i dst_px = unpack_32_1x128 (dst);
+    __m128i inv_dst_alpha = negate_1x128 (expand_alpha_1x128 (dst_px));
+    __m128i inv_src_alpha = negate_1x128 (expand_alpha_1x128 (src_px));
+    __m128i result;
+
+    result = pix_add_multiply_1x128 (&src_px, &inv_dst_alpha,
+				     &dst_px, &inv_src_alpha);
+
+    return pack_1x128_32 (result);
+}
+
+/*
+ * XOR combiner (unified alpha) over width pixels.
+ *
+ * dst is read and written in place; source pixels come from src through
+ * combine1/combine4 together with mask, and the mask cursor is only
+ * advanced when it is non-NULL.  Work is split into an unaligned head, a
+ * four-pixel body and a scalar tail.
+ */
+static void
+sse2_combine_xor_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dst,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    uint32_t *dp = dst;
+    const uint32_t *sp = src;
+    const uint32_t *mp = mask;
+    int n = width;
+
+    /* Head: single pixels while dp is not 16-byte aligned. */
+    for (; n && ((unsigned long) dp & 15); n--)
+    {
+	uint32_t src_px = combine1 (sp, mp);
+	uint32_t dst_px = *dp;
+
+	*dp++ = core_combine_xor_u_pixel_sse2 (src_px, dst_px);
+	sp++;
+	if (mp)
+	    mp++;
+    }
+
+    /* Body: four pixels per pass. */
+    for (; n >= 4; n -= 4)
+    {
+	__m128i packed_src, packed_dst;
+	__m128i src_lo, src_hi;
+	__m128i dst_lo, dst_hi;
+	__m128i sa_lo, sa_hi;
+	__m128i da_lo, da_hi;
+
+	packed_src = combine4 ((__m128i*) sp, (__m128i*) mp);
+	packed_dst = load_128_aligned ((__m128i*) dp);
+
+	unpack_128_2x128 (packed_src, &src_lo, &src_hi);
+	unpack_128_2x128 (packed_dst, &dst_lo, &dst_hi);
+
+	expand_alpha_2x128 (src_lo, src_hi, &sa_lo, &sa_hi);
+	expand_alpha_2x128 (dst_lo, dst_hi, &da_lo, &da_hi);
+
+	/* Both alphas are negated for this operator. */
+	negate_2x128 (sa_lo, sa_hi, &sa_lo, &sa_hi);
+	negate_2x128 (da_lo, da_hi, &da_lo, &da_hi);
+
+	pix_add_multiply_2x128 (
+	    &src_lo, &src_hi, &da_lo, &da_hi,
+	    &dst_lo, &dst_hi, &sa_lo, &sa_hi,
+	    &dst_lo, &dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)dp, pack_2x128_128 (dst_lo, dst_hi));
+
+	sp += 4;
+	dp += 4;
+	if (mp)
+	    mp += 4;
+    }
+
+    /* Tail: whatever is left, one pixel at a time. */
+    for (; n; n--)
+    {
+	uint32_t src_px = combine1 (sp, mp);
+	uint32_t dst_px = *dp;
+
+	*dp++ = core_combine_xor_u_pixel_sse2 (src_px, dst_px);
+	sp++;
+	if (mp)
+	    mp++;
+    }
+}
+
+/*
+ * ADD combiner (unified alpha) over width pixels.
+ *
+ * Each destination pixel becomes the per-byte saturating sum
+ * (_mm_adds_epu8) of itself and the source pixel obtained through
+ * combine1/combine4.  The mask cursor is only advanced when it is
+ * non-NULL.
+ */
+static force_inline void
+sse2_combine_add_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dst,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    uint32_t *dp = dst;
+    const uint32_t *sp = src;
+    const uint32_t *mp = mask;
+    int n = width;
+
+    /* Head: single pixels while dp is not 16-byte aligned. */
+    for (; n && ((unsigned long)dp & 15); n--)
+    {
+	uint32_t src_px = combine1 (sp, mp);
+	uint32_t dst_px = *dp;
+	__m128i sum = _mm_adds_epu8 (_mm_cvtsi32_si128 (src_px),
+				     _mm_cvtsi32_si128 (dst_px));
+
+	*dp++ = _mm_cvtsi128_si32 (sum);
+	sp++;
+	if (mp)
+	    mp++;
+    }
+
+    /* Body: four pixels per pass, no unpacking required. */
+    for (; n >= 4; n -= 4)
+    {
+	__m128i src4 = combine4 ((__m128i*)sp, (__m128i*)mp);
+	__m128i dst4 = load_128_aligned  ((__m128i*)dp);
+
+	save_128_aligned ((__m128i*)dp, _mm_adds_epu8 (src4, dst4));
+
+	dp += 4;
+	sp += 4;
+	if (mp)
+	    mp += 4;
+    }
+
+    /* Tail: whatever is left, one pixel at a time. */
+    for (; n; n--)
+    {
+	uint32_t src_px = combine1 (sp, mp);
+	uint32_t dst_px = *dp;
+	__m128i sum = _mm_adds_epu8 (_mm_cvtsi32_si128 (src_px),
+				     _mm_cvtsi32_si128 (dst_px));
+
+	*dp++ = _mm_cvtsi128_si32 (sum);
+	sp++;
+	if (mp)
+	    mp++;
+    }
+}
+
+/*
+ * Single-pixel SATURATE (unified alpha).
+ *
+ * When the source alpha (top byte of src) exceeds the top byte of ~dst,
+ * the source is first multiplied by DIV_UN8 (~dst alpha, src alpha)
+ * expanded across the pixel; the (possibly scaled) source is then added
+ * to the destination with _mm_adds_epu16 and packed back to 32 bits.
+ */
+static force_inline uint32_t
+core_combine_saturate_u_pixel_sse2 (uint32_t src,
+                                    uint32_t dst)
+{
+    uint32_t src_alpha = src >> 24;
+    uint32_t dst_room = ~dst >> 24;
+    __m128i src_px = unpack_32_1x128 (src);
+    __m128i dst_px = unpack_32_1x128 (dst);
+
+    if (src_alpha > dst_room)
+    {
+	/* src_alpha > dst_room >= 0 here, so the divisor is non-zero. */
+	uint32_t scale = DIV_UN8 (dst_room, src_alpha) << 24;
+	__m128i factor = expand_alpha_1x128 (unpack_32_1x128 (scale));
+
+	src_px = pix_multiply_1x128 (src_px, factor);
+    }
+
+    return pack_1x128_32 (_mm_adds_epu16 (dst_px, src_px));
+}
+
+/*
+ * SATURATE combiner (unified alpha) over w pixels.
+ *
+ * In the four-pixel body the source alphas are compared against the top
+ * bytes of the complemented destination.  If none of the four source
+ * alphas is greater, the group is stored as a plain _mm_adds_epu8 sum;
+ * otherwise all four pixels go through the scalar helper.  pm is only
+ * advanced when it is non-NULL.
+ */
+static void
+sse2_combine_saturate_u (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         uint32_t *               pd,
+                         const uint32_t *         ps,
+                         const uint32_t *         pm,
+                         int                      w)
+{
+    /* Head: single pixels while pd is not 16-byte aligned. */
+    for (; w && ((unsigned long)pd & 15); w--)
+    {
+	uint32_t src_px = combine1 (ps, pm);
+	uint32_t dst_px = *pd;
+
+	*pd++ = core_combine_saturate_u_pixel_sse2 (src_px, dst_px);
+	ps++;
+	if (pm)
+	    pm++;
+    }
+
+    /* Body: four pixels per pass. */
+    for (; w >= 4; w -= 4)
+    {
+	__m128i dst4 = load_128_aligned  ((__m128i*)pd);
+	__m128i src4 = combine4 ((__m128i*)ps, (__m128i*)pm);
+	__m128i src_alpha = _mm_srli_epi32 (src4, 24);
+	__m128i dst_room = _mm_srli_epi32 (
+	    _mm_xor_si128 (dst4, mask_ff000000), 24);
+	uint32_t exceeds = _mm_movemask_epi8 (
+	    _mm_cmpgt_epi32 (src_alpha, dst_room));
+
+	if (!exceeds)
+	{
+	    /* No source alpha is greater than its ~dst alpha. */
+	    save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (dst4, src4));
+
+	    pd += 4;
+	    ps += 4;
+	    if (pm)
+		pm += 4;
+	}
+	else
+	{
+	    /* At least one source alpha is greater: do the four pixels
+	     * individually with the scalar helper.
+	     */
+	    int i;
+
+	    for (i = 0; i < 4; i++)
+	    {
+		uint32_t src_px = combine1 (ps++, pm);
+		uint32_t dst_px = *pd;
+
+		*pd++ = core_combine_saturate_u_pixel_sse2 (src_px, dst_px);
+		if (pm)
+		    pm++;
+	    }
+	}
+    }
+
+    /* Tail: whatever is left, one pixel at a time. */
+    for (; w; w--)
+    {
+	uint32_t src_px = combine1 (ps, pm);
+	uint32_t dst_px = *pd;
+
+	*pd++ = core_combine_saturate_u_pixel_sse2 (src_px, dst_px);
+	ps++;
+	if (pm)
+	    pm++;
+    }
+}
+
+/*
+ * SRC combiner, component-alpha variant, over w pixels.
+ *
+ * Every destination pixel is overwritten with pix_multiply of the source
+ * pixel and the mask pixel; the previous destination value is not read.
+ * ps and pm are dereferenced unconditionally.
+ */
+static void
+sse2_combine_src_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
+{
+    /* Head: single pixels while pd is not 16-byte aligned. */
+    for (; w && ((unsigned long)pd & 15); w--)
+    {
+	uint32_t src_px = *ps++;
+	uint32_t mask_px = *pm++;
+	__m128i product = pix_multiply_1x128 (unpack_32_1x128 (src_px),
+					      unpack_32_1x128 (mask_px));
+
+	*pd++ = pack_1x128_32 (product);
+    }
+
+    /* Body: four pixels per pass. */
+    for (; w >= 4; w -= 4)
+    {
+	__m128i packed_src = load_128_unaligned ((__m128i*)ps);
+	__m128i packed_mask = load_128_unaligned ((__m128i*)pm);
+	__m128i src_lo, src_hi;
+	__m128i mask_lo, mask_hi;
+	__m128i out_lo, out_hi;
+
+	unpack_128_2x128 (packed_src, &src_lo, &src_hi);
+	unpack_128_2x128 (packed_mask, &mask_lo, &mask_hi);
+
+	pix_multiply_2x128 (&src_lo, &src_hi,
+			    &mask_lo, &mask_hi,
+			    &out_lo, &out_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (out_lo, out_hi));
+
+	ps += 4;
+	pd += 4;
+	pm += 4;
+    }
+
+    /* Tail: whatever is left, one pixel at a time. */
+    for (; w; w--)
+    {
+	uint32_t src_px = *ps++;
+	uint32_t mask_px = *pm++;
+	__m128i product = pix_multiply_1x128 (unpack_32_1x128 (src_px),
+					      unpack_32_1x128 (mask_px));
+
+	*pd++ = pack_1x128_32 (product);
+    }
+}
+
+/*
+ * Single-pixel OVER, component-alpha variant.  Unpacks the three pixels,
+ * expands the source alpha and passes everything to in_over_1x128.
+ */
+static force_inline uint32_t
+core_combine_over_ca_pixel_sse2 (uint32_t src,
+                                 uint32_t mask,
+                                 uint32_t dst)
+{
+    __m128i src_px = unpack_32_1x128 (src);
+    __m128i src_alpha = expand_alpha_1x128 (src_px);
+    __m128i mask_px = unpack_32_1x128 (mask);
+    __m128i dst_px = unpack_32_1x128 (dst);
+    __m128i result;
+
+    result = in_over_1x128 (&src_px, &src_alpha, &mask_px, &dst_px);
+
+    return pack_1x128_32 (result);
+}
+
+/*
+ * OVER combiner, component-alpha variant, over w pixels.
+ *
+ * pd is read and written in place; ps and pm are dereferenced
+ * unconditionally.  The four-pixel body hands the unpacked source, its
+ * expanded alpha, the mask and the destination to in_over_2x128, which
+ * leaves its result in the destination registers.
+ */
+static void
+sse2_combine_over_ca (pixman_implementation_t *imp,
+                      pixman_op_t              op,
+                      uint32_t *               pd,
+                      const uint32_t *         ps,
+                      const uint32_t *         pm,
+                      int                      w)
+{
+    /* Head: single pixels while pd is not 16-byte aligned. */
+    for (; w && ((unsigned long)pd & 15); w--)
+    {
+	uint32_t src_px = *ps++;
+	uint32_t mask_px = *pm++;
+	uint32_t dst_px = *pd;
+
+	*pd++ = core_combine_over_ca_pixel_sse2 (src_px, mask_px, dst_px);
+    }
+
+    /* Body: four pixels per pass. */
+    for (; w >= 4; w -= 4)
+    {
+	__m128i packed_dst = load_128_aligned ((__m128i*)pd);
+	__m128i packed_src = load_128_unaligned ((__m128i*)ps);
+	__m128i packed_mask = load_128_unaligned ((__m128i*)pm);
+	__m128i dst_lo, dst_hi;
+	__m128i src_lo, src_hi;
+	__m128i mask_lo, mask_hi;
+	__m128i sa_lo, sa_hi;
+
+	unpack_128_2x128 (packed_dst, &dst_lo, &dst_hi);
+	unpack_128_2x128 (packed_src, &src_lo, &src_hi);
+	unpack_128_2x128 (packed_mask, &mask_lo, &mask_hi);
+
+	expand_alpha_2x128 (src_lo, src_hi, &sa_lo, &sa_hi);
+
+	in_over_2x128 (&src_lo, &src_hi,
+		       &sa_lo, &sa_hi,
+		       &mask_lo, &mask_hi,
+		       &dst_lo, &dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (dst_lo, dst_hi));
+
+	ps += 4;
+	pd += 4;
+	pm += 4;
+    }
+
+    /* Tail: whatever is left, one pixel at a time. */
+    for (; w; w--)
+    {
+	uint32_t src_px = *ps++;
+	uint32_t mask_px = *pm++;
+	uint32_t dst_px = *pd;
+
+	*pd++ = core_combine_over_ca_pixel_sse2 (src_px, mask_px, dst_px);
+    }
+}
+
+/*
+ * Single-pixel reverse OVER, component-alpha variant.  Calls over_1x128
+ * with the destination pixel, the destination's expanded alpha, and the
+ * product of source and mask.
+ */
+static force_inline uint32_t
+core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
+                                         uint32_t mask,
+                                         uint32_t dst)
+{
+    __m128i dst_px = unpack_32_1x128 (dst);
+    __m128i dst_alpha = expand_alpha_1x128 (dst_px);
+    __m128i masked_src = pix_multiply_1x128 (unpack_32_1x128 (src),
+					     unpack_32_1x128 (mask));
+
+    return pack_1x128_32 (over_1x128 (dst_px, dst_alpha, masked_src));
+}
+
+/*
+ * Reverse OVER combiner, component-alpha variant, over w pixels.
+ *
+ * In the four-pixel body the source is multiplied by the mask into the
+ * mask registers, over_2x128 is then called with the destination, the
+ * destination alpha and those registers, and the mask registers are what
+ * gets packed and stored.  ps and pm are dereferenced unconditionally.
+ */
+static void
+sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              uint32_t *               pd,
+                              const uint32_t *         ps,
+                              const uint32_t *         pm,
+                              int                      w)
+{
+    /* Head: single pixels while pd is not 16-byte aligned. */
+    for (; w && ((unsigned long)pd & 15); w--)
+    {
+	uint32_t src_px = *ps++;
+	uint32_t mask_px = *pm++;
+	uint32_t dst_px = *pd;
+
+	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (
+	    src_px, mask_px, dst_px);
+    }
+
+    /* Body: four pixels per pass. */
+    for (; w >= 4; w -= 4)
+    {
+	__m128i packed_dst = load_128_aligned ((__m128i*)pd);
+	__m128i packed_src = load_128_unaligned ((__m128i*)ps);
+	__m128i packed_mask = load_128_unaligned ((__m128i*)pm);
+	__m128i dst_lo, dst_hi;
+	__m128i src_lo, src_hi;
+	__m128i mask_lo, mask_hi;
+	__m128i da_lo, da_hi;
+
+	unpack_128_2x128 (packed_dst, &dst_lo, &dst_hi);
+	unpack_128_2x128 (packed_src, &src_lo, &src_hi);
+	unpack_128_2x128 (packed_mask, &mask_lo, &mask_hi);
+
+	expand_alpha_2x128 (dst_lo, dst_hi, &da_lo, &da_hi);
+
+	/* mask <- src * mask */
+	pix_multiply_2x128 (&src_lo, &src_hi,
+			    &mask_lo, &mask_hi,
+			    &mask_lo, &mask_hi);
+
+	over_2x128 (&dst_lo, &dst_hi,
+		    &da_lo, &da_hi,
+		    &mask_lo, &mask_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (mask_lo, mask_hi));
+
+	ps += 4;
+	pd += 4;
+	pm += 4;
+    }
+
+    /* Tail: whatever is left, one pixel at a time. */
+    for (; w; w--)
+    {
+	uint32_t src_px = *ps++;
+	uint32_t mask_px = *pm++;
+	uint32_t dst_px = *pd;
+
+	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (
+	    src_px, mask_px, dst_px);
+    }
+}
+
+/*
+ * IN combiner, component-alpha variant, over w pixels.
+ *
+ * Each destination pixel becomes (src * mask) multiplied by the
+ * destination's own expanded alpha, using the pix_multiply helpers.
+ * ps and pm are dereferenced unconditionally.
+ */
+static void
+sse2_combine_in_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               pd,
+                    const uint32_t *         ps,
+                    const uint32_t *         pm,
+                    int                      w)
+{
+    /* Head: single pixels while pd is not 16-byte aligned. */
+    for (; w && ((unsigned long)pd & 15); w--)
+    {
+	uint32_t src_px = *ps++;
+	uint32_t mask_px = *pm++;
+	uint32_t dst_px = *pd;
+	__m128i masked_src = pix_multiply_1x128 (unpack_32_1x128 (src_px),
+						 unpack_32_1x128 (mask_px));
+	__m128i dst_alpha = expand_alpha_1x128 (unpack_32_1x128 (dst_px));
+
+	*pd++ = pack_1x128_32 (pix_multiply_1x128 (masked_src, dst_alpha));
+    }
+
+    /* Body: four pixels per pass. */
+    for (; w >= 4; w -= 4)
+    {
+	__m128i packed_dst = load_128_aligned ((__m128i*)pd);
+	__m128i packed_src = load_128_unaligned ((__m128i*)ps);
+	__m128i packed_mask = load_128_unaligned ((__m128i*)pm);
+	__m128i dst_lo, dst_hi;
+	__m128i src_lo, src_hi;
+	__m128i mask_lo, mask_hi;
+	__m128i da_lo, da_hi;
+
+	unpack_128_2x128 (packed_dst, &dst_lo, &dst_hi);
+	unpack_128_2x128 (packed_src, &src_lo, &src_hi);
+	unpack_128_2x128 (packed_mask, &mask_lo, &mask_hi);
+
+	/* The destination alpha is captured before dst is overwritten. */
+	expand_alpha_2x128 (dst_lo, dst_hi, &da_lo, &da_hi);
+
+	/* dst <- src * mask */
+	pix_multiply_2x128 (&src_lo, &src_hi,
+			    &mask_lo, &mask_hi,
+			    &dst_lo, &dst_hi);
+
+	/* dst <- dst * (old dst alpha) */
+	pix_multiply_2x128 (&dst_lo, &dst_hi,
+			    &da_lo, &da_hi,
+			    &dst_lo, &dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (dst_lo, dst_hi));
+
+	ps += 4;
+	pd += 4;
+	pm += 4;
+    }
+
+    /* Tail: whatever is left, one pixel at a time. */
+    for (; w; w--)
+    {
+	uint32_t src_px = *ps++;
+	uint32_t mask_px = *pm++;
+	uint32_t dst_px = *pd;
+	__m128i masked_src = pix_multiply_1x128 (unpack_32_1x128 (src_px),
+						 unpack_32_1x128 (mask_px));
+	__m128i dst_alpha = expand_alpha_1x128 (unpack_32_1x128 (dst_px));
+
+	*pd++ = pack_1x128_32 (pix_multiply_1x128 (masked_src, dst_alpha));
+    }
+}
+
+/*
+ * Reverse IN combiner, component-alpha variant, over w pixels.
+ *
+ * Each destination pixel is multiplied by (mask * expanded source alpha)
+ * using the pix_multiply helpers.  ps and pm are dereferenced
+ * unconditionally.
+ */
+static void
+sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               pd,
+                            const uint32_t *         ps,
+                            const uint32_t *         pm,
+                            int                      w)
+{
+    /* Head: single pixels while pd is not 16-byte aligned. */
+    for (; w && ((unsigned long)pd & 15); w--)
+    {
+	uint32_t src_px = *ps++;
+	uint32_t mask_px = *pm++;
+	uint32_t dst_px = *pd;
+	__m128i src_alpha = expand_alpha_1x128 (unpack_32_1x128 (src_px));
+	__m128i factor = pix_multiply_1x128 (unpack_32_1x128 (mask_px),
+					     src_alpha);
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (unpack_32_1x128 (dst_px), factor));
+    }
+
+    /* Body: four pixels per pass. */
+    for (; w >= 4; w -= 4)
+    {
+	__m128i packed_dst = load_128_aligned ((__m128i*)pd);
+	__m128i packed_src = load_128_unaligned ((__m128i*)ps);
+	__m128i packed_mask = load_128_unaligned ((__m128i*)pm);
+	__m128i dst_lo, dst_hi;
+	__m128i src_lo, src_hi;
+	__m128i mask_lo, mask_hi;
+	__m128i factor_lo, factor_hi;
+
+	unpack_128_2x128 (packed_dst, &dst_lo, &dst_hi);
+	unpack_128_2x128 (packed_src, &src_lo, &src_hi);
+	unpack_128_2x128 (packed_mask, &mask_lo, &mask_hi);
+
+	/* factor <- mask * (source alpha) */
+	expand_alpha_2x128 (src_lo, src_hi, &factor_lo, &factor_hi);
+	pix_multiply_2x128 (&mask_lo, &mask_hi,
+			    &factor_lo, &factor_hi,
+			    &factor_lo, &factor_hi);
+
+	/* dst <- dst * factor */
+	pix_multiply_2x128 (&dst_lo, &dst_hi,
+			    &factor_lo, &factor_hi,
+			    &dst_lo, &dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (dst_lo, dst_hi));
+
+	ps += 4;
+	pd += 4;
+	pm += 4;
+    }
+
+    /* Tail: whatever is left, one pixel at a time. */
+    for (; w; w--)
+    {
+	uint32_t src_px = *ps++;
+	uint32_t mask_px = *pm++;
+	uint32_t dst_px = *pd;
+	__m128i src_alpha = expand_alpha_1x128 (unpack_32_1x128 (src_px));
+	__m128i factor = pix_multiply_1x128 (unpack_32_1x128 (mask_px),
+					     src_alpha);
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (unpack_32_1x128 (dst_px), factor));
+    }
+}
+
+/*
+ * OUT combiner, component-alpha variant, over w pixels.
+ *
+ * Each destination pixel becomes (src * mask) multiplied by the negated
+ * expanded alpha of the old destination pixel.  ps and pm are
+ * dereferenced unconditionally.
+ */
+static void
+sse2_combine_out_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
+{
+    /* Head: single pixels while pd is not 16-byte aligned. */
+    for (; w && ((unsigned long)pd & 15); w--)
+    {
+	uint32_t src_px = *ps++;
+	uint32_t mask_px = *pm++;
+	uint32_t dst_px = *pd;
+	__m128i masked_src = pix_multiply_1x128 (unpack_32_1x128 (src_px),
+						 unpack_32_1x128 (mask_px));
+	__m128i inv_dst_alpha = negate_1x128 (
+	    expand_alpha_1x128 (unpack_32_1x128 (dst_px)));
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (masked_src, inv_dst_alpha));
+    }
+
+    /* Body: four pixels per pass. */
+    for (; w >= 4; w -= 4)
+    {
+	__m128i packed_dst = load_128_aligned ((__m128i*)pd);
+	__m128i packed_src = load_128_unaligned ((__m128i*)ps);
+	__m128i packed_mask = load_128_unaligned ((__m128i*)pm);
+	__m128i dst_lo, dst_hi;
+	__m128i src_lo, src_hi;
+	__m128i mask_lo, mask_hi;
+	__m128i inv_da_lo, inv_da_hi;
+
+	unpack_128_2x128 (packed_dst, &dst_lo, &dst_hi);
+	unpack_128_2x128 (packed_src, &src_lo, &src_hi);
+	unpack_128_2x128 (packed_mask, &mask_lo, &mask_hi);
+
+	/* Negated destination alpha, taken before dst is overwritten. */
+	expand_alpha_2x128 (dst_lo, dst_hi, &inv_da_lo, &inv_da_hi);
+	negate_2x128 (inv_da_lo, inv_da_hi, &inv_da_lo, &inv_da_hi);
+
+	/* dst <- src * mask */
+	pix_multiply_2x128 (&src_lo, &src_hi,
+			    &mask_lo, &mask_hi,
+			    &dst_lo, &dst_hi);
+
+	/* dst <- dst * (negated old dst alpha) */
+	pix_multiply_2x128 (&dst_lo, &dst_hi,
+			    &inv_da_lo, &inv_da_hi,
+			    &dst_lo, &dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (dst_lo, dst_hi));
+
+	ps += 4;
+	pd += 4;
+	pm += 4;
+    }
+
+    /* Tail: whatever is left, one pixel at a time. */
+    for (; w; w--)
+    {
+	uint32_t src_px = *ps++;
+	uint32_t mask_px = *pm++;
+	uint32_t dst_px = *pd;
+	__m128i masked_src = pix_multiply_1x128 (unpack_32_1x128 (src_px),
+						 unpack_32_1x128 (mask_px));
+	__m128i inv_dst_alpha = negate_1x128 (
+	    expand_alpha_1x128 (unpack_32_1x128 (dst_px)));
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (masked_src, inv_dst_alpha));
+    }
+}
+
+/*
+ * Reverse OUT combiner, component-alpha variant, over w pixels.
+ *
+ * Each destination pixel is multiplied by the negation of
+ * (mask * expanded source alpha).  ps and pm are dereferenced
+ * unconditionally.
+ */
+static void
+sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               pd,
+                             const uint32_t *         ps,
+                             const uint32_t *         pm,
+                             int                      w)
+{
+    /* Head: single pixels while pd is not 16-byte aligned. */
+    for (; w && ((unsigned long)pd & 15); w--)
+    {
+	uint32_t src_px = *ps++;
+	uint32_t mask_px = *pm++;
+	uint32_t dst_px = *pd;
+	__m128i src_alpha = expand_alpha_1x128 (unpack_32_1x128 (src_px));
+	__m128i inv_factor = negate_1x128 (
+	    pix_multiply_1x128 (unpack_32_1x128 (mask_px), src_alpha));
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (unpack_32_1x128 (dst_px), inv_factor));
+    }
+
+    /* Body: four pixels per pass. */
+    for (; w >= 4; w -= 4)
+    {
+	__m128i packed_dst = load_128_aligned ((__m128i*)pd);
+	__m128i packed_src = load_128_unaligned ((__m128i*)ps);
+	__m128i packed_mask = load_128_unaligned ((__m128i*)pm);
+	__m128i dst_lo, dst_hi;
+	__m128i src_lo, src_hi;
+	__m128i mask_lo, mask_hi;
+	__m128i sa_lo, sa_hi;
+
+	unpack_128_2x128 (packed_dst, &dst_lo, &dst_hi);
+	unpack_128_2x128 (packed_src, &src_lo, &src_hi);
+	unpack_128_2x128 (packed_mask, &mask_lo, &mask_hi);
+
+	expand_alpha_2x128 (src_lo, src_hi, &sa_lo, &sa_hi);
+
+	/* mask <- negate (mask * source alpha) */
+	pix_multiply_2x128 (&mask_lo, &mask_hi,
+			    &sa_lo, &sa_hi,
+			    &mask_lo, &mask_hi);
+	negate_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
+
+	/* dst <- dst * mask */
+	pix_multiply_2x128 (&dst_lo, &dst_hi,
+			    &mask_lo, &mask_hi,
+			    &dst_lo, &dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (dst_lo, dst_hi));
+
+	ps += 4;
+	pd += 4;
+	pm += 4;
+    }
+
+    /* Tail: whatever is left, one pixel at a time. */
+    for (; w; w--)
+    {
+	uint32_t src_px = *ps++;
+	uint32_t mask_px = *pm++;
+	uint32_t dst_px = *pd;
+	__m128i src_alpha = expand_alpha_1x128 (unpack_32_1x128 (src_px));
+	__m128i inv_factor = negate_1x128 (
+	    pix_multiply_1x128 (unpack_32_1x128 (mask_px), src_alpha));
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (unpack_32_1x128 (dst_px), inv_factor));
+    }
+}
+
+/*
+ * Single-pixel ATOP, component-alpha variant.  pix_add_multiply_1x128
+ * receives the pairs (dst, negated (mask * src alpha)) and
+ * (src * mask, dst alpha).
+ */
+static force_inline uint32_t
+core_combine_atop_ca_pixel_sse2 (uint32_t src,
+                                 uint32_t mask,
+                                 uint32_t dst)
+{
+    __m128i mask_px = unpack_32_1x128 (mask);
+    __m128i src_px = unpack_32_1x128 (src);
+    __m128i dst_px = unpack_32_1x128 (dst);
+    __m128i src_alpha = expand_alpha_1x128 (src_px);
+    __m128i dst_alpha = expand_alpha_1x128 (dst_px);
+    __m128i masked_src = pix_multiply_1x128 (src_px, mask_px);
+    __m128i inv_mask_sa = negate_1x128 (
+	pix_multiply_1x128 (mask_px, src_alpha));
+    __m128i result;
+
+    result = pix_add_multiply_1x128 (&dst_px, &inv_mask_sa,
+				     &masked_src, &dst_alpha);
+
+    return pack_1x128_32 (result);
+}
+
+/*
+ * ATOP combiner, component-alpha variant, over w pixels.
+ *
+ * pd is read and written in place; ps and pm are dereferenced
+ * unconditionally.  The four-pixel body mirrors the scalar helper: the
+ * source is multiplied by the mask, the mask by the source alpha (then
+ * negated), and pix_add_multiply_2x128 combines the two products.
+ */
+static void
+sse2_combine_atop_ca (pixman_implementation_t *imp,
+                      pixman_op_t              op,
+                      uint32_t *               pd,
+                      const uint32_t *         ps,
+                      const uint32_t *         pm,
+                      int                      w)
+{
+    /* Head: single pixels while pd is not 16-byte aligned. */
+    for (; w && ((unsigned long)pd & 15); w--)
+    {
+	uint32_t src_px = *ps++;
+	uint32_t mask_px = *pm++;
+	uint32_t dst_px = *pd;
+
+	*pd++ = core_combine_atop_ca_pixel_sse2 (src_px, mask_px, dst_px);
+    }
+
+    /* Body: four pixels per pass. */
+    for (; w >= 4; w -= 4)
+    {
+	__m128i packed_dst = load_128_aligned ((__m128i*)pd);
+	__m128i packed_src = load_128_unaligned ((__m128i*)ps);
+	__m128i packed_mask = load_128_unaligned ((__m128i*)pm);
+	__m128i dst_lo, dst_hi;
+	__m128i src_lo, src_hi;
+	__m128i mask_lo, mask_hi;
+	__m128i sa_lo, sa_hi;
+	__m128i da_lo, da_hi;
+
+	unpack_128_2x128 (packed_dst, &dst_lo, &dst_hi);
+	unpack_128_2x128 (packed_src, &src_lo, &src_hi);
+	unpack_128_2x128 (packed_mask, &mask_lo, &mask_hi);
+
+	/* Alphas are taken before src is overwritten below. */
+	expand_alpha_2x128 (src_lo, src_hi, &sa_lo, &sa_hi);
+	expand_alpha_2x128 (dst_lo, dst_hi, &da_lo, &da_hi);
+
+	/* src <- src * mask */
+	pix_multiply_2x128 (&src_lo, &src_hi,
+			    &mask_lo, &mask_hi,
+			    &src_lo, &src_hi);
+	/* mask <- negate (mask * source alpha) */
+	pix_multiply_2x128 (&mask_lo, &mask_hi,
+			    &sa_lo, &sa_hi,
+			    &mask_lo, &mask_hi);
+	negate_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
+
+	pix_add_multiply_2x128 (
+	    &dst_lo, &dst_hi, &mask_lo, &mask_hi,
+	    &src_lo, &src_hi, &da_lo, &da_hi,
+	    &dst_lo, &dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (dst_lo, dst_hi));
+
+	ps += 4;
+	pd += 4;
+	pm += 4;
+    }
+
+    /* Tail: whatever is left, one pixel at a time. */
+    for (; w; w--)
+    {
+	uint32_t src_px = *ps++;
+	uint32_t mask_px = *pm++;
+	uint32_t dst_px = *pd;
+
+	*pd++ = core_combine_atop_ca_pixel_sse2 (src_px, mask_px, dst_px);
+    }
+}
+
+/*
+ * Single-pixel reverse ATOP, component-alpha variant.
+ * pix_add_multiply_1x128 receives the pairs (dst, mask * src alpha) and
+ * (src * mask, negated dst alpha).
+ */
+static force_inline uint32_t
+core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
+                                         uint32_t mask,
+                                         uint32_t dst)
+{
+    __m128i mask_px = unpack_32_1x128 (mask);
+    __m128i src_px = unpack_32_1x128 (src);
+    __m128i dst_px = unpack_32_1x128 (dst);
+    __m128i inv_dst_alpha = negate_1x128 (expand_alpha_1x128 (dst_px));
+    __m128i src_alpha = expand_alpha_1x128 (src_px);
+    __m128i masked_src = pix_multiply_1x128 (src_px, mask_px);
+    __m128i mask_sa = pix_multiply_1x128 (mask_px, src_alpha);
+    __m128i result;
+
+    result = pix_add_multiply_1x128 (&dst_px, &mask_sa,
+				     &masked_src, &inv_dst_alpha);
+
+    return pack_1x128_32 (result);
+}
+
+/*
+ * Reverse ATOP combiner, component-alpha variant, over w pixels.
+ *
+ * pd is read and written in place; ps and pm are dereferenced
+ * unconditionally.  The four-pixel body mirrors the scalar helper: the
+ * source is multiplied by the mask, the mask by the source alpha, the
+ * destination alpha is negated, and pix_add_multiply_2x128 combines the
+ * two products.
+ */
+static void
+sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              uint32_t *               pd,
+                              const uint32_t *         ps,
+                              const uint32_t *         pm,
+                              int                      w)
+{
+    /* Head: single pixels while pd is not 16-byte aligned. */
+    for (; w && ((unsigned long)pd & 15); w--)
+    {
+	uint32_t src_px = *ps++;
+	uint32_t mask_px = *pm++;
+	uint32_t dst_px = *pd;
+
+	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (
+	    src_px, mask_px, dst_px);
+    }
+
+    /* Body: four pixels per pass. */
+    for (; w >= 4; w -= 4)
+    {
+	__m128i packed_dst = load_128_aligned ((__m128i*)pd);
+	__m128i packed_src = load_128_unaligned ((__m128i*)ps);
+	__m128i packed_mask = load_128_unaligned ((__m128i*)pm);
+	__m128i dst_lo, dst_hi;
+	__m128i src_lo, src_hi;
+	__m128i mask_lo, mask_hi;
+	__m128i sa_lo, sa_hi;
+	__m128i da_lo, da_hi;
+
+	unpack_128_2x128 (packed_dst, &dst_lo, &dst_hi);
+	unpack_128_2x128 (packed_src, &src_lo, &src_hi);
+	unpack_128_2x128 (packed_mask, &mask_lo, &mask_hi);
+
+	/* Alphas are taken before src is overwritten below. */
+	expand_alpha_2x128 (src_lo, src_hi, &sa_lo, &sa_hi);
+	expand_alpha_2x128 (dst_lo, dst_hi, &da_lo, &da_hi);
+
+	/* src <- src * mask */
+	pix_multiply_2x128 (&src_lo, &src_hi,
+			    &mask_lo, &mask_hi,
+			    &src_lo, &src_hi);
+	/* mask <- mask * source alpha */
+	pix_multiply_2x128 (&mask_lo, &mask_hi,
+			    &sa_lo, &sa_hi,
+			    &mask_lo, &mask_hi);
+
+	negate_2x128 (da_lo, da_hi, &da_lo, &da_hi);
+
+	pix_add_multiply_2x128 (
+	    &dst_lo, &dst_hi, &mask_lo, &mask_hi,
+	    &src_lo, &src_hi, &da_lo, &da_hi,
+	    &dst_lo, &dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (dst_lo, dst_hi));
+
+	ps += 4;
+	pd += 4;
+	pm += 4;
+    }
+
+    /* Tail: whatever is left, one pixel at a time. */
+    for (; w; w--)
+    {
+	uint32_t src_px = *ps++;
+	uint32_t mask_px = *pm++;
+	uint32_t dst_px = *pd;
+
+	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (
+	    src_px, mask_px, dst_px);
+    }
+}
+
+/*
+ * Single-pixel XOR, component-alpha variant.  pix_add_multiply_1x128
+ * receives the pairs (dst, negated (mask * src alpha)) and
+ * (src * mask, negated dst alpha).
+ */
+static force_inline uint32_t
+core_combine_xor_ca_pixel_sse2 (uint32_t src,
+                                uint32_t mask,
+                                uint32_t dst)
+{
+    __m128i mask_px = unpack_32_1x128 (mask);
+    __m128i src_px = unpack_32_1x128 (src);
+    __m128i dst_px = unpack_32_1x128 (dst);
+    __m128i inv_mask_sa = negate_1x128 (
+	pix_multiply_1x128 (mask_px, expand_alpha_1x128 (src_px)));
+    __m128i masked_src = pix_multiply_1x128 (src_px, mask_px);
+    __m128i inv_dst_alpha = negate_1x128 (expand_alpha_1x128 (dst_px));
+    __m128i result;
+
+    result = pix_add_multiply_1x128 (&dst_px, &inv_mask_sa,
+				     &masked_src, &inv_dst_alpha);
+
+    return pack_1x128_32 (result);
+}
+
+/*
+ * XOR combiner, component-alpha variant, over w pixels.
+ *
+ * pd is read and written in place; ps and pm are dereferenced
+ * unconditionally.  The four-pixel body mirrors the scalar helper: the
+ * source is multiplied by the mask, the mask by the source alpha, both
+ * that product and the destination alpha are negated, and
+ * pix_add_multiply_2x128 combines the two products.
+ */
+static void
+sse2_combine_xor_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
+{
+    /* Head: single pixels while pd is not 16-byte aligned. */
+    for (; w && ((unsigned long)pd & 15); w--)
+    {
+	uint32_t src_px = *ps++;
+	uint32_t mask_px = *pm++;
+	uint32_t dst_px = *pd;
+
+	*pd++ = core_combine_xor_ca_pixel_sse2 (src_px, mask_px, dst_px);
+    }
+
+    /* Body: four pixels per pass. */
+    for (; w >= 4; w -= 4)
+    {
+	__m128i packed_dst = load_128_aligned ((__m128i*)pd);
+	__m128i packed_src = load_128_unaligned ((__m128i*)ps);
+	__m128i packed_mask = load_128_unaligned ((__m128i*)pm);
+	__m128i dst_lo, dst_hi;
+	__m128i src_lo, src_hi;
+	__m128i mask_lo, mask_hi;
+	__m128i sa_lo, sa_hi;
+	__m128i da_lo, da_hi;
+
+	unpack_128_2x128 (packed_dst, &dst_lo, &dst_hi);
+	unpack_128_2x128 (packed_src, &src_lo, &src_hi);
+	unpack_128_2x128 (packed_mask, &mask_lo, &mask_hi);
+
+	/* Alphas are taken before src is overwritten below. */
+	expand_alpha_2x128 (src_lo, src_hi, &sa_lo, &sa_hi);
+	expand_alpha_2x128 (dst_lo, dst_hi, &da_lo, &da_hi);
+
+	/* src <- src * mask */
+	pix_multiply_2x128 (&src_lo, &src_hi,
+			    &mask_lo, &mask_hi,
+			    &src_lo, &src_hi);
+	/* mask <- mask * source alpha */
+	pix_multiply_2x128 (&mask_lo, &mask_hi,
+			    &sa_lo, &sa_hi,
+			    &mask_lo, &mask_hi);
+
+	negate_2x128 (da_lo, da_hi, &da_lo, &da_hi);
+	negate_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
+
+	pix_add_multiply_2x128 (
+	    &dst_lo, &dst_hi, &mask_lo, &mask_hi,
+	    &src_lo, &src_hi, &da_lo, &da_hi,
+	    &dst_lo, &dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (dst_lo, dst_hi));
+
+	ps += 4;
+	pd += 4;
+	pm += 4;
+    }
+
+    /* Tail: whatever is left, one pixel at a time. */
+    for (; w; w--)
+    {
+	uint32_t src_px = *ps++;
+	uint32_t mask_px = *pm++;
+	uint32_t dst_px = *pd;
+
+	*pd++ = core_combine_xor_ca_pixel_sse2 (src_px, mask_px, dst_px);
+    }
+}
+
+/*
+ * ADD combiner, component-alpha variant, over w pixels.
+ *
+ * Each destination pixel becomes the _mm_adds_epu8 sum of (src * mask)
+ * and the old destination, computed on the unpacked representation and
+ * then packed.  ps and pm are dereferenced unconditionally.
+ */
+static void
+sse2_combine_add_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
+{
+    /* Head: single pixels while pd is not 16-byte aligned. */
+    for (; w && ((unsigned long)pd & 15); w--)
+    {
+	uint32_t src_px = *ps++;
+	uint32_t mask_px = *pm++;
+	uint32_t dst_px = *pd;
+	__m128i masked_src = pix_multiply_1x128 (unpack_32_1x128 (src_px),
+						 unpack_32_1x128 (mask_px));
+
+	*pd++ = pack_1x128_32 (
+	    _mm_adds_epu8 (masked_src, unpack_32_1x128 (dst_px)));
+    }
+
+    /* Body: four pixels per pass. */
+    for (; w >= 4; w -= 4)
+    {
+	__m128i packed_src = load_128_unaligned ((__m128i*)ps);
+	__m128i packed_mask = load_128_unaligned ((__m128i*)pm);
+	__m128i packed_dst = load_128_aligned ((__m128i*)pd);
+	__m128i src_lo, src_hi;
+	__m128i mask_lo, mask_hi;
+	__m128i dst_lo, dst_hi;
+	__m128i sum_lo, sum_hi;
+
+	unpack_128_2x128 (packed_src, &src_lo, &src_hi);
+	unpack_128_2x128 (packed_mask, &mask_lo, &mask_hi);
+	unpack_128_2x128 (packed_dst, &dst_lo, &dst_hi);
+
+	/* src <- src * mask */
+	pix_multiply_2x128 (&src_lo, &src_hi,
+			    &mask_lo, &mask_hi,
+			    &src_lo, &src_hi);
+
+	sum_lo = _mm_adds_epu8 (src_lo, dst_lo);
+	sum_hi = _mm_adds_epu8 (src_hi, dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (sum_lo, sum_hi));
+
+	ps += 4;
+	pd += 4;
+	pm += 4;
+    }
+
+    /* Tail: whatever is left, one pixel at a time. */
+    for (; w; w--)
+    {
+	uint32_t src_px = *ps++;
+	uint32_t mask_px = *pm++;
+	uint32_t dst_px = *pd;
+	__m128i masked_src = pix_multiply_1x128 (unpack_32_1x128 (src_px),
+						 unpack_32_1x128 (mask_px));
+
+	*pd++ = pack_1x128_32 (
+	    _mm_adds_epu8 (masked_src, unpack_32_1x128 (dst_px)));
+    }
+}
+
+/* Return a vector whose eight 16-bit lanes all contain `mask'. */
+static force_inline __m128i
+create_mask_16_128 (uint16_t mask)
+{
+    const __m128i replicated = _mm_set1_epi16 (mask);
+
+    return replicated;
+}
+
+/*
+ * create_mask_2x32_128 (mask0, mask1) builds a vector whose four 32-bit
+ * lanes are, from most to least significant, mask0, mask1, mask0, mask1.
+ *
+ * Sun Studio 12 has a code generation bug that is worked around by
+ * providing the helper as a macro for that compiler.
+ */
+#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
+# define create_mask_2x32_128(mask0, mask1)				\
+    (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
+#else
+static force_inline __m128i
+create_mask_2x32_128 (uint32_t mask0,
+                      uint32_t mask1)
+{
+    const __m128i pattern = _mm_set_epi32 (mask0, mask1, mask0, mask1);
+
+    return pattern;
+}
+#endif
+
+/*
+ * Fast path that blends a solid source colour onto a 32-bit destination
+ * rectangle using over_1x128()/over_2x128() (defined elsewhere in this
+ * file).  Returns without touching the destination when the solid colour
+ * is 0.  The mask image and the src/mask coordinates are not used.
+ */
+static void
+sse2_composite_over_n_8888 (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            pixman_image_t *         src_image,
+                            pixman_image_t *         mask_image,
+                            pixman_image_t *         dst_image,
+                            int32_t                  src_x,
+                            int32_t                  src_y,
+                            int32_t                  mask_x,
+                            int32_t                  mask_y,
+                            int32_t                  dest_x,
+                            int32_t                  dest_y,
+                            int32_t                  width,
+                            int32_t                  height)
+{
+    uint32_t src;
+    uint32_t *dst_line;
+    int dst_stride;
+    __m128i xmm_src, xmm_alpha;
+
+    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    /* Both are loop invariant: the expanded colour and its alpha. */
+    xmm_src = expand_pixel_32_1x128 (src);
+    xmm_alpha = expand_alpha_1x128 (xmm_src);
+
+    for (; height != 0; height--)
+    {
+	uint32_t *px = dst_line;
+	int32_t left = width;
+
+	dst_line += dst_stride;
+
+	/* Single pixels until px is 16-byte aligned. */
+	for (; left != 0 && ((unsigned long)px & 15) != 0; left--, px++)
+	{
+	    const uint32_t d = *px;
+
+	    *px = pack_1x128_32 (over_1x128 (xmm_src,
+					     xmm_alpha,
+					     unpack_32_1x128 (d)));
+	}
+
+	/* Four pixels per iteration with aligned load and store. */
+	for (; left >= 4; left -= 4, px += 4)
+	{
+	    __m128i packed, lo, hi;
+
+	    packed = load_128_aligned ((__m128i*)px);
+
+	    unpack_128_2x128 (packed, &lo, &hi);
+
+	    over_2x128 (&xmm_src, &xmm_src,
+			&xmm_alpha, &xmm_alpha,
+			&lo, &hi);
+
+	    /* rebuild the 4 pixel data and save */
+	    save_128_aligned ((__m128i*)px, pack_2x128_128 (lo, hi));
+	}
+
+	/* Remaining pixels of the row. */
+	for (; left != 0; left--, px++)
+	{
+	    const uint32_t d = *px;
+
+	    *px = pack_1x128_32 (over_1x128 (xmm_src,
+					     xmm_alpha,
+					     unpack_32_1x128 (d)));
+	}
+    }
+}
+
+/*
+ * Fast path that blends a solid source colour onto a 16-bit (565)
+ * destination rectangle.  Destination pixels are expanded with the
+ * expand565/unpack_565 helpers, blended with over_1x128()/over_2x128()
+ * and packed back to 565 (all helpers are defined elsewhere in this
+ * file).  Returns immediately when the solid colour is 0.  The mask image
+ * and the src/mask coordinates are not used.
+ */
+static void
+sse2_composite_over_n_0565 (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            pixman_image_t *         src_image,
+                            pixman_image_t *         mask_image,
+                            pixman_image_t *         dst_image,
+                            int32_t                  src_x,
+                            int32_t                  src_y,
+                            int32_t                  mask_x,
+                            int32_t                  mask_y,
+                            int32_t                  dest_x,
+                            int32_t                  dest_y,
+                            int32_t                  width,
+                            int32_t                  height)
+{
+    uint32_t src;
+    uint16_t *dst_line;
+    int dst_stride;
+    __m128i xmm_src, xmm_alpha;
+
+    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+    xmm_src = expand_pixel_32_1x128 (src);
+    xmm_alpha = expand_alpha_1x128 (xmm_src);
+
+    for (; height != 0; height--)
+    {
+	uint16_t *px = dst_line;
+	int32_t left = width;
+
+	dst_line += dst_stride;
+
+	/* Single pixels until px is 16-byte aligned. */
+	for (; left != 0 && ((unsigned long)px & 15) != 0; left--, px++)
+	{
+	    const uint16_t d = *px;
+
+	    *px = pack_565_32_16 (
+		pack_1x128_32 (over_1x128 (xmm_src,
+					   xmm_alpha,
+					   expand565_16_1x128 (d))));
+	}
+
+	/* One aligned 128-bit vector = eight 16-bit pixels per iteration. */
+	for (; left >= 8; left -= 8, px += 8)
+	{
+	    __m128i packed, q0, q1, q2, q3;
+
+	    packed = load_128_aligned ((__m128i*)px);
+
+	    unpack_565_128_4x128 (packed, &q0, &q1, &q2, &q3);
+
+	    over_2x128 (&xmm_src, &xmm_src,
+			&xmm_alpha, &xmm_alpha,
+			&q0, &q1);
+	    over_2x128 (&xmm_src, &xmm_src,
+			&xmm_alpha, &xmm_alpha,
+			&q2, &q3);
+
+	    packed = pack_565_4x128_128 (&q0, &q1, &q2, &q3);
+
+	    save_128_aligned ((__m128i*)px, packed);
+	}
+
+	/* Remaining pixels of the row. */
+	for (; left != 0; left--, px++)
+	{
+	    const uint16_t d = *px;
+
+	    *px = pack_565_32_16 (
+		pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
+					   expand565_16_1x128 (d))));
+	}
+    }
+
+}
+
+/*
+ * Fast path taking a solid source, a per-pixel 32-bit mask and a 32-bit
+ * destination.  For every pixel whose mask value is non-zero it stores
+ *
+ *     dst = saturating_add (pix_multiply (mask, src), dst)
+ *
+ * using a per-byte unsigned saturating add (_mm_adds_epu8); pixels whose
+ * mask is 0 are left untouched.  Returns immediately when the solid colour
+ * is 0.  The "_ca" suffix presumably stands for "component alpha".
+ *
+ * pix_multiply_*() and the pack/unpack helpers are defined elsewhere in
+ * this file.  op and the src coordinates are not used.
+ */
+static void
+sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
+				   pixman_op_t              op,
+				   pixman_image_t *         src_image,
+				   pixman_image_t *         mask_image,
+				   pixman_image_t *         dst_image,
+				   int32_t                  src_x,
+				   int32_t                  src_y,
+				   int32_t                  mask_x,
+				   int32_t                  mask_y,
+				   int32_t                  dest_x,
+				   int32_t                  dest_y,
+				   int32_t                  width,
+				   int32_t                  height)
+{
+    uint32_t src;
+    uint32_t    *dst_line, d;
+    uint32_t    *mask_line, m;
+    uint32_t pack_cmp;
+    int dst_stride, mask_stride;
+
+    __m128i xmm_src;
+    __m128i xmm_dst;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    __m128i mmx_src, mmx_mask, mmx_dest;
+
+    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    /* Two copies of the solid pixel, widened from bytes to 16-bit lanes. */
+    xmm_src = _mm_unpacklo_epi8 (
+	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
+    mmx_src   = xmm_src;
+
+    while (height--)
+    {
+	int w = width;
+	const uint32_t *pm = (uint32_t *)mask_line;
+	uint32_t *pd = (uint32_t *)dst_line;
+
+	dst_line += dst_stride;
+	mask_line += mask_stride;
+
+	/* Single pixels until pd is 16-byte aligned. */
+	while (w && (unsigned long)pd & 15)
+	{
+	    m = *pm++;
+
+	    if (m)
+	    {
+		d = *pd;
+
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
+
+		*pd = pack_1x128_32 (
+		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
+				   mmx_dest));
+	    }
+
+	    pd++;
+	    w--;
+	}
+
+	/* Four pixels per iteration; the mask may be unaligned. */
+	while (w >= 4)
+	{
+	    xmm_mask = load_128_unaligned ((__m128i*)pm);
+
+	    pack_cmp =
+		_mm_movemask_epi8 (
+		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+	    /* pack_cmp is 0xffff only when all four mask pixels are zero,
+	     * in which case the destination is left as it is.
+	     */
+	    if (pack_cmp != 0xffff)
+	    {
+		xmm_dst = load_128_aligned ((__m128i*)pd);
+
+		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+		/* mask = src * mask, written back over the mask vectors. */
+		pix_multiply_2x128 (&xmm_src, &xmm_src,
+				    &xmm_mask_lo, &xmm_mask_hi,
+				    &xmm_mask_lo, &xmm_mask_hi);
+		xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
+
+		save_128_aligned (
+		    (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
+	    }
+
+	    pd += 4;
+	    pm += 4;
+	    w -= 4;
+	}
+
+	/* Remaining pixels of the row. */
+	while (w)
+	{
+	    m = *pm++;
+
+	    if (m)
+	    {
+		d = *pd;
+
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
+
+		*pd = pack_1x128_32 (
+		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
+				   mmx_dest));
+	    }
+
+	    pd++;
+	    w--;
+	}
+    }
+
+}
+
+/*
+ * Fast path taking a solid source, a per-pixel 32-bit mask and a 32-bit
+ * destination.  Every destination pixel whose mask value is non-zero is
+ * replaced by the result of in_over_1x128()/in_over_2x128() applied to
+ * (source, source alpha, mask, destination); pixels whose mask is 0 are
+ * left untouched.  Returns immediately when the solid colour is 0.
+ *
+ * The in_over helpers are defined elsewhere in this file; presumably they
+ * compute (src IN mask) OVER dest, and "_ca" presumably stands for
+ * "component alpha".  op and the src coordinates are not used.
+ */
+static void
+sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+                                    pixman_op_t              op,
+                                    pixman_image_t *         src_image,
+                                    pixman_image_t *         mask_image,
+                                    pixman_image_t *         dst_image,
+                                    int32_t                  src_x,
+                                    int32_t                  src_y,
+                                    int32_t                  mask_x,
+                                    int32_t                  mask_y,
+                                    int32_t                  dest_x,
+                                    int32_t                  dest_y,
+                                    int32_t                  width,
+                                    int32_t                  height)
+{
+    uint32_t src;
+    uint32_t    *dst_line, d;
+    uint32_t    *mask_line, m;
+    uint32_t pack_cmp;
+    int dst_stride, mask_stride;
+
+    __m128i xmm_src, xmm_alpha;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    /* Two copies of the solid pixel, widened from bytes to 16-bit lanes;
+     * the mmx_* variables are plain copies used by the scalar loops.
+     */
+    xmm_src = _mm_unpacklo_epi8 (
+	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
+    xmm_alpha = expand_alpha_1x128 (xmm_src);
+    mmx_src   = xmm_src;
+    mmx_alpha = xmm_alpha;
+
+    while (height--)
+    {
+	int w = width;
+	const uint32_t *pm = (uint32_t *)mask_line;
+	uint32_t *pd = (uint32_t *)dst_line;
+
+	dst_line += dst_stride;
+	mask_line += mask_stride;
+
+	/* Single pixels until pd is 16-byte aligned. */
+	while (w && (unsigned long)pd & 15)
+	{
+	    m = *pm++;
+
+	    if (m)
+	    {
+		d = *pd;
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
+
+		*pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
+		                                  &mmx_alpha,
+		                                  &mmx_mask,
+		                                  &mmx_dest));
+	    }
+
+	    pd++;
+	    w--;
+	}
+
+	/* Four pixels per iteration; the mask may be unaligned. */
+	while (w >= 4)
+	{
+	    xmm_mask = load_128_unaligned ((__m128i*)pm);
+
+	    pack_cmp =
+		_mm_movemask_epi8 (
+		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+	    /* pack_cmp is 0xffff only when all four mask pixels are zero,
+	     * in which case the destination is left as it is.
+	     */
+	    if (pack_cmp != 0xffff)
+	    {
+		xmm_dst = load_128_aligned ((__m128i*)pd);
+
+		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+		/* The packed result is taken from xmm_dst_lo/hi below, so
+		 * in_over_2x128 is expected to write its result there.
+		 */
+		in_over_2x128 (&xmm_src, &xmm_src,
+			       &xmm_alpha, &xmm_alpha,
+			       &xmm_mask_lo, &xmm_mask_hi,
+			       &xmm_dst_lo, &xmm_dst_hi);
+
+		save_128_aligned (
+		    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	    }
+
+	    pd += 4;
+	    pm += 4;
+	    w -= 4;
+	}
+
+	/* Remaining pixels of the row. */
+	while (w)
+	{
+	    m = *pm++;
+
+	    if (m)
+	    {
+		d = *pd;
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
+
+		*pd = pack_1x128_32 (
+		    in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
+	    }
+
+	    pd++;
+	    w--;
+	}
+    }
+
+}
+
+/*
+ * Fast path taking a 32-bit source image, a solid mask and a 32-bit
+ * destination.  Only the top byte (mask >> 24) of the solid mask is used;
+ * it is replicated into every 16-bit lane of xmm_mask.  Destination pixels
+ * are replaced by the result of in_over_1x128()/in_over_2x128() applied to
+ * (source, source alpha, mask, destination).  Source pixels equal to 0
+ * leave the destination untouched.
+ *
+ * The in_over helpers are defined elsewhere in this file; presumably they
+ * compute (src IN mask) OVER dest.  op and the mask coordinates are not
+ * used.
+ */
+static void
+sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
+                                 pixman_op_t              op,
+                                 pixman_image_t *         src_image,
+                                 pixman_image_t *         mask_image,
+                                 pixman_image_t *         dst_image,
+                                 int32_t                  src_x,
+                                 int32_t                  src_y,
+                                 int32_t                  mask_x,
+                                 int32_t                  mask_y,
+                                 int32_t                  dest_x,
+                                 int32_t                  dest_y,
+                                 int32_t                  width,
+                                 int32_t                  height)
+{
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    uint32_t mask;
+    int32_t w;
+    int dst_stride, src_stride;
+
+    __m128i xmm_mask;
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
+
+    xmm_mask = create_mask_16_128 (mask >> 24);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	/* Single pixels until dst is 16-byte aligned. */
+	while (w && (unsigned long)dst & 15)
+	{
+	    uint32_t s = *src++;
+
+	    if (s)
+	    {
+		uint32_t d = *dst;
+
+		/* in_over_1x128 takes its operands by pointer, so the
+		 * mask is passed through a local copy.
+		 */
+		__m128i ms    = unpack_32_1x128 (s);
+		__m128i alpha = expand_alpha_1x128 (ms);
+		__m128i vmask = xmm_mask;
+		__m128i dest  = unpack_32_1x128 (d);
+
+		*dst = pack_1x128_32 (
+		    in_over_1x128 (&ms, &alpha, &vmask, &dest));
+	    }
+	    dst++;
+	    w--;
+	}
+
+	/* Four pixels per iteration; the source may be unaligned. */
+	while (w >= 4)
+	{
+	    xmm_src = load_128_unaligned ((__m128i*)src);
+
+	    /* Skip the group when is_zero() reports an all-zero source. */
+	    if (!is_zero (xmm_src))
+	    {
+		xmm_dst = load_128_aligned ((__m128i*)dst);
+
+		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+				    &xmm_alpha_lo, &xmm_alpha_hi);
+
+		in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+			       &xmm_alpha_lo, &xmm_alpha_hi,
+			       &xmm_mask, &xmm_mask,
+			       &xmm_dst_lo, &xmm_dst_hi);
+
+		save_128_aligned (
+		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	    }
+
+	    dst += 4;
+	    src += 4;
+	    w -= 4;
+	}
+
+	/* Remaining pixels of the row. */
+	while (w)
+	{
+	    uint32_t s = *src++;
+
+	    if (s)
+	    {
+		uint32_t d = *dst;
+
+		__m128i ms    = unpack_32_1x128 (s);
+		__m128i alpha = expand_alpha_1x128 (ms);
+		__m128i vmask = xmm_mask;
+		__m128i dest  = unpack_32_1x128 (d);
+
+		*dst = pack_1x128_32 (
+		    in_over_1x128 (&ms, &alpha, &vmask, &dest));
+	    }
+
+	    dst++;
+	    w--;
+	}
+    }
+
+}
+
+/*
+ * Fast path that copies a 32-bit source rectangle to a 32-bit destination
+ * rectangle while forcing the top byte of every pixel to 0xff.  imp, op,
+ * the mask image and the mask coordinates are not used.
+ */
+static void
+sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
+			      pixman_op_t              op,
+			      pixman_image_t *         src_image,
+			      pixman_image_t *         mask_image,
+			      pixman_image_t *         dst_image,
+			      int32_t                  src_x,
+			      int32_t                  src_y,
+			      int32_t                  mask_x,
+			      int32_t                  mask_y,
+			      int32_t                  dest_x,
+			      int32_t                  dest_y,
+			      int32_t                  width,
+			      int32_t                  height)
+{
+    uint32_t *dst_line, *src_line;
+    int dst_stride, src_stride;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    for (; height != 0; height--)
+    {
+	uint32_t *out = dst_line;
+	uint32_t *in = src_line;
+	int32_t left = width;
+
+	dst_line += dst_stride;
+	src_line += src_stride;
+
+	/* Single pixels until the destination is 16-byte aligned. */
+	for (; left != 0 && ((unsigned long)out & 15) != 0; left--)
+	    *out++ = *in++ | 0xff000000;
+
+	/* Sixteen pixels per iteration: four loads, then four stores. */
+	for (; left >= 16; left -= 16, in += 16, out += 16)
+	{
+	    __m128i q0, q1, q2, q3;
+
+	    q0 = load_128_unaligned ((__m128i*)in + 0);
+	    q1 = load_128_unaligned ((__m128i*)in + 1);
+	    q2 = load_128_unaligned ((__m128i*)in + 2);
+	    q3 = load_128_unaligned ((__m128i*)in + 3);
+
+	    save_128_aligned ((__m128i*)out + 0, _mm_or_si128 (q0, mask_ff000000));
+	    save_128_aligned ((__m128i*)out + 1, _mm_or_si128 (q1, mask_ff000000));
+	    save_128_aligned ((__m128i*)out + 2, _mm_or_si128 (q2, mask_ff000000));
+	    save_128_aligned ((__m128i*)out + 3, _mm_or_si128 (q3, mask_ff000000));
+	}
+
+	/* Remaining pixels of the row. */
+	for (; left != 0; left--)
+	    *out++ = *in++ | 0xff000000;
+    }
+
+}
+
+/*
+ * Fast path taking a 32-bit source image whose top byte is ignored, a
+ * solid mask and a 32-bit destination.  Every source pixel has its top
+ * byte forced to 0xff before blending, and the file-level constant
+ * mask_00ff is used as the source alpha vector (presumably 0x00ff in
+ * every 16-bit lane, i.e. fully opaque).  Only the top byte (mask >> 24)
+ * of the solid mask is used.  Destination pixels are replaced by the
+ * result of in_over_1x128()/in_over_2x128(), which are defined elsewhere
+ * in this file.  op and the mask coordinates are not used.
+ */
+static void
+sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
+                                 pixman_op_t              op,
+                                 pixman_image_t *         src_image,
+                                 pixman_image_t *         mask_image,
+                                 pixman_image_t *         dst_image,
+                                 int32_t                  src_x,
+                                 int32_t                  src_y,
+                                 int32_t                  mask_x,
+                                 int32_t                  mask_y,
+                                 int32_t                  dest_x,
+                                 int32_t                  dest_y,
+                                 int32_t                  width,
+                                 int32_t                  height)
+{
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    uint32_t mask;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    __m128i xmm_mask, xmm_alpha;
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
+
+    xmm_mask = create_mask_16_128 (mask >> 24);
+    xmm_alpha = mask_00ff;
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	/* Single pixels until dst is 16-byte aligned. */
+	while (w && (unsigned long)dst & 15)
+	{
+	    uint32_t s = (*src++) | 0xff000000;
+	    uint32_t d = *dst;
+
+	    /* in_over_1x128 takes its operands by pointer, so the shared
+	     * alpha and mask vectors are passed through local copies.
+	     */
+	    __m128i ms    = unpack_32_1x128 (s);
+	    __m128i alpha = xmm_alpha;
+	    __m128i vmask = xmm_mask;
+	    __m128i dest  = unpack_32_1x128 (d);
+
+	    *dst++ = pack_1x128_32 (
+		in_over_1x128 (&ms, &alpha, &vmask, &dest));
+
+	    w--;
+	}
+
+	/* Four pixels per iteration; the source may be unaligned. */
+	while (w >= 4)
+	{
+	    xmm_src = _mm_or_si128 (
+		load_128_unaligned ((__m128i*)src), mask_ff000000);
+	    xmm_dst = load_128_aligned ((__m128i*)dst);
+
+	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+			   &xmm_alpha, &xmm_alpha,
+			   &xmm_mask, &xmm_mask,
+			   &xmm_dst_lo, &xmm_dst_hi);
+
+	    save_128_aligned (
+		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	    dst += 4;
+	    src += 4;
+	    w -= 4;
+
+	}
+
+	/* Remaining pixels of the row. */
+	while (w)
+	{
+	    uint32_t s = (*src++) | 0xff000000;
+	    uint32_t d = *dst;
+
+	    __m128i ms    = unpack_32_1x128 (s);
+	    __m128i alpha = xmm_alpha;
+	    __m128i vmask = xmm_mask;
+	    __m128i dest  = unpack_32_1x128 (d);
+
+	    *dst++ = pack_1x128_32 (
+		in_over_1x128 (&ms, &alpha, &vmask, &dest));
+
+	    w--;
+	}
+    }
+
+}
+
+/*
+ * Fast path for a 32-bit source onto a 32-bit destination: every scanline
+ * of the rectangle is handed to sse2_combine_over_u() with a NULL mask.
+ * The mask image and the mask coordinates are not used.
+ */
+static void
+sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
+                               pixman_op_t              op,
+                               pixman_image_t *         src_image,
+                               pixman_image_t *         mask_image,
+                               pixman_image_t *         dst_image,
+                               int32_t                  src_x,
+                               int32_t                  src_y,
+                               int32_t                  mask_x,
+                               int32_t                  mask_y,
+                               int32_t                  dest_x,
+                               int32_t                  dest_y,
+                               int32_t                  width,
+                               int32_t                  height)
+{
+    uint32_t *dst_line, *src_line;
+    int dst_stride, src_stride;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    for (; height != 0; height--)
+    {
+	sse2_combine_over_u (imp, op, dst_line, src_line, NULL, width);
+
+	dst_line += dst_stride;
+	src_line += src_stride;
+    }
+}
+
+/*
+ * Blend one 32-bit source pixel onto one 16-bit (565) destination pixel
+ * with over_1x128() and return the result packed back to 565.
+ */
+static force_inline uint16_t
+composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
+{
+    const __m128i ms = unpack_32_1x128 (src);
+    const __m128i alpha = expand_alpha_1x128 (ms);
+    const __m128i dest = expand565_16_1x128 (dst);
+
+    return pack_565_32_16 (pack_1x128_32 (over_1x128 (ms, alpha, dest)));
+}
+
+/*
+ * Fast path that blends a 32-bit source rectangle onto a 16-bit (565)
+ * destination rectangle.  Single pixels go through
+ * composite_over_8888_0565pixel(); the vector loop handles eight
+ * destination pixels (one 128-bit vector) per iteration using
+ * over_2x128().  imp, op, the mask image and the mask coordinates are not
+ * used.
+ */
+static void
+sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
+                               pixman_op_t              op,
+                               pixman_image_t *         src_image,
+                               pixman_image_t *         mask_image,
+                               pixman_image_t *         dst_image,
+                               int32_t                  src_x,
+                               int32_t                  src_y,
+                               int32_t                  mask_x,
+                               int32_t                  mask_y,
+                               int32_t                  dest_x,
+                               int32_t                  dest_y,
+                               int32_t                  width,
+                               int32_t                  height)
+{
+    uint16_t    *dst_line, *dst, d;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	src = src_line;
+
+	dst_line += dst_stride;
+	src_line += src_stride;
+	w = width;
+
+	/* Align dst on a 16-byte boundary */
+	while (w &&
+	       ((unsigned long)dst & 15))
+	{
+	    s = *src++;
+	    d = *dst;
+
+	    *dst++ = composite_over_8888_0565pixel (s, d);
+	    w--;
+	}
+
+	/* Eight pixels per iteration: one aligned destination vector and
+	 * two four-pixel source vectors.
+	 */
+	while (w >= 8)
+	{
+	    /* The source is loaded unaligned because only the destination
+	     * address has been aligned above.
+	     */
+	    xmm_src = load_128_unaligned ((__m128i*) src);
+	    xmm_dst = load_128_aligned ((__m128i*) dst);
+
+	    /* Unpacking */
+	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+	    unpack_565_128_4x128 (xmm_dst,
+				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+				&xmm_alpha_lo, &xmm_alpha_hi);
+
+	    /* Load the next 4 source pixels from memory early, before the
+	     * first blend, to optimize the memory read.
+	     */
+	    xmm_src = load_128_unaligned ((__m128i*) (src + 4));
+
+	    /* Blend source pixels 0-3 onto destination pixels 0-3. */
+	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
+			&xmm_alpha_lo, &xmm_alpha_hi,
+			&xmm_dst0, &xmm_dst1);
+
+	    /* Unpacking */
+	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+				&xmm_alpha_lo, &xmm_alpha_hi);
+
+	    /* Blend source pixels 4-7 onto destination pixels 4-7. */
+	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
+			&xmm_alpha_lo, &xmm_alpha_hi,
+			&xmm_dst2, &xmm_dst3);
+
+	    save_128_aligned (
+		(__m128i*)dst, pack_565_4x128_128 (
+		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+	    w -= 8;
+	    dst += 8;
+	    src += 8;
+	}
+
+	/* Remaining pixels of the row (w is reset at the top of the next
+	 * row, so leaving it at -1 here is harmless).
+	 */
+	while (w--)
+	{
+	    s = *src++;
+	    d = *dst;
+
+	    *dst++ = composite_over_8888_0565pixel (s, d);
+	}
+    }
+
+}
+
+/*
+ * Fast path taking a solid source, an 8-bit mask and a 32-bit
+ * destination.  Destination pixels whose mask byte is non-zero are
+ * replaced by the result of in_over_1x128()/in_over_2x128() applied to
+ * (source, source alpha, expanded mask, destination); pixels whose mask
+ * byte is 0 are left untouched.  Returns immediately when the solid
+ * colour is 0.
+ *
+ * The in_over and expand helpers are defined elsewhere in this file.  op
+ * and the src coordinates are not used.
+ */
+static void
+sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              pixman_image_t *         src_image,
+                              pixman_image_t *         mask_image,
+                              pixman_image_t *         dst_image,
+                              int32_t                  src_x,
+                              int32_t                  src_y,
+                              int32_t                  mask_x,
+                              int32_t                  mask_y,
+                              int32_t                  dest_x,
+                              int32_t                  dest_y,
+                              int32_t                  width,
+                              int32_t                  height)
+{
+    uint32_t src, srca;
+    uint32_t *dst_line, *dst;
+    uint8_t *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t m, d;
+
+    __m128i xmm_src, xmm_alpha, xmm_def;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+    /* Top byte of the solid colour; used for the opaque shortcut below. */
+    srca = src >> 24;
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    /* xmm_def holds four packed copies of the solid pixel, ready to be
+     * stored as is; xmm_src/xmm_alpha are its expanded form and alpha.
+     */
+    xmm_def = create_mask_2x32_128 (src, src);
+    xmm_src = expand_pixel_32_1x128 (src);
+    xmm_alpha = expand_alpha_1x128 (xmm_src);
+    mmx_src   = xmm_src;
+    mmx_alpha = xmm_alpha;
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	/* Single pixels until dst is 16-byte aligned. */
+	while (w && (unsigned long)dst & 15)
+	{
+	    /* NOTE(review): this uint8_t m shadows the function-scope
+	     * uint32_t m used by the four-pixel loop.
+	     */
+	    uint8_t m = *mask++;
+
+	    if (m)
+	    {
+		d = *dst;
+		mmx_mask = expand_pixel_8_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
+
+		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
+		                                   &mmx_alpha,
+		                                   &mmx_mask,
+		                                   &mmx_dest));
+	    }
+
+	    w--;
+	    dst++;
+	}
+
+	/* Four pixels per iteration. */
+	while (w >= 4)
+	{
+	    /* Fetch four mask bytes with a single 32-bit read.
+	     * NOTE(review): mask is a uint8_t pointer that is not
+	     * necessarily 4-byte aligned, and the read goes through a
+	     * casted pointer - confirm this is acceptable on all targets.
+	     */
+	    m = *((uint32_t*)mask);
+
+	    if (srca == 0xff && m == 0xffffffff)
+	    {
+		/* Top byte of the colour is 0xff and all four mask bytes
+		 * are 0xff: store the solid pixels directly.
+		 */
+		save_128_aligned ((__m128i*)dst, xmm_def);
+	    }
+	    else if (m)
+	    {
+		xmm_dst = load_128_aligned ((__m128i*) dst);
+		/* Widen the four mask bytes to 16-bit lanes. */
+		xmm_mask = unpack_32_1x128 (m);
+		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+		/* Unpacking */
+		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+					&xmm_mask_lo, &xmm_mask_hi);
+
+		in_over_2x128 (&xmm_src, &xmm_src,
+			       &xmm_alpha, &xmm_alpha,
+			       &xmm_mask_lo, &xmm_mask_hi,
+			       &xmm_dst_lo, &xmm_dst_hi);
+
+		save_128_aligned (
+		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	    }
+	    /* All four mask bytes zero: destination left untouched. */
+
+	    w -= 4;
+	    dst += 4;
+	    mask += 4;
+	}
+
+	/* Remaining pixels of the row. */
+	while (w)
+	{
+	    uint8_t m = *mask++;
+
+	    if (m)
+	    {
+		d = *dst;
+		mmx_mask = expand_pixel_8_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
+
+		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
+		                                   &mmx_alpha,
+		                                   &mmx_mask,
+		                                   &mmx_dest));
+	    }
+
+	    w--;
+	    dst++;
+	}
+    }
+
+}
+
+/*
+ * Fill a rectangle of an 8, 16 or 32 bpp image with a constant value.
+ *
+ *   bits    start of the image
+ *   stride  row stride in uint32_t units
+ *   bpp     bits per pixel; only 8, 16 and 32 are handled
+ *   x, y    top-left corner of the rectangle, in pixels
+ *   width, height
+ *           size of the rectangle, in pixels
+ *   data    fill value; only the low bpp bits are used
+ *
+ * Returns TRUE on success and FALSE (without writing anything) when bpp is
+ * not one of the handled depths.
+ */
+static pixman_bool_t
+pixman_fill_sse2 (uint32_t *bits,
+                  int       stride,
+                  int       bpp,
+                  int       x,
+                  int       y,
+                  int       width,
+                  int       height,
+                  uint32_t  data)
+{
+    uint32_t byte_width;
+    uint8_t         *byte_line;
+
+    __m128i xmm_def;
+
+    /* For each depth: locate the first byte of the rectangle, compute the
+     * row width in bytes, convert stride to bytes and replicate the pixel
+     * value across all 32 bits of data.
+     */
+    if (bpp == 8)
+    {
+	stride = stride * (int) sizeof (uint32_t) / 1;
+	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
+	byte_width = width;
+	stride *= 1;
+
+	/* Unsigned multiply instead of shifting a promoted uint16_t: a
+	 * left shift by 16 of an int with bit 15 set is undefined.
+	 */
+	data = (data & 0xff) * 0x01010101;
+    }
+    else if (bpp == 16)
+    {
+	stride = stride * (int) sizeof (uint32_t) / 2;
+	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
+	byte_width = 2 * width;
+	stride *= 2;
+
+        data = (data & 0xffff) * 0x00010001;
+    }
+    else if (bpp == 32)
+    {
+	stride = stride * (int) sizeof (uint32_t) / 4;
+	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
+	byte_width = 4 * width;
+	stride *= 4;
+    }
+    else
+    {
+	return FALSE;
+    }
+
+    /* Four copies of the replicated value for the 128-bit stores. */
+    xmm_def = create_mask_2x32_128 (data, data);
+
+    while (height--)
+    {
+	int w;
+	uint8_t *d = byte_line;
+	byte_line += stride;
+	w = byte_width;
+
+	/* Head: byte, halfword and word stores until d is 16-byte
+	 * aligned (or the row is exhausted).
+	 */
+	while (w >= 1 && ((unsigned long)d & 1))
+	{
+	    *(uint8_t *)d = data;
+	    w -= 1;
+	    d += 1;
+	}
+
+	while (w >= 2 && ((unsigned long)d & 3))
+	{
+	    *(uint16_t *)d = data;
+	    w -= 2;
+	    d += 2;
+	}
+
+	while (w >= 4 && ((unsigned long)d & 15))
+	{
+	    *(uint32_t *)d = data;
+
+	    w -= 4;
+	    d += 4;
+	}
+
+	/* Body: aligned 128-bit stores in chunks of 128, 64, 32 and 16
+	 * bytes.
+	 */
+	while (w >= 128)
+	{
+	    save_128_aligned ((__m128i*)(d),     xmm_def);
+	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
+	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
+	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);
+	    save_128_aligned ((__m128i*)(d + 64),  xmm_def);
+	    save_128_aligned ((__m128i*)(d + 80),  xmm_def);
+	    save_128_aligned ((__m128i*)(d + 96),  xmm_def);
+	    save_128_aligned ((__m128i*)(d + 112), xmm_def);
+
+	    d += 128;
+	    w -= 128;
+	}
+
+	if (w >= 64)
+	{
+	    save_128_aligned ((__m128i*)(d),     xmm_def);
+	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
+	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
+	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);
+
+	    d += 64;
+	    w -= 64;
+	}
+
+	if (w >= 32)
+	{
+	    save_128_aligned ((__m128i*)(d),     xmm_def);
+	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
+
+	    d += 32;
+	    w -= 32;
+	}
+
+	if (w >= 16)
+	{
+	    save_128_aligned ((__m128i*)(d),     xmm_def);
+
+	    d += 16;
+	    w -= 16;
+	}
+
+	/* Tail: word, halfword and byte stores for what is left. */
+	while (w >= 4)
+	{
+	    *(uint32_t *)d = data;
+
+	    w -= 4;
+	    d += 4;
+	}
+
+	if (w >= 2)
+	{
+	    *(uint16_t *)d = data;
+	    w -= 2;
+	    d += 2;
+	}
+
+	if (w >= 1)
+	{
+	    *(uint8_t *)d = data;
+	    w -= 1;
+	    d += 1;
+	}
+    }
+
+    return TRUE;
+}
+
+/*
+ * Fast path taking a solid source, an 8-bit mask and a 32-bit
+ * destination, writing every destination pixel of the rectangle:
+ *
+ *   - mask byte 0      -> destination pixel set to 0
+ *   - mask byte non-0  -> pix_multiply (source, expanded mask)
+ *
+ * When the solid colour is 0 the whole rectangle is filled with 0 through
+ * pixman_fill_sse2() instead.  The multiply and expand helpers are
+ * defined elsewhere in this file.  op and the src coordinates are not
+ * used.
+ */
+static void
+sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             pixman_image_t *         src_image,
+                             pixman_image_t *         mask_image,
+                             pixman_image_t *         dst_image,
+                             int32_t                  src_x,
+                             int32_t                  src_y,
+                             int32_t                  mask_x,
+                             int32_t                  mask_y,
+                             int32_t                  dest_x,
+                             int32_t                  dest_y,
+                             int32_t                  width,
+                             int32_t                  height)
+{
+    uint32_t src, srca;
+    uint32_t    *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t m;
+
+    __m128i xmm_src, xmm_def;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+    /* Top byte of the solid colour; used for the opaque shortcut below. */
+    srca = src >> 24;
+    if (src == 0)
+    {
+	/* The result is 0 whatever the mask is, so just clear the
+	 * rectangle.  The return value of pixman_fill_sse2 is ignored.
+	 */
+	pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
+	                  PIXMAN_FORMAT_BPP (dst_image->bits.format),
+	                  dest_x, dest_y, width, height, 0);
+	return;
+    }
+
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    /* xmm_def holds four packed copies of the solid pixel, ready to be
+     * stored as is; xmm_src is its expanded form.
+     */
+    xmm_def = create_mask_2x32_128 (src, src);
+    xmm_src = expand_pixel_32_1x128 (src);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	/* Single pixels until dst is 16-byte aligned. */
+	while (w && (unsigned long)dst & 15)
+	{
+	    /* NOTE(review): this uint8_t m shadows the function-scope
+	     * uint32_t m used by the four-pixel loop.
+	     */
+	    uint8_t m = *mask++;
+
+	    if (m)
+	    {
+		*dst = pack_1x128_32 (
+		    pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
+	    }
+	    else
+	    {
+		*dst = 0;
+	    }
+
+	    w--;
+	    dst++;
+	}
+
+	/* Four pixels per iteration. */
+	while (w >= 4)
+	{
+	    /* Fetch four mask bytes with a single 32-bit read.
+	     * NOTE(review): mask is a uint8_t pointer that is not
+	     * necessarily 4-byte aligned, and the read goes through a
+	     * casted pointer - confirm this is acceptable on all targets.
+	     */
+	    m = *((uint32_t*)mask);
+
+	    if (srca == 0xff && m == 0xffffffff)
+	    {
+		/* Top byte of the colour is 0xff and all four mask bytes
+		 * are 0xff: store the solid pixels directly.
+		 */
+		save_128_aligned ((__m128i*)dst, xmm_def);
+	    }
+	    else if (m)
+	    {
+		/* Widen the four mask bytes to 16-bit lanes. */
+		xmm_mask = unpack_32_1x128 (m);
+		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+		/* Unpacking */
+		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+					&xmm_mask_lo, &xmm_mask_hi);
+
+		/* mask = src * mask, written back over the mask vectors. */
+		pix_multiply_2x128 (&xmm_src, &xmm_src,
+				    &xmm_mask_lo, &xmm_mask_hi,
+				    &xmm_mask_lo, &xmm_mask_hi);
+
+		save_128_aligned (
+		    (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
+	    }
+	    else
+	    {
+		/* All four mask bytes are zero: clear the four pixels. */
+		save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
+	    }
+
+	    w -= 4;
+	    dst += 4;
+	    mask += 4;
+	}
+
+	/* Remaining pixels of the row. */
+	while (w)
+	{
+	    uint8_t m = *mask++;
+
+	    if (m)
+	    {
+		*dst = pack_1x128_32 (
+		    pix_multiply_1x128 (
+			xmm_src, expand_pixel_8_1x128 (m)));
+	    }
+	    else
+	    {
+		*dst = 0;
+	    }
+
+	    w--;
+	    dst++;
+	}
+    }
+
+}
+
+/*
+ * sse2_composite_over_n_8_0565:
+ *
+ * Combines a solid source with a uint8_t mask onto uint16_t destination
+ * pixels using in_over_1x128 / in_over_2x128.  Destination pixels are
+ * expanded from and packed back to their 16-bit form around each blend.
+ *
+ * Returns immediately when the solid value obtained from
+ * _pixman_image_get_solid() is 0.  In the single-pixel loops a pixel whose
+ * mask byte is 0 is left untouched; in the 8-pixel loop a group of four
+ * pixels whose mask bytes are all 0 skips the blend but is still unpacked
+ * and stored back.
+ *
+ * op, src_x and src_y are not referenced by this function.
+ */
+static void
+sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              pixman_image_t *         src_image,
+                              pixman_image_t *         mask_image,
+                              pixman_image_t *         dst_image,
+                              int32_t                  src_x,
+                              int32_t                  src_y,
+                              int32_t                  mask_x,
+                              int32_t                  mask_y,
+                              int32_t                  dest_x,
+                              int32_t                  dest_y,
+                              int32_t                  width,
+                              int32_t                  height)
+{
+    uint32_t src;
+    uint16_t    *dst_line, *dst, d;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t m;
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+    __m128i xmm_src, xmm_alpha;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    xmm_src = expand_pixel_32_1x128 (src);
+    xmm_alpha = expand_alpha_1x128 (xmm_src);
+    mmx_src = xmm_src;
+    mmx_alpha = xmm_alpha;
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	/* Head: one pixel at a time until dst is 16-byte aligned. */
+	while (w && (unsigned long)dst & 15)
+	{
+	    m = *mask++;
+
+	    if (m)
+	    {
+		d = *dst;
+		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+		mmx_dest = expand565_16_1x128 (d);
+
+		*dst = pack_565_32_16 (
+		    pack_1x128_32 (
+			in_over_1x128 (
+			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+	    }
+
+	    w--;
+	    dst++;
+	}
+
+	/* Body: eight destination pixels per iteration, handled as two
+	 * groups of four mask bytes. */
+	while (w >= 8)
+	{
+	    xmm_dst = load_128_aligned ((__m128i*) dst);
+	    unpack_565_128_4x128 (xmm_dst,
+				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+
+	    m = *((uint32_t*)mask);
+	    mask += 4;
+
+	    if (m)
+	    {
+		xmm_mask = unpack_32_1x128 (m);
+		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+		/* Unpacking */
+		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+					&xmm_mask_lo, &xmm_mask_hi);
+
+		in_over_2x128 (&xmm_src, &xmm_src,
+			       &xmm_alpha, &xmm_alpha,
+			       &xmm_mask_lo, &xmm_mask_hi,
+			       &xmm_dst0, &xmm_dst1);
+	    }
+
+	    m = *((uint32_t*)mask);
+	    mask += 4;
+
+	    if (m)
+	    {
+		xmm_mask = unpack_32_1x128 (m);
+		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+		/* Unpacking */
+		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+					&xmm_mask_lo, &xmm_mask_hi);
+		in_over_2x128 (&xmm_src, &xmm_src,
+			       &xmm_alpha, &xmm_alpha,
+			       &xmm_mask_lo, &xmm_mask_hi,
+			       &xmm_dst2, &xmm_dst3);
+	    }
+
+	    save_128_aligned (
+		(__m128i*)dst, pack_565_4x128_128 (
+		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+	    w -= 8;
+	    dst += 8;
+	}
+
+	/* Tail: the remaining (fewer than eight) pixels. */
+	while (w)
+	{
+	    m = *mask++;
+
+	    if (m)
+	    {
+		d = *dst;
+		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+		mmx_dest = expand565_16_1x128 (d);
+
+		*dst = pack_565_32_16 (
+		    pack_1x128_32 (
+			in_over_1x128 (
+			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+	    }
+
+	    w--;
+	    dst++;
+	}
+    }
+
+}
+/*
+ * sse2_composite_over_pixbuf_0565:
+ *
+ * Combines uint32_t source pixels onto uint16_t destination pixels with
+ * over_rev_non_pre_1x128 / over_rev_non_pre_2x128.  Destination pixels are
+ * expanded from and packed back to their 16-bit form around each blend.
+ *
+ * In the 8-pixel loop each group of four source pixels is classified with
+ * is_opaque() and is_zero(): an opaque group replaces the destination with
+ * the result of invert_colors_2x128, a zero group leaves the unpacked
+ * destination values as they are, anything else is blended.
+ * NOTE(review): judging by the helper names the source is presumably
+ * non-premultiplied with reversed color channels -- confirm against the
+ * helper definitions.
+ *
+ * imp, op, mask_image, mask_x and mask_y are not referenced.
+ */
+static void
+sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
+                                 pixman_op_t              op,
+                                 pixman_image_t *         src_image,
+                                 pixman_image_t *         mask_image,
+                                 pixman_image_t *         dst_image,
+                                 int32_t                  src_x,
+                                 int32_t                  src_y,
+                                 int32_t                  mask_x,
+                                 int32_t                  mask_y,
+                                 int32_t                  dest_x,
+                                 int32_t                  dest_y,
+                                 int32_t                  width,
+                                 int32_t                  height)
+{
+    uint16_t    *dst_line, *dst, d;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint32_t opaque, zero;
+
+    __m128i ms;
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	/* Head: one pixel at a time until dst is 16-byte aligned. */
+	while (w && (unsigned long)dst & 15)
+	{
+	    s = *src++;
+	    d = *dst;
+
+	    ms = unpack_32_1x128 (s);
+
+	    *dst++ = pack_565_32_16 (
+		pack_1x128_32 (
+		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
+	    w--;
+	}
+
+	/* Body: eight pixels per iteration -- one aligned 16-byte
+	 * destination load/store and two unaligned 16-byte source loads. */
+	while (w >= 8)
+	{
+	    /* First round */
+	    xmm_src = load_128_unaligned ((__m128i*)src);
+	    xmm_dst = load_128_aligned  ((__m128i*)dst);
+
+	    opaque = is_opaque (xmm_src);
+	    zero = is_zero (xmm_src);
+
+	    unpack_565_128_4x128 (xmm_dst,
+				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+
+	    /* preload next round*/
+	    xmm_src = load_128_unaligned ((__m128i*)(src + 4));
+
+	    if (opaque)
+	    {
+		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+				     &xmm_dst0, &xmm_dst1);
+	    }
+	    else if (!zero)
+	    {
+		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+					&xmm_dst0, &xmm_dst1);
+	    }
+
+	    /* Second round */
+	    opaque = is_opaque (xmm_src);
+	    zero = is_zero (xmm_src);
+
+	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+
+	    if (opaque)
+	    {
+		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+				     &xmm_dst2, &xmm_dst3);
+	    }
+	    else if (!zero)
+	    {
+		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+					&xmm_dst2, &xmm_dst3);
+	    }
+
+	    save_128_aligned (
+		(__m128i*)dst, pack_565_4x128_128 (
+		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+	    w -= 8;
+	    src += 8;
+	    dst += 8;
+	}
+
+	/* Tail: the remaining (fewer than eight) pixels. */
+	while (w)
+	{
+	    s = *src++;
+	    d = *dst;
+
+	    ms = unpack_32_1x128 (s);
+
+	    *dst++ = pack_565_32_16 (
+		pack_1x128_32 (
+		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
+	    w--;
+	}
+    }
+
+}
+/*
+ * sse2_composite_over_pixbuf_8888:
+ *
+ * Combines uint32_t source pixels onto uint32_t destination pixels with
+ * over_rev_non_pre_1x128 / over_rev_non_pre_2x128.
+ *
+ * In the 4-pixel loop the source group is classified with is_opaque() and
+ * is_zero(): an opaque group is written as the result of
+ * invert_colors_2x128 without loading the destination, a zero group
+ * writes nothing, anything else loads the destination and blends.
+ *
+ * imp, op, mask_image, mask_x and mask_y are not referenced.
+ */
+static void
+sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
+                                 pixman_op_t              op,
+                                 pixman_image_t *         src_image,
+                                 pixman_image_t *         mask_image,
+                                 pixman_image_t *         dst_image,
+                                 int32_t                  src_x,
+                                 int32_t                  src_y,
+                                 int32_t                  mask_x,
+                                 int32_t                  mask_y,
+                                 int32_t                  dest_x,
+                                 int32_t                  dest_y,
+                                 int32_t                  width,
+                                 int32_t                  height)
+{
+    uint32_t    *dst_line, *dst, d;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint32_t opaque, zero;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	/* Head: one pixel at a time until dst is 16-byte aligned. */
+	while (w && (unsigned long)dst & 15)
+	{
+	    s = *src++;
+	    d = *dst;
+
+	    *dst++ = pack_1x128_32 (
+		over_rev_non_pre_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
+
+	    w--;
+	}
+
+	/* Body: four pixels per iteration; the source load is unaligned,
+	 * destination accesses are aligned. */
+	while (w >= 4)
+	{
+	    xmm_src_hi = load_128_unaligned ((__m128i*)src);
+
+	    opaque = is_opaque (xmm_src_hi);
+	    zero = is_zero (xmm_src_hi);
+
+	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+
+	    if (opaque)
+	    {
+		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+				     &xmm_dst_lo, &xmm_dst_hi);
+
+		save_128_aligned (
+		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	    }
+	    else if (!zero)
+	    {
+		xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
+
+		unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+					&xmm_dst_lo, &xmm_dst_hi);
+
+		save_128_aligned (
+		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	    }
+
+	    w -= 4;
+	    dst += 4;
+	    src += 4;
+	}
+
+	/* Tail: the remaining (fewer than four) pixels. */
+	while (w)
+	{
+	    s = *src++;
+	    d = *dst;
+
+	    *dst++ = pack_1x128_32 (
+		over_rev_non_pre_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
+
+	    w--;
+	}
+    }
+
+}
+/*
+ * sse2_composite_over_n_8888_0565_ca:
+ *
+ * Combines a solid source with a uint32_t-per-pixel mask onto uint16_t
+ * destination pixels using in_over_1x128 / in_over_2x128.  The mask pixel
+ * is handed to in_over unexpanded (unpack only), i.e. with its four
+ * channels kept separate.
+ *
+ * Returns immediately when the solid value obtained from
+ * _pixman_image_get_solid() is 0.  Pixels (or groups of four pixels in
+ * the 8-pixel loop) whose mask is entirely 0 skip the blend.
+ *
+ * op, src_x and src_y are not referenced by this function.
+ */
+static void
+sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
+                                    pixman_op_t              op,
+                                    pixman_image_t *         src_image,
+                                    pixman_image_t *         mask_image,
+                                    pixman_image_t *         dst_image,
+                                    int32_t                  src_x,
+                                    int32_t                  src_y,
+                                    int32_t                  mask_x,
+                                    int32_t                  mask_y,
+                                    int32_t                  dest_x,
+                                    int32_t                  dest_y,
+                                    int32_t                  width,
+                                    int32_t                  height)
+{
+    uint32_t src;
+    uint16_t    *dst_line, *dst, d;
+    uint32_t    *mask_line, *mask, m;
+    int dst_stride, mask_stride;
+    int w;
+    uint32_t pack_cmp;
+
+    __m128i xmm_src, xmm_alpha;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    xmm_src = expand_pixel_32_1x128 (src);
+    xmm_alpha = expand_alpha_1x128 (xmm_src);
+    mmx_src = xmm_src;
+    mmx_alpha = xmm_alpha;
+
+    while (height--)
+    {
+	w = width;
+	mask = mask_line;
+	dst = dst_line;
+	mask_line += mask_stride;
+	dst_line += dst_stride;
+
+	/* Head: one pixel at a time until dst is 16-byte aligned. */
+	while (w && ((unsigned long)dst & 15))
+	{
+	    m = *(uint32_t *) mask;
+
+	    if (m)
+	    {
+		d = *dst;
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = expand565_16_1x128 (d);
+
+		*dst = pack_565_32_16 (
+		    pack_1x128_32 (
+			in_over_1x128 (
+			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+	    }
+
+	    w--;
+	    dst++;
+	    mask++;
+	}
+
+	/* Body: eight pixels per iteration -- one aligned 16-byte
+	 * destination load/store and two unaligned 16-byte mask loads. */
+	while (w >= 8)
+	{
+	    /* First round */
+	    xmm_mask = load_128_unaligned ((__m128i*)mask);
+	    xmm_dst = load_128_aligned ((__m128i*)dst);
+
+	    /* pack_cmp becomes 0xffff exactly when all four 32-bit mask
+	     * pixels compare equal to zero. */
+	    pack_cmp = _mm_movemask_epi8 (
+		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+	    unpack_565_128_4x128 (xmm_dst,
+				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+	    /* preload next round */
+	    xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
+
+	    /* blend unless all four mask pixels of this round are zero */
+	    if (pack_cmp != 0xffff)
+	    {
+		in_over_2x128 (&xmm_src, &xmm_src,
+			       &xmm_alpha, &xmm_alpha,
+			       &xmm_mask_lo, &xmm_mask_hi,
+			       &xmm_dst0, &xmm_dst1);
+	    }
+
+	    /* Second round */
+	    pack_cmp = _mm_movemask_epi8 (
+		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+	    if (pack_cmp != 0xffff)
+	    {
+		in_over_2x128 (&xmm_src, &xmm_src,
+			       &xmm_alpha, &xmm_alpha,
+			       &xmm_mask_lo, &xmm_mask_hi,
+			       &xmm_dst2, &xmm_dst3);
+	    }
+
+	    save_128_aligned (
+		(__m128i*)dst, pack_565_4x128_128 (
+		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+	    w -= 8;
+	    dst += 8;
+	    mask += 8;
+	}
+
+	/* Tail: the remaining (fewer than eight) pixels. */
+	while (w)
+	{
+	    m = *(uint32_t *) mask;
+
+	    if (m)
+	    {
+		d = *dst;
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = expand565_16_1x128 (d);
+
+		*dst = pack_565_32_16 (
+		    pack_1x128_32 (
+			in_over_1x128 (
+			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+	    }
+
+	    w--;
+	    dst++;
+	    mask++;
+	}
+    }
+
+}
+
+/*
+ * sse2_composite_in_n_8_8:
+ *
+ * For every uint8_t destination pixel computes
+ *     dst = pix_multiply (pix_multiply (xmm_alpha, mask), dst)
+ * where xmm_alpha is derived from the solid source through
+ * expand_alpha_1x128 (expand_pixel_32_1x128 (src)) and mask is the
+ * corresponding uint8_t mask pixel.
+ *
+ * op, src_x and src_y are not referenced by this function.
+ */
+static void
+sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         pixman_image_t *         src_image,
+                         pixman_image_t *         mask_image,
+                         pixman_image_t *         dst_image,
+                         int32_t                  src_x,
+                         int32_t                  src_y,
+                         int32_t                  mask_x,
+                         int32_t                  mask_y,
+                         int32_t                  dest_x,
+                         int32_t                  dest_y,
+                         int32_t                  width,
+                         int32_t                  height)
+{
+    uint8_t     *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    uint32_t d, m;
+    uint32_t src;
+    int32_t w;
+
+    __m128i xmm_alpha;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	/* Head: one pixel at a time until dst is 16-byte aligned. */
+	while (w && ((unsigned long)dst & 15))
+	{
+	    m = (uint32_t) *mask++;
+	    d = (uint32_t) *dst;
+
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (
+		    pix_multiply_1x128 (xmm_alpha,
+				       unpack_32_1x128 (m)),
+		    unpack_32_1x128 (d)));
+	    w--;
+	}
+
+	/* Body: sixteen pixels per iteration; the mask load is unaligned,
+	 * destination accesses are aligned. */
+	while (w >= 16)
+	{
+	    xmm_mask = load_128_unaligned ((__m128i*)mask);
+	    xmm_dst = load_128_aligned ((__m128i*)dst);
+
+	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
+				&xmm_mask_lo, &xmm_mask_hi,
+				&xmm_mask_lo, &xmm_mask_hi);
+
+	    pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+				&xmm_dst_lo, &xmm_dst_hi,
+				&xmm_dst_lo, &xmm_dst_hi);
+
+	    save_128_aligned (
+		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	    mask += 16;
+	    dst += 16;
+	    w -= 16;
+	}
+
+	/* Tail: the remaining (fewer than sixteen) pixels. */
+	while (w)
+	{
+	    m = (uint32_t) *mask++;
+	    d = (uint32_t) *dst;
+
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (
+		    pix_multiply_1x128 (
+			xmm_alpha, unpack_32_1x128 (m)),
+		    unpack_32_1x128 (d)));
+	    w--;
+	}
+    }
+
+}
+/*
+ * sse2_composite_in_n_8:
+ *
+ * Multiplies every uint8_t destination pixel by xmm_alpha, which is
+ * derived from the solid source through
+ * expand_alpha_1x128 (expand_pixel_32_1x128 (src)).
+ *
+ * After xmm_alpha is built, src is reduced to its top eight bits:
+ *   - 0xff: returns without touching the destination;
+ *   - 0x00: fills the area with that value via pixman_fill() (8 bpp) and
+ *     returns; the result of pixman_fill() is not checked.
+ *
+ * op, mask_image, src_x, src_y, mask_x and mask_y are not referenced.
+ */
+static void
+sse2_composite_in_n_8 (pixman_implementation_t *imp,
+		       pixman_op_t              op,
+		       pixman_image_t *         src_image,
+		       pixman_image_t *         mask_image,
+		       pixman_image_t *         dst_image,
+		       int32_t                  src_x,
+		       int32_t                  src_y,
+		       int32_t                  mask_x,
+		       int32_t                  mask_y,
+		       int32_t                  dest_x,
+		       int32_t                  dest_y,
+		       int32_t                  width,
+		       int32_t                  height)
+{
+    uint8_t     *dst_line, *dst;
+    int dst_stride;
+    uint32_t d;
+    uint32_t src;
+    int32_t w;
+
+    __m128i xmm_alpha;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
+
+    /* From here on src holds only the top eight bits of the solid value. */
+    src = src >> 24;
+
+    if (src == 0xff)
+	return;
+
+    if (src == 0x00)
+    {
+	pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
+		     8, dest_x, dest_y, width, height, src);
+
+	return;
+    }
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	w = width;
+
+	/* Head: one pixel at a time until dst is 16-byte aligned. */
+	while (w && ((unsigned long)dst & 15))
+	{
+	    d = (uint32_t) *dst;
+
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (
+		    xmm_alpha,
+		    unpack_32_1x128 (d)));
+	    w--;
+	}
+
+	/* Body: sixteen pixels per aligned load/store. */
+	while (w >= 16)
+	{
+	    xmm_dst = load_128_aligned ((__m128i*)dst);
+
+	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+	    
+	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
+				&xmm_dst_lo, &xmm_dst_hi,
+				&xmm_dst_lo, &xmm_dst_hi);
+
+	    save_128_aligned (
+		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	    dst += 16;
+	    w -= 16;
+	}
+
+	/* Tail: the remaining (fewer than sixteen) pixels. */
+	while (w)
+	{
+	    d = (uint32_t) *dst;
+
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (
+		    xmm_alpha,
+		    unpack_32_1x128 (d)));
+	    w--;
+	}
+    }
+
+}
+/*
+ * sse2_composite_in_8_8:
+ *
+ * For every pixel of the area replaces the uint8_t destination value with
+ * pix_multiply (src, dst), where src is the corresponding uint8_t source
+ * pixel.
+ *
+ * imp, op, mask_image, mask_x and mask_y are not referenced.
+ */
+static void
+sse2_composite_in_8_8 (pixman_implementation_t *imp,
+                       pixman_op_t              op,
+                       pixman_image_t *         src_image,
+                       pixman_image_t *         mask_image,
+                       pixman_image_t *         dst_image,
+                       int32_t                  src_x,
+                       int32_t                  src_y,
+                       int32_t                  mask_x,
+                       int32_t                  mask_y,
+                       int32_t                  dest_x,
+                       int32_t                  dest_y,
+                       int32_t                  width,
+                       int32_t                  height)
+{
+    uint8_t     *dst_line, *dst;
+    uint8_t     *src_line, *src;
+    int src_stride, dst_stride;
+    int32_t w;
+    uint32_t s, d;
+
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	/* Head: one pixel at a time until dst is 16-byte aligned. */
+	while (w && ((unsigned long)dst & 15))
+	{
+	    s = (uint32_t) *src++;
+	    d = (uint32_t) *dst;
+
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
+	    w--;
+	}
+
+	/* Body: sixteen pixels per iteration; the source load is
+	 * unaligned, destination accesses are aligned. */
+	while (w >= 16)
+	{
+	    xmm_src = load_128_unaligned ((__m128i*)src);
+	    xmm_dst = load_128_aligned ((__m128i*)dst);
+
+	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+	    pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+				&xmm_dst_lo, &xmm_dst_hi,
+				&xmm_dst_lo, &xmm_dst_hi);
+
+	    save_128_aligned (
+		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	    src += 16;
+	    dst += 16;
+	    w -= 16;
+	}
+
+	/* Tail: the remaining (fewer than sixteen) pixels. */
+	while (w)
+	{
+	    s = (uint32_t) *src++;
+	    d = (uint32_t) *dst;
+
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
+	    w--;
+	}
+    }
+
+}
+
+/*
+ * sse2_composite_add_n_8_8:
+ *
+ * For every uint8_t destination pixel computes
+ *     dst = _mm_adds_epu16 (pix_multiply (xmm_alpha, mask), dst)
+ * where xmm_alpha is derived from the solid source through
+ * expand_alpha_1x128 (expand_pixel_32_1x128 (src)) and mask is the
+ * corresponding uint8_t mask pixel.  The sum is then packed back to
+ * bytes by pack_1x128_32 / pack_2x128_128.
+ *
+ * op, src_x and src_y are not referenced by this function.
+ */
+static void
+sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
+			  pixman_op_t              op,
+			  pixman_image_t *         src_image,
+			  pixman_image_t *         mask_image,
+			  pixman_image_t *         dst_image,
+			  int32_t                  src_x,
+			  int32_t                  src_y,
+			  int32_t                  mask_x,
+			  int32_t                  mask_y,
+			  int32_t                  dest_x,
+			  int32_t                  dest_y,
+			  int32_t                  width,
+			  int32_t                  height)
+{
+    uint8_t     *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t src;
+    uint32_t m, d;
+
+    __m128i xmm_alpha;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	/* Head: one pixel at a time until dst is 16-byte aligned. */
+	while (w && ((unsigned long)dst & 15))
+	{
+	    m = (uint32_t) *mask++;
+	    d = (uint32_t) *dst;
+
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		_mm_adds_epu16 (
+		    pix_multiply_1x128 (
+			xmm_alpha, unpack_32_1x128 (m)),
+		    unpack_32_1x128 (d)));
+	    w--;
+	}
+
+	/* Body: sixteen pixels per iteration; the mask load is unaligned,
+	 * destination accesses are aligned. */
+	while (w >= 16)
+	{
+	    xmm_mask = load_128_unaligned ((__m128i*)mask);
+	    xmm_dst = load_128_aligned ((__m128i*)dst);
+
+	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
+				&xmm_mask_lo, &xmm_mask_hi,
+				&xmm_mask_lo, &xmm_mask_hi);
+
+	    xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
+	    xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
+
+	    save_128_aligned (
+		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	    mask += 16;
+	    dst += 16;
+	    w -= 16;
+	}
+
+	/* Tail: the remaining (fewer than sixteen) pixels. */
+	while (w)
+	{
+	    m = (uint32_t) *mask++;
+	    d = (uint32_t) *dst;
+
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		_mm_adds_epu16 (
+		    pix_multiply_1x128 (
+			xmm_alpha, unpack_32_1x128 (m)),
+		    unpack_32_1x128 (d)));
+
+	    w--;
+	}
+    }
+
+}
+/*
+ * sse2_composite_add_n_8:
+ *
+ * Adds the top eight bits of the solid source to every uint8_t
+ * destination pixel using the saturating unsigned byte add
+ * _mm_adds_epu8.
+ *
+ * Special cases on those top eight bits:
+ *   - 0x00: returns without touching the destination;
+ *   - 0xff: fills the area with 0xff via pixman_fill() (8 bpp) and
+ *     returns; the result of pixman_fill() is not checked.
+ *
+ * op, mask_image, src_x, src_y, mask_x and mask_y are not referenced.
+ */
+static void
+sse2_composite_add_n_8 (pixman_implementation_t *imp,
+			pixman_op_t              op,
+			pixman_image_t *         src_image,
+			pixman_image_t *         mask_image,
+			pixman_image_t *         dst_image,
+			int32_t                  src_x,
+			int32_t                  src_y,
+			int32_t                  mask_x,
+			int32_t                  mask_y,
+			int32_t                  dest_x,
+			int32_t                  dest_y,
+			int32_t                  width,
+			int32_t                  height)
+{
+    uint8_t     *dst_line, *dst;
+    int dst_stride;
+    int32_t w;
+    uint32_t src;
+
+    __m128i xmm_src;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+    /* Keep only the top eight bits of the solid value. */
+    src >>= 24;
+
+    if (src == 0x00)
+	return;
+
+    if (src == 0xff)
+    {
+	pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
+		     8, dest_x, dest_y, width, height, 0xff);
+
+	return;
+    }
+
+    /* Replicate that byte into all four bytes of src, then into all four
+     * 32-bit lanes of xmm_src. */
+    src = (src << 24) | (src << 16) | (src << 8) | src;
+    xmm_src = _mm_set_epi32 (src, src, src, src);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	w = width;
+
+	/* Head: one byte at a time until dst is 16-byte aligned; only the
+	 * low byte of the vector sum is kept. */
+	while (w && ((unsigned long)dst & 15))
+	{
+	    *dst = (uint8_t)_mm_cvtsi128_si32 (
+		_mm_adds_epu8 (
+		    xmm_src,
+		    _mm_cvtsi32_si128 (*dst)));
+
+	    w--;
+	    dst++;
+	}
+
+	/* Body: sixteen bytes per aligned load/store. */
+	while (w >= 16)
+	{
+	    save_128_aligned (
+		(__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
+
+	    dst += 16;
+	    w -= 16;
+	}
+
+	/* Tail: the remaining (fewer than sixteen) bytes. */
+	while (w)
+	{
+	    *dst = (uint8_t)_mm_cvtsi128_si32 (
+		_mm_adds_epu8 (
+		    xmm_src,
+		    _mm_cvtsi32_si128 (*dst)));
+
+	    w--;
+	    dst++;
+	}
+    }
+
+}
+
+/*
+ * sse2_composite_add_8_8:
+ *
+ * Saturating add of uint8_t source pixels onto uint8_t destination
+ * pixels.  Each row is split into
+ *   - a head, processed byte by byte until dst is 4-byte aligned,
+ *   - a middle of (w >> 2) 32-bit words handed to sse2_combine_add_u(),
+ *   - a tail of the last (w & 3) bytes.
+ *
+ * mask_image, src/mask offsets other than src_x/src_y, and mask_x/mask_y
+ * are not referenced by this function.
+ */
+static void
+sse2_composite_add_8_8 (pixman_implementation_t *imp,
+			pixman_op_t              op,
+			pixman_image_t *         src_image,
+			pixman_image_t *         mask_image,
+			pixman_image_t *         dst_image,
+			int32_t                  src_x,
+			int32_t                  src_y,
+			int32_t                  mask_x,
+			int32_t                  mask_y,
+			int32_t                  dest_x,
+			int32_t                  dest_y,
+			int32_t                  width,
+			int32_t                  height)
+{
+    uint8_t     *dst_line, *dst;
+    uint8_t     *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint16_t t;
+
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	src = src_line;
+
+	dst_line += dst_stride;
+	src_line += src_stride;
+	w = width;
+
+	/* Small head */
+	while (w && (unsigned long)dst & 3)
+	{
+	    /* t is at most 510; when it exceeds 255, (t >> 8) is 1 and
+	     * 0 - 1 sets every bit, so the stored byte becomes 0xff. */
+	    t = (*dst) + (*src++);
+	    *dst++ = t | (0 - (t >> 8));
+	    w--;
+	}
+
+	sse2_combine_add_u (imp, op,
+			    (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
+
+	/* Small tail: skip exactly the (w >> 2) words handled above.
+	 * Masking with ~3 (rather than 0xfffc) keeps the high bits of w,
+	 * so the advance stays correct for rows wider than 0xffff. */
+	dst += w & ~3;
+	src += w & ~3;
+
+	w &= 3;
+
+	while (w)
+	{
+	    t = (*dst) + (*src++);
+	    *dst++ = t | (0 - (t >> 8));
+	    w--;
+	}
+    }
+
+}
+
+/*
+ * sse2_composite_add_8888_8888:
+ *
+ * Row driver: hands each of the `height` rows of uint32_t pixels to
+ * sse2_combine_add_u() with `width` as the pixel count and no mask.
+ *
+ * mask_image, mask_x and mask_y are not referenced.
+ */
+static void
+sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              pixman_image_t *         src_image,
+                              pixman_image_t *         mask_image,
+                              pixman_image_t *         dst_image,
+                              int32_t                  src_x,
+                              int32_t                  src_y,
+                              int32_t                  mask_x,
+                              int32_t                  mask_y,
+                              int32_t                  dest_x,
+                              int32_t                  dest_y,
+                              int32_t                  width,
+                              int32_t                  height)
+{
+    uint32_t *src_row, *dst_row;
+    int src_pitch, dst_pitch;
+
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_pitch, src_row, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint32_t, dst_pitch, dst_row, 1);
+
+    /* Both row pointers step by their own pitch after every row. */
+    for (; height--; dst_row += dst_pitch, src_row += src_pitch)
+	sse2_combine_add_u (imp, op, dst_row, src_row, NULL, width);
+}
+/*
+ * pixman_blt_sse2:
+ *
+ * Copies a width x height rectangle of pixels from src_bits to dst_bits.
+ *
+ * Returns FALSE without copying when src_bpp differs from dst_bpp or the
+ * depth is neither 16 nor 32; returns TRUE otherwise.
+ *
+ * src_stride / dst_stride come in as counts of uint32_t and are converted
+ * to byte strides below.  Every row is copied with progressively wider
+ * accesses: 16-bit units until the destination is 4-byte aligned, 32-bit
+ * units until it is 16-byte aligned, then 64- and 16-byte SSE2 chunks
+ * (unaligned loads, aligned stores), then the 32-bit and 16-bit leftovers.
+ */
+static pixman_bool_t
+pixman_blt_sse2 (uint32_t *src_bits,
+                 uint32_t *dst_bits,
+                 int       src_stride,
+                 int       dst_stride,
+                 int       src_bpp,
+                 int       dst_bpp,
+                 int       src_x,
+                 int       src_y,
+                 int       dst_x,
+                 int       dst_y,
+                 int       width,
+                 int       height)
+{
+    uint8_t *   src_bytes;
+    uint8_t *   dst_bytes;
+    int byte_width;
+
+    if (src_bpp != dst_bpp)
+	return FALSE;
+
+    /* Strides are first rescaled to pixel units to locate (x, y), then to
+     * bytes for the per-row pointer advance. */
+    if (src_bpp == 16)
+    {
+	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
+	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
+	src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
+	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
+	byte_width = 2 * width;
+	src_stride *= 2;
+	dst_stride *= 2;
+    }
+    else if (src_bpp == 32)
+    {
+	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
+	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
+	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
+	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
+	byte_width = 4 * width;
+	src_stride *= 4;
+	dst_stride *= 4;
+    }
+    else
+    {
+	return FALSE;
+    }
+
+    while (height--)
+    {
+	int w;
+	uint8_t *s = src_bytes;
+	uint8_t *d = dst_bytes;
+	src_bytes += src_stride;
+	dst_bytes += dst_stride;
+	w = byte_width;
+
+	/* 16-bit copies until d is 4-byte aligned. */
+	while (w >= 2 && ((unsigned long)d & 3))
+	{
+	    *(uint16_t *)d = *(uint16_t *)s;
+	    w -= 2;
+	    s += 2;
+	    d += 2;
+	}
+
+	/* 32-bit copies until d is 16-byte aligned. */
+	while (w >= 4 && ((unsigned long)d & 15))
+	{
+	    *(uint32_t *)d = *(uint32_t *)s;
+
+	    w -= 4;
+	    s += 4;
+	    d += 4;
+	}
+
+	/* 64 bytes per iteration: four unaligned loads, four aligned
+	 * stores. */
+	while (w >= 64)
+	{
+	    __m128i xmm0, xmm1, xmm2, xmm3;
+
+	    xmm0 = load_128_unaligned ((__m128i*)(s));
+	    xmm1 = load_128_unaligned ((__m128i*)(s + 16));
+	    xmm2 = load_128_unaligned ((__m128i*)(s + 32));
+	    xmm3 = load_128_unaligned ((__m128i*)(s + 48));
+
+	    save_128_aligned ((__m128i*)(d),    xmm0);
+	    save_128_aligned ((__m128i*)(d + 16), xmm1);
+	    save_128_aligned ((__m128i*)(d + 32), xmm2);
+	    save_128_aligned ((__m128i*)(d + 48), xmm3);
+
+	    s += 64;
+	    d += 64;
+	    w -= 64;
+	}
+
+	/* Remaining whole 16-byte chunks. */
+	while (w >= 16)
+	{
+	    save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
+
+	    w -= 16;
+	    d += 16;
+	    s += 16;
+	}
+
+	/* Leftover 32-bit units, then at most one 16-bit unit. */
+	while (w >= 4)
+	{
+	    *(uint32_t *)d = *(uint32_t *)s;
+
+	    w -= 4;
+	    s += 4;
+	    d += 4;
+	}
+
+	if (w >= 2)
+	{
+	    *(uint16_t *)d = *(uint16_t *)s;
+	    w -= 2;
+	    s += 2;
+	    d += 2;
+	}
+    }
+
+
+    return TRUE;
+}
+
+/*
+ * Composite fast path that copies a rectangle from src_image to dst_image.
+ *
+ * The raw pixel buffers, row strides and the bits-per-pixel of both image
+ * formats are handed to pixman_blt_sse2 () together with the source and
+ * destination coordinates and the rectangle size.  imp, op, mask_image,
+ * mask_x and mask_y are not used, and the value returned by
+ * pixman_blt_sse2 () is not checked.
+ */
+static void
+sse2_composite_copy_area (pixman_implementation_t *imp,
+                          pixman_op_t              op,
+                          pixman_image_t *         src_image,
+                          pixman_image_t *         mask_image,
+                          pixman_image_t *         dst_image,
+                          int32_t                  src_x,
+                          int32_t                  src_y,
+                          int32_t                  mask_x,
+                          int32_t                  mask_y,
+                          int32_t                  dest_x,
+                          int32_t                  dest_y,
+                          int32_t                  width,
+                          int32_t                  height)
+{
+    uint32_t *from_bits = src_image->bits.bits;
+    uint32_t *to_bits = dst_image->bits.bits;
+    int from_stride = src_image->bits.rowstride;
+    int to_stride = dst_image->bits.rowstride;
+    int from_bpp = PIXMAN_FORMAT_BPP (src_image->bits.format);
+    int to_bpp = PIXMAN_FORMAT_BPP (dst_image->bits.format);
+
+    pixman_blt_sse2 (from_bits, to_bits,
+                     from_stride, to_stride,
+                     from_bpp, to_bpp,
+                     src_x, src_y,
+                     dest_x, dest_y,
+                     width, height);
+}
+
+/*
+ * Composite a 32-bit source whose alpha byte is ignored (every source pixel
+ * is OR-ed with 0xff000000) through an 8-bit mask onto a 32-bit destination.
+ * The fast-path table in this file registers it for PIXMAN_OP_OVER with
+ * x8r8g8b8 / x8b8g8r8 sources and an a8 mask.
+ *
+ * imp and op are not referenced in the body.  Every scanline is handled in
+ * three phases: single pixels until dst is 16-byte aligned, groups of four
+ * pixels using aligned 128-bit destination accesses, then the leftover
+ * pixels one at a time.
+ *
+ * The blending itself is done by in_over_1x128 () / in_over_2x128 (), which
+ * are defined elsewhere; mask_00ff is passed where the other fast paths pass
+ * the source alpha (presumably a constant meaning "fully opaque" -- confirm
+ * against its definition).
+ */
+static void
+sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
+                                 pixman_op_t              op,
+                                 pixman_image_t *         src_image,
+                                 pixman_image_t *         mask_image,
+                                 pixman_image_t *         dst_image,
+                                 int32_t                  src_x,
+                                 int32_t                  src_y,
+                                 int32_t                  mask_x,
+                                 int32_t                  mask_y,
+                                 int32_t                  dest_x,
+                                 int32_t                  dest_y,
+                                 int32_t                  width,
+                                 int32_t                  height)
+{
+    uint32_t    *src, *src_line, s;
+    uint32_t    *dst, *dst_line, d;
+    uint8_t         *mask, *mask_line;
+    uint32_t m;
+    int src_stride, mask_stride, dst_stride;
+    int32_t w;
+    __m128i ms;
+
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    /* Fetch the first-line pointer and the stride of each image. */
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+        src = src_line;
+        src_line += src_stride;
+        dst = dst_line;
+        dst_line += dst_stride;
+        mask = mask_line;
+        mask_line += mask_stride;
+
+        w = width;
+
+        /* Head: one pixel at a time until dst is 16-byte aligned. */
+        while (w && (unsigned long)dst & 15)
+        {
+            s = 0xff000000 | *src++;
+            m = (uint32_t) *mask++;
+            d = *dst;
+            ms = unpack_32_1x128 (s);
+
+            /* With m == 0xff the opaque source is stored unchanged;
+             * any other mask value (including 0) goes through the blend.
+             */
+            if (m != 0xff)
+            {
+		__m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+		__m128i md = unpack_32_1x128 (d);
+
+                ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
+            }
+
+            *dst++ = pack_1x128_32 (ms);
+            w--;
+        }
+
+        /* Body: four pixels per iteration, dst is aligned here. */
+        while (w >= 4)
+        {
+            /* Four mask bytes are fetched with a single 32-bit load. */
+            m = *(uint32_t*) mask;
+            xmm_src = _mm_or_si128 (
+		load_128_unaligned ((__m128i*)src), mask_ff000000);
+
+            if (m == 0xffffffff)
+            {
+                /* All four mask bytes are 0xff: store the source as is. */
+                save_128_aligned ((__m128i*)dst, xmm_src);
+            }
+            else
+            {
+                xmm_dst = load_128_aligned ((__m128i*)dst);
+
+                xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
+
+                unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+                expand_alpha_rev_2x128 (
+		    xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+                /* The blended result is left in xmm_dst_lo / xmm_dst_hi. */
+                in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+			       &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
+			       &xmm_dst_lo, &xmm_dst_hi);
+
+                save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+            }
+
+            src += 4;
+            dst += 4;
+            mask += 4;
+            w -= 4;
+        }
+
+        /* Tail: fewer than four pixels remain. */
+        while (w)
+        {
+            m = (uint32_t) *mask++;
+
+            /* A zero mask leaves the destination pixel untouched. */
+            if (m)
+            {
+                s = 0xff000000 | *src;
+
+                if (m == 0xff)
+                {
+                    *dst = s;
+                }
+                else
+                {
+		    /* NOTE: this ms shadows the function-level ms. */
+		    __m128i ma, md, ms;
+
+                    d = *dst;
+
+		    ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+		    md = unpack_32_1x128 (d);
+		    ms = unpack_32_1x128 (s);
+
+                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
+                }
+
+            }
+
+            src++;
+            dst++;
+            w--;
+        }
+    }
+
+}
+
+/*
+ * Composite a 32-bit source with per-pixel alpha through an 8-bit mask onto
+ * a 32-bit destination.  The fast-path table in this file registers it for
+ * PIXMAN_OP_OVER with a8r8g8b8 / a8b8g8r8 sources and an a8 mask.
+ *
+ * imp and op are not referenced in the body.  Every scanline is handled in
+ * three phases: single pixels until dst is 16-byte aligned, groups of four
+ * pixels using aligned 128-bit destination accesses, then the leftover
+ * pixels one at a time.
+ *
+ * Shortcuts taken in all phases: a zero mask leaves the destination
+ * untouched, and a fully opaque source under a 0xff mask is copied straight
+ * to the destination.  Everything else is blended by in_over_1x128 () /
+ * in_over_2x128 (), which are defined elsewhere.
+ */
+static void
+sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
+                                 pixman_op_t              op,
+                                 pixman_image_t *         src_image,
+                                 pixman_image_t *         mask_image,
+                                 pixman_image_t *         dst_image,
+                                 int32_t                  src_x,
+                                 int32_t                  src_y,
+                                 int32_t                  mask_x,
+                                 int32_t                  mask_y,
+                                 int32_t                  dest_x,
+                                 int32_t                  dest_y,
+                                 int32_t                  width,
+                                 int32_t                  height)
+{
+    uint32_t    *src, *src_line, s;
+    uint32_t    *dst, *dst_line, d;
+    uint8_t         *mask, *mask_line;
+    uint32_t m;
+    int src_stride, mask_stride, dst_stride;
+    int32_t w;
+
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    /* Fetch the first-line pointer and the stride of each image. */
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+        src = src_line;
+        src_line += src_stride;
+        dst = dst_line;
+        dst_line += dst_stride;
+        mask = mask_line;
+        mask_line += mask_stride;
+
+        w = width;
+
+        /* Head: one pixel at a time until dst is 16-byte aligned. */
+        while (w && (unsigned long)dst & 15)
+        {
+	    uint32_t sa;
+
+            s = *src++;
+            m = (uint32_t) *mask++;
+            d = *dst;
+
+	    /* Source alpha is the top byte of the pixel. */
+	    sa = s >> 24;
+
+	    if (m)
+	    {
+		if (sa == 0xff && m == 0xff)
+		{
+		    *dst = s;
+		}
+		else
+		{
+		    __m128i ms, md, ma, msa;
+
+		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		    ms = unpack_32_1x128 (s);
+		    md = unpack_32_1x128 (d);
+
+		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+		}
+	    }
+
+	    dst++;
+            w--;
+        }
+
+        /* Body: four pixels per iteration, dst is aligned here. */
+        while (w >= 4)
+        {
+            /* Four mask bytes are fetched with a single 32-bit load. */
+            m = *(uint32_t *) mask;
+
+	    /* All four mask bytes zero: nothing to do for this group. */
+	    if (m)
+	    {
+		xmm_src = load_128_unaligned ((__m128i*)src);
+
+		if (m == 0xffffffff && is_opaque (xmm_src))
+		{
+		    save_128_aligned ((__m128i *)dst, xmm_src);
+		}
+		else
+		{
+		    xmm_dst = load_128_aligned ((__m128i *)dst);
+
+		    xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
+
+		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
+		    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+		    /* The blended result is left in xmm_dst_lo / xmm_dst_hi. */
+		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
+				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+		}
+	    }
+
+            src += 4;
+            dst += 4;
+            mask += 4;
+            w -= 4;
+        }
+
+        /* Tail: fewer than four pixels remain; same logic as the head. */
+        while (w)
+        {
+	    uint32_t sa;
+
+            s = *src++;
+            m = (uint32_t) *mask++;
+            d = *dst;
+
+	    sa = s >> 24;
+
+	    if (m)
+	    {
+		if (sa == 0xff && m == 0xff)
+		{
+		    *dst = s;
+		}
+		else
+		{
+		    __m128i ms, md, ma, msa;
+
+		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		    ms = unpack_32_1x128 (s);
+		    md = unpack_32_1x128 (d);
+
+		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+		}
+	    }
+
+	    dst++;
+            w--;
+        }
+    }
+
+}
+
+/*
+ * Composite the existing 32-bit destination over a solid source colour and
+ * write the result back to the destination.  The fast-path table in this
+ * file registers it for PIXMAN_OP_OVER_REVERSE with a solid source, no mask
+ * and an a8r8g8b8 / a8b8g8r8 destination.
+ *
+ * The solid colour is obtained with _pixman_image_get_solid (); when it is
+ * 0 the function returns without touching the destination.  op,
+ * mask_image, src_x, src_y, mask_x and mask_y are not referenced.
+ *
+ * Every scanline is handled in three phases: single pixels until dst is
+ * 16-byte aligned, groups of four pixels with aligned 128-bit accesses,
+ * then the leftover pixels one at a time.
+ */
+static void
+sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
+				    pixman_op_t              op,
+				    pixman_image_t *         src_image,
+				    pixman_image_t *         mask_image,
+				    pixman_image_t *         dst_image,
+				    int32_t                  src_x,
+				    int32_t                  src_y,
+				    int32_t                  mask_x,
+				    int32_t                  mask_y,
+				    int32_t                  dest_x,
+				    int32_t                  dest_y,
+				    int32_t                  width,
+				    int32_t                  height)
+{
+    uint32_t src;
+    uint32_t    *dst_line, *dst;
+    __m128i xmm_src;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_dsta_hi, xmm_dsta_lo;
+    int dst_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    /* Expanded once; reused for every pixel of the rectangle. */
+    xmm_src = expand_pixel_32_1x128 (src);
+
+    while (height--)
+    {
+	dst = dst_line;
+
+	dst_line += dst_stride;
+	w = width;
+
+	/* Head: one pixel at a time until dst is 16-byte aligned. */
+	while (w && (unsigned long)dst & 15)
+	{
+	    __m128i vd;
+
+	    vd = unpack_32_1x128 (*dst);
+
+	    /* The destination pixel and its alpha take the "source" slots
+	     * of over_1x128 (); the solid colour takes the "dest" slot.
+	     */
+	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
+					      xmm_src));
+	    w--;
+	    dst++;
+	}
+
+	/* Body: four pixels per iteration, dst is aligned here. */
+	while (w >= 4)
+	{
+	    __m128i tmp_lo, tmp_hi;
+
+	    xmm_dst = load_128_aligned ((__m128i*)dst);
+
+	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+	    expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
+
+	    /* over_2x128 () leaves its result in the last pair of arguments,
+	     * so the solid colour is copied into temporaries first.
+	     */
+	    tmp_lo = xmm_src;
+	    tmp_hi = xmm_src;
+
+	    over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+			&xmm_dsta_lo, &xmm_dsta_hi,
+			&tmp_lo, &tmp_hi);
+
+	    save_128_aligned (
+		(__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
+
+	    w -= 4;
+	    dst += 4;
+	}
+
+	/* Tail: fewer than four pixels remain; same logic as the head. */
+	while (w)
+	{
+	    __m128i vd;
+
+	    vd = unpack_32_1x128 (*dst);
+
+	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
+					      xmm_src));
+	    w--;
+	    dst++;
+	}
+
+    }
+
+}
+
+/*
+ * Composite a 32-bit source with per-pixel alpha through a 32-bit mask onto
+ * a 32-bit destination.  Only the alpha of the mask is used: the scalar
+ * loops take the top byte of each mask pixel and the vector loop expands
+ * the mask alpha with expand_alpha_2x128 ().  The fast-path table in this
+ * file registers it for PIXMAN_OP_OVER with a8r8g8b8 source, mask and
+ * destination.
+ *
+ * imp and op are not referenced in the body.  Every scanline is handled in
+ * three phases: single pixels until dst is 16-byte aligned, groups of four
+ * pixels using aligned 128-bit destination accesses, then the leftover
+ * pixels one at a time.
+ *
+ * Shortcuts taken in all phases: a zero mask alpha leaves the destination
+ * untouched, and a fully opaque source under a fully opaque mask is copied
+ * straight to the destination.
+ */
+static void
+sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
+				    pixman_op_t              op,
+				    pixman_image_t *         src_image,
+				    pixman_image_t *         mask_image,
+				    pixman_image_t *         dst_image,
+				    int32_t                  src_x,
+				    int32_t                  src_y,
+				    int32_t                  mask_x,
+				    int32_t                  mask_y,
+				    int32_t                  dest_x,
+				    int32_t                  dest_y,
+				    int32_t                  width,
+				    int32_t                  height)
+{
+    uint32_t    *src, *src_line, s;
+    uint32_t    *dst, *dst_line, d;
+    uint32_t    *mask, *mask_line;
+    uint32_t    m;
+    int src_stride, mask_stride, dst_stride;
+    int32_t w;
+
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    /* Fetch the first-line pointer and the stride of each image. */
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+        src = src_line;
+        src_line += src_stride;
+        dst = dst_line;
+        dst_line += dst_stride;
+        mask = mask_line;
+        mask_line += mask_stride;
+
+        w = width;
+
+        /* Head: one pixel at a time until dst is 16-byte aligned. */
+        while (w && (unsigned long)dst & 15)
+        {
+	    uint32_t sa;
+
+            s = *src++;
+            /* Only the alpha byte of the mask pixel is kept. */
+            m = (*mask++) >> 24;
+            d = *dst;
+
+	    sa = s >> 24;
+
+	    if (m)
+	    {
+		if (sa == 0xff && m == 0xff)
+		{
+		    *dst = s;
+		}
+		else
+		{
+		    __m128i ms, md, ma, msa;
+
+		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		    ms = unpack_32_1x128 (s);
+		    md = unpack_32_1x128 (d);
+
+		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+		}
+	    }
+
+	    dst++;
+            w--;
+        }
+
+        /* Body: four pixels per iteration, dst is aligned here. */
+        while (w >= 4)
+        {
+	    xmm_mask = load_128_unaligned ((__m128i*)mask);
+
+	    /* A fully transparent mask group leaves the destination alone. */
+	    if (!is_transparent (xmm_mask))
+	    {
+		xmm_src = load_128_unaligned ((__m128i*)src);
+
+		if (is_opaque (xmm_mask) && is_opaque (xmm_src))
+		{
+		    save_128_aligned ((__m128i *)dst, xmm_src);
+		}
+		else
+		{
+		    xmm_dst = load_128_aligned ((__m128i *)dst);
+
+		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
+		    expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+		    /* The blended result is left in xmm_dst_lo / xmm_dst_hi. */
+		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
+				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+		}
+	    }
+
+            src += 4;
+            dst += 4;
+            mask += 4;
+            w -= 4;
+        }
+
+        /* Tail: fewer than four pixels remain; same logic as the head. */
+        while (w)
+        {
+	    uint32_t sa;
+
+            s = *src++;
+            m = (*mask++) >> 24;
+            d = *dst;
+
+	    sa = s >> 24;
+
+	    if (m)
+	    {
+		if (sa == 0xff && m == 0xff)
+		{
+		    *dst = s;
+		}
+		else
+		{
+		    __m128i ms, md, ma, msa;
+
+		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		    ms = unpack_32_1x128 (s);
+		    md = unpack_32_1x128 (d);
+
+		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+		}
+	    }
+
+	    dst++;
+            w--;
+        }
+    }
+
+}
+
+/*
+ * A variant of 'sse2_combine_over_u' with minor tweaks: instead of reading
+ * consecutive source pixels it samples the source scanline at fixed-point
+ * position vx, which advances by unit_x after every output pixel (the
+ * source index is vx >> 16).
+ *
+ *   pd                    - destination scanline, w pixels are processed
+ *   ps                    - source scanline that is sampled
+ *   max_vx                - not used by this function
+ *   fully_transparent_src - when set, the function returns immediately
+ *
+ * pm is initialised to NULL and never reassigned, so every "if (pm)"
+ * increment below is dead code and combine1 () / combine4 () are always
+ * called with a NULL mask pointer.
+ */
+static force_inline void
+scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
+                                             const uint32_t* ps,
+                                             int32_t         w,
+                                             pixman_fixed_t  vx,
+                                             pixman_fixed_t  unit_x,
+                                             pixman_fixed_t  max_vx,
+                                             pixman_bool_t   fully_transparent_src)
+{
+    uint32_t s, d;
+    const uint32_t* pm = NULL;
+
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+    if (fully_transparent_src)
+	return;
+
+    /* Align dst on a 16-byte boundary */
+    while (w && ((unsigned long)pd & 15))
+    {
+	d = *pd;
+	s = combine1 (ps + (vx >> 16), pm);
+	vx += unit_x;
+
+	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
+	if (pm)
+	    pm++;
+	w--;
+    }
+
+    /* Four pixels per iteration; pd is 16-byte aligned from here on. */
+    while (w >= 4)
+    {
+	__m128i tmp;
+	uint32_t tmp1, tmp2, tmp3, tmp4;
+
+	/* Gather four source samples at successive vx positions. */
+	tmp1 = ps[vx >> 16];
+	vx += unit_x;
+	tmp2 = ps[vx >> 16];
+	vx += unit_x;
+	tmp3 = ps[vx >> 16];
+	vx += unit_x;
+	tmp4 = ps[vx >> 16];
+	vx += unit_x;
+
+	/* tmp1 ends up in the lowest 32 bits, matching memory order. */
+	tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
+
+	xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
+
+	if (is_opaque (xmm_src_hi))
+	{
+	    /* All four samples opaque: plain copy. */
+	    save_128_aligned ((__m128i*)pd, xmm_src_hi);
+	}
+	else if (!is_zero (xmm_src_hi))
+	{
+	    /* Mixed alpha: blend.  All-zero samples skip the store. */
+	    xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	    unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+	    expand_alpha_2x128 (
+		xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+
+	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
+			&xmm_alpha_lo, &xmm_alpha_hi,
+			&xmm_dst_lo, &xmm_dst_hi);
+
+	    /* rebuild the 4 pixel data and save */
+	    save_128_aligned ((__m128i*)pd,
+			      pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	}
+
+	w -= 4;
+	pd += 4;
+	if (pm)
+	    pm += 4;
+    }
+
+    /* Tail: fewer than four pixels remain. */
+    while (w)
+    {
+	d = *pd;
+	s = combine1 (ps + (vx >> 16), pm);
+	vx += unit_x;
+
+	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
+	if (pm)
+	    pm++;
+
+	w--;
+    }
+}
+
+/*
+ * Instantiate FAST_NEAREST_MAINLOOP (defined outside this part of the file)
+ * three times around scaled_nearest_scanline_sse2_8888_8888_OVER, once each
+ * for the COVER, NONE and PAD variants, with uint32_t given for both type
+ * arguments.  The fast-path table refers to these instantiations through
+ * the sse2_8888_8888 name prefix.
+ * NOTE(review): presumably each expansion defines the per-rectangle driver
+ * for one repeat mode -- confirm against the macro definition.
+ */
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
+		       scaled_nearest_scanline_sse2_8888_8888_OVER,
+		       uint32_t, uint32_t, COVER)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
+		       scaled_nearest_scanline_sse2_8888_8888_OVER,
+		       uint32_t, uint32_t, NONE)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
+		       scaled_nearest_scanline_sse2_8888_8888_OVER,
+		       uint32_t, uint32_t, PAD)
+
+/*
+ * Single-pixel step shared by the head and tail loops of
+ * scaled_nearest_scanline_sse2_8888_n_8888_OVER: combines source pixel s
+ * with destination pixel d under the pre-expanded solid mask xmm_mask via
+ * in_over_1x128 () and returns the packed 32-bit result.
+ */
+static force_inline uint32_t
+scaled_nearest_pixel_sse2_8888_n_8888_OVER (uint32_t s,
+					    uint32_t d,
+					    __m128i  xmm_mask)
+{
+    __m128i vsrc  = unpack_32_1x128 (s);
+    __m128i vsrca = expand_alpha_1x128 (vsrc);
+    __m128i vmask = xmm_mask;
+    __m128i vdest = unpack_32_1x128 (d);
+
+    return pack_1x128_32 (in_over_1x128 (&vsrc, &vsrca, &vmask, &vdest));
+}
+
+/*
+ * Nearest-neighbour scaled scanline with a solid mask.
+ *
+ * The source scanline src is sampled at fixed-point position vx, which
+ * advances by unit_x after every output pixel, and each sample is combined
+ * with the matching pixel of dst using the alpha byte (top 8 bits) of
+ * *mask.  w destination pixels are processed.  max_vx is not used.
+ *
+ * Nothing is written when zero_src is set or when the mask alpha is 0, and
+ * source samples equal to 0 leave their destination pixels untouched.
+ */
+static force_inline void
+scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
+					       uint32_t *       dst,
+					       const uint32_t * src,
+					       int32_t          w,
+					       pixman_fixed_t   vx,
+					       pixman_fixed_t   unit_x,
+					       pixman_fixed_t   max_vx,
+					       pixman_bool_t    zero_src)
+{
+    uint32_t mask_alpha;
+    __m128i xmm_mask;
+
+    /* zero_src is tested first so *mask is not read when it is set. */
+    if (zero_src)
+	return;
+
+    mask_alpha = *mask >> 24;
+    if (mask_alpha == 0)
+	return;
+
+    xmm_mask = create_mask_16_128 (mask_alpha);
+
+    /* Head: one pixel at a time until dst is 16-byte aligned. */
+    for (; w && ((unsigned long)dst & 15); dst++, w--)
+    {
+	uint32_t s = src[pixman_fixed_to_int (vx)];
+
+	vx += unit_x;
+
+	if (s)
+	    *dst = scaled_nearest_pixel_sse2_8888_n_8888_OVER (s, *dst, xmm_mask);
+    }
+
+    /* Body: four pixels per iteration, dst is aligned here. */
+    for (; w >= 4; dst += 4, w -= 4)
+    {
+	__m128i xmm_src, xmm_src_lo, xmm_src_hi;
+	__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+	__m128i xmm_alpha_lo, xmm_alpha_hi;
+	uint32_t p0, p1, p2, p3;
+
+	/* Gather four source samples at successive vx positions. */
+	p0 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+	p1 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+	p2 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+	p3 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+
+	/* p0 ends up in the lowest 32 bits, matching memory order. */
+	xmm_src = _mm_set_epi32 (p3, p2, p1, p0);
+
+	/* Four zero samples: the destination group is left alone. */
+	if (is_zero (xmm_src))
+	    continue;
+
+	xmm_dst = load_128_aligned ((__m128i*)dst);
+
+	unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+			    &xmm_alpha_lo, &xmm_alpha_hi);
+
+	/* The blended result is left in xmm_dst_lo / xmm_dst_hi. */
+	in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+		       &xmm_alpha_lo, &xmm_alpha_hi,
+		       &xmm_mask, &xmm_mask,
+		       &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+    }
+
+    /* Tail: fewer than four pixels remain. */
+    for (; w; dst++, w--)
+    {
+	uint32_t s = src[pixman_fixed_to_int (vx)];
+
+	vx += unit_x;
+
+	if (s)
+	    *dst = scaled_nearest_pixel_sse2_8888_n_8888_OVER (s, *dst, xmm_mask);
+    }
+
+}
+
+/*
+ * Instantiate FAST_NEAREST_MAINLOOP_COMMON (defined outside this part of
+ * the file) three times around
+ * scaled_nearest_scanline_sse2_8888_n_8888_OVER, once each for the COVER,
+ * PAD and NONE variants, with three uint32_t type arguments and both
+ * trailing flags TRUE.  The fast-path table refers to these instantiations
+ * through the sse2_8888_n_8888 name prefix.
+ * NOTE(review): the two TRUE flags presumably mean "has a mask" and "mask
+ * is solid" -- confirm against the macro definition.
+ */
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
+			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+			      uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
+			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+			      uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
+			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
+
+/*
+ * Write `width` output pixels to out[] by bilinear interpolation between
+ * two source scanlines.
+ *
+ *   out         - destination for the interpolated pixels
+ *   top, bottom - the two source scanlines
+ *   wt, wb      - weights applied to the top and bottom scanline
+ *   x           - starting horizontal source position (fixed point)
+ *   ux          - amount added to x after every output pixel
+ *   width       - number of output pixels
+ *
+ * For every output pixel the code reads index pixman_fixed_to_int (x) and
+ * the index after it from both scanlines without any bounds check, so the
+ * caller has to guarantee that both are readable.
+ *
+ * NOTE(review): the final ">> 16" normalisation presumably relies on
+ * wt + wb == 256 -- confirm against the caller.
+ */
+static void
+bilinear_interpolate_line_sse2 (uint32_t *       out,
+                                const uint32_t * top,
+                                const uint32_t * bottom,
+                                int              wt,
+                                int              wb,
+                                pixman_fixed_t   x,
+                                pixman_fixed_t   ux,
+                                int              width)
+{
+    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);
+    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);
+    /* Low four lanes 0xff / 1, high four lanes 0 / 0: see xmm_wh below. */
+    const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);
+    const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);
+    const __m128i xmm_ux = _mm_set_epi16 (ux, ux, ux, ux, ux, ux, ux, ux);
+    const __m128i xmm_zero = _mm_setzero_si128 ();
+    /* Only the low 16 bits of x fit into each lane. */
+    __m128i xmm_x = _mm_set_epi16 (x, x, x, x, x, x, x, x);
+    uint32_t pix1, pix2, pix3, pix4;
+
+    /*
+     * INTERPOLATE_ONE_PIXEL stores one interpolated pixel in `pix` and
+     * advances both x and its vector copy xmm_x by one step.
+     *
+     * The 2x2 block is packed as (tr, tl, br, bl), so after widening the
+     * bytes the four low 16-bit lanes hold the channels of the left pixel
+     * and the four high lanes those of the right pixel.
+     *
+     * With f = (low 16 bits of x) >> 8, xmm_wh becomes (f ^ 0xff) + 1 =
+     * 256 - f in the low lanes and f in the high lanes, i.e. the
+     * horizontal weights of the left and right pixel.  The 16x16 -> 32 bit
+     * products are rebuilt from their low and high halves, the left and
+     * right contributions are added, and the sum is shifted right by 16
+     * and packed back to one 32-bit pixel.
+     */
+    #define INTERPOLATE_ONE_PIXEL(pix)						\
+    do {									\
+	__m128i xmm_wh, xmm_lo, xmm_hi, a;					\
+	/* fetch 2x2 pixel block into sse2 register */				\
+	uint32_t tl = top [pixman_fixed_to_int (x)];				\
+	uint32_t tr = top [pixman_fixed_to_int (x) + 1];			\
+	uint32_t bl = bottom [pixman_fixed_to_int (x)];				\
+	uint32_t br = bottom [pixman_fixed_to_int (x) + 1];			\
+	a = _mm_set_epi32 (tr, tl, br, bl);					\
+        x += ux;								\
+	/* vertical interpolation */						\
+	a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero),	\
+					    xmm_wt),				\
+			   _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero),	\
+					    xmm_wb));				\
+	/* calculate horizontal weights */					\
+	xmm_wh = _mm_add_epi16 (xmm_addc,					\
+				_mm_xor_si128 (xmm_xorc,			\
+					       _mm_srli_epi16 (xmm_x, 8)));	\
+	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
+	/* horizontal interpolation */						\
+	xmm_lo = _mm_mullo_epi16 (a, xmm_wh);					\
+	xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);					\
+	a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),			\
+			   _mm_unpackhi_epi16 (xmm_lo, xmm_hi));		\
+	/* shift and pack the result */						\
+	a = _mm_srli_epi32 (a, 16);						\
+	a = _mm_packs_epi32 (a, a);						\
+	a = _mm_packus_epi16 (a, a);						\
+	pix = _mm_cvtsi128_si32 (a);						\
+    } while (0)
+
+    /* Four pixels per iteration while at least four remain. */
+    while ((width -= 4) >= 0)
+    {
+	INTERPOLATE_ONE_PIXEL (pix1);
+	INTERPOLATE_ONE_PIXEL (pix2);
+	INTERPOLATE_ONE_PIXEL (pix3);
+	INTERPOLATE_ONE_PIXEL (pix4);
+	*out++ = pix1;
+	*out++ = pix2;
+	*out++ = pix3;
+	*out++ = pix4;
+    }
+    /*
+     * width is now (remaining - 4), a value in [-4, -1]; in two's
+     * complement its two low bits equal those of the remaining count, so
+     * the tests below handle the last 0..3 pixels.
+     */
+    if (width & 2)
+    {
+	INTERPOLATE_ONE_PIXEL (pix1);
+	INTERPOLATE_ONE_PIXEL (pix2);
+	*out++ = pix1;
+	*out++ = pix2;
+    }
+    if (width & 1)
+    {
+	INTERPOLATE_ONE_PIXEL (pix1);
+	*out = pix1;
+    }
+
+    #undef INTERPOLATE_ONE_PIXEL
+}
+
+/*
+ * Scanline callback for bilinear scaled SRC: forwards dst, the two source
+ * scanlines, the weights wt / wb, the start position vx, the step unit_x
+ * and the pixel count w to bilinear_interpolate_line_sse2 ().
+ *
+ * mask, max_vx and zero_src are accepted but not used.
+ */
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
+					     const uint32_t * mask,
+					     const uint32_t * src_top,
+					     const uint32_t * src_bottom,
+					     int32_t          w,
+					     int              wt,
+					     int              wb,
+					     pixman_fixed_t   vx,
+					     pixman_fixed_t   unit_x,
+					     pixman_fixed_t   max_vx,
+					     pixman_bool_t    zero_src)
+{
+    bilinear_interpolate_line_sse2 (dst, src_top, src_bottom,
+				    wt, wb, vx, unit_x, w);
+}
+
+/*
+ * Instantiate FAST_BILINEAR_MAINLOOP_COMMON (defined outside this part of
+ * the file) three times around scaled_bilinear_scanline_sse2_8888_8888_SRC,
+ * once each for the COVER, PAD and NONE variants, with three uint32_t type
+ * arguments and both trailing flags FALSE.  The fast-path table refers to
+ * these instantiations through the sse2_8888_8888 name prefix.
+ * NOTE(review): the two FALSE flags presumably mean "no mask" -- confirm
+ * against the macro definition.
+ */
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
+			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       COVER, FALSE, FALSE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
+			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       PAD, FALSE, FALSE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
+			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       NONE, FALSE, FALSE)
+
+/*
+ * Table of the SSE2 fast paths.  Each entry names an operator, a source
+ * format, a mask format, a destination format and the function that
+ * implements that combination; "solid" and "null" appear where a solid
+ * image or no image is expected.  Entries are grouped by operator, followed
+ * by the scaled (nearest / bilinear) variants, and the table ends with a
+ * PIXMAN_OP_NONE entry.
+ */
+static const pixman_fast_path_t sse2_fast_paths[] =
+{
+    /* PIXMAN_OP_OVER */
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
+    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
+    /* OVER with an x888 source and no mask is handled as a plain copy. */
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+    
+    /* PIXMAN_OP_OVER_REVERSE */
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
+
+    /* PIXMAN_OP_ADD */
+    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
+    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
+    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
+
+    /* PIXMAN_OP_SRC */
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
+
+    /* PIXMAN_OP_IN */
+    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
+    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
+    PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
+
+    /* Nearest-neighbour scaled OVER: COVER, NONE and PAD variants */
+    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+
+    /* Nearest-neighbour scaled OVER with a solid mask */
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
+
+    /* Bilinear scaled SRC */
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
+
+    /* End-of-table entry */
+    { PIXMAN_OP_NONE },
+};
+
+/*
+ * blt hook of the SSE2 implementation.
+ *
+ * The request is first offered to pixman_blt_sse2 ().  If that call
+ * returns a true value, TRUE is returned.  Otherwise the very same
+ * arguments are forwarded to imp->delegate through
+ * _pixman_implementation_blt () and the delegate's result is returned.
+ */
+static pixman_bool_t
+sse2_blt (pixman_implementation_t *imp,
+          uint32_t *               src_bits,
+          uint32_t *               dst_bits,
+          int                      src_stride,
+          int                      dst_stride,
+          int                      src_bpp,
+          int                      dst_bpp,
+          int                      src_x,
+          int                      src_y,
+          int                      dst_x,
+          int                      dst_y,
+          int                      width,
+          int                      height)
+{
+    if (pixman_blt_sse2 (src_bits, dst_bits,
+			 src_stride, dst_stride,
+			 src_bpp, dst_bpp,
+			 src_x, src_y,
+			 dst_x, dst_y,
+			 width, height))
+    {
+	return TRUE;
+    }
+
+    /* The SSE2 blitter declined the request; let the delegate handle it. */
+    return _pixman_implementation_blt (imp->delegate,
+				       src_bits, dst_bits,
+				       src_stride, dst_stride,
+				       src_bpp, dst_bpp,
+				       src_x, src_y,
+				       dst_x, dst_y,
+				       width, height);
+}
+
+/*
+ * fill hook of the SSE2 implementation.
+ *
+ * Tries pixman_fill_sse2 () first and returns TRUE when it succeeds.
+ * When it returns a false value the same arguments are passed on to
+ * imp->delegate via _pixman_implementation_fill () and that result is
+ * returned instead.
+ *
+ * On GCC builds that are not x86-64 the function is tagged with
+ * force_align_arg_pointer, which makes the compiler realign the stack
+ * on entry (presumably so that SSE2 spills are 16-byte aligned even
+ * when the caller's stack is not -- confirm against the callers).
+ */
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+static pixman_bool_t
+sse2_fill (pixman_implementation_t *imp,
+           uint32_t *               bits,
+           int                      stride,
+           int                      bpp,
+           int                      x,
+           int                      y,
+           int                      width,
+           int                      height,
+           uint32_t xor)
+{
+    if (pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
+	return TRUE;
+
+    /* The SSE2 fill declined the request; let the delegate handle it. */
+    return _pixman_implementation_fill (imp->delegate,
+					bits, stride, bpp,
+					x, y, width, height,
+					xor);
+}
+
+/*
+ * Scanline fetcher for x8r8g8b8 sources.
+ *
+ * Copies iter->width pixels from iter->bits into iter->buffer, OR-ing
+ * 0xff000000 into every pixel, then advances iter->bits by iter->stride
+ * so the next call reads the following row.  The mask argument is not
+ * used.  Returns iter->buffer.
+ */
+static uint32_t *
+sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    const __m128i opaque = mask_ff000000;
+    uint32_t *out = iter->buffer;
+    uint32_t *in = (uint32_t *)iter->bits;
+    int remaining = iter->width;
+
+    /* Step to the next row now; 'in' already holds the current one. */
+    iter->bits += iter->stride;
+
+    /* Single pixels until the output pointer is 16-byte aligned. */
+    for (; remaining && (((unsigned long)out) & 0x0f); remaining--)
+	*out++ = (*in++) | 0xff000000;
+
+    /* Four pixels per step: unaligned load, aligned store. */
+    for (; remaining >= 4; remaining -= 4, in += 4, out += 4)
+    {
+	__m128i pixels = load_128_unaligned ((__m128i *)in);
+
+	save_128_aligned ((__m128i *)out, _mm_or_si128 (pixels, opaque));
+    }
+
+    /* Whatever is left over (fewer than four pixels). */
+    for (; remaining; remaining--)
+	*out++ = (*in++) | 0xff000000;
+
+    return iter->buffer;
+}
+
+/*
+ * Scanline fetcher for r5g6b5 sources.
+ *
+ * Expands iter->width 16-bit pixels from iter->bits into 32-bit pixels
+ * in iter->buffer and advances iter->bits by iter->stride.  Scalar
+ * pixels go through CONVERT_0565_TO_8888; the vector path expands with
+ * unpack_565_to_8888 () and ORs in 0xff000000.  The mask argument is
+ * not used.  Returns iter->buffer.
+ */
+static uint32_t *
+sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    const __m128i opaque = mask_ff000000;
+    const __m128i zero = _mm_setzero_si128 ();
+    uint32_t *out = iter->buffer;
+    uint16_t *in = (uint16_t *)iter->bits;
+    int remaining = iter->width;
+
+    /* Step to the next row now; 'in' already holds the current one. */
+    iter->bits += iter->stride;
+
+    /* Single pixels until the output pointer is 16-byte aligned. */
+    for (; remaining && (((unsigned long)out) & 0x0f); remaining--)
+    {
+	/* Read into a temporary so the macro argument has no side effects. */
+	uint16_t pixel = *in++;
+
+	*out++ = CONVERT_0565_TO_8888 (pixel);
+    }
+
+    /* Eight pixels per step: one 128-bit load, two 128-bit stores. */
+    for (; remaining >= 8; remaining -= 8, in += 8, out += 8)
+    {
+	__m128i packed = _mm_loadu_si128 ((__m128i *)in);
+	__m128i first4, last4;
+
+	/* Zero-extend each 16-bit pixel to 32 bits before expanding it. */
+	first4 = unpack_565_to_8888 (_mm_unpacklo_epi16 (packed, zero));
+	last4 = unpack_565_to_8888 (_mm_unpackhi_epi16 (packed, zero));
+
+	save_128_aligned ((__m128i *)(out + 0), _mm_or_si128 (first4, opaque));
+	save_128_aligned ((__m128i *)(out + 4), _mm_or_si128 (last4, opaque));
+    }
+
+    /* Whatever is left over (fewer than eight pixels). */
+    for (; remaining; remaining--)
+    {
+	uint16_t pixel = *in++;
+
+	*out++ = CONVERT_0565_TO_8888 (pixel);
+    }
+
+    return iter->buffer;
+}
+
+/*
+ * Scanline fetcher for a8 sources.
+ *
+ * Expands iter->width 8-bit values from iter->bits into 32-bit pixels
+ * in iter->buffer, placing each source byte in bits 24-31 and leaving
+ * the low 24 bits zero, then advances iter->bits by iter->stride.
+ * The mask argument is not used.  Returns iter->buffer.
+ */
+static uint32_t *
+sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    uint32_t *dst = iter->buffer;
+    uint8_t *src = iter->bits;
+    const __m128i zero = _mm_setzero_si128 ();
+
+    /* Step to the next row now; 'src' already holds the current one. */
+    iter->bits += iter->stride;
+
+    /* Single pixels until the output pointer is 16-byte aligned. */
+    while (w && (((unsigned long)dst) & 15))
+    {
+	/* Widen to uint32_t before shifting: a uint8_t promotes to signed
+	 * int, and shifting a value >= 0x80 left by 24 would overflow it.
+	 */
+	*dst++ = (uint32_t)*src++ << 24;
+	w--;
+    }
+
+    /* Sixteen pixels per step: one 128-bit load, four aligned stores. */
+    while (w >= 16)
+    {
+	__m128i a8 = _mm_loadu_si128 ((__m128i *)src);
+
+	/* Interleaving zero below each byte yields 16-bit lanes of a << 8 */
+	__m128i lo16 = _mm_unpacklo_epi8 (zero, a8);
+	__m128i hi16 = _mm_unpackhi_epi8 (zero, a8);
+
+	/* ... and once more at 16-bit width yields 32-bit lanes of a << 24 */
+	_mm_store_si128 ((__m128i *)(dst +  0), _mm_unpacklo_epi16 (zero, lo16));
+	_mm_store_si128 ((__m128i *)(dst +  4), _mm_unpackhi_epi16 (zero, lo16));
+	_mm_store_si128 ((__m128i *)(dst +  8), _mm_unpacklo_epi16 (zero, hi16));
+	_mm_store_si128 ((__m128i *)(dst + 12), _mm_unpackhi_epi16 (zero, hi16));
+
+	dst += 16;
+	src += 16;
+	w -= 16;
+    }
+
+    /* Whatever is left over (fewer than sixteen pixels). */
+    while (w)
+    {
+	*dst++ = (uint32_t)*src++ << 24;
+	w--;
+    }
+
+    return iter->buffer;
+}
+
+typedef struct
+{
+    pixman_format_code_t	format;  /* compared with the image's extended_format_code */
+    pixman_iter_get_scanline_t	get_scanline;  /* installed as iter->get_scanline on a match */
+} fetcher_info_t;  /* one row of the fetchers[] lookup table */
+
+/*
+ * Source formats that have a dedicated SSE2 scanline fetcher.
+ * sse2_src_iter_init () walks this table from the top and stops at the
+ * entry whose format is PIXMAN_null.
+ */
+static const fetcher_info_t fetchers[] =
+{
+    { PIXMAN_x8r8g8b8, sse2_fetch_x8r8g8b8 },
+    { PIXMAN_r5g6b5,   sse2_fetch_r5g6b5 },
+    { PIXMAN_a8,       sse2_fetch_a8 },
+
+    /* End-of-table marker; must stay last. */
+    { PIXMAN_null }
+};
+
+/*
+ * src_iter_init hook of the SSE2 implementation.
+ *
+ * An SSE2 fetcher from fetchers[] is installed on the iterator when all
+ * of the following hold:
+ *   - the iterator has ITER_NARROW set,
+ *   - the image has every flag in FLAGS set,
+ *   - the rectangle (x, y, width, height) lies inside the image bits,
+ *   - the image's extended_format_code matches a table entry.
+ * In that case iter->bits, iter->stride and iter->get_scanline are set
+ * and the function returns.  In every other case the call is passed to
+ * imp->delegate unchanged.
+ */
+static void
+sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{
+    pixman_image_t *image = iter->image;
+    int x = iter->x;
+    int y = iter->y;
+    int width = iter->width;
+    int height = iter->height;
+    const fetcher_info_t *entry;
+
+#define FLAGS								\
+    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
+
+    if ((iter->flags & ITER_NARROW)				&&
+	(image->common.flags & FLAGS) == FLAGS			&&
+	x >= 0 && y >= 0					&&
+	x + width <= image->bits.width				&&
+	y + height <= image->bits.height)
+    {
+	for (entry = fetchers; entry->format != PIXMAN_null; entry++)
+	{
+	    uint8_t *first_row;
+	    int byte_stride;
+
+	    if (image->common.extended_format_code != entry->format)
+		continue;
+
+	    first_row = (uint8_t *)image->bits.bits;
+
+	    /* rowstride is scaled by 4 to obtain a byte stride (presumably
+	     * it counts uint32_t units -- confirm against the image code).
+	     */
+	    byte_stride = image->bits.rowstride * 4;
+
+	    /* Address of the first requested pixel: row y, column x. */
+	    iter->stride = byte_stride;
+	    iter->bits = first_row + byte_stride * iter->y +
+		x * PIXMAN_FORMAT_BPP (entry->format) / 8;
+
+	    iter->get_scanline = entry->get_scanline;
+	    return;
+	}
+    }
+
+    /* No SSE2 fetcher applies; let the delegate set the iterator up. */
+    imp->delegate->src_iter_init (imp->delegate, iter);
+}
+
+/*
+ * Builds the SSE2 implementation on top of 'fallback'.
+ *
+ * Creates an implementation with sse2_fast_paths, initialises the
+ * constant masks used by the SSE2 routines in this file, and installs
+ * the SSE2 combiners together with the blt, fill and src_iter_init
+ * hooks.  Returns the new implementation.
+ *
+ * On GCC builds that are not x86-64 the function is tagged with
+ * force_align_arg_pointer, which makes the compiler realign the stack
+ * on entry.
+ *
+ * NOTE(review): 'imp' is dereferenced without a NULL check -- confirm
+ * that _pixman_implementation_create () cannot return NULL.
+ */
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+pixman_implementation_t *
+_pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp =
+	_pixman_implementation_create (fallback, sse2_fast_paths);
+
+    /* Constant masks: r5g6b5 handling */
+    mask_565_r      = create_mask_2x32_128 (0x00f80000, 0x00f80000);
+    mask_565_g1     = create_mask_2x32_128 (0x00070000, 0x00070000);
+    mask_565_g2     = create_mask_2x32_128 (0x000000e0, 0x000000e0);
+    mask_565_b      = create_mask_2x32_128 (0x0000001f, 0x0000001f);
+    mask_red        = create_mask_2x32_128 (0x00f80000, 0x00f80000);
+    mask_green      = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
+    mask_blue       = create_mask_2x32_128 (0x000000f8, 0x000000f8);
+    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
+    mask_565_fix_g  = create_mask_2x32_128 (0x0000c000, 0x0000c000);
+
+    /* Constant masks: one 16-bit value repeated across the register */
+    mask_0080 = create_mask_16_128 (0x0080);
+    mask_00ff = create_mask_16_128 (0x00ff);
+    mask_0101 = create_mask_16_128 (0x0101);
+    mask_ffff = create_mask_16_128 (0xffff);
+
+    /* Constant masks: alpha */
+    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
+    mask_alpha    = create_mask_2x32_128 (0x00ff0000, 0x00000000);
+
+    /* combine_32 table */
+    imp->combine_32[PIXMAN_OP_OVER]         = sse2_combine_over_u;
+    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
+    imp->combine_32[PIXMAN_OP_IN]           = sse2_combine_in_u;
+    imp->combine_32[PIXMAN_OP_IN_REVERSE]   = sse2_combine_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_OUT]          = sse2_combine_out_u;
+    imp->combine_32[PIXMAN_OP_OUT_REVERSE]  = sse2_combine_out_reverse_u;
+    imp->combine_32[PIXMAN_OP_ATOP]         = sse2_combine_atop_u;
+    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
+    imp->combine_32[PIXMAN_OP_XOR]          = sse2_combine_xor_u;
+    imp->combine_32[PIXMAN_OP_ADD]          = sse2_combine_add_u;
+    imp->combine_32[PIXMAN_OP_SATURATE]     = sse2_combine_saturate_u;
+
+    /* combine_32_ca table */
+    imp->combine_32_ca[PIXMAN_OP_SRC]          = sse2_combine_src_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER]         = sse2_combine_over_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN]           = sse2_combine_in_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE]   = sse2_combine_in_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT]          = sse2_combine_out_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE]  = sse2_combine_out_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP]         = sse2_combine_atop_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_XOR]          = sse2_combine_xor_ca;
+    imp->combine_32_ca[PIXMAN_OP_ADD]          = sse2_combine_add_ca;
+
+    /* Whole-operation hooks */
+    imp->blt           = sse2_blt;
+    imp->fill          = sse2_fill;
+    imp->src_iter_init = sse2_src_iter_init;
+
+    return imp;
+}
author	marha <marha@users.sourceforge.net>	2011-03-20 16:32:44 +0000
committer	marha <marha@users.sourceforge.net>	2011-03-20 16:32:44 +0000
commit	eca5dee9e7a8dea1edba4d10b60444ac0e884139 (patch)
tree	67c0e6552d06cb59b33ef79ece38d6581b2c8976 /pixman
parent	d7f1bd4112420f1d4b41c5409074eca6b34bf507 (diff)
download	vcxsrv-eca5dee9e7a8dea1edba4d10b60444ac0e884139.tar.gz vcxsrv-eca5dee9e7a8dea1edba4d10b60444ac0e884139.tar.bz2 vcxsrv-eca5dee9e7a8dea1edba4d10b60444ac0e884139.zip