aboutsummaryrefslogtreecommitdiff
path: root/pixman
diff options
context:
space:
mode:
Diffstat (limited to 'pixman')
-rw-r--r--pixman/pixman/pixman-bits-image.c3210
-rw-r--r--pixman/pixman/pixman-conical-gradient.c425
-rw-r--r--pixman/pixman/pixman-general.c586
-rw-r--r--pixman/pixman/pixman-implementation.c610
-rw-r--r--pixman/pixman/pixman-linear-gradient.c578
-rw-r--r--pixman/pixman/pixman-private.h51
-rw-r--r--pixman/pixman/pixman-radial-gradient.c923
-rw-r--r--pixman/pixman/pixman-solid-fill.c181
-rw-r--r--pixman/pixman/pixman-sse2.c12153
9 files changed, 9319 insertions, 9398 deletions
diff --git a/pixman/pixman/pixman-bits-image.c b/pixman/pixman/pixman-bits-image.c
index a865d719a..88c2f0eea 100644
--- a/pixman/pixman/pixman-bits-image.c
+++ b/pixman/pixman/pixman-bits-image.c
@@ -1,1608 +1,1602 @@
-/*
- * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
- * 2005 Lars Knoll & Zack Rusin, Trolltech
- * 2008 Aaron Plattner, NVIDIA Corporation
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007, 2009 Red Hat, Inc.
- * Copyright © 2008 André Tupinambá <andrelrt@gmail.com>
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Keith Packard not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission. Keith Packard makes no
- * representations about the suitability of this software for any purpose. It
- * is provided "as is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "pixman-private.h"
-#include "pixman-combine32.h"
-
-/*
- * By default, just evaluate the image at 32bpp and expand. Individual image
- * types can plug in a better scanline getter if they want to. For example
- * we could produce smoother gradients by evaluating them at higher color
- * depth, but that's a project for the future.
- */
-static void
-_pixman_image_get_scanline_generic_64 (pixman_image_t * image,
- int x,
- int y,
- int width,
- uint32_t * buffer,
- const uint32_t * mask)
-{
- uint32_t *mask8 = NULL;
-
- /* Contract the mask image, if one exists, so that the 32-bit fetch
- * function can use it.
- */
- if (mask)
- {
- mask8 = pixman_malloc_ab (width, sizeof(uint32_t));
- if (!mask8)
- return;
-
- pixman_contract (mask8, (uint64_t *)mask, width);
- }
-
- /* Fetch the source image into the first half of buffer. */
- image->bits.get_scanline_32 (image, x, y, width, (uint32_t*)buffer, mask8);
-
- /* Expand from 32bpp to 64bpp in place. */
- pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, width);
-
- free (mask8);
-}
-
-/* Fetch functions */
-
-static force_inline uint32_t
-fetch_pixel_no_alpha (bits_image_t *image,
- int x, int y, pixman_bool_t check_bounds)
-{
- if (check_bounds &&
- (x < 0 || x >= image->width || y < 0 || y >= image->height))
- {
- return 0;
- }
-
- return image->fetch_pixel_32 (image, x, y);
-}
-
-typedef uint32_t (* get_pixel_t) (bits_image_t *image,
- int x, int y, pixman_bool_t check_bounds);
-
-static force_inline void
-repeat (pixman_repeat_t repeat, int size, int *coord)
-{
- switch (repeat)
- {
- case PIXMAN_REPEAT_NORMAL:
- *coord = MOD (*coord, size);
- break;
-
- case PIXMAN_REPEAT_PAD:
- *coord = CLIP (*coord, 0, size - 1);
- break;
-
- case PIXMAN_REPEAT_REFLECT:
- *coord = MOD (*coord, size * 2);
-
- if (*coord >= size)
- *coord = size * 2 - *coord - 1;
- break;
-
- case PIXMAN_REPEAT_NONE:
- break;
-
- default:
- break;
- }
-}
-
-static force_inline uint32_t
-bits_image_fetch_pixel_nearest (bits_image_t *image,
- pixman_fixed_t x,
- pixman_fixed_t y,
- get_pixel_t get_pixel)
-{
- int x0 = pixman_fixed_to_int (x - pixman_fixed_e);
- int y0 = pixman_fixed_to_int (y - pixman_fixed_e);
-
- if (image->common.repeat != PIXMAN_REPEAT_NONE)
- {
- repeat (image->common.repeat, image->width, &x0);
- repeat (image->common.repeat, image->height, &y0);
-
- return get_pixel (image, x0, y0, FALSE);
- }
- else
- {
- return get_pixel (image, x0, y0, TRUE);
- }
-}
-
-#if SIZEOF_LONG > 4
-
-static force_inline uint32_t
-bilinear_interpolation (uint32_t tl, uint32_t tr,
- uint32_t bl, uint32_t br,
- int distx, int disty)
-{
- uint64_t distxy, distxiy, distixy, distixiy;
- uint64_t tl64, tr64, bl64, br64;
- uint64_t f, r;
-
- distxy = distx * disty;
- distxiy = distx * (256 - disty);
- distixy = (256 - distx) * disty;
- distixiy = (256 - distx) * (256 - disty);
-
- /* Alpha and Blue */
- tl64 = tl & 0xff0000ff;
- tr64 = tr & 0xff0000ff;
- bl64 = bl & 0xff0000ff;
- br64 = br & 0xff0000ff;
-
- f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
- r = f & 0x0000ff0000ff0000ull;
-
- /* Red and Green */
- tl64 = tl;
- tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull);
-
- tr64 = tr;
- tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull);
-
- bl64 = bl;
- bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull);
-
- br64 = br;
- br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull);
-
- f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
- r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull);
-
- return (uint32_t)(r >> 16);
-}
-
-#else
-
-static force_inline uint32_t
-bilinear_interpolation (uint32_t tl, uint32_t tr,
- uint32_t bl, uint32_t br,
- int distx, int disty)
-{
- int distxy, distxiy, distixy, distixiy;
- uint32_t f, r;
-
- distxy = distx * disty;
- distxiy = (distx << 8) - distxy; /* distx * (256 - disty) */
- distixy = (disty << 8) - distxy; /* disty * (256 - distx) */
- distixiy =
- 256 * 256 - (disty << 8) -
- (distx << 8) + distxy; /* (256 - distx) * (256 - disty) */
-
- /* Blue */
- r = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
- + (bl & 0x000000ff) * distixy + (br & 0x000000ff) * distxy;
-
- /* Green */
- f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
- + (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy;
- r |= f & 0xff000000;
-
- tl >>= 16;
- tr >>= 16;
- bl >>= 16;
- br >>= 16;
- r >>= 16;
-
- /* Red */
- f = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
- + (bl & 0x000000ff) * distixy + (br & 0x000000ff) * distxy;
- r |= f & 0x00ff0000;
-
- /* Alpha */
- f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
- + (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy;
- r |= f & 0xff000000;
-
- return r;
-}
-
-#endif
-
-static force_inline uint32_t
-bits_image_fetch_pixel_bilinear (bits_image_t *image,
- pixman_fixed_t x,
- pixman_fixed_t y,
- get_pixel_t get_pixel)
-{
- pixman_repeat_t repeat_mode = image->common.repeat;
- int width = image->width;
- int height = image->height;
- int x1, y1, x2, y2;
- uint32_t tl, tr, bl, br;
- int32_t distx, disty;
-
- x1 = x - pixman_fixed_1 / 2;
- y1 = y - pixman_fixed_1 / 2;
-
- distx = (x1 >> 8) & 0xff;
- disty = (y1 >> 8) & 0xff;
-
- x1 = pixman_fixed_to_int (x1);
- y1 = pixman_fixed_to_int (y1);
- x2 = x1 + 1;
- y2 = y1 + 1;
-
- if (repeat_mode != PIXMAN_REPEAT_NONE)
- {
- repeat (repeat_mode, width, &x1);
- repeat (repeat_mode, height, &y1);
- repeat (repeat_mode, width, &x2);
- repeat (repeat_mode, height, &y2);
-
- tl = get_pixel (image, x1, y1, FALSE);
- bl = get_pixel (image, x1, y2, FALSE);
- tr = get_pixel (image, x2, y1, FALSE);
- br = get_pixel (image, x2, y2, FALSE);
- }
- else
- {
- tl = get_pixel (image, x1, y1, TRUE);
- tr = get_pixel (image, x2, y1, TRUE);
- bl = get_pixel (image, x1, y2, TRUE);
- br = get_pixel (image, x2, y2, TRUE);
- }
-
- return bilinear_interpolation (tl, tr, bl, br, distx, disty);
-}
-
-static void
-bits_image_fetch_bilinear_no_repeat_8888 (pixman_image_t * ima,
- int offset,
- int line,
- int width,
- uint32_t * buffer,
- const uint32_t * mask)
-{
- bits_image_t *bits = &ima->bits;
- pixman_fixed_t x_top, x_bottom, x;
- pixman_fixed_t ux_top, ux_bottom, ux;
- pixman_vector_t v;
- uint32_t top_mask, bottom_mask;
- uint32_t *top_row;
- uint32_t *bottom_row;
- uint32_t *end;
- uint32_t zero[2] = { 0, 0 };
- uint32_t one = 1;
- int y, y1, y2;
- int disty;
- int mask_inc;
- int w;
-
- /* reference point is the center of the pixel */
- v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
- v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
- v.vector[2] = pixman_fixed_1;
-
- if (!pixman_transform_point_3d (bits->common.transform, &v))
- return;
-
- ux = ux_top = ux_bottom = bits->common.transform->matrix[0][0];
- x = x_top = x_bottom = v.vector[0] - pixman_fixed_1/2;
-
- y = v.vector[1] - pixman_fixed_1/2;
- disty = (y >> 8) & 0xff;
-
- /* Load the pointers to the first and second lines from the source
- * image that bilinear code must read.
- *
- * The main trick in this code is about the check if any line are
- * outside of the image;
- *
- * When I realize that a line (any one) is outside, I change
- * the pointer to a dummy area with zeros. Once I change this, I
- * must be sure the pointer will not change, so I set the
- * variables to each pointer increments inside the loop.
- */
- y1 = pixman_fixed_to_int (y);
- y2 = y1 + 1;
-
- if (y1 < 0 || y1 >= bits->height)
- {
- top_row = zero;
- x_top = 0;
- ux_top = 0;
- }
- else
- {
- top_row = bits->bits + y1 * bits->rowstride;
- x_top = x;
- ux_top = ux;
- }
-
- if (y2 < 0 || y2 >= bits->height)
- {
- bottom_row = zero;
- x_bottom = 0;
- ux_bottom = 0;
- }
- else
- {
- bottom_row = bits->bits + y2 * bits->rowstride;
- x_bottom = x;
- ux_bottom = ux;
- }
-
- /* Instead of checking whether the operation uses the mast in
- * each loop iteration, verify this only once and prepare the
- * variables to make the code smaller inside the loop.
- */
- if (!mask)
- {
- mask_inc = 0;
- mask = &one;
- }
- else
- {
- /* If have a mask, prepare the variables to check it */
- mask_inc = 1;
- }
-
- /* If both are zero, then the whole thing is zero */
- if (top_row == zero && bottom_row == zero)
- {
- memset (buffer, 0, width * sizeof (uint32_t));
- return;
- }
- else if (bits->format == PIXMAN_x8r8g8b8)
- {
- if (top_row == zero)
- {
- top_mask = 0;
- bottom_mask = 0xff000000;
- }
- else if (bottom_row == zero)
- {
- top_mask = 0xff000000;
- bottom_mask = 0;
- }
- else
- {
- top_mask = 0xff000000;
- bottom_mask = 0xff000000;
- }
- }
- else
- {
- top_mask = 0;
- bottom_mask = 0;
- }
-
- end = buffer + width;
-
- /* Zero fill to the left of the image */
- while (buffer < end && x < pixman_fixed_minus_1)
- {
- *buffer++ = 0;
- x += ux;
- x_top += ux_top;
- x_bottom += ux_bottom;
- mask += mask_inc;
- }
-
- /* Left edge
- */
- while (buffer < end && x < 0)
- {
- uint32_t tr, br;
- int32_t distx;
-
- tr = top_row[pixman_fixed_to_int (x_top) + 1] | top_mask;
- br = bottom_row[pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
-
- distx = (x >> 8) & 0xff;
-
- *buffer++ = bilinear_interpolation (0, tr, 0, br, distx, disty);
-
- x += ux;
- x_top += ux_top;
- x_bottom += ux_bottom;
- mask += mask_inc;
- }
-
- /* Main part */
- w = pixman_int_to_fixed (bits->width - 1);
-
- while (buffer < end && x < w)
- {
- if (*mask)
- {
- uint32_t tl, tr, bl, br;
- int32_t distx;
-
- tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
- tr = top_row [pixman_fixed_to_int (x_top) + 1] | top_mask;
- bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
- br = bottom_row [pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
-
- distx = (x >> 8) & 0xff;
-
- *buffer = bilinear_interpolation (tl, tr, bl, br, distx, disty);
- }
-
- buffer++;
- x += ux;
- x_top += ux_top;
- x_bottom += ux_bottom;
- mask += mask_inc;
- }
-
- /* Right Edge */
- w = pixman_int_to_fixed (bits->width);
- while (buffer < end && x < w)
- {
- if (*mask)
- {
- uint32_t tl, bl;
- int32_t distx;
-
- tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
- bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
-
- distx = (x >> 8) & 0xff;
-
- *buffer = bilinear_interpolation (tl, 0, bl, 0, distx, disty);
- }
-
- buffer++;
- x += ux;
- x_top += ux_top;
- x_bottom += ux_bottom;
- mask += mask_inc;
- }
-
- /* Zero fill to the left of the image */
- while (buffer < end)
- *buffer++ = 0;
-}
-
-static force_inline uint32_t
-bits_image_fetch_pixel_convolution (bits_image_t *image,
- pixman_fixed_t x,
- pixman_fixed_t y,
- get_pixel_t get_pixel)
-{
- pixman_fixed_t *params = image->common.filter_params;
- int x_off = (params[0] - pixman_fixed_1) >> 1;
- int y_off = (params[1] - pixman_fixed_1) >> 1;
- int32_t cwidth = pixman_fixed_to_int (params[0]);
- int32_t cheight = pixman_fixed_to_int (params[1]);
- int32_t srtot, sgtot, sbtot, satot;
- int32_t i, j, x1, x2, y1, y2;
- pixman_repeat_t repeat_mode = image->common.repeat;
- int width = image->width;
- int height = image->height;
-
- params += 2;
-
- x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off);
- y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off);
- x2 = x1 + cwidth;
- y2 = y1 + cheight;
-
- srtot = sgtot = sbtot = satot = 0;
-
- for (i = y1; i < y2; ++i)
- {
- for (j = x1; j < x2; ++j)
- {
- int rx = j;
- int ry = i;
-
- pixman_fixed_t f = *params;
-
- if (f)
- {
- uint32_t pixel;
-
- if (repeat_mode != PIXMAN_REPEAT_NONE)
- {
- repeat (repeat_mode, width, &rx);
- repeat (repeat_mode, height, &ry);
-
- pixel = get_pixel (image, rx, ry, FALSE);
- }
- else
- {
- pixel = get_pixel (image, rx, ry, TRUE);
- }
-
- srtot += RED_8 (pixel) * f;
- sgtot += GREEN_8 (pixel) * f;
- sbtot += BLUE_8 (pixel) * f;
- satot += ALPHA_8 (pixel) * f;
- }
-
- params++;
- }
- }
-
- satot >>= 16;
- srtot >>= 16;
- sgtot >>= 16;
- sbtot >>= 16;
-
- satot = CLIP (satot, 0, 0xff);
- srtot = CLIP (srtot, 0, 0xff);
- sgtot = CLIP (sgtot, 0, 0xff);
- sbtot = CLIP (sbtot, 0, 0xff);
-
- return ((satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot));
-}
-
-static force_inline uint32_t
-bits_image_fetch_pixel_filtered (bits_image_t *image,
- pixman_fixed_t x,
- pixman_fixed_t y,
- get_pixel_t get_pixel)
-{
- switch (image->common.filter)
- {
- case PIXMAN_FILTER_NEAREST:
- case PIXMAN_FILTER_FAST:
- return bits_image_fetch_pixel_nearest (image, x, y, get_pixel);
- break;
-
- case PIXMAN_FILTER_BILINEAR:
- case PIXMAN_FILTER_GOOD:
- case PIXMAN_FILTER_BEST:
- return bits_image_fetch_pixel_bilinear (image, x, y, get_pixel);
- break;
-
- case PIXMAN_FILTER_CONVOLUTION:
- return bits_image_fetch_pixel_convolution (image, x, y, get_pixel);
- break;
-
- default:
- break;
- }
-
- return 0;
-}
-
-static void
-bits_image_fetch_affine_no_alpha (pixman_image_t * image,
- int offset,
- int line,
- int width,
- uint32_t * buffer,
- const uint32_t * mask)
-{
- pixman_fixed_t x, y;
- pixman_fixed_t ux, uy;
- pixman_vector_t v;
- int i;
-
- /* reference point is the center of the pixel */
- v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
- v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
- v.vector[2] = pixman_fixed_1;
-
- if (image->common.transform)
- {
- if (!pixman_transform_point_3d (image->common.transform, &v))
- return;
-
- ux = image->common.transform->matrix[0][0];
- uy = image->common.transform->matrix[1][0];
- }
- else
- {
- ux = pixman_fixed_1;
- uy = 0;
- }
-
- x = v.vector[0];
- y = v.vector[1];
-
- for (i = 0; i < width; ++i)
- {
- if (!mask || mask[i])
- {
- buffer[i] = bits_image_fetch_pixel_filtered (
- &image->bits, x, y, fetch_pixel_no_alpha);
- }
-
- x += ux;
- y += uy;
- }
-}
-
-/* General fetcher */
-static force_inline uint32_t
-fetch_pixel_general (bits_image_t *image, int x, int y, pixman_bool_t check_bounds)
-{
- uint32_t pixel;
-
- if (check_bounds &&
- (x < 0 || x >= image->width || y < 0 || y >= image->height))
- {
- return 0;
- }
-
- pixel = image->fetch_pixel_32 (image, x, y);
-
- if (image->common.alpha_map)
- {
- uint32_t pixel_a;
-
- x -= image->common.alpha_origin_x;
- y -= image->common.alpha_origin_y;
-
- if (x < 0 || x >= image->common.alpha_map->width ||
- y < 0 || y >= image->common.alpha_map->height)
- {
- pixel_a = 0;
- }
- else
- {
- pixel_a = image->common.alpha_map->fetch_pixel_32 (
- image->common.alpha_map, x, y);
-
- pixel_a = ALPHA_8 (pixel_a);
- }
-
- pixel &= 0x00ffffff;
- pixel |= (pixel_a << 24);
- }
-
- return pixel;
-}
-
-static void
-bits_image_fetch_general (pixman_image_t * image,
- int offset,
- int line,
- int width,
- uint32_t * buffer,
- const uint32_t * mask)
-{
- pixman_fixed_t x, y, w;
- pixman_fixed_t ux, uy, uw;
- pixman_vector_t v;
- int i;
-
- /* reference point is the center of the pixel */
- v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
- v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
- v.vector[2] = pixman_fixed_1;
-
- if (image->common.transform)
- {
- if (!pixman_transform_point_3d (image->common.transform, &v))
- return;
-
- ux = image->common.transform->matrix[0][0];
- uy = image->common.transform->matrix[1][0];
- uw = image->common.transform->matrix[2][0];
- }
- else
- {
- ux = pixman_fixed_1;
- uy = 0;
- uw = 0;
- }
-
- x = v.vector[0];
- y = v.vector[1];
- w = v.vector[2];
-
- for (i = 0; i < width; ++i)
- {
- pixman_fixed_t x0, y0;
-
- if (!mask || mask[i])
- {
- if (w != 0)
- {
- x0 = ((pixman_fixed_48_16_t)x << 16) / w;
- y0 = ((pixman_fixed_48_16_t)y << 16) / w;
- }
- else
- {
- x0 = 0;
- y0 = 0;
- }
-
- buffer[i] = bits_image_fetch_pixel_filtered (
- &image->bits, x0, y0, fetch_pixel_general);
- }
-
- x += ux;
- y += uy;
- w += uw;
- }
-}
-
-static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
-
-typedef uint32_t (* convert_pixel_t) (const uint8_t *row, int x);
-
-static force_inline void
-bits_image_fetch_bilinear_affine (pixman_image_t * image,
- int offset,
- int line,
- int width,
- uint32_t * buffer,
- const uint32_t * mask,
-
- convert_pixel_t convert_pixel,
- pixman_format_code_t format,
- pixman_repeat_t repeat_mode)
-{
- pixman_fixed_t x, y;
- pixman_fixed_t ux, uy;
- pixman_vector_t v;
- bits_image_t *bits = &image->bits;
- int i;
-
- /* reference point is the center of the pixel */
- v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
- v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
- v.vector[2] = pixman_fixed_1;
-
- if (!pixman_transform_point_3d (image->common.transform, &v))
- return;
-
- ux = image->common.transform->matrix[0][0];
- uy = image->common.transform->matrix[1][0];
-
- x = v.vector[0];
- y = v.vector[1];
-
- for (i = 0; i < width; ++i)
- {
- int x1, y1, x2, y2;
- uint32_t tl, tr, bl, br;
- int32_t distx, disty;
- int width = image->bits.width;
- int height = image->bits.height;
- const uint8_t *row1;
- const uint8_t *row2;
-
- if (mask && !mask[i])
- goto next;
-
- x1 = x - pixman_fixed_1 / 2;
- y1 = y - pixman_fixed_1 / 2;
-
- distx = (x1 >> 8) & 0xff;
- disty = (y1 >> 8) & 0xff;
-
- y1 = pixman_fixed_to_int (y1);
- y2 = y1 + 1;
- x1 = pixman_fixed_to_int (x1);
- x2 = x1 + 1;
-
- if (repeat_mode != PIXMAN_REPEAT_NONE)
- {
- uint32_t mask;
-
- mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
-
- repeat (repeat_mode, width, &x1);
- repeat (repeat_mode, height, &y1);
- repeat (repeat_mode, width, &x2);
- repeat (repeat_mode, height, &y2);
-
- row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
- row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
-
- tl = convert_pixel (row1, x1) | mask;
- tr = convert_pixel (row1, x2) | mask;
- bl = convert_pixel (row2, x1) | mask;
- br = convert_pixel (row2, x2) | mask;
- }
- else
- {
- uint32_t mask1, mask2;
- int bpp;
-
- /* Note: PIXMAN_FORMAT_BPP() returns an unsigned value,
- * which means if you use it in expressions, those
- * expressions become unsigned themselves. Since
- * the variables below can be negative in some cases,
- * that will lead to crashes on 64 bit architectures.
- *
- * So this line makes sure bpp is signed
- */
- bpp = PIXMAN_FORMAT_BPP (format);
-
- if (x1 >= width || x2 < 0 || y1 >= height || y2 < 0)
- {
- buffer[i] = 0;
- goto next;
- }
-
- if (y2 == 0)
- {
- row1 = zero;
- mask1 = 0;
- }
- else
- {
- row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
- row1 += bpp / 8 * x1;
-
- mask1 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
- }
-
- if (y1 == height - 1)
- {
- row2 = zero;
- mask2 = 0;
- }
- else
- {
- row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
- row2 += bpp / 8 * x1;
-
- mask2 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
- }
-
- if (x2 == 0)
- {
- tl = 0;
- bl = 0;
- }
- else
- {
- tl = convert_pixel (row1, 0) | mask1;
- bl = convert_pixel (row2, 0) | mask2;
- }
-
- if (x1 == width - 1)
- {
- tr = 0;
- br = 0;
- }
- else
- {
- tr = convert_pixel (row1, 1) | mask1;
- br = convert_pixel (row2, 1) | mask2;
- }
- }
-
- buffer[i] = bilinear_interpolation (
- tl, tr, bl, br, distx, disty);
-
- next:
- x += ux;
- y += uy;
- }
-}
-
-static force_inline void
-bits_image_fetch_nearest_affine (pixman_image_t * image,
- int offset,
- int line,
- int width,
- uint32_t * buffer,
- const uint32_t * mask,
-
- convert_pixel_t convert_pixel,
- pixman_format_code_t format,
- pixman_repeat_t repeat_mode)
-{
- pixman_fixed_t x, y;
- pixman_fixed_t ux, uy;
- pixman_vector_t v;
- bits_image_t *bits = &image->bits;
- int i;
-
- /* reference point is the center of the pixel */
- v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
- v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
- v.vector[2] = pixman_fixed_1;
-
- if (!pixman_transform_point_3d (image->common.transform, &v))
- return;
-
- ux = image->common.transform->matrix[0][0];
- uy = image->common.transform->matrix[1][0];
-
- x = v.vector[0];
- y = v.vector[1];
-
- for (i = 0; i < width; ++i)
- {
- int width, height, x0, y0;
- const uint8_t *row;
-
- if (mask && !mask[i])
- goto next;
-
- width = image->bits.width;
- height = image->bits.height;
- x0 = pixman_fixed_to_int (x - pixman_fixed_e);
- y0 = pixman_fixed_to_int (y - pixman_fixed_e);
-
- if (repeat_mode == PIXMAN_REPEAT_NONE &&
- (y0 < 0 || y0 >= height || x0 < 0 || x0 >= width))
- {
- buffer[i] = 0;
- }
- else
- {
- uint32_t mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
-
- if (repeat_mode != PIXMAN_REPEAT_NONE)
- {
- repeat (repeat_mode, width, &x0);
- repeat (repeat_mode, height, &y0);
- }
-
- row = (uint8_t *)bits->bits + bits->rowstride * 4 * y0;
-
- buffer[i] = convert_pixel (row, x0) | mask;
- }
-
- next:
- x += ux;
- y += uy;
- }
-}
-
-static force_inline uint32_t
-convert_a8r8g8b8 (const uint8_t *row, int x)
-{
- return *(((uint32_t *)row) + x);
-}
-
-static force_inline uint32_t
-convert_x8r8g8b8 (const uint8_t *row, int x)
-{
- return *(((uint32_t *)row) + x);
-}
-
-static force_inline uint32_t
-convert_a8 (const uint8_t *row, int x)
-{
- return *(row + x) << 24;
-}
-
-static force_inline uint32_t
-convert_r5g6b5 (const uint8_t *row, int x)
-{
- return CONVERT_0565_TO_0888 (*((uint16_t *)row + x));
-}
-
-#define MAKE_BILINEAR_FETCHER(name, format, repeat_mode) \
- static void \
- bits_image_fetch_bilinear_affine_ ## name (pixman_image_t *image, \
- int offset, \
- int line, \
- int width, \
- uint32_t * buffer, \
- const uint32_t * mask) \
- { \
- bits_image_fetch_bilinear_affine (image, offset, line, \
- width, buffer, mask, \
- convert_ ## format, \
- PIXMAN_ ## format, \
- repeat_mode); \
- }
-
-#define MAKE_NEAREST_FETCHER(name, format, repeat_mode) \
- static void \
- bits_image_fetch_nearest_affine_ ## name (pixman_image_t *image, \
- int offset, \
- int line, \
- int width, \
- uint32_t * buffer, \
- const uint32_t * mask) \
- { \
- bits_image_fetch_nearest_affine (image, offset, line, \
- width, buffer, mask, \
- convert_ ## format, \
- PIXMAN_ ## format, \
- repeat_mode); \
- }
-
-#define MAKE_FETCHERS(name, format, repeat_mode) \
- MAKE_NEAREST_FETCHER (name, format, repeat_mode) \
- MAKE_BILINEAR_FETCHER (name, format, repeat_mode)
-
-MAKE_FETCHERS (pad_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_PAD)
-MAKE_FETCHERS (none_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_NONE)
-MAKE_FETCHERS (reflect_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_REFLECT)
-MAKE_FETCHERS (normal_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_NORMAL)
-MAKE_FETCHERS (pad_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_PAD)
-MAKE_FETCHERS (none_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_NONE)
-MAKE_FETCHERS (reflect_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_REFLECT)
-MAKE_FETCHERS (normal_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_NORMAL)
-MAKE_FETCHERS (pad_a8, a8, PIXMAN_REPEAT_PAD)
-MAKE_FETCHERS (none_a8, a8, PIXMAN_REPEAT_NONE)
-MAKE_FETCHERS (reflect_a8, a8, PIXMAN_REPEAT_REFLECT)
-MAKE_FETCHERS (normal_a8, a8, PIXMAN_REPEAT_NORMAL)
-MAKE_FETCHERS (pad_r5g6b5, r5g6b5, PIXMAN_REPEAT_PAD)
-MAKE_FETCHERS (none_r5g6b5, r5g6b5, PIXMAN_REPEAT_NONE)
-MAKE_FETCHERS (reflect_r5g6b5, r5g6b5, PIXMAN_REPEAT_REFLECT)
-MAKE_FETCHERS (normal_r5g6b5, r5g6b5, PIXMAN_REPEAT_NORMAL)
-
-static void
-bits_image_fetch_solid_32 (pixman_image_t * image,
- int x,
- int y,
- int width,
- uint32_t * buffer,
- const uint32_t * mask)
-{
- uint32_t color;
- uint32_t *end;
-
- color = image->bits.fetch_pixel_32 (&image->bits, 0, 0);
-
- end = buffer + width;
- while (buffer < end)
- *(buffer++) = color;
-}
-
-static void
-bits_image_fetch_solid_64 (pixman_image_t * image,
- int x,
- int y,
- int width,
- uint32_t * b,
- const uint32_t * unused)
-{
- uint64_t color;
- uint64_t *buffer = (uint64_t *)b;
- uint64_t *end;
-
- color = image->bits.fetch_pixel_64 (&image->bits, 0, 0);
-
- end = buffer + width;
- while (buffer < end)
- *(buffer++) = color;
-}
-
-static void
-bits_image_fetch_untransformed_repeat_none (bits_image_t *image,
- pixman_bool_t wide,
- int x,
- int y,
- int width,
- uint32_t * buffer)
-{
- uint32_t w;
-
- if (y < 0 || y >= image->height)
- {
- memset (buffer, 0, width * (wide? 8 : 4));
- return;
- }
-
- if (x < 0)
- {
- w = MIN (width, -x);
-
- memset (buffer, 0, w * (wide ? 8 : 4));
-
- width -= w;
- buffer += w * (wide? 2 : 1);
- x += w;
- }
-
- if (x < image->width)
- {
- w = MIN (width, image->width - x);
-
- if (wide)
- image->fetch_scanline_64 ((pixman_image_t *)image, x, y, w, buffer, NULL);
- else
- image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL);
-
- width -= w;
- buffer += w * (wide? 2 : 1);
- x += w;
- }
-
- memset (buffer, 0, width * (wide ? 8 : 4));
-}
-
-static void
-bits_image_fetch_untransformed_repeat_normal (bits_image_t *image,
- pixman_bool_t wide,
- int x,
- int y,
- int width,
- uint32_t * buffer)
-{
- uint32_t w;
-
- while (y < 0)
- y += image->height;
-
- while (y >= image->height)
- y -= image->height;
-
- while (width)
- {
- while (x < 0)
- x += image->width;
- while (x >= image->width)
- x -= image->width;
-
- w = MIN (width, image->width - x);
-
- if (wide)
- image->fetch_scanline_64 ((pixman_image_t *)image, x, y, w, buffer, NULL);
- else
- image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL);
-
- buffer += w * (wide? 2 : 1);
- x += w;
- width -= w;
- }
-}
-
-static void
-bits_image_fetch_untransformed_32 (pixman_image_t * image,
- int x,
- int y,
- int width,
- uint32_t * buffer,
- const uint32_t * mask)
-{
- if (image->common.repeat == PIXMAN_REPEAT_NONE)
- {
- bits_image_fetch_untransformed_repeat_none (
- &image->bits, FALSE, x, y, width, buffer);
- }
- else
- {
- bits_image_fetch_untransformed_repeat_normal (
- &image->bits, FALSE, x, y, width, buffer);
- }
-}
-
-static void
-bits_image_fetch_untransformed_64 (pixman_image_t * image,
- int x,
- int y,
- int width,
- uint32_t * buffer,
- const uint32_t * unused)
-{
- if (image->common.repeat == PIXMAN_REPEAT_NONE)
- {
- bits_image_fetch_untransformed_repeat_none (
- &image->bits, TRUE, x, y, width, buffer);
- }
- else
- {
- bits_image_fetch_untransformed_repeat_normal (
- &image->bits, TRUE, x, y, width, buffer);
- }
-}
-
-typedef struct
-{
- pixman_format_code_t format;
- uint32_t flags;
- fetch_scanline_t fetch_32;
- fetch_scanline_t fetch_64;
-} fetcher_info_t;
-
-static const fetcher_info_t fetcher_info[] =
-{
- { PIXMAN_solid,
- FAST_PATH_NO_ALPHA_MAP,
- bits_image_fetch_solid_32,
- bits_image_fetch_solid_64
- },
-
- { PIXMAN_any,
- (FAST_PATH_NO_ALPHA_MAP |
- FAST_PATH_ID_TRANSFORM |
- FAST_PATH_NO_CONVOLUTION_FILTER |
- FAST_PATH_NO_PAD_REPEAT |
- FAST_PATH_NO_REFLECT_REPEAT),
- bits_image_fetch_untransformed_32,
- bits_image_fetch_untransformed_64
- },
-
-#define FAST_BILINEAR_FLAGS \
- (FAST_PATH_NO_ALPHA_MAP | \
- FAST_PATH_NO_ACCESSORS | \
- FAST_PATH_HAS_TRANSFORM | \
- FAST_PATH_AFFINE_TRANSFORM | \
- FAST_PATH_X_UNIT_POSITIVE | \
- FAST_PATH_Y_UNIT_ZERO | \
- FAST_PATH_NONE_REPEAT | \
- FAST_PATH_BILINEAR_FILTER)
-
- { PIXMAN_a8r8g8b8,
- FAST_BILINEAR_FLAGS,
- bits_image_fetch_bilinear_no_repeat_8888,
- _pixman_image_get_scanline_generic_64
- },
-
- { PIXMAN_x8r8g8b8,
- FAST_BILINEAR_FLAGS,
- bits_image_fetch_bilinear_no_repeat_8888,
- _pixman_image_get_scanline_generic_64
- },
-
-#define GENERAL_BILINEAR_FLAGS \
- (FAST_PATH_NO_ALPHA_MAP | \
- FAST_PATH_NO_ACCESSORS | \
- FAST_PATH_HAS_TRANSFORM | \
- FAST_PATH_AFFINE_TRANSFORM | \
- FAST_PATH_BILINEAR_FILTER)
-
-#define GENERAL_NEAREST_FLAGS \
- (FAST_PATH_NO_ALPHA_MAP | \
- FAST_PATH_NO_ACCESSORS | \
- FAST_PATH_HAS_TRANSFORM | \
- FAST_PATH_AFFINE_TRANSFORM | \
- FAST_PATH_NEAREST_FILTER)
-
-#define BILINEAR_AFFINE_FAST_PATH(name, format, repeat) \
- { PIXMAN_ ## format, \
- GENERAL_BILINEAR_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \
- bits_image_fetch_bilinear_affine_ ## name, \
- _pixman_image_get_scanline_generic_64 \
- },
-
-#define NEAREST_AFFINE_FAST_PATH(name, format, repeat) \
- { PIXMAN_ ## format, \
- GENERAL_NEAREST_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \
- bits_image_fetch_nearest_affine_ ## name, \
- _pixman_image_get_scanline_generic_64 \
- },
-
-#define AFFINE_FAST_PATHS(name, format, repeat) \
- BILINEAR_AFFINE_FAST_PATH(name, format, repeat) \
- NEAREST_AFFINE_FAST_PATH(name, format, repeat)
-
- AFFINE_FAST_PATHS (pad_a8r8g8b8, a8r8g8b8, PAD)
- AFFINE_FAST_PATHS (none_a8r8g8b8, a8r8g8b8, NONE)
- AFFINE_FAST_PATHS (reflect_a8r8g8b8, a8r8g8b8, REFLECT)
- AFFINE_FAST_PATHS (normal_a8r8g8b8, a8r8g8b8, NORMAL)
- AFFINE_FAST_PATHS (pad_x8r8g8b8, x8r8g8b8, PAD)
- AFFINE_FAST_PATHS (none_x8r8g8b8, x8r8g8b8, NONE)
- AFFINE_FAST_PATHS (reflect_x8r8g8b8, x8r8g8b8, REFLECT)
- AFFINE_FAST_PATHS (normal_x8r8g8b8, x8r8g8b8, NORMAL)
- AFFINE_FAST_PATHS (pad_a8, a8, PAD)
- AFFINE_FAST_PATHS (none_a8, a8, NONE)
- AFFINE_FAST_PATHS (reflect_a8, a8, REFLECT)
- AFFINE_FAST_PATHS (normal_a8, a8, NORMAL)
- AFFINE_FAST_PATHS (pad_r5g6b5, r5g6b5, PAD)
- AFFINE_FAST_PATHS (none_r5g6b5, r5g6b5, NONE)
- AFFINE_FAST_PATHS (reflect_r5g6b5, r5g6b5, REFLECT)
- AFFINE_FAST_PATHS (normal_r5g6b5, r5g6b5, NORMAL)
-
- /* Affine, no alpha */
- { PIXMAN_any,
- (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_HAS_TRANSFORM | FAST_PATH_AFFINE_TRANSFORM),
- bits_image_fetch_affine_no_alpha,
- _pixman_image_get_scanline_generic_64
- },
-
- /* General */
- { PIXMAN_any, 0, bits_image_fetch_general, _pixman_image_get_scanline_generic_64 },
-
- { PIXMAN_null },
-};
-
-static void
-bits_image_property_changed (pixman_image_t *image)
-{
- uint32_t flags = image->common.flags;
- pixman_format_code_t format = image->common.extended_format_code;
- const fetcher_info_t *info;
-
- _pixman_bits_image_setup_accessors (&image->bits);
-
- info = fetcher_info;
- while (info->format != PIXMAN_null)
- {
- if ((info->format == format || info->format == PIXMAN_any) &&
- (info->flags & flags) == info->flags)
- {
- image->bits.get_scanline_32 = info->fetch_32;
- image->bits.get_scanline_64 = info->fetch_64;
- break;
- }
-
- info++;
- }
-}
-
-static uint32_t *
-src_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
-{
- iter->image->bits.get_scanline_32 (
- iter->image, iter->x, iter->y++, iter->width, iter->buffer, mask);
-
- return iter->buffer;
-}
-
-static uint32_t *
-src_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
-{
- iter->image->bits.get_scanline_64 (
- iter->image, iter->x, iter->y++, iter->width, iter->buffer, mask);
-
- return iter->buffer;
-}
-
-void
-_pixman_bits_image_src_iter_init (pixman_image_t *image,
- pixman_iter_t *iter,
- int x, int y, int width, int height,
- uint8_t *buffer, iter_flags_t flags)
-{
- if (flags & ITER_NARROW)
- iter->get_scanline = src_get_scanline_narrow;
- else
- iter->get_scanline = src_get_scanline_wide;
-}
-
-static uint32_t *
-dest_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
-{
- pixman_image_t *image = iter->image;
- int x = iter->x;
- int y = iter->y;
- int width = iter->width;
- uint32_t * buffer = iter->buffer;
-
- image->bits.fetch_scanline_32 (image, x, y, width, buffer, mask);
- if (image->common.alpha_map)
- {
- x -= image->common.alpha_origin_x;
- y -= image->common.alpha_origin_y;
-
- image->common.alpha_map->fetch_scanline_32 (
- (pixman_image_t *)image->common.alpha_map,
- x, y, width, buffer, mask);
- }
-
- return iter->buffer;
-}
-
-static uint32_t *
-dest_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
-{
- bits_image_t * image = &iter->image->bits;
- int x = iter->x;
- int y = iter->y;
- int width = iter->width;
- uint32_t * buffer = iter->buffer;
-
- image->fetch_scanline_64 (
- (pixman_image_t *)image, x, y, width, buffer, mask);
- if (image->common.alpha_map)
- {
- x -= image->common.alpha_origin_x;
- y -= image->common.alpha_origin_y;
-
- image->common.alpha_map->fetch_scanline_64 (
- (pixman_image_t *)image->common.alpha_map, x, y, width, buffer, mask);
- }
-
- return iter->buffer;
-}
-
-static void
-dest_write_back_narrow (pixman_iter_t *iter)
-{
- bits_image_t * image = &iter->image->bits;
- int x = iter->x;
- int y = iter->y;
- int width = iter->width;
- const uint32_t *buffer = iter->buffer;
-
- image->store_scanline_32 (image, x, y, width, buffer);
-
- if (image->common.alpha_map)
- {
- x -= image->common.alpha_origin_x;
- y -= image->common.alpha_origin_y;
-
- image->common.alpha_map->store_scanline_32 (
- image->common.alpha_map, x, y, width, buffer);
- }
-
- iter->y++;
-}
-
-static void
-dest_write_back_wide (pixman_iter_t *iter)
-{
- bits_image_t * image = &iter->image->bits;
- int x = iter->x;
- int y = iter->y;
- int width = iter->width;
- const uint32_t *buffer = iter->buffer;
-
- image->store_scanline_64 (image, x, y, width, buffer);
-
- if (image->common.alpha_map)
- {
- x -= image->common.alpha_origin_x;
- y -= image->common.alpha_origin_y;
-
- image->common.alpha_map->store_scanline_64 (
- image->common.alpha_map, x, y, width, buffer);
- }
-
- iter->y++;
-}
-
-static void
-dest_write_back_direct (pixman_iter_t *iter)
-{
- iter->buffer += iter->image->bits.rowstride;
-}
-
-void
-_pixman_bits_image_dest_iter_init (pixman_image_t *image,
- pixman_iter_t *iter,
- int x, int y, int width, int height,
- uint8_t *buffer, iter_flags_t flags)
-{
- if (flags & ITER_NARROW)
- {
- if (((image->common.flags &
- (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_NO_ACCESSORS)) ==
- (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_NO_ACCESSORS)) &&
- (image->bits.format == PIXMAN_a8r8g8b8 ||
- (image->bits.format == PIXMAN_x8r8g8b8 &&
- (flags & ITER_LOCALIZED_ALPHA))))
- {
- iter->buffer = image->bits.bits + y * image->bits.rowstride + x;
-
- iter->get_scanline = _pixman_iter_get_scanline_noop;
- iter->write_back = dest_write_back_direct;
- }
- else
- {
- if ((flags & (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) ==
- (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA))
- {
- iter->get_scanline = _pixman_iter_get_scanline_noop;
- }
- else
- {
- iter->get_scanline = dest_get_scanline_narrow;
- }
-
- iter->write_back = dest_write_back_narrow;
- }
- }
- else
- {
- iter->get_scanline = dest_get_scanline_wide;
- iter->write_back = dest_write_back_wide;
- }
-}
-
-static uint32_t *
-create_bits (pixman_format_code_t format,
- int width,
- int height,
- int * rowstride_bytes)
-{
- int stride;
- int buf_size;
- int bpp;
-
- /* what follows is a long-winded way, avoiding any possibility of integer
- * overflows, of saying:
- * stride = ((width * bpp + 0x1f) >> 5) * sizeof (uint32_t);
- */
-
- bpp = PIXMAN_FORMAT_BPP (format);
- if (pixman_multiply_overflows_int (width, bpp))
- return NULL;
-
- stride = width * bpp;
- if (pixman_addition_overflows_int (stride, 0x1f))
- return NULL;
-
- stride += 0x1f;
- stride >>= 5;
-
- stride *= sizeof (uint32_t);
-
- if (pixman_multiply_overflows_int (height, stride))
- return NULL;
-
- buf_size = height * stride;
-
- if (rowstride_bytes)
- *rowstride_bytes = stride;
-
- return calloc (buf_size, 1);
-}
-
-PIXMAN_EXPORT pixman_image_t *
-pixman_image_create_bits (pixman_format_code_t format,
- int width,
- int height,
- uint32_t * bits,
- int rowstride_bytes)
-{
- pixman_image_t *image;
- uint32_t *free_me = NULL;
-
- /* must be a whole number of uint32_t's
- */
- return_val_if_fail (
- bits == NULL || (rowstride_bytes % sizeof (uint32_t)) == 0, NULL);
-
- return_val_if_fail (PIXMAN_FORMAT_BPP (format) >= PIXMAN_FORMAT_DEPTH (format), NULL);
-
- if (!bits && width && height)
- {
- free_me = bits = create_bits (format, width, height, &rowstride_bytes);
- if (!bits)
- return NULL;
- }
-
- image = _pixman_image_allocate ();
-
- if (!image)
- {
- if (free_me)
- free (free_me);
-
- return NULL;
- }
-
- image->type = BITS;
- image->bits.format = format;
- image->bits.width = width;
- image->bits.height = height;
- image->bits.bits = bits;
- image->bits.free_me = free_me;
- image->bits.read_func = NULL;
- image->bits.write_func = NULL;
-
- /* The rowstride is stored in number of uint32_t */
- image->bits.rowstride = rowstride_bytes / (int) sizeof (uint32_t);
-
- image->bits.indexed = NULL;
-
- image->common.property_changed = bits_image_property_changed;
-
- _pixman_image_reset_clip_region (image);
-
- return image;
-}
+/*
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ * 2005 Lars Knoll & Zack Rusin, Trolltech
+ * 2008 Aaron Plattner, NVIDIA Corporation
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007, 2009 Red Hat, Inc.
+ * Copyright © 2008 André Tupinambá <andrelrt@gmail.com>
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. Keith Packard makes no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+
+/*
+ * By default, just evaluate the image at 32bpp and expand. Individual image
+ * types can plug in a better scanline getter if they want to. For example
+ * we could produce smoother gradients by evaluating them at higher color
+ * depth, but that's a project for the future.
+ */
+static void
+_pixman_image_get_scanline_generic_64 (pixman_image_t * image,
+ int x,
+ int y,
+ int width,
+ uint32_t * buffer,
+ const uint32_t * mask)
+{
+ uint32_t *mask8 = NULL;
+
+ /* Contract the mask image, if one exists, so that the 32-bit fetch
+ * function can use it.
+ */
+ if (mask)
+ {
+ mask8 = pixman_malloc_ab (width, sizeof(uint32_t));
+ if (!mask8)
+ return;
+
+ pixman_contract (mask8, (uint64_t *)mask, width);
+ }
+
+ /* Fetch the source image into the first half of buffer. */
+ image->bits.get_scanline_32 (image, x, y, width, (uint32_t*)buffer, mask8);
+
+ /* Expand from 32bpp to 64bpp in place. */
+ pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, width);
+
+ free (mask8);
+}
+
+/* Fetch functions */
+
+static force_inline uint32_t
+fetch_pixel_no_alpha (bits_image_t *image,
+ int x, int y, pixman_bool_t check_bounds)
+{
+ if (check_bounds &&
+ (x < 0 || x >= image->width || y < 0 || y >= image->height))
+ {
+ return 0;
+ }
+
+ return image->fetch_pixel_32 (image, x, y);
+}
+
+typedef uint32_t (* get_pixel_t) (bits_image_t *image,
+ int x, int y, pixman_bool_t check_bounds);
+
+static force_inline void
+repeat (pixman_repeat_t repeat, int size, int *coord)
+{
+ switch (repeat)
+ {
+ case PIXMAN_REPEAT_NORMAL:
+ *coord = MOD (*coord, size);
+ break;
+
+ case PIXMAN_REPEAT_PAD:
+ *coord = CLIP (*coord, 0, size - 1);
+ break;
+
+ case PIXMAN_REPEAT_REFLECT:
+ *coord = MOD (*coord, size * 2);
+
+ if (*coord >= size)
+ *coord = size * 2 - *coord - 1;
+ break;
+
+ case PIXMAN_REPEAT_NONE:
+ break;
+
+ default:
+ break;
+ }
+}
+
+static force_inline uint32_t
+bits_image_fetch_pixel_nearest (bits_image_t *image,
+ pixman_fixed_t x,
+ pixman_fixed_t y,
+ get_pixel_t get_pixel)
+{
+ int x0 = pixman_fixed_to_int (x - pixman_fixed_e);
+ int y0 = pixman_fixed_to_int (y - pixman_fixed_e);
+
+ if (image->common.repeat != PIXMAN_REPEAT_NONE)
+ {
+ repeat (image->common.repeat, image->width, &x0);
+ repeat (image->common.repeat, image->height, &y0);
+
+ return get_pixel (image, x0, y0, FALSE);
+ }
+ else
+ {
+ return get_pixel (image, x0, y0, TRUE);
+ }
+}
+
+#if SIZEOF_LONG > 4
+
+static force_inline uint32_t
+bilinear_interpolation (uint32_t tl, uint32_t tr,
+ uint32_t bl, uint32_t br,
+ int distx, int disty)
+{
+ uint64_t distxy, distxiy, distixy, distixiy;
+ uint64_t tl64, tr64, bl64, br64;
+ uint64_t f, r;
+
+ distxy = distx * disty;
+ distxiy = distx * (256 - disty);
+ distixy = (256 - distx) * disty;
+ distixiy = (256 - distx) * (256 - disty);
+
+ /* Alpha and Blue */
+ tl64 = tl & 0xff0000ff;
+ tr64 = tr & 0xff0000ff;
+ bl64 = bl & 0xff0000ff;
+ br64 = br & 0xff0000ff;
+
+ f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
+ r = f & 0x0000ff0000ff0000ull;
+
+ /* Red and Green */
+ tl64 = tl;
+ tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull);
+
+ tr64 = tr;
+ tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull);
+
+ bl64 = bl;
+ bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull);
+
+ br64 = br;
+ br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull);
+
+ f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
+ r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull);
+
+ return (uint32_t)(r >> 16);
+}
+
+#else
+
+static force_inline uint32_t
+bilinear_interpolation (uint32_t tl, uint32_t tr,
+ uint32_t bl, uint32_t br,
+ int distx, int disty)
+{
+ int distxy, distxiy, distixy, distixiy;
+ uint32_t f, r;
+
+ distxy = distx * disty;
+ distxiy = (distx << 8) - distxy; /* distx * (256 - disty) */
+ distixy = (disty << 8) - distxy; /* disty * (256 - distx) */
+ distixiy =
+ 256 * 256 - (disty << 8) -
+ (distx << 8) + distxy; /* (256 - distx) * (256 - disty) */
+
+ /* Blue */
+ r = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
+ + (bl & 0x000000ff) * distixy + (br & 0x000000ff) * distxy;
+
+ /* Green */
+ f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
+ + (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy;
+ r |= f & 0xff000000;
+
+ tl >>= 16;
+ tr >>= 16;
+ bl >>= 16;
+ br >>= 16;
+ r >>= 16;
+
+ /* Red */
+ f = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
+ + (bl & 0x000000ff) * distixy + (br & 0x000000ff) * distxy;
+ r |= f & 0x00ff0000;
+
+ /* Alpha */
+ f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
+ + (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy;
+ r |= f & 0xff000000;
+
+ return r;
+}
+
+#endif
+
+static force_inline uint32_t
+bits_image_fetch_pixel_bilinear (bits_image_t *image,
+ pixman_fixed_t x,
+ pixman_fixed_t y,
+ get_pixel_t get_pixel)
+{
+ pixman_repeat_t repeat_mode = image->common.repeat;
+ int width = image->width;
+ int height = image->height;
+ int x1, y1, x2, y2;
+ uint32_t tl, tr, bl, br;
+ int32_t distx, disty;
+
+ x1 = x - pixman_fixed_1 / 2;
+ y1 = y - pixman_fixed_1 / 2;
+
+ distx = (x1 >> 8) & 0xff;
+ disty = (y1 >> 8) & 0xff;
+
+ x1 = pixman_fixed_to_int (x1);
+ y1 = pixman_fixed_to_int (y1);
+ x2 = x1 + 1;
+ y2 = y1 + 1;
+
+ if (repeat_mode != PIXMAN_REPEAT_NONE)
+ {
+ repeat (repeat_mode, width, &x1);
+ repeat (repeat_mode, height, &y1);
+ repeat (repeat_mode, width, &x2);
+ repeat (repeat_mode, height, &y2);
+
+ tl = get_pixel (image, x1, y1, FALSE);
+ bl = get_pixel (image, x1, y2, FALSE);
+ tr = get_pixel (image, x2, y1, FALSE);
+ br = get_pixel (image, x2, y2, FALSE);
+ }
+ else
+ {
+ tl = get_pixel (image, x1, y1, TRUE);
+ tr = get_pixel (image, x2, y1, TRUE);
+ bl = get_pixel (image, x1, y2, TRUE);
+ br = get_pixel (image, x2, y2, TRUE);
+ }
+
+ return bilinear_interpolation (tl, tr, bl, br, distx, disty);
+}
+
+static void
+bits_image_fetch_bilinear_no_repeat_8888 (pixman_image_t * ima,
+ int offset,
+ int line,
+ int width,
+ uint32_t * buffer,
+ const uint32_t * mask)
+{
+ bits_image_t *bits = &ima->bits;
+ pixman_fixed_t x_top, x_bottom, x;
+ pixman_fixed_t ux_top, ux_bottom, ux;
+ pixman_vector_t v;
+ uint32_t top_mask, bottom_mask;
+ uint32_t *top_row;
+ uint32_t *bottom_row;
+ uint32_t *end;
+ uint32_t zero[2] = { 0, 0 };
+ uint32_t one = 1;
+ int y, y1, y2;
+ int disty;
+ int mask_inc;
+ int w;
+
+ /* reference point is the center of the pixel */
+ v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+ v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+ v.vector[2] = pixman_fixed_1;
+
+ if (!pixman_transform_point_3d (bits->common.transform, &v))
+ return;
+
+ ux = ux_top = ux_bottom = bits->common.transform->matrix[0][0];
+ x = x_top = x_bottom = v.vector[0] - pixman_fixed_1/2;
+
+ y = v.vector[1] - pixman_fixed_1/2;
+ disty = (y >> 8) & 0xff;
+
+ /* Load the pointers to the first and second lines from the source
+ * image that bilinear code must read.
+ *
+ * The main trick in this code is about the check if any line are
+ * outside of the image;
+ *
+ * When I realize that a line (any one) is outside, I change
+ * the pointer to a dummy area with zeros. Once I change this, I
+ * must be sure the pointer will not change, so I set the
+ * variables to each pointer increments inside the loop.
+ */
+ y1 = pixman_fixed_to_int (y);
+ y2 = y1 + 1;
+
+ if (y1 < 0 || y1 >= bits->height)
+ {
+ top_row = zero;
+ x_top = 0;
+ ux_top = 0;
+ }
+ else
+ {
+ top_row = bits->bits + y1 * bits->rowstride;
+ x_top = x;
+ ux_top = ux;
+ }
+
+ if (y2 < 0 || y2 >= bits->height)
+ {
+ bottom_row = zero;
+ x_bottom = 0;
+ ux_bottom = 0;
+ }
+ else
+ {
+ bottom_row = bits->bits + y2 * bits->rowstride;
+ x_bottom = x;
+ ux_bottom = ux;
+ }
+
+ /* Instead of checking whether the operation uses the mast in
+ * each loop iteration, verify this only once and prepare the
+ * variables to make the code smaller inside the loop.
+ */
+ if (!mask)
+ {
+ mask_inc = 0;
+ mask = &one;
+ }
+ else
+ {
+ /* If have a mask, prepare the variables to check it */
+ mask_inc = 1;
+ }
+
+ /* If both are zero, then the whole thing is zero */
+ if (top_row == zero && bottom_row == zero)
+ {
+ memset (buffer, 0, width * sizeof (uint32_t));
+ return;
+ }
+ else if (bits->format == PIXMAN_x8r8g8b8)
+ {
+ if (top_row == zero)
+ {
+ top_mask = 0;
+ bottom_mask = 0xff000000;
+ }
+ else if (bottom_row == zero)
+ {
+ top_mask = 0xff000000;
+ bottom_mask = 0;
+ }
+ else
+ {
+ top_mask = 0xff000000;
+ bottom_mask = 0xff000000;
+ }
+ }
+ else
+ {
+ top_mask = 0;
+ bottom_mask = 0;
+ }
+
+ end = buffer + width;
+
+ /* Zero fill to the left of the image */
+ while (buffer < end && x < pixman_fixed_minus_1)
+ {
+ *buffer++ = 0;
+ x += ux;
+ x_top += ux_top;
+ x_bottom += ux_bottom;
+ mask += mask_inc;
+ }
+
+ /* Left edge
+ */
+ while (buffer < end && x < 0)
+ {
+ uint32_t tr, br;
+ int32_t distx;
+
+ tr = top_row[pixman_fixed_to_int (x_top) + 1] | top_mask;
+ br = bottom_row[pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
+
+ distx = (x >> 8) & 0xff;
+
+ *buffer++ = bilinear_interpolation (0, tr, 0, br, distx, disty);
+
+ x += ux;
+ x_top += ux_top;
+ x_bottom += ux_bottom;
+ mask += mask_inc;
+ }
+
+ /* Main part */
+ w = pixman_int_to_fixed (bits->width - 1);
+
+ while (buffer < end && x < w)
+ {
+ if (*mask)
+ {
+ uint32_t tl, tr, bl, br;
+ int32_t distx;
+
+ tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
+ tr = top_row [pixman_fixed_to_int (x_top) + 1] | top_mask;
+ bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
+ br = bottom_row [pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
+
+ distx = (x >> 8) & 0xff;
+
+ *buffer = bilinear_interpolation (tl, tr, bl, br, distx, disty);
+ }
+
+ buffer++;
+ x += ux;
+ x_top += ux_top;
+ x_bottom += ux_bottom;
+ mask += mask_inc;
+ }
+
+ /* Right Edge */
+ w = pixman_int_to_fixed (bits->width);
+ while (buffer < end && x < w)
+ {
+ if (*mask)
+ {
+ uint32_t tl, bl;
+ int32_t distx;
+
+ tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
+ bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
+
+ distx = (x >> 8) & 0xff;
+
+ *buffer = bilinear_interpolation (tl, 0, bl, 0, distx, disty);
+ }
+
+ buffer++;
+ x += ux;
+ x_top += ux_top;
+ x_bottom += ux_bottom;
+ mask += mask_inc;
+ }
+
+ /* Zero fill to the left of the image */
+ while (buffer < end)
+ *buffer++ = 0;
+}
+
+static force_inline uint32_t
+bits_image_fetch_pixel_convolution (bits_image_t *image,
+ pixman_fixed_t x,
+ pixman_fixed_t y,
+ get_pixel_t get_pixel)
+{
+ pixman_fixed_t *params = image->common.filter_params;
+ int x_off = (params[0] - pixman_fixed_1) >> 1;
+ int y_off = (params[1] - pixman_fixed_1) >> 1;
+ int32_t cwidth = pixman_fixed_to_int (params[0]);
+ int32_t cheight = pixman_fixed_to_int (params[1]);
+ int32_t srtot, sgtot, sbtot, satot;
+ int32_t i, j, x1, x2, y1, y2;
+ pixman_repeat_t repeat_mode = image->common.repeat;
+ int width = image->width;
+ int height = image->height;
+
+ params += 2;
+
+ x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off);
+ y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off);
+ x2 = x1 + cwidth;
+ y2 = y1 + cheight;
+
+ srtot = sgtot = sbtot = satot = 0;
+
+ for (i = y1; i < y2; ++i)
+ {
+ for (j = x1; j < x2; ++j)
+ {
+ int rx = j;
+ int ry = i;
+
+ pixman_fixed_t f = *params;
+
+ if (f)
+ {
+ uint32_t pixel;
+
+ if (repeat_mode != PIXMAN_REPEAT_NONE)
+ {
+ repeat (repeat_mode, width, &rx);
+ repeat (repeat_mode, height, &ry);
+
+ pixel = get_pixel (image, rx, ry, FALSE);
+ }
+ else
+ {
+ pixel = get_pixel (image, rx, ry, TRUE);
+ }
+
+ srtot += RED_8 (pixel) * f;
+ sgtot += GREEN_8 (pixel) * f;
+ sbtot += BLUE_8 (pixel) * f;
+ satot += ALPHA_8 (pixel) * f;
+ }
+
+ params++;
+ }
+ }
+
+ satot >>= 16;
+ srtot >>= 16;
+ sgtot >>= 16;
+ sbtot >>= 16;
+
+ satot = CLIP (satot, 0, 0xff);
+ srtot = CLIP (srtot, 0, 0xff);
+ sgtot = CLIP (sgtot, 0, 0xff);
+ sbtot = CLIP (sbtot, 0, 0xff);
+
+ return ((satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot));
+}
+
+static force_inline uint32_t
+bits_image_fetch_pixel_filtered (bits_image_t *image,
+ pixman_fixed_t x,
+ pixman_fixed_t y,
+ get_pixel_t get_pixel)
+{
+ switch (image->common.filter)
+ {
+ case PIXMAN_FILTER_NEAREST:
+ case PIXMAN_FILTER_FAST:
+ return bits_image_fetch_pixel_nearest (image, x, y, get_pixel);
+ break;
+
+ case PIXMAN_FILTER_BILINEAR:
+ case PIXMAN_FILTER_GOOD:
+ case PIXMAN_FILTER_BEST:
+ return bits_image_fetch_pixel_bilinear (image, x, y, get_pixel);
+ break;
+
+ case PIXMAN_FILTER_CONVOLUTION:
+ return bits_image_fetch_pixel_convolution (image, x, y, get_pixel);
+ break;
+
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static void
+bits_image_fetch_affine_no_alpha (pixman_image_t * image,
+ int offset,
+ int line,
+ int width,
+ uint32_t * buffer,
+ const uint32_t * mask)
+{
+ pixman_fixed_t x, y;
+ pixman_fixed_t ux, uy;
+ pixman_vector_t v;
+ int i;
+
+ /* reference point is the center of the pixel */
+ v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+ v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+ v.vector[2] = pixman_fixed_1;
+
+ if (image->common.transform)
+ {
+ if (!pixman_transform_point_3d (image->common.transform, &v))
+ return;
+
+ ux = image->common.transform->matrix[0][0];
+ uy = image->common.transform->matrix[1][0];
+ }
+ else
+ {
+ ux = pixman_fixed_1;
+ uy = 0;
+ }
+
+ x = v.vector[0];
+ y = v.vector[1];
+
+ for (i = 0; i < width; ++i)
+ {
+ if (!mask || mask[i])
+ {
+ buffer[i] = bits_image_fetch_pixel_filtered (
+ &image->bits, x, y, fetch_pixel_no_alpha);
+ }
+
+ x += ux;
+ y += uy;
+ }
+}
+
+/* General fetcher */
+static force_inline uint32_t
+fetch_pixel_general (bits_image_t *image, int x, int y, pixman_bool_t check_bounds)
+{
+ uint32_t pixel;
+
+ if (check_bounds &&
+ (x < 0 || x >= image->width || y < 0 || y >= image->height))
+ {
+ return 0;
+ }
+
+ pixel = image->fetch_pixel_32 (image, x, y);
+
+ if (image->common.alpha_map)
+ {
+ uint32_t pixel_a;
+
+ x -= image->common.alpha_origin_x;
+ y -= image->common.alpha_origin_y;
+
+ if (x < 0 || x >= image->common.alpha_map->width ||
+ y < 0 || y >= image->common.alpha_map->height)
+ {
+ pixel_a = 0;
+ }
+ else
+ {
+ pixel_a = image->common.alpha_map->fetch_pixel_32 (
+ image->common.alpha_map, x, y);
+
+ pixel_a = ALPHA_8 (pixel_a);
+ }
+
+ pixel &= 0x00ffffff;
+ pixel |= (pixel_a << 24);
+ }
+
+ return pixel;
+}
+
+static void
+bits_image_fetch_general (pixman_image_t * image,
+ int offset,
+ int line,
+ int width,
+ uint32_t * buffer,
+ const uint32_t * mask)
+{
+ pixman_fixed_t x, y, w;
+ pixman_fixed_t ux, uy, uw;
+ pixman_vector_t v;
+ int i;
+
+ /* reference point is the center of the pixel */
+ v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+ v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+ v.vector[2] = pixman_fixed_1;
+
+ if (image->common.transform)
+ {
+ if (!pixman_transform_point_3d (image->common.transform, &v))
+ return;
+
+ ux = image->common.transform->matrix[0][0];
+ uy = image->common.transform->matrix[1][0];
+ uw = image->common.transform->matrix[2][0];
+ }
+ else
+ {
+ ux = pixman_fixed_1;
+ uy = 0;
+ uw = 0;
+ }
+
+ x = v.vector[0];
+ y = v.vector[1];
+ w = v.vector[2];
+
+ for (i = 0; i < width; ++i)
+ {
+ pixman_fixed_t x0, y0;
+
+ if (!mask || mask[i])
+ {
+ if (w != 0)
+ {
+ x0 = ((pixman_fixed_48_16_t)x << 16) / w;
+ y0 = ((pixman_fixed_48_16_t)y << 16) / w;
+ }
+ else
+ {
+ x0 = 0;
+ y0 = 0;
+ }
+
+ buffer[i] = bits_image_fetch_pixel_filtered (
+ &image->bits, x0, y0, fetch_pixel_general);
+ }
+
+ x += ux;
+ y += uy;
+ w += uw;
+ }
+}
+
+static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+typedef uint32_t (* convert_pixel_t) (const uint8_t *row, int x);
+
+static force_inline void
+bits_image_fetch_bilinear_affine (pixman_image_t * image,
+ int offset,
+ int line,
+ int width,
+ uint32_t * buffer,
+ const uint32_t * mask,
+
+ convert_pixel_t convert_pixel,
+ pixman_format_code_t format,
+ pixman_repeat_t repeat_mode)
+{
+ pixman_fixed_t x, y;
+ pixman_fixed_t ux, uy;
+ pixman_vector_t v;
+ bits_image_t *bits = &image->bits;
+ int i;
+
+ /* reference point is the center of the pixel */
+ v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+ v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+ v.vector[2] = pixman_fixed_1;
+
+ if (!pixman_transform_point_3d (image->common.transform, &v))
+ return;
+
+ ux = image->common.transform->matrix[0][0];
+ uy = image->common.transform->matrix[1][0];
+
+ x = v.vector[0];
+ y = v.vector[1];
+
+ for (i = 0; i < width; ++i)
+ {
+ int x1, y1, x2, y2;
+ uint32_t tl, tr, bl, br;
+ int32_t distx, disty;
+ int width = image->bits.width;
+ int height = image->bits.height;
+ const uint8_t *row1;
+ const uint8_t *row2;
+
+ if (mask && !mask[i])
+ goto next;
+
+ x1 = x - pixman_fixed_1 / 2;
+ y1 = y - pixman_fixed_1 / 2;
+
+ distx = (x1 >> 8) & 0xff;
+ disty = (y1 >> 8) & 0xff;
+
+ y1 = pixman_fixed_to_int (y1);
+ y2 = y1 + 1;
+ x1 = pixman_fixed_to_int (x1);
+ x2 = x1 + 1;
+
+ if (repeat_mode != PIXMAN_REPEAT_NONE)
+ {
+ uint32_t mask;
+
+ mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+
+ repeat (repeat_mode, width, &x1);
+ repeat (repeat_mode, height, &y1);
+ repeat (repeat_mode, width, &x2);
+ repeat (repeat_mode, height, &y2);
+
+ row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
+ row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
+
+ tl = convert_pixel (row1, x1) | mask;
+ tr = convert_pixel (row1, x2) | mask;
+ bl = convert_pixel (row2, x1) | mask;
+ br = convert_pixel (row2, x2) | mask;
+ }
+ else
+ {
+ uint32_t mask1, mask2;
+ int bpp;
+
+ /* Note: PIXMAN_FORMAT_BPP() returns an unsigned value,
+ * which means if you use it in expressions, those
+ * expressions become unsigned themselves. Since
+ * the variables below can be negative in some cases,
+ * that will lead to crashes on 64 bit architectures.
+ *
+ * So this line makes sure bpp is signed
+ */
+ bpp = PIXMAN_FORMAT_BPP (format);
+
+ if (x1 >= width || x2 < 0 || y1 >= height || y2 < 0)
+ {
+ buffer[i] = 0;
+ goto next;
+ }
+
+ if (y2 == 0)
+ {
+ row1 = zero;
+ mask1 = 0;
+ }
+ else
+ {
+ row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
+ row1 += bpp / 8 * x1;
+
+ mask1 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+ }
+
+ if (y1 == height - 1)
+ {
+ row2 = zero;
+ mask2 = 0;
+ }
+ else
+ {
+ row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
+ row2 += bpp / 8 * x1;
+
+ mask2 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+ }
+
+ if (x2 == 0)
+ {
+ tl = 0;
+ bl = 0;
+ }
+ else
+ {
+ tl = convert_pixel (row1, 0) | mask1;
+ bl = convert_pixel (row2, 0) | mask2;
+ }
+
+ if (x1 == width - 1)
+ {
+ tr = 0;
+ br = 0;
+ }
+ else
+ {
+ tr = convert_pixel (row1, 1) | mask1;
+ br = convert_pixel (row2, 1) | mask2;
+ }
+ }
+
+ buffer[i] = bilinear_interpolation (
+ tl, tr, bl, br, distx, disty);
+
+ next:
+ x += ux;
+ y += uy;
+ }
+}
+
+static force_inline void
+bits_image_fetch_nearest_affine (pixman_image_t * image,
+ int offset,
+ int line,
+ int width,
+ uint32_t * buffer,
+ const uint32_t * mask,
+
+ convert_pixel_t convert_pixel,
+ pixman_format_code_t format,
+ pixman_repeat_t repeat_mode)
+{
+ pixman_fixed_t x, y;
+ pixman_fixed_t ux, uy;
+ pixman_vector_t v;
+ bits_image_t *bits = &image->bits;
+ int i;
+
+ /* reference point is the center of the pixel */
+ v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+ v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+ v.vector[2] = pixman_fixed_1;
+
+ if (!pixman_transform_point_3d (image->common.transform, &v))
+ return;
+
+ ux = image->common.transform->matrix[0][0];
+ uy = image->common.transform->matrix[1][0];
+
+ x = v.vector[0];
+ y = v.vector[1];
+
+ for (i = 0; i < width; ++i)
+ {
+ int width, height, x0, y0;
+ const uint8_t *row;
+
+ if (mask && !mask[i])
+ goto next;
+
+ width = image->bits.width;
+ height = image->bits.height;
+ x0 = pixman_fixed_to_int (x - pixman_fixed_e);
+ y0 = pixman_fixed_to_int (y - pixman_fixed_e);
+
+ if (repeat_mode == PIXMAN_REPEAT_NONE &&
+ (y0 < 0 || y0 >= height || x0 < 0 || x0 >= width))
+ {
+ buffer[i] = 0;
+ }
+ else
+ {
+ uint32_t mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+
+ if (repeat_mode != PIXMAN_REPEAT_NONE)
+ {
+ repeat (repeat_mode, width, &x0);
+ repeat (repeat_mode, height, &y0);
+ }
+
+ row = (uint8_t *)bits->bits + bits->rowstride * 4 * y0;
+
+ buffer[i] = convert_pixel (row, x0) | mask;
+ }
+
+ next:
+ x += ux;
+ y += uy;
+ }
+}
+
+static force_inline uint32_t
+convert_a8r8g8b8 (const uint8_t *row, int x)
+{
+ return *(((uint32_t *)row) + x);
+}
+
+static force_inline uint32_t
+convert_x8r8g8b8 (const uint8_t *row, int x)
+{
+ return *(((uint32_t *)row) + x);
+}
+
+static force_inline uint32_t
+convert_a8 (const uint8_t *row, int x)
+{
+ return *(row + x) << 24;
+}
+
+static force_inline uint32_t
+convert_r5g6b5 (const uint8_t *row, int x)
+{
+ return CONVERT_0565_TO_0888 (*((uint16_t *)row + x));
+}
+
+#define MAKE_BILINEAR_FETCHER(name, format, repeat_mode) \
+ static void \
+ bits_image_fetch_bilinear_affine_ ## name (pixman_image_t *image, \
+ int offset, \
+ int line, \
+ int width, \
+ uint32_t * buffer, \
+ const uint32_t * mask) \
+ { \
+ bits_image_fetch_bilinear_affine (image, offset, line, \
+ width, buffer, mask, \
+ convert_ ## format, \
+ PIXMAN_ ## format, \
+ repeat_mode); \
+ }
+
+#define MAKE_NEAREST_FETCHER(name, format, repeat_mode) \
+ static void \
+ bits_image_fetch_nearest_affine_ ## name (pixman_image_t *image, \
+ int offset, \
+ int line, \
+ int width, \
+ uint32_t * buffer, \
+ const uint32_t * mask) \
+ { \
+ bits_image_fetch_nearest_affine (image, offset, line, \
+ width, buffer, mask, \
+ convert_ ## format, \
+ PIXMAN_ ## format, \
+ repeat_mode); \
+ }
+
+#define MAKE_FETCHERS(name, format, repeat_mode) \
+ MAKE_NEAREST_FETCHER (name, format, repeat_mode) \
+ MAKE_BILINEAR_FETCHER (name, format, repeat_mode)
+
+MAKE_FETCHERS (pad_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_NORMAL)
+MAKE_FETCHERS (pad_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_NORMAL)
+MAKE_FETCHERS (pad_a8, a8, PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_a8, a8, PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_a8, a8, PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_a8, a8, PIXMAN_REPEAT_NORMAL)
+MAKE_FETCHERS (pad_r5g6b5, r5g6b5, PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_r5g6b5, r5g6b5, PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_r5g6b5, r5g6b5, PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_r5g6b5, r5g6b5, PIXMAN_REPEAT_NORMAL)
+
+static void
+bits_image_fetch_solid_32 (pixman_image_t * image,
+ int x,
+ int y,
+ int width,
+ uint32_t * buffer,
+ const uint32_t * mask)
+{
+ uint32_t color;
+ uint32_t *end;
+
+ color = image->bits.fetch_pixel_32 (&image->bits, 0, 0);
+
+ end = buffer + width;
+ while (buffer < end)
+ *(buffer++) = color;
+}
+
+static void
+bits_image_fetch_solid_64 (pixman_image_t * image,
+ int x,
+ int y,
+ int width,
+ uint32_t * b,
+ const uint32_t * unused)
+{
+ uint64_t color;
+ uint64_t *buffer = (uint64_t *)b;
+ uint64_t *end;
+
+ color = image->bits.fetch_pixel_64 (&image->bits, 0, 0);
+
+ end = buffer + width;
+ while (buffer < end)
+ *(buffer++) = color;
+}
+
+static void
+bits_image_fetch_untransformed_repeat_none (bits_image_t *image,
+ pixman_bool_t wide,
+ int x,
+ int y,
+ int width,
+ uint32_t * buffer)
+{
+ uint32_t w;
+
+ if (y < 0 || y >= image->height)
+ {
+ memset (buffer, 0, width * (wide? 8 : 4));
+ return;
+ }
+
+ if (x < 0)
+ {
+ w = MIN (width, -x);
+
+ memset (buffer, 0, w * (wide ? 8 : 4));
+
+ width -= w;
+ buffer += w * (wide? 2 : 1);
+ x += w;
+ }
+
+ if (x < image->width)
+ {
+ w = MIN (width, image->width - x);
+
+ if (wide)
+ image->fetch_scanline_64 ((pixman_image_t *)image, x, y, w, buffer, NULL);
+ else
+ image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL);
+
+ width -= w;
+ buffer += w * (wide? 2 : 1);
+ x += w;
+ }
+
+ memset (buffer, 0, width * (wide ? 8 : 4));
+}
+
+static void
+bits_image_fetch_untransformed_repeat_normal (bits_image_t *image,
+ pixman_bool_t wide,
+ int x,
+ int y,
+ int width,
+ uint32_t * buffer)
+{
+ uint32_t w;
+
+ while (y < 0)
+ y += image->height;
+
+ while (y >= image->height)
+ y -= image->height;
+
+ while (width)
+ {
+ while (x < 0)
+ x += image->width;
+ while (x >= image->width)
+ x -= image->width;
+
+ w = MIN (width, image->width - x);
+
+ if (wide)
+ image->fetch_scanline_64 ((pixman_image_t *)image, x, y, w, buffer, NULL);
+ else
+ image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL);
+
+ buffer += w * (wide? 2 : 1);
+ x += w;
+ width -= w;
+ }
+}
+
+static void
+bits_image_fetch_untransformed_32 (pixman_image_t * image,
+ int x,
+ int y,
+ int width,
+ uint32_t * buffer,
+ const uint32_t * mask)
+{
+ if (image->common.repeat == PIXMAN_REPEAT_NONE)
+ {
+ bits_image_fetch_untransformed_repeat_none (
+ &image->bits, FALSE, x, y, width, buffer);
+ }
+ else
+ {
+ bits_image_fetch_untransformed_repeat_normal (
+ &image->bits, FALSE, x, y, width, buffer);
+ }
+}
+
+static void
+bits_image_fetch_untransformed_64 (pixman_image_t * image,
+ int x,
+ int y,
+ int width,
+ uint32_t * buffer,
+ const uint32_t * unused)
+{
+ if (image->common.repeat == PIXMAN_REPEAT_NONE)
+ {
+ bits_image_fetch_untransformed_repeat_none (
+ &image->bits, TRUE, x, y, width, buffer);
+ }
+ else
+ {
+ bits_image_fetch_untransformed_repeat_normal (
+ &image->bits, TRUE, x, y, width, buffer);
+ }
+}
+
+typedef struct
+{
+ pixman_format_code_t format;
+ uint32_t flags;
+ fetch_scanline_t fetch_32;
+ fetch_scanline_t fetch_64;
+} fetcher_info_t;
+
+static const fetcher_info_t fetcher_info[] =
+{
+ { PIXMAN_solid,
+ FAST_PATH_NO_ALPHA_MAP,
+ bits_image_fetch_solid_32,
+ bits_image_fetch_solid_64
+ },
+
+ { PIXMAN_any,
+ (FAST_PATH_NO_ALPHA_MAP |
+ FAST_PATH_ID_TRANSFORM |
+ FAST_PATH_NO_CONVOLUTION_FILTER |
+ FAST_PATH_NO_PAD_REPEAT |
+ FAST_PATH_NO_REFLECT_REPEAT),
+ bits_image_fetch_untransformed_32,
+ bits_image_fetch_untransformed_64
+ },
+
+#define FAST_BILINEAR_FLAGS \
+ (FAST_PATH_NO_ALPHA_MAP | \
+ FAST_PATH_NO_ACCESSORS | \
+ FAST_PATH_HAS_TRANSFORM | \
+ FAST_PATH_AFFINE_TRANSFORM | \
+ FAST_PATH_X_UNIT_POSITIVE | \
+ FAST_PATH_Y_UNIT_ZERO | \
+ FAST_PATH_NONE_REPEAT | \
+ FAST_PATH_BILINEAR_FILTER)
+
+ { PIXMAN_a8r8g8b8,
+ FAST_BILINEAR_FLAGS,
+ bits_image_fetch_bilinear_no_repeat_8888,
+ _pixman_image_get_scanline_generic_64
+ },
+
+ { PIXMAN_x8r8g8b8,
+ FAST_BILINEAR_FLAGS,
+ bits_image_fetch_bilinear_no_repeat_8888,
+ _pixman_image_get_scanline_generic_64
+ },
+
+#define GENERAL_BILINEAR_FLAGS \
+ (FAST_PATH_NO_ALPHA_MAP | \
+ FAST_PATH_NO_ACCESSORS | \
+ FAST_PATH_HAS_TRANSFORM | \
+ FAST_PATH_AFFINE_TRANSFORM | \
+ FAST_PATH_BILINEAR_FILTER)
+
+#define GENERAL_NEAREST_FLAGS \
+ (FAST_PATH_NO_ALPHA_MAP | \
+ FAST_PATH_NO_ACCESSORS | \
+ FAST_PATH_HAS_TRANSFORM | \
+ FAST_PATH_AFFINE_TRANSFORM | \
+ FAST_PATH_NEAREST_FILTER)
+
+#define BILINEAR_AFFINE_FAST_PATH(name, format, repeat) \
+ { PIXMAN_ ## format, \
+ GENERAL_BILINEAR_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \
+ bits_image_fetch_bilinear_affine_ ## name, \
+ _pixman_image_get_scanline_generic_64 \
+ },
+
+#define NEAREST_AFFINE_FAST_PATH(name, format, repeat) \
+ { PIXMAN_ ## format, \
+ GENERAL_NEAREST_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \
+ bits_image_fetch_nearest_affine_ ## name, \
+ _pixman_image_get_scanline_generic_64 \
+ },
+
+#define AFFINE_FAST_PATHS(name, format, repeat) \
+ BILINEAR_AFFINE_FAST_PATH(name, format, repeat) \
+ NEAREST_AFFINE_FAST_PATH(name, format, repeat)
+
+ AFFINE_FAST_PATHS (pad_a8r8g8b8, a8r8g8b8, PAD)
+ AFFINE_FAST_PATHS (none_a8r8g8b8, a8r8g8b8, NONE)
+ AFFINE_FAST_PATHS (reflect_a8r8g8b8, a8r8g8b8, REFLECT)
+ AFFINE_FAST_PATHS (normal_a8r8g8b8, a8r8g8b8, NORMAL)
+ AFFINE_FAST_PATHS (pad_x8r8g8b8, x8r8g8b8, PAD)
+ AFFINE_FAST_PATHS (none_x8r8g8b8, x8r8g8b8, NONE)
+ AFFINE_FAST_PATHS (reflect_x8r8g8b8, x8r8g8b8, REFLECT)
+ AFFINE_FAST_PATHS (normal_x8r8g8b8, x8r8g8b8, NORMAL)
+ AFFINE_FAST_PATHS (pad_a8, a8, PAD)
+ AFFINE_FAST_PATHS (none_a8, a8, NONE)
+ AFFINE_FAST_PATHS (reflect_a8, a8, REFLECT)
+ AFFINE_FAST_PATHS (normal_a8, a8, NORMAL)
+ AFFINE_FAST_PATHS (pad_r5g6b5, r5g6b5, PAD)
+ AFFINE_FAST_PATHS (none_r5g6b5, r5g6b5, NONE)
+ AFFINE_FAST_PATHS (reflect_r5g6b5, r5g6b5, REFLECT)
+ AFFINE_FAST_PATHS (normal_r5g6b5, r5g6b5, NORMAL)
+
+ /* Affine, no alpha */
+ { PIXMAN_any,
+ (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_HAS_TRANSFORM | FAST_PATH_AFFINE_TRANSFORM),
+ bits_image_fetch_affine_no_alpha,
+ _pixman_image_get_scanline_generic_64
+ },
+
+ /* General */
+ { PIXMAN_any, 0, bits_image_fetch_general, _pixman_image_get_scanline_generic_64 },
+
+ { PIXMAN_null },
+};
+
+static void
+bits_image_property_changed (pixman_image_t *image)
+{
+ uint32_t flags = image->common.flags;
+ pixman_format_code_t format = image->common.extended_format_code;
+ const fetcher_info_t *info;
+
+ _pixman_bits_image_setup_accessors (&image->bits);
+
+ info = fetcher_info;
+ while (info->format != PIXMAN_null)
+ {
+ if ((info->format == format || info->format == PIXMAN_any) &&
+ (info->flags & flags) == info->flags)
+ {
+ image->bits.get_scanline_32 = info->fetch_32;
+ image->bits.get_scanline_64 = info->fetch_64;
+ break;
+ }
+
+ info++;
+ }
+}
+
+static uint32_t *
+src_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
+{
+ iter->image->bits.get_scanline_32 (
+ iter->image, iter->x, iter->y++, iter->width, iter->buffer, mask);
+
+ return iter->buffer;
+}
+
+static uint32_t *
+src_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+ iter->image->bits.get_scanline_64 (
+ iter->image, iter->x, iter->y++, iter->width, iter->buffer, mask);
+
+ return iter->buffer;
+}
+
+void
+_pixman_bits_image_src_iter_init (pixman_image_t *image, pixman_iter_t *iter)
+{
+ if (iter->flags & ITER_NARROW)
+ iter->get_scanline = src_get_scanline_narrow;
+ else
+ iter->get_scanline = src_get_scanline_wide;
+}
+
+static uint32_t *
+dest_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
+{
+ pixman_image_t *image = iter->image;
+ int x = iter->x;
+ int y = iter->y;
+ int width = iter->width;
+ uint32_t * buffer = iter->buffer;
+
+ image->bits.fetch_scanline_32 (image, x, y, width, buffer, mask);
+ if (image->common.alpha_map)
+ {
+ x -= image->common.alpha_origin_x;
+ y -= image->common.alpha_origin_y;
+
+ image->common.alpha_map->fetch_scanline_32 (
+ (pixman_image_t *)image->common.alpha_map,
+ x, y, width, buffer, mask);
+ }
+
+ return iter->buffer;
+}
+
+static uint32_t *
+dest_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+ bits_image_t * image = &iter->image->bits;
+ int x = iter->x;
+ int y = iter->y;
+ int width = iter->width;
+ uint32_t * buffer = iter->buffer;
+
+ image->fetch_scanline_64 (
+ (pixman_image_t *)image, x, y, width, buffer, mask);
+ if (image->common.alpha_map)
+ {
+ x -= image->common.alpha_origin_x;
+ y -= image->common.alpha_origin_y;
+
+ image->common.alpha_map->fetch_scanline_64 (
+ (pixman_image_t *)image->common.alpha_map, x, y, width, buffer, mask);
+ }
+
+ return iter->buffer;
+}
+
+static void
+dest_write_back_narrow (pixman_iter_t *iter)
+{
+ bits_image_t * image = &iter->image->bits;
+ int x = iter->x;
+ int y = iter->y;
+ int width = iter->width;
+ const uint32_t *buffer = iter->buffer;
+
+ image->store_scanline_32 (image, x, y, width, buffer);
+
+ if (image->common.alpha_map)
+ {
+ x -= image->common.alpha_origin_x;
+ y -= image->common.alpha_origin_y;
+
+ image->common.alpha_map->store_scanline_32 (
+ image->common.alpha_map, x, y, width, buffer);
+ }
+
+ iter->y++;
+}
+
+static void
+dest_write_back_wide (pixman_iter_t *iter)
+{
+ bits_image_t * image = &iter->image->bits;
+ int x = iter->x;
+ int y = iter->y;
+ int width = iter->width;
+ const uint32_t *buffer = iter->buffer;
+
+ image->store_scanline_64 (image, x, y, width, buffer);
+
+ if (image->common.alpha_map)
+ {
+ x -= image->common.alpha_origin_x;
+ y -= image->common.alpha_origin_y;
+
+ image->common.alpha_map->store_scanline_64 (
+ image->common.alpha_map, x, y, width, buffer);
+ }
+
+ iter->y++;
+}
+
+static void
+dest_write_back_direct (pixman_iter_t *iter)
+{
+ iter->buffer += iter->image->bits.rowstride;
+}
+
+void
+_pixman_bits_image_dest_iter_init (pixman_image_t *image, pixman_iter_t *iter)
+{
+ if (iter->flags & ITER_NARROW)
+ {
+ if (((image->common.flags &
+ (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_NO_ACCESSORS)) ==
+ (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_NO_ACCESSORS)) &&
+ (image->bits.format == PIXMAN_a8r8g8b8 ||
+ (image->bits.format == PIXMAN_x8r8g8b8 &&
+ (iter->flags & ITER_LOCALIZED_ALPHA))))
+ {
+ iter->buffer = image->bits.bits + iter->y * image->bits.rowstride + iter->x;
+
+ iter->get_scanline = _pixman_iter_get_scanline_noop;
+ iter->write_back = dest_write_back_direct;
+ }
+ else
+ {
+ if ((iter->flags & (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) ==
+ (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA))
+ {
+ iter->get_scanline = _pixman_iter_get_scanline_noop;
+ }
+ else
+ {
+ iter->get_scanline = dest_get_scanline_narrow;
+ }
+
+ iter->write_back = dest_write_back_narrow;
+ }
+ }
+ else
+ {
+ iter->get_scanline = dest_get_scanline_wide;
+ iter->write_back = dest_write_back_wide;
+ }
+}
+
+static uint32_t *
+create_bits (pixman_format_code_t format,
+ int width,
+ int height,
+ int * rowstride_bytes)
+{
+ int stride;
+ int buf_size;
+ int bpp;
+
+ /* what follows is a long-winded way, avoiding any possibility of integer
+ * overflows, of saying:
+ * stride = ((width * bpp + 0x1f) >> 5) * sizeof (uint32_t);
+ */
+
+ bpp = PIXMAN_FORMAT_BPP (format);
+ if (pixman_multiply_overflows_int (width, bpp))
+ return NULL;
+
+ stride = width * bpp;
+ if (pixman_addition_overflows_int (stride, 0x1f))
+ return NULL;
+
+ stride += 0x1f;
+ stride >>= 5;
+
+ stride *= sizeof (uint32_t);
+
+ if (pixman_multiply_overflows_int (height, stride))
+ return NULL;
+
+ buf_size = height * stride;
+
+ if (rowstride_bytes)
+ *rowstride_bytes = stride;
+
+ return calloc (buf_size, 1);
+}
+
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_bits (pixman_format_code_t format,
+ int width,
+ int height,
+ uint32_t * bits,
+ int rowstride_bytes)
+{
+ pixman_image_t *image;
+ uint32_t *free_me = NULL;
+
+ /* must be a whole number of uint32_t's
+ */
+ return_val_if_fail (
+ bits == NULL || (rowstride_bytes % sizeof (uint32_t)) == 0, NULL);
+
+ return_val_if_fail (PIXMAN_FORMAT_BPP (format) >= PIXMAN_FORMAT_DEPTH (format), NULL);
+
+ if (!bits && width && height)
+ {
+ free_me = bits = create_bits (format, width, height, &rowstride_bytes);
+ if (!bits)
+ return NULL;
+ }
+
+ image = _pixman_image_allocate ();
+
+ if (!image)
+ {
+ if (free_me)
+ free (free_me);
+
+ return NULL;
+ }
+
+ image->type = BITS;
+ image->bits.format = format;
+ image->bits.width = width;
+ image->bits.height = height;
+ image->bits.bits = bits;
+ image->bits.free_me = free_me;
+ image->bits.read_func = NULL;
+ image->bits.write_func = NULL;
+
+ /* The rowstride is stored in number of uint32_t */
+ image->bits.rowstride = rowstride_bytes / (int) sizeof (uint32_t);
+
+ image->bits.indexed = NULL;
+
+ image->common.property_changed = bits_image_property_changed;
+
+ _pixman_image_reset_clip_region (image);
+
+ return image;
+}
diff --git a/pixman/pixman/pixman-conical-gradient.c b/pixman/pixman/pixman-conical-gradient.c
index 9d7d2e8b5..e3f230262 100644
--- a/pixman/pixman/pixman-conical-gradient.c
+++ b/pixman/pixman/pixman-conical-gradient.c
@@ -1,214 +1,211 @@
-/*
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007 Red Hat, Inc.
- * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
- * 2005 Lars Knoll & Zack Rusin, Trolltech
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Keith Packard not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission. Keith Packard makes no
- * representations about the suitability of this software for any purpose. It
- * is provided "as is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#include <stdlib.h>
-#include <math.h>
-#include "pixman-private.h"
-
-static force_inline double
-coordinates_to_parameter (double x, double y, double angle)
-{
- double t;
-
- t = atan2 (y, x) + angle;
-
- while (t < 0)
- t += 2 * M_PI;
-
- while (t >= 2 * M_PI)
- t -= 2 * M_PI;
-
- return 1 - t * (1 / (2 * M_PI)); /* Scale t to [0, 1] and
- * make rotation CCW
- */
-}
-
-static uint32_t *
-conical_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
-{
- pixman_image_t *image = iter->image;
- int x = iter->x;
- int y = iter->y;
- int width = iter->width;
- uint32_t *buffer = iter->buffer;
-
- gradient_t *gradient = (gradient_t *)image;
- conical_gradient_t *conical = (conical_gradient_t *)image;
- uint32_t *end = buffer + width;
- pixman_gradient_walker_t walker;
- pixman_bool_t affine = TRUE;
- double cx = 1.;
- double cy = 0.;
- double cz = 0.;
- double rx = x + 0.5;
- double ry = y + 0.5;
- double rz = 1.;
-
- _pixman_gradient_walker_init (&walker, gradient, image->common.repeat);
-
- if (image->common.transform)
- {
- pixman_vector_t v;
-
- /* reference point is the center of the pixel */
- v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
- v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
- v.vector[2] = pixman_fixed_1;
-
- if (!pixman_transform_point_3d (image->common.transform, &v))
- return iter->buffer;
-
- cx = image->common.transform->matrix[0][0] / 65536.;
- cy = image->common.transform->matrix[1][0] / 65536.;
- cz = image->common.transform->matrix[2][0] / 65536.;
-
- rx = v.vector[0] / 65536.;
- ry = v.vector[1] / 65536.;
- rz = v.vector[2] / 65536.;
-
- affine =
- image->common.transform->matrix[2][0] == 0 &&
- v.vector[2] == pixman_fixed_1;
- }
-
- if (affine)
- {
- rx -= conical->center.x / 65536.;
- ry -= conical->center.y / 65536.;
-
- while (buffer < end)
- {
- if (!mask || *mask++)
- {
- double t = coordinates_to_parameter (rx, ry, conical->angle);
-
- *buffer = _pixman_gradient_walker_pixel (
- &walker, (pixman_fixed_48_16_t)pixman_double_to_fixed (t));
- }
-
- ++buffer;
-
- rx += cx;
- ry += cy;
- }
- }
- else
- {
- while (buffer < end)
- {
- double x, y;
-
- if (!mask || *mask++)
- {
- double t;
-
- if (rz != 0)
- {
- x = rx / rz;
- y = ry / rz;
- }
- else
- {
- x = y = 0.;
- }
-
- x -= conical->center.x / 65536.;
- y -= conical->center.y / 65536.;
-
- t = coordinates_to_parameter (x, y, conical->angle);
-
- *buffer = _pixman_gradient_walker_pixel (
- &walker, (pixman_fixed_48_16_t)pixman_double_to_fixed (t));
- }
-
- ++buffer;
-
- rx += cx;
- ry += cy;
- rz += cz;
- }
- }
-
- iter->y++;
- return iter->buffer;
-}
-
-static uint32_t *
-conical_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
-{
- uint32_t *buffer = conical_get_scanline_narrow (iter, NULL);
-
- pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
-
- return buffer;
-}
-
-void
-_pixman_conical_gradient_iter_init (pixman_image_t *image,
- pixman_iter_t *iter,
- int x, int y, int width, int height,
- uint8_t *buffer, iter_flags_t flags)
-{
- if (flags & ITER_NARROW)
- iter->get_scanline = conical_get_scanline_narrow;
- else
- iter->get_scanline = conical_get_scanline_wide;
-}
-
-PIXMAN_EXPORT pixman_image_t *
-pixman_image_create_conical_gradient (pixman_point_fixed_t * center,
- pixman_fixed_t angle,
- const pixman_gradient_stop_t *stops,
- int n_stops)
-{
- pixman_image_t *image = _pixman_image_allocate ();
- conical_gradient_t *conical;
-
- if (!image)
- return NULL;
-
- conical = &image->conical;
-
- if (!_pixman_init_gradient (&conical->common, stops, n_stops))
- {
- free (image);
- return NULL;
- }
-
- angle = MOD (angle, pixman_int_to_fixed (360));
-
- image->type = CONICAL;
-
- conical->center = *center;
- conical->angle = (pixman_fixed_to_double (angle) / 180.0) * M_PI;
-
- return image;
-}
-
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ * 2005 Lars Knoll & Zack Rusin, Trolltech
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. Keith Packard makes no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <math.h>
+#include "pixman-private.h"
+
+static force_inline double
+coordinates_to_parameter (double x, double y, double angle)
+{
+ double t;
+
+ t = atan2 (y, x) + angle;
+
+ while (t < 0)
+ t += 2 * M_PI;
+
+ while (t >= 2 * M_PI)
+ t -= 2 * M_PI;
+
+ return 1 - t * (1 / (2 * M_PI)); /* Scale t to [0, 1] and
+ * make rotation CCW
+ */
+}
+
+static uint32_t *
+conical_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
+{
+ pixman_image_t *image = iter->image;
+ int x = iter->x;
+ int y = iter->y;
+ int width = iter->width;
+ uint32_t *buffer = iter->buffer;
+
+ gradient_t *gradient = (gradient_t *)image;
+ conical_gradient_t *conical = (conical_gradient_t *)image;
+ uint32_t *end = buffer + width;
+ pixman_gradient_walker_t walker;
+ pixman_bool_t affine = TRUE;
+ double cx = 1.;
+ double cy = 0.;
+ double cz = 0.;
+ double rx = x + 0.5;
+ double ry = y + 0.5;
+ double rz = 1.;
+
+ _pixman_gradient_walker_init (&walker, gradient, image->common.repeat);
+
+ if (image->common.transform)
+ {
+ pixman_vector_t v;
+
+ /* reference point is the center of the pixel */
+ v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
+ v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
+ v.vector[2] = pixman_fixed_1;
+
+ if (!pixman_transform_point_3d (image->common.transform, &v))
+ return iter->buffer;
+
+ cx = image->common.transform->matrix[0][0] / 65536.;
+ cy = image->common.transform->matrix[1][0] / 65536.;
+ cz = image->common.transform->matrix[2][0] / 65536.;
+
+ rx = v.vector[0] / 65536.;
+ ry = v.vector[1] / 65536.;
+ rz = v.vector[2] / 65536.;
+
+ affine =
+ image->common.transform->matrix[2][0] == 0 &&
+ v.vector[2] == pixman_fixed_1;
+ }
+
+ if (affine)
+ {
+ rx -= conical->center.x / 65536.;
+ ry -= conical->center.y / 65536.;
+
+ while (buffer < end)
+ {
+ if (!mask || *mask++)
+ {
+ double t = coordinates_to_parameter (rx, ry, conical->angle);
+
+ *buffer = _pixman_gradient_walker_pixel (
+ &walker, (pixman_fixed_48_16_t)pixman_double_to_fixed (t));
+ }
+
+ ++buffer;
+
+ rx += cx;
+ ry += cy;
+ }
+ }
+ else
+ {
+ while (buffer < end)
+ {
+ double x, y;
+
+ if (!mask || *mask++)
+ {
+ double t;
+
+ if (rz != 0)
+ {
+ x = rx / rz;
+ y = ry / rz;
+ }
+ else
+ {
+ x = y = 0.;
+ }
+
+ x -= conical->center.x / 65536.;
+ y -= conical->center.y / 65536.;
+
+ t = coordinates_to_parameter (x, y, conical->angle);
+
+ *buffer = _pixman_gradient_walker_pixel (
+ &walker, (pixman_fixed_48_16_t)pixman_double_to_fixed (t));
+ }
+
+ ++buffer;
+
+ rx += cx;
+ ry += cy;
+ rz += cz;
+ }
+ }
+
+ iter->y++;
+ return iter->buffer;
+}
+
+static uint32_t *
+conical_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+ uint32_t *buffer = conical_get_scanline_narrow (iter, NULL);
+
+ pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
+
+ return buffer;
+}
+
+void
+_pixman_conical_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter)
+{
+ if (iter->flags & ITER_NARROW)
+ iter->get_scanline = conical_get_scanline_narrow;
+ else
+ iter->get_scanline = conical_get_scanline_wide;
+}
+
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_conical_gradient (pixman_point_fixed_t * center,
+ pixman_fixed_t angle,
+ const pixman_gradient_stop_t *stops,
+ int n_stops)
+{
+ pixman_image_t *image = _pixman_image_allocate ();
+ conical_gradient_t *conical;
+
+ if (!image)
+ return NULL;
+
+ conical = &image->conical;
+
+ if (!_pixman_init_gradient (&conical->common, stops, n_stops))
+ {
+ free (image);
+ return NULL;
+ }
+
+ angle = MOD (angle, pixman_int_to_fixed (360));
+
+ image->type = CONICAL;
+
+ conical->center = *center;
+ conical->angle = (pixman_fixed_to_double (angle) / 180.0) * M_PI;
+
+ return image;
+}
+
diff --git a/pixman/pixman/pixman-general.c b/pixman/pixman/pixman-general.c
index 872fb7e9f..5bac6c65a 100644
--- a/pixman/pixman/pixman-general.c
+++ b/pixman/pixman/pixman-general.c
@@ -1,311 +1,275 @@
-/*
- * Copyright © 2009 Red Hat, Inc.
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007 Red Hat, Inc.
- * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
- * 2005 Lars Knoll & Zack Rusin, Trolltech
- * 2008 Aaron Plattner, NVIDIA Corporation
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Red Hat not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission. Red Hat makes no representations about the
- * suitability of this software for any purpose. It is provided "as is"
- * without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- */
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <limits.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "pixman-private.h"
-
-static void
-general_src_iter_init (pixman_implementation_t *imp,
- pixman_iter_t *iter,
- pixman_image_t *image,
- int x, int y, int width, int height,
- uint8_t *buffer, iter_flags_t flags)
-{
- iter->image = image;
- iter->x = x;
- iter->y = y;
- iter->width = width;
- iter->buffer = (uint32_t *)buffer;
-
- if (image->type == SOLID)
- {
- _pixman_solid_fill_iter_init (
- image, iter, x, y, width, height, buffer, flags);
- }
- else if (image->type == LINEAR)
- {
- _pixman_linear_gradient_iter_init (
- image, iter, x, y, width, height, buffer, flags);
- }
- else if (image->type == RADIAL)
- {
- _pixman_radial_gradient_iter_init (
- image, iter, x, y, width, height, buffer, flags);
- }
- else if (image->type == CONICAL)
- {
- _pixman_conical_gradient_iter_init (
- image, iter, x, y, width, height, buffer, flags);
- }
- else if (image->type == BITS)
- {
- _pixman_bits_image_src_iter_init (
- image, iter, x, y, width, height, buffer, flags);
- }
- else
- {
- _pixman_log_error (FUNC, "Pixman bug: unknown image type\n");
- }
-}
-
-static void
-general_dest_iter_init (pixman_implementation_t *imp,
- pixman_iter_t *iter,
- pixman_image_t *image,
- int x, int y, int width, int height,
- uint8_t *buffer, iter_flags_t flags)
-{
- iter->image = image;
- iter->x = x;
- iter->y = y;
- iter->width = width;
- iter->buffer = (uint32_t *)buffer;
-
- if (image->type == BITS)
- {
- _pixman_bits_image_dest_iter_init (
- image, iter, x, y, width, height, buffer, flags);
- }
- else
- {
- _pixman_log_error (FUNC, "Trying to write to a non-writable image");
- }
-}
-
-typedef struct op_info_t op_info_t;
-struct op_info_t
-{
- uint8_t src, dst;
-};
-
-#define ITER_IGNORE_BOTH \
- (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB | ITER_LOCALIZED_ALPHA)
-
-static const op_info_t op_flags[PIXMAN_N_OPERATORS] =
-{
- /* Src Dst */
- { ITER_IGNORE_BOTH, ITER_IGNORE_BOTH }, /* CLEAR */
- { ITER_LOCALIZED_ALPHA, ITER_IGNORE_BOTH }, /* SRC */
- { ITER_IGNORE_BOTH, ITER_LOCALIZED_ALPHA }, /* DST */
- { 0, ITER_LOCALIZED_ALPHA }, /* OVER */
- { ITER_LOCALIZED_ALPHA, 0 }, /* OVER_REVERSE */
- { ITER_LOCALIZED_ALPHA, ITER_IGNORE_RGB }, /* IN */
- { ITER_IGNORE_RGB, ITER_LOCALIZED_ALPHA }, /* IN_REVERSE */
- { ITER_LOCALIZED_ALPHA, ITER_IGNORE_RGB }, /* OUT */
- { ITER_IGNORE_RGB, ITER_LOCALIZED_ALPHA }, /* OUT_REVERSE */
- { 0, 0 }, /* ATOP */
- { 0, 0 }, /* ATOP_REVERSE */
- { 0, 0 }, /* XOR */
- { ITER_LOCALIZED_ALPHA, ITER_LOCALIZED_ALPHA }, /* ADD */
- { 0, 0 }, /* SATURATE */
-};
-
-#define SCANLINE_BUFFER_LENGTH 8192
-
-static void
-general_composite_rect (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src,
- pixman_image_t * mask,
- pixman_image_t * dest,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint64_t stack_scanline_buffer[(SCANLINE_BUFFER_LENGTH * 3 + 7) / 8];
- uint8_t *scanline_buffer = (uint8_t *) stack_scanline_buffer;
- uint8_t *src_buffer, *mask_buffer, *dest_buffer;
- pixman_iter_t src_iter, mask_iter, dest_iter;
- pixman_combine_32_func_t compose;
- pixman_bool_t component_alpha;
- iter_flags_t narrow, src_flags;
- int Bpp;
- int i;
-
- if ((src->common.flags & FAST_PATH_NARROW_FORMAT) &&
- (!mask || mask->common.flags & FAST_PATH_NARROW_FORMAT) &&
- (dest->common.flags & FAST_PATH_NARROW_FORMAT))
- {
- narrow = ITER_NARROW;
- Bpp = 4;
- }
- else
- {
- narrow = 0;
- Bpp = 8;
- }
-
- if (width * Bpp > SCANLINE_BUFFER_LENGTH)
- {
- scanline_buffer = pixman_malloc_abc (width, 3, Bpp);
-
- if (!scanline_buffer)
- return;
- }
-
- src_buffer = scanline_buffer;
- mask_buffer = src_buffer + width * Bpp;
- dest_buffer = mask_buffer + width * Bpp;
-
- /* src iter */
- src_flags = narrow | op_flags[op].src;
-
- _pixman_implementation_src_iter_init (imp->toplevel, &src_iter, src,
- src_x, src_y, width, height,
- src_buffer, src_flags);
-
- /* mask iter */
- if ((src_flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) ==
- (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB))
- {
- /* If it doesn't matter what the source is, then it doesn't matter
- * what the mask is
- */
- mask = NULL;
- }
-
- component_alpha =
- mask &&
- mask->common.type == BITS &&
- mask->common.component_alpha &&
- PIXMAN_FORMAT_RGB (mask->bits.format);
-
- _pixman_implementation_src_iter_init (
- imp->toplevel, &mask_iter, mask, mask_x, mask_y, width, height,
- mask_buffer, narrow | (component_alpha? 0 : ITER_IGNORE_RGB));
-
- /* dest iter */
- _pixman_implementation_dest_iter_init (imp->toplevel, &dest_iter, dest,
- dest_x, dest_y, width, height,
- dest_buffer,
- narrow | op_flags[op].dst);
-
- if (narrow)
- {
- if (component_alpha)
- compose = _pixman_implementation_combine_32_ca;
- else
- compose = _pixman_implementation_combine_32;
- }
- else
- {
- if (component_alpha)
- compose = (pixman_combine_32_func_t)_pixman_implementation_combine_64_ca;
- else
- compose = (pixman_combine_32_func_t)_pixman_implementation_combine_64;
- }
-
- if (!compose)
- return;
-
- for (i = 0; i < height; ++i)
- {
- uint32_t *s, *m, *d;
-
- m = mask_iter.get_scanline (&mask_iter, NULL);
- s = src_iter.get_scanline (&src_iter, m);
- d = dest_iter.get_scanline (&dest_iter, NULL);
-
- compose (imp->toplevel, op, d, s, m, width);
-
- dest_iter.write_back (&dest_iter);
- }
-
- if (scanline_buffer != (uint8_t *) stack_scanline_buffer)
- free (scanline_buffer);
-}
-
-static const pixman_fast_path_t general_fast_path[] =
-{
- { PIXMAN_OP_any, PIXMAN_any, 0, PIXMAN_any, 0, PIXMAN_any, 0, general_composite_rect },
- { PIXMAN_OP_NONE }
-};
-
-static pixman_bool_t
-general_blt (pixman_implementation_t *imp,
- uint32_t * src_bits,
- uint32_t * dst_bits,
- int src_stride,
- int dst_stride,
- int src_bpp,
- int dst_bpp,
- int src_x,
- int src_y,
- int dst_x,
- int dst_y,
- int width,
- int height)
-{
- /* We can't blit unless we have sse2 or mmx */
-
- return FALSE;
-}
-
-static pixman_bool_t
-general_fill (pixman_implementation_t *imp,
- uint32_t * bits,
- int stride,
- int bpp,
- int x,
- int y,
- int width,
- int height,
- uint32_t xor)
-{
- return FALSE;
-}
-
-pixman_implementation_t *
-_pixman_implementation_create_general (void)
-{
- pixman_implementation_t *imp = _pixman_implementation_create (NULL, general_fast_path);
-
- _pixman_setup_combiner_functions_32 (imp);
- _pixman_setup_combiner_functions_64 (imp);
-
- imp->blt = general_blt;
- imp->fill = general_fill;
- imp->src_iter_init = general_src_iter_init;
- imp->dest_iter_init = general_dest_iter_init;
-
- return imp;
-}
-
+/*
+ * Copyright © 2009 Red Hat, Inc.
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ * 2005 Lars Knoll & Zack Rusin, Trolltech
+ * 2008 Aaron Plattner, NVIDIA Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission. Red Hat makes no representations about the
+ * suitability of this software for any purpose. It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pixman-private.h"
+
+static void
+general_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{
+ pixman_image_t *image = iter->image;
+
+ if (image->type == SOLID)
+ _pixman_solid_fill_iter_init (image, iter);
+ else if (image->type == LINEAR)
+ _pixman_linear_gradient_iter_init (image, iter);
+ else if (image->type == RADIAL)
+ _pixman_radial_gradient_iter_init (image, iter);
+ else if (image->type == CONICAL)
+ _pixman_conical_gradient_iter_init (image, iter);
+ else if (image->type == BITS)
+ _pixman_bits_image_src_iter_init (image, iter);
+ else
+ _pixman_log_error (FUNC, "Pixman bug: unknown image type\n");
+}
+
+static void
+general_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{
+ if (iter->image->type == BITS)
+ {
+ _pixman_bits_image_dest_iter_init (iter->image, iter);
+ }
+ else
+ {
+ _pixman_log_error (FUNC, "Trying to write to a non-writable image");
+ }
+}
+
+typedef struct op_info_t op_info_t;
+struct op_info_t
+{
+ uint8_t src, dst;
+};
+
+#define ITER_IGNORE_BOTH \
+ (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB | ITER_LOCALIZED_ALPHA)
+
+static const op_info_t op_flags[PIXMAN_N_OPERATORS] =
+{
+ /* Src Dst */
+ { ITER_IGNORE_BOTH, ITER_IGNORE_BOTH }, /* CLEAR */
+ { ITER_LOCALIZED_ALPHA, ITER_IGNORE_BOTH }, /* SRC */
+ { ITER_IGNORE_BOTH, ITER_LOCALIZED_ALPHA }, /* DST */
+ { 0, ITER_LOCALIZED_ALPHA }, /* OVER */
+ { ITER_LOCALIZED_ALPHA, 0 }, /* OVER_REVERSE */
+ { ITER_LOCALIZED_ALPHA, ITER_IGNORE_RGB }, /* IN */
+ { ITER_IGNORE_RGB, ITER_LOCALIZED_ALPHA }, /* IN_REVERSE */
+ { ITER_LOCALIZED_ALPHA, ITER_IGNORE_RGB }, /* OUT */
+ { ITER_IGNORE_RGB, ITER_LOCALIZED_ALPHA }, /* OUT_REVERSE */
+ { 0, 0 }, /* ATOP */
+ { 0, 0 }, /* ATOP_REVERSE */
+ { 0, 0 }, /* XOR */
+ { ITER_LOCALIZED_ALPHA, ITER_LOCALIZED_ALPHA }, /* ADD */
+ { 0, 0 }, /* SATURATE */
+};
+
+#define SCANLINE_BUFFER_LENGTH 8192
+
+static void
+general_composite_rect (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src,
+ pixman_image_t * mask,
+ pixman_image_t * dest,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint64_t stack_scanline_buffer[(SCANLINE_BUFFER_LENGTH * 3 + 7) / 8];
+ uint8_t *scanline_buffer = (uint8_t *) stack_scanline_buffer;
+ uint8_t *src_buffer, *mask_buffer, *dest_buffer;
+ pixman_iter_t src_iter, mask_iter, dest_iter;
+ pixman_combine_32_func_t compose;
+ pixman_bool_t component_alpha;
+ iter_flags_t narrow, src_flags;
+ int Bpp;
+ int i;
+
+ if ((src->common.flags & FAST_PATH_NARROW_FORMAT) &&
+ (!mask || mask->common.flags & FAST_PATH_NARROW_FORMAT) &&
+ (dest->common.flags & FAST_PATH_NARROW_FORMAT))
+ {
+ narrow = ITER_NARROW;
+ Bpp = 4;
+ }
+ else
+ {
+ narrow = 0;
+ Bpp = 8;
+ }
+
+ if (width * Bpp > SCANLINE_BUFFER_LENGTH)
+ {
+ scanline_buffer = pixman_malloc_abc (width, 3, Bpp);
+
+ if (!scanline_buffer)
+ return;
+ }
+
+ src_buffer = scanline_buffer;
+ mask_buffer = src_buffer + width * Bpp;
+ dest_buffer = mask_buffer + width * Bpp;
+
+ /* src iter */
+ src_flags = narrow | op_flags[op].src;
+
+ _pixman_implementation_src_iter_init (imp->toplevel, &src_iter, src,
+ src_x, src_y, width, height,
+ src_buffer, src_flags);
+
+ /* mask iter */
+ if ((src_flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) ==
+ (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB))
+ {
+ /* If it doesn't matter what the source is, then it doesn't matter
+ * what the mask is
+ */
+ mask = NULL;
+ }
+
+ component_alpha =
+ mask &&
+ mask->common.type == BITS &&
+ mask->common.component_alpha &&
+ PIXMAN_FORMAT_RGB (mask->bits.format);
+
+ _pixman_implementation_src_iter_init (
+ imp->toplevel, &mask_iter, mask, mask_x, mask_y, width, height,
+ mask_buffer, narrow | (component_alpha? 0 : ITER_IGNORE_RGB));
+
+ /* dest iter */
+ _pixman_implementation_dest_iter_init (imp->toplevel, &dest_iter, dest,
+ dest_x, dest_y, width, height,
+ dest_buffer,
+ narrow | op_flags[op].dst);
+
+ if (narrow)
+ {
+ if (component_alpha)
+ compose = _pixman_implementation_combine_32_ca;
+ else
+ compose = _pixman_implementation_combine_32;
+ }
+ else
+ {
+ if (component_alpha)
+ compose = (pixman_combine_32_func_t)_pixman_implementation_combine_64_ca;
+ else
+ compose = (pixman_combine_32_func_t)_pixman_implementation_combine_64;
+ }
+
+ if (!compose)
+ return;
+
+ for (i = 0; i < height; ++i)
+ {
+ uint32_t *s, *m, *d;
+
+ m = mask_iter.get_scanline (&mask_iter, NULL);
+ s = src_iter.get_scanline (&src_iter, m);
+ d = dest_iter.get_scanline (&dest_iter, NULL);
+
+ compose (imp->toplevel, op, d, s, m, width);
+
+ dest_iter.write_back (&dest_iter);
+ }
+
+ if (scanline_buffer != (uint8_t *) stack_scanline_buffer)
+ free (scanline_buffer);
+}
+
+static const pixman_fast_path_t general_fast_path[] =
+{
+ { PIXMAN_OP_any, PIXMAN_any, 0, PIXMAN_any, 0, PIXMAN_any, 0, general_composite_rect },
+ { PIXMAN_OP_NONE }
+};
+
+static pixman_bool_t
+general_blt (pixman_implementation_t *imp,
+ uint32_t * src_bits,
+ uint32_t * dst_bits,
+ int src_stride,
+ int dst_stride,
+ int src_bpp,
+ int dst_bpp,
+ int src_x,
+ int src_y,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height)
+{
+ /* We can't blit unless we have sse2 or mmx */
+
+ return FALSE;
+}
+
+static pixman_bool_t
+general_fill (pixman_implementation_t *imp,
+ uint32_t * bits,
+ int stride,
+ int bpp,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t xor)
+{
+ return FALSE;
+}
+
+pixman_implementation_t *
+_pixman_implementation_create_general (void)
+{
+ pixman_implementation_t *imp = _pixman_implementation_create (NULL, general_fast_path);
+
+ _pixman_setup_combiner_functions_32 (imp);
+ _pixman_setup_combiner_functions_64 (imp);
+
+ imp->blt = general_blt;
+ imp->fill = general_fill;
+ imp->src_iter_init = general_src_iter_init;
+ imp->dest_iter_init = general_dest_iter_init;
+
+ return imp;
+}
+
diff --git a/pixman/pixman/pixman-implementation.c b/pixman/pixman/pixman-implementation.c
index adaf9c61e..caade9332 100644
--- a/pixman/pixman/pixman-implementation.c
+++ b/pixman/pixman/pixman-implementation.c
@@ -1,306 +1,304 @@
-/*
- * Copyright © 2009 Red Hat, Inc.
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Red Hat not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission. Red Hat makes no representations about the
- * suitability of this software for any purpose. It is provided "as is"
- * without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include <stdlib.h>
-#include "pixman-private.h"
-
-static void
-delegate_combine_32 (pixman_implementation_t * imp,
- pixman_op_t op,
- uint32_t * dest,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- _pixman_implementation_combine_32 (imp->delegate,
- op, dest, src, mask, width);
-}
-
-static void
-delegate_combine_64 (pixman_implementation_t * imp,
- pixman_op_t op,
- uint64_t * dest,
- const uint64_t * src,
- const uint64_t * mask,
- int width)
-{
- _pixman_implementation_combine_64 (imp->delegate,
- op, dest, src, mask, width);
-}
-
-static void
-delegate_combine_32_ca (pixman_implementation_t * imp,
- pixman_op_t op,
- uint32_t * dest,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- _pixman_implementation_combine_32_ca (imp->delegate,
- op, dest, src, mask, width);
-}
-
-static void
-delegate_combine_64_ca (pixman_implementation_t * imp,
- pixman_op_t op,
- uint64_t * dest,
- const uint64_t * src,
- const uint64_t * mask,
- int width)
-{
- _pixman_implementation_combine_64_ca (imp->delegate,
- op, dest, src, mask, width);
-}
-
-static pixman_bool_t
-delegate_blt (pixman_implementation_t * imp,
- uint32_t * src_bits,
- uint32_t * dst_bits,
- int src_stride,
- int dst_stride,
- int src_bpp,
- int dst_bpp,
- int src_x,
- int src_y,
- int dst_x,
- int dst_y,
- int width,
- int height)
-{
- return _pixman_implementation_blt (
- imp->delegate, src_bits, dst_bits, src_stride, dst_stride,
- src_bpp, dst_bpp, src_x, src_y, dst_x, dst_y,
- width, height);
-}
-
-static pixman_bool_t
-delegate_fill (pixman_implementation_t *imp,
- uint32_t * bits,
- int stride,
- int bpp,
- int x,
- int y,
- int width,
- int height,
- uint32_t xor)
-{
- return _pixman_implementation_fill (
- imp->delegate, bits, stride, bpp, x, y, width, height, xor);
-}
-
-static void
-delegate_src_iter_init (pixman_implementation_t *imp,
- pixman_iter_t * iter,
- pixman_image_t * image,
- int x,
- int y,
- int width,
- int height,
- uint8_t * buffer,
- iter_flags_t flags)
-{
- _pixman_implementation_src_iter_init (
- imp->delegate, iter, image, x, y, width, height, buffer, flags);
-}
-
-static void
-delegate_dest_iter_init (pixman_implementation_t *imp,
- pixman_iter_t * iter,
- pixman_image_t * image,
- int x,
- int y,
- int width,
- int height,
- uint8_t * buffer,
- iter_flags_t flags)
-{
- _pixman_implementation_dest_iter_init (
- imp->delegate, iter, image, x, y, width, height, buffer, flags);
-}
-
-pixman_implementation_t *
-_pixman_implementation_create (pixman_implementation_t *delegate,
- const pixman_fast_path_t *fast_paths)
-{
- pixman_implementation_t *imp = malloc (sizeof (pixman_implementation_t));
- pixman_implementation_t *d;
- int i;
-
- if (!imp)
- return NULL;
-
- assert (fast_paths);
-
- /* Make sure the whole delegate chain has the right toplevel */
- imp->delegate = delegate;
- for (d = imp; d != NULL; d = d->delegate)
- d->toplevel = imp;
-
- /* Fill out function pointers with ones that just delegate
- */
- imp->blt = delegate_blt;
- imp->fill = delegate_fill;
- imp->src_iter_init = delegate_src_iter_init;
- imp->dest_iter_init = delegate_dest_iter_init;
-
- for (i = 0; i < PIXMAN_N_OPERATORS; ++i)
- {
- imp->combine_32[i] = delegate_combine_32;
- imp->combine_64[i] = delegate_combine_64;
- imp->combine_32_ca[i] = delegate_combine_32_ca;
- imp->combine_64_ca[i] = delegate_combine_64_ca;
- }
-
- imp->fast_paths = fast_paths;
-
- return imp;
-}
-
-void
-_pixman_implementation_combine_32 (pixman_implementation_t * imp,
- pixman_op_t op,
- uint32_t * dest,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- (*imp->combine_32[op]) (imp, op, dest, src, mask, width);
-}
-
-void
-_pixman_implementation_combine_64 (pixman_implementation_t * imp,
- pixman_op_t op,
- uint64_t * dest,
- const uint64_t * src,
- const uint64_t * mask,
- int width)
-{
- (*imp->combine_64[op]) (imp, op, dest, src, mask, width);
-}
-
-void
-_pixman_implementation_combine_32_ca (pixman_implementation_t * imp,
- pixman_op_t op,
- uint32_t * dest,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- (*imp->combine_32_ca[op]) (imp, op, dest, src, mask, width);
-}
-
-void
-_pixman_implementation_combine_64_ca (pixman_implementation_t * imp,
- pixman_op_t op,
- uint64_t * dest,
- const uint64_t * src,
- const uint64_t * mask,
- int width)
-{
- (*imp->combine_64_ca[op]) (imp, op, dest, src, mask, width);
-}
-
-pixman_bool_t
-_pixman_implementation_blt (pixman_implementation_t * imp,
- uint32_t * src_bits,
- uint32_t * dst_bits,
- int src_stride,
- int dst_stride,
- int src_bpp,
- int dst_bpp,
- int src_x,
- int src_y,
- int dst_x,
- int dst_y,
- int width,
- int height)
-{
- return (*imp->blt) (imp, src_bits, dst_bits, src_stride, dst_stride,
- src_bpp, dst_bpp, src_x, src_y, dst_x, dst_y,
- width, height);
-}
-
-pixman_bool_t
-_pixman_implementation_fill (pixman_implementation_t *imp,
- uint32_t * bits,
- int stride,
- int bpp,
- int x,
- int y,
- int width,
- int height,
- uint32_t xor)
-{
- return (*imp->fill) (imp, bits, stride, bpp, x, y, width, height, xor);
-}
-
-static uint32_t *
-get_scanline_null (pixman_iter_t *iter, const uint32_t *mask)
-{
- return NULL;
-}
-
-void
-_pixman_implementation_src_iter_init (pixman_implementation_t *imp,
- pixman_iter_t *iter,
- pixman_image_t *image,
- int x,
- int y,
- int width,
- int height,
- uint8_t *buffer,
- iter_flags_t flags)
-{
- if (!image)
- {
- iter->get_scanline = get_scanline_null;
- }
- else if ((flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) ==
- (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB))
- {
- iter->get_scanline = _pixman_iter_get_scanline_noop;
- }
- else
- {
- (*imp->src_iter_init) (
- imp, iter, image, x, y, width, height, buffer, flags);
- }
-}
-
-void
-_pixman_implementation_dest_iter_init (pixman_implementation_t *imp,
- pixman_iter_t *iter,
- pixman_image_t *image,
- int x,
- int y,
- int width,
- int height,
- uint8_t *buffer,
- iter_flags_t flags)
-{
- (*imp->dest_iter_init) (
- imp, iter, image, x, y, width, height, buffer, flags);
-}
+/*
+ * Copyright © 2009 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission. Red Hat makes no representations about the
+ * suitability of this software for any purpose. It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdlib.h>
+#include "pixman-private.h"
+
+static void
+delegate_combine_32 (pixman_implementation_t * imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ _pixman_implementation_combine_32 (imp->delegate,
+ op, dest, src, mask, width);
+}
+
+static void
+delegate_combine_64 (pixman_implementation_t * imp,
+ pixman_op_t op,
+ uint64_t * dest,
+ const uint64_t * src,
+ const uint64_t * mask,
+ int width)
+{
+ _pixman_implementation_combine_64 (imp->delegate,
+ op, dest, src, mask, width);
+}
+
+static void
+delegate_combine_32_ca (pixman_implementation_t * imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ _pixman_implementation_combine_32_ca (imp->delegate,
+ op, dest, src, mask, width);
+}
+
+static void
+delegate_combine_64_ca (pixman_implementation_t * imp,
+ pixman_op_t op,
+ uint64_t * dest,
+ const uint64_t * src,
+ const uint64_t * mask,
+ int width)
+{
+ _pixman_implementation_combine_64_ca (imp->delegate,
+ op, dest, src, mask, width);
+}
+
+static pixman_bool_t
+delegate_blt (pixman_implementation_t * imp,
+ uint32_t * src_bits,
+ uint32_t * dst_bits,
+ int src_stride,
+ int dst_stride,
+ int src_bpp,
+ int dst_bpp,
+ int src_x,
+ int src_y,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height)
+{
+ return _pixman_implementation_blt (
+ imp->delegate, src_bits, dst_bits, src_stride, dst_stride,
+ src_bpp, dst_bpp, src_x, src_y, dst_x, dst_y,
+ width, height);
+}
+
+static pixman_bool_t
+delegate_fill (pixman_implementation_t *imp,
+ uint32_t * bits,
+ int stride,
+ int bpp,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t xor)
+{
+ return _pixman_implementation_fill (
+ imp->delegate, bits, stride, bpp, x, y, width, height, xor);
+}
+
+static void
+delegate_src_iter_init (pixman_implementation_t *imp,
+ pixman_iter_t * iter)
+{
+ imp->delegate->src_iter_init (imp->delegate, iter);
+}
+
+static void
+delegate_dest_iter_init (pixman_implementation_t *imp,
+ pixman_iter_t * iter)
+{
+ imp->delegate->dest_iter_init (imp->delegate, iter);
+}
+
+pixman_implementation_t *
+_pixman_implementation_create (pixman_implementation_t *delegate,
+ const pixman_fast_path_t *fast_paths)
+{
+ pixman_implementation_t *imp = malloc (sizeof (pixman_implementation_t));
+ pixman_implementation_t *d;
+ int i;
+
+ if (!imp)
+ return NULL;
+
+ assert (fast_paths);
+
+ /* Make sure the whole delegate chain has the right toplevel */
+ imp->delegate = delegate;
+ for (d = imp; d != NULL; d = d->delegate)
+ d->toplevel = imp;
+
+ /* Fill out function pointers with ones that just delegate
+ */
+ imp->blt = delegate_blt;
+ imp->fill = delegate_fill;
+ imp->src_iter_init = delegate_src_iter_init;
+ imp->dest_iter_init = delegate_dest_iter_init;
+
+ for (i = 0; i < PIXMAN_N_OPERATORS; ++i)
+ {
+ imp->combine_32[i] = delegate_combine_32;
+ imp->combine_64[i] = delegate_combine_64;
+ imp->combine_32_ca[i] = delegate_combine_32_ca;
+ imp->combine_64_ca[i] = delegate_combine_64_ca;
+ }
+
+ imp->fast_paths = fast_paths;
+
+ return imp;
+}
+
+void
+_pixman_implementation_combine_32 (pixman_implementation_t * imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ (*imp->combine_32[op]) (imp, op, dest, src, mask, width);
+}
+
+void
+_pixman_implementation_combine_64 (pixman_implementation_t * imp,
+ pixman_op_t op,
+ uint64_t * dest,
+ const uint64_t * src,
+ const uint64_t * mask,
+ int width)
+{
+ (*imp->combine_64[op]) (imp, op, dest, src, mask, width);
+}
+
+void
+_pixman_implementation_combine_32_ca (pixman_implementation_t * imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ (*imp->combine_32_ca[op]) (imp, op, dest, src, mask, width);
+}
+
+void
+_pixman_implementation_combine_64_ca (pixman_implementation_t * imp,
+ pixman_op_t op,
+ uint64_t * dest,
+ const uint64_t * src,
+ const uint64_t * mask,
+ int width)
+{
+ (*imp->combine_64_ca[op]) (imp, op, dest, src, mask, width);
+}
+
+pixman_bool_t
+_pixman_implementation_blt (pixman_implementation_t * imp,
+ uint32_t * src_bits,
+ uint32_t * dst_bits,
+ int src_stride,
+ int dst_stride,
+ int src_bpp,
+ int dst_bpp,
+ int src_x,
+ int src_y,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height)
+{
+ return (*imp->blt) (imp, src_bits, dst_bits, src_stride, dst_stride,
+ src_bpp, dst_bpp, src_x, src_y, dst_x, dst_y,
+ width, height);
+}
+
+pixman_bool_t
+_pixman_implementation_fill (pixman_implementation_t *imp,
+ uint32_t * bits,
+ int stride,
+ int bpp,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t xor)
+{
+ return (*imp->fill) (imp, bits, stride, bpp, x, y, width, height, xor);
+}
+
+static uint32_t *
+get_scanline_null (pixman_iter_t *iter, const uint32_t *mask)
+{
+ return NULL;
+}
+
+void
+_pixman_implementation_src_iter_init (pixman_implementation_t *imp,
+ pixman_iter_t *iter,
+ pixman_image_t *image,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint8_t *buffer,
+ iter_flags_t flags)
+{
+ iter->image = image;
+ iter->buffer = (uint32_t *)buffer;
+ iter->x = x;
+ iter->y = y;
+ iter->width = width;
+ iter->height = height;
+ iter->flags = flags;
+
+ if (!image)
+ {
+ iter->get_scanline = get_scanline_null;
+ }
+ else if ((flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) ==
+ (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB))
+ {
+ iter->get_scanline = _pixman_iter_get_scanline_noop;
+ }
+ else
+ {
+ (*imp->src_iter_init) (imp, iter);
+ }
+}
+
+void
+_pixman_implementation_dest_iter_init (pixman_implementation_t *imp,
+ pixman_iter_t *iter,
+ pixman_image_t *image,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint8_t *buffer,
+ iter_flags_t flags)
+{
+ iter->image = image;
+ iter->buffer = (uint32_t *)buffer;
+ iter->x = x;
+ iter->y = y;
+ iter->width = width;
+ iter->height = height;
+ iter->flags = flags;
+
+ (*imp->dest_iter_init) (imp, iter);
+}
diff --git a/pixman/pixman/pixman-linear-gradient.c b/pixman/pixman/pixman-linear-gradient.c
index 07303fc03..3d5bbf63d 100644
--- a/pixman/pixman/pixman-linear-gradient.c
+++ b/pixman/pixman/pixman-linear-gradient.c
@@ -1,292 +1,286 @@
-/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
-/*
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007 Red Hat, Inc.
- * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
- * 2005 Lars Knoll & Zack Rusin, Trolltech
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Keith Packard not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission. Keith Packard makes no
- * representations about the suitability of this software for any purpose. It
- * is provided "as is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include <stdlib.h>
-#include "pixman-private.h"
-
-static pixman_bool_t
-linear_gradient_is_horizontal (pixman_image_t *image,
- int x,
- int y,
- int width,
- int height)
-{
- linear_gradient_t *linear = (linear_gradient_t *)image;
- pixman_vector_t v;
- pixman_fixed_32_32_t l;
- pixman_fixed_48_16_t dx, dy;
- double inc;
-
- if (image->common.transform)
- {
- /* projective transformation */
- if (image->common.transform->matrix[2][0] != 0 ||
- image->common.transform->matrix[2][1] != 0 ||
- image->common.transform->matrix[2][2] == 0)
- {
- return FALSE;
- }
-
- v.vector[0] = image->common.transform->matrix[0][1];
- v.vector[1] = image->common.transform->matrix[1][1];
- v.vector[2] = image->common.transform->matrix[2][2];
- }
- else
- {
- v.vector[0] = 0;
- v.vector[1] = pixman_fixed_1;
- v.vector[2] = pixman_fixed_1;
- }
-
- dx = linear->p2.x - linear->p1.x;
- dy = linear->p2.y - linear->p1.y;
-
- l = dx * dx + dy * dy;
-
- if (l == 0)
- return FALSE;
-
- /*
- * compute how much the input of the gradient walked changes
- * when moving vertically through the whole image
- */
- inc = height * (double) pixman_fixed_1 * pixman_fixed_1 *
- (dx * v.vector[0] + dy * v.vector[1]) /
- (v.vector[2] * (double) l);
-
- /* check that casting to integer would result in 0 */
- if (-1 < inc && inc < 1)
- return TRUE;
-
- return FALSE;
-}
-
-static uint32_t *
-linear_get_scanline_narrow (pixman_iter_t *iter,
- const uint32_t *mask)
-{
- pixman_image_t *image = iter->image;
- int x = iter->x;
- int y = iter->y;
- int width = iter->width;
- uint32_t * buffer = iter->buffer;
-
- pixman_vector_t v, unit;
- pixman_fixed_32_32_t l;
- pixman_fixed_48_16_t dx, dy;
- gradient_t *gradient = (gradient_t *)image;
- linear_gradient_t *linear = (linear_gradient_t *)image;
- uint32_t *end = buffer + width;
- pixman_gradient_walker_t walker;
-
- _pixman_gradient_walker_init (&walker, gradient, image->common.repeat);
-
- /* reference point is the center of the pixel */
- v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
- v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
- v.vector[2] = pixman_fixed_1;
-
- if (image->common.transform)
- {
- if (!pixman_transform_point_3d (image->common.transform, &v))
- return iter->buffer;
-
- unit.vector[0] = image->common.transform->matrix[0][0];
- unit.vector[1] = image->common.transform->matrix[1][0];
- unit.vector[2] = image->common.transform->matrix[2][0];
- }
- else
- {
- unit.vector[0] = pixman_fixed_1;
- unit.vector[1] = 0;
- unit.vector[2] = 0;
- }
-
- dx = linear->p2.x - linear->p1.x;
- dy = linear->p2.y - linear->p1.y;
-
- l = dx * dx + dy * dy;
-
- if (l == 0 || unit.vector[2] == 0)
- {
- /* affine transformation only */
- pixman_fixed_32_32_t t, next_inc;
- double inc;
-
- if (l == 0 || v.vector[2] == 0)
- {
- t = 0;
- inc = 0;
- }
- else
- {
- double invden, v2;
-
- invden = pixman_fixed_1 * (double) pixman_fixed_1 /
- (l * (double) v.vector[2]);
- v2 = v.vector[2] * (1. / pixman_fixed_1);
- t = ((dx * v.vector[0] + dy * v.vector[1]) -
- (dx * linear->p1.x + dy * linear->p1.y) * v2) * invden;
- inc = (dx * unit.vector[0] + dy * unit.vector[1]) * invden;
- }
- next_inc = 0;
-
- if (((pixman_fixed_32_32_t )(inc * width)) == 0)
- {
- register uint32_t color;
-
- color = _pixman_gradient_walker_pixel (&walker, t);
- while (buffer < end)
- *buffer++ = color;
- }
- else
- {
- int i;
-
- i = 0;
- while (buffer < end)
- {
- if (!mask || *mask++)
- {
- *buffer = _pixman_gradient_walker_pixel (&walker,
- t + next_inc);
- }
- i++;
- next_inc = inc * i;
- buffer++;
- }
- }
- }
- else
- {
- /* projective transformation */
- double t;
-
- t = 0;
-
- while (buffer < end)
- {
- if (!mask || *mask++)
- {
- if (v.vector[2] != 0)
- {
- double invden, v2;
-
- invden = pixman_fixed_1 * (double) pixman_fixed_1 /
- (l * (double) v.vector[2]);
- v2 = v.vector[2] * (1. / pixman_fixed_1);
- t = ((dx * v.vector[0] + dy * v.vector[1]) -
- (dx * linear->p1.x + dy * linear->p1.y) * v2) * invden;
- }
-
- *buffer = _pixman_gradient_walker_pixel (&walker, t);
- }
-
- ++buffer;
-
- v.vector[0] += unit.vector[0];
- v.vector[1] += unit.vector[1];
- v.vector[2] += unit.vector[2];
- }
- }
-
- iter->y++;
-
- return iter->buffer;
-}
-
-static uint32_t *
-linear_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
-{
- uint32_t *buffer = linear_get_scanline_narrow (iter, NULL);
-
- pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
-
- return buffer;
-}
-
-void
-_pixman_linear_gradient_iter_init (pixman_image_t *image,
- pixman_iter_t *iter,
- int x,
- int y,
- int width,
- int height,
- uint8_t *buffer,
- iter_flags_t flags)
-{
- if (linear_gradient_is_horizontal (image, x, y, width, height))
- {
- if (flags & ITER_NARROW)
- linear_get_scanline_narrow (iter, NULL);
- else
- linear_get_scanline_wide (iter, NULL);
-
- iter->get_scanline = _pixman_iter_get_scanline_noop;
- }
- else
- {
- if (flags & ITER_NARROW)
- iter->get_scanline = linear_get_scanline_narrow;
- else
- iter->get_scanline = linear_get_scanline_wide;
- }
-}
-
-PIXMAN_EXPORT pixman_image_t *
-pixman_image_create_linear_gradient (pixman_point_fixed_t * p1,
- pixman_point_fixed_t * p2,
- const pixman_gradient_stop_t *stops,
- int n_stops)
-{
- pixman_image_t *image;
- linear_gradient_t *linear;
-
- image = _pixman_image_allocate ();
-
- if (!image)
- return NULL;
-
- linear = &image->linear;
-
- if (!_pixman_init_gradient (&linear->common, stops, n_stops))
- {
- free (image);
- return NULL;
- }
-
- linear->p1 = *p1;
- linear->p2 = *p2;
-
- image->type = LINEAR;
-
- return image;
-}
-
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ * 2005 Lars Knoll & Zack Rusin, Trolltech
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. Keith Packard makes no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdlib.h>
+#include "pixman-private.h"
+
+static pixman_bool_t
+linear_gradient_is_horizontal (pixman_image_t *image,
+ int x,
+ int y,
+ int width,
+ int height)
+{
+ linear_gradient_t *linear = (linear_gradient_t *)image;
+ pixman_vector_t v;
+ pixman_fixed_32_32_t l;
+ pixman_fixed_48_16_t dx, dy;
+ double inc;
+
+ if (image->common.transform)
+ {
+ /* projective transformation */
+ if (image->common.transform->matrix[2][0] != 0 ||
+ image->common.transform->matrix[2][1] != 0 ||
+ image->common.transform->matrix[2][2] == 0)
+ {
+ return FALSE;
+ }
+
+ v.vector[0] = image->common.transform->matrix[0][1];
+ v.vector[1] = image->common.transform->matrix[1][1];
+ v.vector[2] = image->common.transform->matrix[2][2];
+ }
+ else
+ {
+ v.vector[0] = 0;
+ v.vector[1] = pixman_fixed_1;
+ v.vector[2] = pixman_fixed_1;
+ }
+
+ dx = linear->p2.x - linear->p1.x;
+ dy = linear->p2.y - linear->p1.y;
+
+ l = dx * dx + dy * dy;
+
+ if (l == 0)
+ return FALSE;
+
+ /*
+ * compute how much the input of the gradient walked changes
+ * when moving vertically through the whole image
+ */
+ inc = height * (double) pixman_fixed_1 * pixman_fixed_1 *
+ (dx * v.vector[0] + dy * v.vector[1]) /
+ (v.vector[2] * (double) l);
+
+ /* check that casting to integer would result in 0 */
+ if (-1 < inc && inc < 1)
+ return TRUE;
+
+ return FALSE;
+}
+
+static uint32_t *
+linear_get_scanline_narrow (pixman_iter_t *iter,
+ const uint32_t *mask)
+{
+ pixman_image_t *image = iter->image;
+ int x = iter->x;
+ int y = iter->y;
+ int width = iter->width;
+ uint32_t * buffer = iter->buffer;
+
+ pixman_vector_t v, unit;
+ pixman_fixed_32_32_t l;
+ pixman_fixed_48_16_t dx, dy;
+ gradient_t *gradient = (gradient_t *)image;
+ linear_gradient_t *linear = (linear_gradient_t *)image;
+ uint32_t *end = buffer + width;
+ pixman_gradient_walker_t walker;
+
+ _pixman_gradient_walker_init (&walker, gradient, image->common.repeat);
+
+ /* reference point is the center of the pixel */
+ v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
+ v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
+ v.vector[2] = pixman_fixed_1;
+
+ if (image->common.transform)
+ {
+ if (!pixman_transform_point_3d (image->common.transform, &v))
+ return iter->buffer;
+
+ unit.vector[0] = image->common.transform->matrix[0][0];
+ unit.vector[1] = image->common.transform->matrix[1][0];
+ unit.vector[2] = image->common.transform->matrix[2][0];
+ }
+ else
+ {
+ unit.vector[0] = pixman_fixed_1;
+ unit.vector[1] = 0;
+ unit.vector[2] = 0;
+ }
+
+ dx = linear->p2.x - linear->p1.x;
+ dy = linear->p2.y - linear->p1.y;
+
+ l = dx * dx + dy * dy;
+
+ if (l == 0 || unit.vector[2] == 0)
+ {
+ /* affine transformation only */
+ pixman_fixed_32_32_t t, next_inc;
+ double inc;
+
+ if (l == 0 || v.vector[2] == 0)
+ {
+ t = 0;
+ inc = 0;
+ }
+ else
+ {
+ double invden, v2;
+
+ invden = pixman_fixed_1 * (double) pixman_fixed_1 /
+ (l * (double) v.vector[2]);
+ v2 = v.vector[2] * (1. / pixman_fixed_1);
+ t = ((dx * v.vector[0] + dy * v.vector[1]) -
+ (dx * linear->p1.x + dy * linear->p1.y) * v2) * invden;
+ inc = (dx * unit.vector[0] + dy * unit.vector[1]) * invden;
+ }
+ next_inc = 0;
+
+ if (((pixman_fixed_32_32_t )(inc * width)) == 0)
+ {
+ register uint32_t color;
+
+ color = _pixman_gradient_walker_pixel (&walker, t);
+ while (buffer < end)
+ *buffer++ = color;
+ }
+ else
+ {
+ int i;
+
+ i = 0;
+ while (buffer < end)
+ {
+ if (!mask || *mask++)
+ {
+ *buffer = _pixman_gradient_walker_pixel (&walker,
+ t + next_inc);
+ }
+ i++;
+ next_inc = inc * i;
+ buffer++;
+ }
+ }
+ }
+ else
+ {
+ /* projective transformation */
+ double t;
+
+ t = 0;
+
+ while (buffer < end)
+ {
+ if (!mask || *mask++)
+ {
+ if (v.vector[2] != 0)
+ {
+ double invden, v2;
+
+ invden = pixman_fixed_1 * (double) pixman_fixed_1 /
+ (l * (double) v.vector[2]);
+ v2 = v.vector[2] * (1. / pixman_fixed_1);
+ t = ((dx * v.vector[0] + dy * v.vector[1]) -
+ (dx * linear->p1.x + dy * linear->p1.y) * v2) * invden;
+ }
+
+ *buffer = _pixman_gradient_walker_pixel (&walker, t);
+ }
+
+ ++buffer;
+
+ v.vector[0] += unit.vector[0];
+ v.vector[1] += unit.vector[1];
+ v.vector[2] += unit.vector[2];
+ }
+ }
+
+ iter->y++;
+
+ return iter->buffer;
+}
+
+static uint32_t *
+linear_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+ uint32_t *buffer = linear_get_scanline_narrow (iter, NULL);
+
+ pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
+
+ return buffer;
+}
+
+void
+_pixman_linear_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter)
+{
+ if (linear_gradient_is_horizontal (
+ iter->image, iter->x, iter->y, iter->width, iter->height))
+ {
+ if (iter->flags & ITER_NARROW)
+ linear_get_scanline_narrow (iter, NULL);
+ else
+ linear_get_scanline_wide (iter, NULL);
+
+ iter->get_scanline = _pixman_iter_get_scanline_noop;
+ }
+ else
+ {
+ if (iter->flags & ITER_NARROW)
+ iter->get_scanline = linear_get_scanline_narrow;
+ else
+ iter->get_scanline = linear_get_scanline_wide;
+ }
+}
+
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_linear_gradient (pixman_point_fixed_t * p1,
+ pixman_point_fixed_t * p2,
+ const pixman_gradient_stop_t *stops,
+ int n_stops)
+{
+ pixman_image_t *image;
+ linear_gradient_t *linear;
+
+ image = _pixman_image_allocate ();
+
+ if (!image)
+ return NULL;
+
+ linear = &image->linear;
+
+ if (!_pixman_init_gradient (&linear->common, stops, n_stops))
+ {
+ free (image);
+ return NULL;
+ }
+
+ linear->p1 = *p1;
+ linear->p2 = *p2;
+
+ image->type = LINEAR;
+
+ return image;
+}
+
diff --git a/pixman/pixman/pixman-private.h b/pixman/pixman/pixman-private.h
index ee7f4d676..658aeea8a 100644
--- a/pixman/pixman/pixman-private.h
+++ b/pixman/pixman/pixman-private.h
@@ -212,14 +212,19 @@ typedef enum
struct pixman_iter_t
{
- pixman_iter_get_scanline_t get_scanline;
- pixman_iter_write_back_t write_back;
-
+ /* These are initialized by _pixman_implementation_{src,dest}_init */
pixman_image_t * image;
uint32_t * buffer;
int x, y;
int width;
+ int height;
+ iter_flags_t flags;
+ /* These function pointers are initialized by the implementation */
+ pixman_iter_get_scanline_t get_scanline;
+ pixman_iter_write_back_t write_back;
+
+ /* These fields are scratch data that implementations can use */
uint8_t * bits;
int stride;
};
@@ -228,39 +233,22 @@ void
_pixman_bits_image_setup_accessors (bits_image_t *image);
void
-_pixman_bits_image_src_iter_init (pixman_image_t *image,
- pixman_iter_t *iter,
- int x, int y, int width, int height,
- uint8_t *buffer, iter_flags_t flags);
+_pixman_bits_image_src_iter_init (pixman_image_t *image, pixman_iter_t *iter);
+
void
-_pixman_bits_image_dest_iter_init (pixman_image_t *image,
- pixman_iter_t *iter,
- int x, int y, int width, int height,
- uint8_t *buffer, iter_flags_t flags);
+_pixman_bits_image_dest_iter_init (pixman_image_t *image, pixman_iter_t *iter);
void
-_pixman_solid_fill_iter_init (pixman_image_t *image,
- pixman_iter_t *iter,
- int x, int y, int width, int height,
- uint8_t *buffer, iter_flags_t flags);
+_pixman_solid_fill_iter_init (pixman_image_t *image, pixman_iter_t *iter);
void
-_pixman_linear_gradient_iter_init (pixman_image_t *image,
- pixman_iter_t *iter,
- int x, int y, int width, int height,
- uint8_t *buffer, iter_flags_t flags);
+_pixman_linear_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter);
void
-_pixman_radial_gradient_iter_init (pixman_image_t *image,
- pixman_iter_t *iter,
- int x, int y, int width, int height,
- uint8_t *buffer, iter_flags_t flags);
+_pixman_radial_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter);
void
-_pixman_conical_gradient_iter_init (pixman_image_t *image,
- pixman_iter_t *iter,
- int x, int y, int width, int height,
- uint8_t *buffer, iter_flags_t flags);
+_pixman_conical_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter);
pixman_image_t *
_pixman_image_allocate (void);
@@ -408,14 +396,7 @@ typedef pixman_bool_t (*pixman_fill_func_t) (pixman_implementation_t *imp,
int height,
uint32_t xor);
typedef void (*pixman_iter_init_func_t) (pixman_implementation_t *imp,
- pixman_iter_t *iter,
- pixman_image_t *image,
- int x,
- int y,
- int width,
- int height,
- uint8_t *buffer,
- iter_flags_t flags);
+ pixman_iter_t *iter);
void _pixman_setup_combiner_functions_32 (pixman_implementation_t *imp);
void _pixman_setup_combiner_functions_64 (pixman_implementation_t *imp);
diff --git a/pixman/pixman/pixman-radial-gradient.c b/pixman/pixman/pixman-radial-gradient.c
index 6523b8259..63c712cc2 100644
--- a/pixman/pixman/pixman-radial-gradient.c
+++ b/pixman/pixman/pixman-radial-gradient.c
@@ -1,463 +1,460 @@
-/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
-/*
- *
- * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
- * Copyright © 2000 SuSE, Inc.
- * 2005 Lars Knoll & Zack Rusin, Trolltech
- * Copyright © 2007 Red Hat, Inc.
- *
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Keith Packard not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission. Keith Packard makes no
- * representations about the suitability of this software for any purpose. It
- * is provided "as is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include <stdlib.h>
-#include <math.h>
-#include "pixman-private.h"
-
-static inline pixman_fixed_32_32_t
-dot (pixman_fixed_48_16_t x1,
- pixman_fixed_48_16_t y1,
- pixman_fixed_48_16_t z1,
- pixman_fixed_48_16_t x2,
- pixman_fixed_48_16_t y2,
- pixman_fixed_48_16_t z2)
-{
- /*
- * Exact computation, assuming that the input values can
- * be represented as pixman_fixed_16_16_t
- */
- return x1 * x2 + y1 * y2 + z1 * z2;
-}
-
-static inline double
-fdot (double x1,
- double y1,
- double z1,
- double x2,
- double y2,
- double z2)
-{
- /*
- * Error can be unbound in some special cases.
- * Using clever dot product algorithms (for example compensated
- * dot product) would improve this but make the code much less
- * obvious
- */
- return x1 * x2 + y1 * y2 + z1 * z2;
-}
-
-static uint32_t
-radial_compute_color (double a,
- double b,
- double c,
- double inva,
- double dr,
- double mindr,
- pixman_gradient_walker_t *walker,
- pixman_repeat_t repeat)
-{
- /*
- * In this function error propagation can lead to bad results:
- * - det can have an unbound error (if b*b-a*c is very small),
- * potentially making it the opposite sign of what it should have been
- * (thus clearing a pixel that would have been colored or vice-versa)
- * or propagating the error to sqrtdet;
- * if det has the wrong sign or b is very small, this can lead to bad
- * results
- *
- * - the algorithm used to compute the solutions of the quadratic
- * equation is not numerically stable (but saves one division compared
- * to the numerically stable one);
- * this can be a problem if a*c is much smaller than b*b
- *
- * - the above problems are worse if a is small (as inva becomes bigger)
- */
- double det;
-
- if (a == 0)
- {
- double t;
-
- if (b == 0)
- return 0;
-
- t = pixman_fixed_1 / 2 * c / b;
- if (repeat == PIXMAN_REPEAT_NONE)
- {
- if (0 <= t && t <= pixman_fixed_1)
- return _pixman_gradient_walker_pixel (walker, t);
- }
- else
- {
- if (t * dr > mindr)
- return _pixman_gradient_walker_pixel (walker, t);
- }
-
- return 0;
- }
-
- det = fdot (b, a, 0, b, -c, 0);
- if (det >= 0)
- {
- double sqrtdet, t0, t1;
-
- sqrtdet = sqrt (det);
- t0 = (b + sqrtdet) * inva;
- t1 = (b - sqrtdet) * inva;
-
- if (repeat == PIXMAN_REPEAT_NONE)
- {
- if (0 <= t0 && t0 <= pixman_fixed_1)
- return _pixman_gradient_walker_pixel (walker, t0);
- else if (0 <= t1 && t1 <= pixman_fixed_1)
- return _pixman_gradient_walker_pixel (walker, t1);
- }
- else
- {
- if (t0 * dr > mindr)
- return _pixman_gradient_walker_pixel (walker, t0);
- else if (t1 * dr > mindr)
- return _pixman_gradient_walker_pixel (walker, t1);
- }
- }
-
- return 0;
-}
-
-static uint32_t *
-radial_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
-{
- /*
- * Implementation of radial gradients following the PDF specification.
- * See section 8.7.4.5.4 Type 3 (Radial) Shadings of the PDF Reference
- * Manual (PDF 32000-1:2008 at the time of this writing).
- *
- * In the radial gradient problem we are given two circles (c₁,r₁) and
- * (c₂,r₂) that define the gradient itself.
- *
- * Mathematically the gradient can be defined as the family of circles
- *
- * ((1-t)·c₁ + t·(c₂), (1-t)·r₁ + t·r₂)
- *
- * excluding those circles whose radius would be < 0. When a point
- * belongs to more than one circle, the one with a bigger t is the only
- * one that contributes to its color. When a point does not belong
- * to any of the circles, it is transparent black, i.e. RGBA (0, 0, 0, 0).
- * Further limitations on the range of values for t are imposed when
- * the gradient is not repeated, namely t must belong to [0,1].
- *
- * The graphical result is the same as drawing the valid (radius > 0)
- * circles with increasing t in [-inf, +inf] (or in [0,1] if the gradient
- * is not repeated) using SOURCE operatior composition.
- *
- * It looks like a cone pointing towards the viewer if the ending circle
- * is smaller than the starting one, a cone pointing inside the page if
- * the starting circle is the smaller one and like a cylinder if they
- * have the same radius.
- *
- * What we actually do is, given the point whose color we are interested
- * in, compute the t values for that point, solving for t in:
- *
- * length((1-t)·c₁ + t·(c₂) - p) = (1-t)·r₁ + t·r₂
- *
- * Let's rewrite it in a simpler way, by defining some auxiliary
- * variables:
- *
- * cd = c₂ - c₁
- * pd = p - c₁
- * dr = r₂ - r₁
- * lenght(t·cd - pd) = r₁ + t·dr
- *
- * which actually means
- *
- * hypot(t·cdx - pdx, t·cdy - pdy) = r₁ + t·dr
- *
- * or
- *
- * ⎷((t·cdx - pdx)² + (t·cdy - pdy)²) = r₁ + t·dr.
- *
- * If we impose (as stated earlier) that r₁ + t·dr >= 0, it becomes:
- *
- * (t·cdx - pdx)² + (t·cdy - pdy)² = (r₁ + t·dr)²
- *
- * where we can actually expand the squares and solve for t:
- *
- * t²cdx² - 2t·cdx·pdx + pdx² + t²cdy² - 2t·cdy·pdy + pdy² =
- * = r₁² + 2·r₁·t·dr + t²·dr²
- *
- * (cdx² + cdy² - dr²)t² - 2(cdx·pdx + cdy·pdy + r₁·dr)t +
- * (pdx² + pdy² - r₁²) = 0
- *
- * A = cdx² + cdy² - dr²
- * B = pdx·cdx + pdy·cdy + r₁·dr
- * C = pdx² + pdy² - r₁²
- * At² - 2Bt + C = 0
- *
- * The solutions (unless the equation degenerates because of A = 0) are:
- *
- * t = (B ± ⎷(B² - A·C)) / A
- *
- * The solution we are going to prefer is the bigger one, unless the
- * radius associated to it is negative (or it falls outside the valid t
- * range).
- *
- * Additional observations (useful for optimizations):
- * A does not depend on p
- *
- * A < 0 <=> one of the two circles completely contains the other one
- * <=> for every p, the radiuses associated with the two t solutions
- * have opposite sign
- */
- pixman_image_t *image = iter->image;
- int x = iter->x;
- int y = iter->y;
- int width = iter->width;
- uint32_t *buffer = iter->buffer;
-
- gradient_t *gradient = (gradient_t *)image;
- radial_gradient_t *radial = (radial_gradient_t *)image;
- uint32_t *end = buffer + width;
- pixman_gradient_walker_t walker;
- pixman_vector_t v, unit;
-
- /* reference point is the center of the pixel */
- v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
- v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
- v.vector[2] = pixman_fixed_1;
-
- _pixman_gradient_walker_init (&walker, gradient, image->common.repeat);
-
- if (image->common.transform)
- {
- if (!pixman_transform_point_3d (image->common.transform, &v))
- return iter->buffer;
-
- unit.vector[0] = image->common.transform->matrix[0][0];
- unit.vector[1] = image->common.transform->matrix[1][0];
- unit.vector[2] = image->common.transform->matrix[2][0];
- }
- else
- {
- unit.vector[0] = pixman_fixed_1;
- unit.vector[1] = 0;
- unit.vector[2] = 0;
- }
-
- if (unit.vector[2] == 0 && v.vector[2] == pixman_fixed_1)
- {
- /*
- * Given:
- *
- * t = (B ± ⎷(B² - A·C)) / A
- *
- * where
- *
- * A = cdx² + cdy² - dr²
- * B = pdx·cdx + pdy·cdy + r₁·dr
- * C = pdx² + pdy² - r₁²
- * det = B² - A·C
- *
- * Since we have an affine transformation, we know that (pdx, pdy)
- * increase linearly with each pixel,
- *
- * pdx = pdx₀ + n·ux,
- * pdy = pdy₀ + n·uy,
- *
- * we can then express B, C and det through multiple differentiation.
- */
- pixman_fixed_32_32_t b, db, c, dc, ddc;
-
- /* warning: this computation may overflow */
- v.vector[0] -= radial->c1.x;
- v.vector[1] -= radial->c1.y;
-
- /*
- * B and C are computed and updated exactly.
- * If fdot was used instead of dot, in the worst case it would
- * lose 11 bits of precision in each of the multiplication and
- * summing up would zero out all the bit that were preserved,
- * thus making the result 0 instead of the correct one.
- * This would mean a worst case of unbound relative error or
- * about 2^10 absolute error
- */
- b = dot (v.vector[0], v.vector[1], radial->c1.radius,
- radial->delta.x, radial->delta.y, radial->delta.radius);
- db = dot (unit.vector[0], unit.vector[1], 0,
- radial->delta.x, radial->delta.y, 0);
-
- c = dot (v.vector[0], v.vector[1],
- -((pixman_fixed_48_16_t) radial->c1.radius),
- v.vector[0], v.vector[1], radial->c1.radius);
- dc = dot (2 * (pixman_fixed_48_16_t) v.vector[0] + unit.vector[0],
- 2 * (pixman_fixed_48_16_t) v.vector[1] + unit.vector[1],
- 0,
- unit.vector[0], unit.vector[1], 0);
- ddc = 2 * dot (unit.vector[0], unit.vector[1], 0,
- unit.vector[0], unit.vector[1], 0);
-
- while (buffer < end)
- {
- if (!mask || *mask++)
- {
- *buffer = radial_compute_color (radial->a, b, c,
- radial->inva,
- radial->delta.radius,
- radial->mindr,
- &walker,
- image->common.repeat);
- }
-
- b += db;
- c += dc;
- dc += ddc;
- ++buffer;
- }
- }
- else
- {
- /* projective */
- /* Warning:
- * error propagation guarantees are much looser than in the affine case
- */
- while (buffer < end)
- {
- if (!mask || *mask++)
- {
- if (v.vector[2] != 0)
- {
- double pdx, pdy, invv2, b, c;
-
- invv2 = 1. * pixman_fixed_1 / v.vector[2];
-
- pdx = v.vector[0] * invv2 - radial->c1.x;
- /* / pixman_fixed_1 */
-
- pdy = v.vector[1] * invv2 - radial->c1.y;
- /* / pixman_fixed_1 */
-
- b = fdot (pdx, pdy, radial->c1.radius,
- radial->delta.x, radial->delta.y,
- radial->delta.radius);
- /* / pixman_fixed_1 / pixman_fixed_1 */
-
- c = fdot (pdx, pdy, -radial->c1.radius,
- pdx, pdy, radial->c1.radius);
- /* / pixman_fixed_1 / pixman_fixed_1 */
-
- *buffer = radial_compute_color (radial->a, b, c,
- radial->inva,
- radial->delta.radius,
- radial->mindr,
- &walker,
- image->common.repeat);
- }
- else
- {
- *buffer = 0;
- }
- }
-
- ++buffer;
-
- v.vector[0] += unit.vector[0];
- v.vector[1] += unit.vector[1];
- v.vector[2] += unit.vector[2];
- }
- }
-
- iter->y++;
- return iter->buffer;
-}
-
-static uint32_t *
-radial_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
-{
- uint32_t *buffer = radial_get_scanline_narrow (iter, NULL);
-
- pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
-
- return buffer;
-}
-
-void
-_pixman_radial_gradient_iter_init (pixman_image_t *image,
- pixman_iter_t *iter,
- int x, int y, int width, int height,
- uint8_t *buffer, iter_flags_t flags)
-{
- if (flags & ITER_NARROW)
- iter->get_scanline = radial_get_scanline_narrow;
- else
- iter->get_scanline = radial_get_scanline_wide;
-}
-
-PIXMAN_EXPORT pixman_image_t *
-pixman_image_create_radial_gradient (pixman_point_fixed_t * inner,
- pixman_point_fixed_t * outer,
- pixman_fixed_t inner_radius,
- pixman_fixed_t outer_radius,
- const pixman_gradient_stop_t *stops,
- int n_stops)
-{
- pixman_image_t *image;
- radial_gradient_t *radial;
-
- image = _pixman_image_allocate ();
-
- if (!image)
- return NULL;
-
- radial = &image->radial;
-
- if (!_pixman_init_gradient (&radial->common, stops, n_stops))
- {
- free (image);
- return NULL;
- }
-
- image->type = RADIAL;
-
- radial->c1.x = inner->x;
- radial->c1.y = inner->y;
- radial->c1.radius = inner_radius;
- radial->c2.x = outer->x;
- radial->c2.y = outer->y;
- radial->c2.radius = outer_radius;
-
- /* warning: this computations may overflow */
- radial->delta.x = radial->c2.x - radial->c1.x;
- radial->delta.y = radial->c2.y - radial->c1.y;
- radial->delta.radius = radial->c2.radius - radial->c1.radius;
-
- /* computed exactly, then cast to double -> every bit of the double
- representation is correct (53 bits) */
- radial->a = dot (radial->delta.x, radial->delta.y, -radial->delta.radius,
- radial->delta.x, radial->delta.y, radial->delta.radius);
- if (radial->a != 0)
- radial->inva = 1. * pixman_fixed_1 / radial->a;
-
- radial->mindr = -1. * pixman_fixed_1 * radial->c1.radius;
-
- return image;
-}
-
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ *
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ * Copyright © 2000 SuSE, Inc.
+ * 2005 Lars Knoll & Zack Rusin, Trolltech
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. Keith Packard makes no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdlib.h>
+#include <math.h>
+#include "pixman-private.h"
+
+static inline pixman_fixed_32_32_t
+dot (pixman_fixed_48_16_t x1,
+ pixman_fixed_48_16_t y1,
+ pixman_fixed_48_16_t z1,
+ pixman_fixed_48_16_t x2,
+ pixman_fixed_48_16_t y2,
+ pixman_fixed_48_16_t z2)
+{
+ /*
+ * Exact computation, assuming that the input values can
+ * be represented as pixman_fixed_16_16_t
+ */
+ return x1 * x2 + y1 * y2 + z1 * z2;
+}
+
+static inline double
+fdot (double x1,
+ double y1,
+ double z1,
+ double x2,
+ double y2,
+ double z2)
+{
+ /*
+ * Error can be unbound in some special cases.
+ * Using clever dot product algorithms (for example compensated
+ * dot product) would improve this but make the code much less
+ * obvious
+ */
+ return x1 * x2 + y1 * y2 + z1 * z2;
+}
+
+static uint32_t
+radial_compute_color (double a,
+ double b,
+ double c,
+ double inva,
+ double dr,
+ double mindr,
+ pixman_gradient_walker_t *walker,
+ pixman_repeat_t repeat)
+{
+ /*
+ * In this function error propagation can lead to bad results:
+ * - det can have an unbound error (if b*b-a*c is very small),
+ * potentially making it the opposite sign of what it should have been
+ * (thus clearing a pixel that would have been colored or vice-versa)
+ * or propagating the error to sqrtdet;
+ * if det has the wrong sign or b is very small, this can lead to bad
+ * results
+ *
+ * - the algorithm used to compute the solutions of the quadratic
+ * equation is not numerically stable (but saves one division compared
+ * to the numerically stable one);
+ * this can be a problem if a*c is much smaller than b*b
+ *
+ * - the above problems are worse if a is small (as inva becomes bigger)
+ */
+ double det;
+
+ if (a == 0)
+ {
+ double t;
+
+ if (b == 0)
+ return 0;
+
+ t = pixman_fixed_1 / 2 * c / b;
+ if (repeat == PIXMAN_REPEAT_NONE)
+ {
+ if (0 <= t && t <= pixman_fixed_1)
+ return _pixman_gradient_walker_pixel (walker, t);
+ }
+ else
+ {
+ if (t * dr > mindr)
+ return _pixman_gradient_walker_pixel (walker, t);
+ }
+
+ return 0;
+ }
+
+ det = fdot (b, a, 0, b, -c, 0);
+ if (det >= 0)
+ {
+ double sqrtdet, t0, t1;
+
+ sqrtdet = sqrt (det);
+ t0 = (b + sqrtdet) * inva;
+ t1 = (b - sqrtdet) * inva;
+
+ if (repeat == PIXMAN_REPEAT_NONE)
+ {
+ if (0 <= t0 && t0 <= pixman_fixed_1)
+ return _pixman_gradient_walker_pixel (walker, t0);
+ else if (0 <= t1 && t1 <= pixman_fixed_1)
+ return _pixman_gradient_walker_pixel (walker, t1);
+ }
+ else
+ {
+ if (t0 * dr > mindr)
+ return _pixman_gradient_walker_pixel (walker, t0);
+ else if (t1 * dr > mindr)
+ return _pixman_gradient_walker_pixel (walker, t1);
+ }
+ }
+
+ return 0;
+}
+
+static uint32_t *
+radial_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
+{
+ /*
+ * Implementation of radial gradients following the PDF specification.
+ * See section 8.7.4.5.4 Type 3 (Radial) Shadings of the PDF Reference
+ * Manual (PDF 32000-1:2008 at the time of this writing).
+ *
+ * In the radial gradient problem we are given two circles (c₁,r₁) and
+ * (c₂,r₂) that define the gradient itself.
+ *
+ * Mathematically the gradient can be defined as the family of circles
+ *
+ * ((1-t)·c₁ + t·(c₂), (1-t)·r₁ + t·r₂)
+ *
+ * excluding those circles whose radius would be < 0. When a point
+ * belongs to more than one circle, the one with a bigger t is the only
+ * one that contributes to its color. When a point does not belong
+ * to any of the circles, it is transparent black, i.e. RGBA (0, 0, 0, 0).
+ * Further limitations on the range of values for t are imposed when
+ * the gradient is not repeated, namely t must belong to [0,1].
+ *
+ * The graphical result is the same as drawing the valid (radius > 0)
+ * circles with increasing t in [-inf, +inf] (or in [0,1] if the gradient
+ * is not repeated) using SOURCE operatior composition.
+ *
+ * It looks like a cone pointing towards the viewer if the ending circle
+ * is smaller than the starting one, a cone pointing inside the page if
+ * the starting circle is the smaller one and like a cylinder if they
+ * have the same radius.
+ *
+ * What we actually do is, given the point whose color we are interested
+ * in, compute the t values for that point, solving for t in:
+ *
+ * length((1-t)·c₁ + t·(c₂) - p) = (1-t)·r₁ + t·r₂
+ *
+ * Let's rewrite it in a simpler way, by defining some auxiliary
+ * variables:
+ *
+ * cd = c₂ - c₁
+ * pd = p - c₁
+ * dr = r₂ - r₁
+ * lenght(t·cd - pd) = r₁ + t·dr
+ *
+ * which actually means
+ *
+ * hypot(t·cdx - pdx, t·cdy - pdy) = r₁ + t·dr
+ *
+ * or
+ *
+ * ⎷((t·cdx - pdx)² + (t·cdy - pdy)²) = r₁ + t·dr.
+ *
+ * If we impose (as stated earlier) that r₁ + t·dr >= 0, it becomes:
+ *
+ * (t·cdx - pdx)² + (t·cdy - pdy)² = (r₁ + t·dr)²
+ *
+ * where we can actually expand the squares and solve for t:
+ *
+ * t²cdx² - 2t·cdx·pdx + pdx² + t²cdy² - 2t·cdy·pdy + pdy² =
+ * = r₁² + 2·r₁·t·dr + t²·dr²
+ *
+ * (cdx² + cdy² - dr²)t² - 2(cdx·pdx + cdy·pdy + r₁·dr)t +
+ * (pdx² + pdy² - r₁²) = 0
+ *
+ * A = cdx² + cdy² - dr²
+ * B = pdx·cdx + pdy·cdy + r₁·dr
+ * C = pdx² + pdy² - r₁²
+ * At² - 2Bt + C = 0
+ *
+ * The solutions (unless the equation degenerates because of A = 0) are:
+ *
+ * t = (B ± ⎷(B² - A·C)) / A
+ *
+ * The solution we are going to prefer is the bigger one, unless the
+ * radius associated to it is negative (or it falls outside the valid t
+ * range).
+ *
+ * Additional observations (useful for optimizations):
+ * A does not depend on p
+ *
+ * A < 0 <=> one of the two circles completely contains the other one
+ * <=> for every p, the radiuses associated with the two t solutions
+ * have opposite sign
+ */
+ pixman_image_t *image = iter->image;
+ int x = iter->x;
+ int y = iter->y;
+ int width = iter->width;
+ uint32_t *buffer = iter->buffer;
+
+ gradient_t *gradient = (gradient_t *)image;
+ radial_gradient_t *radial = (radial_gradient_t *)image;
+ uint32_t *end = buffer + width;
+ pixman_gradient_walker_t walker;
+ pixman_vector_t v, unit;
+
+ /* reference point is the center of the pixel */
+ v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
+ v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
+ v.vector[2] = pixman_fixed_1;
+
+ _pixman_gradient_walker_init (&walker, gradient, image->common.repeat);
+
+ if (image->common.transform)
+ {
+ if (!pixman_transform_point_3d (image->common.transform, &v))
+ return iter->buffer;
+
+ unit.vector[0] = image->common.transform->matrix[0][0];
+ unit.vector[1] = image->common.transform->matrix[1][0];
+ unit.vector[2] = image->common.transform->matrix[2][0];
+ }
+ else
+ {
+ unit.vector[0] = pixman_fixed_1;
+ unit.vector[1] = 0;
+ unit.vector[2] = 0;
+ }
+
+ if (unit.vector[2] == 0 && v.vector[2] == pixman_fixed_1)
+ {
+ /*
+ * Given:
+ *
+ * t = (B ± ⎷(B² - A·C)) / A
+ *
+ * where
+ *
+ * A = cdx² + cdy² - dr²
+ * B = pdx·cdx + pdy·cdy + r₁·dr
+ * C = pdx² + pdy² - r₁²
+ * det = B² - A·C
+ *
+ * Since we have an affine transformation, we know that (pdx, pdy)
+ * increase linearly with each pixel,
+ *
+ * pdx = pdx₀ + n·ux,
+ * pdy = pdy₀ + n·uy,
+ *
+ * we can then express B, C and det through multiple differentiation.
+ */
+ pixman_fixed_32_32_t b, db, c, dc, ddc;
+
+ /* warning: this computation may overflow */
+ v.vector[0] -= radial->c1.x;
+ v.vector[1] -= radial->c1.y;
+
+ /*
+ * B and C are computed and updated exactly.
+ * If fdot was used instead of dot, in the worst case it would
+ * lose 11 bits of precision in each of the multiplication and
+ * summing up would zero out all the bit that were preserved,
+ * thus making the result 0 instead of the correct one.
+ * This would mean a worst case of unbound relative error or
+ * about 2^10 absolute error
+ */
+ b = dot (v.vector[0], v.vector[1], radial->c1.radius,
+ radial->delta.x, radial->delta.y, radial->delta.radius);
+ db = dot (unit.vector[0], unit.vector[1], 0,
+ radial->delta.x, radial->delta.y, 0);
+
+ c = dot (v.vector[0], v.vector[1],
+ -((pixman_fixed_48_16_t) radial->c1.radius),
+ v.vector[0], v.vector[1], radial->c1.radius);
+ dc = dot (2 * (pixman_fixed_48_16_t) v.vector[0] + unit.vector[0],
+ 2 * (pixman_fixed_48_16_t) v.vector[1] + unit.vector[1],
+ 0,
+ unit.vector[0], unit.vector[1], 0);
+ ddc = 2 * dot (unit.vector[0], unit.vector[1], 0,
+ unit.vector[0], unit.vector[1], 0);
+
+ while (buffer < end)
+ {
+ if (!mask || *mask++)
+ {
+ *buffer = radial_compute_color (radial->a, b, c,
+ radial->inva,
+ radial->delta.radius,
+ radial->mindr,
+ &walker,
+ image->common.repeat);
+ }
+
+ b += db;
+ c += dc;
+ dc += ddc;
+ ++buffer;
+ }
+ }
+ else
+ {
+ /* projective */
+ /* Warning:
+ * error propagation guarantees are much looser than in the affine case
+ */
+ while (buffer < end)
+ {
+ if (!mask || *mask++)
+ {
+ if (v.vector[2] != 0)
+ {
+ double pdx, pdy, invv2, b, c;
+
+ invv2 = 1. * pixman_fixed_1 / v.vector[2];
+
+ pdx = v.vector[0] * invv2 - radial->c1.x;
+ /* / pixman_fixed_1 */
+
+ pdy = v.vector[1] * invv2 - radial->c1.y;
+ /* / pixman_fixed_1 */
+
+ b = fdot (pdx, pdy, radial->c1.radius,
+ radial->delta.x, radial->delta.y,
+ radial->delta.radius);
+ /* / pixman_fixed_1 / pixman_fixed_1 */
+
+ c = fdot (pdx, pdy, -radial->c1.radius,
+ pdx, pdy, radial->c1.radius);
+ /* / pixman_fixed_1 / pixman_fixed_1 */
+
+ *buffer = radial_compute_color (radial->a, b, c,
+ radial->inva,
+ radial->delta.radius,
+ radial->mindr,
+ &walker,
+ image->common.repeat);
+ }
+ else
+ {
+ *buffer = 0;
+ }
+ }
+
+ ++buffer;
+
+ v.vector[0] += unit.vector[0];
+ v.vector[1] += unit.vector[1];
+ v.vector[2] += unit.vector[2];
+ }
+ }
+
+ iter->y++;
+ return iter->buffer;
+}
+
+static uint32_t *
+radial_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+ uint32_t *buffer = radial_get_scanline_narrow (iter, NULL);
+
+ pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
+
+ return buffer;
+}
+
+void
+_pixman_radial_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter)
+{
+ if (iter->flags & ITER_NARROW)
+ iter->get_scanline = radial_get_scanline_narrow;
+ else
+ iter->get_scanline = radial_get_scanline_wide;
+}
+
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_radial_gradient (pixman_point_fixed_t * inner,
+ pixman_point_fixed_t * outer,
+ pixman_fixed_t inner_radius,
+ pixman_fixed_t outer_radius,
+ const pixman_gradient_stop_t *stops,
+ int n_stops)
+{
+ pixman_image_t *image;
+ radial_gradient_t *radial;
+
+ image = _pixman_image_allocate ();
+
+ if (!image)
+ return NULL;
+
+ radial = &image->radial;
+
+ if (!_pixman_init_gradient (&radial->common, stops, n_stops))
+ {
+ free (image);
+ return NULL;
+ }
+
+ image->type = RADIAL;
+
+ radial->c1.x = inner->x;
+ radial->c1.y = inner->y;
+ radial->c1.radius = inner_radius;
+ radial->c2.x = outer->x;
+ radial->c2.y = outer->y;
+ radial->c2.radius = outer_radius;
+
+ /* warning: this computations may overflow */
+ radial->delta.x = radial->c2.x - radial->c1.x;
+ radial->delta.y = radial->c2.y - radial->c1.y;
+ radial->delta.radius = radial->c2.radius - radial->c1.radius;
+
+ /* computed exactly, then cast to double -> every bit of the double
+ representation is correct (53 bits) */
+ radial->a = dot (radial->delta.x, radial->delta.y, -radial->delta.radius,
+ radial->delta.x, radial->delta.y, radial->delta.radius);
+ if (radial->a != 0)
+ radial->inva = 1. * pixman_fixed_1 / radial->a;
+
+ radial->mindr = -1. * pixman_fixed_1 * radial->c1.radius;
+
+ return image;
+}
+
diff --git a/pixman/pixman/pixman-solid-fill.c b/pixman/pixman/pixman-solid-fill.c
index 67681f2c0..fcda3abb5 100644
--- a/pixman/pixman/pixman-solid-fill.c
+++ b/pixman/pixman/pixman-solid-fill.c
@@ -1,92 +1,89 @@
-/*
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007, 2009 Red Hat, Inc.
- * Copyright © 2009 Soren Sandmann
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of SuSE not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission. SuSE makes no representations about the
- * suitability of this software for any purpose. It is provided "as is"
- * without express or implied warranty.
- *
- * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
- * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include "pixman-private.h"
-
-void
-_pixman_solid_fill_iter_init (pixman_image_t *image,
- pixman_iter_t *iter,
- int x, int y, int width, int height,
- uint8_t *buffer, iter_flags_t flags)
-{
- if (flags & ITER_NARROW)
- {
- uint32_t *b = (uint32_t *)buffer;
- uint32_t *e = b + width;
- uint32_t color = image->solid.color_32;
-
- while (b < e)
- *(b++) = color;
- }
- else
- {
- uint64_t *b = (uint64_t *)buffer;
- uint64_t *e = b + width;
- uint64_t color = image->solid.color_64;
-
- while (b < e)
- *(b++) = color;
- }
-
- iter->get_scanline = _pixman_iter_get_scanline_noop;
-}
-
-static uint32_t
-color_to_uint32 (const pixman_color_t *color)
-{
- return
- (color->alpha >> 8 << 24) |
- (color->red >> 8 << 16) |
- (color->green & 0xff00) |
- (color->blue >> 8);
-}
-
-static uint64_t
-color_to_uint64 (const pixman_color_t *color)
-{
- return
- ((uint64_t)color->alpha << 48) |
- ((uint64_t)color->red << 32) |
- ((uint64_t)color->green << 16) |
- ((uint64_t)color->blue);
-}
-
-PIXMAN_EXPORT pixman_image_t *
-pixman_image_create_solid_fill (pixman_color_t *color)
-{
- pixman_image_t *img = _pixman_image_allocate ();
-
- if (!img)
- return NULL;
-
- img->type = SOLID;
- img->solid.color = *color;
- img->solid.color_32 = color_to_uint32 (color);
- img->solid.color_64 = color_to_uint64 (color);
-
- return img;
-}
-
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007, 2009 Red Hat, Inc.
+ * Copyright © 2009 Soren Sandmann
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission. SuSE makes no representations about the
+ * suitability of this software for any purpose. It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include "pixman-private.h"
+
+void
+_pixman_solid_fill_iter_init (pixman_image_t *image, pixman_iter_t *iter)
+{
+ if (iter->flags & ITER_NARROW)
+ {
+ uint32_t *b = (uint32_t *)iter->buffer;
+ uint32_t *e = b + iter->width;
+ uint32_t color = iter->image->solid.color_32;
+
+ while (b < e)
+ *(b++) = color;
+ }
+ else
+ {
+ uint64_t *b = (uint64_t *)iter->buffer;
+ uint64_t *e = b + iter->width;
+ uint64_t color = image->solid.color_64;
+
+ while (b < e)
+ *(b++) = color;
+ }
+
+ iter->get_scanline = _pixman_iter_get_scanline_noop;
+}
+
+static uint32_t
+color_to_uint32 (const pixman_color_t *color)
+{
+ return
+ (color->alpha >> 8 << 24) |
+ (color->red >> 8 << 16) |
+ (color->green & 0xff00) |
+ (color->blue >> 8);
+}
+
+static uint64_t
+color_to_uint64 (const pixman_color_t *color)
+{
+ return
+ ((uint64_t)color->alpha << 48) |
+ ((uint64_t)color->red << 32) |
+ ((uint64_t)color->green << 16) |
+ ((uint64_t)color->blue);
+}
+
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_solid_fill (pixman_color_t *color)
+{
+ pixman_image_t *img = _pixman_image_allocate ();
+
+ if (!img)
+ return NULL;
+
+ img->type = SOLID;
+ img->solid.color = *color;
+ img->solid.color_32 = color_to_uint32 (color);
+ img->solid.color_64 = color_to_uint64 (color);
+
+ return img;
+}
+
diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c
index 696005f75..a52a959f5 100644
--- a/pixman/pixman/pixman-sse2.c
+++ b/pixman/pixman/pixman-sse2.c
@@ -1,6077 +1,6076 @@
-/*
- * Copyright © 2008 Rodrigo Kumpera
- * Copyright © 2008 André Tupinambá
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Red Hat not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission. Red Hat makes no representations about the
- * suitability of this software for any purpose. It is provided "as is"
- * without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- *
- * Author: Rodrigo Kumpera (kumpera@gmail.com)
- * André Tupinambá (andrelrt@gmail.com)
- *
- * Based on work by Owen Taylor and Søren Sandmann
- */
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
-#include <emmintrin.h> /* for SSE2 intrinsics */
-#include "pixman-private.h"
-#include "pixman-combine32.h"
-#include "pixman-fast-path.h"
-
-static __m128i mask_0080;
-static __m128i mask_00ff;
-static __m128i mask_0101;
-static __m128i mask_ffff;
-static __m128i mask_ff000000;
-static __m128i mask_alpha;
-
-static __m128i mask_565_r;
-static __m128i mask_565_g1, mask_565_g2;
-static __m128i mask_565_b;
-static __m128i mask_red;
-static __m128i mask_green;
-static __m128i mask_blue;
-
-static __m128i mask_565_fix_rb;
-static __m128i mask_565_fix_g;
-
-static force_inline __m128i
-unpack_32_1x128 (uint32_t data)
-{
- return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
-}
-
-static force_inline void
-unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
-{
- *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
- *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
-}
-
-static force_inline __m128i
-unpack_565_to_8888 (__m128i lo)
-{
- __m128i r, g, b, rb, t;
-
- r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
- g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
- b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
-
- rb = _mm_or_si128 (r, b);
- t = _mm_and_si128 (rb, mask_565_fix_rb);
- t = _mm_srli_epi32 (t, 5);
- rb = _mm_or_si128 (rb, t);
-
- t = _mm_and_si128 (g, mask_565_fix_g);
- t = _mm_srli_epi32 (t, 6);
- g = _mm_or_si128 (g, t);
-
- return _mm_or_si128 (rb, g);
-}
-
-static force_inline void
-unpack_565_128_4x128 (__m128i data,
- __m128i* data0,
- __m128i* data1,
- __m128i* data2,
- __m128i* data3)
-{
- __m128i lo, hi;
-
- lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
- hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
-
- lo = unpack_565_to_8888 (lo);
- hi = unpack_565_to_8888 (hi);
-
- unpack_128_2x128 (lo, data0, data1);
- unpack_128_2x128 (hi, data2, data3);
-}
-
-static force_inline uint16_t
-pack_565_32_16 (uint32_t pixel)
-{
- return (uint16_t) (((pixel >> 8) & 0xf800) |
- ((pixel >> 5) & 0x07e0) |
- ((pixel >> 3) & 0x001f));
-}
-
-static force_inline __m128i
-pack_2x128_128 (__m128i lo, __m128i hi)
-{
- return _mm_packus_epi16 (lo, hi);
-}
-
-static force_inline __m128i
-pack_565_2x128_128 (__m128i lo, __m128i hi)
-{
- __m128i data;
- __m128i r, g1, g2, b;
-
- data = pack_2x128_128 (lo, hi);
-
- r = _mm_and_si128 (data, mask_565_r);
- g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
- g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
- b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
-
- return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
-}
-
-static force_inline __m128i
-pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
-{
- return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
- pack_565_2x128_128 (*xmm2, *xmm3));
-}
-
-static force_inline int
-is_opaque (__m128i x)
-{
- __m128i ffs = _mm_cmpeq_epi8 (x, x);
-
- return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
-}
-
-static force_inline int
-is_zero (__m128i x)
-{
- return _mm_movemask_epi8 (
- _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
-}
-
-static force_inline int
-is_transparent (__m128i x)
-{
- return (_mm_movemask_epi8 (
- _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
-}
-
-static force_inline __m128i
-expand_pixel_32_1x128 (uint32_t data)
-{
- return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
-}
-
-static force_inline __m128i
-expand_alpha_1x128 (__m128i data)
-{
- return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
- _MM_SHUFFLE (3, 3, 3, 3)),
- _MM_SHUFFLE (3, 3, 3, 3));
-}
-
-static force_inline void
-expand_alpha_2x128 (__m128i data_lo,
- __m128i data_hi,
- __m128i* alpha_lo,
- __m128i* alpha_hi)
-{
- __m128i lo, hi;
-
- lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
- hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
-
- *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
- *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
-}
-
-static force_inline void
-expand_alpha_rev_2x128 (__m128i data_lo,
- __m128i data_hi,
- __m128i* alpha_lo,
- __m128i* alpha_hi)
-{
- __m128i lo, hi;
-
- lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
- hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
- *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
- *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
-}
-
-static force_inline void
-pix_multiply_2x128 (__m128i* data_lo,
- __m128i* data_hi,
- __m128i* alpha_lo,
- __m128i* alpha_hi,
- __m128i* ret_lo,
- __m128i* ret_hi)
-{
- __m128i lo, hi;
-
- lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
- hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
- lo = _mm_adds_epu16 (lo, mask_0080);
- hi = _mm_adds_epu16 (hi, mask_0080);
- *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
- *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
-}
-
-static force_inline void
-pix_add_multiply_2x128 (__m128i* src_lo,
- __m128i* src_hi,
- __m128i* alpha_dst_lo,
- __m128i* alpha_dst_hi,
- __m128i* dst_lo,
- __m128i* dst_hi,
- __m128i* alpha_src_lo,
- __m128i* alpha_src_hi,
- __m128i* ret_lo,
- __m128i* ret_hi)
-{
- __m128i t1_lo, t1_hi;
- __m128i t2_lo, t2_hi;
-
- pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
- pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
-
- *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
- *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
-}
-
-static force_inline void
-negate_2x128 (__m128i data_lo,
- __m128i data_hi,
- __m128i* neg_lo,
- __m128i* neg_hi)
-{
- *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
- *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
-}
-
-static force_inline void
-invert_colors_2x128 (__m128i data_lo,
- __m128i data_hi,
- __m128i* inv_lo,
- __m128i* inv_hi)
-{
- __m128i lo, hi;
-
- lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
- hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
- *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
- *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
-}
-
-static force_inline void
-over_2x128 (__m128i* src_lo,
- __m128i* src_hi,
- __m128i* alpha_lo,
- __m128i* alpha_hi,
- __m128i* dst_lo,
- __m128i* dst_hi)
-{
- __m128i t1, t2;
-
- negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
-
- pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
-
- *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
- *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
-}
-
-static force_inline void
-over_rev_non_pre_2x128 (__m128i src_lo,
- __m128i src_hi,
- __m128i* dst_lo,
- __m128i* dst_hi)
-{
- __m128i lo, hi;
- __m128i alpha_lo, alpha_hi;
-
- expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
-
- lo = _mm_or_si128 (alpha_lo, mask_alpha);
- hi = _mm_or_si128 (alpha_hi, mask_alpha);
-
- invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
-
- pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
-
- over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
-}
-
-static force_inline void
-in_over_2x128 (__m128i* src_lo,
- __m128i* src_hi,
- __m128i* alpha_lo,
- __m128i* alpha_hi,
- __m128i* mask_lo,
- __m128i* mask_hi,
- __m128i* dst_lo,
- __m128i* dst_hi)
-{
- __m128i s_lo, s_hi;
- __m128i a_lo, a_hi;
-
- pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
- pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
-
- over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
-}
-
-/* load 4 pixels from a 16-byte boundary aligned address */
-static force_inline __m128i
-load_128_aligned (__m128i* src)
-{
- return _mm_load_si128 (src);
-}
-
-/* load 4 pixels from a unaligned address */
-static force_inline __m128i
-load_128_unaligned (const __m128i* src)
-{
- return _mm_loadu_si128 (src);
-}
-
-/* save 4 pixels using Write Combining memory on a 16-byte
- * boundary aligned address
- */
-static force_inline void
-save_128_write_combining (__m128i* dst,
- __m128i data)
-{
- _mm_stream_si128 (dst, data);
-}
-
-/* save 4 pixels on a 16-byte boundary aligned address */
-static force_inline void
-save_128_aligned (__m128i* dst,
- __m128i data)
-{
- _mm_store_si128 (dst, data);
-}
-
-/* save 4 pixels on a unaligned address */
-static force_inline void
-save_128_unaligned (__m128i* dst,
- __m128i data)
-{
- _mm_storeu_si128 (dst, data);
-}
-
-static force_inline __m128i
-load_32_1x128 (uint32_t data)
-{
- return _mm_cvtsi32_si128 (data);
-}
-
-static force_inline __m128i
-expand_alpha_rev_1x128 (__m128i data)
-{
- return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
-}
-
-static force_inline __m128i
-expand_pixel_8_1x128 (uint8_t data)
-{
- return _mm_shufflelo_epi16 (
- unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
-}
-
-static force_inline __m128i
-pix_multiply_1x128 (__m128i data,
- __m128i alpha)
-{
- return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
- mask_0080),
- mask_0101);
-}
-
-static force_inline __m128i
-pix_add_multiply_1x128 (__m128i* src,
- __m128i* alpha_dst,
- __m128i* dst,
- __m128i* alpha_src)
-{
- __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
- __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
-
- return _mm_adds_epu8 (t1, t2);
-}
-
-static force_inline __m128i
-negate_1x128 (__m128i data)
-{
- return _mm_xor_si128 (data, mask_00ff);
-}
-
-static force_inline __m128i
-invert_colors_1x128 (__m128i data)
-{
- return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
-}
-
-static force_inline __m128i
-over_1x128 (__m128i src, __m128i alpha, __m128i dst)
-{
- return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
-}
-
-static force_inline __m128i
-in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
-{
- return over_1x128 (pix_multiply_1x128 (*src, *mask),
- pix_multiply_1x128 (*alpha, *mask),
- *dst);
-}
-
-static force_inline __m128i
-over_rev_non_pre_1x128 (__m128i src, __m128i dst)
-{
- __m128i alpha = expand_alpha_1x128 (src);
-
- return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
- _mm_or_si128 (alpha, mask_alpha)),
- alpha,
- dst);
-}
-
-static force_inline uint32_t
-pack_1x128_32 (__m128i data)
-{
- return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
-}
-
-static force_inline __m128i
-expand565_16_1x128 (uint16_t pixel)
-{
- __m128i m = _mm_cvtsi32_si128 (pixel);
-
- m = unpack_565_to_8888 (m);
-
- return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
-}
-
-static force_inline uint32_t
-core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
-{
- uint8_t a;
- __m128i xmms;
-
- a = src >> 24;
-
- if (a == 0xff)
- {
- return src;
- }
- else if (src)
- {
- xmms = unpack_32_1x128 (src);
- return pack_1x128_32 (
- over_1x128 (xmms, expand_alpha_1x128 (xmms),
- unpack_32_1x128 (dst)));
- }
-
- return dst;
-}
-
-static force_inline uint32_t
-combine1 (const uint32_t *ps, const uint32_t *pm)
-{
- uint32_t s = *ps;
-
- if (pm)
- {
- __m128i ms, mm;
-
- mm = unpack_32_1x128 (*pm);
- mm = expand_alpha_1x128 (mm);
-
- ms = unpack_32_1x128 (s);
- ms = pix_multiply_1x128 (ms, mm);
-
- s = pack_1x128_32 (ms);
- }
-
- return s;
-}
-
-static force_inline __m128i
-combine4 (const __m128i *ps, const __m128i *pm)
-{
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_msk_lo, xmm_msk_hi;
- __m128i s;
-
- if (pm)
- {
- xmm_msk_lo = load_128_unaligned (pm);
-
- if (is_transparent (xmm_msk_lo))
- return _mm_setzero_si128 ();
- }
-
- s = load_128_unaligned (ps);
-
- if (pm)
- {
- unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
-
- expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_msk_lo, &xmm_msk_hi,
- &xmm_src_lo, &xmm_src_hi);
-
- s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
- }
-
- return s;
-}
-
-static force_inline void
-core_combine_over_u_sse2_mask (uint32_t * pd,
- const uint32_t* ps,
- const uint32_t* pm,
- int w)
-{
- uint32_t s, d;
-
- /* Align dst on a 16-byte boundary */
- while (w && ((unsigned long)pd & 15))
- {
- d = *pd;
- s = combine1 (ps, pm);
-
- if (s)
- *pd = core_combine_over_u_pixel_sse2 (s, d);
- pd++;
- ps++;
- pm++;
- w--;
- }
-
- while (w >= 4)
- {
- __m128i mask = load_128_unaligned ((__m128i *)pm);
-
- if (!is_zero (mask))
- {
- __m128i src;
- __m128i src_hi, src_lo;
- __m128i mask_hi, mask_lo;
- __m128i alpha_hi, alpha_lo;
-
- src = load_128_unaligned ((__m128i *)ps);
-
- if (is_opaque (_mm_and_si128 (src, mask)))
- {
- save_128_aligned ((__m128i *)pd, src);
- }
- else
- {
- __m128i dst = load_128_aligned ((__m128i *)pd);
- __m128i dst_hi, dst_lo;
-
- unpack_128_2x128 (mask, &mask_lo, &mask_hi);
- unpack_128_2x128 (src, &src_lo, &src_hi);
-
- expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
- pix_multiply_2x128 (&src_lo, &src_hi,
- &mask_lo, &mask_hi,
- &src_lo, &src_hi);
-
- unpack_128_2x128 (dst, &dst_lo, &dst_hi);
-
- expand_alpha_2x128 (src_lo, src_hi,
- &alpha_lo, &alpha_hi);
-
- over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
- &dst_lo, &dst_hi);
-
- save_128_aligned (
- (__m128i *)pd,
- pack_2x128_128 (dst_lo, dst_hi));
- }
- }
-
- pm += 4;
- ps += 4;
- pd += 4;
- w -= 4;
- }
- while (w)
- {
- d = *pd;
- s = combine1 (ps, pm);
-
- if (s)
- *pd = core_combine_over_u_pixel_sse2 (s, d);
- pd++;
- ps++;
- pm++;
-
- w--;
- }
-}
-
-static force_inline void
-core_combine_over_u_sse2_no_mask (uint32_t * pd,
- const uint32_t* ps,
- int w)
-{
- uint32_t s, d;
-
- /* Align dst on a 16-byte boundary */
- while (w && ((unsigned long)pd & 15))
- {
- d = *pd;
- s = *ps;
-
- if (s)
- *pd = core_combine_over_u_pixel_sse2 (s, d);
- pd++;
- ps++;
- w--;
- }
-
- while (w >= 4)
- {
- __m128i src;
- __m128i src_hi, src_lo, dst_hi, dst_lo;
- __m128i alpha_hi, alpha_lo;
-
- src = load_128_unaligned ((__m128i *)ps);
-
- if (!is_zero (src))
- {
- if (is_opaque (src))
- {
- save_128_aligned ((__m128i *)pd, src);
- }
- else
- {
- __m128i dst = load_128_aligned ((__m128i *)pd);
-
- unpack_128_2x128 (src, &src_lo, &src_hi);
- unpack_128_2x128 (dst, &dst_lo, &dst_hi);
-
- expand_alpha_2x128 (src_lo, src_hi,
- &alpha_lo, &alpha_hi);
- over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
- &dst_lo, &dst_hi);
-
- save_128_aligned (
- (__m128i *)pd,
- pack_2x128_128 (dst_lo, dst_hi));
- }
- }
-
- ps += 4;
- pd += 4;
- w -= 4;
- }
- while (w)
- {
- d = *pd;
- s = *ps;
-
- if (s)
- *pd = core_combine_over_u_pixel_sse2 (s, d);
- pd++;
- ps++;
-
- w--;
- }
-}
-
-static force_inline void
-sse2_combine_over_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * pd,
- const uint32_t * ps,
- const uint32_t * pm,
- int w)
-{
- if (pm)
- core_combine_over_u_sse2_mask (pd, ps, pm, w);
- else
- core_combine_over_u_sse2_no_mask (pd, ps, w);
-}
-
-static void
-sse2_combine_over_reverse_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * pd,
- const uint32_t * ps,
- const uint32_t * pm,
- int w)
-{
- uint32_t s, d;
-
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_alpha_lo, xmm_alpha_hi;
-
- /* Align dst on a 16-byte boundary */
- while (w &&
- ((unsigned long)pd & 15))
- {
- d = *pd;
- s = combine1 (ps, pm);
-
- *pd++ = core_combine_over_u_pixel_sse2 (d, s);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-
- while (w >= 4)
- {
- /* I'm loading unaligned because I'm not sure
- * about the address alignment.
- */
- xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_src_lo, &xmm_src_hi);
-
- /* rebuid the 4 pixel data and save*/
- save_128_aligned ((__m128i*)pd,
- pack_2x128_128 (xmm_src_lo, xmm_src_hi));
-
- w -= 4;
- ps += 4;
- pd += 4;
-
- if (pm)
- pm += 4;
- }
-
- while (w)
- {
- d = *pd;
- s = combine1 (ps, pm);
-
- *pd++ = core_combine_over_u_pixel_sse2 (d, s);
- ps++;
- w--;
- if (pm)
- pm++;
- }
-}
-
-static force_inline uint32_t
-core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
-{
- uint32_t maska = src >> 24;
-
- if (maska == 0)
- {
- return 0;
- }
- else if (maska != 0xff)
- {
- return pack_1x128_32 (
- pix_multiply_1x128 (unpack_32_1x128 (dst),
- expand_alpha_1x128 (unpack_32_1x128 (src))));
- }
-
- return dst;
-}
-
-static void
-sse2_combine_in_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * pd,
- const uint32_t * ps,
- const uint32_t * pm,
- int w)
-{
- uint32_t s, d;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
-
- while (w && ((unsigned long) pd & 15))
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_in_u_pixel_sse2 (d, s);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
- xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_dst_lo, &xmm_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned ((__m128i*)pd,
- pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- w -= 4;
- if (pm)
- pm += 4;
- }
-
- while (w)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_in_u_pixel_sse2 (d, s);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-}
-
-static void
-sse2_combine_in_reverse_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * pd,
- const uint32_t * ps,
- const uint32_t * pm,
- int w)
-{
- uint32_t s, d;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
-
- while (w && ((unsigned long) pd & 15))
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_in_u_pixel_sse2 (s, d);
- ps++;
- w--;
- if (pm)
- pm++;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
- xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_src_lo, &xmm_src_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- w -= 4;
- if (pm)
- pm += 4;
- }
-
- while (w)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_in_u_pixel_sse2 (s, d);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-}
-
-static void
-sse2_combine_out_reverse_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * pd,
- const uint32_t * ps,
- const uint32_t * pm,
- int w)
-{
- while (w && ((unsigned long) pd & 15))
- {
- uint32_t s = combine1 (ps, pm);
- uint32_t d = *pd;
-
- *pd++ = pack_1x128_32 (
- pix_multiply_1x128 (
- unpack_32_1x128 (d), negate_1x128 (
- expand_alpha_1x128 (unpack_32_1x128 (s)))));
-
- if (pm)
- pm++;
- ps++;
- w--;
- }
-
- while (w >= 4)
- {
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
-
- xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-
- pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_src_lo, &xmm_src_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- if (pm)
- pm += 4;
-
- w -= 4;
- }
-
- while (w)
- {
- uint32_t s = combine1 (ps, pm);
- uint32_t d = *pd;
-
- *pd++ = pack_1x128_32 (
- pix_multiply_1x128 (
- unpack_32_1x128 (d), negate_1x128 (
- expand_alpha_1x128 (unpack_32_1x128 (s)))));
- ps++;
- if (pm)
- pm++;
- w--;
- }
-}
-
-static void
-sse2_combine_out_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * pd,
- const uint32_t * ps,
- const uint32_t * pm,
- int w)
-{
- while (w && ((unsigned long) pd & 15))
- {
- uint32_t s = combine1 (ps, pm);
- uint32_t d = *pd;
-
- *pd++ = pack_1x128_32 (
- pix_multiply_1x128 (
- unpack_32_1x128 (s), negate_1x128 (
- expand_alpha_1x128 (unpack_32_1x128 (d)))));
- w--;
- ps++;
- if (pm)
- pm++;
- }
-
- while (w >= 4)
- {
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
-
- xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_dst_lo, &xmm_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- w -= 4;
- if (pm)
- pm += 4;
- }
-
- while (w)
- {
- uint32_t s = combine1 (ps, pm);
- uint32_t d = *pd;
-
- *pd++ = pack_1x128_32 (
- pix_multiply_1x128 (
- unpack_32_1x128 (s), negate_1x128 (
- expand_alpha_1x128 (unpack_32_1x128 (d)))));
- w--;
- ps++;
- if (pm)
- pm++;
- }
-}
-
-static force_inline uint32_t
-core_combine_atop_u_pixel_sse2 (uint32_t src,
- uint32_t dst)
-{
- __m128i s = unpack_32_1x128 (src);
- __m128i d = unpack_32_1x128 (dst);
-
- __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
- __m128i da = expand_alpha_1x128 (d);
-
- return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
-}
-
-static void
-sse2_combine_atop_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * pd,
- const uint32_t * ps,
- const uint32_t * pm,
- int w)
-{
- uint32_t s, d;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
- __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
-
- while (w && ((unsigned long) pd & 15))
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-
- while (w >= 4)
- {
- xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi);
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi);
-
- pix_add_multiply_2x128 (
- &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- w -= 4;
- if (pm)
- pm += 4;
- }
-
- while (w)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-}
-
-static force_inline uint32_t
-core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
- uint32_t dst)
-{
- __m128i s = unpack_32_1x128 (src);
- __m128i d = unpack_32_1x128 (dst);
-
- __m128i sa = expand_alpha_1x128 (s);
- __m128i da = negate_1x128 (expand_alpha_1x128 (d));
-
- return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
-}
-
-static void
-sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * pd,
- const uint32_t * ps,
- const uint32_t * pm,
- int w)
-{
- uint32_t s, d;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
- __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
-
- while (w && ((unsigned long) pd & 15))
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
- ps++;
- w--;
- if (pm)
- pm++;
- }
-
- while (w >= 4)
- {
- xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi);
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- pix_add_multiply_2x128 (
- &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- w -= 4;
- if (pm)
- pm += 4;
- }
-
- while (w)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
- ps++;
- w--;
- if (pm)
- pm++;
- }
-}
-
-static force_inline uint32_t
-core_combine_xor_u_pixel_sse2 (uint32_t src,
- uint32_t dst)
-{
- __m128i s = unpack_32_1x128 (src);
- __m128i d = unpack_32_1x128 (dst);
-
- __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
- __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
-
- return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
-}
-
-static void
-sse2_combine_xor_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- int w = width;
- uint32_t s, d;
- uint32_t* pd = dst;
- const uint32_t* ps = src;
- const uint32_t* pm = mask;
-
- __m128i xmm_src, xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
- __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
-
- while (w && ((unsigned long) pd & 15))
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-
- while (w >= 4)
- {
- xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
- xmm_dst = load_128_aligned ((__m128i*) pd);
-
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi);
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi);
- negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- pix_add_multiply_2x128 (
- &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- w -= 4;
- if (pm)
- pm += 4;
- }
-
- while (w)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-}
-
-static force_inline void
-sse2_combine_add_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- int w = width;
- uint32_t s, d;
- uint32_t* pd = dst;
- const uint32_t* ps = src;
- const uint32_t* pm = mask;
-
- while (w && (unsigned long)pd & 15)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- ps++;
- if (pm)
- pm++;
- *pd++ = _mm_cvtsi128_si32 (
- _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
- w--;
- }
-
- while (w >= 4)
- {
- __m128i s;
-
- s = combine4 ((__m128i*)ps, (__m128i*)pm);
-
- save_128_aligned (
- (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));
-
- pd += 4;
- ps += 4;
- if (pm)
- pm += 4;
- w -= 4;
- }
-
- while (w--)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- ps++;
- *pd++ = _mm_cvtsi128_si32 (
- _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
- if (pm)
- pm++;
- }
-}
-
-static force_inline uint32_t
-core_combine_saturate_u_pixel_sse2 (uint32_t src,
- uint32_t dst)
-{
- __m128i ms = unpack_32_1x128 (src);
- __m128i md = unpack_32_1x128 (dst);
- uint32_t sa = src >> 24;
- uint32_t da = ~dst >> 24;
-
- if (sa > da)
- {
- ms = pix_multiply_1x128 (
- ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
- }
-
- return pack_1x128_32 (_mm_adds_epu16 (md, ms));
-}
-
-static void
-sse2_combine_saturate_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * pd,
- const uint32_t * ps,
- const uint32_t * pm,
- int w)
-{
- uint32_t s, d;
-
- uint32_t pack_cmp;
- __m128i xmm_src, xmm_dst;
-
- while (w && (unsigned long)pd & 15)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-
- while (w >= 4)
- {
- xmm_dst = load_128_aligned ((__m128i*)pd);
- xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
-
- pack_cmp = _mm_movemask_epi8 (
- _mm_cmpgt_epi32 (
- _mm_srli_epi32 (xmm_src, 24),
- _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
-
- /* if some alpha src is grater than respective ~alpha dst */
- if (pack_cmp)
- {
- s = combine1 (ps++, pm);
- d = *pd;
- *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
- if (pm)
- pm++;
-
- s = combine1 (ps++, pm);
- d = *pd;
- *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
- if (pm)
- pm++;
-
- s = combine1 (ps++, pm);
- d = *pd;
- *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
- if (pm)
- pm++;
-
- s = combine1 (ps++, pm);
- d = *pd;
- *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
- if (pm)
- pm++;
- }
- else
- {
- save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
-
- pd += 4;
- ps += 4;
- if (pm)
- pm += 4;
- }
-
- w -= 4;
- }
-
- while (w--)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
- ps++;
- if (pm)
- pm++;
- }
-}
-
-static void
-sse2_combine_src_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * pd,
- const uint32_t * ps,
- const uint32_t * pm,
- int w)
-{
- uint32_t s, m;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- *pd++ = pack_1x128_32 (
- pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
- w--;
- }
-
- while (w >= 4)
- {
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- *pd++ = pack_1x128_32 (
- pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
- w--;
- }
-}
-
-static force_inline uint32_t
-core_combine_over_ca_pixel_sse2 (uint32_t src,
- uint32_t mask,
- uint32_t dst)
-{
- __m128i s = unpack_32_1x128 (src);
- __m128i expAlpha = expand_alpha_1x128 (s);
- __m128i unpk_mask = unpack_32_1x128 (mask);
- __m128i unpk_dst = unpack_32_1x128 (dst);
-
- return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
-}
-
-static void
-sse2_combine_over_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * pd,
- const uint32_t * ps,
- const uint32_t * pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_alpha_lo, xmm_alpha_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
- w--;
- }
-}
-
-static force_inline uint32_t
-core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
- uint32_t mask,
- uint32_t dst)
-{
- __m128i d = unpack_32_1x128 (dst);
-
- return pack_1x128_32 (
- over_1x128 (d, expand_alpha_1x128 (d),
- pix_multiply_1x128 (unpack_32_1x128 (src),
- unpack_32_1x128 (mask))));
-}
-
-static void
-sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * pd,
- const uint32_t * ps,
- const uint32_t * pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_alpha_lo, xmm_alpha_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
- w--;
- }
-}
-
-static void
-sse2_combine_in_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * pd,
- const uint32_t * ps,
- const uint32_t * pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_alpha_lo, xmm_alpha_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x128_32 (
- pix_multiply_1x128 (
- pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
- expand_alpha_1x128 (unpack_32_1x128 (d))));
-
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x128_32 (
- pix_multiply_1x128 (
- pix_multiply_1x128 (
- unpack_32_1x128 (s), unpack_32_1x128 (m)),
- expand_alpha_1x128 (unpack_32_1x128 (d))));
-
- w--;
- }
-}
-
-static void
-sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * pd,
- const uint32_t * ps,
- const uint32_t * pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_alpha_lo, xmm_alpha_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x128_32 (
- pix_multiply_1x128 (
- unpack_32_1x128 (d),
- pix_multiply_1x128 (unpack_32_1x128 (m),
- expand_alpha_1x128 (unpack_32_1x128 (s)))));
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
- pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x128_32 (
- pix_multiply_1x128 (
- unpack_32_1x128 (d),
- pix_multiply_1x128 (unpack_32_1x128 (m),
- expand_alpha_1x128 (unpack_32_1x128 (s)))));
- w--;
- }
-}
-
-static void
-sse2_combine_out_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * pd,
- const uint32_t * ps,
- const uint32_t * pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_alpha_lo, xmm_alpha_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x128_32 (
- pix_multiply_1x128 (
- pix_multiply_1x128 (
- unpack_32_1x128 (s), unpack_32_1x128 (m)),
- negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
- negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst_lo, &xmm_dst_hi);
- pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x128_32 (
- pix_multiply_1x128 (
- pix_multiply_1x128 (
- unpack_32_1x128 (s), unpack_32_1x128 (m)),
- negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
-
- w--;
- }
-}
-
-static void
-sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * pd,
- const uint32_t * ps,
- const uint32_t * pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_alpha_lo, xmm_alpha_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x128_32 (
- pix_multiply_1x128 (
- unpack_32_1x128 (d),
- negate_1x128 (pix_multiply_1x128 (
- unpack_32_1x128 (m),
- expand_alpha_1x128 (unpack_32_1x128 (s))))));
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- negate_2x128 (xmm_mask_lo, xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x128_32 (
- pix_multiply_1x128 (
- unpack_32_1x128 (d),
- negate_1x128 (pix_multiply_1x128 (
- unpack_32_1x128 (m),
- expand_alpha_1x128 (unpack_32_1x128 (s))))));
- w--;
- }
-}
-
-static force_inline uint32_t
-core_combine_atop_ca_pixel_sse2 (uint32_t src,
- uint32_t mask,
- uint32_t dst)
-{
- __m128i m = unpack_32_1x128 (mask);
- __m128i s = unpack_32_1x128 (src);
- __m128i d = unpack_32_1x128 (dst);
- __m128i sa = expand_alpha_1x128 (s);
- __m128i da = expand_alpha_1x128 (d);
-
- s = pix_multiply_1x128 (s, m);
- m = negate_1x128 (pix_multiply_1x128 (m, sa));
-
- return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
-}
-
-static void
-sse2_combine_atop_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * pd,
- const uint32_t * ps,
- const uint32_t * pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
- __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi);
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_src_lo, &xmm_src_hi);
- pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- pix_add_multiply_2x128 (
- &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
- &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
- w--;
- }
-}
-
-static force_inline uint32_t
-core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
- uint32_t mask,
- uint32_t dst)
-{
- __m128i m = unpack_32_1x128 (mask);
- __m128i s = unpack_32_1x128 (src);
- __m128i d = unpack_32_1x128 (dst);
-
- __m128i da = negate_1x128 (expand_alpha_1x128 (d));
- __m128i sa = expand_alpha_1x128 (s);
-
- s = pix_multiply_1x128 (s, m);
- m = pix_multiply_1x128 (m, sa);
-
- return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
-}
-
-static void
-sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * pd,
- const uint32_t * ps,
- const uint32_t * pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
- __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi);
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_src_lo, &xmm_src_hi);
- pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- pix_add_multiply_2x128 (
- &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
- &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
- w--;
- }
-}
-
-static force_inline uint32_t
-core_combine_xor_ca_pixel_sse2 (uint32_t src,
- uint32_t mask,
- uint32_t dst)
-{
- __m128i a = unpack_32_1x128 (mask);
- __m128i s = unpack_32_1x128 (src);
- __m128i d = unpack_32_1x128 (dst);
-
- __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
- a, expand_alpha_1x128 (s)));
- __m128i dest = pix_multiply_1x128 (s, a);
- __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
-
- return pack_1x128_32 (pix_add_multiply_1x128 (&d,
- &alpha_dst,
- &dest,
- &alpha_src));
-}
-
-static void
-sse2_combine_xor_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * pd,
- const uint32_t * ps,
- const uint32_t * pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
- __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi);
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_src_lo, &xmm_src_hi);
- pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
- negate_2x128 (xmm_mask_lo, xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- pix_add_multiply_2x128 (
- &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
- &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
- w--;
- }
-}
-
-static void
-sse2_combine_add_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * pd,
- const uint32_t * ps,
- const uint32_t * pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x128_32 (
- _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
- unpack_32_1x128 (m)),
- unpack_32_1x128 (d)));
- w--;
- }
-
- while (w >= 4)
- {
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_src_lo, &xmm_src_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (
- _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
- _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x128_32 (
- _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
- unpack_32_1x128 (m)),
- unpack_32_1x128 (d)));
- w--;
- }
-}
-
-static force_inline __m128i
-create_mask_16_128 (uint16_t mask)
-{
- return _mm_set1_epi16 (mask);
-}
-
-/* Work around a code generation bug in Sun Studio 12. */
-#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
-# define create_mask_2x32_128(mask0, mask1) \
- (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
-#else
-static force_inline __m128i
-create_mask_2x32_128 (uint32_t mask0,
- uint32_t mask1)
-{
- return _mm_set_epi32 (mask0, mask1, mask0, mask1);
-}
-#endif
-
-static void
-sse2_composite_over_n_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src;
- uint32_t *dst_line, *dst, d;
- int32_t w;
- int dst_stride;
- __m128i xmm_src, xmm_alpha;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-
- xmm_src = expand_pixel_32_1x128 (src);
- xmm_alpha = expand_alpha_1x128 (xmm_src);
-
- while (height--)
- {
- dst = dst_line;
-
- dst_line += dst_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- d = *dst;
- *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
- xmm_alpha,
- unpack_32_1x128 (d)));
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- over_2x128 (&xmm_src, &xmm_src,
- &xmm_alpha, &xmm_alpha,
- &xmm_dst_lo, &xmm_dst_hi);
-
- /* rebuid the 4 pixel data and save*/
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- w -= 4;
- dst += 4;
- }
-
- while (w)
- {
- d = *dst;
- *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
- xmm_alpha,
- unpack_32_1x128 (d)));
- w--;
- }
-
- }
-}
-
-static void
-sse2_composite_over_n_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src;
- uint16_t *dst_line, *dst, d;
- int32_t w;
- int dst_stride;
- __m128i xmm_src, xmm_alpha;
- __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-
- xmm_src = expand_pixel_32_1x128 (src);
- xmm_alpha = expand_alpha_1x128 (xmm_src);
-
- while (height--)
- {
- dst = dst_line;
-
- dst_line += dst_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- d = *dst;
-
- *dst++ = pack_565_32_16 (
- pack_1x128_32 (over_1x128 (xmm_src,
- xmm_alpha,
- expand565_16_1x128 (d))));
- w--;
- }
-
- while (w >= 8)
- {
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_565_128_4x128 (xmm_dst,
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
-
- over_2x128 (&xmm_src, &xmm_src,
- &xmm_alpha, &xmm_alpha,
- &xmm_dst0, &xmm_dst1);
- over_2x128 (&xmm_src, &xmm_src,
- &xmm_alpha, &xmm_alpha,
- &xmm_dst2, &xmm_dst3);
-
- xmm_dst = pack_565_4x128_128 (
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
-
- save_128_aligned ((__m128i*)dst, xmm_dst);
-
- dst += 8;
- w -= 8;
- }
-
- while (w--)
- {
- d = *dst;
- *dst++ = pack_565_32_16 (
- pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
- expand565_16_1x128 (d))));
- }
- }
-
-}
-
-static void
-sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint32_t *dst_line, d;
- uint32_t *mask_line, m;
- uint32_t pack_cmp;
- int dst_stride, mask_stride;
-
- __m128i xmm_src, xmm_alpha;
- __m128i xmm_dst;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
- __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
- srca = src >> 24;
-
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
- xmm_src = _mm_unpacklo_epi8 (
- create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
- xmm_alpha = expand_alpha_1x128 (xmm_src);
- mmx_src = xmm_src;
- mmx_alpha = xmm_alpha;
-
- while (height--)
- {
- int w = width;
- const uint32_t *pm = (uint32_t *)mask_line;
- uint32_t *pd = (uint32_t *)dst_line;
-
- dst_line += dst_stride;
- mask_line += mask_stride;
-
- while (w && (unsigned long)pd & 15)
- {
- m = *pm++;
-
- if (m)
- {
- d = *pd;
-
- mmx_mask = unpack_32_1x128 (m);
- mmx_dest = unpack_32_1x128 (d);
-
- *pd = pack_1x128_32 (
- _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
- mmx_dest));
- }
-
- pd++;
- w--;
- }
-
- while (w >= 4)
- {
- xmm_mask = load_128_unaligned ((__m128i*)pm);
-
- pack_cmp =
- _mm_movemask_epi8 (
- _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
-
- /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
- if (pack_cmp != 0xffff)
- {
- xmm_dst = load_128_aligned ((__m128i*)pd);
-
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
- pix_multiply_2x128 (&xmm_src, &xmm_src,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
- xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
-
- save_128_aligned (
- (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
- }
-
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- m = *pm++;
-
- if (m)
- {
- d = *pd;
-
- mmx_mask = unpack_32_1x128 (m);
- mmx_dest = unpack_32_1x128 (d);
-
- *pd = pack_1x128_32 (
- _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
- mmx_dest));
- }
-
- pd++;
- w--;
- }
- }
-
-}
-
-static void
-sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src;
- uint32_t *dst_line, d;
- uint32_t *mask_line, m;
- uint32_t pack_cmp;
- int dst_stride, mask_stride;
-
- __m128i xmm_src, xmm_alpha;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
- __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
- xmm_src = _mm_unpacklo_epi8 (
- create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
- xmm_alpha = expand_alpha_1x128 (xmm_src);
- mmx_src = xmm_src;
- mmx_alpha = xmm_alpha;
-
- while (height--)
- {
- int w = width;
- const uint32_t *pm = (uint32_t *)mask_line;
- uint32_t *pd = (uint32_t *)dst_line;
-
- dst_line += dst_stride;
- mask_line += mask_stride;
-
- while (w && (unsigned long)pd & 15)
- {
- m = *pm++;
-
- if (m)
- {
- d = *pd;
- mmx_mask = unpack_32_1x128 (m);
- mmx_dest = unpack_32_1x128 (d);
-
- *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
- &mmx_alpha,
- &mmx_mask,
- &mmx_dest));
- }
-
- pd++;
- w--;
- }
-
- while (w >= 4)
- {
- xmm_mask = load_128_unaligned ((__m128i*)pm);
-
- pack_cmp =
- _mm_movemask_epi8 (
- _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
-
- /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
- if (pack_cmp != 0xffff)
- {
- xmm_dst = load_128_aligned ((__m128i*)pd);
-
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- in_over_2x128 (&xmm_src, &xmm_src,
- &xmm_alpha, &xmm_alpha,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
-
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- m = *pm++;
-
- if (m)
- {
- d = *pd;
- mmx_mask = unpack_32_1x128 (m);
- mmx_dest = unpack_32_1x128 (d);
-
- *pd = pack_1x128_32 (
- in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
- }
-
- pd++;
- w--;
- }
- }
-
-}
-
-static void
-sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src;
- uint32_t mask;
- int32_t w;
- int dst_stride, src_stride;
-
- __m128i xmm_mask;
- __m128i xmm_src, xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_alpha_lo, xmm_alpha_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
-
- xmm_mask = create_mask_16_128 (mask >> 24);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- uint32_t s = *src++;
-
- if (s)
- {
- uint32_t d = *dst;
-
- __m128i ms = unpack_32_1x128 (s);
- __m128i alpha = expand_alpha_1x128 (ms);
- __m128i dest = xmm_mask;
- __m128i alpha_dst = unpack_32_1x128 (d);
-
- *dst = pack_1x128_32 (
- in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
- }
- dst++;
- w--;
- }
-
- while (w >= 4)
- {
- xmm_src = load_128_unaligned ((__m128i*)src);
-
- if (!is_zero (xmm_src))
- {
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_mask, &xmm_mask,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
-
- dst += 4;
- src += 4;
- w -= 4;
- }
-
- while (w)
- {
- uint32_t s = *src++;
-
- if (s)
- {
- uint32_t d = *dst;
-
- __m128i ms = unpack_32_1x128 (s);
- __m128i alpha = expand_alpha_1x128 (ms);
- __m128i mask = xmm_mask;
- __m128i dest = unpack_32_1x128 (d);
-
- *dst = pack_1x128_32 (
- in_over_1x128 (&ms, &alpha, &mask, &dest));
- }
-
- dst++;
- w--;
- }
- }
-
-}
-
-static void
-sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src;
- int32_t w;
- int dst_stride, src_stride;
-
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- *dst++ = *src++ | 0xff000000;
- w--;
- }
-
- while (w >= 16)
- {
- __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
-
- xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
- xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
- xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
- xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
-
- save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
- save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
- save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
- save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
-
- dst += 16;
- src += 16;
- w -= 16;
- }
-
- while (w)
- {
- *dst++ = *src++ | 0xff000000;
- w--;
- }
- }
-
-}
-
-static void
-sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src;
- uint32_t mask;
- int dst_stride, src_stride;
- int32_t w;
-
- __m128i xmm_mask, xmm_alpha;
- __m128i xmm_src, xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
-
- xmm_mask = create_mask_16_128 (mask >> 24);
- xmm_alpha = mask_00ff;
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- uint32_t s = (*src++) | 0xff000000;
- uint32_t d = *dst;
-
- __m128i src = unpack_32_1x128 (s);
- __m128i alpha = xmm_alpha;
- __m128i mask = xmm_mask;
- __m128i dest = unpack_32_1x128 (d);
-
- *dst++ = pack_1x128_32 (
- in_over_1x128 (&src, &alpha, &mask, &dest));
-
- w--;
- }
-
- while (w >= 4)
- {
- xmm_src = _mm_or_si128 (
- load_128_unaligned ((__m128i*)src), mask_ff000000);
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_alpha, &xmm_alpha,
- &xmm_mask, &xmm_mask,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- dst += 4;
- src += 4;
- w -= 4;
-
- }
-
- while (w)
- {
- uint32_t s = (*src++) | 0xff000000;
- uint32_t d = *dst;
-
- __m128i src = unpack_32_1x128 (s);
- __m128i alpha = xmm_alpha;
- __m128i mask = xmm_mask;
- __m128i dest = unpack_32_1x128 (d);
-
- *dst++ = pack_1x128_32 (
- in_over_1x128 (&src, &alpha, &mask, &dest));
-
- w--;
- }
- }
-
-}
-
-static void
-sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- int dst_stride, src_stride;
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- dst = dst_line;
- src = src_line;
-
- while (height--)
- {
- sse2_combine_over_u (imp, op, dst, src, NULL, width);
-
- dst += dst_stride;
- src += src_stride;
- }
-}
-
-static force_inline uint16_t
-composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
-{
- __m128i ms;
-
- ms = unpack_32_1x128 (src);
- return pack_565_32_16 (
- pack_1x128_32 (
- over_1x128 (
- ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
-}
-
-static void
-sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint16_t *dst_line, *dst, d;
- uint32_t *src_line, *src, s;
- int dst_stride, src_stride;
- int32_t w;
-
- __m128i xmm_alpha_lo, xmm_alpha_hi;
- __m128i xmm_src, xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- src = src_line;
-
- dst_line += dst_stride;
- src_line += src_stride;
- w = width;
-
- /* Align dst on a 16-byte boundary */
- while (w &&
- ((unsigned long)dst & 15))
- {
- s = *src++;
- d = *dst;
-
- *dst++ = composite_over_8888_0565pixel (s, d);
- w--;
- }
-
- /* It's a 8 pixel loop */
- while (w >= 8)
- {
- /* I'm loading unaligned because I'm not sure
- * about the address alignment.
- */
- xmm_src = load_128_unaligned ((__m128i*) src);
- xmm_dst = load_128_aligned ((__m128i*) dst);
-
- /* Unpacking */
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- unpack_565_128_4x128 (xmm_dst,
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- /* I'm loading next 4 pixels from memory
- * before to optimze the memory read.
- */
- xmm_src = load_128_unaligned ((__m128i*) (src + 4));
-
- over_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_dst0, &xmm_dst1);
-
- /* Unpacking */
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- over_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_dst2, &xmm_dst3);
-
- save_128_aligned (
- (__m128i*)dst, pack_565_4x128_128 (
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
-
- w -= 8;
- dst += 8;
- src += 8;
- }
-
- while (w--)
- {
- s = *src++;
- d = *dst;
-
- *dst++ = composite_over_8888_0565pixel (s, d);
- }
- }
-
-}
-
-static void
-sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint32_t *dst_line, *dst;
- uint8_t *mask_line, *mask;
- int dst_stride, mask_stride;
- int32_t w;
- uint32_t m, d;
-
- __m128i xmm_src, xmm_alpha, xmm_def;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
- __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- xmm_def = create_mask_2x32_128 (src, src);
- xmm_src = expand_pixel_32_1x128 (src);
- xmm_alpha = expand_alpha_1x128 (xmm_src);
- mmx_src = xmm_src;
- mmx_alpha = xmm_alpha;
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- uint8_t m = *mask++;
-
- if (m)
- {
- d = *dst;
- mmx_mask = expand_pixel_8_1x128 (m);
- mmx_dest = unpack_32_1x128 (d);
-
- *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
- &mmx_alpha,
- &mmx_mask,
- &mmx_dest));
- }
-
- w--;
- dst++;
- }
-
- while (w >= 4)
- {
- m = *((uint32_t*)mask);
-
- if (srca == 0xff && m == 0xffffffff)
- {
- save_128_aligned ((__m128i*)dst, xmm_def);
- }
- else if (m)
- {
- xmm_dst = load_128_aligned ((__m128i*) dst);
- xmm_mask = unpack_32_1x128 (m);
- xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
-
- /* Unpacking */
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- in_over_2x128 (&xmm_src, &xmm_src,
- &xmm_alpha, &xmm_alpha,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
-
- w -= 4;
- dst += 4;
- mask += 4;
- }
-
- while (w)
- {
- uint8_t m = *mask++;
-
- if (m)
- {
- d = *dst;
- mmx_mask = expand_pixel_8_1x128 (m);
- mmx_dest = unpack_32_1x128 (d);
-
- *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
- &mmx_alpha,
- &mmx_mask,
- &mmx_dest));
- }
-
- w--;
- dst++;
- }
- }
-
-}
-
-static pixman_bool_t
-pixman_fill_sse2 (uint32_t *bits,
- int stride,
- int bpp,
- int x,
- int y,
- int width,
- int height,
- uint32_t data)
-{
- uint32_t byte_width;
- uint8_t *byte_line;
-
- __m128i xmm_def;
-
- if (bpp == 8)
- {
- uint8_t b;
- uint16_t w;
-
- stride = stride * (int) sizeof (uint32_t) / 1;
- byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
- byte_width = width;
- stride *= 1;
-
- b = data & 0xff;
- w = (b << 8) | b;
- data = (w << 16) | w;
- }
- else if (bpp == 16)
- {
- stride = stride * (int) sizeof (uint32_t) / 2;
- byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
- byte_width = 2 * width;
- stride *= 2;
-
- data = (data & 0xffff) * 0x00010001;
- }
- else if (bpp == 32)
- {
- stride = stride * (int) sizeof (uint32_t) / 4;
- byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
- byte_width = 4 * width;
- stride *= 4;
- }
- else
- {
- return FALSE;
- }
-
- xmm_def = create_mask_2x32_128 (data, data);
-
- while (height--)
- {
- int w;
- uint8_t *d = byte_line;
- byte_line += stride;
- w = byte_width;
-
- while (w >= 1 && ((unsigned long)d & 1))
- {
- *(uint8_t *)d = data;
- w -= 1;
- d += 1;
- }
-
- while (w >= 2 && ((unsigned long)d & 3))
- {
- *(uint16_t *)d = data;
- w -= 2;
- d += 2;
- }
-
- while (w >= 4 && ((unsigned long)d & 15))
- {
- *(uint32_t *)d = data;
-
- w -= 4;
- d += 4;
- }
-
- while (w >= 128)
- {
- save_128_aligned ((__m128i*)(d), xmm_def);
- save_128_aligned ((__m128i*)(d + 16), xmm_def);
- save_128_aligned ((__m128i*)(d + 32), xmm_def);
- save_128_aligned ((__m128i*)(d + 48), xmm_def);
- save_128_aligned ((__m128i*)(d + 64), xmm_def);
- save_128_aligned ((__m128i*)(d + 80), xmm_def);
- save_128_aligned ((__m128i*)(d + 96), xmm_def);
- save_128_aligned ((__m128i*)(d + 112), xmm_def);
-
- d += 128;
- w -= 128;
- }
-
- if (w >= 64)
- {
- save_128_aligned ((__m128i*)(d), xmm_def);
- save_128_aligned ((__m128i*)(d + 16), xmm_def);
- save_128_aligned ((__m128i*)(d + 32), xmm_def);
- save_128_aligned ((__m128i*)(d + 48), xmm_def);
-
- d += 64;
- w -= 64;
- }
-
- if (w >= 32)
- {
- save_128_aligned ((__m128i*)(d), xmm_def);
- save_128_aligned ((__m128i*)(d + 16), xmm_def);
-
- d += 32;
- w -= 32;
- }
-
- if (w >= 16)
- {
- save_128_aligned ((__m128i*)(d), xmm_def);
-
- d += 16;
- w -= 16;
- }
-
- while (w >= 4)
- {
- *(uint32_t *)d = data;
-
- w -= 4;
- d += 4;
- }
-
- if (w >= 2)
- {
- *(uint16_t *)d = data;
- w -= 2;
- d += 2;
- }
-
- if (w >= 1)
- {
- *(uint8_t *)d = data;
- w -= 1;
- d += 1;
- }
- }
-
- return TRUE;
-}
-
-static void
-sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint32_t *dst_line, *dst;
- uint8_t *mask_line, *mask;
- int dst_stride, mask_stride;
- int32_t w;
- uint32_t m;
-
- __m128i xmm_src, xmm_def;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- {
- pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
- PIXMAN_FORMAT_BPP (dst_image->bits.format),
- dest_x, dest_y, width, height, 0);
- return;
- }
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- xmm_def = create_mask_2x32_128 (src, src);
- xmm_src = expand_pixel_32_1x128 (src);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- uint8_t m = *mask++;
-
- if (m)
- {
- *dst = pack_1x128_32 (
- pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
- }
- else
- {
- *dst = 0;
- }
-
- w--;
- dst++;
- }
-
- while (w >= 4)
- {
- m = *((uint32_t*)mask);
-
- if (srca == 0xff && m == 0xffffffff)
- {
- save_128_aligned ((__m128i*)dst, xmm_def);
- }
- else if (m)
- {
- xmm_mask = unpack_32_1x128 (m);
- xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
-
- /* Unpacking */
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- pix_multiply_2x128 (&xmm_src, &xmm_src,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
- }
- else
- {
- save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
- }
-
- w -= 4;
- dst += 4;
- mask += 4;
- }
-
- while (w)
- {
- uint8_t m = *mask++;
-
- if (m)
- {
- *dst = pack_1x128_32 (
- pix_multiply_1x128 (
- xmm_src, expand_pixel_8_1x128 (m)));
- }
- else
- {
- *dst = 0;
- }
-
- w--;
- dst++;
- }
- }
-
-}
-
-static void
-sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint16_t *dst_line, *dst, d;
- uint8_t *mask_line, *mask;
- int dst_stride, mask_stride;
- int32_t w;
- uint32_t m;
- __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
-
- __m128i xmm_src, xmm_alpha;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- xmm_src = expand_pixel_32_1x128 (src);
- xmm_alpha = expand_alpha_1x128 (xmm_src);
- mmx_src = xmm_src;
- mmx_alpha = xmm_alpha;
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- m = *mask++;
-
- if (m)
- {
- d = *dst;
- mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
- mmx_dest = expand565_16_1x128 (d);
-
- *dst = pack_565_32_16 (
- pack_1x128_32 (
- in_over_1x128 (
- &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
- }
-
- w--;
- dst++;
- }
-
- while (w >= 8)
- {
- xmm_dst = load_128_aligned ((__m128i*) dst);
- unpack_565_128_4x128 (xmm_dst,
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
-
- m = *((uint32_t*)mask);
- mask += 4;
-
- if (m)
- {
- xmm_mask = unpack_32_1x128 (m);
- xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
-
- /* Unpacking */
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- in_over_2x128 (&xmm_src, &xmm_src,
- &xmm_alpha, &xmm_alpha,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst0, &xmm_dst1);
- }
-
- m = *((uint32_t*)mask);
- mask += 4;
-
- if (m)
- {
- xmm_mask = unpack_32_1x128 (m);
- xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
-
- /* Unpacking */
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
- in_over_2x128 (&xmm_src, &xmm_src,
- &xmm_alpha, &xmm_alpha,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst2, &xmm_dst3);
- }
-
- save_128_aligned (
- (__m128i*)dst, pack_565_4x128_128 (
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
-
- w -= 8;
- dst += 8;
- }
-
- while (w)
- {
- m = *mask++;
-
- if (m)
- {
- d = *dst;
- mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
- mmx_dest = expand565_16_1x128 (d);
-
- *dst = pack_565_32_16 (
- pack_1x128_32 (
- in_over_1x128 (
- &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
- }
-
- w--;
- dst++;
- }
- }
-
-}
-
-static void
-sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint16_t *dst_line, *dst, d;
- uint32_t *src_line, *src, s;
- int dst_stride, src_stride;
- int32_t w;
- uint32_t opaque, zero;
-
- __m128i ms;
- __m128i xmm_src, xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- s = *src++;
- d = *dst;
-
- ms = unpack_32_1x128 (s);
-
- *dst++ = pack_565_32_16 (
- pack_1x128_32 (
- over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
- w--;
- }
-
- while (w >= 8)
- {
- /* First round */
- xmm_src = load_128_unaligned ((__m128i*)src);
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- opaque = is_opaque (xmm_src);
- zero = is_zero (xmm_src);
-
- unpack_565_128_4x128 (xmm_dst,
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-
- /* preload next round*/
- xmm_src = load_128_unaligned ((__m128i*)(src + 4));
-
- if (opaque)
- {
- invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_dst0, &xmm_dst1);
- }
- else if (!zero)
- {
- over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_dst0, &xmm_dst1);
- }
-
- /* Second round */
- opaque = is_opaque (xmm_src);
- zero = is_zero (xmm_src);
-
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-
- if (opaque)
- {
- invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_dst2, &xmm_dst3);
- }
- else if (!zero)
- {
- over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_dst2, &xmm_dst3);
- }
-
- save_128_aligned (
- (__m128i*)dst, pack_565_4x128_128 (
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
-
- w -= 8;
- src += 8;
- dst += 8;
- }
-
- while (w)
- {
- s = *src++;
- d = *dst;
-
- ms = unpack_32_1x128 (s);
-
- *dst++ = pack_565_32_16 (
- pack_1x128_32 (
- over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
- w--;
- }
- }
-
-}
-
-static void
-sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst, d;
- uint32_t *src_line, *src, s;
- int dst_stride, src_stride;
- int32_t w;
- uint32_t opaque, zero;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- s = *src++;
- d = *dst;
-
- *dst++ = pack_1x128_32 (
- over_rev_non_pre_1x128 (
- unpack_32_1x128 (s), unpack_32_1x128 (d)));
-
- w--;
- }
-
- while (w >= 4)
- {
- xmm_src_hi = load_128_unaligned ((__m128i*)src);
-
- opaque = is_opaque (xmm_src_hi);
- zero = is_zero (xmm_src_hi);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-
- if (opaque)
- {
- invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
- else if (!zero)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
-
- w -= 4;
- dst += 4;
- src += 4;
- }
-
- while (w)
- {
- s = *src++;
- d = *dst;
-
- *dst++ = pack_1x128_32 (
- over_rev_non_pre_1x128 (
- unpack_32_1x128 (s), unpack_32_1x128 (d)));
-
- w--;
- }
- }
-
-}
-
-static void
-sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src;
- uint16_t *dst_line, *dst, d;
- uint32_t *mask_line, *mask, m;
- int dst_stride, mask_stride;
- int w;
- uint32_t pack_cmp;
-
- __m128i xmm_src, xmm_alpha;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
-
- __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
- xmm_src = expand_pixel_32_1x128 (src);
- xmm_alpha = expand_alpha_1x128 (xmm_src);
- mmx_src = xmm_src;
- mmx_alpha = xmm_alpha;
-
- while (height--)
- {
- w = width;
- mask = mask_line;
- dst = dst_line;
- mask_line += mask_stride;
- dst_line += dst_stride;
-
- while (w && ((unsigned long)dst & 15))
- {
- m = *(uint32_t *) mask;
-
- if (m)
- {
- d = *dst;
- mmx_mask = unpack_32_1x128 (m);
- mmx_dest = expand565_16_1x128 (d);
-
- *dst = pack_565_32_16 (
- pack_1x128_32 (
- in_over_1x128 (
- &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
- }
-
- w--;
- dst++;
- mask++;
- }
-
- while (w >= 8)
- {
- /* First round */
- xmm_mask = load_128_unaligned ((__m128i*)mask);
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- pack_cmp = _mm_movemask_epi8 (
- _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
-
- unpack_565_128_4x128 (xmm_dst,
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
- /* preload next round */
- xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
-
- /* preload next round */
- if (pack_cmp != 0xffff)
- {
- in_over_2x128 (&xmm_src, &xmm_src,
- &xmm_alpha, &xmm_alpha,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst0, &xmm_dst1);
- }
-
- /* Second round */
- pack_cmp = _mm_movemask_epi8 (
- _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
-
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
- if (pack_cmp != 0xffff)
- {
- in_over_2x128 (&xmm_src, &xmm_src,
- &xmm_alpha, &xmm_alpha,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst2, &xmm_dst3);
- }
-
- save_128_aligned (
- (__m128i*)dst, pack_565_4x128_128 (
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
-
- w -= 8;
- dst += 8;
- mask += 8;
- }
-
- while (w)
- {
- m = *(uint32_t *) mask;
-
- if (m)
- {
- d = *dst;
- mmx_mask = unpack_32_1x128 (m);
- mmx_dest = expand565_16_1x128 (d);
-
- *dst = pack_565_32_16 (
- pack_1x128_32 (
- in_over_1x128 (
- &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
- }
-
- w--;
- dst++;
- mask++;
- }
- }
-
-}
-
-static void
-sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- uint8_t *mask_line, *mask;
- int dst_stride, mask_stride;
- uint32_t d, m;
- uint32_t src;
- uint8_t sa;
- int32_t w;
-
- __m128i xmm_alpha;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- sa = src >> 24;
-
- xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w && ((unsigned long)dst & 15))
- {
- m = (uint32_t) *mask++;
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x128_32 (
- pix_multiply_1x128 (
- pix_multiply_1x128 (xmm_alpha,
- unpack_32_1x128 (m)),
- unpack_32_1x128 (d)));
- w--;
- }
-
- while (w >= 16)
- {
- xmm_mask = load_128_unaligned ((__m128i*)mask);
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst_lo, &xmm_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- mask += 16;
- dst += 16;
- w -= 16;
- }
-
- while (w)
- {
- m = (uint32_t) *mask++;
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x128_32 (
- pix_multiply_1x128 (
- pix_multiply_1x128 (
- xmm_alpha, unpack_32_1x128 (m)),
- unpack_32_1x128 (d)));
- w--;
- }
- }
-
-}
-
-static void
-sse2_composite_in_n_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- int dst_stride;
- uint32_t d;
- uint32_t src;
- int32_t w;
-
- __m128i xmm_alpha;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
-
- src = src >> 24;
-
- if (src == 0xff)
- return;
-
- if (src == 0x00)
- {
- pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
- 8, dest_x, dest_y, width, height, src);
-
- return;
- }
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- w = width;
-
- while (w && ((unsigned long)dst & 15))
- {
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x128_32 (
- pix_multiply_1x128 (
- xmm_alpha,
- unpack_32_1x128 (d)));
- w--;
- }
-
- while (w >= 16)
- {
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
- &xmm_dst_lo, &xmm_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- dst += 16;
- w -= 16;
- }
-
- while (w)
- {
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x128_32 (
- pix_multiply_1x128 (
- xmm_alpha,
- unpack_32_1x128 (d)));
- w--;
- }
- }
-
-}
-
-static void
-sse2_composite_in_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- uint8_t *src_line, *src;
- int src_stride, dst_stride;
- int32_t w;
- uint32_t s, d;
-
- __m128i xmm_src, xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w && ((unsigned long)dst & 15))
- {
- s = (uint32_t) *src++;
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x128_32 (
- pix_multiply_1x128 (
- unpack_32_1x128 (s), unpack_32_1x128 (d)));
- w--;
- }
-
- while (w >= 16)
- {
- xmm_src = load_128_unaligned ((__m128i*)src);
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_dst_lo, &xmm_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- src += 16;
- dst += 16;
- w -= 16;
- }
-
- while (w)
- {
- s = (uint32_t) *src++;
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x128_32 (
- pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
- w--;
- }
- }
-
-}
-
-static void
-sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- uint8_t *mask_line, *mask;
- int dst_stride, mask_stride;
- int32_t w;
- uint32_t src;
- uint8_t sa;
- uint32_t m, d;
-
- __m128i xmm_alpha;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- sa = src >> 24;
-
- xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w && ((unsigned long)dst & 15))
- {
- m = (uint32_t) *mask++;
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x128_32 (
- _mm_adds_epu16 (
- pix_multiply_1x128 (
- xmm_alpha, unpack_32_1x128 (m)),
- unpack_32_1x128 (d)));
- w--;
- }
-
- while (w >= 16)
- {
- xmm_mask = load_128_unaligned ((__m128i*)mask);
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
- xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- mask += 16;
- dst += 16;
- w -= 16;
- }
-
- while (w)
- {
- m = (uint32_t) *mask++;
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x128_32 (
- _mm_adds_epu16 (
- pix_multiply_1x128 (
- xmm_alpha, unpack_32_1x128 (m)),
- unpack_32_1x128 (d)));
-
- w--;
- }
- }
-
-}
-
-static void
-sse2_composite_add_n_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- int dst_stride;
- int32_t w;
- uint32_t src;
-
- __m128i xmm_src;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- src >>= 24;
-
- if (src == 0x00)
- return;
-
- if (src == 0xff)
- {
- pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
- 8, dest_x, dest_y, width, height, 0xff);
-
- return;
- }
-
- src = (src << 24) | (src << 16) | (src << 8) | src;
- xmm_src = _mm_set_epi32 (src, src, src, src);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- w = width;
-
- while (w && ((unsigned long)dst & 15))
- {
- *dst = (uint8_t)_mm_cvtsi128_si32 (
- _mm_adds_epu8 (
- xmm_src,
- _mm_cvtsi32_si128 (*dst)));
-
- w--;
- dst++;
- }
-
- while (w >= 16)
- {
- save_128_aligned (
- (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
-
- dst += 16;
- w -= 16;
- }
-
- while (w)
- {
- *dst = (uint8_t)_mm_cvtsi128_si32 (
- _mm_adds_epu8 (
- xmm_src,
- _mm_cvtsi32_si128 (*dst)));
-
- w--;
- dst++;
- }
- }
-
-}
-
-static void
-sse2_composite_add_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- uint8_t *src_line, *src;
- int dst_stride, src_stride;
- int32_t w;
- uint16_t t;
-
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- src = src_line;
-
- dst_line += dst_stride;
- src_line += src_stride;
- w = width;
-
- /* Small head */
- while (w && (unsigned long)dst & 3)
- {
- t = (*dst) + (*src++);
- *dst++ = t | (0 - (t >> 8));
- w--;
- }
-
- sse2_combine_add_u (imp, op,
- (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
-
- /* Small tail */
- dst += w & 0xfffc;
- src += w & 0xfffc;
-
- w &= 3;
-
- while (w)
- {
- t = (*dst) + (*src++);
- *dst++ = t | (0 - (t >> 8));
- w--;
- }
- }
-
-}
-
-static void
-sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src;
- int dst_stride, src_stride;
-
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
-
- sse2_combine_add_u (imp, op, dst, src, NULL, width);
- }
-
-}
-
-static pixman_bool_t
-pixman_blt_sse2 (uint32_t *src_bits,
- uint32_t *dst_bits,
- int src_stride,
- int dst_stride,
- int src_bpp,
- int dst_bpp,
- int src_x,
- int src_y,
- int dst_x,
- int dst_y,
- int width,
- int height)
-{
- uint8_t * src_bytes;
- uint8_t * dst_bytes;
- int byte_width;
-
- if (src_bpp != dst_bpp)
- return FALSE;
-
- if (src_bpp == 16)
- {
- src_stride = src_stride * (int) sizeof (uint32_t) / 2;
- dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
- src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
- dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
- byte_width = 2 * width;
- src_stride *= 2;
- dst_stride *= 2;
- }
- else if (src_bpp == 32)
- {
- src_stride = src_stride * (int) sizeof (uint32_t) / 4;
- dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
- src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
- dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
- byte_width = 4 * width;
- src_stride *= 4;
- dst_stride *= 4;
- }
- else
- {
- return FALSE;
- }
-
- while (height--)
- {
- int w;
- uint8_t *s = src_bytes;
- uint8_t *d = dst_bytes;
- src_bytes += src_stride;
- dst_bytes += dst_stride;
- w = byte_width;
-
- while (w >= 2 && ((unsigned long)d & 3))
- {
- *(uint16_t *)d = *(uint16_t *)s;
- w -= 2;
- s += 2;
- d += 2;
- }
-
- while (w >= 4 && ((unsigned long)d & 15))
- {
- *(uint32_t *)d = *(uint32_t *)s;
-
- w -= 4;
- s += 4;
- d += 4;
- }
-
- while (w >= 64)
- {
- __m128i xmm0, xmm1, xmm2, xmm3;
-
- xmm0 = load_128_unaligned ((__m128i*)(s));
- xmm1 = load_128_unaligned ((__m128i*)(s + 16));
- xmm2 = load_128_unaligned ((__m128i*)(s + 32));
- xmm3 = load_128_unaligned ((__m128i*)(s + 48));
-
- save_128_aligned ((__m128i*)(d), xmm0);
- save_128_aligned ((__m128i*)(d + 16), xmm1);
- save_128_aligned ((__m128i*)(d + 32), xmm2);
- save_128_aligned ((__m128i*)(d + 48), xmm3);
-
- s += 64;
- d += 64;
- w -= 64;
- }
-
- while (w >= 16)
- {
- save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
-
- w -= 16;
- d += 16;
- s += 16;
- }
-
- while (w >= 4)
- {
- *(uint32_t *)d = *(uint32_t *)s;
-
- w -= 4;
- s += 4;
- d += 4;
- }
-
- if (w >= 2)
- {
- *(uint16_t *)d = *(uint16_t *)s;
- w -= 2;
- s += 2;
- d += 2;
- }
- }
-
-
- return TRUE;
-}
-
-static void
-sse2_composite_copy_area (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- pixman_blt_sse2 (src_image->bits.bits,
- dst_image->bits.bits,
- src_image->bits.rowstride,
- dst_image->bits.rowstride,
- PIXMAN_FORMAT_BPP (src_image->bits.format),
- PIXMAN_FORMAT_BPP (dst_image->bits.format),
- src_x, src_y, dest_x, dest_y, width, height);
-}
-
-static void
-sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *src, *src_line, s;
- uint32_t *dst, *dst_line, d;
- uint8_t *mask, *mask_line;
- uint32_t m;
- int src_stride, mask_stride, dst_stride;
- int32_t w;
- __m128i ms;
-
- __m128i xmm_src, xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- src = src_line;
- src_line += src_stride;
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
-
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- s = 0xff000000 | *src++;
- m = (uint32_t) *mask++;
- d = *dst;
- ms = unpack_32_1x128 (s);
-
- if (m != 0xff)
- {
- __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
- __m128i md = unpack_32_1x128 (d);
-
- ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
- }
-
- *dst++ = pack_1x128_32 (ms);
- w--;
- }
-
- while (w >= 4)
- {
- m = *(uint32_t*) mask;
- xmm_src = _mm_or_si128 (
- load_128_unaligned ((__m128i*)src), mask_ff000000);
-
- if (m == 0xffffffff)
- {
- save_128_aligned ((__m128i*)dst, xmm_src);
- }
- else
- {
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
-
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_rev_2x128 (
- xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
- &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
-
- src += 4;
- dst += 4;
- mask += 4;
- w -= 4;
- }
-
- while (w)
- {
- m = (uint32_t) *mask++;
-
- if (m)
- {
- s = 0xff000000 | *src;
-
- if (m == 0xff)
- {
- *dst = s;
- }
- else
- {
- __m128i ma, md, ms;
-
- d = *dst;
-
- ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
- md = unpack_32_1x128 (d);
- ms = unpack_32_1x128 (s);
-
- *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
- }
-
- }
-
- src++;
- dst++;
- w--;
- }
- }
-
-}
-
-static void
-sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *src, *src_line, s;
- uint32_t *dst, *dst_line, d;
- uint8_t *mask, *mask_line;
- uint32_t m;
- int src_stride, mask_stride, dst_stride;
- int32_t w;
-
- __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- src = src_line;
- src_line += src_stride;
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
-
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- uint32_t sa;
-
- s = *src++;
- m = (uint32_t) *mask++;
- d = *dst;
-
- sa = s >> 24;
-
- if (m)
- {
- if (sa == 0xff && m == 0xff)
- {
- *dst = s;
- }
- else
- {
- __m128i ms, md, ma, msa;
-
- ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
- ms = unpack_32_1x128 (s);
- md = unpack_32_1x128 (d);
-
- msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
-
- *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
- }
- }
-
- dst++;
- w--;
- }
-
- while (w >= 4)
- {
- m = *(uint32_t *) mask;
-
- if (m)
- {
- xmm_src = load_128_unaligned ((__m128i*)src);
-
- if (m == 0xffffffff && is_opaque (xmm_src))
- {
- save_128_aligned ((__m128i *)dst, xmm_src);
- }
- else
- {
- xmm_dst = load_128_aligned ((__m128i *)dst);
-
- xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
-
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
- expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
- &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
- }
-
- src += 4;
- dst += 4;
- mask += 4;
- w -= 4;
- }
-
- while (w)
- {
- uint32_t sa;
-
- s = *src++;
- m = (uint32_t) *mask++;
- d = *dst;
-
- sa = s >> 24;
-
- if (m)
- {
- if (sa == 0xff && m == 0xff)
- {
- *dst = s;
- }
- else
- {
- __m128i ms, md, ma, msa;
-
- ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
- ms = unpack_32_1x128 (s);
- md = unpack_32_1x128 (d);
-
- msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
-
- *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
- }
- }
-
- dst++;
- w--;
- }
- }
-
-}
-
-static void
-sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src;
- uint32_t *dst_line, *dst;
- __m128i xmm_src;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_dsta_hi, xmm_dsta_lo;
- int dst_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-
- xmm_src = expand_pixel_32_1x128 (src);
-
- while (height--)
- {
- dst = dst_line;
-
- dst_line += dst_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- __m128i vd;
-
- vd = unpack_32_1x128 (*dst);
-
- *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
- xmm_src));
- w--;
- dst++;
- }
-
- while (w >= 4)
- {
- __m128i tmp_lo, tmp_hi;
-
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
-
- tmp_lo = xmm_src;
- tmp_hi = xmm_src;
-
- over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_dsta_lo, &xmm_dsta_hi,
- &tmp_lo, &tmp_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
-
- w -= 4;
- dst += 4;
- }
-
- while (w)
- {
- __m128i vd;
-
- vd = unpack_32_1x128 (*dst);
-
- *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
- xmm_src));
- w--;
- dst++;
- }
-
- }
-
-}
-
-static void
-sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *src, *src_line, s;
- uint32_t *dst, *dst_line, d;
- uint32_t *mask, *mask_line;
- uint32_t m;
- int src_stride, mask_stride, dst_stride;
- int32_t w;
-
- __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- src = src_line;
- src_line += src_stride;
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
-
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- uint32_t sa;
-
- s = *src++;
- m = (*mask++) >> 24;
- d = *dst;
-
- sa = s >> 24;
-
- if (m)
- {
- if (sa == 0xff && m == 0xff)
- {
- *dst = s;
- }
- else
- {
- __m128i ms, md, ma, msa;
-
- ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
- ms = unpack_32_1x128 (s);
- md = unpack_32_1x128 (d);
-
- msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
-
- *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
- }
- }
-
- dst++;
- w--;
- }
-
- while (w >= 4)
- {
- xmm_mask = load_128_unaligned ((__m128i*)mask);
-
- if (!is_transparent (xmm_mask))
- {
- xmm_src = load_128_unaligned ((__m128i*)src);
-
- if (is_opaque (xmm_mask) && is_opaque (xmm_src))
- {
- save_128_aligned ((__m128i *)dst, xmm_src);
- }
- else
- {
- xmm_dst = load_128_aligned ((__m128i *)dst);
-
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
- expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
- &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
- }
-
- src += 4;
- dst += 4;
- mask += 4;
- w -= 4;
- }
-
- while (w)
- {
- uint32_t sa;
-
- s = *src++;
- m = (*mask++) >> 24;
- d = *dst;
-
- sa = s >> 24;
-
- if (m)
- {
- if (sa == 0xff && m == 0xff)
- {
- *dst = s;
- }
- else
- {
- __m128i ms, md, ma, msa;
-
- ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
- ms = unpack_32_1x128 (s);
- md = unpack_32_1x128 (d);
-
- msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
-
- *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
- }
- }
-
- dst++;
- w--;
- }
- }
-
-}
-
-/* A variant of 'sse2_combine_over_u' with minor tweaks */
-static force_inline void
-scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
- const uint32_t* ps,
- int32_t w,
- pixman_fixed_t vx,
- pixman_fixed_t unit_x,
- pixman_fixed_t max_vx,
- pixman_bool_t fully_transparent_src)
-{
- uint32_t s, d;
- const uint32_t* pm = NULL;
-
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_alpha_lo, xmm_alpha_hi;
-
- if (fully_transparent_src)
- return;
-
- /* Align dst on a 16-byte boundary */
- while (w && ((unsigned long)pd & 15))
- {
- d = *pd;
- s = combine1 (ps + (vx >> 16), pm);
- vx += unit_x;
-
- *pd++ = core_combine_over_u_pixel_sse2 (s, d);
- if (pm)
- pm++;
- w--;
- }
-
- while (w >= 4)
- {
- __m128i tmp;
- uint32_t tmp1, tmp2, tmp3, tmp4;
-
- tmp1 = ps[vx >> 16];
- vx += unit_x;
- tmp2 = ps[vx >> 16];
- vx += unit_x;
- tmp3 = ps[vx >> 16];
- vx += unit_x;
- tmp4 = ps[vx >> 16];
- vx += unit_x;
-
- tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
-
- xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
-
- if (is_opaque (xmm_src_hi))
- {
- save_128_aligned ((__m128i*)pd, xmm_src_hi);
- }
- else if (!is_zero (xmm_src_hi))
- {
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (
- xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
-
- over_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- /* rebuid the 4 pixel data and save*/
- save_128_aligned ((__m128i*)pd,
- pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
-
- w -= 4;
- pd += 4;
- if (pm)
- pm += 4;
- }
-
- while (w)
- {
- d = *pd;
- s = combine1 (ps + (vx >> 16), pm);
- vx += unit_x;
-
- *pd++ = core_combine_over_u_pixel_sse2 (s, d);
- if (pm)
- pm++;
-
- w--;
- }
-}
-
-FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
- scaled_nearest_scanline_sse2_8888_8888_OVER,
- uint32_t, uint32_t, COVER)
-FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
- scaled_nearest_scanline_sse2_8888_8888_OVER,
- uint32_t, uint32_t, NONE)
-FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
- scaled_nearest_scanline_sse2_8888_8888_OVER,
- uint32_t, uint32_t, PAD)
-
-static force_inline void
-scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
- uint32_t * dst,
- const uint32_t * src,
- int32_t w,
- pixman_fixed_t vx,
- pixman_fixed_t unit_x,
- pixman_fixed_t max_vx,
- pixman_bool_t zero_src)
-{
- __m128i xmm_mask;
- __m128i xmm_src, xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_alpha_lo, xmm_alpha_hi;
-
- if (zero_src || (*mask >> 24) == 0)
- return;
-
- xmm_mask = create_mask_16_128 (*mask >> 24);
-
- while (w && (unsigned long)dst & 15)
- {
- uint32_t s = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
-
- if (s)
- {
- uint32_t d = *dst;
-
- __m128i ms = unpack_32_1x128 (s);
- __m128i alpha = expand_alpha_1x128 (ms);
- __m128i dest = xmm_mask;
- __m128i alpha_dst = unpack_32_1x128 (d);
-
- *dst = pack_1x128_32 (
- in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
- }
- dst++;
- w--;
- }
-
- while (w >= 4)
- {
- uint32_t tmp1, tmp2, tmp3, tmp4;
-
- tmp1 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- tmp2 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- tmp3 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- tmp4 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
-
- xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
-
- if (!is_zero (xmm_src))
- {
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_mask, &xmm_mask,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
-
- dst += 4;
- w -= 4;
- }
-
- while (w)
- {
- uint32_t s = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
-
- if (s)
- {
- uint32_t d = *dst;
-
- __m128i ms = unpack_32_1x128 (s);
- __m128i alpha = expand_alpha_1x128 (ms);
- __m128i mask = xmm_mask;
- __m128i dest = unpack_32_1x128 (d);
-
- *dst = pack_1x128_32 (
- in_over_1x128 (&ms, &alpha, &mask, &dest));
- }
-
- dst++;
- w--;
- }
-
-}
-
-FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
- scaled_nearest_scanline_sse2_8888_n_8888_OVER,
- uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
-FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
- scaled_nearest_scanline_sse2_8888_n_8888_OVER,
- uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
-FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
- scaled_nearest_scanline_sse2_8888_n_8888_OVER,
- uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
-
-static void
-bilinear_interpolate_line_sse2 (uint32_t * out,
- const uint32_t * top,
- const uint32_t * bottom,
- int wt,
- int wb,
- pixman_fixed_t x,
- pixman_fixed_t ux,
- int width)
-{
- const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);
- const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);
- const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);
- const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);
- const __m128i xmm_ux = _mm_set_epi16 (ux, ux, ux, ux, ux, ux, ux, ux);
- const __m128i xmm_zero = _mm_setzero_si128 ();
- __m128i xmm_x = _mm_set_epi16 (x, x, x, x, x, x, x, x);
- uint32_t pix1, pix2, pix3, pix4;
-
- #define INTERPOLATE_ONE_PIXEL(pix) \
- do { \
- __m128i xmm_wh, xmm_lo, xmm_hi, a; \
- /* fetch 2x2 pixel block into sse2 register */ \
- uint32_t tl = top [pixman_fixed_to_int (x)]; \
- uint32_t tr = top [pixman_fixed_to_int (x) + 1]; \
- uint32_t bl = bottom [pixman_fixed_to_int (x)]; \
- uint32_t br = bottom [pixman_fixed_to_int (x) + 1]; \
- a = _mm_set_epi32 (tr, tl, br, bl); \
- x += ux; \
- /* vertical interpolation */ \
- a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero), \
- xmm_wt), \
- _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero), \
- xmm_wb)); \
- /* calculate horizontal weights */ \
- xmm_wh = _mm_add_epi16 (xmm_addc, \
- _mm_xor_si128 (xmm_xorc, \
- _mm_srli_epi16 (xmm_x, 8))); \
- xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
- /* horizontal interpolation */ \
- xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \
- xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \
- a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \
- _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \
- /* shift and pack the result */ \
- a = _mm_srli_epi32 (a, 16); \
- a = _mm_packs_epi32 (a, a); \
- a = _mm_packus_epi16 (a, a); \
- pix = _mm_cvtsi128_si32 (a); \
- } while (0)
-
- while ((width -= 4) >= 0)
- {
- INTERPOLATE_ONE_PIXEL (pix1);
- INTERPOLATE_ONE_PIXEL (pix2);
- INTERPOLATE_ONE_PIXEL (pix3);
- INTERPOLATE_ONE_PIXEL (pix4);
- *out++ = pix1;
- *out++ = pix2;
- *out++ = pix3;
- *out++ = pix4;
- }
- if (width & 2)
- {
- INTERPOLATE_ONE_PIXEL (pix1);
- INTERPOLATE_ONE_PIXEL (pix2);
- *out++ = pix1;
- *out++ = pix2;
- }
- if (width & 1)
- {
- INTERPOLATE_ONE_PIXEL (pix1);
- *out = pix1;
- }
-
- #undef INTERPOLATE_ONE_PIXEL
-}
-
-static force_inline void
-scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst,
- const uint32_t * mask,
- const uint32_t * src_top,
- const uint32_t * src_bottom,
- int32_t w,
- int wt,
- int wb,
- pixman_fixed_t vx,
- pixman_fixed_t unit_x,
- pixman_fixed_t max_vx,
- pixman_bool_t zero_src)
-{
- bilinear_interpolate_line_sse2 (dst, src_top, src_bottom,
- wt, wb, vx, unit_x, w);
-}
-
-FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
- scaled_bilinear_scanline_sse2_8888_8888_SRC,
- uint32_t, uint32_t, uint32_t,
- COVER, FALSE, FALSE)
-FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
- scaled_bilinear_scanline_sse2_8888_8888_SRC,
- uint32_t, uint32_t, uint32_t,
- PAD, FALSE, FALSE)
-FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
- scaled_bilinear_scanline_sse2_8888_8888_SRC,
- uint32_t, uint32_t, uint32_t,
- NONE, FALSE, FALSE)
-
-static const pixman_fast_path_t sse2_fast_paths[] =
-{
- /* PIXMAN_OP_OVER */
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
- PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
- PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
- PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
- PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
- PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
- PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
- PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
- PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
- PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
-
- /* PIXMAN_OP_OVER_REVERSE */
- PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
- PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
-
- /* PIXMAN_OP_ADD */
- PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
- PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
- PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
- PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
- PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
-
- /* PIXMAN_OP_SRC */
- PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
- PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
- PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
- PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
- PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
- PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
- PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
- PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
- PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
- PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
- PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
- PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
- PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
- PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
-
- /* PIXMAN_OP_IN */
- PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
- PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
- PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
-
- SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
-
- SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
- SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
- SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
- SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
-
- SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
- SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
- SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
-
- { PIXMAN_OP_NONE },
-};
-
-static pixman_bool_t
-sse2_blt (pixman_implementation_t *imp,
- uint32_t * src_bits,
- uint32_t * dst_bits,
- int src_stride,
- int dst_stride,
- int src_bpp,
- int dst_bpp,
- int src_x,
- int src_y,
- int dst_x,
- int dst_y,
- int width,
- int height)
-{
- if (!pixman_blt_sse2 (
- src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
- src_x, src_y, dst_x, dst_y, width, height))
-
- {
- return _pixman_implementation_blt (
- imp->delegate,
- src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
- src_x, src_y, dst_x, dst_y, width, height);
- }
-
- return TRUE;
-}
-
-#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
-__attribute__((__force_align_arg_pointer__))
-#endif
-static pixman_bool_t
-sse2_fill (pixman_implementation_t *imp,
- uint32_t * bits,
- int stride,
- int bpp,
- int x,
- int y,
- int width,
- int height,
- uint32_t xor)
-{
- if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
- {
- return _pixman_implementation_fill (
- imp->delegate, bits, stride, bpp, x, y, width, height, xor);
- }
-
- return TRUE;
-}
-
-static uint32_t *
-sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
-{
- int w = iter->width;
- __m128i ff000000 = mask_ff000000;
- uint32_t *dst = iter->buffer;
- uint32_t *src = (uint32_t *)iter->bits;
-
- iter->bits += iter->stride;
-
- while (w && ((unsigned long)dst) & 0x0f)
- {
- *dst++ = (*src++) | 0xff000000;
- w--;
- }
-
- while (w >= 4)
- {
- save_128_aligned (
- (__m128i *)dst, _mm_or_si128 (
- load_128_unaligned ((__m128i *)src), ff000000));
-
- dst += 4;
- src += 4;
- w -= 4;
- }
-
- while (w)
- {
- *dst++ = (*src++) | 0xff000000;
- w--;
- }
-
- return iter->buffer;
-}
-
-static uint32_t *
-sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
-{
- int w = iter->width;
- uint32_t *dst = iter->buffer;
- uint16_t *src = (uint16_t *)iter->bits;
- __m128i ff000000 = mask_ff000000;
-
- iter->bits += iter->stride;
-
- while (w && ((unsigned long)dst) & 0x0f)
- {
- uint16_t s = *src++;
-
- *dst++ = CONVERT_0565_TO_8888 (s);
- w--;
- }
-
- while (w >= 8)
- {
- __m128i lo, hi, s;
-
- s = _mm_loadu_si128 ((__m128i *)src);
-
- lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
- hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
-
- save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
- save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
-
- dst += 8;
- src += 8;
- w -= 8;
- }
-
- while (w)
- {
- uint16_t s = *src++;
-
- *dst++ = CONVERT_0565_TO_8888 (s);
- w--;
- }
-
- return iter->buffer;
-}
-
-static uint32_t *
-sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
-{
- int w = iter->width;
- uint32_t *dst = iter->buffer;
- uint8_t *src = iter->bits;
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
-
- iter->bits += iter->stride;
-
- while (w && (((unsigned long)dst) & 15))
- {
- *dst++ = *(src++) << 24;
- w--;
- }
-
- while (w >= 16)
- {
- xmm0 = _mm_loadu_si128((__m128i *)src);
-
- xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128(), xmm0);
- xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128(), xmm0);
- xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
- xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
- xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
- xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
-
- _mm_store_si128(((__m128i *)(dst + 0)), xmm3);
- _mm_store_si128(((__m128i *)(dst + 4)), xmm4);
- _mm_store_si128(((__m128i *)(dst + 8)), xmm5);
- _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
-
- dst += 16;
- src += 16;
- w -= 16;
- }
-
- while (w)
- {
- *dst++ = *(src++) << 24;
- w--;
- }
-
- return iter->buffer;
-}
-
-typedef struct
-{
- pixman_format_code_t format;
- pixman_iter_get_scanline_t get_scanline;
-} fetcher_info_t;
-
-static const fetcher_info_t fetchers[] =
-{
- { PIXMAN_x8r8g8b8, sse2_fetch_x8r8g8b8 },
- { PIXMAN_r5g6b5, sse2_fetch_r5g6b5 },
- { PIXMAN_a8, sse2_fetch_a8 },
- { PIXMAN_null }
-};
-
-static void
-sse2_src_iter_init (pixman_implementation_t *imp,
- pixman_iter_t *iter,
- pixman_image_t *image,
- int x, int y, int width, int height,
- uint8_t *buffer, iter_flags_t flags)
-{
-#define FLAGS \
- (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
-
- if ((flags & ITER_NARROW) &&
- (image->common.flags & FLAGS) == FLAGS &&
- x >= 0 && y >= 0 &&
- x + width <= image->bits.width &&
- y + height <= image->bits.height)
- {
- const fetcher_info_t *f;
-
- for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
- {
- if (image->common.extended_format_code == f->format)
- {
- uint8_t *b = (uint8_t *)image->bits.bits;
- int s = image->bits.rowstride * 4;
-
- iter->bits = b + s * y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
- iter->stride = s;
- iter->width = width;
- iter->buffer = (uint32_t *)buffer;
-
- iter->get_scanline = f->get_scanline;
- return;
- }
- }
- }
-
- _pixman_implementation_src_iter_init (
- imp->delegate, iter, image, x, y, width, height, buffer, flags);
-}
-
-#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
-__attribute__((__force_align_arg_pointer__))
-#endif
-pixman_implementation_t *
-_pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
-{
- pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
-
- /* SSE2 constants */
- mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
- mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
- mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
- mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
- mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
- mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
- mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
- mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
- mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
- mask_0080 = create_mask_16_128 (0x0080);
- mask_00ff = create_mask_16_128 (0x00ff);
- mask_0101 = create_mask_16_128 (0x0101);
- mask_ffff = create_mask_16_128 (0xffff);
- mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
- mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
-
- /* Set up function pointers */
- imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
- imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
- imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
- imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
- imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
- imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
- imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
- imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
- imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
- imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
-
- imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
-
- imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
- imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
- imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
- imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
- imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
- imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
- imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
- imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
- imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
- imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
- imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
-
- imp->blt = sse2_blt;
- imp->fill = sse2_fill;
-
- imp->src_iter_init = sse2_src_iter_init;
-
- return imp;
-}
+/*
+ * Copyright © 2008 Rodrigo Kumpera
+ * Copyright © 2008 André Tupinambá
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission. Red Hat makes no representations about the
+ * suitability of this software for any purpose. It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Rodrigo Kumpera (kumpera@gmail.com)
+ * André Tupinambá (andrelrt@gmail.com)
+ *
+ * Based on work by Owen Taylor and Søren Sandmann
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
+#include <emmintrin.h> /* for SSE2 intrinsics */
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include "pixman-fast-path.h"
+
+static __m128i mask_0080;
+static __m128i mask_00ff;
+static __m128i mask_0101;
+static __m128i mask_ffff;
+static __m128i mask_ff000000;
+static __m128i mask_alpha;
+
+static __m128i mask_565_r;
+static __m128i mask_565_g1, mask_565_g2;
+static __m128i mask_565_b;
+static __m128i mask_red;
+static __m128i mask_green;
+static __m128i mask_blue;
+
+static __m128i mask_565_fix_rb;
+static __m128i mask_565_fix_g;
+
+static force_inline __m128i
+unpack_32_1x128 (uint32_t data)
+{
+ return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
+}
+
+static force_inline void
+unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
+{
+ *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
+ *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
+}
+
+static force_inline __m128i
+unpack_565_to_8888 (__m128i lo)
+{
+ __m128i r, g, b, rb, t;
+
+ r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
+ g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
+ b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
+
+ rb = _mm_or_si128 (r, b);
+ t = _mm_and_si128 (rb, mask_565_fix_rb);
+ t = _mm_srli_epi32 (t, 5);
+ rb = _mm_or_si128 (rb, t);
+
+ t = _mm_and_si128 (g, mask_565_fix_g);
+ t = _mm_srli_epi32 (t, 6);
+ g = _mm_or_si128 (g, t);
+
+ return _mm_or_si128 (rb, g);
+}
+
+static force_inline void
+unpack_565_128_4x128 (__m128i data,
+ __m128i* data0,
+ __m128i* data1,
+ __m128i* data2,
+ __m128i* data3)
+{
+ __m128i lo, hi;
+
+ lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
+ hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
+
+ lo = unpack_565_to_8888 (lo);
+ hi = unpack_565_to_8888 (hi);
+
+ unpack_128_2x128 (lo, data0, data1);
+ unpack_128_2x128 (hi, data2, data3);
+}
+
+static force_inline uint16_t
+pack_565_32_16 (uint32_t pixel)
+{
+ return (uint16_t) (((pixel >> 8) & 0xf800) |
+ ((pixel >> 5) & 0x07e0) |
+ ((pixel >> 3) & 0x001f));
+}
+
+static force_inline __m128i
+pack_2x128_128 (__m128i lo, __m128i hi)
+{
+ return _mm_packus_epi16 (lo, hi);
+}
+
+static force_inline __m128i
+pack_565_2x128_128 (__m128i lo, __m128i hi)
+{
+ __m128i data;
+ __m128i r, g1, g2, b;
+
+ data = pack_2x128_128 (lo, hi);
+
+ r = _mm_and_si128 (data, mask_565_r);
+ g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
+ g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
+ b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
+
+ return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
+}
+
+static force_inline __m128i
+pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
+{
+ return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
+ pack_565_2x128_128 (*xmm2, *xmm3));
+}
+
+static force_inline int
+is_opaque (__m128i x)
+{
+ __m128i ffs = _mm_cmpeq_epi8 (x, x);
+
+ return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
+}
+
+static force_inline int
+is_zero (__m128i x)
+{
+ return _mm_movemask_epi8 (
+ _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
+}
+
+static force_inline int
+is_transparent (__m128i x)
+{
+ return (_mm_movemask_epi8 (
+ _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
+}
+
+static force_inline __m128i
+expand_pixel_32_1x128 (uint32_t data)
+{
+ return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
+}
+
+static force_inline __m128i
+expand_alpha_1x128 (__m128i data)
+{
+ return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
+ _MM_SHUFFLE (3, 3, 3, 3)),
+ _MM_SHUFFLE (3, 3, 3, 3));
+}
+
+static force_inline void
+expand_alpha_2x128 (__m128i data_lo,
+ __m128i data_hi,
+ __m128i* alpha_lo,
+ __m128i* alpha_hi)
+{
+ __m128i lo, hi;
+
+ lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
+ hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
+
+ *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
+ *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
+}
+
+static force_inline void
+expand_alpha_rev_2x128 (__m128i data_lo,
+ __m128i data_hi,
+ __m128i* alpha_lo,
+ __m128i* alpha_hi)
+{
+ __m128i lo, hi;
+
+ lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
+ hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
+ *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
+ *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
+}
+
+static force_inline void
+pix_multiply_2x128 (__m128i* data_lo,
+ __m128i* data_hi,
+ __m128i* alpha_lo,
+ __m128i* alpha_hi,
+ __m128i* ret_lo,
+ __m128i* ret_hi)
+{
+ __m128i lo, hi;
+
+ lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
+ hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
+ lo = _mm_adds_epu16 (lo, mask_0080);
+ hi = _mm_adds_epu16 (hi, mask_0080);
+ *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
+ *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
+}
+
+static force_inline void
+pix_add_multiply_2x128 (__m128i* src_lo,
+ __m128i* src_hi,
+ __m128i* alpha_dst_lo,
+ __m128i* alpha_dst_hi,
+ __m128i* dst_lo,
+ __m128i* dst_hi,
+ __m128i* alpha_src_lo,
+ __m128i* alpha_src_hi,
+ __m128i* ret_lo,
+ __m128i* ret_hi)
+{
+ __m128i t1_lo, t1_hi;
+ __m128i t2_lo, t2_hi;
+
+ pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
+ pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
+
+ *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
+ *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
+}
+
+static force_inline void
+negate_2x128 (__m128i data_lo,
+ __m128i data_hi,
+ __m128i* neg_lo,
+ __m128i* neg_hi)
+{
+ *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
+ *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
+}
+
+static force_inline void
+invert_colors_2x128 (__m128i data_lo,
+ __m128i data_hi,
+ __m128i* inv_lo,
+ __m128i* inv_hi)
+{
+ __m128i lo, hi;
+
+ lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
+ hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
+ *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
+ *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
+}
+
+static force_inline void
+over_2x128 (__m128i* src_lo,
+ __m128i* src_hi,
+ __m128i* alpha_lo,
+ __m128i* alpha_hi,
+ __m128i* dst_lo,
+ __m128i* dst_hi)
+{
+ __m128i t1, t2;
+
+ negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
+
+ pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
+
+ *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
+ *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
+}
+
+static force_inline void
+over_rev_non_pre_2x128 (__m128i src_lo,
+ __m128i src_hi,
+ __m128i* dst_lo,
+ __m128i* dst_hi)
+{
+ __m128i lo, hi;
+ __m128i alpha_lo, alpha_hi;
+
+ expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
+
+ lo = _mm_or_si128 (alpha_lo, mask_alpha);
+ hi = _mm_or_si128 (alpha_hi, mask_alpha);
+
+ invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
+
+ pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
+
+ over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
+}
+
+static force_inline void
+in_over_2x128 (__m128i* src_lo,
+ __m128i* src_hi,
+ __m128i* alpha_lo,
+ __m128i* alpha_hi,
+ __m128i* mask_lo,
+ __m128i* mask_hi,
+ __m128i* dst_lo,
+ __m128i* dst_hi)
+{
+ __m128i s_lo, s_hi;
+ __m128i a_lo, a_hi;
+
+ pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
+ pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
+
+ over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
+}
+
+/* load 4 pixels from a 16-byte boundary aligned address */
+static force_inline __m128i
+load_128_aligned (__m128i* src)
+{
+ return _mm_load_si128 (src);
+}
+
+/* load 4 pixels from a unaligned address */
+static force_inline __m128i
+load_128_unaligned (const __m128i* src)
+{
+ return _mm_loadu_si128 (src);
+}
+
+/* save 4 pixels using Write Combining memory on a 16-byte
+ * boundary aligned address
+ */
+static force_inline void
+save_128_write_combining (__m128i* dst,
+ __m128i data)
+{
+ _mm_stream_si128 (dst, data);
+}
+
+/* save 4 pixels on a 16-byte boundary aligned address */
+static force_inline void
+save_128_aligned (__m128i* dst,
+ __m128i data)
+{
+ _mm_store_si128 (dst, data);
+}
+
+/* save 4 pixels on a unaligned address */
+static force_inline void
+save_128_unaligned (__m128i* dst,
+ __m128i data)
+{
+ _mm_storeu_si128 (dst, data);
+}
+
+static force_inline __m128i
+load_32_1x128 (uint32_t data)
+{
+ return _mm_cvtsi32_si128 (data);
+}
+
+static force_inline __m128i
+expand_alpha_rev_1x128 (__m128i data)
+{
+ return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
+}
+
+static force_inline __m128i
+expand_pixel_8_1x128 (uint8_t data)
+{
+ return _mm_shufflelo_epi16 (
+ unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
+}
+
+static force_inline __m128i
+pix_multiply_1x128 (__m128i data,
+ __m128i alpha)
+{
+ return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
+ mask_0080),
+ mask_0101);
+}
+
+static force_inline __m128i
+pix_add_multiply_1x128 (__m128i* src,
+ __m128i* alpha_dst,
+ __m128i* dst,
+ __m128i* alpha_src)
+{
+ __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
+ __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
+
+ return _mm_adds_epu8 (t1, t2);
+}
+
+static force_inline __m128i
+negate_1x128 (__m128i data)
+{
+ return _mm_xor_si128 (data, mask_00ff);
+}
+
+static force_inline __m128i
+invert_colors_1x128 (__m128i data)
+{
+ return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
+}
+
+static force_inline __m128i
+over_1x128 (__m128i src, __m128i alpha, __m128i dst)
+{
+ return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
+}
+
+static force_inline __m128i
+in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
+{
+ return over_1x128 (pix_multiply_1x128 (*src, *mask),
+ pix_multiply_1x128 (*alpha, *mask),
+ *dst);
+}
+
+static force_inline __m128i
+over_rev_non_pre_1x128 (__m128i src, __m128i dst)
+{
+ __m128i alpha = expand_alpha_1x128 (src);
+
+ return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
+ _mm_or_si128 (alpha, mask_alpha)),
+ alpha,
+ dst);
+}
+
+static force_inline uint32_t
+pack_1x128_32 (__m128i data)
+{
+ return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
+}
+
+static force_inline __m128i
+expand565_16_1x128 (uint16_t pixel)
+{
+ __m128i m = _mm_cvtsi32_si128 (pixel);
+
+ m = unpack_565_to_8888 (m);
+
+ return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
+}
+
+static force_inline uint32_t
+core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
+{
+ uint8_t a;
+ __m128i xmms;
+
+ a = src >> 24;
+
+ if (a == 0xff)
+ {
+ return src;
+ }
+ else if (src)
+ {
+ xmms = unpack_32_1x128 (src);
+ return pack_1x128_32 (
+ over_1x128 (xmms, expand_alpha_1x128 (xmms),
+ unpack_32_1x128 (dst)));
+ }
+
+ return dst;
+}
+
+static force_inline uint32_t
+combine1 (const uint32_t *ps, const uint32_t *pm)
+{
+ uint32_t s = *ps;
+
+ if (pm)
+ {
+ __m128i ms, mm;
+
+ mm = unpack_32_1x128 (*pm);
+ mm = expand_alpha_1x128 (mm);
+
+ ms = unpack_32_1x128 (s);
+ ms = pix_multiply_1x128 (ms, mm);
+
+ s = pack_1x128_32 (ms);
+ }
+
+ return s;
+}
+
+static force_inline __m128i
+combine4 (const __m128i *ps, const __m128i *pm)
+{
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_msk_lo, xmm_msk_hi;
+ __m128i s;
+
+ if (pm)
+ {
+ xmm_msk_lo = load_128_unaligned (pm);
+
+ if (is_transparent (xmm_msk_lo))
+ return _mm_setzero_si128 ();
+ }
+
+ s = load_128_unaligned (ps);
+
+ if (pm)
+ {
+ unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
+
+ expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_msk_lo, &xmm_msk_hi,
+ &xmm_src_lo, &xmm_src_hi);
+
+ s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
+ }
+
+ return s;
+}
+
+static force_inline void
+core_combine_over_u_sse2_mask (uint32_t * pd,
+ const uint32_t* ps,
+ const uint32_t* pm,
+ int w)
+{
+ uint32_t s, d;
+
+ /* Align dst on a 16-byte boundary */
+ while (w && ((unsigned long)pd & 15))
+ {
+ d = *pd;
+ s = combine1 (ps, pm);
+
+ if (s)
+ *pd = core_combine_over_u_pixel_sse2 (s, d);
+ pd++;
+ ps++;
+ pm++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ __m128i mask = load_128_unaligned ((__m128i *)pm);
+
+ if (!is_zero (mask))
+ {
+ __m128i src;
+ __m128i src_hi, src_lo;
+ __m128i mask_hi, mask_lo;
+ __m128i alpha_hi, alpha_lo;
+
+ src = load_128_unaligned ((__m128i *)ps);
+
+ if (is_opaque (_mm_and_si128 (src, mask)))
+ {
+ save_128_aligned ((__m128i *)pd, src);
+ }
+ else
+ {
+ __m128i dst = load_128_aligned ((__m128i *)pd);
+ __m128i dst_hi, dst_lo;
+
+ unpack_128_2x128 (mask, &mask_lo, &mask_hi);
+ unpack_128_2x128 (src, &src_lo, &src_hi);
+
+ expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
+ pix_multiply_2x128 (&src_lo, &src_hi,
+ &mask_lo, &mask_hi,
+ &src_lo, &src_hi);
+
+ unpack_128_2x128 (dst, &dst_lo, &dst_hi);
+
+ expand_alpha_2x128 (src_lo, src_hi,
+ &alpha_lo, &alpha_hi);
+
+ over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
+ &dst_lo, &dst_hi);
+
+ save_128_aligned (
+ (__m128i *)pd,
+ pack_2x128_128 (dst_lo, dst_hi));
+ }
+ }
+
+ pm += 4;
+ ps += 4;
+ pd += 4;
+ w -= 4;
+ }
+ while (w)
+ {
+ d = *pd;
+ s = combine1 (ps, pm);
+
+ if (s)
+ *pd = core_combine_over_u_pixel_sse2 (s, d);
+ pd++;
+ ps++;
+ pm++;
+
+ w--;
+ }
+}
+
+static force_inline void
+core_combine_over_u_sse2_no_mask (uint32_t * pd,
+ const uint32_t* ps,
+ int w)
+{
+ uint32_t s, d;
+
+ /* Align dst on a 16-byte boundary */
+ while (w && ((unsigned long)pd & 15))
+ {
+ d = *pd;
+ s = *ps;
+
+ if (s)
+ *pd = core_combine_over_u_pixel_sse2 (s, d);
+ pd++;
+ ps++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ __m128i src;
+ __m128i src_hi, src_lo, dst_hi, dst_lo;
+ __m128i alpha_hi, alpha_lo;
+
+ src = load_128_unaligned ((__m128i *)ps);
+
+ if (!is_zero (src))
+ {
+ if (is_opaque (src))
+ {
+ save_128_aligned ((__m128i *)pd, src);
+ }
+ else
+ {
+ __m128i dst = load_128_aligned ((__m128i *)pd);
+
+ unpack_128_2x128 (src, &src_lo, &src_hi);
+ unpack_128_2x128 (dst, &dst_lo, &dst_hi);
+
+ expand_alpha_2x128 (src_lo, src_hi,
+ &alpha_lo, &alpha_hi);
+ over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
+ &dst_lo, &dst_hi);
+
+ save_128_aligned (
+ (__m128i *)pd,
+ pack_2x128_128 (dst_lo, dst_hi));
+ }
+ }
+
+ ps += 4;
+ pd += 4;
+ w -= 4;
+ }
+ while (w)
+ {
+ d = *pd;
+ s = *ps;
+
+ if (s)
+ *pd = core_combine_over_u_pixel_sse2 (s, d);
+ pd++;
+ ps++;
+
+ w--;
+ }
+}
+
+static force_inline void
+sse2_combine_over_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
+{
+ if (pm)
+ core_combine_over_u_sse2_mask (pd, ps, pm, w);
+ else
+ core_combine_over_u_sse2_no_mask (pd, ps, w);
+}
+
+static void
+sse2_combine_over_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
+{
+ uint32_t s, d;
+
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+ /* Align dst on a 16-byte boundary */
+ while (w &&
+ ((unsigned long)pd & 15))
+ {
+ d = *pd;
+ s = combine1 (ps, pm);
+
+ *pd++ = core_combine_over_u_pixel_sse2 (d, s);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
+ }
+
+ while (w >= 4)
+ {
+ /* I'm loading unaligned because I'm not sure
+ * about the address alignment.
+ */
+ xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_src_lo, &xmm_src_hi);
+
+ /* rebuid the 4 pixel data and save*/
+ save_128_aligned ((__m128i*)pd,
+ pack_2x128_128 (xmm_src_lo, xmm_src_hi));
+
+ w -= 4;
+ ps += 4;
+ pd += 4;
+
+ if (pm)
+ pm += 4;
+ }
+
+ while (w)
+ {
+ d = *pd;
+ s = combine1 (ps, pm);
+
+ *pd++ = core_combine_over_u_pixel_sse2 (d, s);
+ ps++;
+ w--;
+ if (pm)
+ pm++;
+ }
+}
+
+static force_inline uint32_t
+core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
+{
+ uint32_t maska = src >> 24;
+
+ if (maska == 0)
+ {
+ return 0;
+ }
+ else if (maska != 0xff)
+ {
+ return pack_1x128_32 (
+ pix_multiply_1x128 (unpack_32_1x128 (dst),
+ expand_alpha_1x128 (unpack_32_1x128 (src))));
+ }
+
+ return dst;
+}
+
+static void
+sse2_combine_in_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
+{
+ uint32_t s, d;
+
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+
+ while (w && ((unsigned long) pd & 15))
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_in_u_pixel_sse2 (d, s);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+ xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned ((__m128i*)pd,
+ pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ w -= 4;
+ if (pm)
+ pm += 4;
+ }
+
+ while (w)
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_in_u_pixel_sse2 (d, s);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
+ }
+}
+
+static void
+sse2_combine_in_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
+{
+ uint32_t s, d;
+
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+
+ while (w && ((unsigned long) pd & 15))
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_in_u_pixel_sse2 (s, d);
+ ps++;
+ w--;
+ if (pm)
+ pm++;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+ xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_src_lo, &xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ w -= 4;
+ if (pm)
+ pm += 4;
+ }
+
+ while (w)
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_in_u_pixel_sse2 (s, d);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
+ }
+}
+
+static void
+sse2_combine_out_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
+{
+ while (w && ((unsigned long) pd & 15))
+ {
+ uint32_t s = combine1 (ps, pm);
+ uint32_t d = *pd;
+
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (d), negate_1x128 (
+ expand_alpha_1x128 (unpack_32_1x128 (s)))));
+
+ if (pm)
+ pm++;
+ ps++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+
+ xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+
+ pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_src_lo, &xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ if (pm)
+ pm += 4;
+
+ w -= 4;
+ }
+
+ while (w)
+ {
+ uint32_t s = combine1 (ps, pm);
+ uint32_t d = *pd;
+
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (d), negate_1x128 (
+ expand_alpha_1x128 (unpack_32_1x128 (s)))));
+ ps++;
+ if (pm)
+ pm++;
+ w--;
+ }
+}
+
+static void
+sse2_combine_out_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
+{
+ while (w && ((unsigned long) pd & 15))
+ {
+ uint32_t s = combine1 (ps, pm);
+ uint32_t d = *pd;
+
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (s), negate_1x128 (
+ expand_alpha_1x128 (unpack_32_1x128 (d)))));
+ w--;
+ ps++;
+ if (pm)
+ pm++;
+ }
+
+ while (w >= 4)
+ {
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+
+ xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ w -= 4;
+ if (pm)
+ pm += 4;
+ }
+
+ while (w)
+ {
+ uint32_t s = combine1 (ps, pm);
+ uint32_t d = *pd;
+
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (s), negate_1x128 (
+ expand_alpha_1x128 (unpack_32_1x128 (d)))));
+ w--;
+ ps++;
+ if (pm)
+ pm++;
+ }
+}
+
+static force_inline uint32_t
+core_combine_atop_u_pixel_sse2 (uint32_t src,
+ uint32_t dst)
+{
+ __m128i s = unpack_32_1x128 (src);
+ __m128i d = unpack_32_1x128 (dst);
+
+ __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
+ __m128i da = expand_alpha_1x128 (d);
+
+ return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
+}
+
+static void
+sse2_combine_atop_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
+{
+ uint32_t s, d;
+
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+ __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+
+ while (w && ((unsigned long) pd & 15))
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
+ }
+
+ while (w >= 4)
+ {
+ xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+
+ pix_add_multiply_2x128 (
+ &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ w -= 4;
+ if (pm)
+ pm += 4;
+ }
+
+ while (w)
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
+ }
+}
+
+static force_inline uint32_t
+core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
+ uint32_t dst)
+{
+ __m128i s = unpack_32_1x128 (src);
+ __m128i d = unpack_32_1x128 (dst);
+
+ __m128i sa = expand_alpha_1x128 (s);
+ __m128i da = negate_1x128 (expand_alpha_1x128 (d));
+
+ return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
+}
+
+static void
+sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
+{
+ uint32_t s, d;
+
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+ __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+
+ while (w && ((unsigned long) pd & 15))
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
+ ps++;
+ w--;
+ if (pm)
+ pm++;
+ }
+
+ while (w >= 4)
+ {
+ xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ pix_add_multiply_2x128 (
+ &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ w -= 4;
+ if (pm)
+ pm += 4;
+ }
+
+ while (w)
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
+ ps++;
+ w--;
+ if (pm)
+ pm++;
+ }
+}
+
+static force_inline uint32_t
+core_combine_xor_u_pixel_sse2 (uint32_t src,
+ uint32_t dst)
+{
+ __m128i s = unpack_32_1x128 (src);
+ __m128i d = unpack_32_1x128 (dst);
+
+ __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
+ __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
+
+ return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
+}
+
+static void
+sse2_combine_xor_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ int w = width;
+ uint32_t s, d;
+ uint32_t* pd = dst;
+ const uint32_t* ps = src;
+ const uint32_t* pm = mask;
+
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+ __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+
+ while (w && ((unsigned long) pd & 15))
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
+ }
+
+ while (w >= 4)
+ {
+ xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
+ xmm_dst = load_128_aligned ((__m128i*) pd);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ pix_add_multiply_2x128 (
+ &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ w -= 4;
+ if (pm)
+ pm += 4;
+ }
+
+ while (w)
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
+ }
+}
+
+static force_inline void
+sse2_combine_add_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ int w = width;
+ uint32_t s, d;
+ uint32_t* pd = dst;
+ const uint32_t* ps = src;
+ const uint32_t* pm = mask;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ ps++;
+ if (pm)
+ pm++;
+ *pd++ = _mm_cvtsi128_si32 (
+ _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ __m128i s;
+
+ s = combine4 ((__m128i*)ps, (__m128i*)pm);
+
+ save_128_aligned (
+ (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));
+
+ pd += 4;
+ ps += 4;
+ if (pm)
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w--)
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ ps++;
+ *pd++ = _mm_cvtsi128_si32 (
+ _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
+ if (pm)
+ pm++;
+ }
+}
+
+static force_inline uint32_t
+core_combine_saturate_u_pixel_sse2 (uint32_t src,
+ uint32_t dst)
+{
+ __m128i ms = unpack_32_1x128 (src);
+ __m128i md = unpack_32_1x128 (dst);
+ uint32_t sa = src >> 24;
+ uint32_t da = ~dst >> 24;
+
+ if (sa > da)
+ {
+ ms = pix_multiply_1x128 (
+ ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
+ }
+
+ return pack_1x128_32 (_mm_adds_epu16 (md, ms));
+}
+
+static void
+sse2_combine_saturate_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
+{
+ uint32_t s, d;
+
+ uint32_t pack_cmp;
+ __m128i xmm_src, xmm_dst;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst = load_128_aligned ((__m128i*)pd);
+ xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
+
+ pack_cmp = _mm_movemask_epi8 (
+ _mm_cmpgt_epi32 (
+ _mm_srli_epi32 (xmm_src, 24),
+ _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
+
+ /* if some alpha src is grater than respective ~alpha dst */
+ if (pack_cmp)
+ {
+ s = combine1 (ps++, pm);
+ d = *pd;
+ *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+ if (pm)
+ pm++;
+
+ s = combine1 (ps++, pm);
+ d = *pd;
+ *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+ if (pm)
+ pm++;
+
+ s = combine1 (ps++, pm);
+ d = *pd;
+ *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+ if (pm)
+ pm++;
+
+ s = combine1 (ps++, pm);
+ d = *pd;
+ *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+ if (pm)
+ pm++;
+ }
+ else
+ {
+ save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
+
+ pd += 4;
+ ps += 4;
+ if (pm)
+ pm += 4;
+ }
+
+ w -= 4;
+ }
+
+ while (w--)
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+ ps++;
+ if (pm)
+ pm++;
+ }
+}
+
+static void
+sse2_combine_src_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
+{
+ uint32_t s, m;
+
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ s = *ps++;
+ m = *pm++;
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ s = *ps++;
+ m = *pm++;
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
+ w--;
+ }
+}
+
+static force_inline uint32_t
+core_combine_over_ca_pixel_sse2 (uint32_t src,
+ uint32_t mask,
+ uint32_t dst)
+{
+ __m128i s = unpack_32_1x128 (src);
+ __m128i expAlpha = expand_alpha_1x128 (s);
+ __m128i unpk_mask = unpack_32_1x128 (mask);
+ __m128i unpk_dst = unpack_32_1x128 (dst);
+
+ return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
+}
+
+static void
+sse2_combine_over_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
+{
+ uint32_t s, m, d;
+
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
+ w--;
+ }
+}
+
+static force_inline uint32_t
+core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
+ uint32_t mask,
+ uint32_t dst)
+{
+ __m128i d = unpack_32_1x128 (dst);
+
+ return pack_1x128_32 (
+ over_1x128 (d, expand_alpha_1x128 (d),
+ pix_multiply_1x128 (unpack_32_1x128 (src),
+ unpack_32_1x128 (mask))));
+}
+
+static void
+sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
+{
+ uint32_t s, m, d;
+
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
+ w--;
+ }
+}
+
+static void
+sse2_combine_in_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
+{
+ uint32_t s, m, d;
+
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
+ expand_alpha_1x128 (unpack_32_1x128 (d))));
+
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (s), unpack_32_1x128 (m)),
+ expand_alpha_1x128 (unpack_32_1x128 (d))));
+
+ w--;
+ }
+}
+
+static void
+sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
+{
+ uint32_t s, m, d;
+
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (d),
+ pix_multiply_1x128 (unpack_32_1x128 (m),
+ expand_alpha_1x128 (unpack_32_1x128 (s)))));
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+ pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (d),
+ pix_multiply_1x128 (unpack_32_1x128 (m),
+ expand_alpha_1x128 (unpack_32_1x128 (s)))));
+ w--;
+ }
+}
+
+static void
+sse2_combine_out_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
+{
+ uint32_t s, m, d;
+
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (s), unpack_32_1x128 (m)),
+ negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+ negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+ pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (s), unpack_32_1x128 (m)),
+ negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
+
+ w--;
+ }
+}
+
+static void
+sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
+{
+ uint32_t s, m, d;
+
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (d),
+ negate_1x128 (pix_multiply_1x128 (
+ unpack_32_1x128 (m),
+ expand_alpha_1x128 (unpack_32_1x128 (s))))));
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ negate_2x128 (xmm_mask_lo, xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (d),
+ negate_1x128 (pix_multiply_1x128 (
+ unpack_32_1x128 (m),
+ expand_alpha_1x128 (unpack_32_1x128 (s))))));
+ w--;
+ }
+}
+
+static force_inline uint32_t
+core_combine_atop_ca_pixel_sse2 (uint32_t src,
+ uint32_t mask,
+ uint32_t dst)
+{
+ __m128i m = unpack_32_1x128 (mask);
+ __m128i s = unpack_32_1x128 (src);
+ __m128i d = unpack_32_1x128 (dst);
+ __m128i sa = expand_alpha_1x128 (s);
+ __m128i da = expand_alpha_1x128 (d);
+
+ s = pix_multiply_1x128 (s, m);
+ m = negate_1x128 (pix_multiply_1x128 (m, sa));
+
+ return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
+}
+
+static void
+sse2_combine_atop_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
+{
+ uint32_t s, m, d;
+
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+ __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi);
+ pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ pix_add_multiply_2x128 (
+ &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
+ w--;
+ }
+}
+
+static force_inline uint32_t
+core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
+ uint32_t mask,
+ uint32_t dst)
+{
+ __m128i m = unpack_32_1x128 (mask);
+ __m128i s = unpack_32_1x128 (src);
+ __m128i d = unpack_32_1x128 (dst);
+
+ __m128i da = negate_1x128 (expand_alpha_1x128 (d));
+ __m128i sa = expand_alpha_1x128 (s);
+
+ s = pix_multiply_1x128 (s, m);
+ m = pix_multiply_1x128 (m, sa);
+
+ return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
+}
+
+static void
+sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
+{
+ uint32_t s, m, d;
+
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+ __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi);
+ pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ pix_add_multiply_2x128 (
+ &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
+ w--;
+ }
+}
+
+static force_inline uint32_t
+core_combine_xor_ca_pixel_sse2 (uint32_t src,
+ uint32_t mask,
+ uint32_t dst)
+{
+ __m128i a = unpack_32_1x128 (mask);
+ __m128i s = unpack_32_1x128 (src);
+ __m128i d = unpack_32_1x128 (dst);
+
+ __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
+ a, expand_alpha_1x128 (s)));
+ __m128i dest = pix_multiply_1x128 (s, a);
+ __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
+
+ return pack_1x128_32 (pix_add_multiply_1x128 (&d,
+ &alpha_dst,
+ &dest,
+ &alpha_src));
+}
+
+static void
+sse2_combine_xor_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
+{
+ uint32_t s, m, d;
+
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+ __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi);
+ pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+ negate_2x128 (xmm_mask_lo, xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ pix_add_multiply_2x128 (
+ &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
+ w--;
+ }
+}
+
+static void
+sse2_combine_add_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
+{
+ uint32_t s, m, d;
+
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x128_32 (
+ _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
+ unpack_32_1x128 (m)),
+ unpack_32_1x128 (d)));
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (
+ _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
+ _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x128_32 (
+ _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
+ unpack_32_1x128 (m)),
+ unpack_32_1x128 (d)));
+ w--;
+ }
+}
+
+static force_inline __m128i
+create_mask_16_128 (uint16_t mask)
+{
+ return _mm_set1_epi16 (mask);
+}
+
+/* Work around a code generation bug in Sun Studio 12. */
+#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
+# define create_mask_2x32_128(mask0, mask1) \
+ (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
+#else
+static force_inline __m128i
+create_mask_2x32_128 (uint32_t mask0,
+ uint32_t mask1)
+{
+ return _mm_set_epi32 (mask0, mask1, mask0, mask1);
+}
+#endif
+
+static void
+sse2_composite_over_n_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src;
+ uint32_t *dst_line, *dst, d;
+ int32_t w;
+ int dst_stride;
+ __m128i xmm_src, xmm_alpha;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+ src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+ xmm_src = expand_pixel_32_1x128 (src);
+ xmm_alpha = expand_alpha_1x128 (xmm_src);
+
+ while (height--)
+ {
+ dst = dst_line;
+
+ dst_line += dst_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ d = *dst;
+ *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
+ xmm_alpha,
+ unpack_32_1x128 (d)));
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ /* rebuid the 4 pixel data and save*/
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ w -= 4;
+ dst += 4;
+ }
+
+ while (w)
+ {
+ d = *dst;
+ *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
+ xmm_alpha,
+ unpack_32_1x128 (d)));
+ w--;
+ }
+
+ }
+}
+
+static void
+sse2_composite_over_n_0565 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src;
+ uint16_t *dst_line, *dst, d;
+ int32_t w;
+ int dst_stride;
+ __m128i xmm_src, xmm_alpha;
+ __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+ src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+ xmm_src = expand_pixel_32_1x128 (src);
+ xmm_alpha = expand_alpha_1x128 (xmm_src);
+
+ while (height--)
+ {
+ dst = dst_line;
+
+ dst_line += dst_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ d = *dst;
+
+ *dst++ = pack_565_32_16 (
+ pack_1x128_32 (over_1x128 (xmm_src,
+ xmm_alpha,
+ expand565_16_1x128 (d))));
+ w--;
+ }
+
+ while (w >= 8)
+ {
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_565_128_4x128 (xmm_dst,
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+
+ over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_dst0, &xmm_dst1);
+ over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_dst2, &xmm_dst3);
+
+ xmm_dst = pack_565_4x128_128 (
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+
+ save_128_aligned ((__m128i*)dst, xmm_dst);
+
+ dst += 8;
+ w -= 8;
+ }
+
+ while (w--)
+ {
+ d = *dst;
+ *dst++ = pack_565_32_16 (
+ pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
+ expand565_16_1x128 (d))));
+ }
+ }
+
+}
+
+static void
+sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca;
+ uint32_t *dst_line, d;
+ uint32_t *mask_line, m;
+ uint32_t pack_cmp;
+ int dst_stride, mask_stride;
+
+ __m128i xmm_src, xmm_alpha;
+ __m128i xmm_dst;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+ __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+ src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+ srca = src >> 24;
+
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+ xmm_src = _mm_unpacklo_epi8 (
+ create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
+ xmm_alpha = expand_alpha_1x128 (xmm_src);
+ mmx_src = xmm_src;
+ mmx_alpha = xmm_alpha;
+
+ while (height--)
+ {
+ int w = width;
+ const uint32_t *pm = (uint32_t *)mask_line;
+ uint32_t *pd = (uint32_t *)dst_line;
+
+ dst_line += dst_stride;
+ mask_line += mask_stride;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ m = *pm++;
+
+ if (m)
+ {
+ d = *pd;
+
+ mmx_mask = unpack_32_1x128 (m);
+ mmx_dest = unpack_32_1x128 (d);
+
+ *pd = pack_1x128_32 (
+ _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
+ mmx_dest));
+ }
+
+ pd++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_mask = load_128_unaligned ((__m128i*)pm);
+
+ pack_cmp =
+ _mm_movemask_epi8 (
+ _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+ /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
+ if (pack_cmp != 0xffff)
+ {
+ xmm_dst = load_128_aligned ((__m128i*)pd);
+
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+ pix_multiply_2x128 (&xmm_src, &xmm_src,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+ xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
+ }
+
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ m = *pm++;
+
+ if (m)
+ {
+ d = *pd;
+
+ mmx_mask = unpack_32_1x128 (m);
+ mmx_dest = unpack_32_1x128 (d);
+
+ *pd = pack_1x128_32 (
+ _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
+ mmx_dest));
+ }
+
+ pd++;
+ w--;
+ }
+ }
+
+}
+
+static void
+sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src;
+ uint32_t *dst_line, d;
+ uint32_t *mask_line, m;
+ uint32_t pack_cmp;
+ int dst_stride, mask_stride;
+
+ __m128i xmm_src, xmm_alpha;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+ __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+ src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+ xmm_src = _mm_unpacklo_epi8 (
+ create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
+ xmm_alpha = expand_alpha_1x128 (xmm_src);
+ mmx_src = xmm_src;
+ mmx_alpha = xmm_alpha;
+
+ while (height--)
+ {
+ int w = width;
+ const uint32_t *pm = (uint32_t *)mask_line;
+ uint32_t *pd = (uint32_t *)dst_line;
+
+ dst_line += dst_stride;
+ mask_line += mask_stride;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ m = *pm++;
+
+ if (m)
+ {
+ d = *pd;
+ mmx_mask = unpack_32_1x128 (m);
+ mmx_dest = unpack_32_1x128 (d);
+
+ *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
+ &mmx_alpha,
+ &mmx_mask,
+ &mmx_dest));
+ }
+
+ pd++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_mask = load_128_unaligned ((__m128i*)pm);
+
+ pack_cmp =
+ _mm_movemask_epi8 (
+ _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+ /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
+ if (pack_cmp != 0xffff)
+ {
+ xmm_dst = load_128_aligned ((__m128i*)pd);
+
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ in_over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ m = *pm++;
+
+ if (m)
+ {
+ d = *pd;
+ mmx_mask = unpack_32_1x128 (m);
+ mmx_dest = unpack_32_1x128 (d);
+
+ *pd = pack_1x128_32 (
+ in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
+ }
+
+ pd++;
+ w--;
+ }
+ }
+
+}
+
+static void
+sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ uint32_t mask;
+ int32_t w;
+ int dst_stride, src_stride;
+
+ __m128i xmm_mask;
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
+
+ xmm_mask = create_mask_16_128 (mask >> 24);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ uint32_t s = *src++;
+
+ if (s)
+ {
+ uint32_t d = *dst;
+
+ __m128i ms = unpack_32_1x128 (s);
+ __m128i alpha = expand_alpha_1x128 (ms);
+ __m128i dest = xmm_mask;
+ __m128i alpha_dst = unpack_32_1x128 (d);
+
+ *dst = pack_1x128_32 (
+ in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
+ }
+ dst++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_src = load_128_unaligned ((__m128i*)src);
+
+ if (!is_zero (xmm_src))
+ {
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_mask, &xmm_mask,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ dst += 4;
+ src += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ uint32_t s = *src++;
+
+ if (s)
+ {
+ uint32_t d = *dst;
+
+ __m128i ms = unpack_32_1x128 (s);
+ __m128i alpha = expand_alpha_1x128 (ms);
+ __m128i mask = xmm_mask;
+ __m128i dest = unpack_32_1x128 (d);
+
+ *dst = pack_1x128_32 (
+ in_over_1x128 (&ms, &alpha, &mask, &dest));
+ }
+
+ dst++;
+ w--;
+ }
+ }
+
+}
+
+static void
+sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ int32_t w;
+ int dst_stride, src_stride;
+
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ *dst++ = *src++ | 0xff000000;
+ w--;
+ }
+
+ while (w >= 16)
+ {
+ __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
+
+ xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
+ xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
+ xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
+ xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
+
+ save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
+ save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
+ save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
+ save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
+
+ dst += 16;
+ src += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ *dst++ = *src++ | 0xff000000;
+ w--;
+ }
+ }
+
+}
+
+static void
+sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ uint32_t mask;
+ int dst_stride, src_stride;
+ int32_t w;
+
+ __m128i xmm_mask, xmm_alpha;
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
+
+ xmm_mask = create_mask_16_128 (mask >> 24);
+ xmm_alpha = mask_00ff;
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ uint32_t s = (*src++) | 0xff000000;
+ uint32_t d = *dst;
+
+ __m128i src = unpack_32_1x128 (s);
+ __m128i alpha = xmm_alpha;
+ __m128i mask = xmm_mask;
+ __m128i dest = unpack_32_1x128 (d);
+
+ *dst++ = pack_1x128_32 (
+ in_over_1x128 (&src, &alpha, &mask, &dest));
+
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_src = _mm_or_si128 (
+ load_128_unaligned ((__m128i*)src), mask_ff000000);
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask, &xmm_mask,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ dst += 4;
+ src += 4;
+ w -= 4;
+
+ }
+
+ while (w)
+ {
+ uint32_t s = (*src++) | 0xff000000;
+ uint32_t d = *dst;
+
+ __m128i src = unpack_32_1x128 (s);
+ __m128i alpha = xmm_alpha;
+ __m128i mask = xmm_mask;
+ __m128i dest = unpack_32_1x128 (d);
+
+ *dst++ = pack_1x128_32 (
+ in_over_1x128 (&src, &alpha, &mask, &dest));
+
+ w--;
+ }
+ }
+
+}
+
+static void
+sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ int dst_stride, src_stride;
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ dst = dst_line;
+ src = src_line;
+
+ while (height--)
+ {
+ sse2_combine_over_u (imp, op, dst, src, NULL, width);
+
+ dst += dst_stride;
+ src += src_stride;
+ }
+}
+
+static force_inline uint16_t
+composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
+{
+ __m128i ms;
+
+ ms = unpack_32_1x128 (src);
+ return pack_565_32_16 (
+ pack_1x128_32 (
+ over_1x128 (
+ ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
+}
+
+static void
+sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint16_t *dst_line, *dst, d;
+ uint32_t *src_line, *src, s;
+ int dst_stride, src_stride;
+ int32_t w;
+
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ src = src_line;
+
+ dst_line += dst_stride;
+ src_line += src_stride;
+ w = width;
+
+ /* Align dst on a 16-byte boundary */
+ while (w &&
+ ((unsigned long)dst & 15))
+ {
+ s = *src++;
+ d = *dst;
+
+ *dst++ = composite_over_8888_0565pixel (s, d);
+ w--;
+ }
+
+ /* It's a 8 pixel loop */
+ while (w >= 8)
+ {
+ /* I'm loading unaligned because I'm not sure
+ * about the address alignment.
+ */
+ xmm_src = load_128_unaligned ((__m128i*) src);
+ xmm_dst = load_128_aligned ((__m128i*) dst);
+
+ /* Unpacking */
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_565_128_4x128 (xmm_dst,
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ /* I'm loading next 4 pixels from memory
+ * before to optimze the memory read.
+ */
+ xmm_src = load_128_unaligned ((__m128i*) (src + 4));
+
+ over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst0, &xmm_dst1);
+
+ /* Unpacking */
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst2, &xmm_dst3);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_565_4x128_128 (
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+ w -= 8;
+ dst += 8;
+ src += 8;
+ }
+
+ while (w--)
+ {
+ s = *src++;
+ d = *dst;
+
+ *dst++ = composite_over_8888_0565pixel (s, d);
+ }
+ }
+
+}
+
+static void
+sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca;
+ uint32_t *dst_line, *dst;
+ uint8_t *mask_line, *mask;
+ int dst_stride, mask_stride;
+ int32_t w;
+ uint32_t m, d;
+
+ __m128i xmm_src, xmm_alpha, xmm_def;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+ __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+ src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+ srca = src >> 24;
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+ xmm_def = create_mask_2x32_128 (src, src);
+ xmm_src = expand_pixel_32_1x128 (src);
+ xmm_alpha = expand_alpha_1x128 (xmm_src);
+ mmx_src = xmm_src;
+ mmx_alpha = xmm_alpha;
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ uint8_t m = *mask++;
+
+ if (m)
+ {
+ d = *dst;
+ mmx_mask = expand_pixel_8_1x128 (m);
+ mmx_dest = unpack_32_1x128 (d);
+
+ *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
+ &mmx_alpha,
+ &mmx_mask,
+ &mmx_dest));
+ }
+
+ w--;
+ dst++;
+ }
+
+ while (w >= 4)
+ {
+ m = *((uint32_t*)mask);
+
+ if (srca == 0xff && m == 0xffffffff)
+ {
+ save_128_aligned ((__m128i*)dst, xmm_def);
+ }
+ else if (m)
+ {
+ xmm_dst = load_128_aligned ((__m128i*) dst);
+ xmm_mask = unpack_32_1x128 (m);
+ xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+ /* Unpacking */
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ in_over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ w -= 4;
+ dst += 4;
+ mask += 4;
+ }
+
+ while (w)
+ {
+ uint8_t m = *mask++;
+
+ if (m)
+ {
+ d = *dst;
+ mmx_mask = expand_pixel_8_1x128 (m);
+ mmx_dest = unpack_32_1x128 (d);
+
+ *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
+ &mmx_alpha,
+ &mmx_mask,
+ &mmx_dest));
+ }
+
+ w--;
+ dst++;
+ }
+ }
+
+}
+
+static pixman_bool_t
+pixman_fill_sse2 (uint32_t *bits,
+ int stride,
+ int bpp,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t data)
+{
+ uint32_t byte_width;
+ uint8_t *byte_line;
+
+ __m128i xmm_def;
+
+ if (bpp == 8)
+ {
+ uint8_t b;
+ uint16_t w;
+
+ stride = stride * (int) sizeof (uint32_t) / 1;
+ byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
+ byte_width = width;
+ stride *= 1;
+
+ b = data & 0xff;
+ w = (b << 8) | b;
+ data = (w << 16) | w;
+ }
+ else if (bpp == 16)
+ {
+ stride = stride * (int) sizeof (uint32_t) / 2;
+ byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
+ byte_width = 2 * width;
+ stride *= 2;
+
+ data = (data & 0xffff) * 0x00010001;
+ }
+ else if (bpp == 32)
+ {
+ stride = stride * (int) sizeof (uint32_t) / 4;
+ byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
+ byte_width = 4 * width;
+ stride *= 4;
+ }
+ else
+ {
+ return FALSE;
+ }
+
+ xmm_def = create_mask_2x32_128 (data, data);
+
+ while (height--)
+ {
+ int w;
+ uint8_t *d = byte_line;
+ byte_line += stride;
+ w = byte_width;
+
+ while (w >= 1 && ((unsigned long)d & 1))
+ {
+ *(uint8_t *)d = data;
+ w -= 1;
+ d += 1;
+ }
+
+ while (w >= 2 && ((unsigned long)d & 3))
+ {
+ *(uint16_t *)d = data;
+ w -= 2;
+ d += 2;
+ }
+
+ while (w >= 4 && ((unsigned long)d & 15))
+ {
+ *(uint32_t *)d = data;
+
+ w -= 4;
+ d += 4;
+ }
+
+ while (w >= 128)
+ {
+ save_128_aligned ((__m128i*)(d), xmm_def);
+ save_128_aligned ((__m128i*)(d + 16), xmm_def);
+ save_128_aligned ((__m128i*)(d + 32), xmm_def);
+ save_128_aligned ((__m128i*)(d + 48), xmm_def);
+ save_128_aligned ((__m128i*)(d + 64), xmm_def);
+ save_128_aligned ((__m128i*)(d + 80), xmm_def);
+ save_128_aligned ((__m128i*)(d + 96), xmm_def);
+ save_128_aligned ((__m128i*)(d + 112), xmm_def);
+
+ d += 128;
+ w -= 128;
+ }
+
+ if (w >= 64)
+ {
+ save_128_aligned ((__m128i*)(d), xmm_def);
+ save_128_aligned ((__m128i*)(d + 16), xmm_def);
+ save_128_aligned ((__m128i*)(d + 32), xmm_def);
+ save_128_aligned ((__m128i*)(d + 48), xmm_def);
+
+ d += 64;
+ w -= 64;
+ }
+
+ if (w >= 32)
+ {
+ save_128_aligned ((__m128i*)(d), xmm_def);
+ save_128_aligned ((__m128i*)(d + 16), xmm_def);
+
+ d += 32;
+ w -= 32;
+ }
+
+ if (w >= 16)
+ {
+ save_128_aligned ((__m128i*)(d), xmm_def);
+
+ d += 16;
+ w -= 16;
+ }
+
+ while (w >= 4)
+ {
+ *(uint32_t *)d = data;
+
+ w -= 4;
+ d += 4;
+ }
+
+ if (w >= 2)
+ {
+ *(uint16_t *)d = data;
+ w -= 2;
+ d += 2;
+ }
+
+ if (w >= 1)
+ {
+ *(uint8_t *)d = data;
+ w -= 1;
+ d += 1;
+ }
+ }
+
+ return TRUE;
+}
+
+static void
+sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca;
+ uint32_t *dst_line, *dst;
+ uint8_t *mask_line, *mask;
+ int dst_stride, mask_stride;
+ int32_t w;
+ uint32_t m;
+
+ __m128i xmm_src, xmm_def;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+ src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+ srca = src >> 24;
+ if (src == 0)
+ {
+ pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
+ PIXMAN_FORMAT_BPP (dst_image->bits.format),
+ dest_x, dest_y, width, height, 0);
+ return;
+ }
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+ xmm_def = create_mask_2x32_128 (src, src);
+ xmm_src = expand_pixel_32_1x128 (src);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ uint8_t m = *mask++;
+
+ if (m)
+ {
+ *dst = pack_1x128_32 (
+ pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
+ }
+ else
+ {
+ *dst = 0;
+ }
+
+ w--;
+ dst++;
+ }
+
+ while (w >= 4)
+ {
+ m = *((uint32_t*)mask);
+
+ if (srca == 0xff && m == 0xffffffff)
+ {
+ save_128_aligned ((__m128i*)dst, xmm_def);
+ }
+ else if (m)
+ {
+ xmm_mask = unpack_32_1x128 (m);
+ xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+ /* Unpacking */
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ pix_multiply_2x128 (&xmm_src, &xmm_src,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
+ }
+ else
+ {
+ save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
+ }
+
+ w -= 4;
+ dst += 4;
+ mask += 4;
+ }
+
+ while (w)
+ {
+ uint8_t m = *mask++;
+
+ if (m)
+ {
+ *dst = pack_1x128_32 (
+ pix_multiply_1x128 (
+ xmm_src, expand_pixel_8_1x128 (m)));
+ }
+ else
+ {
+ *dst = 0;
+ }
+
+ w--;
+ dst++;
+ }
+ }
+
+}
+
+static void
+sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca;
+ uint16_t *dst_line, *dst, d;
+ uint8_t *mask_line, *mask;
+ int dst_stride, mask_stride;
+ int32_t w;
+ uint32_t m;
+ __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+ __m128i xmm_src, xmm_alpha;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+ __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+ src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+ srca = src >> 24;
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+ xmm_src = expand_pixel_32_1x128 (src);
+ xmm_alpha = expand_alpha_1x128 (xmm_src);
+ mmx_src = xmm_src;
+ mmx_alpha = xmm_alpha;
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ m = *mask++;
+
+ if (m)
+ {
+ d = *dst;
+ mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+ mmx_dest = expand565_16_1x128 (d);
+
+ *dst = pack_565_32_16 (
+ pack_1x128_32 (
+ in_over_1x128 (
+ &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+ }
+
+ w--;
+ dst++;
+ }
+
+ while (w >= 8)
+ {
+ xmm_dst = load_128_aligned ((__m128i*) dst);
+ unpack_565_128_4x128 (xmm_dst,
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+
+ m = *((uint32_t*)mask);
+ mask += 4;
+
+ if (m)
+ {
+ xmm_mask = unpack_32_1x128 (m);
+ xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+ /* Unpacking */
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ in_over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst0, &xmm_dst1);
+ }
+
+ m = *((uint32_t*)mask);
+ mask += 4;
+
+ if (m)
+ {
+ xmm_mask = unpack_32_1x128 (m);
+ xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+ /* Unpacking */
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+ in_over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst2, &xmm_dst3);
+ }
+
+ save_128_aligned (
+ (__m128i*)dst, pack_565_4x128_128 (
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+ w -= 8;
+ dst += 8;
+ }
+
+ while (w)
+ {
+ m = *mask++;
+
+ if (m)
+ {
+ d = *dst;
+ mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+ mmx_dest = expand565_16_1x128 (d);
+
+ *dst = pack_565_32_16 (
+ pack_1x128_32 (
+ in_over_1x128 (
+ &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+ }
+
+ w--;
+ dst++;
+ }
+ }
+
+}
+
+static void
+sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint16_t *dst_line, *dst, d;
+ uint32_t *src_line, *src, s;
+ int dst_stride, src_stride;
+ int32_t w;
+ uint32_t opaque, zero;
+
+ __m128i ms;
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ s = *src++;
+ d = *dst;
+
+ ms = unpack_32_1x128 (s);
+
+ *dst++ = pack_565_32_16 (
+ pack_1x128_32 (
+ over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
+ w--;
+ }
+
+ while (w >= 8)
+ {
+ /* First round */
+ xmm_src = load_128_unaligned ((__m128i*)src);
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ opaque = is_opaque (xmm_src);
+ zero = is_zero (xmm_src);
+
+ unpack_565_128_4x128 (xmm_dst,
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+
+ /* preload next round*/
+ xmm_src = load_128_unaligned ((__m128i*)(src + 4));
+
+ if (opaque)
+ {
+ invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_dst0, &xmm_dst1);
+ }
+ else if (!zero)
+ {
+ over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_dst0, &xmm_dst1);
+ }
+
+ /* Second round */
+ opaque = is_opaque (xmm_src);
+ zero = is_zero (xmm_src);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+
+ if (opaque)
+ {
+ invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_dst2, &xmm_dst3);
+ }
+ else if (!zero)
+ {
+ over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_dst2, &xmm_dst3);
+ }
+
+ save_128_aligned (
+ (__m128i*)dst, pack_565_4x128_128 (
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+ w -= 8;
+ src += 8;
+ dst += 8;
+ }
+
+ while (w)
+ {
+ s = *src++;
+ d = *dst;
+
+ ms = unpack_32_1x128 (s);
+
+ *dst++ = pack_565_32_16 (
+ pack_1x128_32 (
+ over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
+ w--;
+ }
+ }
+
+}
+
+static void
+sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *dst_line, *dst, d;
+ uint32_t *src_line, *src, s;
+ int dst_stride, src_stride;
+ int32_t w;
+ uint32_t opaque, zero;
+
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ s = *src++;
+ d = *dst;
+
+ *dst++ = pack_1x128_32 (
+ over_rev_non_pre_1x128 (
+ unpack_32_1x128 (s), unpack_32_1x128 (d)));
+
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_src_hi = load_128_unaligned ((__m128i*)src);
+
+ opaque = is_opaque (xmm_src_hi);
+ zero = is_zero (xmm_src_hi);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+
+ if (opaque)
+ {
+ invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+ else if (!zero)
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ w -= 4;
+ dst += 4;
+ src += 4;
+ }
+
+ while (w)
+ {
+ s = *src++;
+ d = *dst;
+
+ *dst++ = pack_1x128_32 (
+ over_rev_non_pre_1x128 (
+ unpack_32_1x128 (s), unpack_32_1x128 (d)));
+
+ w--;
+ }
+ }
+
+}
+
+static void
+sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src;
+ uint16_t *dst_line, *dst, d;
+ uint32_t *mask_line, *mask, m;
+ int dst_stride, mask_stride;
+ int w;
+ uint32_t pack_cmp;
+
+ __m128i xmm_src, xmm_alpha;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+ __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+ __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+ src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+ xmm_src = expand_pixel_32_1x128 (src);
+ xmm_alpha = expand_alpha_1x128 (xmm_src);
+ mmx_src = xmm_src;
+ mmx_alpha = xmm_alpha;
+
+ while (height--)
+ {
+ w = width;
+ mask = mask_line;
+ dst = dst_line;
+ mask_line += mask_stride;
+ dst_line += dst_stride;
+
+ while (w && ((unsigned long)dst & 15))
+ {
+ m = *(uint32_t *) mask;
+
+ if (m)
+ {
+ d = *dst;
+ mmx_mask = unpack_32_1x128 (m);
+ mmx_dest = expand565_16_1x128 (d);
+
+ *dst = pack_565_32_16 (
+ pack_1x128_32 (
+ in_over_1x128 (
+ &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+ }
+
+ w--;
+ dst++;
+ mask++;
+ }
+
+ while (w >= 8)
+ {
+ /* First round */
+ xmm_mask = load_128_unaligned ((__m128i*)mask);
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ pack_cmp = _mm_movemask_epi8 (
+ _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+ unpack_565_128_4x128 (xmm_dst,
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+ /* preload next round */
+ xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
+
+ /* preload next round */
+ if (pack_cmp != 0xffff)
+ {
+ in_over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst0, &xmm_dst1);
+ }
+
+ /* Second round */
+ pack_cmp = _mm_movemask_epi8 (
+ _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+ if (pack_cmp != 0xffff)
+ {
+ in_over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst2, &xmm_dst3);
+ }
+
+ save_128_aligned (
+ (__m128i*)dst, pack_565_4x128_128 (
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+ w -= 8;
+ dst += 8;
+ mask += 8;
+ }
+
+ while (w)
+ {
+ m = *(uint32_t *) mask;
+
+ if (m)
+ {
+ d = *dst;
+ mmx_mask = unpack_32_1x128 (m);
+ mmx_dest = expand565_16_1x128 (d);
+
+ *dst = pack_565_32_16 (
+ pack_1x128_32 (
+ in_over_1x128 (
+ &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+ }
+
+ w--;
+ dst++;
+ mask++;
+ }
+ }
+
+}
+
+static void
+sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_line, *dst;
+ uint8_t *mask_line, *mask;
+ int dst_stride, mask_stride;
+ uint32_t d, m;
+ uint32_t src;
+ uint8_t sa;
+ int32_t w;
+
+ __m128i xmm_alpha;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+ src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+ sa = src >> 24;
+
+ xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w && ((unsigned long)dst & 15))
+ {
+ m = (uint32_t) *mask++;
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x128_32 (
+ pix_multiply_1x128 (
+ pix_multiply_1x128 (xmm_alpha,
+ unpack_32_1x128 (m)),
+ unpack_32_1x128 (d)));
+ w--;
+ }
+
+ while (w >= 16)
+ {
+ xmm_mask = load_128_unaligned ((__m128i*)mask);
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ mask += 16;
+ dst += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ m = (uint32_t) *mask++;
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x128_32 (
+ pix_multiply_1x128 (
+ pix_multiply_1x128 (
+ xmm_alpha, unpack_32_1x128 (m)),
+ unpack_32_1x128 (d)));
+ w--;
+ }
+ }
+
+}
+
+static void
+sse2_composite_in_n_8 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_line, *dst;
+ int dst_stride;
+ uint32_t d;
+ uint32_t src;
+ int32_t w;
+
+ __m128i xmm_alpha;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+ src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+ xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
+
+ src = src >> 24;
+
+ if (src == 0xff)
+ return;
+
+ if (src == 0x00)
+ {
+ pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
+ 8, dest_x, dest_y, width, height, src);
+
+ return;
+ }
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ w = width;
+
+ while (w && ((unsigned long)dst & 15))
+ {
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x128_32 (
+ pix_multiply_1x128 (
+ xmm_alpha,
+ unpack_32_1x128 (d)));
+ w--;
+ }
+
+ while (w >= 16)
+ {
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
+ &xmm_dst_lo, &xmm_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ dst += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x128_32 (
+ pix_multiply_1x128 (
+ xmm_alpha,
+ unpack_32_1x128 (d)));
+ w--;
+ }
+ }
+
+}
+
+static void
+sse2_composite_in_8_8 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_line, *dst;
+ uint8_t *src_line, *src;
+ int src_stride, dst_stride;
+ int32_t w;
+ uint32_t s, d;
+
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w && ((unsigned long)dst & 15))
+ {
+ s = (uint32_t) *src++;
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (s), unpack_32_1x128 (d)));
+ w--;
+ }
+
+ while (w >= 16)
+ {
+ xmm_src = load_128_unaligned ((__m128i*)src);
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ src += 16;
+ dst += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ s = (uint32_t) *src++;
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x128_32 (
+ pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
+ w--;
+ }
+ }
+
+}
+
+static void
+sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_line, *dst;
+ uint8_t *mask_line, *mask;
+ int dst_stride, mask_stride;
+ int32_t w;
+ uint32_t src;
+ uint8_t sa;
+ uint32_t m, d;
+
+ __m128i xmm_alpha;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+ src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+ sa = src >> 24;
+
+ xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w && ((unsigned long)dst & 15))
+ {
+ m = (uint32_t) *mask++;
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x128_32 (
+ _mm_adds_epu16 (
+ pix_multiply_1x128 (
+ xmm_alpha, unpack_32_1x128 (m)),
+ unpack_32_1x128 (d)));
+ w--;
+ }
+
+ while (w >= 16)
+ {
+ xmm_mask = load_128_unaligned ((__m128i*)mask);
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
+ xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ mask += 16;
+ dst += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ m = (uint32_t) *mask++;
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x128_32 (
+ _mm_adds_epu16 (
+ pix_multiply_1x128 (
+ xmm_alpha, unpack_32_1x128 (m)),
+ unpack_32_1x128 (d)));
+
+ w--;
+ }
+ }
+
+}
+
+static void
+sse2_composite_add_n_8 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_line, *dst;
+ int dst_stride;
+ int32_t w;
+ uint32_t src;
+
+ __m128i xmm_src;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+ src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+ src >>= 24;
+
+ if (src == 0x00)
+ return;
+
+ if (src == 0xff)
+ {
+ pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
+ 8, dest_x, dest_y, width, height, 0xff);
+
+ return;
+ }
+
+ src = (src << 24) | (src << 16) | (src << 8) | src;
+ xmm_src = _mm_set_epi32 (src, src, src, src);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ w = width;
+
+ while (w && ((unsigned long)dst & 15))
+ {
+ *dst = (uint8_t)_mm_cvtsi128_si32 (
+ _mm_adds_epu8 (
+ xmm_src,
+ _mm_cvtsi32_si128 (*dst)));
+
+ w--;
+ dst++;
+ }
+
+ while (w >= 16)
+ {
+ save_128_aligned (
+ (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
+
+ dst += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ *dst = (uint8_t)_mm_cvtsi128_si32 (
+ _mm_adds_epu8 (
+ xmm_src,
+ _mm_cvtsi32_si128 (*dst)));
+
+ w--;
+ dst++;
+ }
+ }
+
+}
+
+static void
+sse2_composite_add_8_8 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_line, *dst;
+ uint8_t *src_line, *src;
+ int dst_stride, src_stride;
+ int32_t w;
+ uint16_t t;
+
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ src = src_line;
+
+ dst_line += dst_stride;
+ src_line += src_stride;
+ w = width;
+
+ /* Small head */
+ while (w && (unsigned long)dst & 3)
+ {
+ t = (*dst) + (*src++);
+ *dst++ = t | (0 - (t >> 8));
+ w--;
+ }
+
+ sse2_combine_add_u (imp, op,
+ (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
+
+ /* Small tail */
+ dst += w & 0xfffc;
+ src += w & 0xfffc;
+
+ w &= 3;
+
+ while (w)
+ {
+ t = (*dst) + (*src++);
+ *dst++ = t | (0 - (t >> 8));
+ w--;
+ }
+ }
+
+}
+
+static void
+sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ int dst_stride, src_stride;
+
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+
+ sse2_combine_add_u (imp, op, dst, src, NULL, width);
+ }
+
+}
+
+static pixman_bool_t
+pixman_blt_sse2 (uint32_t *src_bits,
+ uint32_t *dst_bits,
+ int src_stride,
+ int dst_stride,
+ int src_bpp,
+ int dst_bpp,
+ int src_x,
+ int src_y,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height)
+{
+ uint8_t * src_bytes;
+ uint8_t * dst_bytes;
+ int byte_width;
+
+ if (src_bpp != dst_bpp)
+ return FALSE;
+
+ if (src_bpp == 16)
+ {
+ src_stride = src_stride * (int) sizeof (uint32_t) / 2;
+ dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
+ src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
+ dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
+ byte_width = 2 * width;
+ src_stride *= 2;
+ dst_stride *= 2;
+ }
+ else if (src_bpp == 32)
+ {
+ src_stride = src_stride * (int) sizeof (uint32_t) / 4;
+ dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
+ src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
+ dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
+ byte_width = 4 * width;
+ src_stride *= 4;
+ dst_stride *= 4;
+ }
+ else
+ {
+ return FALSE;
+ }
+
+ while (height--)
+ {
+ int w;
+ uint8_t *s = src_bytes;
+ uint8_t *d = dst_bytes;
+ src_bytes += src_stride;
+ dst_bytes += dst_stride;
+ w = byte_width;
+
+ while (w >= 2 && ((unsigned long)d & 3))
+ {
+ *(uint16_t *)d = *(uint16_t *)s;
+ w -= 2;
+ s += 2;
+ d += 2;
+ }
+
+ while (w >= 4 && ((unsigned long)d & 15))
+ {
+ *(uint32_t *)d = *(uint32_t *)s;
+
+ w -= 4;
+ s += 4;
+ d += 4;
+ }
+
+ while (w >= 64)
+ {
+ __m128i xmm0, xmm1, xmm2, xmm3;
+
+ xmm0 = load_128_unaligned ((__m128i*)(s));
+ xmm1 = load_128_unaligned ((__m128i*)(s + 16));
+ xmm2 = load_128_unaligned ((__m128i*)(s + 32));
+ xmm3 = load_128_unaligned ((__m128i*)(s + 48));
+
+ save_128_aligned ((__m128i*)(d), xmm0);
+ save_128_aligned ((__m128i*)(d + 16), xmm1);
+ save_128_aligned ((__m128i*)(d + 32), xmm2);
+ save_128_aligned ((__m128i*)(d + 48), xmm3);
+
+ s += 64;
+ d += 64;
+ w -= 64;
+ }
+
+ while (w >= 16)
+ {
+ save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
+
+ w -= 16;
+ d += 16;
+ s += 16;
+ }
+
+ while (w >= 4)
+ {
+ *(uint32_t *)d = *(uint32_t *)s;
+
+ w -= 4;
+ s += 4;
+ d += 4;
+ }
+
+ if (w >= 2)
+ {
+ *(uint16_t *)d = *(uint16_t *)s;
+ w -= 2;
+ s += 2;
+ d += 2;
+ }
+ }
+
+
+ return TRUE;
+}
+
+static void
+sse2_composite_copy_area (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ pixman_blt_sse2 (src_image->bits.bits,
+ dst_image->bits.bits,
+ src_image->bits.rowstride,
+ dst_image->bits.rowstride,
+ PIXMAN_FORMAT_BPP (src_image->bits.format),
+ PIXMAN_FORMAT_BPP (dst_image->bits.format),
+ src_x, src_y, dest_x, dest_y, width, height);
+}
+
+static void
+sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *src, *src_line, s;
+ uint32_t *dst, *dst_line, d;
+ uint8_t *mask, *mask_line;
+ uint32_t m;
+ int src_stride, mask_stride, dst_stride;
+ int32_t w;
+ __m128i ms;
+
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ while (height--)
+ {
+ src = src_line;
+ src_line += src_stride;
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ s = 0xff000000 | *src++;
+ m = (uint32_t) *mask++;
+ d = *dst;
+ ms = unpack_32_1x128 (s);
+
+ if (m != 0xff)
+ {
+ __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+ __m128i md = unpack_32_1x128 (d);
+
+ ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
+ }
+
+ *dst++ = pack_1x128_32 (ms);
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ m = *(uint32_t*) mask;
+ xmm_src = _mm_or_si128 (
+ load_128_unaligned ((__m128i*)src), mask_ff000000);
+
+ if (m == 0xffffffff)
+ {
+ save_128_aligned ((__m128i*)dst, xmm_src);
+ }
+ else
+ {
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_rev_2x128 (
+ xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ src += 4;
+ dst += 4;
+ mask += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ m = (uint32_t) *mask++;
+
+ if (m)
+ {
+ s = 0xff000000 | *src;
+
+ if (m == 0xff)
+ {
+ *dst = s;
+ }
+ else
+ {
+ __m128i ma, md, ms;
+
+ d = *dst;
+
+ ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+ md = unpack_32_1x128 (d);
+ ms = unpack_32_1x128 (s);
+
+ *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
+ }
+
+ }
+
+ src++;
+ dst++;
+ w--;
+ }
+ }
+
+}
+
+static void
+sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *src, *src_line, s;
+ uint32_t *dst, *dst_line, d;
+ uint8_t *mask, *mask_line;
+ uint32_t m;
+ int src_stride, mask_stride, dst_stride;
+ int32_t w;
+
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ while (height--)
+ {
+ src = src_line;
+ src_line += src_stride;
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ uint32_t sa;
+
+ s = *src++;
+ m = (uint32_t) *mask++;
+ d = *dst;
+
+ sa = s >> 24;
+
+ if (m)
+ {
+ if (sa == 0xff && m == 0xff)
+ {
+ *dst = s;
+ }
+ else
+ {
+ __m128i ms, md, ma, msa;
+
+ ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+ ms = unpack_32_1x128 (s);
+ md = unpack_32_1x128 (d);
+
+ msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+ *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+ }
+ }
+
+ dst++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ m = *(uint32_t *) mask;
+
+ if (m)
+ {
+ xmm_src = load_128_unaligned ((__m128i*)src);
+
+ if (m == 0xffffffff && is_opaque (xmm_src))
+ {
+ save_128_aligned ((__m128i *)dst, xmm_src);
+ }
+ else
+ {
+ xmm_dst = load_128_aligned ((__m128i *)dst);
+
+ xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
+ expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
+ &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+ }
+
+ src += 4;
+ dst += 4;
+ mask += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ uint32_t sa;
+
+ s = *src++;
+ m = (uint32_t) *mask++;
+ d = *dst;
+
+ sa = s >> 24;
+
+ if (m)
+ {
+ if (sa == 0xff && m == 0xff)
+ {
+ *dst = s;
+ }
+ else
+ {
+ __m128i ms, md, ma, msa;
+
+ ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+ ms = unpack_32_1x128 (s);
+ md = unpack_32_1x128 (d);
+
+ msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+ *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+ }
+ }
+
+ dst++;
+ w--;
+ }
+ }
+
+}
+
+static void
+sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src;
+ uint32_t *dst_line, *dst;
+ __m128i xmm_src;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_dsta_hi, xmm_dsta_lo;
+ int dst_stride;
+ int32_t w;
+
+ src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+ xmm_src = expand_pixel_32_1x128 (src);
+
+ while (height--)
+ {
+ dst = dst_line;
+
+ dst_line += dst_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ __m128i vd;
+
+ vd = unpack_32_1x128 (*dst);
+
+ *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
+ xmm_src));
+ w--;
+ dst++;
+ }
+
+ while (w >= 4)
+ {
+ __m128i tmp_lo, tmp_hi;
+
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
+
+ tmp_lo = xmm_src;
+ tmp_hi = xmm_src;
+
+ over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_dsta_lo, &xmm_dsta_hi,
+ &tmp_lo, &tmp_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
+
+ w -= 4;
+ dst += 4;
+ }
+
+ while (w)
+ {
+ __m128i vd;
+
+ vd = unpack_32_1x128 (*dst);
+
+ *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
+ xmm_src));
+ w--;
+ dst++;
+ }
+
+ }
+
+}
+
+static void
+sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *src, *src_line, s;
+ uint32_t *dst, *dst_line, d;
+ uint32_t *mask, *mask_line;
+ uint32_t m;
+ int src_stride, mask_stride, dst_stride;
+ int32_t w;
+
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ while (height--)
+ {
+ src = src_line;
+ src_line += src_stride;
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ uint32_t sa;
+
+ s = *src++;
+ m = (*mask++) >> 24;
+ d = *dst;
+
+ sa = s >> 24;
+
+ if (m)
+ {
+ if (sa == 0xff && m == 0xff)
+ {
+ *dst = s;
+ }
+ else
+ {
+ __m128i ms, md, ma, msa;
+
+ ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+ ms = unpack_32_1x128 (s);
+ md = unpack_32_1x128 (d);
+
+ msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+ *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+ }
+ }
+
+ dst++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_mask = load_128_unaligned ((__m128i*)mask);
+
+ if (!is_transparent (xmm_mask))
+ {
+ xmm_src = load_128_unaligned ((__m128i*)src);
+
+ if (is_opaque (xmm_mask) && is_opaque (xmm_src))
+ {
+ save_128_aligned ((__m128i *)dst, xmm_src);
+ }
+ else
+ {
+ xmm_dst = load_128_aligned ((__m128i *)dst);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
+ expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
+ &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+ }
+
+ src += 4;
+ dst += 4;
+ mask += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ uint32_t sa;
+
+ s = *src++;
+ m = (*mask++) >> 24;
+ d = *dst;
+
+ sa = s >> 24;
+
+ if (m)
+ {
+ if (sa == 0xff && m == 0xff)
+ {
+ *dst = s;
+ }
+ else
+ {
+ __m128i ms, md, ma, msa;
+
+ ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+ ms = unpack_32_1x128 (s);
+ md = unpack_32_1x128 (d);
+
+ msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+ *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+ }
+ }
+
+ dst++;
+ w--;
+ }
+ }
+
+}
+
+/* A variant of 'sse2_combine_over_u' with minor tweaks */
+static force_inline void
+scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
+ const uint32_t* ps,
+ int32_t w,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx,
+ pixman_bool_t fully_transparent_src)
+{
+ uint32_t s, d;
+ const uint32_t* pm = NULL;
+
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+ if (fully_transparent_src)
+ return;
+
+ /* Align dst on a 16-byte boundary */
+ while (w && ((unsigned long)pd & 15))
+ {
+ d = *pd;
+ s = combine1 (ps + (vx >> 16), pm);
+ vx += unit_x;
+
+ *pd++ = core_combine_over_u_pixel_sse2 (s, d);
+ if (pm)
+ pm++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ __m128i tmp;
+ uint32_t tmp1, tmp2, tmp3, tmp4;
+
+ tmp1 = ps[vx >> 16];
+ vx += unit_x;
+ tmp2 = ps[vx >> 16];
+ vx += unit_x;
+ tmp3 = ps[vx >> 16];
+ vx += unit_x;
+ tmp4 = ps[vx >> 16];
+ vx += unit_x;
+
+ tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
+
+ xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
+
+ if (is_opaque (xmm_src_hi))
+ {
+ save_128_aligned ((__m128i*)pd, xmm_src_hi);
+ }
+ else if (!is_zero (xmm_src_hi))
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (
+ xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+
+ over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ /* rebuid the 4 pixel data and save*/
+ save_128_aligned ((__m128i*)pd,
+ pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ w -= 4;
+ pd += 4;
+ if (pm)
+ pm += 4;
+ }
+
+ while (w)
+ {
+ d = *pd;
+ s = combine1 (ps + (vx >> 16), pm);
+ vx += unit_x;
+
+ *pd++ = core_combine_over_u_pixel_sse2 (s, d);
+ if (pm)
+ pm++;
+
+ w--;
+ }
+}
+
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
+ scaled_nearest_scanline_sse2_8888_8888_OVER,
+ uint32_t, uint32_t, COVER)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
+ scaled_nearest_scanline_sse2_8888_8888_OVER,
+ uint32_t, uint32_t, NONE)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
+ scaled_nearest_scanline_sse2_8888_8888_OVER,
+ uint32_t, uint32_t, PAD)
+
+static force_inline void
+scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
+ uint32_t * dst,
+ const uint32_t * src,
+ int32_t w,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx,
+ pixman_bool_t zero_src)
+{
+ __m128i xmm_mask;
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+ if (zero_src || (*mask >> 24) == 0)
+ return;
+
+ xmm_mask = create_mask_16_128 (*mask >> 24);
+
+ while (w && (unsigned long)dst & 15)
+ {
+ uint32_t s = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+
+ if (s)
+ {
+ uint32_t d = *dst;
+
+ __m128i ms = unpack_32_1x128 (s);
+ __m128i alpha = expand_alpha_1x128 (ms);
+ __m128i dest = xmm_mask;
+ __m128i alpha_dst = unpack_32_1x128 (d);
+
+ *dst = pack_1x128_32 (
+ in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
+ }
+ dst++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ uint32_t tmp1, tmp2, tmp3, tmp4;
+
+ tmp1 = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+ tmp2 = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+ tmp3 = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+ tmp4 = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+
+ xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
+
+ if (!is_zero (xmm_src))
+ {
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_mask, &xmm_mask,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ dst += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ uint32_t s = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+
+ if (s)
+ {
+ uint32_t d = *dst;
+
+ __m128i ms = unpack_32_1x128 (s);
+ __m128i alpha = expand_alpha_1x128 (ms);
+ __m128i mask = xmm_mask;
+ __m128i dest = unpack_32_1x128 (d);
+
+ *dst = pack_1x128_32 (
+ in_over_1x128 (&ms, &alpha, &mask, &dest));
+ }
+
+ dst++;
+ w--;
+ }
+
+}
+
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
+ scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+ uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
+ scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+ uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
+ scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+ uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
+
+static void
+bilinear_interpolate_line_sse2 (uint32_t * out,
+ const uint32_t * top,
+ const uint32_t * bottom,
+ int wt,
+ int wb,
+ pixman_fixed_t x,
+ pixman_fixed_t ux,
+ int width)
+{
+ const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);
+ const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);
+ const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);
+ const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);
+ const __m128i xmm_ux = _mm_set_epi16 (ux, ux, ux, ux, ux, ux, ux, ux);
+ const __m128i xmm_zero = _mm_setzero_si128 ();
+ __m128i xmm_x = _mm_set_epi16 (x, x, x, x, x, x, x, x);
+ uint32_t pix1, pix2, pix3, pix4;
+
+ #define INTERPOLATE_ONE_PIXEL(pix) \
+ do { \
+ __m128i xmm_wh, xmm_lo, xmm_hi, a; \
+ /* fetch 2x2 pixel block into sse2 register */ \
+ uint32_t tl = top [pixman_fixed_to_int (x)]; \
+ uint32_t tr = top [pixman_fixed_to_int (x) + 1]; \
+ uint32_t bl = bottom [pixman_fixed_to_int (x)]; \
+ uint32_t br = bottom [pixman_fixed_to_int (x) + 1]; \
+ a = _mm_set_epi32 (tr, tl, br, bl); \
+ x += ux; \
+ /* vertical interpolation */ \
+ a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero), \
+ xmm_wt), \
+ _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero), \
+ xmm_wb)); \
+ /* calculate horizontal weights */ \
+ xmm_wh = _mm_add_epi16 (xmm_addc, \
+ _mm_xor_si128 (xmm_xorc, \
+ _mm_srli_epi16 (xmm_x, 8))); \
+ xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
+ /* horizontal interpolation */ \
+ xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \
+ xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \
+ a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \
+ _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \
+ /* shift and pack the result */ \
+ a = _mm_srli_epi32 (a, 16); \
+ a = _mm_packs_epi32 (a, a); \
+ a = _mm_packus_epi16 (a, a); \
+ pix = _mm_cvtsi128_si32 (a); \
+ } while (0)
+
+ while ((width -= 4) >= 0)
+ {
+ INTERPOLATE_ONE_PIXEL (pix1);
+ INTERPOLATE_ONE_PIXEL (pix2);
+ INTERPOLATE_ONE_PIXEL (pix3);
+ INTERPOLATE_ONE_PIXEL (pix4);
+ *out++ = pix1;
+ *out++ = pix2;
+ *out++ = pix3;
+ *out++ = pix4;
+ }
+ if (width & 2)
+ {
+ INTERPOLATE_ONE_PIXEL (pix1);
+ INTERPOLATE_ONE_PIXEL (pix2);
+ *out++ = pix1;
+ *out++ = pix2;
+ }
+ if (width & 1)
+ {
+ INTERPOLATE_ONE_PIXEL (pix1);
+ *out = pix1;
+ }
+
+ #undef INTERPOLATE_ONE_PIXEL
+}
+
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst,
+ const uint32_t * mask,
+ const uint32_t * src_top,
+ const uint32_t * src_bottom,
+ int32_t w,
+ int wt,
+ int wb,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx,
+ pixman_bool_t zero_src)
+{
+ bilinear_interpolate_line_sse2 (dst, src_top, src_bottom,
+ wt, wb, vx, unit_x, w);
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
+ scaled_bilinear_scanline_sse2_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ COVER, FALSE, FALSE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
+ scaled_bilinear_scanline_sse2_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ PAD, FALSE, FALSE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
+ scaled_bilinear_scanline_sse2_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ NONE, FALSE, FALSE)
+
+static const pixman_fast_path_t sse2_fast_paths[] =
+{
+ /* PIXMAN_OP_OVER */
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
+ PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
+ PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
+ PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
+ PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
+ PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
+ PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
+ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+
+ /* PIXMAN_OP_OVER_REVERSE */
+ PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
+
+ /* PIXMAN_OP_ADD */
+ PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
+ PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
+
+ /* PIXMAN_OP_SRC */
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
+
+ /* PIXMAN_OP_IN */
+ PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
+ PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
+ PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
+
+ SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
+
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
+
+ { PIXMAN_OP_NONE },
+};
+
+static pixman_bool_t
+sse2_blt (pixman_implementation_t *imp,
+ uint32_t * src_bits,
+ uint32_t * dst_bits,
+ int src_stride,
+ int dst_stride,
+ int src_bpp,
+ int dst_bpp,
+ int src_x,
+ int src_y,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height)
+{
+ if (!pixman_blt_sse2 (
+ src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+ src_x, src_y, dst_x, dst_y, width, height))
+
+ {
+ return _pixman_implementation_blt (
+ imp->delegate,
+ src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+ src_x, src_y, dst_x, dst_y, width, height);
+ }
+
+ return TRUE;
+}
+
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+static pixman_bool_t
+sse2_fill (pixman_implementation_t *imp,
+ uint32_t * bits,
+ int stride,
+ int bpp,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t xor)
+{
+ if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
+ {
+ return _pixman_implementation_fill (
+ imp->delegate, bits, stride, bpp, x, y, width, height, xor);
+ }
+
+ return TRUE;
+}
+
+static uint32_t *
+sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+ int w = iter->width;
+ __m128i ff000000 = mask_ff000000;
+ uint32_t *dst = iter->buffer;
+ uint32_t *src = (uint32_t *)iter->bits;
+
+ iter->bits += iter->stride;
+
+ while (w && ((unsigned long)dst) & 0x0f)
+ {
+ *dst++ = (*src++) | 0xff000000;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ save_128_aligned (
+ (__m128i *)dst, _mm_or_si128 (
+ load_128_unaligned ((__m128i *)src), ff000000));
+
+ dst += 4;
+ src += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ *dst++ = (*src++) | 0xff000000;
+ w--;
+ }
+
+ return iter->buffer;
+}
+
+static uint32_t *
+sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
+{
+ int w = iter->width;
+ uint32_t *dst = iter->buffer;
+ uint16_t *src = (uint16_t *)iter->bits;
+ __m128i ff000000 = mask_ff000000;
+
+ iter->bits += iter->stride;
+
+ while (w && ((unsigned long)dst) & 0x0f)
+ {
+ uint16_t s = *src++;
+
+ *dst++ = CONVERT_0565_TO_8888 (s);
+ w--;
+ }
+
+ while (w >= 8)
+ {
+ __m128i lo, hi, s;
+
+ s = _mm_loadu_si128 ((__m128i *)src);
+
+ lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
+ hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
+
+ save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
+ save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
+
+ dst += 8;
+ src += 8;
+ w -= 8;
+ }
+
+ while (w)
+ {
+ uint16_t s = *src++;
+
+ *dst++ = CONVERT_0565_TO_8888 (s);
+ w--;
+ }
+
+ return iter->buffer;
+}
+
+static uint32_t *
+sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+ int w = iter->width;
+ uint32_t *dst = iter->buffer;
+ uint8_t *src = iter->bits;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
+
+ iter->bits += iter->stride;
+
+ while (w && (((unsigned long)dst) & 15))
+ {
+ *dst++ = *(src++) << 24;
+ w--;
+ }
+
+ while (w >= 16)
+ {
+ xmm0 = _mm_loadu_si128((__m128i *)src);
+
+ xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128(), xmm0);
+ xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128(), xmm0);
+ xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
+ xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
+ xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
+ xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
+
+ _mm_store_si128(((__m128i *)(dst + 0)), xmm3);
+ _mm_store_si128(((__m128i *)(dst + 4)), xmm4);
+ _mm_store_si128(((__m128i *)(dst + 8)), xmm5);
+ _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
+
+ dst += 16;
+ src += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ *dst++ = *(src++) << 24;
+ w--;
+ }
+
+ return iter->buffer;
+}
+
+typedef struct
+{
+ pixman_format_code_t format;
+ pixman_iter_get_scanline_t get_scanline;
+} fetcher_info_t;
+
+static const fetcher_info_t fetchers[] =
+{
+ { PIXMAN_x8r8g8b8, sse2_fetch_x8r8g8b8 },
+ { PIXMAN_r5g6b5, sse2_fetch_r5g6b5 },
+ { PIXMAN_a8, sse2_fetch_a8 },
+ { PIXMAN_null }
+};
+
+static void
+sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{
+ pixman_image_t *image = iter->image;
+ int x = iter->x;
+ int y = iter->y;
+ int width = iter->width;
+ int height = iter->height;
+
+#define FLAGS \
+ (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
+
+ if ((iter->flags & ITER_NARROW) &&
+ (image->common.flags & FLAGS) == FLAGS &&
+ x >= 0 && y >= 0 &&
+ x + width <= image->bits.width &&
+ y + height <= image->bits.height)
+ {
+ const fetcher_info_t *f;
+
+ for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
+ {
+ if (image->common.extended_format_code == f->format)
+ {
+ uint8_t *b = (uint8_t *)image->bits.bits;
+ int s = image->bits.rowstride * 4;
+
+ iter->bits = b + s * iter->y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
+ iter->stride = s;
+
+ iter->get_scanline = f->get_scanline;
+ return;
+ }
+ }
+ }
+
+ imp->delegate->src_iter_init (imp->delegate, iter);
+}
+
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+pixman_implementation_t *
+_pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
+{
+ pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
+
+ /* SSE2 constants */
+ mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
+ mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
+ mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
+ mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
+ mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
+ mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
+ mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
+ mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
+ mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
+ mask_0080 = create_mask_16_128 (0x0080);
+ mask_00ff = create_mask_16_128 (0x00ff);
+ mask_0101 = create_mask_16_128 (0x0101);
+ mask_ffff = create_mask_16_128 (0xffff);
+ mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
+ mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
+
+ /* Set up function pointers */
+ imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
+ imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
+ imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
+ imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
+ imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
+ imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
+ imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
+ imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
+ imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
+ imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
+
+ imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
+
+ imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
+ imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
+ imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
+ imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
+ imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
+ imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
+ imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
+
+ imp->blt = sse2_blt;
+ imp->fill = sse2_fill;
+
+ imp->src_iter_init = sse2_src_iter_init;
+
+ return imp;
+}