From eca5dee9e7a8dea1edba4d10b60444ac0e884139 Mon Sep 17 00:00:00 2001 From: marha Date: Sun, 20 Mar 2011 16:32:44 +0000 Subject: xserver libX11 libxcb pixman mesa git update 20 Mar 2011 --- pixman/pixman/pixman-bits-image.c | 3210 ++++---- pixman/pixman/pixman-conical-gradient.c | 425 +- pixman/pixman/pixman-general.c | 586 +- pixman/pixman/pixman-implementation.c | 610 +- pixman/pixman/pixman-linear-gradient.c | 578 +- pixman/pixman/pixman-private.h | 51 +- pixman/pixman/pixman-radial-gradient.c | 923 ++- pixman/pixman/pixman-solid-fill.c | 181 +- pixman/pixman/pixman-sse2.c | 12153 +++++++++++++++--------------- 9 files changed, 9319 insertions(+), 9398 deletions(-) (limited to 'pixman') diff --git a/pixman/pixman/pixman-bits-image.c b/pixman/pixman/pixman-bits-image.c index a865d719a..88c2f0eea 100644 --- a/pixman/pixman/pixman-bits-image.c +++ b/pixman/pixman/pixman-bits-image.c @@ -1,1608 +1,1602 @@ -/* - * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. - * 2005 Lars Knoll & Zack Rusin, Trolltech - * 2008 Aaron Plattner, NVIDIA Corporation - * Copyright © 2000 SuSE, Inc. - * Copyright © 2007, 2009 Red Hat, Inc. - * Copyright © 2008 André Tupinambá - * - * Permission to use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that - * copyright notice and this permission notice appear in supporting - * documentation, and that the name of Keith Packard not be used in - * advertising or publicity pertaining to distribution of the software without - * specific, written prior permission. Keith Packard makes no - * representations about the suitability of this software for any purpose. It - * is provided "as is" without express or implied warranty. 
- * - * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS - * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY - * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN - * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING - * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS - * SOFTWARE. - */ - -#ifdef HAVE_CONFIG_H -#include -#endif -#include -#include -#include -#include "pixman-private.h" -#include "pixman-combine32.h" - -/* - * By default, just evaluate the image at 32bpp and expand. Individual image - * types can plug in a better scanline getter if they want to. For example - * we could produce smoother gradients by evaluating them at higher color - * depth, but that's a project for the future. - */ -static void -_pixman_image_get_scanline_generic_64 (pixman_image_t * image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t * mask) -{ - uint32_t *mask8 = NULL; - - /* Contract the mask image, if one exists, so that the 32-bit fetch - * function can use it. - */ - if (mask) - { - mask8 = pixman_malloc_ab (width, sizeof(uint32_t)); - if (!mask8) - return; - - pixman_contract (mask8, (uint64_t *)mask, width); - } - - /* Fetch the source image into the first half of buffer. */ - image->bits.get_scanline_32 (image, x, y, width, (uint32_t*)buffer, mask8); - - /* Expand from 32bpp to 64bpp in place. 
*/ - pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, width); - - free (mask8); -} - -/* Fetch functions */ - -static force_inline uint32_t -fetch_pixel_no_alpha (bits_image_t *image, - int x, int y, pixman_bool_t check_bounds) -{ - if (check_bounds && - (x < 0 || x >= image->width || y < 0 || y >= image->height)) - { - return 0; - } - - return image->fetch_pixel_32 (image, x, y); -} - -typedef uint32_t (* get_pixel_t) (bits_image_t *image, - int x, int y, pixman_bool_t check_bounds); - -static force_inline void -repeat (pixman_repeat_t repeat, int size, int *coord) -{ - switch (repeat) - { - case PIXMAN_REPEAT_NORMAL: - *coord = MOD (*coord, size); - break; - - case PIXMAN_REPEAT_PAD: - *coord = CLIP (*coord, 0, size - 1); - break; - - case PIXMAN_REPEAT_REFLECT: - *coord = MOD (*coord, size * 2); - - if (*coord >= size) - *coord = size * 2 - *coord - 1; - break; - - case PIXMAN_REPEAT_NONE: - break; - - default: - break; - } -} - -static force_inline uint32_t -bits_image_fetch_pixel_nearest (bits_image_t *image, - pixman_fixed_t x, - pixman_fixed_t y, - get_pixel_t get_pixel) -{ - int x0 = pixman_fixed_to_int (x - pixman_fixed_e); - int y0 = pixman_fixed_to_int (y - pixman_fixed_e); - - if (image->common.repeat != PIXMAN_REPEAT_NONE) - { - repeat (image->common.repeat, image->width, &x0); - repeat (image->common.repeat, image->height, &y0); - - return get_pixel (image, x0, y0, FALSE); - } - else - { - return get_pixel (image, x0, y0, TRUE); - } -} - -#if SIZEOF_LONG > 4 - -static force_inline uint32_t -bilinear_interpolation (uint32_t tl, uint32_t tr, - uint32_t bl, uint32_t br, - int distx, int disty) -{ - uint64_t distxy, distxiy, distixy, distixiy; - uint64_t tl64, tr64, bl64, br64; - uint64_t f, r; - - distxy = distx * disty; - distxiy = distx * (256 - disty); - distixy = (256 - distx) * disty; - distixiy = (256 - distx) * (256 - disty); - - /* Alpha and Blue */ - tl64 = tl & 0xff0000ff; - tr64 = tr & 0xff0000ff; - bl64 = bl & 0xff0000ff; - br64 = 
br & 0xff0000ff; - - f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy; - r = f & 0x0000ff0000ff0000ull; - - /* Red and Green */ - tl64 = tl; - tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull); - - tr64 = tr; - tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull); - - bl64 = bl; - bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull); - - br64 = br; - br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull); - - f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy; - r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull); - - return (uint32_t)(r >> 16); -} - -#else - -static force_inline uint32_t -bilinear_interpolation (uint32_t tl, uint32_t tr, - uint32_t bl, uint32_t br, - int distx, int disty) -{ - int distxy, distxiy, distixy, distixiy; - uint32_t f, r; - - distxy = distx * disty; - distxiy = (distx << 8) - distxy; /* distx * (256 - disty) */ - distixy = (disty << 8) - distxy; /* disty * (256 - distx) */ - distixiy = - 256 * 256 - (disty << 8) - - (distx << 8) + distxy; /* (256 - distx) * (256 - disty) */ - - /* Blue */ - r = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy - + (bl & 0x000000ff) * distixy + (br & 0x000000ff) * distxy; - - /* Green */ - f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy - + (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy; - r |= f & 0xff000000; - - tl >>= 16; - tr >>= 16; - bl >>= 16; - br >>= 16; - r >>= 16; - - /* Red */ - f = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy - + (bl & 0x000000ff) * distixy + (br & 0x000000ff) * distxy; - r |= f & 0x00ff0000; - - /* Alpha */ - f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy - + (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy; - r |= f & 0xff000000; - - return r; -} - -#endif - -static force_inline uint32_t -bits_image_fetch_pixel_bilinear (bits_image_t *image, - pixman_fixed_t x, - pixman_fixed_t y, 
- get_pixel_t get_pixel) -{ - pixman_repeat_t repeat_mode = image->common.repeat; - int width = image->width; - int height = image->height; - int x1, y1, x2, y2; - uint32_t tl, tr, bl, br; - int32_t distx, disty; - - x1 = x - pixman_fixed_1 / 2; - y1 = y - pixman_fixed_1 / 2; - - distx = (x1 >> 8) & 0xff; - disty = (y1 >> 8) & 0xff; - - x1 = pixman_fixed_to_int (x1); - y1 = pixman_fixed_to_int (y1); - x2 = x1 + 1; - y2 = y1 + 1; - - if (repeat_mode != PIXMAN_REPEAT_NONE) - { - repeat (repeat_mode, width, &x1); - repeat (repeat_mode, height, &y1); - repeat (repeat_mode, width, &x2); - repeat (repeat_mode, height, &y2); - - tl = get_pixel (image, x1, y1, FALSE); - bl = get_pixel (image, x1, y2, FALSE); - tr = get_pixel (image, x2, y1, FALSE); - br = get_pixel (image, x2, y2, FALSE); - } - else - { - tl = get_pixel (image, x1, y1, TRUE); - tr = get_pixel (image, x2, y1, TRUE); - bl = get_pixel (image, x1, y2, TRUE); - br = get_pixel (image, x2, y2, TRUE); - } - - return bilinear_interpolation (tl, tr, bl, br, distx, disty); -} - -static void -bits_image_fetch_bilinear_no_repeat_8888 (pixman_image_t * ima, - int offset, - int line, - int width, - uint32_t * buffer, - const uint32_t * mask) -{ - bits_image_t *bits = &ima->bits; - pixman_fixed_t x_top, x_bottom, x; - pixman_fixed_t ux_top, ux_bottom, ux; - pixman_vector_t v; - uint32_t top_mask, bottom_mask; - uint32_t *top_row; - uint32_t *bottom_row; - uint32_t *end; - uint32_t zero[2] = { 0, 0 }; - uint32_t one = 1; - int y, y1, y2; - int disty; - int mask_inc; - int w; - - /* reference point is the center of the pixel */ - v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2; - v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2; - v.vector[2] = pixman_fixed_1; - - if (!pixman_transform_point_3d (bits->common.transform, &v)) - return; - - ux = ux_top = ux_bottom = bits->common.transform->matrix[0][0]; - x = x_top = x_bottom = v.vector[0] - pixman_fixed_1/2; - - y = v.vector[1] - pixman_fixed_1/2; 
- disty = (y >> 8) & 0xff; - - /* Load the pointers to the first and second lines from the source - * image that bilinear code must read. - * - * The main trick in this code is about the check if any line are - * outside of the image; - * - * When I realize that a line (any one) is outside, I change - * the pointer to a dummy area with zeros. Once I change this, I - * must be sure the pointer will not change, so I set the - * variables to each pointer increments inside the loop. - */ - y1 = pixman_fixed_to_int (y); - y2 = y1 + 1; - - if (y1 < 0 || y1 >= bits->height) - { - top_row = zero; - x_top = 0; - ux_top = 0; - } - else - { - top_row = bits->bits + y1 * bits->rowstride; - x_top = x; - ux_top = ux; - } - - if (y2 < 0 || y2 >= bits->height) - { - bottom_row = zero; - x_bottom = 0; - ux_bottom = 0; - } - else - { - bottom_row = bits->bits + y2 * bits->rowstride; - x_bottom = x; - ux_bottom = ux; - } - - /* Instead of checking whether the operation uses the mast in - * each loop iteration, verify this only once and prepare the - * variables to make the code smaller inside the loop. 
- */ - if (!mask) - { - mask_inc = 0; - mask = &one; - } - else - { - /* If have a mask, prepare the variables to check it */ - mask_inc = 1; - } - - /* If both are zero, then the whole thing is zero */ - if (top_row == zero && bottom_row == zero) - { - memset (buffer, 0, width * sizeof (uint32_t)); - return; - } - else if (bits->format == PIXMAN_x8r8g8b8) - { - if (top_row == zero) - { - top_mask = 0; - bottom_mask = 0xff000000; - } - else if (bottom_row == zero) - { - top_mask = 0xff000000; - bottom_mask = 0; - } - else - { - top_mask = 0xff000000; - bottom_mask = 0xff000000; - } - } - else - { - top_mask = 0; - bottom_mask = 0; - } - - end = buffer + width; - - /* Zero fill to the left of the image */ - while (buffer < end && x < pixman_fixed_minus_1) - { - *buffer++ = 0; - x += ux; - x_top += ux_top; - x_bottom += ux_bottom; - mask += mask_inc; - } - - /* Left edge - */ - while (buffer < end && x < 0) - { - uint32_t tr, br; - int32_t distx; - - tr = top_row[pixman_fixed_to_int (x_top) + 1] | top_mask; - br = bottom_row[pixman_fixed_to_int (x_bottom) + 1] | bottom_mask; - - distx = (x >> 8) & 0xff; - - *buffer++ = bilinear_interpolation (0, tr, 0, br, distx, disty); - - x += ux; - x_top += ux_top; - x_bottom += ux_bottom; - mask += mask_inc; - } - - /* Main part */ - w = pixman_int_to_fixed (bits->width - 1); - - while (buffer < end && x < w) - { - if (*mask) - { - uint32_t tl, tr, bl, br; - int32_t distx; - - tl = top_row [pixman_fixed_to_int (x_top)] | top_mask; - tr = top_row [pixman_fixed_to_int (x_top) + 1] | top_mask; - bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask; - br = bottom_row [pixman_fixed_to_int (x_bottom) + 1] | bottom_mask; - - distx = (x >> 8) & 0xff; - - *buffer = bilinear_interpolation (tl, tr, bl, br, distx, disty); - } - - buffer++; - x += ux; - x_top += ux_top; - x_bottom += ux_bottom; - mask += mask_inc; - } - - /* Right Edge */ - w = pixman_int_to_fixed (bits->width); - while (buffer < end && x < w) - { - if (*mask) - { 
- uint32_t tl, bl; - int32_t distx; - - tl = top_row [pixman_fixed_to_int (x_top)] | top_mask; - bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask; - - distx = (x >> 8) & 0xff; - - *buffer = bilinear_interpolation (tl, 0, bl, 0, distx, disty); - } - - buffer++; - x += ux; - x_top += ux_top; - x_bottom += ux_bottom; - mask += mask_inc; - } - - /* Zero fill to the left of the image */ - while (buffer < end) - *buffer++ = 0; -} - -static force_inline uint32_t -bits_image_fetch_pixel_convolution (bits_image_t *image, - pixman_fixed_t x, - pixman_fixed_t y, - get_pixel_t get_pixel) -{ - pixman_fixed_t *params = image->common.filter_params; - int x_off = (params[0] - pixman_fixed_1) >> 1; - int y_off = (params[1] - pixman_fixed_1) >> 1; - int32_t cwidth = pixman_fixed_to_int (params[0]); - int32_t cheight = pixman_fixed_to_int (params[1]); - int32_t srtot, sgtot, sbtot, satot; - int32_t i, j, x1, x2, y1, y2; - pixman_repeat_t repeat_mode = image->common.repeat; - int width = image->width; - int height = image->height; - - params += 2; - - x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off); - y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off); - x2 = x1 + cwidth; - y2 = y1 + cheight; - - srtot = sgtot = sbtot = satot = 0; - - for (i = y1; i < y2; ++i) - { - for (j = x1; j < x2; ++j) - { - int rx = j; - int ry = i; - - pixman_fixed_t f = *params; - - if (f) - { - uint32_t pixel; - - if (repeat_mode != PIXMAN_REPEAT_NONE) - { - repeat (repeat_mode, width, &rx); - repeat (repeat_mode, height, &ry); - - pixel = get_pixel (image, rx, ry, FALSE); - } - else - { - pixel = get_pixel (image, rx, ry, TRUE); - } - - srtot += RED_8 (pixel) * f; - sgtot += GREEN_8 (pixel) * f; - sbtot += BLUE_8 (pixel) * f; - satot += ALPHA_8 (pixel) * f; - } - - params++; - } - } - - satot >>= 16; - srtot >>= 16; - sgtot >>= 16; - sbtot >>= 16; - - satot = CLIP (satot, 0, 0xff); - srtot = CLIP (srtot, 0, 0xff); - sgtot = CLIP (sgtot, 0, 0xff); - sbtot = CLIP (sbtot, 0, 0xff); - - 
return ((satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot)); -} - -static force_inline uint32_t -bits_image_fetch_pixel_filtered (bits_image_t *image, - pixman_fixed_t x, - pixman_fixed_t y, - get_pixel_t get_pixel) -{ - switch (image->common.filter) - { - case PIXMAN_FILTER_NEAREST: - case PIXMAN_FILTER_FAST: - return bits_image_fetch_pixel_nearest (image, x, y, get_pixel); - break; - - case PIXMAN_FILTER_BILINEAR: - case PIXMAN_FILTER_GOOD: - case PIXMAN_FILTER_BEST: - return bits_image_fetch_pixel_bilinear (image, x, y, get_pixel); - break; - - case PIXMAN_FILTER_CONVOLUTION: - return bits_image_fetch_pixel_convolution (image, x, y, get_pixel); - break; - - default: - break; - } - - return 0; -} - -static void -bits_image_fetch_affine_no_alpha (pixman_image_t * image, - int offset, - int line, - int width, - uint32_t * buffer, - const uint32_t * mask) -{ - pixman_fixed_t x, y; - pixman_fixed_t ux, uy; - pixman_vector_t v; - int i; - - /* reference point is the center of the pixel */ - v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2; - v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2; - v.vector[2] = pixman_fixed_1; - - if (image->common.transform) - { - if (!pixman_transform_point_3d (image->common.transform, &v)) - return; - - ux = image->common.transform->matrix[0][0]; - uy = image->common.transform->matrix[1][0]; - } - else - { - ux = pixman_fixed_1; - uy = 0; - } - - x = v.vector[0]; - y = v.vector[1]; - - for (i = 0; i < width; ++i) - { - if (!mask || mask[i]) - { - buffer[i] = bits_image_fetch_pixel_filtered ( - &image->bits, x, y, fetch_pixel_no_alpha); - } - - x += ux; - y += uy; - } -} - -/* General fetcher */ -static force_inline uint32_t -fetch_pixel_general (bits_image_t *image, int x, int y, pixman_bool_t check_bounds) -{ - uint32_t pixel; - - if (check_bounds && - (x < 0 || x >= image->width || y < 0 || y >= image->height)) - { - return 0; - } - - pixel = image->fetch_pixel_32 (image, x, y); - - if 
(image->common.alpha_map) - { - uint32_t pixel_a; - - x -= image->common.alpha_origin_x; - y -= image->common.alpha_origin_y; - - if (x < 0 || x >= image->common.alpha_map->width || - y < 0 || y >= image->common.alpha_map->height) - { - pixel_a = 0; - } - else - { - pixel_a = image->common.alpha_map->fetch_pixel_32 ( - image->common.alpha_map, x, y); - - pixel_a = ALPHA_8 (pixel_a); - } - - pixel &= 0x00ffffff; - pixel |= (pixel_a << 24); - } - - return pixel; -} - -static void -bits_image_fetch_general (pixman_image_t * image, - int offset, - int line, - int width, - uint32_t * buffer, - const uint32_t * mask) -{ - pixman_fixed_t x, y, w; - pixman_fixed_t ux, uy, uw; - pixman_vector_t v; - int i; - - /* reference point is the center of the pixel */ - v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2; - v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2; - v.vector[2] = pixman_fixed_1; - - if (image->common.transform) - { - if (!pixman_transform_point_3d (image->common.transform, &v)) - return; - - ux = image->common.transform->matrix[0][0]; - uy = image->common.transform->matrix[1][0]; - uw = image->common.transform->matrix[2][0]; - } - else - { - ux = pixman_fixed_1; - uy = 0; - uw = 0; - } - - x = v.vector[0]; - y = v.vector[1]; - w = v.vector[2]; - - for (i = 0; i < width; ++i) - { - pixman_fixed_t x0, y0; - - if (!mask || mask[i]) - { - if (w != 0) - { - x0 = ((pixman_fixed_48_16_t)x << 16) / w; - y0 = ((pixman_fixed_48_16_t)y << 16) / w; - } - else - { - x0 = 0; - y0 = 0; - } - - buffer[i] = bits_image_fetch_pixel_filtered ( - &image->bits, x0, y0, fetch_pixel_general); - } - - x += ux; - y += uy; - w += uw; - } -} - -static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; - -typedef uint32_t (* convert_pixel_t) (const uint8_t *row, int x); - -static force_inline void -bits_image_fetch_bilinear_affine (pixman_image_t * image, - int offset, - int line, - int width, - uint32_t * buffer, - const uint32_t * mask, - - convert_pixel_t 
convert_pixel, - pixman_format_code_t format, - pixman_repeat_t repeat_mode) -{ - pixman_fixed_t x, y; - pixman_fixed_t ux, uy; - pixman_vector_t v; - bits_image_t *bits = &image->bits; - int i; - - /* reference point is the center of the pixel */ - v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2; - v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2; - v.vector[2] = pixman_fixed_1; - - if (!pixman_transform_point_3d (image->common.transform, &v)) - return; - - ux = image->common.transform->matrix[0][0]; - uy = image->common.transform->matrix[1][0]; - - x = v.vector[0]; - y = v.vector[1]; - - for (i = 0; i < width; ++i) - { - int x1, y1, x2, y2; - uint32_t tl, tr, bl, br; - int32_t distx, disty; - int width = image->bits.width; - int height = image->bits.height; - const uint8_t *row1; - const uint8_t *row2; - - if (mask && !mask[i]) - goto next; - - x1 = x - pixman_fixed_1 / 2; - y1 = y - pixman_fixed_1 / 2; - - distx = (x1 >> 8) & 0xff; - disty = (y1 >> 8) & 0xff; - - y1 = pixman_fixed_to_int (y1); - y2 = y1 + 1; - x1 = pixman_fixed_to_int (x1); - x2 = x1 + 1; - - if (repeat_mode != PIXMAN_REPEAT_NONE) - { - uint32_t mask; - - mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000; - - repeat (repeat_mode, width, &x1); - repeat (repeat_mode, height, &y1); - repeat (repeat_mode, width, &x2); - repeat (repeat_mode, height, &y2); - - row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1; - row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2; - - tl = convert_pixel (row1, x1) | mask; - tr = convert_pixel (row1, x2) | mask; - bl = convert_pixel (row2, x1) | mask; - br = convert_pixel (row2, x2) | mask; - } - else - { - uint32_t mask1, mask2; - int bpp; - - /* Note: PIXMAN_FORMAT_BPP() returns an unsigned value, - * which means if you use it in expressions, those - * expressions become unsigned themselves. Since - * the variables below can be negative in some cases, - * that will lead to crashes on 64 bit architectures. 
- * - * So this line makes sure bpp is signed - */ - bpp = PIXMAN_FORMAT_BPP (format); - - if (x1 >= width || x2 < 0 || y1 >= height || y2 < 0) - { - buffer[i] = 0; - goto next; - } - - if (y2 == 0) - { - row1 = zero; - mask1 = 0; - } - else - { - row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1; - row1 += bpp / 8 * x1; - - mask1 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000; - } - - if (y1 == height - 1) - { - row2 = zero; - mask2 = 0; - } - else - { - row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2; - row2 += bpp / 8 * x1; - - mask2 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000; - } - - if (x2 == 0) - { - tl = 0; - bl = 0; - } - else - { - tl = convert_pixel (row1, 0) | mask1; - bl = convert_pixel (row2, 0) | mask2; - } - - if (x1 == width - 1) - { - tr = 0; - br = 0; - } - else - { - tr = convert_pixel (row1, 1) | mask1; - br = convert_pixel (row2, 1) | mask2; - } - } - - buffer[i] = bilinear_interpolation ( - tl, tr, bl, br, distx, disty); - - next: - x += ux; - y += uy; - } -} - -static force_inline void -bits_image_fetch_nearest_affine (pixman_image_t * image, - int offset, - int line, - int width, - uint32_t * buffer, - const uint32_t * mask, - - convert_pixel_t convert_pixel, - pixman_format_code_t format, - pixman_repeat_t repeat_mode) -{ - pixman_fixed_t x, y; - pixman_fixed_t ux, uy; - pixman_vector_t v; - bits_image_t *bits = &image->bits; - int i; - - /* reference point is the center of the pixel */ - v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2; - v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2; - v.vector[2] = pixman_fixed_1; - - if (!pixman_transform_point_3d (image->common.transform, &v)) - return; - - ux = image->common.transform->matrix[0][0]; - uy = image->common.transform->matrix[1][0]; - - x = v.vector[0]; - y = v.vector[1]; - - for (i = 0; i < width; ++i) - { - int width, height, x0, y0; - const uint8_t *row; - - if (mask && !mask[i]) - goto next; - - width = image->bits.width; - height = 
image->bits.height; - x0 = pixman_fixed_to_int (x - pixman_fixed_e); - y0 = pixman_fixed_to_int (y - pixman_fixed_e); - - if (repeat_mode == PIXMAN_REPEAT_NONE && - (y0 < 0 || y0 >= height || x0 < 0 || x0 >= width)) - { - buffer[i] = 0; - } - else - { - uint32_t mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000; - - if (repeat_mode != PIXMAN_REPEAT_NONE) - { - repeat (repeat_mode, width, &x0); - repeat (repeat_mode, height, &y0); - } - - row = (uint8_t *)bits->bits + bits->rowstride * 4 * y0; - - buffer[i] = convert_pixel (row, x0) | mask; - } - - next: - x += ux; - y += uy; - } -} - -static force_inline uint32_t -convert_a8r8g8b8 (const uint8_t *row, int x) -{ - return *(((uint32_t *)row) + x); -} - -static force_inline uint32_t -convert_x8r8g8b8 (const uint8_t *row, int x) -{ - return *(((uint32_t *)row) + x); -} - -static force_inline uint32_t -convert_a8 (const uint8_t *row, int x) -{ - return *(row + x) << 24; -} - -static force_inline uint32_t -convert_r5g6b5 (const uint8_t *row, int x) -{ - return CONVERT_0565_TO_0888 (*((uint16_t *)row + x)); -} - -#define MAKE_BILINEAR_FETCHER(name, format, repeat_mode) \ - static void \ - bits_image_fetch_bilinear_affine_ ## name (pixman_image_t *image, \ - int offset, \ - int line, \ - int width, \ - uint32_t * buffer, \ - const uint32_t * mask) \ - { \ - bits_image_fetch_bilinear_affine (image, offset, line, \ - width, buffer, mask, \ - convert_ ## format, \ - PIXMAN_ ## format, \ - repeat_mode); \ - } - -#define MAKE_NEAREST_FETCHER(name, format, repeat_mode) \ - static void \ - bits_image_fetch_nearest_affine_ ## name (pixman_image_t *image, \ - int offset, \ - int line, \ - int width, \ - uint32_t * buffer, \ - const uint32_t * mask) \ - { \ - bits_image_fetch_nearest_affine (image, offset, line, \ - width, buffer, mask, \ - convert_ ## format, \ - PIXMAN_ ## format, \ - repeat_mode); \ - } - -#define MAKE_FETCHERS(name, format, repeat_mode) \ - MAKE_NEAREST_FETCHER (name, format, repeat_mode) \ - MAKE_BILINEAR_FETCHER 
(name, format, repeat_mode) - -MAKE_FETCHERS (pad_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_PAD) -MAKE_FETCHERS (none_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_NONE) -MAKE_FETCHERS (reflect_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_REFLECT) -MAKE_FETCHERS (normal_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_NORMAL) -MAKE_FETCHERS (pad_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_PAD) -MAKE_FETCHERS (none_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_NONE) -MAKE_FETCHERS (reflect_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_REFLECT) -MAKE_FETCHERS (normal_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_NORMAL) -MAKE_FETCHERS (pad_a8, a8, PIXMAN_REPEAT_PAD) -MAKE_FETCHERS (none_a8, a8, PIXMAN_REPEAT_NONE) -MAKE_FETCHERS (reflect_a8, a8, PIXMAN_REPEAT_REFLECT) -MAKE_FETCHERS (normal_a8, a8, PIXMAN_REPEAT_NORMAL) -MAKE_FETCHERS (pad_r5g6b5, r5g6b5, PIXMAN_REPEAT_PAD) -MAKE_FETCHERS (none_r5g6b5, r5g6b5, PIXMAN_REPEAT_NONE) -MAKE_FETCHERS (reflect_r5g6b5, r5g6b5, PIXMAN_REPEAT_REFLECT) -MAKE_FETCHERS (normal_r5g6b5, r5g6b5, PIXMAN_REPEAT_NORMAL) - -static void -bits_image_fetch_solid_32 (pixman_image_t * image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t * mask) -{ - uint32_t color; - uint32_t *end; - - color = image->bits.fetch_pixel_32 (&image->bits, 0, 0); - - end = buffer + width; - while (buffer < end) - *(buffer++) = color; -} - -static void -bits_image_fetch_solid_64 (pixman_image_t * image, - int x, - int y, - int width, - uint32_t * b, - const uint32_t * unused) -{ - uint64_t color; - uint64_t *buffer = (uint64_t *)b; - uint64_t *end; - - color = image->bits.fetch_pixel_64 (&image->bits, 0, 0); - - end = buffer + width; - while (buffer < end) - *(buffer++) = color; -} - -static void -bits_image_fetch_untransformed_repeat_none (bits_image_t *image, - pixman_bool_t wide, - int x, - int y, - int width, - uint32_t * buffer) -{ - uint32_t w; - - if (y < 0 || y >= image->height) - { - memset (buffer, 0, width * (wide? 8 : 4)); - return; - } - - if (x < 0) - { - w = MIN (width, -x); - - memset (buffer, 0, w * (wide ? 
8 : 4)); - - width -= w; - buffer += w * (wide? 2 : 1); - x += w; - } - - if (x < image->width) - { - w = MIN (width, image->width - x); - - if (wide) - image->fetch_scanline_64 ((pixman_image_t *)image, x, y, w, buffer, NULL); - else - image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL); - - width -= w; - buffer += w * (wide? 2 : 1); - x += w; - } - - memset (buffer, 0, width * (wide ? 8 : 4)); -} - -static void -bits_image_fetch_untransformed_repeat_normal (bits_image_t *image, - pixman_bool_t wide, - int x, - int y, - int width, - uint32_t * buffer) -{ - uint32_t w; - - while (y < 0) - y += image->height; - - while (y >= image->height) - y -= image->height; - - while (width) - { - while (x < 0) - x += image->width; - while (x >= image->width) - x -= image->width; - - w = MIN (width, image->width - x); - - if (wide) - image->fetch_scanline_64 ((pixman_image_t *)image, x, y, w, buffer, NULL); - else - image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL); - - buffer += w * (wide? 
2 : 1); - x += w; - width -= w; - } -} - -static void -bits_image_fetch_untransformed_32 (pixman_image_t * image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t * mask) -{ - if (image->common.repeat == PIXMAN_REPEAT_NONE) - { - bits_image_fetch_untransformed_repeat_none ( - &image->bits, FALSE, x, y, width, buffer); - } - else - { - bits_image_fetch_untransformed_repeat_normal ( - &image->bits, FALSE, x, y, width, buffer); - } -} - -static void -bits_image_fetch_untransformed_64 (pixman_image_t * image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t * unused) -{ - if (image->common.repeat == PIXMAN_REPEAT_NONE) - { - bits_image_fetch_untransformed_repeat_none ( - &image->bits, TRUE, x, y, width, buffer); - } - else - { - bits_image_fetch_untransformed_repeat_normal ( - &image->bits, TRUE, x, y, width, buffer); - } -} - -typedef struct -{ - pixman_format_code_t format; - uint32_t flags; - fetch_scanline_t fetch_32; - fetch_scanline_t fetch_64; -} fetcher_info_t; - -static const fetcher_info_t fetcher_info[] = -{ - { PIXMAN_solid, - FAST_PATH_NO_ALPHA_MAP, - bits_image_fetch_solid_32, - bits_image_fetch_solid_64 - }, - - { PIXMAN_any, - (FAST_PATH_NO_ALPHA_MAP | - FAST_PATH_ID_TRANSFORM | - FAST_PATH_NO_CONVOLUTION_FILTER | - FAST_PATH_NO_PAD_REPEAT | - FAST_PATH_NO_REFLECT_REPEAT), - bits_image_fetch_untransformed_32, - bits_image_fetch_untransformed_64 - }, - -#define FAST_BILINEAR_FLAGS \ - (FAST_PATH_NO_ALPHA_MAP | \ - FAST_PATH_NO_ACCESSORS | \ - FAST_PATH_HAS_TRANSFORM | \ - FAST_PATH_AFFINE_TRANSFORM | \ - FAST_PATH_X_UNIT_POSITIVE | \ - FAST_PATH_Y_UNIT_ZERO | \ - FAST_PATH_NONE_REPEAT | \ - FAST_PATH_BILINEAR_FILTER) - - { PIXMAN_a8r8g8b8, - FAST_BILINEAR_FLAGS, - bits_image_fetch_bilinear_no_repeat_8888, - _pixman_image_get_scanline_generic_64 - }, - - { PIXMAN_x8r8g8b8, - FAST_BILINEAR_FLAGS, - bits_image_fetch_bilinear_no_repeat_8888, - _pixman_image_get_scanline_generic_64 - }, - -#define GENERAL_BILINEAR_FLAGS 
\ - (FAST_PATH_NO_ALPHA_MAP | \ - FAST_PATH_NO_ACCESSORS | \ - FAST_PATH_HAS_TRANSFORM | \ - FAST_PATH_AFFINE_TRANSFORM | \ - FAST_PATH_BILINEAR_FILTER) - -#define GENERAL_NEAREST_FLAGS \ - (FAST_PATH_NO_ALPHA_MAP | \ - FAST_PATH_NO_ACCESSORS | \ - FAST_PATH_HAS_TRANSFORM | \ - FAST_PATH_AFFINE_TRANSFORM | \ - FAST_PATH_NEAREST_FILTER) - -#define BILINEAR_AFFINE_FAST_PATH(name, format, repeat) \ - { PIXMAN_ ## format, \ - GENERAL_BILINEAR_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \ - bits_image_fetch_bilinear_affine_ ## name, \ - _pixman_image_get_scanline_generic_64 \ - }, - -#define NEAREST_AFFINE_FAST_PATH(name, format, repeat) \ - { PIXMAN_ ## format, \ - GENERAL_NEAREST_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \ - bits_image_fetch_nearest_affine_ ## name, \ - _pixman_image_get_scanline_generic_64 \ - }, - -#define AFFINE_FAST_PATHS(name, format, repeat) \ - BILINEAR_AFFINE_FAST_PATH(name, format, repeat) \ - NEAREST_AFFINE_FAST_PATH(name, format, repeat) - - AFFINE_FAST_PATHS (pad_a8r8g8b8, a8r8g8b8, PAD) - AFFINE_FAST_PATHS (none_a8r8g8b8, a8r8g8b8, NONE) - AFFINE_FAST_PATHS (reflect_a8r8g8b8, a8r8g8b8, REFLECT) - AFFINE_FAST_PATHS (normal_a8r8g8b8, a8r8g8b8, NORMAL) - AFFINE_FAST_PATHS (pad_x8r8g8b8, x8r8g8b8, PAD) - AFFINE_FAST_PATHS (none_x8r8g8b8, x8r8g8b8, NONE) - AFFINE_FAST_PATHS (reflect_x8r8g8b8, x8r8g8b8, REFLECT) - AFFINE_FAST_PATHS (normal_x8r8g8b8, x8r8g8b8, NORMAL) - AFFINE_FAST_PATHS (pad_a8, a8, PAD) - AFFINE_FAST_PATHS (none_a8, a8, NONE) - AFFINE_FAST_PATHS (reflect_a8, a8, REFLECT) - AFFINE_FAST_PATHS (normal_a8, a8, NORMAL) - AFFINE_FAST_PATHS (pad_r5g6b5, r5g6b5, PAD) - AFFINE_FAST_PATHS (none_r5g6b5, r5g6b5, NONE) - AFFINE_FAST_PATHS (reflect_r5g6b5, r5g6b5, REFLECT) - AFFINE_FAST_PATHS (normal_r5g6b5, r5g6b5, NORMAL) - - /* Affine, no alpha */ - { PIXMAN_any, - (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_HAS_TRANSFORM | FAST_PATH_AFFINE_TRANSFORM), - bits_image_fetch_affine_no_alpha, - _pixman_image_get_scanline_generic_64 - }, - - /* General */ 
- { PIXMAN_any, 0, bits_image_fetch_general, _pixman_image_get_scanline_generic_64 }, - - { PIXMAN_null }, -}; - -static void -bits_image_property_changed (pixman_image_t *image) -{ - uint32_t flags = image->common.flags; - pixman_format_code_t format = image->common.extended_format_code; - const fetcher_info_t *info; - - _pixman_bits_image_setup_accessors (&image->bits); - - info = fetcher_info; - while (info->format != PIXMAN_null) - { - if ((info->format == format || info->format == PIXMAN_any) && - (info->flags & flags) == info->flags) - { - image->bits.get_scanline_32 = info->fetch_32; - image->bits.get_scanline_64 = info->fetch_64; - break; - } - - info++; - } -} - -static uint32_t * -src_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask) -{ - iter->image->bits.get_scanline_32 ( - iter->image, iter->x, iter->y++, iter->width, iter->buffer, mask); - - return iter->buffer; -} - -static uint32_t * -src_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask) -{ - iter->image->bits.get_scanline_64 ( - iter->image, iter->x, iter->y++, iter->width, iter->buffer, mask); - - return iter->buffer; -} - -void -_pixman_bits_image_src_iter_init (pixman_image_t *image, - pixman_iter_t *iter, - int x, int y, int width, int height, - uint8_t *buffer, iter_flags_t flags) -{ - if (flags & ITER_NARROW) - iter->get_scanline = src_get_scanline_narrow; - else - iter->get_scanline = src_get_scanline_wide; -} - -static uint32_t * -dest_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask) -{ - pixman_image_t *image = iter->image; - int x = iter->x; - int y = iter->y; - int width = iter->width; - uint32_t * buffer = iter->buffer; - - image->bits.fetch_scanline_32 (image, x, y, width, buffer, mask); - if (image->common.alpha_map) - { - x -= image->common.alpha_origin_x; - y -= image->common.alpha_origin_y; - - image->common.alpha_map->fetch_scanline_32 ( - (pixman_image_t *)image->common.alpha_map, - x, y, width, buffer, mask); - } - - return iter->buffer; 
-} - -static uint32_t * -dest_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask) -{ - bits_image_t * image = &iter->image->bits; - int x = iter->x; - int y = iter->y; - int width = iter->width; - uint32_t * buffer = iter->buffer; - - image->fetch_scanline_64 ( - (pixman_image_t *)image, x, y, width, buffer, mask); - if (image->common.alpha_map) - { - x -= image->common.alpha_origin_x; - y -= image->common.alpha_origin_y; - - image->common.alpha_map->fetch_scanline_64 ( - (pixman_image_t *)image->common.alpha_map, x, y, width, buffer, mask); - } - - return iter->buffer; -} - -static void -dest_write_back_narrow (pixman_iter_t *iter) -{ - bits_image_t * image = &iter->image->bits; - int x = iter->x; - int y = iter->y; - int width = iter->width; - const uint32_t *buffer = iter->buffer; - - image->store_scanline_32 (image, x, y, width, buffer); - - if (image->common.alpha_map) - { - x -= image->common.alpha_origin_x; - y -= image->common.alpha_origin_y; - - image->common.alpha_map->store_scanline_32 ( - image->common.alpha_map, x, y, width, buffer); - } - - iter->y++; -} - -static void -dest_write_back_wide (pixman_iter_t *iter) -{ - bits_image_t * image = &iter->image->bits; - int x = iter->x; - int y = iter->y; - int width = iter->width; - const uint32_t *buffer = iter->buffer; - - image->store_scanline_64 (image, x, y, width, buffer); - - if (image->common.alpha_map) - { - x -= image->common.alpha_origin_x; - y -= image->common.alpha_origin_y; - - image->common.alpha_map->store_scanline_64 ( - image->common.alpha_map, x, y, width, buffer); - } - - iter->y++; -} - -static void -dest_write_back_direct (pixman_iter_t *iter) -{ - iter->buffer += iter->image->bits.rowstride; -} - -void -_pixman_bits_image_dest_iter_init (pixman_image_t *image, - pixman_iter_t *iter, - int x, int y, int width, int height, - uint8_t *buffer, iter_flags_t flags) -{ - if (flags & ITER_NARROW) - { - if (((image->common.flags & - (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_NO_ACCESSORS)) == 
- (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_NO_ACCESSORS)) && - (image->bits.format == PIXMAN_a8r8g8b8 || - (image->bits.format == PIXMAN_x8r8g8b8 && - (flags & ITER_LOCALIZED_ALPHA)))) - { - iter->buffer = image->bits.bits + y * image->bits.rowstride + x; - - iter->get_scanline = _pixman_iter_get_scanline_noop; - iter->write_back = dest_write_back_direct; - } - else - { - if ((flags & (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) == - (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) - { - iter->get_scanline = _pixman_iter_get_scanline_noop; - } - else - { - iter->get_scanline = dest_get_scanline_narrow; - } - - iter->write_back = dest_write_back_narrow; - } - } - else - { - iter->get_scanline = dest_get_scanline_wide; - iter->write_back = dest_write_back_wide; - } -} - -static uint32_t * -create_bits (pixman_format_code_t format, - int width, - int height, - int * rowstride_bytes) -{ - int stride; - int buf_size; - int bpp; - - /* what follows is a long-winded way, avoiding any possibility of integer - * overflows, of saying: - * stride = ((width * bpp + 0x1f) >> 5) * sizeof (uint32_t); - */ - - bpp = PIXMAN_FORMAT_BPP (format); - if (pixman_multiply_overflows_int (width, bpp)) - return NULL; - - stride = width * bpp; - if (pixman_addition_overflows_int (stride, 0x1f)) - return NULL; - - stride += 0x1f; - stride >>= 5; - - stride *= sizeof (uint32_t); - - if (pixman_multiply_overflows_int (height, stride)) - return NULL; - - buf_size = height * stride; - - if (rowstride_bytes) - *rowstride_bytes = stride; - - return calloc (buf_size, 1); -} - -PIXMAN_EXPORT pixman_image_t * -pixman_image_create_bits (pixman_format_code_t format, - int width, - int height, - uint32_t * bits, - int rowstride_bytes) -{ - pixman_image_t *image; - uint32_t *free_me = NULL; - - /* must be a whole number of uint32_t's - */ - return_val_if_fail ( - bits == NULL || (rowstride_bytes % sizeof (uint32_t)) == 0, NULL); - - return_val_if_fail (PIXMAN_FORMAT_BPP (format) >= PIXMAN_FORMAT_DEPTH (format), NULL); - - if 
(!bits && width && height) - { - free_me = bits = create_bits (format, width, height, &rowstride_bytes); - if (!bits) - return NULL; - } - - image = _pixman_image_allocate (); - - if (!image) - { - if (free_me) - free (free_me); - - return NULL; - } - - image->type = BITS; - image->bits.format = format; - image->bits.width = width; - image->bits.height = height; - image->bits.bits = bits; - image->bits.free_me = free_me; - image->bits.read_func = NULL; - image->bits.write_func = NULL; - - /* The rowstride is stored in number of uint32_t */ - image->bits.rowstride = rowstride_bytes / (int) sizeof (uint32_t); - - image->bits.indexed = NULL; - - image->common.property_changed = bits_image_property_changed; - - _pixman_image_reset_clip_region (image); - - return image; -} +/* + * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. + * 2005 Lars Knoll & Zack Rusin, Trolltech + * 2008 Aaron Plattner, NVIDIA Corporation + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007, 2009 Red Hat, Inc. + * Copyright © 2008 André Tupinambá + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Keith Packard not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. Keith Packard makes no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. 
+ * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + */ + +#ifdef HAVE_CONFIG_H +#include +#endif +#include +#include +#include +#include "pixman-private.h" +#include "pixman-combine32.h" + +/* + * By default, just evaluate the image at 32bpp and expand. Individual image + * types can plug in a better scanline getter if they want to. For example + * we could produce smoother gradients by evaluating them at higher color + * depth, but that's a project for the future. + */ +static void +_pixman_image_get_scanline_generic_64 (pixman_image_t * image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t * mask) +{ + uint32_t *mask8 = NULL; + + /* Contract the mask image, if one exists, so that the 32-bit fetch + * function can use it. + */ + if (mask) + { + mask8 = pixman_malloc_ab (width, sizeof(uint32_t)); + if (!mask8) + return; + + pixman_contract (mask8, (uint64_t *)mask, width); + } + + /* Fetch the source image into the first half of buffer. */ + image->bits.get_scanline_32 (image, x, y, width, (uint32_t*)buffer, mask8); + + /* Expand from 32bpp to 64bpp in place. 
*/ + pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, width); + + free (mask8); +} + +/* Fetch functions */ + +static force_inline uint32_t +fetch_pixel_no_alpha (bits_image_t *image, + int x, int y, pixman_bool_t check_bounds) +{ + if (check_bounds && + (x < 0 || x >= image->width || y < 0 || y >= image->height)) + { + return 0; + } + + return image->fetch_pixel_32 (image, x, y); +} + +typedef uint32_t (* get_pixel_t) (bits_image_t *image, + int x, int y, pixman_bool_t check_bounds); + +static force_inline void +repeat (pixman_repeat_t repeat, int size, int *coord) +{ + switch (repeat) + { + case PIXMAN_REPEAT_NORMAL: + *coord = MOD (*coord, size); + break; + + case PIXMAN_REPEAT_PAD: + *coord = CLIP (*coord, 0, size - 1); + break; + + case PIXMAN_REPEAT_REFLECT: + *coord = MOD (*coord, size * 2); + + if (*coord >= size) + *coord = size * 2 - *coord - 1; + break; + + case PIXMAN_REPEAT_NONE: + break; + + default: + break; + } +} + +static force_inline uint32_t +bits_image_fetch_pixel_nearest (bits_image_t *image, + pixman_fixed_t x, + pixman_fixed_t y, + get_pixel_t get_pixel) +{ + int x0 = pixman_fixed_to_int (x - pixman_fixed_e); + int y0 = pixman_fixed_to_int (y - pixman_fixed_e); + + if (image->common.repeat != PIXMAN_REPEAT_NONE) + { + repeat (image->common.repeat, image->width, &x0); + repeat (image->common.repeat, image->height, &y0); + + return get_pixel (image, x0, y0, FALSE); + } + else + { + return get_pixel (image, x0, y0, TRUE); + } +} + +#if SIZEOF_LONG > 4 + +static force_inline uint32_t +bilinear_interpolation (uint32_t tl, uint32_t tr, + uint32_t bl, uint32_t br, + int distx, int disty) +{ + uint64_t distxy, distxiy, distixy, distixiy; + uint64_t tl64, tr64, bl64, br64; + uint64_t f, r; + + distxy = distx * disty; + distxiy = distx * (256 - disty); + distixy = (256 - distx) * disty; + distixiy = (256 - distx) * (256 - disty); + + /* Alpha and Blue */ + tl64 = tl & 0xff0000ff; + tr64 = tr & 0xff0000ff; + bl64 = bl & 0xff0000ff; + br64 = 
br & 0xff0000ff; + + f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy; + r = f & 0x0000ff0000ff0000ull; + + /* Red and Green */ + tl64 = tl; + tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull); + + tr64 = tr; + tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull); + + bl64 = bl; + bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull); + + br64 = br; + br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull); + + f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy; + r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull); + + return (uint32_t)(r >> 16); +} + +#else + +static force_inline uint32_t +bilinear_interpolation (uint32_t tl, uint32_t tr, + uint32_t bl, uint32_t br, + int distx, int disty) +{ + int distxy, distxiy, distixy, distixiy; + uint32_t f, r; + + distxy = distx * disty; + distxiy = (distx << 8) - distxy; /* distx * (256 - disty) */ + distixy = (disty << 8) - distxy; /* disty * (256 - distx) */ + distixiy = + 256 * 256 - (disty << 8) - + (distx << 8) + distxy; /* (256 - distx) * (256 - disty) */ + + /* Blue */ + r = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy + + (bl & 0x000000ff) * distixy + (br & 0x000000ff) * distxy; + + /* Green */ + f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy + + (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy; + r |= f & 0xff000000; + + tl >>= 16; + tr >>= 16; + bl >>= 16; + br >>= 16; + r >>= 16; + + /* Red */ + f = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy + + (bl & 0x000000ff) * distixy + (br & 0x000000ff) * distxy; + r |= f & 0x00ff0000; + + /* Alpha */ + f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy + + (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy; + r |= f & 0xff000000; + + return r; +} + +#endif + +static force_inline uint32_t +bits_image_fetch_pixel_bilinear (bits_image_t *image, + pixman_fixed_t x, + pixman_fixed_t y, 
+ get_pixel_t get_pixel) +{ + pixman_repeat_t repeat_mode = image->common.repeat; + int width = image->width; + int height = image->height; + int x1, y1, x2, y2; + uint32_t tl, tr, bl, br; + int32_t distx, disty; + + x1 = x - pixman_fixed_1 / 2; + y1 = y - pixman_fixed_1 / 2; + + distx = (x1 >> 8) & 0xff; + disty = (y1 >> 8) & 0xff; + + x1 = pixman_fixed_to_int (x1); + y1 = pixman_fixed_to_int (y1); + x2 = x1 + 1; + y2 = y1 + 1; + + if (repeat_mode != PIXMAN_REPEAT_NONE) + { + repeat (repeat_mode, width, &x1); + repeat (repeat_mode, height, &y1); + repeat (repeat_mode, width, &x2); + repeat (repeat_mode, height, &y2); + + tl = get_pixel (image, x1, y1, FALSE); + bl = get_pixel (image, x1, y2, FALSE); + tr = get_pixel (image, x2, y1, FALSE); + br = get_pixel (image, x2, y2, FALSE); + } + else + { + tl = get_pixel (image, x1, y1, TRUE); + tr = get_pixel (image, x2, y1, TRUE); + bl = get_pixel (image, x1, y2, TRUE); + br = get_pixel (image, x2, y2, TRUE); + } + + return bilinear_interpolation (tl, tr, bl, br, distx, disty); +} + +static void +bits_image_fetch_bilinear_no_repeat_8888 (pixman_image_t * ima, + int offset, + int line, + int width, + uint32_t * buffer, + const uint32_t * mask) +{ + bits_image_t *bits = &ima->bits; + pixman_fixed_t x_top, x_bottom, x; + pixman_fixed_t ux_top, ux_bottom, ux; + pixman_vector_t v; + uint32_t top_mask, bottom_mask; + uint32_t *top_row; + uint32_t *bottom_row; + uint32_t *end; + uint32_t zero[2] = { 0, 0 }; + uint32_t one = 1; + int y, y1, y2; + int disty; + int mask_inc; + int w; + + /* reference point is the center of the pixel */ + v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2; + v.vector[2] = pixman_fixed_1; + + if (!pixman_transform_point_3d (bits->common.transform, &v)) + return; + + ux = ux_top = ux_bottom = bits->common.transform->matrix[0][0]; + x = x_top = x_bottom = v.vector[0] - pixman_fixed_1/2; + + y = v.vector[1] - pixman_fixed_1/2; 
+ disty = (y >> 8) & 0xff; + + /* Load the pointers to the first and second lines from the source + * image that bilinear code must read. + * + * The main trick in this code is about the check if any line are + * outside of the image; + * + * When I realize that a line (any one) is outside, I change + * the pointer to a dummy area with zeros. Once I change this, I + * must be sure the pointer will not change, so I set the + * variables to each pointer increments inside the loop. + */ + y1 = pixman_fixed_to_int (y); + y2 = y1 + 1; + + if (y1 < 0 || y1 >= bits->height) + { + top_row = zero; + x_top = 0; + ux_top = 0; + } + else + { + top_row = bits->bits + y1 * bits->rowstride; + x_top = x; + ux_top = ux; + } + + if (y2 < 0 || y2 >= bits->height) + { + bottom_row = zero; + x_bottom = 0; + ux_bottom = 0; + } + else + { + bottom_row = bits->bits + y2 * bits->rowstride; + x_bottom = x; + ux_bottom = ux; + } + + /* Instead of checking whether the operation uses the mast in + * each loop iteration, verify this only once and prepare the + * variables to make the code smaller inside the loop. 
+ */ + if (!mask) + { + mask_inc = 0; + mask = &one; + } + else + { + /* If have a mask, prepare the variables to check it */ + mask_inc = 1; + } + + /* If both are zero, then the whole thing is zero */ + if (top_row == zero && bottom_row == zero) + { + memset (buffer, 0, width * sizeof (uint32_t)); + return; + } + else if (bits->format == PIXMAN_x8r8g8b8) + { + if (top_row == zero) + { + top_mask = 0; + bottom_mask = 0xff000000; + } + else if (bottom_row == zero) + { + top_mask = 0xff000000; + bottom_mask = 0; + } + else + { + top_mask = 0xff000000; + bottom_mask = 0xff000000; + } + } + else + { + top_mask = 0; + bottom_mask = 0; + } + + end = buffer + width; + + /* Zero fill to the left of the image */ + while (buffer < end && x < pixman_fixed_minus_1) + { + *buffer++ = 0; + x += ux; + x_top += ux_top; + x_bottom += ux_bottom; + mask += mask_inc; + } + + /* Left edge + */ + while (buffer < end && x < 0) + { + uint32_t tr, br; + int32_t distx; + + tr = top_row[pixman_fixed_to_int (x_top) + 1] | top_mask; + br = bottom_row[pixman_fixed_to_int (x_bottom) + 1] | bottom_mask; + + distx = (x >> 8) & 0xff; + + *buffer++ = bilinear_interpolation (0, tr, 0, br, distx, disty); + + x += ux; + x_top += ux_top; + x_bottom += ux_bottom; + mask += mask_inc; + } + + /* Main part */ + w = pixman_int_to_fixed (bits->width - 1); + + while (buffer < end && x < w) + { + if (*mask) + { + uint32_t tl, tr, bl, br; + int32_t distx; + + tl = top_row [pixman_fixed_to_int (x_top)] | top_mask; + tr = top_row [pixman_fixed_to_int (x_top) + 1] | top_mask; + bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask; + br = bottom_row [pixman_fixed_to_int (x_bottom) + 1] | bottom_mask; + + distx = (x >> 8) & 0xff; + + *buffer = bilinear_interpolation (tl, tr, bl, br, distx, disty); + } + + buffer++; + x += ux; + x_top += ux_top; + x_bottom += ux_bottom; + mask += mask_inc; + } + + /* Right Edge */ + w = pixman_int_to_fixed (bits->width); + while (buffer < end && x < w) + { + if (*mask) + { 
+ uint32_t tl, bl; + int32_t distx; + + tl = top_row [pixman_fixed_to_int (x_top)] | top_mask; + bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask; + + distx = (x >> 8) & 0xff; + + *buffer = bilinear_interpolation (tl, 0, bl, 0, distx, disty); + } + + buffer++; + x += ux; + x_top += ux_top; + x_bottom += ux_bottom; + mask += mask_inc; + } + + /* Zero fill to the left of the image */ + while (buffer < end) + *buffer++ = 0; +} + +static force_inline uint32_t +bits_image_fetch_pixel_convolution (bits_image_t *image, + pixman_fixed_t x, + pixman_fixed_t y, + get_pixel_t get_pixel) +{ + pixman_fixed_t *params = image->common.filter_params; + int x_off = (params[0] - pixman_fixed_1) >> 1; + int y_off = (params[1] - pixman_fixed_1) >> 1; + int32_t cwidth = pixman_fixed_to_int (params[0]); + int32_t cheight = pixman_fixed_to_int (params[1]); + int32_t srtot, sgtot, sbtot, satot; + int32_t i, j, x1, x2, y1, y2; + pixman_repeat_t repeat_mode = image->common.repeat; + int width = image->width; + int height = image->height; + + params += 2; + + x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off); + y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off); + x2 = x1 + cwidth; + y2 = y1 + cheight; + + srtot = sgtot = sbtot = satot = 0; + + for (i = y1; i < y2; ++i) + { + for (j = x1; j < x2; ++j) + { + int rx = j; + int ry = i; + + pixman_fixed_t f = *params; + + if (f) + { + uint32_t pixel; + + if (repeat_mode != PIXMAN_REPEAT_NONE) + { + repeat (repeat_mode, width, &rx); + repeat (repeat_mode, height, &ry); + + pixel = get_pixel (image, rx, ry, FALSE); + } + else + { + pixel = get_pixel (image, rx, ry, TRUE); + } + + srtot += RED_8 (pixel) * f; + sgtot += GREEN_8 (pixel) * f; + sbtot += BLUE_8 (pixel) * f; + satot += ALPHA_8 (pixel) * f; + } + + params++; + } + } + + satot >>= 16; + srtot >>= 16; + sgtot >>= 16; + sbtot >>= 16; + + satot = CLIP (satot, 0, 0xff); + srtot = CLIP (srtot, 0, 0xff); + sgtot = CLIP (sgtot, 0, 0xff); + sbtot = CLIP (sbtot, 0, 0xff); + + 
return ((satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot)); +} + +static force_inline uint32_t +bits_image_fetch_pixel_filtered (bits_image_t *image, + pixman_fixed_t x, + pixman_fixed_t y, + get_pixel_t get_pixel) +{ + switch (image->common.filter) + { + case PIXMAN_FILTER_NEAREST: + case PIXMAN_FILTER_FAST: + return bits_image_fetch_pixel_nearest (image, x, y, get_pixel); + break; + + case PIXMAN_FILTER_BILINEAR: + case PIXMAN_FILTER_GOOD: + case PIXMAN_FILTER_BEST: + return bits_image_fetch_pixel_bilinear (image, x, y, get_pixel); + break; + + case PIXMAN_FILTER_CONVOLUTION: + return bits_image_fetch_pixel_convolution (image, x, y, get_pixel); + break; + + default: + break; + } + + return 0; +} + +static void +bits_image_fetch_affine_no_alpha (pixman_image_t * image, + int offset, + int line, + int width, + uint32_t * buffer, + const uint32_t * mask) +{ + pixman_fixed_t x, y; + pixman_fixed_t ux, uy; + pixman_vector_t v; + int i; + + /* reference point is the center of the pixel */ + v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2; + v.vector[2] = pixman_fixed_1; + + if (image->common.transform) + { + if (!pixman_transform_point_3d (image->common.transform, &v)) + return; + + ux = image->common.transform->matrix[0][0]; + uy = image->common.transform->matrix[1][0]; + } + else + { + ux = pixman_fixed_1; + uy = 0; + } + + x = v.vector[0]; + y = v.vector[1]; + + for (i = 0; i < width; ++i) + { + if (!mask || mask[i]) + { + buffer[i] = bits_image_fetch_pixel_filtered ( + &image->bits, x, y, fetch_pixel_no_alpha); + } + + x += ux; + y += uy; + } +} + +/* General fetcher */ +static force_inline uint32_t +fetch_pixel_general (bits_image_t *image, int x, int y, pixman_bool_t check_bounds) +{ + uint32_t pixel; + + if (check_bounds && + (x < 0 || x >= image->width || y < 0 || y >= image->height)) + { + return 0; + } + + pixel = image->fetch_pixel_32 (image, x, y); + + if 
(image->common.alpha_map) + { + uint32_t pixel_a; + + x -= image->common.alpha_origin_x; + y -= image->common.alpha_origin_y; + + if (x < 0 || x >= image->common.alpha_map->width || + y < 0 || y >= image->common.alpha_map->height) + { + pixel_a = 0; + } + else + { + pixel_a = image->common.alpha_map->fetch_pixel_32 ( + image->common.alpha_map, x, y); + + pixel_a = ALPHA_8 (pixel_a); + } + + pixel &= 0x00ffffff; + pixel |= (pixel_a << 24); + } + + return pixel; +} + +static void +bits_image_fetch_general (pixman_image_t * image, + int offset, + int line, + int width, + uint32_t * buffer, + const uint32_t * mask) +{ + pixman_fixed_t x, y, w; + pixman_fixed_t ux, uy, uw; + pixman_vector_t v; + int i; + + /* reference point is the center of the pixel */ + v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2; + v.vector[2] = pixman_fixed_1; + + if (image->common.transform) + { + if (!pixman_transform_point_3d (image->common.transform, &v)) + return; + + ux = image->common.transform->matrix[0][0]; + uy = image->common.transform->matrix[1][0]; + uw = image->common.transform->matrix[2][0]; + } + else + { + ux = pixman_fixed_1; + uy = 0; + uw = 0; + } + + x = v.vector[0]; + y = v.vector[1]; + w = v.vector[2]; + + for (i = 0; i < width; ++i) + { + pixman_fixed_t x0, y0; + + if (!mask || mask[i]) + { + if (w != 0) + { + x0 = ((pixman_fixed_48_16_t)x << 16) / w; + y0 = ((pixman_fixed_48_16_t)y << 16) / w; + } + else + { + x0 = 0; + y0 = 0; + } + + buffer[i] = bits_image_fetch_pixel_filtered ( + &image->bits, x0, y0, fetch_pixel_general); + } + + x += ux; + y += uy; + w += uw; + } +} + +static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + +typedef uint32_t (* convert_pixel_t) (const uint8_t *row, int x); + +static force_inline void +bits_image_fetch_bilinear_affine (pixman_image_t * image, + int offset, + int line, + int width, + uint32_t * buffer, + const uint32_t * mask, + + convert_pixel_t 
convert_pixel, + pixman_format_code_t format, + pixman_repeat_t repeat_mode) +{ + pixman_fixed_t x, y; + pixman_fixed_t ux, uy; + pixman_vector_t v; + bits_image_t *bits = &image->bits; + int i; + + /* reference point is the center of the pixel */ + v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2; + v.vector[2] = pixman_fixed_1; + + if (!pixman_transform_point_3d (image->common.transform, &v)) + return; + + ux = image->common.transform->matrix[0][0]; + uy = image->common.transform->matrix[1][0]; + + x = v.vector[0]; + y = v.vector[1]; + + for (i = 0; i < width; ++i) + { + int x1, y1, x2, y2; + uint32_t tl, tr, bl, br; + int32_t distx, disty; + int width = image->bits.width; + int height = image->bits.height; + const uint8_t *row1; + const uint8_t *row2; + + if (mask && !mask[i]) + goto next; + + x1 = x - pixman_fixed_1 / 2; + y1 = y - pixman_fixed_1 / 2; + + distx = (x1 >> 8) & 0xff; + disty = (y1 >> 8) & 0xff; + + y1 = pixman_fixed_to_int (y1); + y2 = y1 + 1; + x1 = pixman_fixed_to_int (x1); + x2 = x1 + 1; + + if (repeat_mode != PIXMAN_REPEAT_NONE) + { + uint32_t mask; + + mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000; + + repeat (repeat_mode, width, &x1); + repeat (repeat_mode, height, &y1); + repeat (repeat_mode, width, &x2); + repeat (repeat_mode, height, &y2); + + row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1; + row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2; + + tl = convert_pixel (row1, x1) | mask; + tr = convert_pixel (row1, x2) | mask; + bl = convert_pixel (row2, x1) | mask; + br = convert_pixel (row2, x2) | mask; + } + else + { + uint32_t mask1, mask2; + int bpp; + + /* Note: PIXMAN_FORMAT_BPP() returns an unsigned value, + * which means if you use it in expressions, those + * expressions become unsigned themselves. Since + * the variables below can be negative in some cases, + * that will lead to crashes on 64 bit architectures. 
+ * + * So this line makes sure bpp is signed + */ + bpp = PIXMAN_FORMAT_BPP (format); + + if (x1 >= width || x2 < 0 || y1 >= height || y2 < 0) + { + buffer[i] = 0; + goto next; + } + + if (y2 == 0) + { + row1 = zero; + mask1 = 0; + } + else + { + row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1; + row1 += bpp / 8 * x1; + + mask1 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000; + } + + if (y1 == height - 1) + { + row2 = zero; + mask2 = 0; + } + else + { + row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2; + row2 += bpp / 8 * x1; + + mask2 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000; + } + + if (x2 == 0) + { + tl = 0; + bl = 0; + } + else + { + tl = convert_pixel (row1, 0) | mask1; + bl = convert_pixel (row2, 0) | mask2; + } + + if (x1 == width - 1) + { + tr = 0; + br = 0; + } + else + { + tr = convert_pixel (row1, 1) | mask1; + br = convert_pixel (row2, 1) | mask2; + } + } + + buffer[i] = bilinear_interpolation ( + tl, tr, bl, br, distx, disty); + + next: + x += ux; + y += uy; + } +} + +static force_inline void +bits_image_fetch_nearest_affine (pixman_image_t * image, + int offset, + int line, + int width, + uint32_t * buffer, + const uint32_t * mask, + + convert_pixel_t convert_pixel, + pixman_format_code_t format, + pixman_repeat_t repeat_mode) +{ + pixman_fixed_t x, y; + pixman_fixed_t ux, uy; + pixman_vector_t v; + bits_image_t *bits = &image->bits; + int i; + + /* reference point is the center of the pixel */ + v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2; + v.vector[2] = pixman_fixed_1; + + if (!pixman_transform_point_3d (image->common.transform, &v)) + return; + + ux = image->common.transform->matrix[0][0]; + uy = image->common.transform->matrix[1][0]; + + x = v.vector[0]; + y = v.vector[1]; + + for (i = 0; i < width; ++i) + { + int width, height, x0, y0; + const uint8_t *row; + + if (mask && !mask[i]) + goto next; + + width = image->bits.width; + height = 
image->bits.height; + x0 = pixman_fixed_to_int (x - pixman_fixed_e); + y0 = pixman_fixed_to_int (y - pixman_fixed_e); + + if (repeat_mode == PIXMAN_REPEAT_NONE && + (y0 < 0 || y0 >= height || x0 < 0 || x0 >= width)) + { + buffer[i] = 0; + } + else + { + uint32_t mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000; + + if (repeat_mode != PIXMAN_REPEAT_NONE) + { + repeat (repeat_mode, width, &x0); + repeat (repeat_mode, height, &y0); + } + + row = (uint8_t *)bits->bits + bits->rowstride * 4 * y0; + + buffer[i] = convert_pixel (row, x0) | mask; + } + + next: + x += ux; + y += uy; + } +} + +static force_inline uint32_t +convert_a8r8g8b8 (const uint8_t *row, int x) +{ + return *(((uint32_t *)row) + x); +} + +static force_inline uint32_t +convert_x8r8g8b8 (const uint8_t *row, int x) +{ + return *(((uint32_t *)row) + x); +} + +static force_inline uint32_t +convert_a8 (const uint8_t *row, int x) +{ + return *(row + x) << 24; +} + +static force_inline uint32_t +convert_r5g6b5 (const uint8_t *row, int x) +{ + return CONVERT_0565_TO_0888 (*((uint16_t *)row + x)); +} + +#define MAKE_BILINEAR_FETCHER(name, format, repeat_mode) \ + static void \ + bits_image_fetch_bilinear_affine_ ## name (pixman_image_t *image, \ + int offset, \ + int line, \ + int width, \ + uint32_t * buffer, \ + const uint32_t * mask) \ + { \ + bits_image_fetch_bilinear_affine (image, offset, line, \ + width, buffer, mask, \ + convert_ ## format, \ + PIXMAN_ ## format, \ + repeat_mode); \ + } + +#define MAKE_NEAREST_FETCHER(name, format, repeat_mode) \ + static void \ + bits_image_fetch_nearest_affine_ ## name (pixman_image_t *image, \ + int offset, \ + int line, \ + int width, \ + uint32_t * buffer, \ + const uint32_t * mask) \ + { \ + bits_image_fetch_nearest_affine (image, offset, line, \ + width, buffer, mask, \ + convert_ ## format, \ + PIXMAN_ ## format, \ + repeat_mode); \ + } + +#define MAKE_FETCHERS(name, format, repeat_mode) \ + MAKE_NEAREST_FETCHER (name, format, repeat_mode) \ + MAKE_BILINEAR_FETCHER 
(name, format, repeat_mode) + +MAKE_FETCHERS (pad_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_PAD) +MAKE_FETCHERS (none_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_NONE) +MAKE_FETCHERS (reflect_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_REFLECT) +MAKE_FETCHERS (normal_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_NORMAL) +MAKE_FETCHERS (pad_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_PAD) +MAKE_FETCHERS (none_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_NONE) +MAKE_FETCHERS (reflect_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_REFLECT) +MAKE_FETCHERS (normal_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_NORMAL) +MAKE_FETCHERS (pad_a8, a8, PIXMAN_REPEAT_PAD) +MAKE_FETCHERS (none_a8, a8, PIXMAN_REPEAT_NONE) +MAKE_FETCHERS (reflect_a8, a8, PIXMAN_REPEAT_REFLECT) +MAKE_FETCHERS (normal_a8, a8, PIXMAN_REPEAT_NORMAL) +MAKE_FETCHERS (pad_r5g6b5, r5g6b5, PIXMAN_REPEAT_PAD) +MAKE_FETCHERS (none_r5g6b5, r5g6b5, PIXMAN_REPEAT_NONE) +MAKE_FETCHERS (reflect_r5g6b5, r5g6b5, PIXMAN_REPEAT_REFLECT) +MAKE_FETCHERS (normal_r5g6b5, r5g6b5, PIXMAN_REPEAT_NORMAL) + +static void +bits_image_fetch_solid_32 (pixman_image_t * image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t * mask) +{ + uint32_t color; + uint32_t *end; + + color = image->bits.fetch_pixel_32 (&image->bits, 0, 0); + + end = buffer + width; + while (buffer < end) + *(buffer++) = color; +} + +static void +bits_image_fetch_solid_64 (pixman_image_t * image, + int x, + int y, + int width, + uint32_t * b, + const uint32_t * unused) +{ + uint64_t color; + uint64_t *buffer = (uint64_t *)b; + uint64_t *end; + + color = image->bits.fetch_pixel_64 (&image->bits, 0, 0); + + end = buffer + width; + while (buffer < end) + *(buffer++) = color; +} + +static void +bits_image_fetch_untransformed_repeat_none (bits_image_t *image, + pixman_bool_t wide, + int x, + int y, + int width, + uint32_t * buffer) +{ + uint32_t w; + + if (y < 0 || y >= image->height) + { + memset (buffer, 0, width * (wide? 8 : 4)); + return; + } + + if (x < 0) + { + w = MIN (width, -x); + + memset (buffer, 0, w * (wide ? 
8 : 4)); + + width -= w; + buffer += w * (wide? 2 : 1); + x += w; + } + + if (x < image->width) + { + w = MIN (width, image->width - x); + + if (wide) + image->fetch_scanline_64 ((pixman_image_t *)image, x, y, w, buffer, NULL); + else + image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL); + + width -= w; + buffer += w * (wide? 2 : 1); + x += w; + } + + memset (buffer, 0, width * (wide ? 8 : 4)); +} + +static void +bits_image_fetch_untransformed_repeat_normal (bits_image_t *image, + pixman_bool_t wide, + int x, + int y, + int width, + uint32_t * buffer) +{ + uint32_t w; + + while (y < 0) + y += image->height; + + while (y >= image->height) + y -= image->height; + + while (width) + { + while (x < 0) + x += image->width; + while (x >= image->width) + x -= image->width; + + w = MIN (width, image->width - x); + + if (wide) + image->fetch_scanline_64 ((pixman_image_t *)image, x, y, w, buffer, NULL); + else + image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL); + + buffer += w * (wide? 
2 : 1); + x += w; + width -= w; + } +} + +static void +bits_image_fetch_untransformed_32 (pixman_image_t * image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t * mask) +{ + if (image->common.repeat == PIXMAN_REPEAT_NONE) + { + bits_image_fetch_untransformed_repeat_none ( + &image->bits, FALSE, x, y, width, buffer); + } + else + { + bits_image_fetch_untransformed_repeat_normal ( + &image->bits, FALSE, x, y, width, buffer); + } +} + +static void +bits_image_fetch_untransformed_64 (pixman_image_t * image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t * unused) +{ + if (image->common.repeat == PIXMAN_REPEAT_NONE) + { + bits_image_fetch_untransformed_repeat_none ( + &image->bits, TRUE, x, y, width, buffer); + } + else + { + bits_image_fetch_untransformed_repeat_normal ( + &image->bits, TRUE, x, y, width, buffer); + } +} + +typedef struct +{ + pixman_format_code_t format; + uint32_t flags; + fetch_scanline_t fetch_32; + fetch_scanline_t fetch_64; +} fetcher_info_t; + +static const fetcher_info_t fetcher_info[] = +{ + { PIXMAN_solid, + FAST_PATH_NO_ALPHA_MAP, + bits_image_fetch_solid_32, + bits_image_fetch_solid_64 + }, + + { PIXMAN_any, + (FAST_PATH_NO_ALPHA_MAP | + FAST_PATH_ID_TRANSFORM | + FAST_PATH_NO_CONVOLUTION_FILTER | + FAST_PATH_NO_PAD_REPEAT | + FAST_PATH_NO_REFLECT_REPEAT), + bits_image_fetch_untransformed_32, + bits_image_fetch_untransformed_64 + }, + +#define FAST_BILINEAR_FLAGS \ + (FAST_PATH_NO_ALPHA_MAP | \ + FAST_PATH_NO_ACCESSORS | \ + FAST_PATH_HAS_TRANSFORM | \ + FAST_PATH_AFFINE_TRANSFORM | \ + FAST_PATH_X_UNIT_POSITIVE | \ + FAST_PATH_Y_UNIT_ZERO | \ + FAST_PATH_NONE_REPEAT | \ + FAST_PATH_BILINEAR_FILTER) + + { PIXMAN_a8r8g8b8, + FAST_BILINEAR_FLAGS, + bits_image_fetch_bilinear_no_repeat_8888, + _pixman_image_get_scanline_generic_64 + }, + + { PIXMAN_x8r8g8b8, + FAST_BILINEAR_FLAGS, + bits_image_fetch_bilinear_no_repeat_8888, + _pixman_image_get_scanline_generic_64 + }, + +#define GENERAL_BILINEAR_FLAGS 
\ + (FAST_PATH_NO_ALPHA_MAP | \ + FAST_PATH_NO_ACCESSORS | \ + FAST_PATH_HAS_TRANSFORM | \ + FAST_PATH_AFFINE_TRANSFORM | \ + FAST_PATH_BILINEAR_FILTER) + +#define GENERAL_NEAREST_FLAGS \ + (FAST_PATH_NO_ALPHA_MAP | \ + FAST_PATH_NO_ACCESSORS | \ + FAST_PATH_HAS_TRANSFORM | \ + FAST_PATH_AFFINE_TRANSFORM | \ + FAST_PATH_NEAREST_FILTER) + +#define BILINEAR_AFFINE_FAST_PATH(name, format, repeat) \ + { PIXMAN_ ## format, \ + GENERAL_BILINEAR_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \ + bits_image_fetch_bilinear_affine_ ## name, \ + _pixman_image_get_scanline_generic_64 \ + }, + +#define NEAREST_AFFINE_FAST_PATH(name, format, repeat) \ + { PIXMAN_ ## format, \ + GENERAL_NEAREST_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \ + bits_image_fetch_nearest_affine_ ## name, \ + _pixman_image_get_scanline_generic_64 \ + }, + +#define AFFINE_FAST_PATHS(name, format, repeat) \ + BILINEAR_AFFINE_FAST_PATH(name, format, repeat) \ + NEAREST_AFFINE_FAST_PATH(name, format, repeat) + + AFFINE_FAST_PATHS (pad_a8r8g8b8, a8r8g8b8, PAD) + AFFINE_FAST_PATHS (none_a8r8g8b8, a8r8g8b8, NONE) + AFFINE_FAST_PATHS (reflect_a8r8g8b8, a8r8g8b8, REFLECT) + AFFINE_FAST_PATHS (normal_a8r8g8b8, a8r8g8b8, NORMAL) + AFFINE_FAST_PATHS (pad_x8r8g8b8, x8r8g8b8, PAD) + AFFINE_FAST_PATHS (none_x8r8g8b8, x8r8g8b8, NONE) + AFFINE_FAST_PATHS (reflect_x8r8g8b8, x8r8g8b8, REFLECT) + AFFINE_FAST_PATHS (normal_x8r8g8b8, x8r8g8b8, NORMAL) + AFFINE_FAST_PATHS (pad_a8, a8, PAD) + AFFINE_FAST_PATHS (none_a8, a8, NONE) + AFFINE_FAST_PATHS (reflect_a8, a8, REFLECT) + AFFINE_FAST_PATHS (normal_a8, a8, NORMAL) + AFFINE_FAST_PATHS (pad_r5g6b5, r5g6b5, PAD) + AFFINE_FAST_PATHS (none_r5g6b5, r5g6b5, NONE) + AFFINE_FAST_PATHS (reflect_r5g6b5, r5g6b5, REFLECT) + AFFINE_FAST_PATHS (normal_r5g6b5, r5g6b5, NORMAL) + + /* Affine, no alpha */ + { PIXMAN_any, + (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_HAS_TRANSFORM | FAST_PATH_AFFINE_TRANSFORM), + bits_image_fetch_affine_no_alpha, + _pixman_image_get_scanline_generic_64 + }, + + /* General */ 
+ { PIXMAN_any, 0, bits_image_fetch_general, _pixman_image_get_scanline_generic_64 }, + + { PIXMAN_null }, +}; + +static void +bits_image_property_changed (pixman_image_t *image) +{ + uint32_t flags = image->common.flags; + pixman_format_code_t format = image->common.extended_format_code; + const fetcher_info_t *info; + + _pixman_bits_image_setup_accessors (&image->bits); + + info = fetcher_info; + while (info->format != PIXMAN_null) + { + if ((info->format == format || info->format == PIXMAN_any) && + (info->flags & flags) == info->flags) + { + image->bits.get_scanline_32 = info->fetch_32; + image->bits.get_scanline_64 = info->fetch_64; + break; + } + + info++; + } +} + +static uint32_t * +src_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask) +{ + iter->image->bits.get_scanline_32 ( + iter->image, iter->x, iter->y++, iter->width, iter->buffer, mask); + + return iter->buffer; +} + +static uint32_t * +src_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask) +{ + iter->image->bits.get_scanline_64 ( + iter->image, iter->x, iter->y++, iter->width, iter->buffer, mask); + + return iter->buffer; +} + +void +_pixman_bits_image_src_iter_init (pixman_image_t *image, pixman_iter_t *iter) +{ + if (iter->flags & ITER_NARROW) + iter->get_scanline = src_get_scanline_narrow; + else + iter->get_scanline = src_get_scanline_wide; +} + +static uint32_t * +dest_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask) +{ + pixman_image_t *image = iter->image; + int x = iter->x; + int y = iter->y; + int width = iter->width; + uint32_t * buffer = iter->buffer; + + image->bits.fetch_scanline_32 (image, x, y, width, buffer, mask); + if (image->common.alpha_map) + { + x -= image->common.alpha_origin_x; + y -= image->common.alpha_origin_y; + + image->common.alpha_map->fetch_scanline_32 ( + (pixman_image_t *)image->common.alpha_map, + x, y, width, buffer, mask); + } + + return iter->buffer; +} + +static uint32_t * +dest_get_scanline_wide (pixman_iter_t *iter, 
const uint32_t *mask) +{ + bits_image_t * image = &iter->image->bits; + int x = iter->x; + int y = iter->y; + int width = iter->width; + uint32_t * buffer = iter->buffer; + + image->fetch_scanline_64 ( + (pixman_image_t *)image, x, y, width, buffer, mask); + if (image->common.alpha_map) + { + x -= image->common.alpha_origin_x; + y -= image->common.alpha_origin_y; + + image->common.alpha_map->fetch_scanline_64 ( + (pixman_image_t *)image->common.alpha_map, x, y, width, buffer, mask); + } + + return iter->buffer; +} + +static void +dest_write_back_narrow (pixman_iter_t *iter) +{ + bits_image_t * image = &iter->image->bits; + int x = iter->x; + int y = iter->y; + int width = iter->width; + const uint32_t *buffer = iter->buffer; + + image->store_scanline_32 (image, x, y, width, buffer); + + if (image->common.alpha_map) + { + x -= image->common.alpha_origin_x; + y -= image->common.alpha_origin_y; + + image->common.alpha_map->store_scanline_32 ( + image->common.alpha_map, x, y, width, buffer); + } + + iter->y++; +} + +static void +dest_write_back_wide (pixman_iter_t *iter) +{ + bits_image_t * image = &iter->image->bits; + int x = iter->x; + int y = iter->y; + int width = iter->width; + const uint32_t *buffer = iter->buffer; + + image->store_scanline_64 (image, x, y, width, buffer); + + if (image->common.alpha_map) + { + x -= image->common.alpha_origin_x; + y -= image->common.alpha_origin_y; + + image->common.alpha_map->store_scanline_64 ( + image->common.alpha_map, x, y, width, buffer); + } + + iter->y++; +} + +static void +dest_write_back_direct (pixman_iter_t *iter) +{ + iter->buffer += iter->image->bits.rowstride; +} + +void +_pixman_bits_image_dest_iter_init (pixman_image_t *image, pixman_iter_t *iter) +{ + if (iter->flags & ITER_NARROW) + { + if (((image->common.flags & + (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_NO_ACCESSORS)) == + (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_NO_ACCESSORS)) && + (image->bits.format == PIXMAN_a8r8g8b8 || + (image->bits.format == PIXMAN_x8r8g8b8 && 
+ (iter->flags & ITER_LOCALIZED_ALPHA)))) + { + iter->buffer = image->bits.bits + iter->y * image->bits.rowstride + iter->x; + + iter->get_scanline = _pixman_iter_get_scanline_noop; + iter->write_back = dest_write_back_direct; + } + else + { + if ((iter->flags & (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) == + (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) + { + iter->get_scanline = _pixman_iter_get_scanline_noop; + } + else + { + iter->get_scanline = dest_get_scanline_narrow; + } + + iter->write_back = dest_write_back_narrow; + } + } + else + { + iter->get_scanline = dest_get_scanline_wide; + iter->write_back = dest_write_back_wide; + } +} + +static uint32_t * +create_bits (pixman_format_code_t format, + int width, + int height, + int * rowstride_bytes) +{ + int stride; + int buf_size; + int bpp; + + /* what follows is a long-winded way, avoiding any possibility of integer + * overflows, of saying: + * stride = ((width * bpp + 0x1f) >> 5) * sizeof (uint32_t); + */ + + bpp = PIXMAN_FORMAT_BPP (format); + if (pixman_multiply_overflows_int (width, bpp)) + return NULL; + + stride = width * bpp; + if (pixman_addition_overflows_int (stride, 0x1f)) + return NULL; + + stride += 0x1f; + stride >>= 5; + + stride *= sizeof (uint32_t); + + if (pixman_multiply_overflows_int (height, stride)) + return NULL; + + buf_size = height * stride; + + if (rowstride_bytes) + *rowstride_bytes = stride; + + return calloc (buf_size, 1); +} + +PIXMAN_EXPORT pixman_image_t * +pixman_image_create_bits (pixman_format_code_t format, + int width, + int height, + uint32_t * bits, + int rowstride_bytes) +{ + pixman_image_t *image; + uint32_t *free_me = NULL; + + /* must be a whole number of uint32_t's + */ + return_val_if_fail ( + bits == NULL || (rowstride_bytes % sizeof (uint32_t)) == 0, NULL); + + return_val_if_fail (PIXMAN_FORMAT_BPP (format) >= PIXMAN_FORMAT_DEPTH (format), NULL); + + if (!bits && width && height) + { + free_me = bits = create_bits (format, width, height, &rowstride_bytes); + if (!bits) + 
return NULL; + } + + image = _pixman_image_allocate (); + + if (!image) + { + if (free_me) + free (free_me); + + return NULL; + } + + image->type = BITS; + image->bits.format = format; + image->bits.width = width; + image->bits.height = height; + image->bits.bits = bits; + image->bits.free_me = free_me; + image->bits.read_func = NULL; + image->bits.write_func = NULL; + + /* The rowstride is stored in number of uint32_t */ + image->bits.rowstride = rowstride_bytes / (int) sizeof (uint32_t); + + image->bits.indexed = NULL; + + image->common.property_changed = bits_image_property_changed; + + _pixman_image_reset_clip_region (image); + + return image; +} diff --git a/pixman/pixman/pixman-conical-gradient.c b/pixman/pixman/pixman-conical-gradient.c index 9d7d2e8b5..e3f230262 100644 --- a/pixman/pixman/pixman-conical-gradient.c +++ b/pixman/pixman/pixman-conical-gradient.c @@ -1,214 +1,211 @@ -/* - * Copyright © 2000 SuSE, Inc. - * Copyright © 2007 Red Hat, Inc. - * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. - * 2005 Lars Knoll & Zack Rusin, Trolltech - * - * Permission to use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that - * copyright notice and this permission notice appear in supporting - * documentation, and that the name of Keith Packard not be used in - * advertising or publicity pertaining to distribution of the software without - * specific, written prior permission. Keith Packard makes no - * representations about the suitability of this software for any purpose. It - * is provided "as is" without express or implied warranty. 
- * - * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS - * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY - * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN - * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING - * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS - * SOFTWARE. - */ - -#ifdef HAVE_CONFIG_H -#include -#endif - -#include -#include -#include "pixman-private.h" - -static force_inline double -coordinates_to_parameter (double x, double y, double angle) -{ - double t; - - t = atan2 (y, x) + angle; - - while (t < 0) - t += 2 * M_PI; - - while (t >= 2 * M_PI) - t -= 2 * M_PI; - - return 1 - t * (1 / (2 * M_PI)); /* Scale t to [0, 1] and - * make rotation CCW - */ -} - -static uint32_t * -conical_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask) -{ - pixman_image_t *image = iter->image; - int x = iter->x; - int y = iter->y; - int width = iter->width; - uint32_t *buffer = iter->buffer; - - gradient_t *gradient = (gradient_t *)image; - conical_gradient_t *conical = (conical_gradient_t *)image; - uint32_t *end = buffer + width; - pixman_gradient_walker_t walker; - pixman_bool_t affine = TRUE; - double cx = 1.; - double cy = 0.; - double cz = 0.; - double rx = x + 0.5; - double ry = y + 0.5; - double rz = 1.; - - _pixman_gradient_walker_init (&walker, gradient, image->common.repeat); - - if (image->common.transform) - { - pixman_vector_t v; - - /* reference point is the center of the pixel */ - v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2; - v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2; - v.vector[2] = pixman_fixed_1; - - if (!pixman_transform_point_3d (image->common.transform, &v)) - return iter->buffer; - - cx = image->common.transform->matrix[0][0] / 65536.; - cy = image->common.transform->matrix[1][0] 
/ 65536.; - cz = image->common.transform->matrix[2][0] / 65536.; - - rx = v.vector[0] / 65536.; - ry = v.vector[1] / 65536.; - rz = v.vector[2] / 65536.; - - affine = - image->common.transform->matrix[2][0] == 0 && - v.vector[2] == pixman_fixed_1; - } - - if (affine) - { - rx -= conical->center.x / 65536.; - ry -= conical->center.y / 65536.; - - while (buffer < end) - { - if (!mask || *mask++) - { - double t = coordinates_to_parameter (rx, ry, conical->angle); - - *buffer = _pixman_gradient_walker_pixel ( - &walker, (pixman_fixed_48_16_t)pixman_double_to_fixed (t)); - } - - ++buffer; - - rx += cx; - ry += cy; - } - } - else - { - while (buffer < end) - { - double x, y; - - if (!mask || *mask++) - { - double t; - - if (rz != 0) - { - x = rx / rz; - y = ry / rz; - } - else - { - x = y = 0.; - } - - x -= conical->center.x / 65536.; - y -= conical->center.y / 65536.; - - t = coordinates_to_parameter (x, y, conical->angle); - - *buffer = _pixman_gradient_walker_pixel ( - &walker, (pixman_fixed_48_16_t)pixman_double_to_fixed (t)); - } - - ++buffer; - - rx += cx; - ry += cy; - rz += cz; - } - } - - iter->y++; - return iter->buffer; -} - -static uint32_t * -conical_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask) -{ - uint32_t *buffer = conical_get_scanline_narrow (iter, NULL); - - pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width); - - return buffer; -} - -void -_pixman_conical_gradient_iter_init (pixman_image_t *image, - pixman_iter_t *iter, - int x, int y, int width, int height, - uint8_t *buffer, iter_flags_t flags) -{ - if (flags & ITER_NARROW) - iter->get_scanline = conical_get_scanline_narrow; - else - iter->get_scanline = conical_get_scanline_wide; -} - -PIXMAN_EXPORT pixman_image_t * -pixman_image_create_conical_gradient (pixman_point_fixed_t * center, - pixman_fixed_t angle, - const pixman_gradient_stop_t *stops, - int n_stops) -{ - pixman_image_t *image = _pixman_image_allocate (); - conical_gradient_t *conical; - - if (!image) 
- return NULL; - - conical = &image->conical; - - if (!_pixman_init_gradient (&conical->common, stops, n_stops)) - { - free (image); - return NULL; - } - - angle = MOD (angle, pixman_int_to_fixed (360)); - - image->type = CONICAL; - - conical->center = *center; - conical->angle = (pixman_fixed_to_double (angle) / 180.0) * M_PI; - - return image; -} - +/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. + * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. + * 2005 Lars Knoll & Zack Rusin, Trolltech + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Keith Packard not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. Keith Packard makes no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include "pixman-private.h" + +static force_inline double +coordinates_to_parameter (double x, double y, double angle) +{ + double t; + + t = atan2 (y, x) + angle; + + while (t < 0) + t += 2 * M_PI; + + while (t >= 2 * M_PI) + t -= 2 * M_PI; + + return 1 - t * (1 / (2 * M_PI)); /* Scale t to [0, 1] and + * make rotation CCW + */ +} + +static uint32_t * +conical_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask) +{ + pixman_image_t *image = iter->image; + int x = iter->x; + int y = iter->y; + int width = iter->width; + uint32_t *buffer = iter->buffer; + + gradient_t *gradient = (gradient_t *)image; + conical_gradient_t *conical = (conical_gradient_t *)image; + uint32_t *end = buffer + width; + pixman_gradient_walker_t walker; + pixman_bool_t affine = TRUE; + double cx = 1.; + double cy = 0.; + double cz = 0.; + double rx = x + 0.5; + double ry = y + 0.5; + double rz = 1.; + + _pixman_gradient_walker_init (&walker, gradient, image->common.repeat); + + if (image->common.transform) + { + pixman_vector_t v; + + /* reference point is the center of the pixel */ + v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2; + v.vector[2] = pixman_fixed_1; + + if (!pixman_transform_point_3d (image->common.transform, &v)) + return iter->buffer; + + cx = image->common.transform->matrix[0][0] / 65536.; + cy = image->common.transform->matrix[1][0] / 65536.; + cz = image->common.transform->matrix[2][0] / 65536.; + + rx = v.vector[0] / 65536.; + ry = v.vector[1] / 65536.; + rz = v.vector[2] / 65536.; + + affine = + image->common.transform->matrix[2][0] == 0 && + v.vector[2] == pixman_fixed_1; + } + + if (affine) + { + rx -= conical->center.x / 65536.; + ry -= conical->center.y / 65536.; + + while (buffer < end) + { + if (!mask || *mask++) + { + double t = coordinates_to_parameter (rx, ry, conical->angle); + + *buffer = 
_pixman_gradient_walker_pixel ( + &walker, (pixman_fixed_48_16_t)pixman_double_to_fixed (t)); + } + + ++buffer; + + rx += cx; + ry += cy; + } + } + else + { + while (buffer < end) + { + double x, y; + + if (!mask || *mask++) + { + double t; + + if (rz != 0) + { + x = rx / rz; + y = ry / rz; + } + else + { + x = y = 0.; + } + + x -= conical->center.x / 65536.; + y -= conical->center.y / 65536.; + + t = coordinates_to_parameter (x, y, conical->angle); + + *buffer = _pixman_gradient_walker_pixel ( + &walker, (pixman_fixed_48_16_t)pixman_double_to_fixed (t)); + } + + ++buffer; + + rx += cx; + ry += cy; + rz += cz; + } + } + + iter->y++; + return iter->buffer; +} + +static uint32_t * +conical_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask) +{ + uint32_t *buffer = conical_get_scanline_narrow (iter, NULL); + + pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width); + + return buffer; +} + +void +_pixman_conical_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter) +{ + if (iter->flags & ITER_NARROW) + iter->get_scanline = conical_get_scanline_narrow; + else + iter->get_scanline = conical_get_scanline_wide; +} + +PIXMAN_EXPORT pixman_image_t * +pixman_image_create_conical_gradient (pixman_point_fixed_t * center, + pixman_fixed_t angle, + const pixman_gradient_stop_t *stops, + int n_stops) +{ + pixman_image_t *image = _pixman_image_allocate (); + conical_gradient_t *conical; + + if (!image) + return NULL; + + conical = &image->conical; + + if (!_pixman_init_gradient (&conical->common, stops, n_stops)) + { + free (image); + return NULL; + } + + angle = MOD (angle, pixman_int_to_fixed (360)); + + image->type = CONICAL; + + conical->center = *center; + conical->angle = (pixman_fixed_to_double (angle) / 180.0) * M_PI; + + return image; +} + diff --git a/pixman/pixman/pixman-general.c b/pixman/pixman/pixman-general.c index 872fb7e9f..5bac6c65a 100644 --- a/pixman/pixman/pixman-general.c +++ b/pixman/pixman/pixman-general.c @@ -1,311 
+1,275 @@ -/* - * Copyright © 2009 Red Hat, Inc. - * Copyright © 2000 SuSE, Inc. - * Copyright © 2007 Red Hat, Inc. - * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. - * 2005 Lars Knoll & Zack Rusin, Trolltech - * 2008 Aaron Plattner, NVIDIA Corporation - * - * Permission to use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that - * copyright notice and this permission notice appear in supporting - * documentation, and that the name of Red Hat not be used in advertising or - * publicity pertaining to distribution of the software without specific, - * written prior permission. Red Hat makes no representations about the - * suitability of this software for any purpose. It is provided "as is" - * without express or implied warranty. - * - * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS - * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY - * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN - * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING - * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS - * SOFTWARE. 
- */ -#ifdef HAVE_CONFIG_H -#include -#endif -#include -#include -#include -#include -#include -#include -#include -#include "pixman-private.h" - -static void -general_src_iter_init (pixman_implementation_t *imp, - pixman_iter_t *iter, - pixman_image_t *image, - int x, int y, int width, int height, - uint8_t *buffer, iter_flags_t flags) -{ - iter->image = image; - iter->x = x; - iter->y = y; - iter->width = width; - iter->buffer = (uint32_t *)buffer; - - if (image->type == SOLID) - { - _pixman_solid_fill_iter_init ( - image, iter, x, y, width, height, buffer, flags); - } - else if (image->type == LINEAR) - { - _pixman_linear_gradient_iter_init ( - image, iter, x, y, width, height, buffer, flags); - } - else if (image->type == RADIAL) - { - _pixman_radial_gradient_iter_init ( - image, iter, x, y, width, height, buffer, flags); - } - else if (image->type == CONICAL) - { - _pixman_conical_gradient_iter_init ( - image, iter, x, y, width, height, buffer, flags); - } - else if (image->type == BITS) - { - _pixman_bits_image_src_iter_init ( - image, iter, x, y, width, height, buffer, flags); - } - else - { - _pixman_log_error (FUNC, "Pixman bug: unknown image type\n"); - } -} - -static void -general_dest_iter_init (pixman_implementation_t *imp, - pixman_iter_t *iter, - pixman_image_t *image, - int x, int y, int width, int height, - uint8_t *buffer, iter_flags_t flags) -{ - iter->image = image; - iter->x = x; - iter->y = y; - iter->width = width; - iter->buffer = (uint32_t *)buffer; - - if (image->type == BITS) - { - _pixman_bits_image_dest_iter_init ( - image, iter, x, y, width, height, buffer, flags); - } - else - { - _pixman_log_error (FUNC, "Trying to write to a non-writable image"); - } -} - -typedef struct op_info_t op_info_t; -struct op_info_t -{ - uint8_t src, dst; -}; - -#define ITER_IGNORE_BOTH \ - (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB | ITER_LOCALIZED_ALPHA) - -static const op_info_t op_flags[PIXMAN_N_OPERATORS] = -{ - /* Src Dst */ - { ITER_IGNORE_BOTH, 
ITER_IGNORE_BOTH }, /* CLEAR */ - { ITER_LOCALIZED_ALPHA, ITER_IGNORE_BOTH }, /* SRC */ - { ITER_IGNORE_BOTH, ITER_LOCALIZED_ALPHA }, /* DST */ - { 0, ITER_LOCALIZED_ALPHA }, /* OVER */ - { ITER_LOCALIZED_ALPHA, 0 }, /* OVER_REVERSE */ - { ITER_LOCALIZED_ALPHA, ITER_IGNORE_RGB }, /* IN */ - { ITER_IGNORE_RGB, ITER_LOCALIZED_ALPHA }, /* IN_REVERSE */ - { ITER_LOCALIZED_ALPHA, ITER_IGNORE_RGB }, /* OUT */ - { ITER_IGNORE_RGB, ITER_LOCALIZED_ALPHA }, /* OUT_REVERSE */ - { 0, 0 }, /* ATOP */ - { 0, 0 }, /* ATOP_REVERSE */ - { 0, 0 }, /* XOR */ - { ITER_LOCALIZED_ALPHA, ITER_LOCALIZED_ALPHA }, /* ADD */ - { 0, 0 }, /* SATURATE */ -}; - -#define SCANLINE_BUFFER_LENGTH 8192 - -static void -general_composite_rect (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src, - pixman_image_t * mask, - pixman_image_t * dest, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint64_t stack_scanline_buffer[(SCANLINE_BUFFER_LENGTH * 3 + 7) / 8]; - uint8_t *scanline_buffer = (uint8_t *) stack_scanline_buffer; - uint8_t *src_buffer, *mask_buffer, *dest_buffer; - pixman_iter_t src_iter, mask_iter, dest_iter; - pixman_combine_32_func_t compose; - pixman_bool_t component_alpha; - iter_flags_t narrow, src_flags; - int Bpp; - int i; - - if ((src->common.flags & FAST_PATH_NARROW_FORMAT) && - (!mask || mask->common.flags & FAST_PATH_NARROW_FORMAT) && - (dest->common.flags & FAST_PATH_NARROW_FORMAT)) - { - narrow = ITER_NARROW; - Bpp = 4; - } - else - { - narrow = 0; - Bpp = 8; - } - - if (width * Bpp > SCANLINE_BUFFER_LENGTH) - { - scanline_buffer = pixman_malloc_abc (width, 3, Bpp); - - if (!scanline_buffer) - return; - } - - src_buffer = scanline_buffer; - mask_buffer = src_buffer + width * Bpp; - dest_buffer = mask_buffer + width * Bpp; - - /* src iter */ - src_flags = narrow | op_flags[op].src; - - _pixman_implementation_src_iter_init (imp->toplevel, &src_iter, src, - src_x, 
src_y, width, height, - src_buffer, src_flags); - - /* mask iter */ - if ((src_flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) == - (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) - { - /* If it doesn't matter what the source is, then it doesn't matter - * what the mask is - */ - mask = NULL; - } - - component_alpha = - mask && - mask->common.type == BITS && - mask->common.component_alpha && - PIXMAN_FORMAT_RGB (mask->bits.format); - - _pixman_implementation_src_iter_init ( - imp->toplevel, &mask_iter, mask, mask_x, mask_y, width, height, - mask_buffer, narrow | (component_alpha? 0 : ITER_IGNORE_RGB)); - - /* dest iter */ - _pixman_implementation_dest_iter_init (imp->toplevel, &dest_iter, dest, - dest_x, dest_y, width, height, - dest_buffer, - narrow | op_flags[op].dst); - - if (narrow) - { - if (component_alpha) - compose = _pixman_implementation_combine_32_ca; - else - compose = _pixman_implementation_combine_32; - } - else - { - if (component_alpha) - compose = (pixman_combine_32_func_t)_pixman_implementation_combine_64_ca; - else - compose = (pixman_combine_32_func_t)_pixman_implementation_combine_64; - } - - if (!compose) - return; - - for (i = 0; i < height; ++i) - { - uint32_t *s, *m, *d; - - m = mask_iter.get_scanline (&mask_iter, NULL); - s = src_iter.get_scanline (&src_iter, m); - d = dest_iter.get_scanline (&dest_iter, NULL); - - compose (imp->toplevel, op, d, s, m, width); - - dest_iter.write_back (&dest_iter); - } - - if (scanline_buffer != (uint8_t *) stack_scanline_buffer) - free (scanline_buffer); -} - -static const pixman_fast_path_t general_fast_path[] = -{ - { PIXMAN_OP_any, PIXMAN_any, 0, PIXMAN_any, 0, PIXMAN_any, 0, general_composite_rect }, - { PIXMAN_OP_NONE } -}; - -static pixman_bool_t -general_blt (pixman_implementation_t *imp, - uint32_t * src_bits, - uint32_t * dst_bits, - int src_stride, - int dst_stride, - int src_bpp, - int dst_bpp, - int src_x, - int src_y, - int dst_x, - int dst_y, - int width, - int height) -{ - /* We can't blit unless we 
have sse2 or mmx */ - - return FALSE; -} - -static pixman_bool_t -general_fill (pixman_implementation_t *imp, - uint32_t * bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t xor) -{ - return FALSE; -} - -pixman_implementation_t * -_pixman_implementation_create_general (void) -{ - pixman_implementation_t *imp = _pixman_implementation_create (NULL, general_fast_path); - - _pixman_setup_combiner_functions_32 (imp); - _pixman_setup_combiner_functions_64 (imp); - - imp->blt = general_blt; - imp->fill = general_fill; - imp->src_iter_init = general_src_iter_init; - imp->dest_iter_init = general_dest_iter_init; - - return imp; -} - +/* + * Copyright © 2009 Red Hat, Inc. + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. + * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. + * 2005 Lars Knoll & Zack Rusin, Trolltech + * 2008 Aaron Plattner, NVIDIA Corporation + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Red Hat not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. Red Hat makes no representations about the + * suitability of this software for any purpose. It is provided "as is" + * without express or implied warranty. 
+ * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + */ +#ifdef HAVE_CONFIG_H +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include "pixman-private.h" + +static void +general_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) +{ + pixman_image_t *image = iter->image; + + if (image->type == SOLID) + _pixman_solid_fill_iter_init (image, iter); + else if (image->type == LINEAR) + _pixman_linear_gradient_iter_init (image, iter); + else if (image->type == RADIAL) + _pixman_radial_gradient_iter_init (image, iter); + else if (image->type == CONICAL) + _pixman_conical_gradient_iter_init (image, iter); + else if (image->type == BITS) + _pixman_bits_image_src_iter_init (image, iter); + else + _pixman_log_error (FUNC, "Pixman bug: unknown image type\n"); +} + +static void +general_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) +{ + if (iter->image->type == BITS) + { + _pixman_bits_image_dest_iter_init (iter->image, iter); + } + else + { + _pixman_log_error (FUNC, "Trying to write to a non-writable image"); + } +} + +typedef struct op_info_t op_info_t; +struct op_info_t +{ + uint8_t src, dst; +}; + +#define ITER_IGNORE_BOTH \ + (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB | ITER_LOCALIZED_ALPHA) + +static const op_info_t op_flags[PIXMAN_N_OPERATORS] = +{ + /* Src Dst */ + { ITER_IGNORE_BOTH, ITER_IGNORE_BOTH }, /* CLEAR */ + { ITER_LOCALIZED_ALPHA, ITER_IGNORE_BOTH }, /* SRC */ + { ITER_IGNORE_BOTH, ITER_LOCALIZED_ALPHA }, /* DST */ + { 0, ITER_LOCALIZED_ALPHA }, /* 
OVER */ + { ITER_LOCALIZED_ALPHA, 0 }, /* OVER_REVERSE */ + { ITER_LOCALIZED_ALPHA, ITER_IGNORE_RGB }, /* IN */ + { ITER_IGNORE_RGB, ITER_LOCALIZED_ALPHA }, /* IN_REVERSE */ + { ITER_LOCALIZED_ALPHA, ITER_IGNORE_RGB }, /* OUT */ + { ITER_IGNORE_RGB, ITER_LOCALIZED_ALPHA }, /* OUT_REVERSE */ + { 0, 0 }, /* ATOP */ + { 0, 0 }, /* ATOP_REVERSE */ + { 0, 0 }, /* XOR */ + { ITER_LOCALIZED_ALPHA, ITER_LOCALIZED_ALPHA }, /* ADD */ + { 0, 0 }, /* SATURATE */ +}; + +#define SCANLINE_BUFFER_LENGTH 8192 + +static void +general_composite_rect (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src, + pixman_image_t * mask, + pixman_image_t * dest, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint64_t stack_scanline_buffer[(SCANLINE_BUFFER_LENGTH * 3 + 7) / 8]; + uint8_t *scanline_buffer = (uint8_t *) stack_scanline_buffer; + uint8_t *src_buffer, *mask_buffer, *dest_buffer; + pixman_iter_t src_iter, mask_iter, dest_iter; + pixman_combine_32_func_t compose; + pixman_bool_t component_alpha; + iter_flags_t narrow, src_flags; + int Bpp; + int i; + + if ((src->common.flags & FAST_PATH_NARROW_FORMAT) && + (!mask || mask->common.flags & FAST_PATH_NARROW_FORMAT) && + (dest->common.flags & FAST_PATH_NARROW_FORMAT)) + { + narrow = ITER_NARROW; + Bpp = 4; + } + else + { + narrow = 0; + Bpp = 8; + } + + if (width * Bpp > SCANLINE_BUFFER_LENGTH) + { + scanline_buffer = pixman_malloc_abc (width, 3, Bpp); + + if (!scanline_buffer) + return; + } + + src_buffer = scanline_buffer; + mask_buffer = src_buffer + width * Bpp; + dest_buffer = mask_buffer + width * Bpp; + + /* src iter */ + src_flags = narrow | op_flags[op].src; + + _pixman_implementation_src_iter_init (imp->toplevel, &src_iter, src, + src_x, src_y, width, height, + src_buffer, src_flags); + + /* mask iter */ + if ((src_flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) == + (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) + { + /* 
If it doesn't matter what the source is, then it doesn't matter + * what the mask is + */ + mask = NULL; + } + + component_alpha = + mask && + mask->common.type == BITS && + mask->common.component_alpha && + PIXMAN_FORMAT_RGB (mask->bits.format); + + _pixman_implementation_src_iter_init ( + imp->toplevel, &mask_iter, mask, mask_x, mask_y, width, height, + mask_buffer, narrow | (component_alpha? 0 : ITER_IGNORE_RGB)); + + /* dest iter */ + _pixman_implementation_dest_iter_init (imp->toplevel, &dest_iter, dest, + dest_x, dest_y, width, height, + dest_buffer, + narrow | op_flags[op].dst); + + if (narrow) + { + if (component_alpha) + compose = _pixman_implementation_combine_32_ca; + else + compose = _pixman_implementation_combine_32; + } + else + { + if (component_alpha) + compose = (pixman_combine_32_func_t)_pixman_implementation_combine_64_ca; + else + compose = (pixman_combine_32_func_t)_pixman_implementation_combine_64; + } + + if (!compose) + return; + + for (i = 0; i < height; ++i) + { + uint32_t *s, *m, *d; + + m = mask_iter.get_scanline (&mask_iter, NULL); + s = src_iter.get_scanline (&src_iter, m); + d = dest_iter.get_scanline (&dest_iter, NULL); + + compose (imp->toplevel, op, d, s, m, width); + + dest_iter.write_back (&dest_iter); + } + + if (scanline_buffer != (uint8_t *) stack_scanline_buffer) + free (scanline_buffer); +} + +static const pixman_fast_path_t general_fast_path[] = +{ + { PIXMAN_OP_any, PIXMAN_any, 0, PIXMAN_any, 0, PIXMAN_any, 0, general_composite_rect }, + { PIXMAN_OP_NONE } +}; + +static pixman_bool_t +general_blt (pixman_implementation_t *imp, + uint32_t * src_bits, + uint32_t * dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int height) +{ + /* We can't blit unless we have sse2 or mmx */ + + return FALSE; +} + +static pixman_bool_t +general_fill (pixman_implementation_t *imp, + uint32_t * bits, + int stride, + int bpp, + int x, + int y, + int 
width, + int height, + uint32_t xor) +{ + return FALSE; +} + +pixman_implementation_t * +_pixman_implementation_create_general (void) +{ + pixman_implementation_t *imp = _pixman_implementation_create (NULL, general_fast_path); + + _pixman_setup_combiner_functions_32 (imp); + _pixman_setup_combiner_functions_64 (imp); + + imp->blt = general_blt; + imp->fill = general_fill; + imp->src_iter_init = general_src_iter_init; + imp->dest_iter_init = general_dest_iter_init; + + return imp; +} + diff --git a/pixman/pixman/pixman-implementation.c b/pixman/pixman/pixman-implementation.c index adaf9c61e..caade9332 100644 --- a/pixman/pixman/pixman-implementation.c +++ b/pixman/pixman/pixman-implementation.c @@ -1,306 +1,304 @@ -/* - * Copyright © 2009 Red Hat, Inc. - * - * Permission to use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that - * copyright notice and this permission notice appear in supporting - * documentation, and that the name of Red Hat not be used in advertising or - * publicity pertaining to distribution of the software without specific, - * written prior permission. Red Hat makes no representations about the - * suitability of this software for any purpose. It is provided "as is" - * without express or implied warranty. - * - * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS - * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY - * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN - * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING - * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS - * SOFTWARE. 
- */ - -#ifdef HAVE_CONFIG_H -#include -#endif -#include -#include "pixman-private.h" - -static void -delegate_combine_32 (pixman_implementation_t * imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - _pixman_implementation_combine_32 (imp->delegate, - op, dest, src, mask, width); -} - -static void -delegate_combine_64 (pixman_implementation_t * imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - _pixman_implementation_combine_64 (imp->delegate, - op, dest, src, mask, width); -} - -static void -delegate_combine_32_ca (pixman_implementation_t * imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - _pixman_implementation_combine_32_ca (imp->delegate, - op, dest, src, mask, width); -} - -static void -delegate_combine_64_ca (pixman_implementation_t * imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - _pixman_implementation_combine_64_ca (imp->delegate, - op, dest, src, mask, width); -} - -static pixman_bool_t -delegate_blt (pixman_implementation_t * imp, - uint32_t * src_bits, - uint32_t * dst_bits, - int src_stride, - int dst_stride, - int src_bpp, - int dst_bpp, - int src_x, - int src_y, - int dst_x, - int dst_y, - int width, - int height) -{ - return _pixman_implementation_blt ( - imp->delegate, src_bits, dst_bits, src_stride, dst_stride, - src_bpp, dst_bpp, src_x, src_y, dst_x, dst_y, - width, height); -} - -static pixman_bool_t -delegate_fill (pixman_implementation_t *imp, - uint32_t * bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t xor) -{ - return _pixman_implementation_fill ( - imp->delegate, bits, stride, bpp, x, y, width, height, xor); -} - -static void -delegate_src_iter_init (pixman_implementation_t *imp, - pixman_iter_t * iter, - pixman_image_t * image, - int x, - int y, - int width, - int 
height, - uint8_t * buffer, - iter_flags_t flags) -{ - _pixman_implementation_src_iter_init ( - imp->delegate, iter, image, x, y, width, height, buffer, flags); -} - -static void -delegate_dest_iter_init (pixman_implementation_t *imp, - pixman_iter_t * iter, - pixman_image_t * image, - int x, - int y, - int width, - int height, - uint8_t * buffer, - iter_flags_t flags) -{ - _pixman_implementation_dest_iter_init ( - imp->delegate, iter, image, x, y, width, height, buffer, flags); -} - -pixman_implementation_t * -_pixman_implementation_create (pixman_implementation_t *delegate, - const pixman_fast_path_t *fast_paths) -{ - pixman_implementation_t *imp = malloc (sizeof (pixman_implementation_t)); - pixman_implementation_t *d; - int i; - - if (!imp) - return NULL; - - assert (fast_paths); - - /* Make sure the whole delegate chain has the right toplevel */ - imp->delegate = delegate; - for (d = imp; d != NULL; d = d->delegate) - d->toplevel = imp; - - /* Fill out function pointers with ones that just delegate - */ - imp->blt = delegate_blt; - imp->fill = delegate_fill; - imp->src_iter_init = delegate_src_iter_init; - imp->dest_iter_init = delegate_dest_iter_init; - - for (i = 0; i < PIXMAN_N_OPERATORS; ++i) - { - imp->combine_32[i] = delegate_combine_32; - imp->combine_64[i] = delegate_combine_64; - imp->combine_32_ca[i] = delegate_combine_32_ca; - imp->combine_64_ca[i] = delegate_combine_64_ca; - } - - imp->fast_paths = fast_paths; - - return imp; -} - -void -_pixman_implementation_combine_32 (pixman_implementation_t * imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - (*imp->combine_32[op]) (imp, op, dest, src, mask, width); -} - -void -_pixman_implementation_combine_64 (pixman_implementation_t * imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - (*imp->combine_64[op]) (imp, op, dest, src, mask, width); -} - -void -_pixman_implementation_combine_32_ca 
(pixman_implementation_t * imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - (*imp->combine_32_ca[op]) (imp, op, dest, src, mask, width); -} - -void -_pixman_implementation_combine_64_ca (pixman_implementation_t * imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - (*imp->combine_64_ca[op]) (imp, op, dest, src, mask, width); -} - -pixman_bool_t -_pixman_implementation_blt (pixman_implementation_t * imp, - uint32_t * src_bits, - uint32_t * dst_bits, - int src_stride, - int dst_stride, - int src_bpp, - int dst_bpp, - int src_x, - int src_y, - int dst_x, - int dst_y, - int width, - int height) -{ - return (*imp->blt) (imp, src_bits, dst_bits, src_stride, dst_stride, - src_bpp, dst_bpp, src_x, src_y, dst_x, dst_y, - width, height); -} - -pixman_bool_t -_pixman_implementation_fill (pixman_implementation_t *imp, - uint32_t * bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t xor) -{ - return (*imp->fill) (imp, bits, stride, bpp, x, y, width, height, xor); -} - -static uint32_t * -get_scanline_null (pixman_iter_t *iter, const uint32_t *mask) -{ - return NULL; -} - -void -_pixman_implementation_src_iter_init (pixman_implementation_t *imp, - pixman_iter_t *iter, - pixman_image_t *image, - int x, - int y, - int width, - int height, - uint8_t *buffer, - iter_flags_t flags) -{ - if (!image) - { - iter->get_scanline = get_scanline_null; - } - else if ((flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) == - (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) - { - iter->get_scanline = _pixman_iter_get_scanline_noop; - } - else - { - (*imp->src_iter_init) ( - imp, iter, image, x, y, width, height, buffer, flags); - } -} - -void -_pixman_implementation_dest_iter_init (pixman_implementation_t *imp, - pixman_iter_t *iter, - pixman_image_t *image, - int x, - int y, - int width, - int height, - uint8_t *buffer, - iter_flags_t flags) -{ - 
(*imp->dest_iter_init) ( - imp, iter, image, x, y, width, height, buffer, flags); -} +/* + * Copyright © 2009 Red Hat, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Red Hat not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. Red Hat makes no representations about the + * suitability of this software for any purpose. It is provided "as is" + * without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include +#endif +#include +#include "pixman-private.h" + +static void +delegate_combine_32 (pixman_implementation_t * imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + _pixman_implementation_combine_32 (imp->delegate, + op, dest, src, mask, width); +} + +static void +delegate_combine_64 (pixman_implementation_t * imp, + pixman_op_t op, + uint64_t * dest, + const uint64_t * src, + const uint64_t * mask, + int width) +{ + _pixman_implementation_combine_64 (imp->delegate, + op, dest, src, mask, width); +} + +static void +delegate_combine_32_ca (pixman_implementation_t * imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + _pixman_implementation_combine_32_ca (imp->delegate, + op, dest, src, mask, width); +} + +static void +delegate_combine_64_ca (pixman_implementation_t * imp, + pixman_op_t op, + uint64_t * dest, + const uint64_t * src, + const uint64_t * mask, + int width) +{ + _pixman_implementation_combine_64_ca (imp->delegate, + op, dest, src, mask, width); +} + +static pixman_bool_t +delegate_blt (pixman_implementation_t * imp, + uint32_t * src_bits, + uint32_t * dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int height) +{ + return _pixman_implementation_blt ( + imp->delegate, src_bits, dst_bits, src_stride, dst_stride, + src_bpp, dst_bpp, src_x, src_y, dst_x, dst_y, + width, height); +} + +static pixman_bool_t +delegate_fill (pixman_implementation_t *imp, + uint32_t * bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + return _pixman_implementation_fill ( + imp->delegate, bits, stride, bpp, x, y, width, height, xor); +} + +static void +delegate_src_iter_init (pixman_implementation_t *imp, + pixman_iter_t * iter) +{ + imp->delegate->src_iter_init (imp->delegate, iter); +} + 
+static void +delegate_dest_iter_init (pixman_implementation_t *imp, + pixman_iter_t * iter) +{ + imp->delegate->dest_iter_init (imp->delegate, iter); +} + +pixman_implementation_t * +_pixman_implementation_create (pixman_implementation_t *delegate, + const pixman_fast_path_t *fast_paths) +{ + pixman_implementation_t *imp = malloc (sizeof (pixman_implementation_t)); + pixman_implementation_t *d; + int i; + + if (!imp) + return NULL; + + assert (fast_paths); + + /* Make sure the whole delegate chain has the right toplevel */ + imp->delegate = delegate; + for (d = imp; d != NULL; d = d->delegate) + d->toplevel = imp; + + /* Fill out function pointers with ones that just delegate + */ + imp->blt = delegate_blt; + imp->fill = delegate_fill; + imp->src_iter_init = delegate_src_iter_init; + imp->dest_iter_init = delegate_dest_iter_init; + + for (i = 0; i < PIXMAN_N_OPERATORS; ++i) + { + imp->combine_32[i] = delegate_combine_32; + imp->combine_64[i] = delegate_combine_64; + imp->combine_32_ca[i] = delegate_combine_32_ca; + imp->combine_64_ca[i] = delegate_combine_64_ca; + } + + imp->fast_paths = fast_paths; + + return imp; +} + +void +_pixman_implementation_combine_32 (pixman_implementation_t * imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + (*imp->combine_32[op]) (imp, op, dest, src, mask, width); +} + +void +_pixman_implementation_combine_64 (pixman_implementation_t * imp, + pixman_op_t op, + uint64_t * dest, + const uint64_t * src, + const uint64_t * mask, + int width) +{ + (*imp->combine_64[op]) (imp, op, dest, src, mask, width); +} + +void +_pixman_implementation_combine_32_ca (pixman_implementation_t * imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + (*imp->combine_32_ca[op]) (imp, op, dest, src, mask, width); +} + +void +_pixman_implementation_combine_64_ca (pixman_implementation_t * imp, + pixman_op_t op, + uint64_t * dest, + const uint64_t * 
src, + const uint64_t * mask, + int width) +{ + (*imp->combine_64_ca[op]) (imp, op, dest, src, mask, width); +} + +pixman_bool_t +_pixman_implementation_blt (pixman_implementation_t * imp, + uint32_t * src_bits, + uint32_t * dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int height) +{ + return (*imp->blt) (imp, src_bits, dst_bits, src_stride, dst_stride, + src_bpp, dst_bpp, src_x, src_y, dst_x, dst_y, + width, height); +} + +pixman_bool_t +_pixman_implementation_fill (pixman_implementation_t *imp, + uint32_t * bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + return (*imp->fill) (imp, bits, stride, bpp, x, y, width, height, xor); +} + +static uint32_t * +get_scanline_null (pixman_iter_t *iter, const uint32_t *mask) +{ + return NULL; +} + +void +_pixman_implementation_src_iter_init (pixman_implementation_t *imp, + pixman_iter_t *iter, + pixman_image_t *image, + int x, + int y, + int width, + int height, + uint8_t *buffer, + iter_flags_t flags) +{ + iter->image = image; + iter->buffer = (uint32_t *)buffer; + iter->x = x; + iter->y = y; + iter->width = width; + iter->height = height; + iter->flags = flags; + + if (!image) + { + iter->get_scanline = get_scanline_null; + } + else if ((flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) == + (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) + { + iter->get_scanline = _pixman_iter_get_scanline_noop; + } + else + { + (*imp->src_iter_init) (imp, iter); + } +} + +void +_pixman_implementation_dest_iter_init (pixman_implementation_t *imp, + pixman_iter_t *iter, + pixman_image_t *image, + int x, + int y, + int width, + int height, + uint8_t *buffer, + iter_flags_t flags) +{ + iter->image = image; + iter->buffer = (uint32_t *)buffer; + iter->x = x; + iter->y = y; + iter->width = width; + iter->height = height; + iter->flags = flags; + + (*imp->dest_iter_init) (imp, iter); +} diff --git 
a/pixman/pixman/pixman-linear-gradient.c b/pixman/pixman/pixman-linear-gradient.c index 07303fc03..3d5bbf63d 100644 --- a/pixman/pixman/pixman-linear-gradient.c +++ b/pixman/pixman/pixman-linear-gradient.c @@ -1,292 +1,286 @@ -/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ -/* - * Copyright © 2000 SuSE, Inc. - * Copyright © 2007 Red Hat, Inc. - * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. - * 2005 Lars Knoll & Zack Rusin, Trolltech - * - * Permission to use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that - * copyright notice and this permission notice appear in supporting - * documentation, and that the name of Keith Packard not be used in - * advertising or publicity pertaining to distribution of the software without - * specific, written prior permission. Keith Packard makes no - * representations about the suitability of this software for any purpose. It - * is provided "as is" without express or implied warranty. - * - * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS - * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY - * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN - * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING - * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS - * SOFTWARE. 
- */ - -#ifdef HAVE_CONFIG_H -#include -#endif -#include -#include "pixman-private.h" - -static pixman_bool_t -linear_gradient_is_horizontal (pixman_image_t *image, - int x, - int y, - int width, - int height) -{ - linear_gradient_t *linear = (linear_gradient_t *)image; - pixman_vector_t v; - pixman_fixed_32_32_t l; - pixman_fixed_48_16_t dx, dy; - double inc; - - if (image->common.transform) - { - /* projective transformation */ - if (image->common.transform->matrix[2][0] != 0 || - image->common.transform->matrix[2][1] != 0 || - image->common.transform->matrix[2][2] == 0) - { - return FALSE; - } - - v.vector[0] = image->common.transform->matrix[0][1]; - v.vector[1] = image->common.transform->matrix[1][1]; - v.vector[2] = image->common.transform->matrix[2][2]; - } - else - { - v.vector[0] = 0; - v.vector[1] = pixman_fixed_1; - v.vector[2] = pixman_fixed_1; - } - - dx = linear->p2.x - linear->p1.x; - dy = linear->p2.y - linear->p1.y; - - l = dx * dx + dy * dy; - - if (l == 0) - return FALSE; - - /* - * compute how much the input of the gradient walked changes - * when moving vertically through the whole image - */ - inc = height * (double) pixman_fixed_1 * pixman_fixed_1 * - (dx * v.vector[0] + dy * v.vector[1]) / - (v.vector[2] * (double) l); - - /* check that casting to integer would result in 0 */ - if (-1 < inc && inc < 1) - return TRUE; - - return FALSE; -} - -static uint32_t * -linear_get_scanline_narrow (pixman_iter_t *iter, - const uint32_t *mask) -{ - pixman_image_t *image = iter->image; - int x = iter->x; - int y = iter->y; - int width = iter->width; - uint32_t * buffer = iter->buffer; - - pixman_vector_t v, unit; - pixman_fixed_32_32_t l; - pixman_fixed_48_16_t dx, dy; - gradient_t *gradient = (gradient_t *)image; - linear_gradient_t *linear = (linear_gradient_t *)image; - uint32_t *end = buffer + width; - pixman_gradient_walker_t walker; - - _pixman_gradient_walker_init (&walker, gradient, image->common.repeat); - - /* reference point is the center of 
the pixel */ - v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2; - v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2; - v.vector[2] = pixman_fixed_1; - - if (image->common.transform) - { - if (!pixman_transform_point_3d (image->common.transform, &v)) - return iter->buffer; - - unit.vector[0] = image->common.transform->matrix[0][0]; - unit.vector[1] = image->common.transform->matrix[1][0]; - unit.vector[2] = image->common.transform->matrix[2][0]; - } - else - { - unit.vector[0] = pixman_fixed_1; - unit.vector[1] = 0; - unit.vector[2] = 0; - } - - dx = linear->p2.x - linear->p1.x; - dy = linear->p2.y - linear->p1.y; - - l = dx * dx + dy * dy; - - if (l == 0 || unit.vector[2] == 0) - { - /* affine transformation only */ - pixman_fixed_32_32_t t, next_inc; - double inc; - - if (l == 0 || v.vector[2] == 0) - { - t = 0; - inc = 0; - } - else - { - double invden, v2; - - invden = pixman_fixed_1 * (double) pixman_fixed_1 / - (l * (double) v.vector[2]); - v2 = v.vector[2] * (1. / pixman_fixed_1); - t = ((dx * v.vector[0] + dy * v.vector[1]) - - (dx * linear->p1.x + dy * linear->p1.y) * v2) * invden; - inc = (dx * unit.vector[0] + dy * unit.vector[1]) * invden; - } - next_inc = 0; - - if (((pixman_fixed_32_32_t )(inc * width)) == 0) - { - register uint32_t color; - - color = _pixman_gradient_walker_pixel (&walker, t); - while (buffer < end) - *buffer++ = color; - } - else - { - int i; - - i = 0; - while (buffer < end) - { - if (!mask || *mask++) - { - *buffer = _pixman_gradient_walker_pixel (&walker, - t + next_inc); - } - i++; - next_inc = inc * i; - buffer++; - } - } - } - else - { - /* projective transformation */ - double t; - - t = 0; - - while (buffer < end) - { - if (!mask || *mask++) - { - if (v.vector[2] != 0) - { - double invden, v2; - - invden = pixman_fixed_1 * (double) pixman_fixed_1 / - (l * (double) v.vector[2]); - v2 = v.vector[2] * (1. 
/ pixman_fixed_1); - t = ((dx * v.vector[0] + dy * v.vector[1]) - - (dx * linear->p1.x + dy * linear->p1.y) * v2) * invden; - } - - *buffer = _pixman_gradient_walker_pixel (&walker, t); - } - - ++buffer; - - v.vector[0] += unit.vector[0]; - v.vector[1] += unit.vector[1]; - v.vector[2] += unit.vector[2]; - } - } - - iter->y++; - - return iter->buffer; -} - -static uint32_t * -linear_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask) -{ - uint32_t *buffer = linear_get_scanline_narrow (iter, NULL); - - pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width); - - return buffer; -} - -void -_pixman_linear_gradient_iter_init (pixman_image_t *image, - pixman_iter_t *iter, - int x, - int y, - int width, - int height, - uint8_t *buffer, - iter_flags_t flags) -{ - if (linear_gradient_is_horizontal (image, x, y, width, height)) - { - if (flags & ITER_NARROW) - linear_get_scanline_narrow (iter, NULL); - else - linear_get_scanline_wide (iter, NULL); - - iter->get_scanline = _pixman_iter_get_scanline_noop; - } - else - { - if (flags & ITER_NARROW) - iter->get_scanline = linear_get_scanline_narrow; - else - iter->get_scanline = linear_get_scanline_wide; - } -} - -PIXMAN_EXPORT pixman_image_t * -pixman_image_create_linear_gradient (pixman_point_fixed_t * p1, - pixman_point_fixed_t * p2, - const pixman_gradient_stop_t *stops, - int n_stops) -{ - pixman_image_t *image; - linear_gradient_t *linear; - - image = _pixman_image_allocate (); - - if (!image) - return NULL; - - linear = &image->linear; - - if (!_pixman_init_gradient (&linear->common, stops, n_stops)) - { - free (image); - return NULL; - } - - linear->p1 = *p1; - linear->p2 = *p2; - - image->type = LINEAR; - - return image; -} - +/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ +/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. + * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. 
+ * 2005 Lars Knoll & Zack Rusin, Trolltech + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Keith Packard not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. Keith Packard makes no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include +#endif +#include +#include "pixman-private.h" + +static pixman_bool_t +linear_gradient_is_horizontal (pixman_image_t *image, + int x, + int y, + int width, + int height) +{ + linear_gradient_t *linear = (linear_gradient_t *)image; + pixman_vector_t v; + pixman_fixed_32_32_t l; + pixman_fixed_48_16_t dx, dy; + double inc; + + if (image->common.transform) + { + /* projective transformation */ + if (image->common.transform->matrix[2][0] != 0 || + image->common.transform->matrix[2][1] != 0 || + image->common.transform->matrix[2][2] == 0) + { + return FALSE; + } + + v.vector[0] = image->common.transform->matrix[0][1]; + v.vector[1] = image->common.transform->matrix[1][1]; + v.vector[2] = image->common.transform->matrix[2][2]; + } + else + { + v.vector[0] = 0; + v.vector[1] = pixman_fixed_1; + v.vector[2] = pixman_fixed_1; + } + + dx = linear->p2.x - linear->p1.x; + dy = linear->p2.y - linear->p1.y; + + l = dx * dx + dy * dy; + + if (l == 0) + return FALSE; + + /* + * compute how much the input of the gradient walked changes + * when moving vertically through the whole image + */ + inc = height * (double) pixman_fixed_1 * pixman_fixed_1 * + (dx * v.vector[0] + dy * v.vector[1]) / + (v.vector[2] * (double) l); + + /* check that casting to integer would result in 0 */ + if (-1 < inc && inc < 1) + return TRUE; + + return FALSE; +} + +static uint32_t * +linear_get_scanline_narrow (pixman_iter_t *iter, + const uint32_t *mask) +{ + pixman_image_t *image = iter->image; + int x = iter->x; + int y = iter->y; + int width = iter->width; + uint32_t * buffer = iter->buffer; + + pixman_vector_t v, unit; + pixman_fixed_32_32_t l; + pixman_fixed_48_16_t dx, dy; + gradient_t *gradient = (gradient_t *)image; + linear_gradient_t *linear = (linear_gradient_t *)image; + uint32_t *end = buffer + width; + pixman_gradient_walker_t walker; + + _pixman_gradient_walker_init (&walker, gradient, image->common.repeat); + + /* reference point is the center of 
the pixel */ + v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2; + v.vector[2] = pixman_fixed_1; + + if (image->common.transform) + { + if (!pixman_transform_point_3d (image->common.transform, &v)) + return iter->buffer; + + unit.vector[0] = image->common.transform->matrix[0][0]; + unit.vector[1] = image->common.transform->matrix[1][0]; + unit.vector[2] = image->common.transform->matrix[2][0]; + } + else + { + unit.vector[0] = pixman_fixed_1; + unit.vector[1] = 0; + unit.vector[2] = 0; + } + + dx = linear->p2.x - linear->p1.x; + dy = linear->p2.y - linear->p1.y; + + l = dx * dx + dy * dy; + + if (l == 0 || unit.vector[2] == 0) + { + /* affine transformation only */ + pixman_fixed_32_32_t t, next_inc; + double inc; + + if (l == 0 || v.vector[2] == 0) + { + t = 0; + inc = 0; + } + else + { + double invden, v2; + + invden = pixman_fixed_1 * (double) pixman_fixed_1 / + (l * (double) v.vector[2]); + v2 = v.vector[2] * (1. / pixman_fixed_1); + t = ((dx * v.vector[0] + dy * v.vector[1]) - + (dx * linear->p1.x + dy * linear->p1.y) * v2) * invden; + inc = (dx * unit.vector[0] + dy * unit.vector[1]) * invden; + } + next_inc = 0; + + if (((pixman_fixed_32_32_t )(inc * width)) == 0) + { + register uint32_t color; + + color = _pixman_gradient_walker_pixel (&walker, t); + while (buffer < end) + *buffer++ = color; + } + else + { + int i; + + i = 0; + while (buffer < end) + { + if (!mask || *mask++) + { + *buffer = _pixman_gradient_walker_pixel (&walker, + t + next_inc); + } + i++; + next_inc = inc * i; + buffer++; + } + } + } + else + { + /* projective transformation */ + double t; + + t = 0; + + while (buffer < end) + { + if (!mask || *mask++) + { + if (v.vector[2] != 0) + { + double invden, v2; + + invden = pixman_fixed_1 * (double) pixman_fixed_1 / + (l * (double) v.vector[2]); + v2 = v.vector[2] * (1. 
/ pixman_fixed_1); + t = ((dx * v.vector[0] + dy * v.vector[1]) - + (dx * linear->p1.x + dy * linear->p1.y) * v2) * invden; + } + + *buffer = _pixman_gradient_walker_pixel (&walker, t); + } + + ++buffer; + + v.vector[0] += unit.vector[0]; + v.vector[1] += unit.vector[1]; + v.vector[2] += unit.vector[2]; + } + } + + iter->y++; + + return iter->buffer; +} + +static uint32_t * +linear_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask) +{ + uint32_t *buffer = linear_get_scanline_narrow (iter, NULL); + + pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width); + + return buffer; +} + +void +_pixman_linear_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter) +{ + if (linear_gradient_is_horizontal ( + iter->image, iter->x, iter->y, iter->width, iter->height)) + { + if (iter->flags & ITER_NARROW) + linear_get_scanline_narrow (iter, NULL); + else + linear_get_scanline_wide (iter, NULL); + + iter->get_scanline = _pixman_iter_get_scanline_noop; + } + else + { + if (iter->flags & ITER_NARROW) + iter->get_scanline = linear_get_scanline_narrow; + else + iter->get_scanline = linear_get_scanline_wide; + } +} + +PIXMAN_EXPORT pixman_image_t * +pixman_image_create_linear_gradient (pixman_point_fixed_t * p1, + pixman_point_fixed_t * p2, + const pixman_gradient_stop_t *stops, + int n_stops) +{ + pixman_image_t *image; + linear_gradient_t *linear; + + image = _pixman_image_allocate (); + + if (!image) + return NULL; + + linear = &image->linear; + + if (!_pixman_init_gradient (&linear->common, stops, n_stops)) + { + free (image); + return NULL; + } + + linear->p1 = *p1; + linear->p2 = *p2; + + image->type = LINEAR; + + return image; +} + diff --git a/pixman/pixman/pixman-private.h b/pixman/pixman/pixman-private.h index ee7f4d676..658aeea8a 100644 --- a/pixman/pixman/pixman-private.h +++ b/pixman/pixman/pixman-private.h @@ -212,14 +212,19 @@ typedef enum struct pixman_iter_t { - pixman_iter_get_scanline_t get_scanline; - pixman_iter_write_back_t 
write_back; - + /* These are initialized by _pixman_implementation_{src,dest}_init */ pixman_image_t * image; uint32_t * buffer; int x, y; int width; + int height; + iter_flags_t flags; + /* These function pointers are initialized by the implementation */ + pixman_iter_get_scanline_t get_scanline; + pixman_iter_write_back_t write_back; + + /* These fields are scratch data that implementations can use */ uint8_t * bits; int stride; }; @@ -228,39 +233,22 @@ void _pixman_bits_image_setup_accessors (bits_image_t *image); void -_pixman_bits_image_src_iter_init (pixman_image_t *image, - pixman_iter_t *iter, - int x, int y, int width, int height, - uint8_t *buffer, iter_flags_t flags); +_pixman_bits_image_src_iter_init (pixman_image_t *image, pixman_iter_t *iter); + void -_pixman_bits_image_dest_iter_init (pixman_image_t *image, - pixman_iter_t *iter, - int x, int y, int width, int height, - uint8_t *buffer, iter_flags_t flags); +_pixman_bits_image_dest_iter_init (pixman_image_t *image, pixman_iter_t *iter); void -_pixman_solid_fill_iter_init (pixman_image_t *image, - pixman_iter_t *iter, - int x, int y, int width, int height, - uint8_t *buffer, iter_flags_t flags); +_pixman_solid_fill_iter_init (pixman_image_t *image, pixman_iter_t *iter); void -_pixman_linear_gradient_iter_init (pixman_image_t *image, - pixman_iter_t *iter, - int x, int y, int width, int height, - uint8_t *buffer, iter_flags_t flags); +_pixman_linear_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter); void -_pixman_radial_gradient_iter_init (pixman_image_t *image, - pixman_iter_t *iter, - int x, int y, int width, int height, - uint8_t *buffer, iter_flags_t flags); +_pixman_radial_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter); void -_pixman_conical_gradient_iter_init (pixman_image_t *image, - pixman_iter_t *iter, - int x, int y, int width, int height, - uint8_t *buffer, iter_flags_t flags); +_pixman_conical_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter); 
pixman_image_t * _pixman_image_allocate (void); @@ -408,14 +396,7 @@ typedef pixman_bool_t (*pixman_fill_func_t) (pixman_implementation_t *imp, int height, uint32_t xor); typedef void (*pixman_iter_init_func_t) (pixman_implementation_t *imp, - pixman_iter_t *iter, - pixman_image_t *image, - int x, - int y, - int width, - int height, - uint8_t *buffer, - iter_flags_t flags); + pixman_iter_t *iter); void _pixman_setup_combiner_functions_32 (pixman_implementation_t *imp); void _pixman_setup_combiner_functions_64 (pixman_implementation_t *imp); diff --git a/pixman/pixman/pixman-radial-gradient.c b/pixman/pixman/pixman-radial-gradient.c index 6523b8259..63c712cc2 100644 --- a/pixman/pixman/pixman-radial-gradient.c +++ b/pixman/pixman/pixman-radial-gradient.c @@ -1,463 +1,460 @@ -/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ -/* - * - * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. - * Copyright © 2000 SuSE, Inc. - * 2005 Lars Knoll & Zack Rusin, Trolltech - * Copyright © 2007 Red Hat, Inc. - * - * - * Permission to use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that - * copyright notice and this permission notice appear in supporting - * documentation, and that the name of Keith Packard not be used in - * advertising or publicity pertaining to distribution of the software without - * specific, written prior permission. Keith Packard makes no - * representations about the suitability of this software for any purpose. It - * is provided "as is" without express or implied warranty. 
- * - * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS - * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY - * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN - * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING - * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS - * SOFTWARE. - */ - -#ifdef HAVE_CONFIG_H -#include -#endif -#include -#include -#include "pixman-private.h" - -static inline pixman_fixed_32_32_t -dot (pixman_fixed_48_16_t x1, - pixman_fixed_48_16_t y1, - pixman_fixed_48_16_t z1, - pixman_fixed_48_16_t x2, - pixman_fixed_48_16_t y2, - pixman_fixed_48_16_t z2) -{ - /* - * Exact computation, assuming that the input values can - * be represented as pixman_fixed_16_16_t - */ - return x1 * x2 + y1 * y2 + z1 * z2; -} - -static inline double -fdot (double x1, - double y1, - double z1, - double x2, - double y2, - double z2) -{ - /* - * Error can be unbound in some special cases. 
- * Using clever dot product algorithms (for example compensated - * dot product) would improve this but make the code much less - * obvious - */ - return x1 * x2 + y1 * y2 + z1 * z2; -} - -static uint32_t -radial_compute_color (double a, - double b, - double c, - double inva, - double dr, - double mindr, - pixman_gradient_walker_t *walker, - pixman_repeat_t repeat) -{ - /* - * In this function error propagation can lead to bad results: - * - det can have an unbound error (if b*b-a*c is very small), - * potentially making it the opposite sign of what it should have been - * (thus clearing a pixel that would have been colored or vice-versa) - * or propagating the error to sqrtdet; - * if det has the wrong sign or b is very small, this can lead to bad - * results - * - * - the algorithm used to compute the solutions of the quadratic - * equation is not numerically stable (but saves one division compared - * to the numerically stable one); - * this can be a problem if a*c is much smaller than b*b - * - * - the above problems are worse if a is small (as inva becomes bigger) - */ - double det; - - if (a == 0) - { - double t; - - if (b == 0) - return 0; - - t = pixman_fixed_1 / 2 * c / b; - if (repeat == PIXMAN_REPEAT_NONE) - { - if (0 <= t && t <= pixman_fixed_1) - return _pixman_gradient_walker_pixel (walker, t); - } - else - { - if (t * dr > mindr) - return _pixman_gradient_walker_pixel (walker, t); - } - - return 0; - } - - det = fdot (b, a, 0, b, -c, 0); - if (det >= 0) - { - double sqrtdet, t0, t1; - - sqrtdet = sqrt (det); - t0 = (b + sqrtdet) * inva; - t1 = (b - sqrtdet) * inva; - - if (repeat == PIXMAN_REPEAT_NONE) - { - if (0 <= t0 && t0 <= pixman_fixed_1) - return _pixman_gradient_walker_pixel (walker, t0); - else if (0 <= t1 && t1 <= pixman_fixed_1) - return _pixman_gradient_walker_pixel (walker, t1); - } - else - { - if (t0 * dr > mindr) - return _pixman_gradient_walker_pixel (walker, t0); - else if (t1 * dr > mindr) - return _pixman_gradient_walker_pixel 
(walker, t1); - } - } - - return 0; -} - -static uint32_t * -radial_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask) -{ - /* - * Implementation of radial gradients following the PDF specification. - * See section 8.7.4.5.4 Type 3 (Radial) Shadings of the PDF Reference - * Manual (PDF 32000-1:2008 at the time of this writing). - * - * In the radial gradient problem we are given two circles (c₁,r₁) and - * (c₂,r₂) that define the gradient itself. - * - * Mathematically the gradient can be defined as the family of circles - * - * ((1-t)·c₁ + t·(c₂), (1-t)·r₁ + t·r₂) - * - * excluding those circles whose radius would be < 0. When a point - * belongs to more than one circle, the one with a bigger t is the only - * one that contributes to its color. When a point does not belong - * to any of the circles, it is transparent black, i.e. RGBA (0, 0, 0, 0). - * Further limitations on the range of values for t are imposed when - * the gradient is not repeated, namely t must belong to [0,1]. - * - * The graphical result is the same as drawing the valid (radius > 0) - * circles with increasing t in [-inf, +inf] (or in [0,1] if the gradient - * is not repeated) using SOURCE operatior composition. - * - * It looks like a cone pointing towards the viewer if the ending circle - * is smaller than the starting one, a cone pointing inside the page if - * the starting circle is the smaller one and like a cylinder if they - * have the same radius. - * - * What we actually do is, given the point whose color we are interested - * in, compute the t values for that point, solving for t in: - * - * length((1-t)·c₁ + t·(c₂) - p) = (1-t)·r₁ + t·r₂ - * - * Let's rewrite it in a simpler way, by defining some auxiliary - * variables: - * - * cd = c₂ - c₁ - * pd = p - c₁ - * dr = r₂ - r₁ - * lenght(t·cd - pd) = r₁ + t·dr - * - * which actually means - * - * hypot(t·cdx - pdx, t·cdy - pdy) = r₁ + t·dr - * - * or - * - * ⎷((t·cdx - pdx)² + (t·cdy - pdy)²) = r₁ + t·dr. 
- * - * If we impose (as stated earlier) that r₁ + t·dr >= 0, it becomes: - * - * (t·cdx - pdx)² + (t·cdy - pdy)² = (r₁ + t·dr)² - * - * where we can actually expand the squares and solve for t: - * - * t²cdx² - 2t·cdx·pdx + pdx² + t²cdy² - 2t·cdy·pdy + pdy² = - * = r₁² + 2·r₁·t·dr + t²·dr² - * - * (cdx² + cdy² - dr²)t² - 2(cdx·pdx + cdy·pdy + r₁·dr)t + - * (pdx² + pdy² - r₁²) = 0 - * - * A = cdx² + cdy² - dr² - * B = pdx·cdx + pdy·cdy + r₁·dr - * C = pdx² + pdy² - r₁² - * At² - 2Bt + C = 0 - * - * The solutions (unless the equation degenerates because of A = 0) are: - * - * t = (B ± ⎷(B² - A·C)) / A - * - * The solution we are going to prefer is the bigger one, unless the - * radius associated to it is negative (or it falls outside the valid t - * range). - * - * Additional observations (useful for optimizations): - * A does not depend on p - * - * A < 0 <=> one of the two circles completely contains the other one - * <=> for every p, the radiuses associated with the two t solutions - * have opposite sign - */ - pixman_image_t *image = iter->image; - int x = iter->x; - int y = iter->y; - int width = iter->width; - uint32_t *buffer = iter->buffer; - - gradient_t *gradient = (gradient_t *)image; - radial_gradient_t *radial = (radial_gradient_t *)image; - uint32_t *end = buffer + width; - pixman_gradient_walker_t walker; - pixman_vector_t v, unit; - - /* reference point is the center of the pixel */ - v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2; - v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2; - v.vector[2] = pixman_fixed_1; - - _pixman_gradient_walker_init (&walker, gradient, image->common.repeat); - - if (image->common.transform) - { - if (!pixman_transform_point_3d (image->common.transform, &v)) - return iter->buffer; - - unit.vector[0] = image->common.transform->matrix[0][0]; - unit.vector[1] = image->common.transform->matrix[1][0]; - unit.vector[2] = image->common.transform->matrix[2][0]; - } - else - { - unit.vector[0] = 
pixman_fixed_1; - unit.vector[1] = 0; - unit.vector[2] = 0; - } - - if (unit.vector[2] == 0 && v.vector[2] == pixman_fixed_1) - { - /* - * Given: - * - * t = (B ± ⎷(B² - A·C)) / A - * - * where - * - * A = cdx² + cdy² - dr² - * B = pdx·cdx + pdy·cdy + r₁·dr - * C = pdx² + pdy² - r₁² - * det = B² - A·C - * - * Since we have an affine transformation, we know that (pdx, pdy) - * increase linearly with each pixel, - * - * pdx = pdx₀ + n·ux, - * pdy = pdy₀ + n·uy, - * - * we can then express B, C and det through multiple differentiation. - */ - pixman_fixed_32_32_t b, db, c, dc, ddc; - - /* warning: this computation may overflow */ - v.vector[0] -= radial->c1.x; - v.vector[1] -= radial->c1.y; - - /* - * B and C are computed and updated exactly. - * If fdot was used instead of dot, in the worst case it would - * lose 11 bits of precision in each of the multiplication and - * summing up would zero out all the bit that were preserved, - * thus making the result 0 instead of the correct one. - * This would mean a worst case of unbound relative error or - * about 2^10 absolute error - */ - b = dot (v.vector[0], v.vector[1], radial->c1.radius, - radial->delta.x, radial->delta.y, radial->delta.radius); - db = dot (unit.vector[0], unit.vector[1], 0, - radial->delta.x, radial->delta.y, 0); - - c = dot (v.vector[0], v.vector[1], - -((pixman_fixed_48_16_t) radial->c1.radius), - v.vector[0], v.vector[1], radial->c1.radius); - dc = dot (2 * (pixman_fixed_48_16_t) v.vector[0] + unit.vector[0], - 2 * (pixman_fixed_48_16_t) v.vector[1] + unit.vector[1], - 0, - unit.vector[0], unit.vector[1], 0); - ddc = 2 * dot (unit.vector[0], unit.vector[1], 0, - unit.vector[0], unit.vector[1], 0); - - while (buffer < end) - { - if (!mask || *mask++) - { - *buffer = radial_compute_color (radial->a, b, c, - radial->inva, - radial->delta.radius, - radial->mindr, - &walker, - image->common.repeat); - } - - b += db; - c += dc; - dc += ddc; - ++buffer; - } - } - else - { - /* projective */ - /* Warning: - 
* error propagation guarantees are much looser than in the affine case - */ - while (buffer < end) - { - if (!mask || *mask++) - { - if (v.vector[2] != 0) - { - double pdx, pdy, invv2, b, c; - - invv2 = 1. * pixman_fixed_1 / v.vector[2]; - - pdx = v.vector[0] * invv2 - radial->c1.x; - /* / pixman_fixed_1 */ - - pdy = v.vector[1] * invv2 - radial->c1.y; - /* / pixman_fixed_1 */ - - b = fdot (pdx, pdy, radial->c1.radius, - radial->delta.x, radial->delta.y, - radial->delta.radius); - /* / pixman_fixed_1 / pixman_fixed_1 */ - - c = fdot (pdx, pdy, -radial->c1.radius, - pdx, pdy, radial->c1.radius); - /* / pixman_fixed_1 / pixman_fixed_1 */ - - *buffer = radial_compute_color (radial->a, b, c, - radial->inva, - radial->delta.radius, - radial->mindr, - &walker, - image->common.repeat); - } - else - { - *buffer = 0; - } - } - - ++buffer; - - v.vector[0] += unit.vector[0]; - v.vector[1] += unit.vector[1]; - v.vector[2] += unit.vector[2]; - } - } - - iter->y++; - return iter->buffer; -} - -static uint32_t * -radial_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask) -{ - uint32_t *buffer = radial_get_scanline_narrow (iter, NULL); - - pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width); - - return buffer; -} - -void -_pixman_radial_gradient_iter_init (pixman_image_t *image, - pixman_iter_t *iter, - int x, int y, int width, int height, - uint8_t *buffer, iter_flags_t flags) -{ - if (flags & ITER_NARROW) - iter->get_scanline = radial_get_scanline_narrow; - else - iter->get_scanline = radial_get_scanline_wide; -} - -PIXMAN_EXPORT pixman_image_t * -pixman_image_create_radial_gradient (pixman_point_fixed_t * inner, - pixman_point_fixed_t * outer, - pixman_fixed_t inner_radius, - pixman_fixed_t outer_radius, - const pixman_gradient_stop_t *stops, - int n_stops) -{ - pixman_image_t *image; - radial_gradient_t *radial; - - image = _pixman_image_allocate (); - - if (!image) - return NULL; - - radial = &image->radial; - - if (!_pixman_init_gradient 
(&radial->common, stops, n_stops)) - { - free (image); - return NULL; - } - - image->type = RADIAL; - - radial->c1.x = inner->x; - radial->c1.y = inner->y; - radial->c1.radius = inner_radius; - radial->c2.x = outer->x; - radial->c2.y = outer->y; - radial->c2.radius = outer_radius; - - /* warning: this computations may overflow */ - radial->delta.x = radial->c2.x - radial->c1.x; - radial->delta.y = radial->c2.y - radial->c1.y; - radial->delta.radius = radial->c2.radius - radial->c1.radius; - - /* computed exactly, then cast to double -> every bit of the double - representation is correct (53 bits) */ - radial->a = dot (radial->delta.x, radial->delta.y, -radial->delta.radius, - radial->delta.x, radial->delta.y, radial->delta.radius); - if (radial->a != 0) - radial->inva = 1. * pixman_fixed_1 / radial->a; - - radial->mindr = -1. * pixman_fixed_1 * radial->c1.radius; - - return image; -} - +/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ +/* + * + * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. + * Copyright © 2000 SuSE, Inc. + * 2005 Lars Knoll & Zack Rusin, Trolltech + * Copyright © 2007 Red Hat, Inc. + * + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Keith Packard not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. Keith Packard makes no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. 
+ * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + */ + +#ifdef HAVE_CONFIG_H +#include +#endif +#include +#include +#include "pixman-private.h" + +static inline pixman_fixed_32_32_t +dot (pixman_fixed_48_16_t x1, + pixman_fixed_48_16_t y1, + pixman_fixed_48_16_t z1, + pixman_fixed_48_16_t x2, + pixman_fixed_48_16_t y2, + pixman_fixed_48_16_t z2) +{ + /* + * Exact computation, assuming that the input values can + * be represented as pixman_fixed_16_16_t + */ + return x1 * x2 + y1 * y2 + z1 * z2; +} + +static inline double +fdot (double x1, + double y1, + double z1, + double x2, + double y2, + double z2) +{ + /* + * Error can be unbound in some special cases. 
+ * Using clever dot product algorithms (for example compensated + * dot product) would improve this but make the code much less + * obvious + */ + return x1 * x2 + y1 * y2 + z1 * z2; +} + +static uint32_t +radial_compute_color (double a, + double b, + double c, + double inva, + double dr, + double mindr, + pixman_gradient_walker_t *walker, + pixman_repeat_t repeat) +{ + /* + * In this function error propagation can lead to bad results: + * - det can have an unbound error (if b*b-a*c is very small), + * potentially making it the opposite sign of what it should have been + * (thus clearing a pixel that would have been colored or vice-versa) + * or propagating the error to sqrtdet; + * if det has the wrong sign or b is very small, this can lead to bad + * results + * + * - the algorithm used to compute the solutions of the quadratic + * equation is not numerically stable (but saves one division compared + * to the numerically stable one); + * this can be a problem if a*c is much smaller than b*b + * + * - the above problems are worse if a is small (as inva becomes bigger) + */ + double det; + + if (a == 0) + { + double t; + + if (b == 0) + return 0; + + t = pixman_fixed_1 / 2 * c / b; + if (repeat == PIXMAN_REPEAT_NONE) + { + if (0 <= t && t <= pixman_fixed_1) + return _pixman_gradient_walker_pixel (walker, t); + } + else + { + if (t * dr > mindr) + return _pixman_gradient_walker_pixel (walker, t); + } + + return 0; + } + + det = fdot (b, a, 0, b, -c, 0); + if (det >= 0) + { + double sqrtdet, t0, t1; + + sqrtdet = sqrt (det); + t0 = (b + sqrtdet) * inva; + t1 = (b - sqrtdet) * inva; + + if (repeat == PIXMAN_REPEAT_NONE) + { + if (0 <= t0 && t0 <= pixman_fixed_1) + return _pixman_gradient_walker_pixel (walker, t0); + else if (0 <= t1 && t1 <= pixman_fixed_1) + return _pixman_gradient_walker_pixel (walker, t1); + } + else + { + if (t0 * dr > mindr) + return _pixman_gradient_walker_pixel (walker, t0); + else if (t1 * dr > mindr) + return _pixman_gradient_walker_pixel 
(walker, t1); + } + } + + return 0; +} + +static uint32_t * +radial_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask) +{ + /* + * Implementation of radial gradients following the PDF specification. + * See section 8.7.4.5.4 Type 3 (Radial) Shadings of the PDF Reference + * Manual (PDF 32000-1:2008 at the time of this writing). + * + * In the radial gradient problem we are given two circles (c₁,r₁) and + * (c₂,r₂) that define the gradient itself. + * + * Mathematically the gradient can be defined as the family of circles + * + * ((1-t)·c₁ + t·(c₂), (1-t)·r₁ + t·r₂) + * + * excluding those circles whose radius would be < 0. When a point + * belongs to more than one circle, the one with a bigger t is the only + * one that contributes to its color. When a point does not belong + * to any of the circles, it is transparent black, i.e. RGBA (0, 0, 0, 0). + * Further limitations on the range of values for t are imposed when + * the gradient is not repeated, namely t must belong to [0,1]. + * + * The graphical result is the same as drawing the valid (radius > 0) + * circles with increasing t in [-inf, +inf] (or in [0,1] if the gradient + * is not repeated) using SOURCE operatior composition. + * + * It looks like a cone pointing towards the viewer if the ending circle + * is smaller than the starting one, a cone pointing inside the page if + * the starting circle is the smaller one and like a cylinder if they + * have the same radius. + * + * What we actually do is, given the point whose color we are interested + * in, compute the t values for that point, solving for t in: + * + * length((1-t)·c₁ + t·(c₂) - p) = (1-t)·r₁ + t·r₂ + * + * Let's rewrite it in a simpler way, by defining some auxiliary + * variables: + * + * cd = c₂ - c₁ + * pd = p - c₁ + * dr = r₂ - r₁ + * lenght(t·cd - pd) = r₁ + t·dr + * + * which actually means + * + * hypot(t·cdx - pdx, t·cdy - pdy) = r₁ + t·dr + * + * or + * + * ⎷((t·cdx - pdx)² + (t·cdy - pdy)²) = r₁ + t·dr. 
+ * + * If we impose (as stated earlier) that r₁ + t·dr >= 0, it becomes: + * + * (t·cdx - pdx)² + (t·cdy - pdy)² = (r₁ + t·dr)² + * + * where we can actually expand the squares and solve for t: + * + * t²cdx² - 2t·cdx·pdx + pdx² + t²cdy² - 2t·cdy·pdy + pdy² = + * = r₁² + 2·r₁·t·dr + t²·dr² + * + * (cdx² + cdy² - dr²)t² - 2(cdx·pdx + cdy·pdy + r₁·dr)t + + * (pdx² + pdy² - r₁²) = 0 + * + * A = cdx² + cdy² - dr² + * B = pdx·cdx + pdy·cdy + r₁·dr + * C = pdx² + pdy² - r₁² + * At² - 2Bt + C = 0 + * + * The solutions (unless the equation degenerates because of A = 0) are: + * + * t = (B ± ⎷(B² - A·C)) / A + * + * The solution we are going to prefer is the bigger one, unless the + * radius associated to it is negative (or it falls outside the valid t + * range). + * + * Additional observations (useful for optimizations): + * A does not depend on p + * + * A < 0 <=> one of the two circles completely contains the other one + * <=> for every p, the radiuses associated with the two t solutions + * have opposite sign + */ + pixman_image_t *image = iter->image; + int x = iter->x; + int y = iter->y; + int width = iter->width; + uint32_t *buffer = iter->buffer; + + gradient_t *gradient = (gradient_t *)image; + radial_gradient_t *radial = (radial_gradient_t *)image; + uint32_t *end = buffer + width; + pixman_gradient_walker_t walker; + pixman_vector_t v, unit; + + /* reference point is the center of the pixel */ + v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2; + v.vector[2] = pixman_fixed_1; + + _pixman_gradient_walker_init (&walker, gradient, image->common.repeat); + + if (image->common.transform) + { + if (!pixman_transform_point_3d (image->common.transform, &v)) + return iter->buffer; + + unit.vector[0] = image->common.transform->matrix[0][0]; + unit.vector[1] = image->common.transform->matrix[1][0]; + unit.vector[2] = image->common.transform->matrix[2][0]; + } + else + { + unit.vector[0] = 
pixman_fixed_1; + unit.vector[1] = 0; + unit.vector[2] = 0; + } + + if (unit.vector[2] == 0 && v.vector[2] == pixman_fixed_1) + { + /* + * Given: + * + * t = (B ± ⎷(B² - A·C)) / A + * + * where + * + * A = cdx² + cdy² - dr² + * B = pdx·cdx + pdy·cdy + r₁·dr + * C = pdx² + pdy² - r₁² + * det = B² - A·C + * + * Since we have an affine transformation, we know that (pdx, pdy) + * increase linearly with each pixel, + * + * pdx = pdx₀ + n·ux, + * pdy = pdy₀ + n·uy, + * + * we can then express B, C and det through multiple differentiation. + */ + pixman_fixed_32_32_t b, db, c, dc, ddc; + + /* warning: this computation may overflow */ + v.vector[0] -= radial->c1.x; + v.vector[1] -= radial->c1.y; + + /* + * B and C are computed and updated exactly. + * If fdot was used instead of dot, in the worst case it would + * lose 11 bits of precision in each of the multiplication and + * summing up would zero out all the bit that were preserved, + * thus making the result 0 instead of the correct one. + * This would mean a worst case of unbound relative error or + * about 2^10 absolute error + */ + b = dot (v.vector[0], v.vector[1], radial->c1.radius, + radial->delta.x, radial->delta.y, radial->delta.radius); + db = dot (unit.vector[0], unit.vector[1], 0, + radial->delta.x, radial->delta.y, 0); + + c = dot (v.vector[0], v.vector[1], + -((pixman_fixed_48_16_t) radial->c1.radius), + v.vector[0], v.vector[1], radial->c1.radius); + dc = dot (2 * (pixman_fixed_48_16_t) v.vector[0] + unit.vector[0], + 2 * (pixman_fixed_48_16_t) v.vector[1] + unit.vector[1], + 0, + unit.vector[0], unit.vector[1], 0); + ddc = 2 * dot (unit.vector[0], unit.vector[1], 0, + unit.vector[0], unit.vector[1], 0); + + while (buffer < end) + { + if (!mask || *mask++) + { + *buffer = radial_compute_color (radial->a, b, c, + radial->inva, + radial->delta.radius, + radial->mindr, + &walker, + image->common.repeat); + } + + b += db; + c += dc; + dc += ddc; + ++buffer; + } + } + else + { + /* projective */ + /* Warning: + 
* error propagation guarantees are much looser than in the affine case + */ + while (buffer < end) + { + if (!mask || *mask++) + { + if (v.vector[2] != 0) + { + double pdx, pdy, invv2, b, c; + + invv2 = 1. * pixman_fixed_1 / v.vector[2]; + + pdx = v.vector[0] * invv2 - radial->c1.x; + /* / pixman_fixed_1 */ + + pdy = v.vector[1] * invv2 - radial->c1.y; + /* / pixman_fixed_1 */ + + b = fdot (pdx, pdy, radial->c1.radius, + radial->delta.x, radial->delta.y, + radial->delta.radius); + /* / pixman_fixed_1 / pixman_fixed_1 */ + + c = fdot (pdx, pdy, -radial->c1.radius, + pdx, pdy, radial->c1.radius); + /* / pixman_fixed_1 / pixman_fixed_1 */ + + *buffer = radial_compute_color (radial->a, b, c, + radial->inva, + radial->delta.radius, + radial->mindr, + &walker, + image->common.repeat); + } + else + { + *buffer = 0; + } + } + + ++buffer; + + v.vector[0] += unit.vector[0]; + v.vector[1] += unit.vector[1]; + v.vector[2] += unit.vector[2]; + } + } + + iter->y++; + return iter->buffer; +} + +static uint32_t * +radial_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask) +{ + uint32_t *buffer = radial_get_scanline_narrow (iter, NULL); + + pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width); + + return buffer; +} + +void +_pixman_radial_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter) +{ + if (iter->flags & ITER_NARROW) + iter->get_scanline = radial_get_scanline_narrow; + else + iter->get_scanline = radial_get_scanline_wide; +} + +PIXMAN_EXPORT pixman_image_t * +pixman_image_create_radial_gradient (pixman_point_fixed_t * inner, + pixman_point_fixed_t * outer, + pixman_fixed_t inner_radius, + pixman_fixed_t outer_radius, + const pixman_gradient_stop_t *stops, + int n_stops) +{ + pixman_image_t *image; + radial_gradient_t *radial; + + image = _pixman_image_allocate (); + + if (!image) + return NULL; + + radial = &image->radial; + + if (!_pixman_init_gradient (&radial->common, stops, n_stops)) + { + free (image); + return NULL; + } + + 
image->type = RADIAL; + + radial->c1.x = inner->x; + radial->c1.y = inner->y; + radial->c1.radius = inner_radius; + radial->c2.x = outer->x; + radial->c2.y = outer->y; + radial->c2.radius = outer_radius; + + /* warning: this computations may overflow */ + radial->delta.x = radial->c2.x - radial->c1.x; + radial->delta.y = radial->c2.y - radial->c1.y; + radial->delta.radius = radial->c2.radius - radial->c1.radius; + + /* computed exactly, then cast to double -> every bit of the double + representation is correct (53 bits) */ + radial->a = dot (radial->delta.x, radial->delta.y, -radial->delta.radius, + radial->delta.x, radial->delta.y, radial->delta.radius); + if (radial->a != 0) + radial->inva = 1. * pixman_fixed_1 / radial->a; + + radial->mindr = -1. * pixman_fixed_1 * radial->c1.radius; + + return image; +} + diff --git a/pixman/pixman/pixman-solid-fill.c b/pixman/pixman/pixman-solid-fill.c index 67681f2c0..fcda3abb5 100644 --- a/pixman/pixman/pixman-solid-fill.c +++ b/pixman/pixman/pixman-solid-fill.c @@ -1,92 +1,89 @@ -/* - * Copyright © 2000 SuSE, Inc. - * Copyright © 2007, 2009 Red Hat, Inc. - * Copyright © 2009 Soren Sandmann - * - * Permission to use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that - * copyright notice and this permission notice appear in supporting - * documentation, and that the name of SuSE not be used in advertising or - * publicity pertaining to distribution of the software without specific, - * written prior permission. SuSE makes no representations about the - * suitability of this software for any purpose. It is provided "as is" - * without express or implied warranty. 
- * - * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE - * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#ifdef HAVE_CONFIG_H -#include -#endif -#include "pixman-private.h" - -void -_pixman_solid_fill_iter_init (pixman_image_t *image, - pixman_iter_t *iter, - int x, int y, int width, int height, - uint8_t *buffer, iter_flags_t flags) -{ - if (flags & ITER_NARROW) - { - uint32_t *b = (uint32_t *)buffer; - uint32_t *e = b + width; - uint32_t color = image->solid.color_32; - - while (b < e) - *(b++) = color; - } - else - { - uint64_t *b = (uint64_t *)buffer; - uint64_t *e = b + width; - uint64_t color = image->solid.color_64; - - while (b < e) - *(b++) = color; - } - - iter->get_scanline = _pixman_iter_get_scanline_noop; -} - -static uint32_t -color_to_uint32 (const pixman_color_t *color) -{ - return - (color->alpha >> 8 << 24) | - (color->red >> 8 << 16) | - (color->green & 0xff00) | - (color->blue >> 8); -} - -static uint64_t -color_to_uint64 (const pixman_color_t *color) -{ - return - ((uint64_t)color->alpha << 48) | - ((uint64_t)color->red << 32) | - ((uint64_t)color->green << 16) | - ((uint64_t)color->blue); -} - -PIXMAN_EXPORT pixman_image_t * -pixman_image_create_solid_fill (pixman_color_t *color) -{ - pixman_image_t *img = _pixman_image_allocate (); - - if (!img) - return NULL; - - img->type = SOLID; - img->solid.color = *color; - img->solid.color_32 = color_to_uint32 (color); - img->solid.color_64 = color_to_uint64 (color); - - return img; -} - +/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007, 2009 Red Hat, Inc. 
+ * Copyright © 2009 Soren Sandmann + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of SuSE not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. SuSE makes no representations about the + * suitability of this software for any purpose. It is provided "as is" + * without express or implied warranty. + * + * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE + * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include +#endif +#include "pixman-private.h" + +void +_pixman_solid_fill_iter_init (pixman_image_t *image, pixman_iter_t *iter) +{ + if (iter->flags & ITER_NARROW) + { + uint32_t *b = (uint32_t *)iter->buffer; + uint32_t *e = b + iter->width; + uint32_t color = iter->image->solid.color_32; + + while (b < e) + *(b++) = color; + } + else + { + uint64_t *b = (uint64_t *)iter->buffer; + uint64_t *e = b + iter->width; + uint64_t color = image->solid.color_64; + + while (b < e) + *(b++) = color; + } + + iter->get_scanline = _pixman_iter_get_scanline_noop; +} + +static uint32_t +color_to_uint32 (const pixman_color_t *color) +{ + return + (color->alpha >> 8 << 24) | + (color->red >> 8 << 16) | + (color->green & 0xff00) | + (color->blue >> 8); +} + +static uint64_t +color_to_uint64 (const pixman_color_t *color) +{ + return + ((uint64_t)color->alpha << 48) | + ((uint64_t)color->red << 32) | + ((uint64_t)color->green << 16) | + ((uint64_t)color->blue); +} + +PIXMAN_EXPORT pixman_image_t * +pixman_image_create_solid_fill (pixman_color_t *color) +{ + pixman_image_t *img = _pixman_image_allocate (); + + if (!img) + return NULL; + + img->type = SOLID; + img->solid.color = *color; + img->solid.color_32 = color_to_uint32 (color); + img->solid.color_64 = color_to_uint64 (color); + + return img; +} + diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c index 696005f75..a52a959f5 100644 --- a/pixman/pixman/pixman-sse2.c +++ b/pixman/pixman/pixman-sse2.c @@ -1,6077 +1,6076 @@ -/* - * Copyright © 2008 Rodrigo Kumpera - * Copyright © 2008 André Tupinambá - * - * Permission to use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that - * copyright notice and this permission notice appear in supporting - * documentation, and that the name of Red Hat not be used in advertising or - * 
publicity pertaining to distribution of the software without specific, - * written prior permission. Red Hat makes no representations about the - * suitability of this software for any purpose. It is provided "as is" - * without express or implied warranty. - * - * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS - * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY - * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN - * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING - * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS - * SOFTWARE. - * - * Author: Rodrigo Kumpera (kumpera@gmail.com) - * André Tupinambá (andrelrt@gmail.com) - * - * Based on work by Owen Taylor and Søren Sandmann - */ -#ifdef HAVE_CONFIG_H -#include -#endif - -#include /* for _mm_shuffle_pi16 and _MM_SHUFFLE */ -#include /* for SSE2 intrinsics */ -#include "pixman-private.h" -#include "pixman-combine32.h" -#include "pixman-fast-path.h" - -static __m128i mask_0080; -static __m128i mask_00ff; -static __m128i mask_0101; -static __m128i mask_ffff; -static __m128i mask_ff000000; -static __m128i mask_alpha; - -static __m128i mask_565_r; -static __m128i mask_565_g1, mask_565_g2; -static __m128i mask_565_b; -static __m128i mask_red; -static __m128i mask_green; -static __m128i mask_blue; - -static __m128i mask_565_fix_rb; -static __m128i mask_565_fix_g; - -static force_inline __m128i -unpack_32_1x128 (uint32_t data) -{ - return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ()); -} - -static force_inline void -unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi) -{ - *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ()); - *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ()); -} - -static force_inline __m128i -unpack_565_to_8888 (__m128i lo) -{ - __m128i 
r, g, b, rb, t; - - r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red); - g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green); - b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue); - - rb = _mm_or_si128 (r, b); - t = _mm_and_si128 (rb, mask_565_fix_rb); - t = _mm_srli_epi32 (t, 5); - rb = _mm_or_si128 (rb, t); - - t = _mm_and_si128 (g, mask_565_fix_g); - t = _mm_srli_epi32 (t, 6); - g = _mm_or_si128 (g, t); - - return _mm_or_si128 (rb, g); -} - -static force_inline void -unpack_565_128_4x128 (__m128i data, - __m128i* data0, - __m128i* data1, - __m128i* data2, - __m128i* data3) -{ - __m128i lo, hi; - - lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ()); - hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ()); - - lo = unpack_565_to_8888 (lo); - hi = unpack_565_to_8888 (hi); - - unpack_128_2x128 (lo, data0, data1); - unpack_128_2x128 (hi, data2, data3); -} - -static force_inline uint16_t -pack_565_32_16 (uint32_t pixel) -{ - return (uint16_t) (((pixel >> 8) & 0xf800) | - ((pixel >> 5) & 0x07e0) | - ((pixel >> 3) & 0x001f)); -} - -static force_inline __m128i -pack_2x128_128 (__m128i lo, __m128i hi) -{ - return _mm_packus_epi16 (lo, hi); -} - -static force_inline __m128i -pack_565_2x128_128 (__m128i lo, __m128i hi) -{ - __m128i data; - __m128i r, g1, g2, b; - - data = pack_2x128_128 (lo, hi); - - r = _mm_and_si128 (data, mask_565_r); - g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1); - g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2); - b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b); - - return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b); -} - -static force_inline __m128i -pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3) -{ - return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1), - pack_565_2x128_128 (*xmm2, *xmm3)); -} - -static force_inline int -is_opaque (__m128i x) -{ - __m128i ffs = _mm_cmpeq_epi8 (x, x); - - return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 
0x8888; -} - -static force_inline int -is_zero (__m128i x) -{ - return _mm_movemask_epi8 ( - _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff; -} - -static force_inline int -is_transparent (__m128i x) -{ - return (_mm_movemask_epi8 ( - _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888; -} - -static force_inline __m128i -expand_pixel_32_1x128 (uint32_t data) -{ - return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0)); -} - -static force_inline __m128i -expand_alpha_1x128 (__m128i data) -{ - return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, - _MM_SHUFFLE (3, 3, 3, 3)), - _MM_SHUFFLE (3, 3, 3, 3)); -} - -static force_inline void -expand_alpha_2x128 (__m128i data_lo, - __m128i data_hi, - __m128i* alpha_lo, - __m128i* alpha_hi) -{ - __m128i lo, hi; - - lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3)); - hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3)); - - *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3)); - *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3)); -} - -static force_inline void -expand_alpha_rev_2x128 (__m128i data_lo, - __m128i data_hi, - __m128i* alpha_lo, - __m128i* alpha_hi) -{ - __m128i lo, hi; - - lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0)); - hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0)); - *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0)); - *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0)); -} - -static force_inline void -pix_multiply_2x128 (__m128i* data_lo, - __m128i* data_hi, - __m128i* alpha_lo, - __m128i* alpha_hi, - __m128i* ret_lo, - __m128i* ret_hi) -{ - __m128i lo, hi; - - lo = _mm_mullo_epi16 (*data_lo, *alpha_lo); - hi = _mm_mullo_epi16 (*data_hi, *alpha_hi); - lo = _mm_adds_epu16 (lo, mask_0080); - hi = _mm_adds_epu16 (hi, mask_0080); - *ret_lo = _mm_mulhi_epu16 (lo, mask_0101); - *ret_hi = _mm_mulhi_epu16 (hi, mask_0101); -} - -static force_inline void -pix_add_multiply_2x128 (__m128i* 
src_lo, - __m128i* src_hi, - __m128i* alpha_dst_lo, - __m128i* alpha_dst_hi, - __m128i* dst_lo, - __m128i* dst_hi, - __m128i* alpha_src_lo, - __m128i* alpha_src_hi, - __m128i* ret_lo, - __m128i* ret_hi) -{ - __m128i t1_lo, t1_hi; - __m128i t2_lo, t2_hi; - - pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi); - pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi); - - *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo); - *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi); -} - -static force_inline void -negate_2x128 (__m128i data_lo, - __m128i data_hi, - __m128i* neg_lo, - __m128i* neg_hi) -{ - *neg_lo = _mm_xor_si128 (data_lo, mask_00ff); - *neg_hi = _mm_xor_si128 (data_hi, mask_00ff); -} - -static force_inline void -invert_colors_2x128 (__m128i data_lo, - __m128i data_hi, - __m128i* inv_lo, - __m128i* inv_hi) -{ - __m128i lo, hi; - - lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2)); - hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2)); - *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2)); - *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2)); -} - -static force_inline void -over_2x128 (__m128i* src_lo, - __m128i* src_hi, - __m128i* alpha_lo, - __m128i* alpha_hi, - __m128i* dst_lo, - __m128i* dst_hi) -{ - __m128i t1, t2; - - negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2); - - pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi); - - *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo); - *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi); -} - -static force_inline void -over_rev_non_pre_2x128 (__m128i src_lo, - __m128i src_hi, - __m128i* dst_lo, - __m128i* dst_hi) -{ - __m128i lo, hi; - __m128i alpha_lo, alpha_hi; - - expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi); - - lo = _mm_or_si128 (alpha_lo, mask_alpha); - hi = _mm_or_si128 (alpha_hi, mask_alpha); - - invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi); - - pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi); - - over_2x128 
(&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi); -} - -static force_inline void -in_over_2x128 (__m128i* src_lo, - __m128i* src_hi, - __m128i* alpha_lo, - __m128i* alpha_hi, - __m128i* mask_lo, - __m128i* mask_hi, - __m128i* dst_lo, - __m128i* dst_hi) -{ - __m128i s_lo, s_hi; - __m128i a_lo, a_hi; - - pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi); - pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi); - - over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi); -} - -/* load 4 pixels from a 16-byte boundary aligned address */ -static force_inline __m128i -load_128_aligned (__m128i* src) -{ - return _mm_load_si128 (src); -} - -/* load 4 pixels from a unaligned address */ -static force_inline __m128i -load_128_unaligned (const __m128i* src) -{ - return _mm_loadu_si128 (src); -} - -/* save 4 pixels using Write Combining memory on a 16-byte - * boundary aligned address - */ -static force_inline void -save_128_write_combining (__m128i* dst, - __m128i data) -{ - _mm_stream_si128 (dst, data); -} - -/* save 4 pixels on a 16-byte boundary aligned address */ -static force_inline void -save_128_aligned (__m128i* dst, - __m128i data) -{ - _mm_store_si128 (dst, data); -} - -/* save 4 pixels on a unaligned address */ -static force_inline void -save_128_unaligned (__m128i* dst, - __m128i data) -{ - _mm_storeu_si128 (dst, data); -} - -static force_inline __m128i -load_32_1x128 (uint32_t data) -{ - return _mm_cvtsi32_si128 (data); -} - -static force_inline __m128i -expand_alpha_rev_1x128 (__m128i data) -{ - return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0)); -} - -static force_inline __m128i -expand_pixel_8_1x128 (uint8_t data) -{ - return _mm_shufflelo_epi16 ( - unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0)); -} - -static force_inline __m128i -pix_multiply_1x128 (__m128i data, - __m128i alpha) -{ - return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha), - mask_0080), - mask_0101); -} - -static 
force_inline __m128i -pix_add_multiply_1x128 (__m128i* src, - __m128i* alpha_dst, - __m128i* dst, - __m128i* alpha_src) -{ - __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst); - __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src); - - return _mm_adds_epu8 (t1, t2); -} - -static force_inline __m128i -negate_1x128 (__m128i data) -{ - return _mm_xor_si128 (data, mask_00ff); -} - -static force_inline __m128i -invert_colors_1x128 (__m128i data) -{ - return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2)); -} - -static force_inline __m128i -over_1x128 (__m128i src, __m128i alpha, __m128i dst) -{ - return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha))); -} - -static force_inline __m128i -in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst) -{ - return over_1x128 (pix_multiply_1x128 (*src, *mask), - pix_multiply_1x128 (*alpha, *mask), - *dst); -} - -static force_inline __m128i -over_rev_non_pre_1x128 (__m128i src, __m128i dst) -{ - __m128i alpha = expand_alpha_1x128 (src); - - return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src), - _mm_or_si128 (alpha, mask_alpha)), - alpha, - dst); -} - -static force_inline uint32_t -pack_1x128_32 (__m128i data) -{ - return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ())); -} - -static force_inline __m128i -expand565_16_1x128 (uint16_t pixel) -{ - __m128i m = _mm_cvtsi32_si128 (pixel); - - m = unpack_565_to_8888 (m); - - return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ()); -} - -static force_inline uint32_t -core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst) -{ - uint8_t a; - __m128i xmms; - - a = src >> 24; - - if (a == 0xff) - { - return src; - } - else if (src) - { - xmms = unpack_32_1x128 (src); - return pack_1x128_32 ( - over_1x128 (xmms, expand_alpha_1x128 (xmms), - unpack_32_1x128 (dst))); - } - - return dst; -} - -static force_inline uint32_t -combine1 (const uint32_t *ps, const uint32_t *pm) -{ - uint32_t s = *ps; - - if (pm) - { - __m128i ms, mm; - 
- mm = unpack_32_1x128 (*pm); - mm = expand_alpha_1x128 (mm); - - ms = unpack_32_1x128 (s); - ms = pix_multiply_1x128 (ms, mm); - - s = pack_1x128_32 (ms); - } - - return s; -} - -static force_inline __m128i -combine4 (const __m128i *ps, const __m128i *pm) -{ - __m128i xmm_src_lo, xmm_src_hi; - __m128i xmm_msk_lo, xmm_msk_hi; - __m128i s; - - if (pm) - { - xmm_msk_lo = load_128_unaligned (pm); - - if (is_transparent (xmm_msk_lo)) - return _mm_setzero_si128 (); - } - - s = load_128_unaligned (ps); - - if (pm) - { - unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi); - - expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi); - - pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, - &xmm_msk_lo, &xmm_msk_hi, - &xmm_src_lo, &xmm_src_hi); - - s = pack_2x128_128 (xmm_src_lo, xmm_src_hi); - } - - return s; -} - -static force_inline void -core_combine_over_u_sse2_mask (uint32_t * pd, - const uint32_t* ps, - const uint32_t* pm, - int w) -{ - uint32_t s, d; - - /* Align dst on a 16-byte boundary */ - while (w && ((unsigned long)pd & 15)) - { - d = *pd; - s = combine1 (ps, pm); - - if (s) - *pd = core_combine_over_u_pixel_sse2 (s, d); - pd++; - ps++; - pm++; - w--; - } - - while (w >= 4) - { - __m128i mask = load_128_unaligned ((__m128i *)pm); - - if (!is_zero (mask)) - { - __m128i src; - __m128i src_hi, src_lo; - __m128i mask_hi, mask_lo; - __m128i alpha_hi, alpha_lo; - - src = load_128_unaligned ((__m128i *)ps); - - if (is_opaque (_mm_and_si128 (src, mask))) - { - save_128_aligned ((__m128i *)pd, src); - } - else - { - __m128i dst = load_128_aligned ((__m128i *)pd); - __m128i dst_hi, dst_lo; - - unpack_128_2x128 (mask, &mask_lo, &mask_hi); - unpack_128_2x128 (src, &src_lo, &src_hi); - - expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi); - pix_multiply_2x128 (&src_lo, &src_hi, - &mask_lo, &mask_hi, - &src_lo, &src_hi); - - unpack_128_2x128 (dst, &dst_lo, &dst_hi); - - expand_alpha_2x128 (src_lo, src_hi, 
- &alpha_lo, &alpha_hi); - - over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi, - &dst_lo, &dst_hi); - - save_128_aligned ( - (__m128i *)pd, - pack_2x128_128 (dst_lo, dst_hi)); - } - } - - pm += 4; - ps += 4; - pd += 4; - w -= 4; - } - while (w) - { - d = *pd; - s = combine1 (ps, pm); - - if (s) - *pd = core_combine_over_u_pixel_sse2 (s, d); - pd++; - ps++; - pm++; - - w--; - } -} - -static force_inline void -core_combine_over_u_sse2_no_mask (uint32_t * pd, - const uint32_t* ps, - int w) -{ - uint32_t s, d; - - /* Align dst on a 16-byte boundary */ - while (w && ((unsigned long)pd & 15)) - { - d = *pd; - s = *ps; - - if (s) - *pd = core_combine_over_u_pixel_sse2 (s, d); - pd++; - ps++; - w--; - } - - while (w >= 4) - { - __m128i src; - __m128i src_hi, src_lo, dst_hi, dst_lo; - __m128i alpha_hi, alpha_lo; - - src = load_128_unaligned ((__m128i *)ps); - - if (!is_zero (src)) - { - if (is_opaque (src)) - { - save_128_aligned ((__m128i *)pd, src); - } - else - { - __m128i dst = load_128_aligned ((__m128i *)pd); - - unpack_128_2x128 (src, &src_lo, &src_hi); - unpack_128_2x128 (dst, &dst_lo, &dst_hi); - - expand_alpha_2x128 (src_lo, src_hi, - &alpha_lo, &alpha_hi); - over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi, - &dst_lo, &dst_hi); - - save_128_aligned ( - (__m128i *)pd, - pack_2x128_128 (dst_lo, dst_hi)); - } - } - - ps += 4; - pd += 4; - w -= 4; - } - while (w) - { - d = *pd; - s = *ps; - - if (s) - *pd = core_combine_over_u_pixel_sse2 (s, d); - pd++; - ps++; - - w--; - } -} - -static force_inline void -sse2_combine_over_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * pd, - const uint32_t * ps, - const uint32_t * pm, - int w) -{ - if (pm) - core_combine_over_u_sse2_mask (pd, ps, pm, w); - else - core_combine_over_u_sse2_no_mask (pd, ps, w); -} - -static void -sse2_combine_over_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * pd, - const uint32_t * ps, - const uint32_t * pm, - int w) -{ - uint32_t s, d; - - __m128i 
xmm_dst_lo, xmm_dst_hi; - __m128i xmm_src_lo, xmm_src_hi; - __m128i xmm_alpha_lo, xmm_alpha_hi; - - /* Align dst on a 16-byte boundary */ - while (w && - ((unsigned long)pd & 15)) - { - d = *pd; - s = combine1 (ps, pm); - - *pd++ = core_combine_over_u_pixel_sse2 (d, s); - w--; - ps++; - if (pm) - pm++; - } - - while (w >= 4) - { - /* I'm loading unaligned because I'm not sure - * about the address alignment. - */ - xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*) pd); - - unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - - expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, - &xmm_alpha_lo, &xmm_alpha_hi); - - over_2x128 (&xmm_dst_lo, &xmm_dst_hi, - &xmm_alpha_lo, &xmm_alpha_hi, - &xmm_src_lo, &xmm_src_hi); - - /* rebuid the 4 pixel data and save*/ - save_128_aligned ((__m128i*)pd, - pack_2x128_128 (xmm_src_lo, xmm_src_hi)); - - w -= 4; - ps += 4; - pd += 4; - - if (pm) - pm += 4; - } - - while (w) - { - d = *pd; - s = combine1 (ps, pm); - - *pd++ = core_combine_over_u_pixel_sse2 (d, s); - ps++; - w--; - if (pm) - pm++; - } -} - -static force_inline uint32_t -core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst) -{ - uint32_t maska = src >> 24; - - if (maska == 0) - { - return 0; - } - else if (maska != 0xff) - { - return pack_1x128_32 ( - pix_multiply_1x128 (unpack_32_1x128 (dst), - expand_alpha_1x128 (unpack_32_1x128 (src)))); - } - - return dst; -} - -static void -sse2_combine_in_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * pd, - const uint32_t * ps, - const uint32_t * pm, - int w) -{ - uint32_t s, d; - - __m128i xmm_src_lo, xmm_src_hi; - __m128i xmm_dst_lo, xmm_dst_hi; - - while (w && ((unsigned long) pd & 15)) - { - s = combine1 (ps, pm); - d = *pd; - - *pd++ = core_combine_in_u_pixel_sse2 (d, s); - w--; - ps++; - if (pm) - pm++; - } - - while (w >= 4) - { - xmm_dst_hi = load_128_aligned ((__m128i*) pd); - xmm_src_hi = combine4 
((__m128i*) ps, (__m128i*) pm); - - unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - - unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); - pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, - &xmm_dst_lo, &xmm_dst_hi, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ((__m128i*)pd, - pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - - ps += 4; - pd += 4; - w -= 4; - if (pm) - pm += 4; - } - - while (w) - { - s = combine1 (ps, pm); - d = *pd; - - *pd++ = core_combine_in_u_pixel_sse2 (d, s); - w--; - ps++; - if (pm) - pm++; - } -} - -static void -sse2_combine_in_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * pd, - const uint32_t * ps, - const uint32_t * pm, - int w) -{ - uint32_t s, d; - - __m128i xmm_src_lo, xmm_src_hi; - __m128i xmm_dst_lo, xmm_dst_hi; - - while (w && ((unsigned long) pd & 15)) - { - s = combine1 (ps, pm); - d = *pd; - - *pd++ = core_combine_in_u_pixel_sse2 (s, d); - ps++; - w--; - if (pm) - pm++; - } - - while (w >= 4) - { - xmm_dst_hi = load_128_aligned ((__m128i*) pd); - xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); - - unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); - expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); - - unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, - &xmm_src_lo, &xmm_src_hi, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ( - (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - - ps += 4; - pd += 4; - w -= 4; - if (pm) - pm += 4; - } - - while (w) - { - s = combine1 (ps, pm); - d = *pd; - - *pd++ = core_combine_in_u_pixel_sse2 (s, d); - w--; - ps++; - if (pm) - pm++; - } -} - -static void -sse2_combine_out_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * pd, - const uint32_t * ps, - const uint32_t * pm, - int w) -{ - while (w && ((unsigned long) pd & 15)) - { - uint32_t s = combine1 (ps, pm); - 
uint32_t d = *pd; - - *pd++ = pack_1x128_32 ( - pix_multiply_1x128 ( - unpack_32_1x128 (d), negate_1x128 ( - expand_alpha_1x128 (unpack_32_1x128 (s))))); - - if (pm) - pm++; - ps++; - w--; - } - - while (w >= 4) - { - __m128i xmm_src_lo, xmm_src_hi; - __m128i xmm_dst_lo, xmm_dst_hi; - - xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*) pd); - - unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - - expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); - negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); - - pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, - &xmm_src_lo, &xmm_src_hi, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ( - (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - - ps += 4; - pd += 4; - if (pm) - pm += 4; - - w -= 4; - } - - while (w) - { - uint32_t s = combine1 (ps, pm); - uint32_t d = *pd; - - *pd++ = pack_1x128_32 ( - pix_multiply_1x128 ( - unpack_32_1x128 (d), negate_1x128 ( - expand_alpha_1x128 (unpack_32_1x128 (s))))); - ps++; - if (pm) - pm++; - w--; - } -} - -static void -sse2_combine_out_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * pd, - const uint32_t * ps, - const uint32_t * pm, - int w) -{ - while (w && ((unsigned long) pd & 15)) - { - uint32_t s = combine1 (ps, pm); - uint32_t d = *pd; - - *pd++ = pack_1x128_32 ( - pix_multiply_1x128 ( - unpack_32_1x128 (s), negate_1x128 ( - expand_alpha_1x128 (unpack_32_1x128 (d))))); - w--; - ps++; - if (pm) - pm++; - } - - while (w >= 4) - { - __m128i xmm_src_lo, xmm_src_hi; - __m128i xmm_dst_lo, xmm_dst_hi; - - xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*) pd); - - unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - - expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - negate_2x128 (xmm_dst_lo, 
xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - - pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, - &xmm_dst_lo, &xmm_dst_hi, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ( - (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - - ps += 4; - pd += 4; - w -= 4; - if (pm) - pm += 4; - } - - while (w) - { - uint32_t s = combine1 (ps, pm); - uint32_t d = *pd; - - *pd++ = pack_1x128_32 ( - pix_multiply_1x128 ( - unpack_32_1x128 (s), negate_1x128 ( - expand_alpha_1x128 (unpack_32_1x128 (d))))); - w--; - ps++; - if (pm) - pm++; - } -} - -static force_inline uint32_t -core_combine_atop_u_pixel_sse2 (uint32_t src, - uint32_t dst) -{ - __m128i s = unpack_32_1x128 (src); - __m128i d = unpack_32_1x128 (dst); - - __m128i sa = negate_1x128 (expand_alpha_1x128 (s)); - __m128i da = expand_alpha_1x128 (d); - - return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa)); -} - -static void -sse2_combine_atop_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * pd, - const uint32_t * ps, - const uint32_t * pm, - int w) -{ - uint32_t s, d; - - __m128i xmm_src_lo, xmm_src_hi; - __m128i xmm_dst_lo, xmm_dst_hi; - __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; - __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; - - while (w && ((unsigned long) pd & 15)) - { - s = combine1 (ps, pm); - d = *pd; - - *pd++ = core_combine_atop_u_pixel_sse2 (s, d); - w--; - ps++; - if (pm) - pm++; - } - - while (w >= 4) - { - xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*) pd); - - unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - - expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, - &xmm_alpha_src_lo, &xmm_alpha_src_hi); - expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, - &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); - - negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, - &xmm_alpha_src_lo, &xmm_alpha_src_hi); - - pix_add_multiply_2x128 ( - &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, - 
&xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ( - (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - - ps += 4; - pd += 4; - w -= 4; - if (pm) - pm += 4; - } - - while (w) - { - s = combine1 (ps, pm); - d = *pd; - - *pd++ = core_combine_atop_u_pixel_sse2 (s, d); - w--; - ps++; - if (pm) - pm++; - } -} - -static force_inline uint32_t -core_combine_reverse_atop_u_pixel_sse2 (uint32_t src, - uint32_t dst) -{ - __m128i s = unpack_32_1x128 (src); - __m128i d = unpack_32_1x128 (dst); - - __m128i sa = expand_alpha_1x128 (s); - __m128i da = negate_1x128 (expand_alpha_1x128 (d)); - - return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa)); -} - -static void -sse2_combine_atop_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * pd, - const uint32_t * ps, - const uint32_t * pm, - int w) -{ - uint32_t s, d; - - __m128i xmm_src_lo, xmm_src_hi; - __m128i xmm_dst_lo, xmm_dst_hi; - __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; - __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; - - while (w && ((unsigned long) pd & 15)) - { - s = combine1 (ps, pm); - d = *pd; - - *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d); - ps++; - w--; - if (pm) - pm++; - } - - while (w >= 4) - { - xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*) pd); - - unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - - expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, - &xmm_alpha_src_lo, &xmm_alpha_src_hi); - expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, - &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); - - negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, - &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); - - pix_add_multiply_2x128 ( - &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, - &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ( - (__m128i*)pd, 
pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - - ps += 4; - pd += 4; - w -= 4; - if (pm) - pm += 4; - } - - while (w) - { - s = combine1 (ps, pm); - d = *pd; - - *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d); - ps++; - w--; - if (pm) - pm++; - } -} - -static force_inline uint32_t -core_combine_xor_u_pixel_sse2 (uint32_t src, - uint32_t dst) -{ - __m128i s = unpack_32_1x128 (src); - __m128i d = unpack_32_1x128 (dst); - - __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d)); - __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s)); - - return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s)); -} - -static void -sse2_combine_xor_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - int w = width; - uint32_t s, d; - uint32_t* pd = dst; - const uint32_t* ps = src; - const uint32_t* pm = mask; - - __m128i xmm_src, xmm_src_lo, xmm_src_hi; - __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; - __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; - __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; - - while (w && ((unsigned long) pd & 15)) - { - s = combine1 (ps, pm); - d = *pd; - - *pd++ = core_combine_xor_u_pixel_sse2 (s, d); - w--; - ps++; - if (pm) - pm++; - } - - while (w >= 4) - { - xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm); - xmm_dst = load_128_aligned ((__m128i*) pd); - - unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); - - expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, - &xmm_alpha_src_lo, &xmm_alpha_src_hi); - expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, - &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); - - negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, - &xmm_alpha_src_lo, &xmm_alpha_src_hi); - negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, - &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); - - pix_add_multiply_2x128 ( - &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, - &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, 
&xmm_alpha_src_hi, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ( - (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - - ps += 4; - pd += 4; - w -= 4; - if (pm) - pm += 4; - } - - while (w) - { - s = combine1 (ps, pm); - d = *pd; - - *pd++ = core_combine_xor_u_pixel_sse2 (s, d); - w--; - ps++; - if (pm) - pm++; - } -} - -static force_inline void -sse2_combine_add_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - int w = width; - uint32_t s, d; - uint32_t* pd = dst; - const uint32_t* ps = src; - const uint32_t* pm = mask; - - while (w && (unsigned long)pd & 15) - { - s = combine1 (ps, pm); - d = *pd; - - ps++; - if (pm) - pm++; - *pd++ = _mm_cvtsi128_si32 ( - _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d))); - w--; - } - - while (w >= 4) - { - __m128i s; - - s = combine4 ((__m128i*)ps, (__m128i*)pm); - - save_128_aligned ( - (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd))); - - pd += 4; - ps += 4; - if (pm) - pm += 4; - w -= 4; - } - - while (w--) - { - s = combine1 (ps, pm); - d = *pd; - - ps++; - *pd++ = _mm_cvtsi128_si32 ( - _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d))); - if (pm) - pm++; - } -} - -static force_inline uint32_t -core_combine_saturate_u_pixel_sse2 (uint32_t src, - uint32_t dst) -{ - __m128i ms = unpack_32_1x128 (src); - __m128i md = unpack_32_1x128 (dst); - uint32_t sa = src >> 24; - uint32_t da = ~dst >> 24; - - if (sa > da) - { - ms = pix_multiply_1x128 ( - ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24))); - } - - return pack_1x128_32 (_mm_adds_epu16 (md, ms)); -} - -static void -sse2_combine_saturate_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * pd, - const uint32_t * ps, - const uint32_t * pm, - int w) -{ - uint32_t s, d; - - uint32_t pack_cmp; - __m128i xmm_src, xmm_dst; - - while (w && (unsigned long)pd & 15) - { - s = combine1 (ps, pm); - d = *pd; - - *pd++ = 
core_combine_saturate_u_pixel_sse2 (s, d); - w--; - ps++; - if (pm) - pm++; - } - - while (w >= 4) - { - xmm_dst = load_128_aligned ((__m128i*)pd); - xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm); - - pack_cmp = _mm_movemask_epi8 ( - _mm_cmpgt_epi32 ( - _mm_srli_epi32 (xmm_src, 24), - _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24))); - - /* if some alpha src is grater than respective ~alpha dst */ - if (pack_cmp) - { - s = combine1 (ps++, pm); - d = *pd; - *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); - if (pm) - pm++; - - s = combine1 (ps++, pm); - d = *pd; - *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); - if (pm) - pm++; - - s = combine1 (ps++, pm); - d = *pd; - *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); - if (pm) - pm++; - - s = combine1 (ps++, pm); - d = *pd; - *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); - if (pm) - pm++; - } - else - { - save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src)); - - pd += 4; - ps += 4; - if (pm) - pm += 4; - } - - w -= 4; - } - - while (w--) - { - s = combine1 (ps, pm); - d = *pd; - - *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); - ps++; - if (pm) - pm++; - } -} - -static void -sse2_combine_src_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * pd, - const uint32_t * ps, - const uint32_t * pm, - int w) -{ - uint32_t s, m; - - __m128i xmm_src_lo, xmm_src_hi; - __m128i xmm_mask_lo, xmm_mask_hi; - __m128i xmm_dst_lo, xmm_dst_hi; - - while (w && (unsigned long)pd & 15) - { - s = *ps++; - m = *pm++; - *pd++ = pack_1x128_32 ( - pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m))); - w--; - } - - while (w >= 4) - { - xmm_src_hi = load_128_unaligned ((__m128i*)ps); - xmm_mask_hi = load_128_unaligned ((__m128i*)pm); - - unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); - - pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, - &xmm_mask_lo, &xmm_mask_hi, - &xmm_dst_lo, &xmm_dst_hi); - - 
save_128_aligned ( - (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - - ps += 4; - pd += 4; - pm += 4; - w -= 4; - } - - while (w) - { - s = *ps++; - m = *pm++; - *pd++ = pack_1x128_32 ( - pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m))); - w--; - } -} - -static force_inline uint32_t -core_combine_over_ca_pixel_sse2 (uint32_t src, - uint32_t mask, - uint32_t dst) -{ - __m128i s = unpack_32_1x128 (src); - __m128i expAlpha = expand_alpha_1x128 (s); - __m128i unpk_mask = unpack_32_1x128 (mask); - __m128i unpk_dst = unpack_32_1x128 (dst); - - return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst)); -} - -static void -sse2_combine_over_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * pd, - const uint32_t * ps, - const uint32_t * pm, - int w) -{ - uint32_t s, m, d; - - __m128i xmm_alpha_lo, xmm_alpha_hi; - __m128i xmm_src_lo, xmm_src_hi; - __m128i xmm_dst_lo, xmm_dst_hi; - __m128i xmm_mask_lo, xmm_mask_hi; - - while (w && (unsigned long)pd & 15) - { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d); - w--; - } - - while (w >= 4) - { - xmm_dst_hi = load_128_aligned ((__m128i*)pd); - xmm_src_hi = load_128_unaligned ((__m128i*)ps); - xmm_mask_hi = load_128_unaligned ((__m128i*)pm); - - unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); - - expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, - &xmm_alpha_lo, &xmm_alpha_hi); - - in_over_2x128 (&xmm_src_lo, &xmm_src_hi, - &xmm_alpha_lo, &xmm_alpha_hi, - &xmm_mask_lo, &xmm_mask_hi, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ( - (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - - ps += 4; - pd += 4; - pm += 4; - w -= 4; - } - - while (w) - { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d); - w--; - } -} - -static force_inline uint32_t 
-core_combine_over_reverse_ca_pixel_sse2 (uint32_t src, - uint32_t mask, - uint32_t dst) -{ - __m128i d = unpack_32_1x128 (dst); - - return pack_1x128_32 ( - over_1x128 (d, expand_alpha_1x128 (d), - pix_multiply_1x128 (unpack_32_1x128 (src), - unpack_32_1x128 (mask)))); -} - -static void -sse2_combine_over_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * pd, - const uint32_t * ps, - const uint32_t * pm, - int w) -{ - uint32_t s, m, d; - - __m128i xmm_alpha_lo, xmm_alpha_hi; - __m128i xmm_src_lo, xmm_src_hi; - __m128i xmm_dst_lo, xmm_dst_hi; - __m128i xmm_mask_lo, xmm_mask_hi; - - while (w && (unsigned long)pd & 15) - { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d); - w--; - } - - while (w >= 4) - { - xmm_dst_hi = load_128_aligned ((__m128i*)pd); - xmm_src_hi = load_128_unaligned ((__m128i*)ps); - xmm_mask_hi = load_128_unaligned ((__m128i*)pm); - - unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); - - expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, - &xmm_alpha_lo, &xmm_alpha_hi); - pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, - &xmm_mask_lo, &xmm_mask_hi, - &xmm_mask_lo, &xmm_mask_hi); - - over_2x128 (&xmm_dst_lo, &xmm_dst_hi, - &xmm_alpha_lo, &xmm_alpha_hi, - &xmm_mask_lo, &xmm_mask_hi); - - save_128_aligned ( - (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); - - ps += 4; - pd += 4; - pm += 4; - w -= 4; - } - - while (w) - { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d); - w--; - } -} - -static void -sse2_combine_in_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * pd, - const uint32_t * ps, - const uint32_t * pm, - int w) -{ - uint32_t s, m, d; - - __m128i xmm_alpha_lo, xmm_alpha_hi; - __m128i xmm_src_lo, xmm_src_hi; - __m128i xmm_dst_lo, xmm_dst_hi; - __m128i xmm_mask_lo, xmm_mask_hi; - 
- while (w && (unsigned long)pd & 15) - { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = pack_1x128_32 ( - pix_multiply_1x128 ( - pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)), - expand_alpha_1x128 (unpack_32_1x128 (d)))); - - w--; - } - - while (w >= 4) - { - xmm_dst_hi = load_128_aligned ((__m128i*)pd); - xmm_src_hi = load_128_unaligned ((__m128i*)ps); - xmm_mask_hi = load_128_unaligned ((__m128i*)pm); - - unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); - - expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, - &xmm_alpha_lo, &xmm_alpha_hi); - - pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, - &xmm_mask_lo, &xmm_mask_hi, - &xmm_dst_lo, &xmm_dst_hi); - - pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, - &xmm_alpha_lo, &xmm_alpha_hi, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ( - (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - - ps += 4; - pd += 4; - pm += 4; - w -= 4; - } - - while (w) - { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = pack_1x128_32 ( - pix_multiply_1x128 ( - pix_multiply_1x128 ( - unpack_32_1x128 (s), unpack_32_1x128 (m)), - expand_alpha_1x128 (unpack_32_1x128 (d)))); - - w--; - } -} - -static void -sse2_combine_in_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * pd, - const uint32_t * ps, - const uint32_t * pm, - int w) -{ - uint32_t s, m, d; - - __m128i xmm_alpha_lo, xmm_alpha_hi; - __m128i xmm_src_lo, xmm_src_hi; - __m128i xmm_dst_lo, xmm_dst_hi; - __m128i xmm_mask_lo, xmm_mask_hi; - - while (w && (unsigned long)pd & 15) - { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = pack_1x128_32 ( - pix_multiply_1x128 ( - unpack_32_1x128 (d), - pix_multiply_1x128 (unpack_32_1x128 (m), - expand_alpha_1x128 (unpack_32_1x128 (s))))); - w--; - } - - while (w >= 4) - { - xmm_dst_hi = load_128_aligned ((__m128i*)pd); - xmm_src_hi = load_128_unaligned ((__m128i*)ps); - xmm_mask_hi = 
load_128_unaligned ((__m128i*)pm); - - unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); - - expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, - &xmm_alpha_lo, &xmm_alpha_hi); - pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, - &xmm_alpha_lo, &xmm_alpha_hi, - &xmm_alpha_lo, &xmm_alpha_hi); - - pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, - &xmm_alpha_lo, &xmm_alpha_hi, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ( - (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - - ps += 4; - pd += 4; - pm += 4; - w -= 4; - } - - while (w) - { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = pack_1x128_32 ( - pix_multiply_1x128 ( - unpack_32_1x128 (d), - pix_multiply_1x128 (unpack_32_1x128 (m), - expand_alpha_1x128 (unpack_32_1x128 (s))))); - w--; - } -} - -static void -sse2_combine_out_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * pd, - const uint32_t * ps, - const uint32_t * pm, - int w) -{ - uint32_t s, m, d; - - __m128i xmm_alpha_lo, xmm_alpha_hi; - __m128i xmm_src_lo, xmm_src_hi; - __m128i xmm_dst_lo, xmm_dst_hi; - __m128i xmm_mask_lo, xmm_mask_hi; - - while (w && (unsigned long)pd & 15) - { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = pack_1x128_32 ( - pix_multiply_1x128 ( - pix_multiply_1x128 ( - unpack_32_1x128 (s), unpack_32_1x128 (m)), - negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d))))); - w--; - } - - while (w >= 4) - { - xmm_dst_hi = load_128_aligned ((__m128i*)pd); - xmm_src_hi = load_128_unaligned ((__m128i*)ps); - xmm_mask_hi = load_128_unaligned ((__m128i*)pm); - - unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); - - expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, - &xmm_alpha_lo, &xmm_alpha_hi); - negate_2x128 (xmm_alpha_lo, xmm_alpha_hi, - &xmm_alpha_lo, &xmm_alpha_hi); - - 
pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, - &xmm_mask_lo, &xmm_mask_hi, - &xmm_dst_lo, &xmm_dst_hi); - pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, - &xmm_alpha_lo, &xmm_alpha_hi, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ( - (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - - ps += 4; - pd += 4; - pm += 4; - w -= 4; - } - - while (w) - { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = pack_1x128_32 ( - pix_multiply_1x128 ( - pix_multiply_1x128 ( - unpack_32_1x128 (s), unpack_32_1x128 (m)), - negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d))))); - - w--; - } -} - -static void -sse2_combine_out_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * pd, - const uint32_t * ps, - const uint32_t * pm, - int w) -{ - uint32_t s, m, d; - - __m128i xmm_alpha_lo, xmm_alpha_hi; - __m128i xmm_src_lo, xmm_src_hi; - __m128i xmm_dst_lo, xmm_dst_hi; - __m128i xmm_mask_lo, xmm_mask_hi; - - while (w && (unsigned long)pd & 15) - { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = pack_1x128_32 ( - pix_multiply_1x128 ( - unpack_32_1x128 (d), - negate_1x128 (pix_multiply_1x128 ( - unpack_32_1x128 (m), - expand_alpha_1x128 (unpack_32_1x128 (s)))))); - w--; - } - - while (w >= 4) - { - xmm_dst_hi = load_128_aligned ((__m128i*)pd); - xmm_src_hi = load_128_unaligned ((__m128i*)ps); - xmm_mask_hi = load_128_unaligned ((__m128i*)pm); - - unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); - - expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, - &xmm_alpha_lo, &xmm_alpha_hi); - - pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, - &xmm_alpha_lo, &xmm_alpha_hi, - &xmm_mask_lo, &xmm_mask_hi); - - negate_2x128 (xmm_mask_lo, xmm_mask_hi, - &xmm_mask_lo, &xmm_mask_hi); - - pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, - &xmm_mask_lo, &xmm_mask_hi, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ( - (__m128i*)pd, pack_2x128_128 
(xmm_dst_lo, xmm_dst_hi)); - - ps += 4; - pd += 4; - pm += 4; - w -= 4; - } - - while (w) - { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = pack_1x128_32 ( - pix_multiply_1x128 ( - unpack_32_1x128 (d), - negate_1x128 (pix_multiply_1x128 ( - unpack_32_1x128 (m), - expand_alpha_1x128 (unpack_32_1x128 (s)))))); - w--; - } -} - -static force_inline uint32_t -core_combine_atop_ca_pixel_sse2 (uint32_t src, - uint32_t mask, - uint32_t dst) -{ - __m128i m = unpack_32_1x128 (mask); - __m128i s = unpack_32_1x128 (src); - __m128i d = unpack_32_1x128 (dst); - __m128i sa = expand_alpha_1x128 (s); - __m128i da = expand_alpha_1x128 (d); - - s = pix_multiply_1x128 (s, m); - m = negate_1x128 (pix_multiply_1x128 (m, sa)); - - return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da)); -} - -static void -sse2_combine_atop_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * pd, - const uint32_t * ps, - const uint32_t * pm, - int w) -{ - uint32_t s, m, d; - - __m128i xmm_src_lo, xmm_src_hi; - __m128i xmm_dst_lo, xmm_dst_hi; - __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; - __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; - __m128i xmm_mask_lo, xmm_mask_hi; - - while (w && (unsigned long)pd & 15) - { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); - w--; - } - - while (w >= 4) - { - xmm_dst_hi = load_128_aligned ((__m128i*)pd); - xmm_src_hi = load_128_unaligned ((__m128i*)ps); - xmm_mask_hi = load_128_unaligned ((__m128i*)pm); - - unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); - - expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, - &xmm_alpha_src_lo, &xmm_alpha_src_hi); - expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, - &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); - - pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, - &xmm_mask_lo, &xmm_mask_hi, - &xmm_src_lo, &xmm_src_hi); - pix_multiply_2x128 (&xmm_mask_lo, 
&xmm_mask_hi, - &xmm_alpha_src_lo, &xmm_alpha_src_hi, - &xmm_mask_lo, &xmm_mask_hi); - - negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); - - pix_add_multiply_2x128 ( - &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, - &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ( - (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - - ps += 4; - pd += 4; - pm += 4; - w -= 4; - } - - while (w) - { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); - w--; - } -} - -static force_inline uint32_t -core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src, - uint32_t mask, - uint32_t dst) -{ - __m128i m = unpack_32_1x128 (mask); - __m128i s = unpack_32_1x128 (src); - __m128i d = unpack_32_1x128 (dst); - - __m128i da = negate_1x128 (expand_alpha_1x128 (d)); - __m128i sa = expand_alpha_1x128 (s); - - s = pix_multiply_1x128 (s, m); - m = pix_multiply_1x128 (m, sa); - - return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da)); -} - -static void -sse2_combine_atop_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * pd, - const uint32_t * ps, - const uint32_t * pm, - int w) -{ - uint32_t s, m, d; - - __m128i xmm_src_lo, xmm_src_hi; - __m128i xmm_dst_lo, xmm_dst_hi; - __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; - __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; - __m128i xmm_mask_lo, xmm_mask_hi; - - while (w && (unsigned long)pd & 15) - { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); - w--; - } - - while (w >= 4) - { - xmm_dst_hi = load_128_aligned ((__m128i*)pd); - xmm_src_hi = load_128_unaligned ((__m128i*)ps); - xmm_mask_hi = load_128_unaligned ((__m128i*)pm); - - unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); - - expand_alpha_2x128 (xmm_src_lo, 
xmm_src_hi, - &xmm_alpha_src_lo, &xmm_alpha_src_hi); - expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, - &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); - - pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, - &xmm_mask_lo, &xmm_mask_hi, - &xmm_src_lo, &xmm_src_hi); - pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, - &xmm_alpha_src_lo, &xmm_alpha_src_hi, - &xmm_mask_lo, &xmm_mask_hi); - - negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, - &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); - - pix_add_multiply_2x128 ( - &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, - &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ( - (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - - ps += 4; - pd += 4; - pm += 4; - w -= 4; - } - - while (w) - { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); - w--; - } -} - -static force_inline uint32_t -core_combine_xor_ca_pixel_sse2 (uint32_t src, - uint32_t mask, - uint32_t dst) -{ - __m128i a = unpack_32_1x128 (mask); - __m128i s = unpack_32_1x128 (src); - __m128i d = unpack_32_1x128 (dst); - - __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 ( - a, expand_alpha_1x128 (s))); - __m128i dest = pix_multiply_1x128 (s, a); - __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d)); - - return pack_1x128_32 (pix_add_multiply_1x128 (&d, - &alpha_dst, - &dest, - &alpha_src)); -} - -static void -sse2_combine_xor_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * pd, - const uint32_t * ps, - const uint32_t * pm, - int w) -{ - uint32_t s, m, d; - - __m128i xmm_src_lo, xmm_src_hi; - __m128i xmm_dst_lo, xmm_dst_hi; - __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; - __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; - __m128i xmm_mask_lo, xmm_mask_hi; - - while (w && (unsigned long)pd & 15) - { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); - w--; - } - - while (w >= 4) - { - xmm_dst_hi = 
load_128_aligned ((__m128i*)pd); - xmm_src_hi = load_128_unaligned ((__m128i*)ps); - xmm_mask_hi = load_128_unaligned ((__m128i*)pm); - - unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); - - expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, - &xmm_alpha_src_lo, &xmm_alpha_src_hi); - expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, - &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); - - pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, - &xmm_mask_lo, &xmm_mask_hi, - &xmm_src_lo, &xmm_src_hi); - pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, - &xmm_alpha_src_lo, &xmm_alpha_src_hi, - &xmm_mask_lo, &xmm_mask_hi); - - negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, - &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); - negate_2x128 (xmm_mask_lo, xmm_mask_hi, - &xmm_mask_lo, &xmm_mask_hi); - - pix_add_multiply_2x128 ( - &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, - &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ( - (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - - ps += 4; - pd += 4; - pm += 4; - w -= 4; - } - - while (w) - { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); - w--; - } -} - -static void -sse2_combine_add_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * pd, - const uint32_t * ps, - const uint32_t * pm, - int w) -{ - uint32_t s, m, d; - - __m128i xmm_src_lo, xmm_src_hi; - __m128i xmm_dst_lo, xmm_dst_hi; - __m128i xmm_mask_lo, xmm_mask_hi; - - while (w && (unsigned long)pd & 15) - { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = pack_1x128_32 ( - _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s), - unpack_32_1x128 (m)), - unpack_32_1x128 (d))); - w--; - } - - while (w >= 4) - { - xmm_src_hi = load_128_unaligned ((__m128i*)ps); - xmm_mask_hi = load_128_unaligned ((__m128i*)pm); - xmm_dst_hi = load_128_aligned 
((__m128i*)pd); - - unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); - unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - - pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, - &xmm_mask_lo, &xmm_mask_hi, - &xmm_src_lo, &xmm_src_hi); - - save_128_aligned ( - (__m128i*)pd, pack_2x128_128 ( - _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo), - _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi))); - - ps += 4; - pd += 4; - pm += 4; - w -= 4; - } - - while (w) - { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = pack_1x128_32 ( - _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s), - unpack_32_1x128 (m)), - unpack_32_1x128 (d))); - w--; - } -} - -static force_inline __m128i -create_mask_16_128 (uint16_t mask) -{ - return _mm_set1_epi16 (mask); -} - -/* Work around a code generation bug in Sun Studio 12. */ -#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590) -# define create_mask_2x32_128(mask0, mask1) \ - (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1))) -#else -static force_inline __m128i -create_mask_2x32_128 (uint32_t mask0, - uint32_t mask1) -{ - return _mm_set_epi32 (mask0, mask1, mask0, mask1); -} -#endif - -static void -sse2_composite_over_n_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t src; - uint32_t *dst_line, *dst, d; - int32_t w; - int dst_stride; - __m128i xmm_src, xmm_alpha; - __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; - - src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); - - if (src == 0) - return; - - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - - xmm_src = expand_pixel_32_1x128 (src); - xmm_alpha = expand_alpha_1x128 (xmm_src); - - while (height--) - { - dst = dst_line; - - dst_line 
+= dst_stride; - w = width; - - while (w && (unsigned long)dst & 15) - { - d = *dst; - *dst++ = pack_1x128_32 (over_1x128 (xmm_src, - xmm_alpha, - unpack_32_1x128 (d))); - w--; - } - - while (w >= 4) - { - xmm_dst = load_128_aligned ((__m128i*)dst); - - unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); - - over_2x128 (&xmm_src, &xmm_src, - &xmm_alpha, &xmm_alpha, - &xmm_dst_lo, &xmm_dst_hi); - - /* rebuid the 4 pixel data and save*/ - save_128_aligned ( - (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - - w -= 4; - dst += 4; - } - - while (w) - { - d = *dst; - *dst++ = pack_1x128_32 (over_1x128 (xmm_src, - xmm_alpha, - unpack_32_1x128 (d))); - w--; - } - - } -} - -static void -sse2_composite_over_n_0565 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t src; - uint16_t *dst_line, *dst, d; - int32_t w; - int dst_stride; - __m128i xmm_src, xmm_alpha; - __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; - - src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); - - if (src == 0) - return; - - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); - - xmm_src = expand_pixel_32_1x128 (src); - xmm_alpha = expand_alpha_1x128 (xmm_src); - - while (height--) - { - dst = dst_line; - - dst_line += dst_stride; - w = width; - - while (w && (unsigned long)dst & 15) - { - d = *dst; - - *dst++ = pack_565_32_16 ( - pack_1x128_32 (over_1x128 (xmm_src, - xmm_alpha, - expand565_16_1x128 (d)))); - w--; - } - - while (w >= 8) - { - xmm_dst = load_128_aligned ((__m128i*)dst); - - unpack_565_128_4x128 (xmm_dst, - &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); - - over_2x128 (&xmm_src, &xmm_src, - &xmm_alpha, &xmm_alpha, - &xmm_dst0, &xmm_dst1); - over_2x128 (&xmm_src, &xmm_src, - 
&xmm_alpha, &xmm_alpha, - &xmm_dst2, &xmm_dst3); - - xmm_dst = pack_565_4x128_128 ( - &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); - - save_128_aligned ((__m128i*)dst, xmm_dst); - - dst += 8; - w -= 8; - } - - while (w--) - { - d = *dst; - *dst++ = pack_565_32_16 ( - pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha, - expand565_16_1x128 (d)))); - } - } - -} - -static void -sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t src, srca; - uint32_t *dst_line, d; - uint32_t *mask_line, m; - uint32_t pack_cmp; - int dst_stride, mask_stride; - - __m128i xmm_src, xmm_alpha; - __m128i xmm_dst; - __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; - - __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; - - src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); - srca = src >> 24; - - if (src == 0) - return; - - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE ( - mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); - - xmm_src = _mm_unpacklo_epi8 ( - create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); - xmm_alpha = expand_alpha_1x128 (xmm_src); - mmx_src = xmm_src; - mmx_alpha = xmm_alpha; - - while (height--) - { - int w = width; - const uint32_t *pm = (uint32_t *)mask_line; - uint32_t *pd = (uint32_t *)dst_line; - - dst_line += dst_stride; - mask_line += mask_stride; - - while (w && (unsigned long)pd & 15) - { - m = *pm++; - - if (m) - { - d = *pd; - - mmx_mask = unpack_32_1x128 (m); - mmx_dest = unpack_32_1x128 (d); - - *pd = pack_1x128_32 ( - _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), - mmx_dest)); - } - - pd++; - w--; - } - - while (w >= 4) - { - xmm_mask = load_128_unaligned ((__m128i*)pm); - - pack_cmp 
= - _mm_movemask_epi8 ( - _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); - - /* if all bits in mask are zero, pack_cmp are equal to 0xffff */ - if (pack_cmp != 0xffff) - { - xmm_dst = load_128_aligned ((__m128i*)pd); - - unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); - - pix_multiply_2x128 (&xmm_src, &xmm_src, - &xmm_mask_lo, &xmm_mask_hi, - &xmm_mask_lo, &xmm_mask_hi); - xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi); - - save_128_aligned ( - (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst)); - } - - pd += 4; - pm += 4; - w -= 4; - } - - while (w) - { - m = *pm++; - - if (m) - { - d = *pd; - - mmx_mask = unpack_32_1x128 (m); - mmx_dest = unpack_32_1x128 (d); - - *pd = pack_1x128_32 ( - _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), - mmx_dest)); - } - - pd++; - w--; - } - } - -} - -static void -sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t src; - uint32_t *dst_line, d; - uint32_t *mask_line, m; - uint32_t pack_cmp; - int dst_stride, mask_stride; - - __m128i xmm_src, xmm_alpha; - __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; - __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; - - __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; - - src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); - - if (src == 0) - return; - - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE ( - mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); - - xmm_src = _mm_unpacklo_epi8 ( - create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); - xmm_alpha = expand_alpha_1x128 (xmm_src); - mmx_src = xmm_src; - mmx_alpha = xmm_alpha; - - while (height--) - { - int w = width; - const uint32_t *pm = (uint32_t 
*)mask_line; - uint32_t *pd = (uint32_t *)dst_line; - - dst_line += dst_stride; - mask_line += mask_stride; - - while (w && (unsigned long)pd & 15) - { - m = *pm++; - - if (m) - { - d = *pd; - mmx_mask = unpack_32_1x128 (m); - mmx_dest = unpack_32_1x128 (d); - - *pd = pack_1x128_32 (in_over_1x128 (&mmx_src, - &mmx_alpha, - &mmx_mask, - &mmx_dest)); - } - - pd++; - w--; - } - - while (w >= 4) - { - xmm_mask = load_128_unaligned ((__m128i*)pm); - - pack_cmp = - _mm_movemask_epi8 ( - _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); - - /* if all bits in mask are zero, pack_cmp are equal to 0xffff */ - if (pack_cmp != 0xffff) - { - xmm_dst = load_128_aligned ((__m128i*)pd); - - unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); - unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); - - in_over_2x128 (&xmm_src, &xmm_src, - &xmm_alpha, &xmm_alpha, - &xmm_mask_lo, &xmm_mask_hi, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ( - (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - } - - pd += 4; - pm += 4; - w -= 4; - } - - while (w) - { - m = *pm++; - - if (m) - { - d = *pd; - mmx_mask = unpack_32_1x128 (m); - mmx_dest = unpack_32_1x128 (d); - - *pd = pack_1x128_32 ( - in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); - } - - pd++; - w--; - } - } - -} - -static void -sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t *dst_line, *dst; - uint32_t *src_line, *src; - uint32_t mask; - int32_t w; - int dst_stride, src_stride; - - __m128i xmm_mask; - __m128i xmm_src, xmm_src_lo, xmm_src_hi; - __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; - __m128i xmm_alpha_lo, xmm_alpha_hi; - - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - 
PIXMAN_IMAGE_GET_LINE ( - src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - - mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8); - - xmm_mask = create_mask_16_128 (mask >> 24); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - - while (w && (unsigned long)dst & 15) - { - uint32_t s = *src++; - - if (s) - { - uint32_t d = *dst; - - __m128i ms = unpack_32_1x128 (s); - __m128i alpha = expand_alpha_1x128 (ms); - __m128i dest = xmm_mask; - __m128i alpha_dst = unpack_32_1x128 (d); - - *dst = pack_1x128_32 ( - in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); - } - dst++; - w--; - } - - while (w >= 4) - { - xmm_src = load_128_unaligned ((__m128i*)src); - - if (!is_zero (xmm_src)) - { - xmm_dst = load_128_aligned ((__m128i*)dst); - - unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); - expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, - &xmm_alpha_lo, &xmm_alpha_hi); - - in_over_2x128 (&xmm_src_lo, &xmm_src_hi, - &xmm_alpha_lo, &xmm_alpha_hi, - &xmm_mask, &xmm_mask, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ( - (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - } - - dst += 4; - src += 4; - w -= 4; - } - - while (w) - { - uint32_t s = *src++; - - if (s) - { - uint32_t d = *dst; - - __m128i ms = unpack_32_1x128 (s); - __m128i alpha = expand_alpha_1x128 (ms); - __m128i mask = xmm_mask; - __m128i dest = unpack_32_1x128 (d); - - *dst = pack_1x128_32 ( - in_over_1x128 (&ms, &alpha, &mask, &dest)); - } - - dst++; - w--; - } - } - -} - -static void -sse2_composite_src_x888_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t *dst_line, *dst; - uint32_t *src_line, 
*src; - int32_t w; - int dst_stride, src_stride; - - - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE ( - src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - - while (w && (unsigned long)dst & 15) - { - *dst++ = *src++ | 0xff000000; - w--; - } - - while (w >= 16) - { - __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4; - - xmm_src1 = load_128_unaligned ((__m128i*)src + 0); - xmm_src2 = load_128_unaligned ((__m128i*)src + 1); - xmm_src3 = load_128_unaligned ((__m128i*)src + 2); - xmm_src4 = load_128_unaligned ((__m128i*)src + 3); - - save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000)); - save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000)); - save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000)); - save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000)); - - dst += 16; - src += 16; - w -= 16; - } - - while (w) - { - *dst++ = *src++ | 0xff000000; - w--; - } - } - -} - -static void -sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t *dst_line, *dst; - uint32_t *src_line, *src; - uint32_t mask; - int dst_stride, src_stride; - int32_t w; - - __m128i xmm_mask, xmm_alpha; - __m128i xmm_src, xmm_src_lo, xmm_src_hi; - __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; - - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE ( - src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - - mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8); - - 
xmm_mask = create_mask_16_128 (mask >> 24); - xmm_alpha = mask_00ff; - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - - while (w && (unsigned long)dst & 15) - { - uint32_t s = (*src++) | 0xff000000; - uint32_t d = *dst; - - __m128i src = unpack_32_1x128 (s); - __m128i alpha = xmm_alpha; - __m128i mask = xmm_mask; - __m128i dest = unpack_32_1x128 (d); - - *dst++ = pack_1x128_32 ( - in_over_1x128 (&src, &alpha, &mask, &dest)); - - w--; - } - - while (w >= 4) - { - xmm_src = _mm_or_si128 ( - load_128_unaligned ((__m128i*)src), mask_ff000000); - xmm_dst = load_128_aligned ((__m128i*)dst); - - unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); - - in_over_2x128 (&xmm_src_lo, &xmm_src_hi, - &xmm_alpha, &xmm_alpha, - &xmm_mask, &xmm_mask, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ( - (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - - dst += 4; - src += 4; - w -= 4; - - } - - while (w) - { - uint32_t s = (*src++) | 0xff000000; - uint32_t d = *dst; - - __m128i src = unpack_32_1x128 (s); - __m128i alpha = xmm_alpha; - __m128i mask = xmm_mask; - __m128i dest = unpack_32_1x128 (d); - - *dst++ = pack_1x128_32 ( - in_over_1x128 (&src, &alpha, &mask, &dest)); - - w--; - } - } - -} - -static void -sse2_composite_over_8888_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - int dst_stride, src_stride; - uint32_t *dst_line, *dst; - uint32_t *src_line, *src; - - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE ( - src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - - dst = dst_line; - src = src_line; - - while (height--) - 
{ - sse2_combine_over_u (imp, op, dst, src, NULL, width); - - dst += dst_stride; - src += src_stride; - } -} - -static force_inline uint16_t -composite_over_8888_0565pixel (uint32_t src, uint16_t dst) -{ - __m128i ms; - - ms = unpack_32_1x128 (src); - return pack_565_32_16 ( - pack_1x128_32 ( - over_1x128 ( - ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst)))); -} - -static void -sse2_composite_over_8888_0565 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint16_t *dst_line, *dst, d; - uint32_t *src_line, *src, s; - int dst_stride, src_stride; - int32_t w; - - __m128i xmm_alpha_lo, xmm_alpha_hi; - __m128i xmm_src, xmm_src_lo, xmm_src_hi; - __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; - - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE ( - src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - - while (height--) - { - dst = dst_line; - src = src_line; - - dst_line += dst_stride; - src_line += src_stride; - w = width; - - /* Align dst on a 16-byte boundary */ - while (w && - ((unsigned long)dst & 15)) - { - s = *src++; - d = *dst; - - *dst++ = composite_over_8888_0565pixel (s, d); - w--; - } - - /* It's a 8 pixel loop */ - while (w >= 8) - { - /* I'm loading unaligned because I'm not sure - * about the address alignment. - */ - xmm_src = load_128_unaligned ((__m128i*) src); - xmm_dst = load_128_aligned ((__m128i*) dst); - - /* Unpacking */ - unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); - unpack_565_128_4x128 (xmm_dst, - &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); - expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, - &xmm_alpha_lo, &xmm_alpha_hi); - - /* I'm loading next 4 pixels from memory - * before to optimze the memory read. 
- */ - xmm_src = load_128_unaligned ((__m128i*) (src + 4)); - - over_2x128 (&xmm_src_lo, &xmm_src_hi, - &xmm_alpha_lo, &xmm_alpha_hi, - &xmm_dst0, &xmm_dst1); - - /* Unpacking */ - unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); - expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, - &xmm_alpha_lo, &xmm_alpha_hi); - - over_2x128 (&xmm_src_lo, &xmm_src_hi, - &xmm_alpha_lo, &xmm_alpha_hi, - &xmm_dst2, &xmm_dst3); - - save_128_aligned ( - (__m128i*)dst, pack_565_4x128_128 ( - &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); - - w -= 8; - dst += 8; - src += 8; - } - - while (w--) - { - s = *src++; - d = *dst; - - *dst++ = composite_over_8888_0565pixel (s, d); - } - } - -} - -static void -sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t src, srca; - uint32_t *dst_line, *dst; - uint8_t *mask_line, *mask; - int dst_stride, mask_stride; - int32_t w; - uint32_t m, d; - - __m128i xmm_src, xmm_alpha, xmm_def; - __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; - __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; - - __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; - - src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); - - srca = src >> 24; - if (src == 0) - return; - - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE ( - mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - - xmm_def = create_mask_2x32_128 (src, src); - xmm_src = expand_pixel_32_1x128 (src); - xmm_alpha = expand_alpha_1x128 (xmm_src); - mmx_src = xmm_src; - mmx_alpha = xmm_alpha; - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - w = width; - - while (w && (unsigned long)dst & 15) - { - uint8_t m = 
*mask++; - - if (m) - { - d = *dst; - mmx_mask = expand_pixel_8_1x128 (m); - mmx_dest = unpack_32_1x128 (d); - - *dst = pack_1x128_32 (in_over_1x128 (&mmx_src, - &mmx_alpha, - &mmx_mask, - &mmx_dest)); - } - - w--; - dst++; - } - - while (w >= 4) - { - m = *((uint32_t*)mask); - - if (srca == 0xff && m == 0xffffffff) - { - save_128_aligned ((__m128i*)dst, xmm_def); - } - else if (m) - { - xmm_dst = load_128_aligned ((__m128i*) dst); - xmm_mask = unpack_32_1x128 (m); - xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); - - /* Unpacking */ - unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); - unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); - - expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, - &xmm_mask_lo, &xmm_mask_hi); - - in_over_2x128 (&xmm_src, &xmm_src, - &xmm_alpha, &xmm_alpha, - &xmm_mask_lo, &xmm_mask_hi, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ( - (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - } - - w -= 4; - dst += 4; - mask += 4; - } - - while (w) - { - uint8_t m = *mask++; - - if (m) - { - d = *dst; - mmx_mask = expand_pixel_8_1x128 (m); - mmx_dest = unpack_32_1x128 (d); - - *dst = pack_1x128_32 (in_over_1x128 (&mmx_src, - &mmx_alpha, - &mmx_mask, - &mmx_dest)); - } - - w--; - dst++; - } - } - -} - -static pixman_bool_t -pixman_fill_sse2 (uint32_t *bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t data) -{ - uint32_t byte_width; - uint8_t *byte_line; - - __m128i xmm_def; - - if (bpp == 8) - { - uint8_t b; - uint16_t w; - - stride = stride * (int) sizeof (uint32_t) / 1; - byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); - byte_width = width; - stride *= 1; - - b = data & 0xff; - w = (b << 8) | b; - data = (w << 16) | w; - } - else if (bpp == 16) - { - stride = stride * (int) sizeof (uint32_t) / 2; - byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); - byte_width = 2 * width; - stride *= 2; - - data = (data & 0xffff) * 0x00010001; - } - else if (bpp 
== 32) - { - stride = stride * (int) sizeof (uint32_t) / 4; - byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); - byte_width = 4 * width; - stride *= 4; - } - else - { - return FALSE; - } - - xmm_def = create_mask_2x32_128 (data, data); - - while (height--) - { - int w; - uint8_t *d = byte_line; - byte_line += stride; - w = byte_width; - - while (w >= 1 && ((unsigned long)d & 1)) - { - *(uint8_t *)d = data; - w -= 1; - d += 1; - } - - while (w >= 2 && ((unsigned long)d & 3)) - { - *(uint16_t *)d = data; - w -= 2; - d += 2; - } - - while (w >= 4 && ((unsigned long)d & 15)) - { - *(uint32_t *)d = data; - - w -= 4; - d += 4; - } - - while (w >= 128) - { - save_128_aligned ((__m128i*)(d), xmm_def); - save_128_aligned ((__m128i*)(d + 16), xmm_def); - save_128_aligned ((__m128i*)(d + 32), xmm_def); - save_128_aligned ((__m128i*)(d + 48), xmm_def); - save_128_aligned ((__m128i*)(d + 64), xmm_def); - save_128_aligned ((__m128i*)(d + 80), xmm_def); - save_128_aligned ((__m128i*)(d + 96), xmm_def); - save_128_aligned ((__m128i*)(d + 112), xmm_def); - - d += 128; - w -= 128; - } - - if (w >= 64) - { - save_128_aligned ((__m128i*)(d), xmm_def); - save_128_aligned ((__m128i*)(d + 16), xmm_def); - save_128_aligned ((__m128i*)(d + 32), xmm_def); - save_128_aligned ((__m128i*)(d + 48), xmm_def); - - d += 64; - w -= 64; - } - - if (w >= 32) - { - save_128_aligned ((__m128i*)(d), xmm_def); - save_128_aligned ((__m128i*)(d + 16), xmm_def); - - d += 32; - w -= 32; - } - - if (w >= 16) - { - save_128_aligned ((__m128i*)(d), xmm_def); - - d += 16; - w -= 16; - } - - while (w >= 4) - { - *(uint32_t *)d = data; - - w -= 4; - d += 4; - } - - if (w >= 2) - { - *(uint16_t *)d = data; - w -= 2; - d += 2; - } - - if (w >= 1) - { - *(uint8_t *)d = data; - w -= 1; - d += 1; - } - } - - return TRUE; -} - -static void -sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - 
int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t src, srca; - uint32_t *dst_line, *dst; - uint8_t *mask_line, *mask; - int dst_stride, mask_stride; - int32_t w; - uint32_t m; - - __m128i xmm_src, xmm_def; - __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; - - src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); - - srca = src >> 24; - if (src == 0) - { - pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride, - PIXMAN_FORMAT_BPP (dst_image->bits.format), - dest_x, dest_y, width, height, 0); - return; - } - - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE ( - mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - - xmm_def = create_mask_2x32_128 (src, src); - xmm_src = expand_pixel_32_1x128 (src); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - w = width; - - while (w && (unsigned long)dst & 15) - { - uint8_t m = *mask++; - - if (m) - { - *dst = pack_1x128_32 ( - pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m))); - } - else - { - *dst = 0; - } - - w--; - dst++; - } - - while (w >= 4) - { - m = *((uint32_t*)mask); - - if (srca == 0xff && m == 0xffffffff) - { - save_128_aligned ((__m128i*)dst, xmm_def); - } - else if (m) - { - xmm_mask = unpack_32_1x128 (m); - xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); - - /* Unpacking */ - unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); - - expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, - &xmm_mask_lo, &xmm_mask_hi); - - pix_multiply_2x128 (&xmm_src, &xmm_src, - &xmm_mask_lo, &xmm_mask_hi, - &xmm_mask_lo, &xmm_mask_hi); - - save_128_aligned ( - (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); - } - else - { - save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ()); - } - - w -= 4; - dst += 4; - mask += 4; - } - - 
while (w) - { - uint8_t m = *mask++; - - if (m) - { - *dst = pack_1x128_32 ( - pix_multiply_1x128 ( - xmm_src, expand_pixel_8_1x128 (m))); - } - else - { - *dst = 0; - } - - w--; - dst++; - } - } - -} - -static void -sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t src, srca; - uint16_t *dst_line, *dst, d; - uint8_t *mask_line, *mask; - int dst_stride, mask_stride; - int32_t w; - uint32_t m; - __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; - - __m128i xmm_src, xmm_alpha; - __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; - __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; - - src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); - - srca = src >> 24; - if (src == 0) - return; - - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE ( - mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - - xmm_src = expand_pixel_32_1x128 (src); - xmm_alpha = expand_alpha_1x128 (xmm_src); - mmx_src = xmm_src; - mmx_alpha = xmm_alpha; - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - w = width; - - while (w && (unsigned long)dst & 15) - { - m = *mask++; - - if (m) - { - d = *dst; - mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); - mmx_dest = expand565_16_1x128 (d); - - *dst = pack_565_32_16 ( - pack_1x128_32 ( - in_over_1x128 ( - &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); - } - - w--; - dst++; - } - - while (w >= 8) - { - xmm_dst = load_128_aligned ((__m128i*) dst); - unpack_565_128_4x128 (xmm_dst, - &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); - - m = *((uint32_t*)mask); - mask += 4; - - if (m) - { - xmm_mask = unpack_32_1x128 (m); - 
xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); - - /* Unpacking */ - unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); - - expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, - &xmm_mask_lo, &xmm_mask_hi); - - in_over_2x128 (&xmm_src, &xmm_src, - &xmm_alpha, &xmm_alpha, - &xmm_mask_lo, &xmm_mask_hi, - &xmm_dst0, &xmm_dst1); - } - - m = *((uint32_t*)mask); - mask += 4; - - if (m) - { - xmm_mask = unpack_32_1x128 (m); - xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); - - /* Unpacking */ - unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); - - expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, - &xmm_mask_lo, &xmm_mask_hi); - in_over_2x128 (&xmm_src, &xmm_src, - &xmm_alpha, &xmm_alpha, - &xmm_mask_lo, &xmm_mask_hi, - &xmm_dst2, &xmm_dst3); - } - - save_128_aligned ( - (__m128i*)dst, pack_565_4x128_128 ( - &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); - - w -= 8; - dst += 8; - } - - while (w) - { - m = *mask++; - - if (m) - { - d = *dst; - mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); - mmx_dest = expand565_16_1x128 (d); - - *dst = pack_565_32_16 ( - pack_1x128_32 ( - in_over_1x128 ( - &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); - } - - w--; - dst++; - } - } - -} - -static void -sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint16_t *dst_line, *dst, d; - uint32_t *src_line, *src, s; - int dst_stride, src_stride; - int32_t w; - uint32_t opaque, zero; - - __m128i ms; - __m128i xmm_src, xmm_src_lo, xmm_src_hi; - __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; - - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE ( - src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - - 
while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - - while (w && (unsigned long)dst & 15) - { - s = *src++; - d = *dst; - - ms = unpack_32_1x128 (s); - - *dst++ = pack_565_32_16 ( - pack_1x128_32 ( - over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d)))); - w--; - } - - while (w >= 8) - { - /* First round */ - xmm_src = load_128_unaligned ((__m128i*)src); - xmm_dst = load_128_aligned ((__m128i*)dst); - - opaque = is_opaque (xmm_src); - zero = is_zero (xmm_src); - - unpack_565_128_4x128 (xmm_dst, - &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); - unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); - - /* preload next round*/ - xmm_src = load_128_unaligned ((__m128i*)(src + 4)); - - if (opaque) - { - invert_colors_2x128 (xmm_src_lo, xmm_src_hi, - &xmm_dst0, &xmm_dst1); - } - else if (!zero) - { - over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, - &xmm_dst0, &xmm_dst1); - } - - /* Second round */ - opaque = is_opaque (xmm_src); - zero = is_zero (xmm_src); - - unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); - - if (opaque) - { - invert_colors_2x128 (xmm_src_lo, xmm_src_hi, - &xmm_dst2, &xmm_dst3); - } - else if (!zero) - { - over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, - &xmm_dst2, &xmm_dst3); - } - - save_128_aligned ( - (__m128i*)dst, pack_565_4x128_128 ( - &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); - - w -= 8; - src += 8; - dst += 8; - } - - while (w) - { - s = *src++; - d = *dst; - - ms = unpack_32_1x128 (s); - - *dst++ = pack_565_32_16 ( - pack_1x128_32 ( - over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d)))); - w--; - } - } - -} - -static void -sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t *dst_line, 
*dst, d; - uint32_t *src_line, *src, s; - int dst_stride, src_stride; - int32_t w; - uint32_t opaque, zero; - - __m128i xmm_src_lo, xmm_src_hi; - __m128i xmm_dst_lo, xmm_dst_hi; - - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE ( - src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - - while (w && (unsigned long)dst & 15) - { - s = *src++; - d = *dst; - - *dst++ = pack_1x128_32 ( - over_rev_non_pre_1x128 ( - unpack_32_1x128 (s), unpack_32_1x128 (d))); - - w--; - } - - while (w >= 4) - { - xmm_src_hi = load_128_unaligned ((__m128i*)src); - - opaque = is_opaque (xmm_src_hi); - zero = is_zero (xmm_src_hi); - - unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); - - if (opaque) - { - invert_colors_2x128 (xmm_src_lo, xmm_src_hi, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ( - (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - } - else if (!zero) - { - xmm_dst_hi = load_128_aligned ((__m128i*)dst); - - unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - - over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ( - (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - } - - w -= 4; - dst += 4; - src += 4; - } - - while (w) - { - s = *src++; - d = *dst; - - *dst++ = pack_1x128_32 ( - over_rev_non_pre_1x128 ( - unpack_32_1x128 (s), unpack_32_1x128 (d))); - - w--; - } - } - -} - -static void -sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t src; - uint16_t *dst_line, *dst, d; - uint32_t *mask_line, *mask, m; - int dst_stride, 
mask_stride; - int w; - uint32_t pack_cmp; - - __m128i xmm_src, xmm_alpha; - __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; - __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; - - __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; - - src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); - - if (src == 0) - return; - - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE ( - mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); - - xmm_src = expand_pixel_32_1x128 (src); - xmm_alpha = expand_alpha_1x128 (xmm_src); - mmx_src = xmm_src; - mmx_alpha = xmm_alpha; - - while (height--) - { - w = width; - mask = mask_line; - dst = dst_line; - mask_line += mask_stride; - dst_line += dst_stride; - - while (w && ((unsigned long)dst & 15)) - { - m = *(uint32_t *) mask; - - if (m) - { - d = *dst; - mmx_mask = unpack_32_1x128 (m); - mmx_dest = expand565_16_1x128 (d); - - *dst = pack_565_32_16 ( - pack_1x128_32 ( - in_over_1x128 ( - &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); - } - - w--; - dst++; - mask++; - } - - while (w >= 8) - { - /* First round */ - xmm_mask = load_128_unaligned ((__m128i*)mask); - xmm_dst = load_128_aligned ((__m128i*)dst); - - pack_cmp = _mm_movemask_epi8 ( - _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); - - unpack_565_128_4x128 (xmm_dst, - &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); - unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); - - /* preload next round */ - xmm_mask = load_128_unaligned ((__m128i*)(mask + 4)); - - /* preload next round */ - if (pack_cmp != 0xffff) - { - in_over_2x128 (&xmm_src, &xmm_src, - &xmm_alpha, &xmm_alpha, - &xmm_mask_lo, &xmm_mask_hi, - &xmm_dst0, &xmm_dst1); - } - - /* Second round */ - pack_cmp = _mm_movemask_epi8 ( - _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); - - unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); - - if (pack_cmp != 0xffff) - { - in_over_2x128 (&xmm_src, &xmm_src, - &xmm_alpha, 
&xmm_alpha, - &xmm_mask_lo, &xmm_mask_hi, - &xmm_dst2, &xmm_dst3); - } - - save_128_aligned ( - (__m128i*)dst, pack_565_4x128_128 ( - &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); - - w -= 8; - dst += 8; - mask += 8; - } - - while (w) - { - m = *(uint32_t *) mask; - - if (m) - { - d = *dst; - mmx_mask = unpack_32_1x128 (m); - mmx_dest = expand565_16_1x128 (d); - - *dst = pack_565_32_16 ( - pack_1x128_32 ( - in_over_1x128 ( - &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); - } - - w--; - dst++; - mask++; - } - } - -} - -static void -sse2_composite_in_n_8_8 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint8_t *dst_line, *dst; - uint8_t *mask_line, *mask; - int dst_stride, mask_stride; - uint32_t d, m; - uint32_t src; - uint8_t sa; - int32_t w; - - __m128i xmm_alpha; - __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; - __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; - - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE ( - mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - - src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); - - sa = src >> 24; - - xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - w = width; - - while (w && ((unsigned long)dst & 15)) - { - m = (uint32_t) *mask++; - d = (uint32_t) *dst; - - *dst++ = (uint8_t) pack_1x128_32 ( - pix_multiply_1x128 ( - pix_multiply_1x128 (xmm_alpha, - unpack_32_1x128 (m)), - unpack_32_1x128 (d))); - w--; - } - - while (w >= 16) - { - xmm_mask = load_128_unaligned ((__m128i*)mask); - xmm_dst = load_128_aligned ((__m128i*)dst); - - unpack_128_2x128 (xmm_mask, &xmm_mask_lo, 
&xmm_mask_hi); - unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); - - pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, - &xmm_mask_lo, &xmm_mask_hi, - &xmm_mask_lo, &xmm_mask_hi); - - pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, - &xmm_dst_lo, &xmm_dst_hi, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ( - (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - - mask += 16; - dst += 16; - w -= 16; - } - - while (w) - { - m = (uint32_t) *mask++; - d = (uint32_t) *dst; - - *dst++ = (uint8_t) pack_1x128_32 ( - pix_multiply_1x128 ( - pix_multiply_1x128 ( - xmm_alpha, unpack_32_1x128 (m)), - unpack_32_1x128 (d))); - w--; - } - } - -} - -static void -sse2_composite_in_n_8 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint8_t *dst_line, *dst; - int dst_stride; - uint32_t d; - uint32_t src; - int32_t w; - - __m128i xmm_alpha; - __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; - - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); - - src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); - - xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); - - src = src >> 24; - - if (src == 0xff) - return; - - if (src == 0x00) - { - pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride, - 8, dest_x, dest_y, width, height, src); - - return; - } - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - w = width; - - while (w && ((unsigned long)dst & 15)) - { - d = (uint32_t) *dst; - - *dst++ = (uint8_t) pack_1x128_32 ( - pix_multiply_1x128 ( - xmm_alpha, - unpack_32_1x128 (d))); - w--; - } - - while (w >= 16) - { - xmm_dst = load_128_aligned ((__m128i*)dst); - - unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); - - pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, - 
&xmm_dst_lo, &xmm_dst_hi, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ( - (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - - dst += 16; - w -= 16; - } - - while (w) - { - d = (uint32_t) *dst; - - *dst++ = (uint8_t) pack_1x128_32 ( - pix_multiply_1x128 ( - xmm_alpha, - unpack_32_1x128 (d))); - w--; - } - } - -} - -static void -sse2_composite_in_8_8 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint8_t *dst_line, *dst; - uint8_t *src_line, *src; - int src_stride, dst_stride; - int32_t w; - uint32_t s, d; - - __m128i xmm_src, xmm_src_lo, xmm_src_hi; - __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; - - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE ( - src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - - while (w && ((unsigned long)dst & 15)) - { - s = (uint32_t) *src++; - d = (uint32_t) *dst; - - *dst++ = (uint8_t) pack_1x128_32 ( - pix_multiply_1x128 ( - unpack_32_1x128 (s), unpack_32_1x128 (d))); - w--; - } - - while (w >= 16) - { - xmm_src = load_128_unaligned ((__m128i*)src); - xmm_dst = load_128_aligned ((__m128i*)dst); - - unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); - - pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, - &xmm_dst_lo, &xmm_dst_hi, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ( - (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - - src += 16; - dst += 16; - w -= 16; - } - - while (w) - { - s = (uint32_t) *src++; - d = (uint32_t) *dst; - - *dst++ = (uint8_t) pack_1x128_32 ( - pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 
(d))); - w--; - } - } - -} - -static void -sse2_composite_add_n_8_8 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint8_t *dst_line, *dst; - uint8_t *mask_line, *mask; - int dst_stride, mask_stride; - int32_t w; - uint32_t src; - uint8_t sa; - uint32_t m, d; - - __m128i xmm_alpha; - __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; - __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; - - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE ( - mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - - src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); - - sa = src >> 24; - - xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - w = width; - - while (w && ((unsigned long)dst & 15)) - { - m = (uint32_t) *mask++; - d = (uint32_t) *dst; - - *dst++ = (uint8_t) pack_1x128_32 ( - _mm_adds_epu16 ( - pix_multiply_1x128 ( - xmm_alpha, unpack_32_1x128 (m)), - unpack_32_1x128 (d))); - w--; - } - - while (w >= 16) - { - xmm_mask = load_128_unaligned ((__m128i*)mask); - xmm_dst = load_128_aligned ((__m128i*)dst); - - unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); - unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); - - pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, - &xmm_mask_lo, &xmm_mask_hi, - &xmm_mask_lo, &xmm_mask_hi); - - xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo); - xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi); - - save_128_aligned ( - (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - - mask += 16; - dst += 16; - w -= 16; - } - - while (w) - { - m = (uint32_t) *mask++; - d = (uint32_t) *dst; - - 
*dst++ = (uint8_t) pack_1x128_32 ( - _mm_adds_epu16 ( - pix_multiply_1x128 ( - xmm_alpha, unpack_32_1x128 (m)), - unpack_32_1x128 (d))); - - w--; - } - } - -} - -static void -sse2_composite_add_n_8 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint8_t *dst_line, *dst; - int dst_stride; - int32_t w; - uint32_t src; - - __m128i xmm_src; - - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); - - src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); - - src >>= 24; - - if (src == 0x00) - return; - - if (src == 0xff) - { - pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride, - 8, dest_x, dest_y, width, height, 0xff); - - return; - } - - src = (src << 24) | (src << 16) | (src << 8) | src; - xmm_src = _mm_set_epi32 (src, src, src, src); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - w = width; - - while (w && ((unsigned long)dst & 15)) - { - *dst = (uint8_t)_mm_cvtsi128_si32 ( - _mm_adds_epu8 ( - xmm_src, - _mm_cvtsi32_si128 (*dst))); - - w--; - dst++; - } - - while (w >= 16) - { - save_128_aligned ( - (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst))); - - dst += 16; - w -= 16; - } - - while (w) - { - *dst = (uint8_t)_mm_cvtsi128_si32 ( - _mm_adds_epu8 ( - xmm_src, - _mm_cvtsi32_si128 (*dst))); - - w--; - dst++; - } - } - -} - -static void -sse2_composite_add_8_8 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint8_t *dst_line, *dst; - uint8_t *src_line, *src; - int dst_stride, src_stride; - 
int32_t w; - uint16_t t; - - PIXMAN_IMAGE_GET_LINE ( - src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); - - while (height--) - { - dst = dst_line; - src = src_line; - - dst_line += dst_stride; - src_line += src_stride; - w = width; - - /* Small head */ - while (w && (unsigned long)dst & 3) - { - t = (*dst) + (*src++); - *dst++ = t | (0 - (t >> 8)); - w--; - } - - sse2_combine_add_u (imp, op, - (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2); - - /* Small tail */ - dst += w & 0xfffc; - src += w & 0xfffc; - - w &= 3; - - while (w) - { - t = (*dst) + (*src++); - *dst++ = t | (0 - (t >> 8)); - w--; - } - } - -} - -static void -sse2_composite_add_8888_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t *dst_line, *dst; - uint32_t *src_line, *src; - int dst_stride, src_stride; - - PIXMAN_IMAGE_GET_LINE ( - src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - - sse2_combine_add_u (imp, op, dst, src, NULL, width); - } - -} - -static pixman_bool_t -pixman_blt_sse2 (uint32_t *src_bits, - uint32_t *dst_bits, - int src_stride, - int dst_stride, - int src_bpp, - int dst_bpp, - int src_x, - int src_y, - int dst_x, - int dst_y, - int width, - int height) -{ - uint8_t * src_bytes; - uint8_t * dst_bytes; - int byte_width; - - if (src_bpp != dst_bpp) - return FALSE; - - if (src_bpp == 16) - { - src_stride = src_stride * (int) sizeof (uint32_t) / 2; - dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; - src_bytes =(uint8_t 
*)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); - dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); - byte_width = 2 * width; - src_stride *= 2; - dst_stride *= 2; - } - else if (src_bpp == 32) - { - src_stride = src_stride * (int) sizeof (uint32_t) / 4; - dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; - src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); - dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); - byte_width = 4 * width; - src_stride *= 4; - dst_stride *= 4; - } - else - { - return FALSE; - } - - while (height--) - { - int w; - uint8_t *s = src_bytes; - uint8_t *d = dst_bytes; - src_bytes += src_stride; - dst_bytes += dst_stride; - w = byte_width; - - while (w >= 2 && ((unsigned long)d & 3)) - { - *(uint16_t *)d = *(uint16_t *)s; - w -= 2; - s += 2; - d += 2; - } - - while (w >= 4 && ((unsigned long)d & 15)) - { - *(uint32_t *)d = *(uint32_t *)s; - - w -= 4; - s += 4; - d += 4; - } - - while (w >= 64) - { - __m128i xmm0, xmm1, xmm2, xmm3; - - xmm0 = load_128_unaligned ((__m128i*)(s)); - xmm1 = load_128_unaligned ((__m128i*)(s + 16)); - xmm2 = load_128_unaligned ((__m128i*)(s + 32)); - xmm3 = load_128_unaligned ((__m128i*)(s + 48)); - - save_128_aligned ((__m128i*)(d), xmm0); - save_128_aligned ((__m128i*)(d + 16), xmm1); - save_128_aligned ((__m128i*)(d + 32), xmm2); - save_128_aligned ((__m128i*)(d + 48), xmm3); - - s += 64; - d += 64; - w -= 64; - } - - while (w >= 16) - { - save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) ); - - w -= 16; - d += 16; - s += 16; - } - - while (w >= 4) - { - *(uint32_t *)d = *(uint32_t *)s; - - w -= 4; - s += 4; - d += 4; - } - - if (w >= 2) - { - *(uint16_t *)d = *(uint16_t *)s; - w -= 2; - s += 2; - d += 2; - } - } - - - return TRUE; -} - -static void -sse2_composite_copy_area (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - 
pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - pixman_blt_sse2 (src_image->bits.bits, - dst_image->bits.bits, - src_image->bits.rowstride, - dst_image->bits.rowstride, - PIXMAN_FORMAT_BPP (src_image->bits.format), - PIXMAN_FORMAT_BPP (dst_image->bits.format), - src_x, src_y, dest_x, dest_y, width, height); -} - -static void -sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t *src, *src_line, s; - uint32_t *dst, *dst_line, d; - uint8_t *mask, *mask_line; - uint32_t m; - int src_stride, mask_stride, dst_stride; - int32_t w; - __m128i ms; - - __m128i xmm_src, xmm_src_lo, xmm_src_hi; - __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; - __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; - - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE ( - mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - PIXMAN_IMAGE_GET_LINE ( - src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - - while (height--) - { - src = src_line; - src_line += src_stride; - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - - w = width; - - while (w && (unsigned long)dst & 15) - { - s = 0xff000000 | *src++; - m = (uint32_t) *mask++; - d = *dst; - ms = unpack_32_1x128 (s); - - if (m != 0xff) - { - __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); - __m128i md = unpack_32_1x128 (d); - - ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md); - } - - *dst++ = pack_1x128_32 (ms); - w--; - } - - while (w >= 4) - { - m = *(uint32_t*) mask; - xmm_src = _mm_or_si128 ( - load_128_unaligned 
((__m128i*)src), mask_ff000000); - - if (m == 0xffffffff) - { - save_128_aligned ((__m128i*)dst, xmm_src); - } - else - { - xmm_dst = load_128_aligned ((__m128i*)dst); - - xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); - - unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); - unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); - - expand_alpha_rev_2x128 ( - xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); - - in_over_2x128 (&xmm_src_lo, &xmm_src_hi, - &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - } - - src += 4; - dst += 4; - mask += 4; - w -= 4; - } - - while (w) - { - m = (uint32_t) *mask++; - - if (m) - { - s = 0xff000000 | *src; - - if (m == 0xff) - { - *dst = s; - } - else - { - __m128i ma, md, ms; - - d = *dst; - - ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); - md = unpack_32_1x128 (d); - ms = unpack_32_1x128 (s); - - *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md)); - } - - } - - src++; - dst++; - w--; - } - } - -} - -static void -sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t *src, *src_line, s; - uint32_t *dst, *dst_line, d; - uint8_t *mask, *mask_line; - uint32_t m; - int src_stride, mask_stride, dst_stride; - int32_t w; - - __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; - __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; - __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; - - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE ( - mask_image, mask_x, mask_y, uint8_t, mask_stride, 
mask_line, 1); - PIXMAN_IMAGE_GET_LINE ( - src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - - while (height--) - { - src = src_line; - src_line += src_stride; - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - - w = width; - - while (w && (unsigned long)dst & 15) - { - uint32_t sa; - - s = *src++; - m = (uint32_t) *mask++; - d = *dst; - - sa = s >> 24; - - if (m) - { - if (sa == 0xff && m == 0xff) - { - *dst = s; - } - else - { - __m128i ms, md, ma, msa; - - ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); - ms = unpack_32_1x128 (s); - md = unpack_32_1x128 (d); - - msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); - - *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); - } - } - - dst++; - w--; - } - - while (w >= 4) - { - m = *(uint32_t *) mask; - - if (m) - { - xmm_src = load_128_unaligned ((__m128i*)src); - - if (m == 0xffffffff && is_opaque (xmm_src)) - { - save_128_aligned ((__m128i *)dst, xmm_src); - } - else - { - xmm_dst = load_128_aligned ((__m128i *)dst); - - xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); - - unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); - unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); - - expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); - expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); - - in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, - &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - } - } - - src += 4; - dst += 4; - mask += 4; - w -= 4; - } - - while (w) - { - uint32_t sa; - - s = *src++; - m = (uint32_t) *mask++; - d = *dst; - - sa = s >> 24; - - if (m) - { - if (sa == 0xff && m == 0xff) - { - *dst = s; - } - else - { - __m128i ms, md, ma, msa; - - ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); - ms = 
unpack_32_1x128 (s); - md = unpack_32_1x128 (d); - - msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); - - *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); - } - } - - dst++; - w--; - } - } - -} - -static void -sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t src; - uint32_t *dst_line, *dst; - __m128i xmm_src; - __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; - __m128i xmm_dsta_hi, xmm_dsta_lo; - int dst_stride; - int32_t w; - - src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); - - if (src == 0) - return; - - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - - xmm_src = expand_pixel_32_1x128 (src); - - while (height--) - { - dst = dst_line; - - dst_line += dst_stride; - w = width; - - while (w && (unsigned long)dst & 15) - { - __m128i vd; - - vd = unpack_32_1x128 (*dst); - - *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd), - xmm_src)); - w--; - dst++; - } - - while (w >= 4) - { - __m128i tmp_lo, tmp_hi; - - xmm_dst = load_128_aligned ((__m128i*)dst); - - unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); - expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi); - - tmp_lo = xmm_src; - tmp_hi = xmm_src; - - over_2x128 (&xmm_dst_lo, &xmm_dst_hi, - &xmm_dsta_lo, &xmm_dsta_hi, - &tmp_lo, &tmp_hi); - - save_128_aligned ( - (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi)); - - w -= 4; - dst += 4; - } - - while (w) - { - __m128i vd; - - vd = unpack_32_1x128 (*dst); - - *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd), - xmm_src)); - w--; - dst++; - } - - } - -} - -static void -sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * 
src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t *src, *src_line, s; - uint32_t *dst, *dst_line, d; - uint32_t *mask, *mask_line; - uint32_t m; - int src_stride, mask_stride, dst_stride; - int32_t w; - - __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; - __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; - __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; - - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE ( - mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); - PIXMAN_IMAGE_GET_LINE ( - src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - - while (height--) - { - src = src_line; - src_line += src_stride; - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - - w = width; - - while (w && (unsigned long)dst & 15) - { - uint32_t sa; - - s = *src++; - m = (*mask++) >> 24; - d = *dst; - - sa = s >> 24; - - if (m) - { - if (sa == 0xff && m == 0xff) - { - *dst = s; - } - else - { - __m128i ms, md, ma, msa; - - ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); - ms = unpack_32_1x128 (s); - md = unpack_32_1x128 (d); - - msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); - - *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); - } - } - - dst++; - w--; - } - - while (w >= 4) - { - xmm_mask = load_128_unaligned ((__m128i*)mask); - - if (!is_transparent (xmm_mask)) - { - xmm_src = load_128_unaligned ((__m128i*)src); - - if (is_opaque (xmm_mask) && is_opaque (xmm_src)) - { - save_128_aligned ((__m128i *)dst, xmm_src); - } - else - { - xmm_dst = load_128_aligned ((__m128i *)dst); - - unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); - unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); - - 
expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); - expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); - - in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, - &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - } - } - - src += 4; - dst += 4; - mask += 4; - w -= 4; - } - - while (w) - { - uint32_t sa; - - s = *src++; - m = (*mask++) >> 24; - d = *dst; - - sa = s >> 24; - - if (m) - { - if (sa == 0xff && m == 0xff) - { - *dst = s; - } - else - { - __m128i ms, md, ma, msa; - - ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); - ms = unpack_32_1x128 (s); - md = unpack_32_1x128 (d); - - msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); - - *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); - } - } - - dst++; - w--; - } - } - -} - -/* A variant of 'sse2_combine_over_u' with minor tweaks */ -static force_inline void -scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, - const uint32_t* ps, - int32_t w, - pixman_fixed_t vx, - pixman_fixed_t unit_x, - pixman_fixed_t max_vx, - pixman_bool_t fully_transparent_src) -{ - uint32_t s, d; - const uint32_t* pm = NULL; - - __m128i xmm_dst_lo, xmm_dst_hi; - __m128i xmm_src_lo, xmm_src_hi; - __m128i xmm_alpha_lo, xmm_alpha_hi; - - if (fully_transparent_src) - return; - - /* Align dst on a 16-byte boundary */ - while (w && ((unsigned long)pd & 15)) - { - d = *pd; - s = combine1 (ps + (vx >> 16), pm); - vx += unit_x; - - *pd++ = core_combine_over_u_pixel_sse2 (s, d); - if (pm) - pm++; - w--; - } - - while (w >= 4) - { - __m128i tmp; - uint32_t tmp1, tmp2, tmp3, tmp4; - - tmp1 = ps[vx >> 16]; - vx += unit_x; - tmp2 = ps[vx >> 16]; - vx += unit_x; - tmp3 = ps[vx >> 16]; - vx += unit_x; - tmp4 = ps[vx >> 16]; - vx += unit_x; - - tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); - - xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm); - - if (is_opaque (xmm_src_hi)) - { - 
save_128_aligned ((__m128i*)pd, xmm_src_hi); - } - else if (!is_zero (xmm_src_hi)) - { - xmm_dst_hi = load_128_aligned ((__m128i*) pd); - - unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - - expand_alpha_2x128 ( - xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); - - over_2x128 (&xmm_src_lo, &xmm_src_hi, - &xmm_alpha_lo, &xmm_alpha_hi, - &xmm_dst_lo, &xmm_dst_hi); - - /* rebuid the 4 pixel data and save*/ - save_128_aligned ((__m128i*)pd, - pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - } - - w -= 4; - pd += 4; - if (pm) - pm += 4; - } - - while (w) - { - d = *pd; - s = combine1 (ps + (vx >> 16), pm); - vx += unit_x; - - *pd++ = core_combine_over_u_pixel_sse2 (s, d); - if (pm) - pm++; - - w--; - } -} - -FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER, - scaled_nearest_scanline_sse2_8888_8888_OVER, - uint32_t, uint32_t, COVER) -FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER, - scaled_nearest_scanline_sse2_8888_8888_OVER, - uint32_t, uint32_t, NONE) -FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER, - scaled_nearest_scanline_sse2_8888_8888_OVER, - uint32_t, uint32_t, PAD) - -static force_inline void -scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask, - uint32_t * dst, - const uint32_t * src, - int32_t w, - pixman_fixed_t vx, - pixman_fixed_t unit_x, - pixman_fixed_t max_vx, - pixman_bool_t zero_src) -{ - __m128i xmm_mask; - __m128i xmm_src, xmm_src_lo, xmm_src_hi; - __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; - __m128i xmm_alpha_lo, xmm_alpha_hi; - - if (zero_src || (*mask >> 24) == 0) - return; - - xmm_mask = create_mask_16_128 (*mask >> 24); - - while (w && (unsigned long)dst & 15) - { - uint32_t s = src[pixman_fixed_to_int (vx)]; - vx += unit_x; - - if (s) - { - uint32_t d = *dst; - - __m128i ms = unpack_32_1x128 (s); - __m128i alpha = expand_alpha_1x128 (ms); - __m128i dest = xmm_mask; - __m128i alpha_dst = unpack_32_1x128 (d); - - *dst = pack_1x128_32 ( - in_over_1x128 (&ms, 
&alpha, &dest, &alpha_dst)); - } - dst++; - w--; - } - - while (w >= 4) - { - uint32_t tmp1, tmp2, tmp3, tmp4; - - tmp1 = src[pixman_fixed_to_int (vx)]; - vx += unit_x; - tmp2 = src[pixman_fixed_to_int (vx)]; - vx += unit_x; - tmp3 = src[pixman_fixed_to_int (vx)]; - vx += unit_x; - tmp4 = src[pixman_fixed_to_int (vx)]; - vx += unit_x; - - xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); - - if (!is_zero (xmm_src)) - { - xmm_dst = load_128_aligned ((__m128i*)dst); - - unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); - expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, - &xmm_alpha_lo, &xmm_alpha_hi); - - in_over_2x128 (&xmm_src_lo, &xmm_src_hi, - &xmm_alpha_lo, &xmm_alpha_hi, - &xmm_mask, &xmm_mask, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ( - (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - } - - dst += 4; - w -= 4; - } - - while (w) - { - uint32_t s = src[pixman_fixed_to_int (vx)]; - vx += unit_x; - - if (s) - { - uint32_t d = *dst; - - __m128i ms = unpack_32_1x128 (s); - __m128i alpha = expand_alpha_1x128 (ms); - __m128i mask = xmm_mask; - __m128i dest = unpack_32_1x128 (d); - - *dst = pack_1x128_32 ( - in_over_1x128 (&ms, &alpha, &mask, &dest)); - } - - dst++; - w--; - } - -} - -FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER, - scaled_nearest_scanline_sse2_8888_n_8888_OVER, - uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE) -FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER, - scaled_nearest_scanline_sse2_8888_n_8888_OVER, - uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE) -FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, - scaled_nearest_scanline_sse2_8888_n_8888_OVER, - uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE) - -static void -bilinear_interpolate_line_sse2 (uint32_t * out, - const uint32_t * top, - const uint32_t * bottom, - int wt, - int wb, - pixman_fixed_t x, - pixman_fixed_t ux, - int width) -{ - const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, 
wt, wt, wt); - const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); - const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff); - const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1); - const __m128i xmm_ux = _mm_set_epi16 (ux, ux, ux, ux, ux, ux, ux, ux); - const __m128i xmm_zero = _mm_setzero_si128 (); - __m128i xmm_x = _mm_set_epi16 (x, x, x, x, x, x, x, x); - uint32_t pix1, pix2, pix3, pix4; - - #define INTERPOLATE_ONE_PIXEL(pix) \ - do { \ - __m128i xmm_wh, xmm_lo, xmm_hi, a; \ - /* fetch 2x2 pixel block into sse2 register */ \ - uint32_t tl = top [pixman_fixed_to_int (x)]; \ - uint32_t tr = top [pixman_fixed_to_int (x) + 1]; \ - uint32_t bl = bottom [pixman_fixed_to_int (x)]; \ - uint32_t br = bottom [pixman_fixed_to_int (x) + 1]; \ - a = _mm_set_epi32 (tr, tl, br, bl); \ - x += ux; \ - /* vertical interpolation */ \ - a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero), \ - xmm_wt), \ - _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero), \ - xmm_wb)); \ - /* calculate horizontal weights */ \ - xmm_wh = _mm_add_epi16 (xmm_addc, \ - _mm_xor_si128 (xmm_xorc, \ - _mm_srli_epi16 (xmm_x, 8))); \ - xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ - /* horizontal interpolation */ \ - xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \ - xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \ - a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \ - _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \ - /* shift and pack the result */ \ - a = _mm_srli_epi32 (a, 16); \ - a = _mm_packs_epi32 (a, a); \ - a = _mm_packus_epi16 (a, a); \ - pix = _mm_cvtsi128_si32 (a); \ - } while (0) - - while ((width -= 4) >= 0) - { - INTERPOLATE_ONE_PIXEL (pix1); - INTERPOLATE_ONE_PIXEL (pix2); - INTERPOLATE_ONE_PIXEL (pix3); - INTERPOLATE_ONE_PIXEL (pix4); - *out++ = pix1; - *out++ = pix2; - *out++ = pix3; - *out++ = pix4; - } - if (width & 2) - { - INTERPOLATE_ONE_PIXEL (pix1); - INTERPOLATE_ONE_PIXEL (pix2); - *out++ = pix1; - *out++ = pix2; - } - if (width & 1) - { - 
INTERPOLATE_ONE_PIXEL (pix1); - *out = pix1; - } - - #undef INTERPOLATE_ONE_PIXEL -} - -static force_inline void -scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst, - const uint32_t * mask, - const uint32_t * src_top, - const uint32_t * src_bottom, - int32_t w, - int wt, - int wb, - pixman_fixed_t vx, - pixman_fixed_t unit_x, - pixman_fixed_t max_vx, - pixman_bool_t zero_src) -{ - bilinear_interpolate_line_sse2 (dst, src_top, src_bottom, - wt, wb, vx, unit_x, w); -} - -FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC, - scaled_bilinear_scanline_sse2_8888_8888_SRC, - uint32_t, uint32_t, uint32_t, - COVER, FALSE, FALSE) -FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC, - scaled_bilinear_scanline_sse2_8888_8888_SRC, - uint32_t, uint32_t, uint32_t, - PAD, FALSE, FALSE) -FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC, - scaled_bilinear_scanline_sse2_8888_8888_SRC, - uint32_t, uint32_t, uint32_t, - NONE, FALSE, FALSE) - -static const pixman_fast_path_t sse2_fast_paths[] = -{ - /* PIXMAN_OP_OVER */ - PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565), - PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565), - PIXMAN_STD_FAST_PATH (OVER, 
solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888), - PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888), - PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888), - PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888), - PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888), - PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888), - PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888), - PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888), - PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888), - PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca), - PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, 
sse2_composite_over_n_8888_8888_ca), - PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca), - PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca), - PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca), - PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca), - PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888), - PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888), - PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888), - PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888), - PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565), - PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565), - PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), - PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), - - /* PIXMAN_OP_OVER_REVERSE */ - PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888), - PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888), - - /* PIXMAN_OP_ADD */ - PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca), - PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8), - PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888), - PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888), - PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8), - PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8), - - /* PIXMAN_OP_SRC */ - PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888), - PIXMAN_STD_FAST_PATH (SRC, solid, 
a8, x8r8g8b8, sse2_composite_src_n_8_8888), - PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888), - PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888), - PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888), - PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888), - PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area), - PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area), - PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), - PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), - PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), - PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), - PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area), - PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area), - - /* PIXMAN_OP_IN */ - PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8), - PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8), - PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8), - - SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), - SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), - SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), - SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), - SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), - SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), - SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), - SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), - SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), - SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, 
sse2_8888_8888), - SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), - SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), - - SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888), - SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888), - SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), - SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), - - SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888), - SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888), - SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888), - - { PIXMAN_OP_NONE }, -}; - -static pixman_bool_t -sse2_blt (pixman_implementation_t *imp, - uint32_t * src_bits, - uint32_t * dst_bits, - int src_stride, - int dst_stride, - int src_bpp, - int dst_bpp, - int src_x, - int src_y, - int dst_x, - int dst_y, - int width, - int height) -{ - if (!pixman_blt_sse2 ( - src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, - src_x, src_y, dst_x, dst_y, width, height)) - - { - return _pixman_implementation_blt ( - imp->delegate, - src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, - src_x, src_y, dst_x, dst_y, width, height); - } - - return TRUE; -} - -#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) -__attribute__((__force_align_arg_pointer__)) -#endif -static pixman_bool_t -sse2_fill (pixman_implementation_t *imp, - uint32_t * bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t xor) -{ - if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor)) - { - return _pixman_implementation_fill ( - imp->delegate, bits, stride, bpp, x, y, width, height, xor); - } - - return TRUE; -} - -static uint32_t * -sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask) -{ - int w = iter->width; - __m128i ff000000 = mask_ff000000; - uint32_t *dst 
= iter->buffer; - uint32_t *src = (uint32_t *)iter->bits; - - iter->bits += iter->stride; - - while (w && ((unsigned long)dst) & 0x0f) - { - *dst++ = (*src++) | 0xff000000; - w--; - } - - while (w >= 4) - { - save_128_aligned ( - (__m128i *)dst, _mm_or_si128 ( - load_128_unaligned ((__m128i *)src), ff000000)); - - dst += 4; - src += 4; - w -= 4; - } - - while (w) - { - *dst++ = (*src++) | 0xff000000; - w--; - } - - return iter->buffer; -} - -static uint32_t * -sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask) -{ - int w = iter->width; - uint32_t *dst = iter->buffer; - uint16_t *src = (uint16_t *)iter->bits; - __m128i ff000000 = mask_ff000000; - - iter->bits += iter->stride; - - while (w && ((unsigned long)dst) & 0x0f) - { - uint16_t s = *src++; - - *dst++ = CONVERT_0565_TO_8888 (s); - w--; - } - - while (w >= 8) - { - __m128i lo, hi, s; - - s = _mm_loadu_si128 ((__m128i *)src); - - lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ())); - hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ())); - - save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000)); - save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000)); - - dst += 8; - src += 8; - w -= 8; - } - - while (w) - { - uint16_t s = *src++; - - *dst++ = CONVERT_0565_TO_8888 (s); - w--; - } - - return iter->buffer; -} - -static uint32_t * -sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask) -{ - int w = iter->width; - uint32_t *dst = iter->buffer; - uint8_t *src = iter->bits; - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6; - - iter->bits += iter->stride; - - while (w && (((unsigned long)dst) & 15)) - { - *dst++ = *(src++) << 24; - w--; - } - - while (w >= 16) - { - xmm0 = _mm_loadu_si128((__m128i *)src); - - xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128(), xmm0); - xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128(), xmm0); - xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1); - xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1); - xmm5 = 
_mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2); - xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2); - - _mm_store_si128(((__m128i *)(dst + 0)), xmm3); - _mm_store_si128(((__m128i *)(dst + 4)), xmm4); - _mm_store_si128(((__m128i *)(dst + 8)), xmm5); - _mm_store_si128(((__m128i *)(dst + 12)), xmm6); - - dst += 16; - src += 16; - w -= 16; - } - - while (w) - { - *dst++ = *(src++) << 24; - w--; - } - - return iter->buffer; -} - -typedef struct -{ - pixman_format_code_t format; - pixman_iter_get_scanline_t get_scanline; -} fetcher_info_t; - -static const fetcher_info_t fetchers[] = -{ - { PIXMAN_x8r8g8b8, sse2_fetch_x8r8g8b8 }, - { PIXMAN_r5g6b5, sse2_fetch_r5g6b5 }, - { PIXMAN_a8, sse2_fetch_a8 }, - { PIXMAN_null } -}; - -static void -sse2_src_iter_init (pixman_implementation_t *imp, - pixman_iter_t *iter, - pixman_image_t *image, - int x, int y, int width, int height, - uint8_t *buffer, iter_flags_t flags) -{ -#define FLAGS \ - (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM) - - if ((flags & ITER_NARROW) && - (image->common.flags & FLAGS) == FLAGS && - x >= 0 && y >= 0 && - x + width <= image->bits.width && - y + height <= image->bits.height) - { - const fetcher_info_t *f; - - for (f = &fetchers[0]; f->format != PIXMAN_null; f++) - { - if (image->common.extended_format_code == f->format) - { - uint8_t *b = (uint8_t *)image->bits.bits; - int s = image->bits.rowstride * 4; - - iter->bits = b + s * y + x * PIXMAN_FORMAT_BPP (f->format) / 8; - iter->stride = s; - iter->width = width; - iter->buffer = (uint32_t *)buffer; - - iter->get_scanline = f->get_scanline; - return; - } - } - } - - _pixman_implementation_src_iter_init ( - imp->delegate, iter, image, x, y, width, height, buffer, flags); -} - -#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) -__attribute__((__force_align_arg_pointer__)) -#endif -pixman_implementation_t * -_pixman_implementation_create_sse2 (pixman_implementation_t *fallback) -{ - pixman_implementation_t *imp = 
_pixman_implementation_create (fallback, sse2_fast_paths); - - /* SSE2 constants */ - mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000); - mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000); - mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0); - mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f); - mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000); - mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00); - mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8); - mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0); - mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000); - mask_0080 = create_mask_16_128 (0x0080); - mask_00ff = create_mask_16_128 (0x00ff); - mask_0101 = create_mask_16_128 (0x0101); - mask_ffff = create_mask_16_128 (0xffff); - mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000); - mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000); - - /* Set up function pointers */ - imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u; - imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u; - imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u; - imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u; - imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u; - imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u; - imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u; - imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u; - imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u; - imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u; - - imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u; - - imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca; - imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca; - imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca; - imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca; - imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca; - 
imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca; - imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca; - imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca; - imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca; - imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca; - imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca; - - imp->blt = sse2_blt; - imp->fill = sse2_fill; - - imp->src_iter_init = sse2_src_iter_init; - - return imp; -} +/* + * Copyright © 2008 Rodrigo Kumpera + * Copyright © 2008 André Tupinambá + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Red Hat not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. Red Hat makes no representations about the + * suitability of this software for any purpose. It is provided "as is" + * without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. 
+ * + * Author: Rodrigo Kumpera (kumpera@gmail.com) + * André Tupinambá (andrelrt@gmail.com) + * + * Based on work by Owen Taylor and Søren Sandmann + */ +#ifdef HAVE_CONFIG_H +#include +#endif + +#include /* for _mm_shuffle_pi16 and _MM_SHUFFLE */ +#include /* for SSE2 intrinsics */ +#include "pixman-private.h" +#include "pixman-combine32.h" +#include "pixman-fast-path.h" + +static __m128i mask_0080; +static __m128i mask_00ff; +static __m128i mask_0101; +static __m128i mask_ffff; +static __m128i mask_ff000000; +static __m128i mask_alpha; + +static __m128i mask_565_r; +static __m128i mask_565_g1, mask_565_g2; +static __m128i mask_565_b; +static __m128i mask_red; +static __m128i mask_green; +static __m128i mask_blue; + +static __m128i mask_565_fix_rb; +static __m128i mask_565_fix_g; + +static force_inline __m128i +unpack_32_1x128 (uint32_t data) +{ + return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ()); +} + +static force_inline void +unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi) +{ + *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ()); + *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ()); +} + +static force_inline __m128i +unpack_565_to_8888 (__m128i lo) +{ + __m128i r, g, b, rb, t; + + r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red); + g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green); + b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue); + + rb = _mm_or_si128 (r, b); + t = _mm_and_si128 (rb, mask_565_fix_rb); + t = _mm_srli_epi32 (t, 5); + rb = _mm_or_si128 (rb, t); + + t = _mm_and_si128 (g, mask_565_fix_g); + t = _mm_srli_epi32 (t, 6); + g = _mm_or_si128 (g, t); + + return _mm_or_si128 (rb, g); +} + +static force_inline void +unpack_565_128_4x128 (__m128i data, + __m128i* data0, + __m128i* data1, + __m128i* data2, + __m128i* data3) +{ + __m128i lo, hi; + + lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ()); + hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ()); + + lo = unpack_565_to_8888 
(lo); + hi = unpack_565_to_8888 (hi); + + unpack_128_2x128 (lo, data0, data1); + unpack_128_2x128 (hi, data2, data3); +} + +static force_inline uint16_t +pack_565_32_16 (uint32_t pixel) +{ + return (uint16_t) (((pixel >> 8) & 0xf800) | + ((pixel >> 5) & 0x07e0) | + ((pixel >> 3) & 0x001f)); +} + +static force_inline __m128i +pack_2x128_128 (__m128i lo, __m128i hi) +{ + return _mm_packus_epi16 (lo, hi); +} + +static force_inline __m128i +pack_565_2x128_128 (__m128i lo, __m128i hi) +{ + __m128i data; + __m128i r, g1, g2, b; + + data = pack_2x128_128 (lo, hi); + + r = _mm_and_si128 (data, mask_565_r); + g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1); + g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2); + b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b); + + return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b); +} + +static force_inline __m128i +pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3) +{ + return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1), + pack_565_2x128_128 (*xmm2, *xmm3)); +} + +static force_inline int +is_opaque (__m128i x) +{ + __m128i ffs = _mm_cmpeq_epi8 (x, x); + + return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888; +} + +static force_inline int +is_zero (__m128i x) +{ + return _mm_movemask_epi8 ( + _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff; +} + +static force_inline int +is_transparent (__m128i x) +{ + return (_mm_movemask_epi8 ( + _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888; +} + +static force_inline __m128i +expand_pixel_32_1x128 (uint32_t data) +{ + return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0)); +} + +static force_inline __m128i +expand_alpha_1x128 (__m128i data) +{ + return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, + _MM_SHUFFLE (3, 3, 3, 3)), + _MM_SHUFFLE (3, 3, 3, 3)); +} + +static force_inline void +expand_alpha_2x128 (__m128i data_lo, + __m128i data_hi, + __m128i* alpha_lo, + 
__m128i* alpha_hi) +{ + __m128i lo, hi; + + lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3)); + hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3)); + + *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3)); + *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3)); +} + +static force_inline void +expand_alpha_rev_2x128 (__m128i data_lo, + __m128i data_hi, + __m128i* alpha_lo, + __m128i* alpha_hi) +{ + __m128i lo, hi; + + lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0)); + hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0)); + *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0)); + *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0)); +} + +static force_inline void +pix_multiply_2x128 (__m128i* data_lo, + __m128i* data_hi, + __m128i* alpha_lo, + __m128i* alpha_hi, + __m128i* ret_lo, + __m128i* ret_hi) +{ + __m128i lo, hi; + + lo = _mm_mullo_epi16 (*data_lo, *alpha_lo); + hi = _mm_mullo_epi16 (*data_hi, *alpha_hi); + lo = _mm_adds_epu16 (lo, mask_0080); + hi = _mm_adds_epu16 (hi, mask_0080); + *ret_lo = _mm_mulhi_epu16 (lo, mask_0101); + *ret_hi = _mm_mulhi_epu16 (hi, mask_0101); +} + +static force_inline void +pix_add_multiply_2x128 (__m128i* src_lo, + __m128i* src_hi, + __m128i* alpha_dst_lo, + __m128i* alpha_dst_hi, + __m128i* dst_lo, + __m128i* dst_hi, + __m128i* alpha_src_lo, + __m128i* alpha_src_hi, + __m128i* ret_lo, + __m128i* ret_hi) +{ + __m128i t1_lo, t1_hi; + __m128i t2_lo, t2_hi; + + pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi); + pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi); + + *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo); + *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi); +} + +static force_inline void +negate_2x128 (__m128i data_lo, + __m128i data_hi, + __m128i* neg_lo, + __m128i* neg_hi) +{ + *neg_lo = _mm_xor_si128 (data_lo, mask_00ff); + *neg_hi = _mm_xor_si128 (data_hi, mask_00ff); +} + +static force_inline void 
+invert_colors_2x128 (__m128i data_lo, + __m128i data_hi, + __m128i* inv_lo, + __m128i* inv_hi) +{ + __m128i lo, hi; + + lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2)); + hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2)); + *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2)); + *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2)); +} + +static force_inline void +over_2x128 (__m128i* src_lo, + __m128i* src_hi, + __m128i* alpha_lo, + __m128i* alpha_hi, + __m128i* dst_lo, + __m128i* dst_hi) +{ + __m128i t1, t2; + + negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2); + + pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi); + + *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo); + *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi); +} + +static force_inline void +over_rev_non_pre_2x128 (__m128i src_lo, + __m128i src_hi, + __m128i* dst_lo, + __m128i* dst_hi) +{ + __m128i lo, hi; + __m128i alpha_lo, alpha_hi; + + expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi); + + lo = _mm_or_si128 (alpha_lo, mask_alpha); + hi = _mm_or_si128 (alpha_hi, mask_alpha); + + invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi); + + pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi); + + over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi); +} + +static force_inline void +in_over_2x128 (__m128i* src_lo, + __m128i* src_hi, + __m128i* alpha_lo, + __m128i* alpha_hi, + __m128i* mask_lo, + __m128i* mask_hi, + __m128i* dst_lo, + __m128i* dst_hi) +{ + __m128i s_lo, s_hi; + __m128i a_lo, a_hi; + + pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi); + pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi); + + over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi); +} + +/* load 4 pixels from a 16-byte boundary aligned address */ +static force_inline __m128i +load_128_aligned (__m128i* src) +{ + return _mm_load_si128 (src); +} + +/* load 4 pixels from a unaligned address */ +static force_inline __m128i 
+load_128_unaligned (const __m128i* src) +{ + return _mm_loadu_si128 (src); +} + +/* save 4 pixels using Write Combining memory on a 16-byte + * boundary aligned address + */ +static force_inline void +save_128_write_combining (__m128i* dst, + __m128i data) +{ + _mm_stream_si128 (dst, data); +} + +/* save 4 pixels on a 16-byte boundary aligned address */ +static force_inline void +save_128_aligned (__m128i* dst, + __m128i data) +{ + _mm_store_si128 (dst, data); +} + +/* save 4 pixels on a unaligned address */ +static force_inline void +save_128_unaligned (__m128i* dst, + __m128i data) +{ + _mm_storeu_si128 (dst, data); +} + +static force_inline __m128i +load_32_1x128 (uint32_t data) +{ + return _mm_cvtsi32_si128 (data); +} + +static force_inline __m128i +expand_alpha_rev_1x128 (__m128i data) +{ + return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0)); +} + +static force_inline __m128i +expand_pixel_8_1x128 (uint8_t data) +{ + return _mm_shufflelo_epi16 ( + unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0)); +} + +static force_inline __m128i +pix_multiply_1x128 (__m128i data, + __m128i alpha) +{ + return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha), + mask_0080), + mask_0101); +} + +static force_inline __m128i +pix_add_multiply_1x128 (__m128i* src, + __m128i* alpha_dst, + __m128i* dst, + __m128i* alpha_src) +{ + __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst); + __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src); + + return _mm_adds_epu8 (t1, t2); +} + +static force_inline __m128i +negate_1x128 (__m128i data) +{ + return _mm_xor_si128 (data, mask_00ff); +} + +static force_inline __m128i +invert_colors_1x128 (__m128i data) +{ + return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2)); +} + +static force_inline __m128i +over_1x128 (__m128i src, __m128i alpha, __m128i dst) +{ + return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha))); +} + +static force_inline __m128i +in_over_1x128 (__m128i* src, __m128i* alpha, 
__m128i* mask, __m128i* dst) +{ + return over_1x128 (pix_multiply_1x128 (*src, *mask), + pix_multiply_1x128 (*alpha, *mask), + *dst); +} + +static force_inline __m128i +over_rev_non_pre_1x128 (__m128i src, __m128i dst) +{ + __m128i alpha = expand_alpha_1x128 (src); + + return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src), + _mm_or_si128 (alpha, mask_alpha)), + alpha, + dst); +} + +static force_inline uint32_t +pack_1x128_32 (__m128i data) +{ + return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ())); +} + +static force_inline __m128i +expand565_16_1x128 (uint16_t pixel) +{ + __m128i m = _mm_cvtsi32_si128 (pixel); + + m = unpack_565_to_8888 (m); + + return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ()); +} + +static force_inline uint32_t +core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst) +{ + uint8_t a; + __m128i xmms; + + a = src >> 24; + + if (a == 0xff) + { + return src; + } + else if (src) + { + xmms = unpack_32_1x128 (src); + return pack_1x128_32 ( + over_1x128 (xmms, expand_alpha_1x128 (xmms), + unpack_32_1x128 (dst))); + } + + return dst; +} + +static force_inline uint32_t +combine1 (const uint32_t *ps, const uint32_t *pm) +{ + uint32_t s = *ps; + + if (pm) + { + __m128i ms, mm; + + mm = unpack_32_1x128 (*pm); + mm = expand_alpha_1x128 (mm); + + ms = unpack_32_1x128 (s); + ms = pix_multiply_1x128 (ms, mm); + + s = pack_1x128_32 (ms); + } + + return s; +} + +static force_inline __m128i +combine4 (const __m128i *ps, const __m128i *pm) +{ + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_msk_lo, xmm_msk_hi; + __m128i s; + + if (pm) + { + xmm_msk_lo = load_128_unaligned (pm); + + if (is_transparent (xmm_msk_lo)) + return _mm_setzero_si128 (); + } + + s = load_128_unaligned (ps); + + if (pm) + { + unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi); + + expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + 
&xmm_msk_lo, &xmm_msk_hi, + &xmm_src_lo, &xmm_src_hi); + + s = pack_2x128_128 (xmm_src_lo, xmm_src_hi); + } + + return s; +} + +static force_inline void +core_combine_over_u_sse2_mask (uint32_t * pd, + const uint32_t* ps, + const uint32_t* pm, + int w) +{ + uint32_t s, d; + + /* Align dst on a 16-byte boundary */ + while (w && ((unsigned long)pd & 15)) + { + d = *pd; + s = combine1 (ps, pm); + + if (s) + *pd = core_combine_over_u_pixel_sse2 (s, d); + pd++; + ps++; + pm++; + w--; + } + + while (w >= 4) + { + __m128i mask = load_128_unaligned ((__m128i *)pm); + + if (!is_zero (mask)) + { + __m128i src; + __m128i src_hi, src_lo; + __m128i mask_hi, mask_lo; + __m128i alpha_hi, alpha_lo; + + src = load_128_unaligned ((__m128i *)ps); + + if (is_opaque (_mm_and_si128 (src, mask))) + { + save_128_aligned ((__m128i *)pd, src); + } + else + { + __m128i dst = load_128_aligned ((__m128i *)pd); + __m128i dst_hi, dst_lo; + + unpack_128_2x128 (mask, &mask_lo, &mask_hi); + unpack_128_2x128 (src, &src_lo, &src_hi); + + expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi); + pix_multiply_2x128 (&src_lo, &src_hi, + &mask_lo, &mask_hi, + &src_lo, &src_hi); + + unpack_128_2x128 (dst, &dst_lo, &dst_hi); + + expand_alpha_2x128 (src_lo, src_hi, + &alpha_lo, &alpha_hi); + + over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi, + &dst_lo, &dst_hi); + + save_128_aligned ( + (__m128i *)pd, + pack_2x128_128 (dst_lo, dst_hi)); + } + } + + pm += 4; + ps += 4; + pd += 4; + w -= 4; + } + while (w) + { + d = *pd; + s = combine1 (ps, pm); + + if (s) + *pd = core_combine_over_u_pixel_sse2 (s, d); + pd++; + ps++; + pm++; + + w--; + } +} + +static force_inline void +core_combine_over_u_sse2_no_mask (uint32_t * pd, + const uint32_t* ps, + int w) +{ + uint32_t s, d; + + /* Align dst on a 16-byte boundary */ + while (w && ((unsigned long)pd & 15)) + { + d = *pd; + s = *ps; + + if (s) + *pd = core_combine_over_u_pixel_sse2 (s, d); + pd++; + ps++; + w--; + } + + while (w >= 4) + { + __m128i src; + 
__m128i src_hi, src_lo, dst_hi, dst_lo; + __m128i alpha_hi, alpha_lo; + + src = load_128_unaligned ((__m128i *)ps); + + if (!is_zero (src)) + { + if (is_opaque (src)) + { + save_128_aligned ((__m128i *)pd, src); + } + else + { + __m128i dst = load_128_aligned ((__m128i *)pd); + + unpack_128_2x128 (src, &src_lo, &src_hi); + unpack_128_2x128 (dst, &dst_lo, &dst_hi); + + expand_alpha_2x128 (src_lo, src_hi, + &alpha_lo, &alpha_hi); + over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi, + &dst_lo, &dst_hi); + + save_128_aligned ( + (__m128i *)pd, + pack_2x128_128 (dst_lo, dst_hi)); + } + } + + ps += 4; + pd += 4; + w -= 4; + } + while (w) + { + d = *pd; + s = *ps; + + if (s) + *pd = core_combine_over_u_pixel_sse2 (s, d); + pd++; + ps++; + + w--; + } +} + +static force_inline void +sse2_combine_over_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + if (pm) + core_combine_over_u_sse2_mask (pd, ps, pm, w); + else + core_combine_over_u_sse2_no_mask (pd, ps, w); +} + +static void +sse2_combine_over_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, d; + + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_alpha_lo, xmm_alpha_hi; + + /* Align dst on a 16-byte boundary */ + while (w && + ((unsigned long)pd & 15)) + { + d = *pd; + s = combine1 (ps, pm); + + *pd++ = core_combine_over_u_pixel_sse2 (d, s); + w--; + ps++; + if (pm) + pm++; + } + + while (w >= 4) + { + /* I'm loading unaligned because I'm not sure + * about the address alignment. 
+ */ + xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + over_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_src_lo, &xmm_src_hi); + + /* rebuid the 4 pixel data and save*/ + save_128_aligned ((__m128i*)pd, + pack_2x128_128 (xmm_src_lo, xmm_src_hi)); + + w -= 4; + ps += 4; + pd += 4; + + if (pm) + pm += 4; + } + + while (w) + { + d = *pd; + s = combine1 (ps, pm); + + *pd++ = core_combine_over_u_pixel_sse2 (d, s); + ps++; + w--; + if (pm) + pm++; + } +} + +static force_inline uint32_t +core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst) +{ + uint32_t maska = src >> 24; + + if (maska == 0) + { + return 0; + } + else if (maska != 0xff) + { + return pack_1x128_32 ( + pix_multiply_1x128 (unpack_32_1x128 (dst), + expand_alpha_1x128 (unpack_32_1x128 (src)))); + } + + return dst; +} + +static void +sse2_combine_in_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, d; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + + while (w && ((unsigned long) pd & 15)) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_in_u_pixel_sse2 (d, s); + w--; + ps++; + if (pm) + pm++; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ((__m128i*)pd, + pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + 
ps += 4; + pd += 4; + w -= 4; + if (pm) + pm += 4; + } + + while (w) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_in_u_pixel_sse2 (d, s); + w--; + ps++; + if (pm) + pm++; + } +} + +static void +sse2_combine_in_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, d; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + + while (w && ((unsigned long) pd & 15)) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_in_u_pixel_sse2 (s, d); + ps++; + w--; + if (pm) + pm++; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_src_lo, &xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + w -= 4; + if (pm) + pm += 4; + } + + while (w) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_in_u_pixel_sse2 (s, d); + w--; + ps++; + if (pm) + pm++; + } +} + +static void +sse2_combine_out_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + while (w && ((unsigned long) pd & 15)) + { + uint32_t s = combine1 (ps, pm); + uint32_t d = *pd; + + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (d), negate_1x128 ( + expand_alpha_1x128 (unpack_32_1x128 (s))))); + + if (pm) + pm++; + ps++; + w--; + } + + while (w >= 4) + { + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + + xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + + unpack_128_2x128 
(xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_src_lo, &xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + if (pm) + pm += 4; + + w -= 4; + } + + while (w) + { + uint32_t s = combine1 (ps, pm); + uint32_t d = *pd; + + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (d), negate_1x128 ( + expand_alpha_1x128 (unpack_32_1x128 (s))))); + ps++; + if (pm) + pm++; + w--; + } +} + +static void +sse2_combine_out_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + while (w && ((unsigned long) pd & 15)) + { + uint32_t s = combine1 (ps, pm); + uint32_t d = *pd; + + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (s), negate_1x128 ( + expand_alpha_1x128 (unpack_32_1x128 (d))))); + w--; + ps++; + if (pm) + pm++; + } + + while (w >= 4) + { + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + + xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + w -= 4; + if (pm) + pm += 4; + } + + while (w) + { + uint32_t s = combine1 (ps, pm); + uint32_t d = *pd; + + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + 
unpack_32_1x128 (s), negate_1x128 ( + expand_alpha_1x128 (unpack_32_1x128 (d))))); + w--; + ps++; + if (pm) + pm++; + } +} + +static force_inline uint32_t +core_combine_atop_u_pixel_sse2 (uint32_t src, + uint32_t dst) +{ + __m128i s = unpack_32_1x128 (src); + __m128i d = unpack_32_1x128 (dst); + + __m128i sa = negate_1x128 (expand_alpha_1x128 (s)); + __m128i da = expand_alpha_1x128 (d); + + return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa)); +} + +static void +sse2_combine_atop_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, d; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + + while (w && ((unsigned long) pd & 15)) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_atop_u_pixel_sse2 (s, d); + w--; + ps++; + if (pm) + pm++; + } + + while (w >= 4) + { + xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + + pix_add_multiply_2x128 ( + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + w -= 4; + if (pm) + pm += 4; + } + + while (w) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_atop_u_pixel_sse2 (s, d); + w--; + ps++; + if (pm) + pm++; + } +} + +static force_inline uint32_t 
+core_combine_reverse_atop_u_pixel_sse2 (uint32_t src, + uint32_t dst) +{ + __m128i s = unpack_32_1x128 (src); + __m128i d = unpack_32_1x128 (dst); + + __m128i sa = expand_alpha_1x128 (s); + __m128i da = negate_1x128 (expand_alpha_1x128 (d)); + + return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa)); +} + +static void +sse2_combine_atop_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, d; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + + while (w && ((unsigned long) pd & 15)) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d); + ps++; + w--; + if (pm) + pm++; + } + + while (w >= 4) + { + xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_add_multiply_2x128 ( + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + w -= 4; + if (pm) + pm += 4; + } + + while (w) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d); + ps++; + w--; + if (pm) + pm++; + } +} + +static force_inline uint32_t +core_combine_xor_u_pixel_sse2 (uint32_t src, + uint32_t dst) +{ + __m128i s = unpack_32_1x128 (src); + __m128i d = 
unpack_32_1x128 (dst); + + __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d)); + __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s)); + + return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s)); +} + +static void +sse2_combine_xor_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + int w = width; + uint32_t s, d; + uint32_t* pd = dst; + const uint32_t* ps = src; + const uint32_t* pm = mask; + + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + + while (w && ((unsigned long) pd & 15)) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_xor_u_pixel_sse2 (s, d); + w--; + ps++; + if (pm) + pm++; + } + + while (w >= 4) + { + xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm); + xmm_dst = load_128_aligned ((__m128i*) pd); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_add_multiply_2x128 ( + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + w -= 4; + if (pm) + pm += 4; + } + + while (w) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_xor_u_pixel_sse2 (s, d); + w--; + ps++; + if (pm) + pm++; + } +} + +static force_inline void +sse2_combine_add_u (pixman_implementation_t 
*imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + int w = width; + uint32_t s, d; + uint32_t* pd = dst; + const uint32_t* ps = src; + const uint32_t* pm = mask; + + while (w && (unsigned long)pd & 15) + { + s = combine1 (ps, pm); + d = *pd; + + ps++; + if (pm) + pm++; + *pd++ = _mm_cvtsi128_si32 ( + _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d))); + w--; + } + + while (w >= 4) + { + __m128i s; + + s = combine4 ((__m128i*)ps, (__m128i*)pm); + + save_128_aligned ( + (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd))); + + pd += 4; + ps += 4; + if (pm) + pm += 4; + w -= 4; + } + + while (w--) + { + s = combine1 (ps, pm); + d = *pd; + + ps++; + *pd++ = _mm_cvtsi128_si32 ( + _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d))); + if (pm) + pm++; + } +} + +static force_inline uint32_t +core_combine_saturate_u_pixel_sse2 (uint32_t src, + uint32_t dst) +{ + __m128i ms = unpack_32_1x128 (src); + __m128i md = unpack_32_1x128 (dst); + uint32_t sa = src >> 24; + uint32_t da = ~dst >> 24; + + if (sa > da) + { + ms = pix_multiply_1x128 ( + ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24))); + } + + return pack_1x128_32 (_mm_adds_epu16 (md, ms)); +} + +static void +sse2_combine_saturate_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, d; + + uint32_t pack_cmp; + __m128i xmm_src, xmm_dst; + + while (w && (unsigned long)pd & 15) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); + w--; + ps++; + if (pm) + pm++; + } + + while (w >= 4) + { + xmm_dst = load_128_aligned ((__m128i*)pd); + xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm); + + pack_cmp = _mm_movemask_epi8 ( + _mm_cmpgt_epi32 ( + _mm_srli_epi32 (xmm_src, 24), + _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24))); + + /* if some alpha src is grater than respective ~alpha 
dst */ + if (pack_cmp) + { + s = combine1 (ps++, pm); + d = *pd; + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); + if (pm) + pm++; + + s = combine1 (ps++, pm); + d = *pd; + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); + if (pm) + pm++; + + s = combine1 (ps++, pm); + d = *pd; + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); + if (pm) + pm++; + + s = combine1 (ps++, pm); + d = *pd; + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); + if (pm) + pm++; + } + else + { + save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src)); + + pd += 4; + ps += 4; + if (pm) + pm += 4; + } + + w -= 4; + } + + while (w--) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); + ps++; + if (pm) + pm++; + } +} + +static void +sse2_combine_src_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, m; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + + while (w && (unsigned long)pd & 15) + { + s = *ps++; + m = *pm++; + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m))); + w--; + } + + while (w >= 4) + { + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m))); + w--; + } +} + +static force_inline uint32_t +core_combine_over_ca_pixel_sse2 (uint32_t src, + uint32_t mask, + uint32_t dst) +{ + __m128i s = unpack_32_1x128 
(src); + __m128i expAlpha = expand_alpha_1x128 (s); + __m128i unpk_mask = unpack_32_1x128 (mask); + __m128i unpk_dst = unpack_32_1x128 (dst); + + return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst)); +} + +static void +sse2_combine_over_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (unsigned long)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d); + w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d); + w--; + } +} + +static force_inline uint32_t +core_combine_over_reverse_ca_pixel_sse2 (uint32_t src, + uint32_t mask, + uint32_t dst) +{ + __m128i d = unpack_32_1x128 (dst); + + return pack_1x128_32 ( + over_1x128 (d, expand_alpha_1x128 (d), + pix_multiply_1x128 (unpack_32_1x128 (src), + unpack_32_1x128 (mask)))); +} + +static void +sse2_combine_over_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + 
uint32_t s, m, d; + + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (unsigned long)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d); + w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + over_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask_lo, &xmm_mask_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d); + w--; + } +} + +static void +sse2_combine_in_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (unsigned long)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)), + expand_alpha_1x128 (unpack_32_1x128 (d)))); + + w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + 
unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + pix_multiply_1x128 ( + unpack_32_1x128 (s), unpack_32_1x128 (m)), + expand_alpha_1x128 (unpack_32_1x128 (d)))); + + w--; + } +} + +static void +sse2_combine_in_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (unsigned long)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (d), + pix_multiply_1x128 (unpack_32_1x128 (m), + expand_alpha_1x128 (unpack_32_1x128 (s))))); + w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + 
pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (d), + pix_multiply_1x128 (unpack_32_1x128 (m), + expand_alpha_1x128 (unpack_32_1x128 (s))))); + w--; + } +} + +static void +sse2_combine_out_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (unsigned long)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + pix_multiply_1x128 ( + unpack_32_1x128 (s), unpack_32_1x128 (m)), + negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d))))); + w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + negate_2x128 (xmm_alpha_lo, xmm_alpha_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = 
pack_1x128_32 ( + pix_multiply_1x128 ( + pix_multiply_1x128 ( + unpack_32_1x128 (s), unpack_32_1x128 (m)), + negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d))))); + + w--; + } +} + +static void +sse2_combine_out_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (unsigned long)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (d), + negate_1x128 (pix_multiply_1x128 ( + unpack_32_1x128 (m), + expand_alpha_1x128 (unpack_32_1x128 (s)))))); + w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask_lo, &xmm_mask_hi); + + negate_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (d), + negate_1x128 (pix_multiply_1x128 ( + unpack_32_1x128 (m), + expand_alpha_1x128 (unpack_32_1x128 (s)))))); + w--; + } +} + +static force_inline uint32_t +core_combine_atop_ca_pixel_sse2 (uint32_t src, + uint32_t 
mask, + uint32_t dst) +{ + __m128i m = unpack_32_1x128 (mask); + __m128i s = unpack_32_1x128 (src); + __m128i d = unpack_32_1x128 (dst); + __m128i sa = expand_alpha_1x128 (s); + __m128i da = expand_alpha_1x128 (d); + + s = pix_multiply_1x128 (s, m); + m = negate_1x128 (pix_multiply_1x128 (m, sa)); + + return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da)); +} + +static void +sse2_combine_atop_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (unsigned long)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); + w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi); + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_mask_lo, &xmm_mask_hi); + + negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + pix_add_multiply_2x128 ( + &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps 
+= 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); + w--; + } +} + +static force_inline uint32_t +core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src, + uint32_t mask, + uint32_t dst) +{ + __m128i m = unpack_32_1x128 (mask); + __m128i s = unpack_32_1x128 (src); + __m128i d = unpack_32_1x128 (dst); + + __m128i da = negate_1x128 (expand_alpha_1x128 (d)); + __m128i sa = expand_alpha_1x128 (s); + + s = pix_multiply_1x128 (s, m); + m = pix_multiply_1x128 (m, sa); + + return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da)); +} + +static void +sse2_combine_atop_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (unsigned long)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); + w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi); + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_mask_lo, &xmm_mask_hi); + + negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, + 
&xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_add_multiply_2x128 ( + &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); + w--; + } +} + +static force_inline uint32_t +core_combine_xor_ca_pixel_sse2 (uint32_t src, + uint32_t mask, + uint32_t dst) +{ + __m128i a = unpack_32_1x128 (mask); + __m128i s = unpack_32_1x128 (src); + __m128i d = unpack_32_1x128 (dst); + + __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 ( + a, expand_alpha_1x128 (s))); + __m128i dest = pix_multiply_1x128 (s, a); + __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d)); + + return pack_1x128_32 (pix_add_multiply_1x128 (&d, + &alpha_dst, + &dest, + &alpha_src)); +} + +static void +sse2_combine_xor_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (unsigned long)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); + w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, 
xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi); + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_mask_lo, &xmm_mask_hi); + + negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + negate_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + pix_add_multiply_2x128 ( + &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); + w--; + } +} + +static void +sse2_combine_add_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (unsigned long)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x128_32 ( + _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s), + unpack_32_1x128 (m)), + unpack_32_1x128 (d))); + w--; + } + + while (w >= 4) + { + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 ( + _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo), + _mm_adds_epu8 (xmm_src_hi, 
xmm_dst_hi))); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x128_32 ( + _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s), + unpack_32_1x128 (m)), + unpack_32_1x128 (d))); + w--; + } +} + +static force_inline __m128i +create_mask_16_128 (uint16_t mask) +{ + return _mm_set1_epi16 (mask); +} + +/* Work around a code generation bug in Sun Studio 12. */ +#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590) +# define create_mask_2x32_128(mask0, mask1) \ + (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1))) +#else +static force_inline __m128i +create_mask_2x32_128 (uint32_t mask0, + uint32_t mask1) +{ + return _mm_set_epi32 (mask0, mask1, mask0, mask1); +} +#endif + +static void +sse2_composite_over_n_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + uint32_t *dst_line, *dst, d; + int32_t w; + int dst_stride; + __m128i xmm_src, xmm_alpha; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + + xmm_src = expand_pixel_32_1x128 (src); + xmm_alpha = expand_alpha_1x128 (xmm_src); + + while (height--) + { + dst = dst_line; + + dst_line += dst_stride; + w = width; + + while (w && (unsigned long)dst & 15) + { + d = *dst; + *dst++ = pack_1x128_32 (over_1x128 (xmm_src, + xmm_alpha, + unpack_32_1x128 (d))); + w--; + } + + while (w >= 4) + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_dst_lo, &xmm_dst_hi); + + /* rebuid the 4 pixel data and 
save*/ + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + w -= 4; + dst += 4; + } + + while (w) + { + d = *dst; + *dst++ = pack_1x128_32 (over_1x128 (xmm_src, + xmm_alpha, + unpack_32_1x128 (d))); + w--; + } + + } +} + +static void +sse2_composite_over_n_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + uint16_t *dst_line, *dst, d; + int32_t w; + int dst_stride; + __m128i xmm_src, xmm_alpha; + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + + xmm_src = expand_pixel_32_1x128 (src); + xmm_alpha = expand_alpha_1x128 (xmm_src); + + while (height--) + { + dst = dst_line; + + dst_line += dst_stride; + w = width; + + while (w && (unsigned long)dst & 15) + { + d = *dst; + + *dst++ = pack_565_32_16 ( + pack_1x128_32 (over_1x128 (xmm_src, + xmm_alpha, + expand565_16_1x128 (d)))); + w--; + } + + while (w >= 8) + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_565_128_4x128 (xmm_dst, + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + + over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_dst0, &xmm_dst1); + over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_dst2, &xmm_dst3); + + xmm_dst = pack_565_4x128_128 ( + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + + save_128_aligned ((__m128i*)dst, xmm_dst); + + dst += 8; + w -= 8; + } + + while (w--) + { + d = *dst; + *dst++ = pack_565_32_16 ( + pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha, + expand565_16_1x128 (d)))); + } + } + +} + +static void +sse2_composite_add_n_8888_8888_ca (pixman_implementation_t 
*imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint32_t *dst_line, d; + uint32_t *mask_line, m; + uint32_t pack_cmp; + int dst_stride, mask_stride; + + __m128i xmm_src, xmm_alpha; + __m128i xmm_dst; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + srca = src >> 24; + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + + xmm_src = _mm_unpacklo_epi8 ( + create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); + xmm_alpha = expand_alpha_1x128 (xmm_src); + mmx_src = xmm_src; + mmx_alpha = xmm_alpha; + + while (height--) + { + int w = width; + const uint32_t *pm = (uint32_t *)mask_line; + uint32_t *pd = (uint32_t *)dst_line; + + dst_line += dst_stride; + mask_line += mask_stride; + + while (w && (unsigned long)pd & 15) + { + m = *pm++; + + if (m) + { + d = *pd; + + mmx_mask = unpack_32_1x128 (m); + mmx_dest = unpack_32_1x128 (d); + + *pd = pack_1x128_32 ( + _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), + mmx_dest)); + } + + pd++; + w--; + } + + while (w >= 4) + { + xmm_mask = load_128_unaligned ((__m128i*)pm); + + pack_cmp = + _mm_movemask_epi8 ( + _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); + + /* if all bits in mask are zero, pack_cmp are equal to 0xffff */ + if (pack_cmp != 0xffff) + { + xmm_dst = load_128_aligned ((__m128i*)pd); + + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + pix_multiply_2x128 (&xmm_src, &xmm_src, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, 
xmm_mask_hi); + + save_128_aligned ( + (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst)); + } + + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + m = *pm++; + + if (m) + { + d = *pd; + + mmx_mask = unpack_32_1x128 (m); + mmx_dest = unpack_32_1x128 (d); + + *pd = pack_1x128_32 ( + _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), + mmx_dest)); + } + + pd++; + w--; + } + } + +} + +static void +sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + uint32_t *dst_line, d; + uint32_t *mask_line, m; + uint32_t pack_cmp; + int dst_stride, mask_stride; + + __m128i xmm_src, xmm_alpha; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + + xmm_src = _mm_unpacklo_epi8 ( + create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); + xmm_alpha = expand_alpha_1x128 (xmm_src); + mmx_src = xmm_src; + mmx_alpha = xmm_alpha; + + while (height--) + { + int w = width; + const uint32_t *pm = (uint32_t *)mask_line; + uint32_t *pd = (uint32_t *)dst_line; + + dst_line += dst_stride; + mask_line += mask_stride; + + while (w && (unsigned long)pd & 15) + { + m = *pm++; + + if (m) + { + d = *pd; + mmx_mask = unpack_32_1x128 (m); + mmx_dest = unpack_32_1x128 (d); + + *pd = pack_1x128_32 (in_over_1x128 (&mmx_src, + &mmx_alpha, + &mmx_mask, + &mmx_dest)); + } + + pd++; + w--; + } + + while (w >= 4) + { + xmm_mask = load_128_unaligned 
((__m128i*)pm); + + pack_cmp = + _mm_movemask_epi8 ( + _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); + + /* if all bits in mask are zero, pack_cmp are equal to 0xffff */ + if (pack_cmp != 0xffff) + { + xmm_dst = load_128_aligned ((__m128i*)pd); + + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + m = *pm++; + + if (m) + { + d = *pd; + mmx_mask = unpack_32_1x128 (m); + mmx_dest = unpack_32_1x128 (d); + + *pd = pack_1x128_32 ( + in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); + } + + pd++; + w--; + } + } + +} + +static void +sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + uint32_t mask; + int32_t w; + int dst_stride, src_stride; + + __m128i xmm_mask; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_lo, xmm_alpha_hi; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8); + + xmm_mask = create_mask_16_128 (mask >> 24); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w && (unsigned long)dst & 15) + { + uint32_t s = *src++; + + if (s) + { + uint32_t d = *dst; + + __m128i 
ms = unpack_32_1x128 (s); + __m128i alpha = expand_alpha_1x128 (ms); + __m128i dest = xmm_mask; + __m128i alpha_dst = unpack_32_1x128 (d); + + *dst = pack_1x128_32 ( + in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); + } + dst++; + w--; + } + + while (w >= 4) + { + xmm_src = load_128_unaligned ((__m128i*)src); + + if (!is_zero (xmm_src)) + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask, &xmm_mask, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + dst += 4; + src += 4; + w -= 4; + } + + while (w) + { + uint32_t s = *src++; + + if (s) + { + uint32_t d = *dst; + + __m128i ms = unpack_32_1x128 (s); + __m128i alpha = expand_alpha_1x128 (ms); + __m128i mask = xmm_mask; + __m128i dest = unpack_32_1x128 (d); + + *dst = pack_1x128_32 ( + in_over_1x128 (&ms, &alpha, &mask, &dest)); + } + + dst++; + w--; + } + } + +} + +static void +sse2_composite_src_x888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int32_t w; + int dst_stride, src_stride; + + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w && (unsigned long)dst & 15) + { + *dst++ = *src++ | 0xff000000; + w--; + } + + 
while (w >= 16) + { + __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4; + + xmm_src1 = load_128_unaligned ((__m128i*)src + 0); + xmm_src2 = load_128_unaligned ((__m128i*)src + 1); + xmm_src3 = load_128_unaligned ((__m128i*)src + 2); + xmm_src4 = load_128_unaligned ((__m128i*)src + 3); + + save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000)); + save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000)); + save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000)); + save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000)); + + dst += 16; + src += 16; + w -= 16; + } + + while (w) + { + *dst++ = *src++ | 0xff000000; + w--; + } + } + +} + +static void +sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + uint32_t mask; + int dst_stride, src_stride; + int32_t w; + + __m128i xmm_mask, xmm_alpha; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8); + + xmm_mask = create_mask_16_128 (mask >> 24); + xmm_alpha = mask_00ff; + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w && (unsigned long)dst & 15) + { + uint32_t s = (*src++) | 0xff000000; + uint32_t d = *dst; + + __m128i src = unpack_32_1x128 (s); + __m128i alpha = xmm_alpha; + __m128i mask = xmm_mask; + __m128i dest = unpack_32_1x128 (d); + + *dst++ = 
pack_1x128_32 ( + in_over_1x128 (&src, &alpha, &mask, &dest)); + + w--; + } + + while (w >= 4) + { + xmm_src = _mm_or_si128 ( + load_128_unaligned ((__m128i*)src), mask_ff000000); + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha, &xmm_alpha, + &xmm_mask, &xmm_mask, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + dst += 4; + src += 4; + w -= 4; + + } + + while (w) + { + uint32_t s = (*src++) | 0xff000000; + uint32_t d = *dst; + + __m128i src = unpack_32_1x128 (s); + __m128i alpha = xmm_alpha; + __m128i mask = xmm_mask; + __m128i dest = unpack_32_1x128 (d); + + *dst++ = pack_1x128_32 ( + in_over_1x128 (&src, &alpha, &mask, &dest)); + + w--; + } + } + +} + +static void +sse2_composite_over_8888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + int dst_stride, src_stride; + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + dst = dst_line; + src = src_line; + + while (height--) + { + sse2_combine_over_u (imp, op, dst, src, NULL, width); + + dst += dst_stride; + src += src_stride; + } +} + +static force_inline uint16_t +composite_over_8888_0565pixel (uint32_t src, uint16_t dst) +{ + __m128i ms; + + ms = unpack_32_1x128 (src); + return pack_565_32_16 ( + pack_1x128_32 ( + over_1x128 ( + ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst)))); +} + +static void +sse2_composite_over_8888_0565 (pixman_implementation_t 
*imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint16_t *dst_line, *dst, d; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + int32_t w; + + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + src = src_line; + + dst_line += dst_stride; + src_line += src_stride; + w = width; + + /* Align dst on a 16-byte boundary */ + while (w && + ((unsigned long)dst & 15)) + { + s = *src++; + d = *dst; + + *dst++ = composite_over_8888_0565pixel (s, d); + w--; + } + + /* It's a 8 pixel loop */ + while (w >= 8) + { + /* I'm loading unaligned because I'm not sure + * about the address alignment. + */ + xmm_src = load_128_unaligned ((__m128i*) src); + xmm_dst = load_128_aligned ((__m128i*) dst); + + /* Unpacking */ + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_565_128_4x128 (xmm_dst, + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + /* I'm loading next 4 pixels from memory + * before to optimze the memory read. 
+ */ + xmm_src = load_128_unaligned ((__m128i*) (src + 4)); + + over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst0, &xmm_dst1); + + /* Unpacking */ + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst2, &xmm_dst3); + + save_128_aligned ( + (__m128i*)dst, pack_565_4x128_128 ( + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); + + w -= 8; + dst += 8; + src += 8; + } + + while (w--) + { + s = *src++; + d = *dst; + + *dst++ = composite_over_8888_0565pixel (s, d); + } + } + +} + +static void +sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint32_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + uint32_t m, d; + + __m128i xmm_src, xmm_alpha, xmm_def; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + xmm_def = create_mask_2x32_128 (src, src); + xmm_src = expand_pixel_32_1x128 (src); + xmm_alpha = expand_alpha_1x128 (xmm_src); + mmx_src = xmm_src; + mmx_alpha = xmm_alpha; + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w && (unsigned long)dst & 15) + { + uint8_t m = 
*mask++; + + if (m) + { + d = *dst; + mmx_mask = expand_pixel_8_1x128 (m); + mmx_dest = unpack_32_1x128 (d); + + *dst = pack_1x128_32 (in_over_1x128 (&mmx_src, + &mmx_alpha, + &mmx_mask, + &mmx_dest)); + } + + w--; + dst++; + } + + while (w >= 4) + { + m = *((uint32_t*)mask); + + if (srca == 0xff && m == 0xffffffff) + { + save_128_aligned ((__m128i*)dst, xmm_def); + } + else if (m) + { + xmm_dst = load_128_aligned ((__m128i*) dst); + xmm_mask = unpack_32_1x128 (m); + xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); + + /* Unpacking */ + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + w -= 4; + dst += 4; + mask += 4; + } + + while (w) + { + uint8_t m = *mask++; + + if (m) + { + d = *dst; + mmx_mask = expand_pixel_8_1x128 (m); + mmx_dest = unpack_32_1x128 (d); + + *dst = pack_1x128_32 (in_over_1x128 (&mmx_src, + &mmx_alpha, + &mmx_mask, + &mmx_dest)); + } + + w--; + dst++; + } + } + +} + +/* + * Fill the width x height rectangle whose top-left pixel is (x, y) + * with the pixel value data. + * + * bits points at the first row of the image and stride is the row + * pitch counted in uint32_t units. bpp must be 8, 16 or 32; for 8 and + * 16 bpp only the low bpp bits of data are used and they are + * replicated across a 32-bit word before filling. + * + * Returns TRUE when the fill was done and FALSE for any other bpp. + */ +static pixman_bool_t +pixman_fill_sse2 (uint32_t *bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t data) +{ + uint32_t byte_width; + uint8_t *byte_line; + + __m128i xmm_def; + + if (bpp == 8) + { + uint8_t b; + + stride = stride * (int) sizeof (uint32_t) / 1; + byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); + byte_width = width; + stride *= 1; + + /* Replicate the low byte into all four bytes of data. The + * multiply is done in uint32_t: building a 16-bit value and + * shifting it left by 16 would shift a promoted signed int and + * overflow it whenever the byte is >= 0x80. + */ + b = data & 0xff; + data = (uint32_t) b * 0x01010101; + } + else if (bpp == 16) + { + stride = stride * (int) sizeof (uint32_t) / 2; + byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); + byte_width = 2 * width; + stride *= 2; + + data = (data & 0xffff) * 0x00010001; + } + else if (bpp
== 32) + { + stride = stride * (int) sizeof (uint32_t) / 4; + byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); + byte_width = 4 * width; + stride *= 4; + } + else + { + return FALSE; + } + + xmm_def = create_mask_2x32_128 (data, data); + + while (height--) + { + int w; + uint8_t *d = byte_line; + byte_line += stride; + w = byte_width; + + /* Head: 1-, 2- and 4-byte stores until d is 16-byte aligned + * or the row runs out. + */ + while (w >= 1 && ((unsigned long)d & 1)) + { + *(uint8_t *)d = data; + w -= 1; + d += 1; + } + + while (w >= 2 && ((unsigned long)d & 3)) + { + *(uint16_t *)d = data; + w -= 2; + d += 2; + } + + while (w >= 4 && ((unsigned long)d & 15)) + { + *(uint32_t *)d = data; + + w -= 4; + d += 4; + } + + /* Body: aligned 128-bit stores, 128 bytes per iteration, then + * one optional 64-, 32- and 16-byte step. + */ + while (w >= 128) + { + save_128_aligned ((__m128i*)(d), xmm_def); + save_128_aligned ((__m128i*)(d + 16), xmm_def); + save_128_aligned ((__m128i*)(d + 32), xmm_def); + save_128_aligned ((__m128i*)(d + 48), xmm_def); + save_128_aligned ((__m128i*)(d + 64), xmm_def); + save_128_aligned ((__m128i*)(d + 80), xmm_def); + save_128_aligned ((__m128i*)(d + 96), xmm_def); + save_128_aligned ((__m128i*)(d + 112), xmm_def); + + d += 128; + w -= 128; + } + + if (w >= 64) + { + save_128_aligned ((__m128i*)(d), xmm_def); + save_128_aligned ((__m128i*)(d + 16), xmm_def); + save_128_aligned ((__m128i*)(d + 32), xmm_def); + save_128_aligned ((__m128i*)(d + 48), xmm_def); + + d += 64; + w -= 64; + } + + if (w >= 32) + { + save_128_aligned ((__m128i*)(d), xmm_def); + save_128_aligned ((__m128i*)(d + 16), xmm_def); + + d += 32; + w -= 32; + } + + if (w >= 16) + { + save_128_aligned ((__m128i*)(d), xmm_def); + + d += 16; + w -= 16; + } + + /* Tail: fewer than 16 bytes remain. */ + while (w >= 4) + { + *(uint32_t *)d = data; + + w -= 4; + d += 4; + } + + if (w >= 2) + { + *(uint16_t *)d = data; + w -= 2; + d += 2; + } + + if (w >= 1) + { + *(uint8_t *)d = data; + w -= 1; + d += 1; + } + } + + return TRUE; +} + +static void +sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, +
int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint32_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + uint32_t m; + + __m128i xmm_src, xmm_def; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + { + pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride, + PIXMAN_FORMAT_BPP (dst_image->bits.format), + dest_x, dest_y, width, height, 0); + return; + } + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + xmm_def = create_mask_2x32_128 (src, src); + xmm_src = expand_pixel_32_1x128 (src); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w && (unsigned long)dst & 15) + { + uint8_t m = *mask++; + + if (m) + { + *dst = pack_1x128_32 ( + pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m))); + } + else + { + *dst = 0; + } + + w--; + dst++; + } + + while (w >= 4) + { + m = *((uint32_t*)mask); + + if (srca == 0xff && m == 0xffffffff) + { + save_128_aligned ((__m128i*)dst, xmm_def); + } + else if (m) + { + xmm_mask = unpack_32_1x128 (m); + xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); + + /* Unpacking */ + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + pix_multiply_2x128 (&xmm_src, &xmm_src, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); + } + else + { + save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ()); + } + + w -= 4; + dst += 4; + mask += 4; + } + + 
while (w) + { + uint8_t m = *mask++; + + if (m) + { + *dst = pack_1x128_32 ( + pix_multiply_1x128 ( + xmm_src, expand_pixel_8_1x128 (m))); + } + else + { + *dst = 0; + } + + w--; + dst++; + } + } + +} + +static void +sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint16_t *dst_line, *dst, d; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + uint32_t m; + __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; + + __m128i xmm_src, xmm_alpha; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + xmm_src = expand_pixel_32_1x128 (src); + xmm_alpha = expand_alpha_1x128 (xmm_src); + mmx_src = xmm_src; + mmx_alpha = xmm_alpha; + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w && (unsigned long)dst & 15) + { + m = *mask++; + + if (m) + { + d = *dst; + mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); + mmx_dest = expand565_16_1x128 (d); + + *dst = pack_565_32_16 ( + pack_1x128_32 ( + in_over_1x128 ( + &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); + } + + w--; + dst++; + } + + while (w >= 8) + { + xmm_dst = load_128_aligned ((__m128i*) dst); + unpack_565_128_4x128 (xmm_dst, + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + + m = *((uint32_t*)mask); + mask += 4; + + if (m) + { + xmm_mask = unpack_32_1x128 (m); + 
xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); + + /* Unpacking */ + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst0, &xmm_dst1); + } + + m = *((uint32_t*)mask); + mask += 4; + + if (m) + { + xmm_mask = unpack_32_1x128 (m); + xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); + + /* Unpacking */ + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst2, &xmm_dst3); + } + + save_128_aligned ( + (__m128i*)dst, pack_565_4x128_128 ( + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); + + w -= 8; + dst += 8; + } + + while (w) + { + m = *mask++; + + if (m) + { + d = *dst; + mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); + mmx_dest = expand565_16_1x128 (d); + + *dst = pack_565_32_16 ( + pack_1x128_32 ( + in_over_1x128 ( + &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); + } + + w--; + dst++; + } + } + +} + +static void +sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint16_t *dst_line, *dst, d; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + int32_t w; + uint32_t opaque, zero; + + __m128i ms; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + 
while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w && (unsigned long)dst & 15) + { + s = *src++; + d = *dst; + + ms = unpack_32_1x128 (s); + + *dst++ = pack_565_32_16 ( + pack_1x128_32 ( + over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d)))); + w--; + } + + while (w >= 8) + { + /* First round */ + xmm_src = load_128_unaligned ((__m128i*)src); + xmm_dst = load_128_aligned ((__m128i*)dst); + + opaque = is_opaque (xmm_src); + zero = is_zero (xmm_src); + + unpack_565_128_4x128 (xmm_dst, + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + + /* preload next round*/ + xmm_src = load_128_unaligned ((__m128i*)(src + 4)); + + if (opaque) + { + invert_colors_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst0, &xmm_dst1); + } + else if (!zero) + { + over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst0, &xmm_dst1); + } + + /* Second round */ + opaque = is_opaque (xmm_src); + zero = is_zero (xmm_src); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + + if (opaque) + { + invert_colors_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst2, &xmm_dst3); + } + else if (!zero) + { + over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst2, &xmm_dst3); + } + + save_128_aligned ( + (__m128i*)dst, pack_565_4x128_128 ( + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); + + w -= 8; + src += 8; + dst += 8; + } + + while (w) + { + s = *src++; + d = *dst; + + ms = unpack_32_1x128 (s); + + *dst++ = pack_565_32_16 ( + pack_1x128_32 ( + over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d)))); + w--; + } + } + +} + +static void +sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, 
*dst, d; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + int32_t w; + uint32_t opaque, zero; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w && (unsigned long)dst & 15) + { + s = *src++; + d = *dst; + + *dst++ = pack_1x128_32 ( + over_rev_non_pre_1x128 ( + unpack_32_1x128 (s), unpack_32_1x128 (d))); + + w--; + } + + while (w >= 4) + { + xmm_src_hi = load_128_unaligned ((__m128i*)src); + + opaque = is_opaque (xmm_src_hi); + zero = is_zero (xmm_src_hi); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + + if (opaque) + { + invert_colors_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + else if (!zero) + { + xmm_dst_hi = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + w -= 4; + dst += 4; + src += 4; + } + + while (w) + { + s = *src++; + d = *dst; + + *dst++ = pack_1x128_32 ( + over_rev_non_pre_1x128 ( + unpack_32_1x128 (s), unpack_32_1x128 (d))); + + w--; + } + } + +} + +static void +sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + uint16_t *dst_line, *dst, d; + uint32_t *mask_line, *mask, m; + int dst_stride, 
mask_stride; + int w; + uint32_t pack_cmp; + + __m128i xmm_src, xmm_alpha; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; + + __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + + xmm_src = expand_pixel_32_1x128 (src); + xmm_alpha = expand_alpha_1x128 (xmm_src); + mmx_src = xmm_src; + mmx_alpha = xmm_alpha; + + while (height--) + { + w = width; + mask = mask_line; + dst = dst_line; + mask_line += mask_stride; + dst_line += dst_stride; + + while (w && ((unsigned long)dst & 15)) + { + m = *(uint32_t *) mask; + + if (m) + { + d = *dst; + mmx_mask = unpack_32_1x128 (m); + mmx_dest = expand565_16_1x128 (d); + + *dst = pack_565_32_16 ( + pack_1x128_32 ( + in_over_1x128 ( + &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); + } + + w--; + dst++; + mask++; + } + + while (w >= 8) + { + /* First round */ + xmm_mask = load_128_unaligned ((__m128i*)mask); + xmm_dst = load_128_aligned ((__m128i*)dst); + + pack_cmp = _mm_movemask_epi8 ( + _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); + + unpack_565_128_4x128 (xmm_dst, + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + /* preload next round */ + xmm_mask = load_128_unaligned ((__m128i*)(mask + 4)); + + /* preload next round */ + if (pack_cmp != 0xffff) + { + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst0, &xmm_dst1); + } + + /* Second round */ + pack_cmp = _mm_movemask_epi8 ( + _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); + + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + if (pack_cmp != 0xffff) + { + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, 
&xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst2, &xmm_dst3); + } + + save_128_aligned ( + (__m128i*)dst, pack_565_4x128_128 ( + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); + + w -= 8; + dst += 8; + mask += 8; + } + + while (w) + { + m = *(uint32_t *) mask; + + if (m) + { + d = *dst; + mmx_mask = unpack_32_1x128 (m); + mmx_dest = expand565_16_1x128 (d); + + *dst = pack_565_32_16 ( + pack_1x128_32 ( + in_over_1x128 ( + &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); + } + + w--; + dst++; + mask++; + } + } + +} + +static void +sse2_composite_in_n_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + uint32_t d, m; + uint32_t src; + uint8_t sa; + int32_t w; + + __m128i xmm_alpha; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + sa = src >> 24; + + xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w && ((unsigned long)dst & 15)) + { + m = (uint32_t) *mask++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x128_32 ( + pix_multiply_1x128 ( + pix_multiply_1x128 (xmm_alpha, + unpack_32_1x128 (m)), + unpack_32_1x128 (d))); + w--; + } + + while (w >= 16) + { + xmm_mask = load_128_unaligned ((__m128i*)mask); + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, 
&xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + mask += 16; + dst += 16; + w -= 16; + } + + while (w) + { + m = (uint32_t) *mask++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x128_32 ( + pix_multiply_1x128 ( + pix_multiply_1x128 ( + xmm_alpha, unpack_32_1x128 (m)), + unpack_32_1x128 (d))); + w--; + } + } + +} + +static void +sse2_composite_in_n_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + int dst_stride; + uint32_t d; + uint32_t src; + int32_t w; + + __m128i xmm_alpha; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); + + src = src >> 24; + + if (src == 0xff) + return; + + if (src == 0x00) + { + pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride, + 8, dest_x, dest_y, width, height, src); + + return; + } + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + w = width; + + while (w && ((unsigned long)dst & 15)) + { + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x128_32 ( + pix_multiply_1x128 ( + xmm_alpha, + unpack_32_1x128 (d))); + w--; + } + + while (w >= 16) + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, + 
&xmm_dst_lo, &xmm_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + dst += 16; + w -= 16; + } + + while (w) + { + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x128_32 ( + pix_multiply_1x128 ( + xmm_alpha, + unpack_32_1x128 (d))); + w--; + } + } + +} + +static void +sse2_composite_in_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int src_stride, dst_stride; + int32_t w; + uint32_t s, d; + + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w && ((unsigned long)dst & 15)) + { + s = (uint32_t) *src++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (s), unpack_32_1x128 (d))); + w--; + } + + while (w >= 16) + { + xmm_src = load_128_unaligned ((__m128i*)src); + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + src += 16; + dst += 16; + w -= 16; + } + + while (w) + { + s = (uint32_t) *src++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x128_32 ( + pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 
(d))); + w--; + } + } + +} + +static void +sse2_composite_add_n_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + uint32_t src; + uint8_t sa; + uint32_t m, d; + + __m128i xmm_alpha; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + sa = src >> 24; + + xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w && ((unsigned long)dst & 15)) + { + m = (uint32_t) *mask++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x128_32 ( + _mm_adds_epu16 ( + pix_multiply_1x128 ( + xmm_alpha, unpack_32_1x128 (m)), + unpack_32_1x128 (d))); + w--; + } + + while (w >= 16) + { + xmm_mask = load_128_unaligned ((__m128i*)mask); + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo); + xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + mask += 16; + dst += 16; + w -= 16; + } + + while (w) + { + m = (uint32_t) *mask++; + d = (uint32_t) *dst; + + 
*dst++ = (uint8_t) pack_1x128_32 ( + _mm_adds_epu16 ( + pix_multiply_1x128 ( + xmm_alpha, unpack_32_1x128 (m)), + unpack_32_1x128 (d))); + + w--; + } + } + +} + +/* + * Add the top byte of the solid source, with unsigned saturation, to + * every byte of an 8-bit destination rectangle. A top byte of 0x00 + * leaves the destination untouched and 0xff is handed to pixman_fill. + */ +static void +sse2_composite_add_n_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + int dst_stride; + int32_t w; + uint32_t src; + + __m128i xmm_src; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + src >>= 24; + + if (src == 0x00) + return; + + if (src == 0xff) + { + pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride, + 8, dest_x, dest_y, width, height, 0xff); + + return; + } + + /* Replicate the byte into every lane of the 128-bit register. */ + src = (src << 24) | (src << 16) | (src << 8) | src; + xmm_src = _mm_set_epi32 (src, src, src, src); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + w = width; + + while (w && ((unsigned long)dst & 15)) + { + *dst = (uint8_t)_mm_cvtsi128_si32 ( + _mm_adds_epu8 ( + xmm_src, + _mm_cvtsi32_si128 (*dst))); + + w--; + dst++; + } + + while (w >= 16) + { + save_128_aligned ( + (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst))); + + dst += 16; + w -= 16; + } + + while (w) + { + *dst = (uint8_t)_mm_cvtsi128_si32 ( + _mm_adds_epu8 ( + xmm_src, + _mm_cvtsi32_si128 (*dst))); + + w--; + dst++; + } + } + +} + +/* + * Add an 8-bit source rectangle to an 8-bit destination rectangle, + * clamping each byte at 0xff. + * + * Each row is split in three: a scalar head until dst is 4-byte + * aligned, a middle part passed to sse2_combine_add_u as w >> 2 + * 32-bit words, and a scalar tail of at most three pixels. + */ +static void +sse2_composite_add_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int dst_stride, src_stride; +
int32_t w; + uint16_t t; + + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + src = src_line; + + dst_line += dst_stride; + src_line += src_stride; + w = width; + + /* Small head. t holds the 9-bit sum; when bit 8 is set, + * 0 - (t >> 8) is all ones, so the stored byte becomes 0xff. + */ + while (w && (unsigned long)dst & 3) + { + t = (*dst) + (*src++); + *dst++ = t | (0 - (t >> 8)); + w--; + } + + sse2_combine_add_u (imp, op, + (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2); + + /* Small tail: step over the (w >> 2) * 4 bytes handled above. + * Masking with ~3 keeps every bit above the low two, so rows of + * 65536 pixels or more advance by the full amount. + */ + dst += w & ~3; + src += w & ~3; + + w &= 3; + + while (w) + { + t = (*dst) + (*src++); + *dst++ = t | (0 - (t >> 8)); + w--; + } + } + +} + +/* + * Pass each row of a 32-bit source and destination rectangle to + * sse2_combine_add_u. + */ +static void +sse2_composite_add_8888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + + sse2_combine_add_u (imp, op, dst, src, NULL, width); + } + +} + +static pixman_bool_t +pixman_blt_sse2 (uint32_t *src_bits, + uint32_t *dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int height) +{ + uint8_t * src_bytes; + uint8_t * dst_bytes; + int byte_width; + + if (src_bpp != dst_bpp) + return FALSE; + + if (src_bpp == 16) + { + src_stride = src_stride * (int) sizeof (uint32_t) / 2; + dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; + src_bytes =(uint8_t
*)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); + dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); + byte_width = 2 * width; + src_stride *= 2; + dst_stride *= 2; + } + else if (src_bpp == 32) + { + src_stride = src_stride * (int) sizeof (uint32_t) / 4; + dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; + src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); + dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); + byte_width = 4 * width; + src_stride *= 4; + dst_stride *= 4; + } + else + { + return FALSE; + } + + while (height--) + { + int w; + uint8_t *s = src_bytes; + uint8_t *d = dst_bytes; + src_bytes += src_stride; + dst_bytes += dst_stride; + w = byte_width; + + while (w >= 2 && ((unsigned long)d & 3)) + { + *(uint16_t *)d = *(uint16_t *)s; + w -= 2; + s += 2; + d += 2; + } + + while (w >= 4 && ((unsigned long)d & 15)) + { + *(uint32_t *)d = *(uint32_t *)s; + + w -= 4; + s += 4; + d += 4; + } + + while (w >= 64) + { + __m128i xmm0, xmm1, xmm2, xmm3; + + xmm0 = load_128_unaligned ((__m128i*)(s)); + xmm1 = load_128_unaligned ((__m128i*)(s + 16)); + xmm2 = load_128_unaligned ((__m128i*)(s + 32)); + xmm3 = load_128_unaligned ((__m128i*)(s + 48)); + + save_128_aligned ((__m128i*)(d), xmm0); + save_128_aligned ((__m128i*)(d + 16), xmm1); + save_128_aligned ((__m128i*)(d + 32), xmm2); + save_128_aligned ((__m128i*)(d + 48), xmm3); + + s += 64; + d += 64; + w -= 64; + } + + while (w >= 16) + { + save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) ); + + w -= 16; + d += 16; + s += 16; + } + + while (w >= 4) + { + *(uint32_t *)d = *(uint32_t *)s; + + w -= 4; + s += 4; + d += 4; + } + + if (w >= 2) + { + *(uint16_t *)d = *(uint16_t *)s; + w -= 2; + s += 2; + d += 2; + } + } + + + return TRUE; +} + +static void +sse2_composite_copy_area (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + 
pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + pixman_blt_sse2 (src_image->bits.bits, + dst_image->bits.bits, + src_image->bits.rowstride, + dst_image->bits.rowstride, + PIXMAN_FORMAT_BPP (src_image->bits.format), + PIXMAN_FORMAT_BPP (dst_image->bits.format), + src_x, src_y, dest_x, dest_y, width, height); +} + +static void +sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *src, *src_line, s; + uint32_t *dst, *dst_line, d; + uint8_t *mask, *mask_line; + uint32_t m; + int src_stride, mask_stride, dst_stride; + int32_t w; + __m128i ms; + + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + src = src_line; + src_line += src_stride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + w = width; + + while (w && (unsigned long)dst & 15) + { + s = 0xff000000 | *src++; + m = (uint32_t) *mask++; + d = *dst; + ms = unpack_32_1x128 (s); + + if (m != 0xff) + { + __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); + __m128i md = unpack_32_1x128 (d); + + ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md); + } + + *dst++ = pack_1x128_32 (ms); + w--; + } + + while (w >= 4) + { + m = *(uint32_t*) mask; + xmm_src = _mm_or_si128 ( + load_128_unaligned 
((__m128i*)src), mask_ff000000); + + if (m == 0xffffffff) + { + save_128_aligned ((__m128i*)dst, xmm_src); + } + else + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_rev_2x128 ( + xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + src += 4; + dst += 4; + mask += 4; + w -= 4; + } + + while (w) + { + m = (uint32_t) *mask++; + + if (m) + { + s = 0xff000000 | *src; + + if (m == 0xff) + { + *dst = s; + } + else + { + __m128i ma, md, ms; + + d = *dst; + + ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); + md = unpack_32_1x128 (d); + ms = unpack_32_1x128 (s); + + *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md)); + } + + } + + src++; + dst++; + w--; + } + } + +} + +static void +sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *src, *src_line, s; + uint32_t *dst, *dst_line, d; + uint8_t *mask, *mask_line; + uint32_t m; + int src_stride, mask_stride, dst_stride; + int32_t w; + + __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, 
mask_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + src = src_line; + src_line += src_stride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + w = width; + + while (w && (unsigned long)dst & 15) + { + uint32_t sa; + + s = *src++; + m = (uint32_t) *mask++; + d = *dst; + + sa = s >> 24; + + if (m) + { + if (sa == 0xff && m == 0xff) + { + *dst = s; + } + else + { + __m128i ms, md, ma, msa; + + ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); + ms = unpack_32_1x128 (s); + md = unpack_32_1x128 (d); + + msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); + + *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); + } + } + + dst++; + w--; + } + + while (w >= 4) + { + m = *(uint32_t *) mask; + + if (m) + { + xmm_src = load_128_unaligned ((__m128i*)src); + + if (m == 0xffffffff && is_opaque (xmm_src)) + { + save_128_aligned ((__m128i *)dst, xmm_src); + } + else + { + xmm_dst = load_128_aligned ((__m128i *)dst); + + xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, + &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + } + + src += 4; + dst += 4; + mask += 4; + w -= 4; + } + + while (w) + { + uint32_t sa; + + s = *src++; + m = (uint32_t) *mask++; + d = *dst; + + sa = s >> 24; + + if (m) + { + if (sa == 0xff && m == 0xff) + { + *dst = s; + } + else + { + __m128i ms, md, ma, msa; + + ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); + ms = 
unpack_32_1x128 (s); + md = unpack_32_1x128 (d); + + msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); + + *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); + } + } + + dst++; + w--; + } + } + +} + +static void +sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + uint32_t *dst_line, *dst; + __m128i xmm_src; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_dsta_hi, xmm_dsta_lo; + int dst_stride; + int32_t w; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + + xmm_src = expand_pixel_32_1x128 (src); + + while (height--) + { + dst = dst_line; + + dst_line += dst_stride; + w = width; + + while (w && (unsigned long)dst & 15) + { + __m128i vd; + + vd = unpack_32_1x128 (*dst); + + *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd), + xmm_src)); + w--; + dst++; + } + + while (w >= 4) + { + __m128i tmp_lo, tmp_hi; + + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi); + + tmp_lo = xmm_src; + tmp_hi = xmm_src; + + over_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_dsta_lo, &xmm_dsta_hi, + &tmp_lo, &tmp_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi)); + + w -= 4; + dst += 4; + } + + while (w) + { + __m128i vd; + + vd = unpack_32_1x128 (*dst); + + *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd), + xmm_src)); + w--; + dst++; + } + + } + +} + +static void +sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * 
src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *src, *src_line, s; + uint32_t *dst, *dst_line, d; + uint32_t *mask, *mask_line; + uint32_t m; + int src_stride, mask_stride, dst_stride; + int32_t w; + + __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + src = src_line; + src_line += src_stride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + w = width; + + while (w && (unsigned long)dst & 15) + { + uint32_t sa; + + s = *src++; + m = (*mask++) >> 24; + d = *dst; + + sa = s >> 24; + + if (m) + { + if (sa == 0xff && m == 0xff) + { + *dst = s; + } + else + { + __m128i ms, md, ma, msa; + + ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); + ms = unpack_32_1x128 (s); + md = unpack_32_1x128 (d); + + msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); + + *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); + } + } + + dst++; + w--; + } + + while (w >= 4) + { + xmm_mask = load_128_unaligned ((__m128i*)mask); + + if (!is_transparent (xmm_mask)) + { + xmm_src = load_128_unaligned ((__m128i*)src); + + if (is_opaque (xmm_mask) && is_opaque (xmm_src)) + { + save_128_aligned ((__m128i *)dst, xmm_src); + } + else + { + xmm_dst = load_128_aligned ((__m128i *)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + 
expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); + expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, + &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + } + + src += 4; + dst += 4; + mask += 4; + w -= 4; + } + + while (w) + { + uint32_t sa; + + s = *src++; + m = (*mask++) >> 24; + d = *dst; + + sa = s >> 24; + + if (m) + { + if (sa == 0xff && m == 0xff) + { + *dst = s; + } + else + { + __m128i ms, md, ma, msa; + + ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); + ms = unpack_32_1x128 (s); + md = unpack_32_1x128 (d); + + msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); + + *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); + } + } + + dst++; + w--; + } + } + +} + +/* A variant of 'sse2_combine_over_u' with minor tweaks */ +static force_inline void +scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, + const uint32_t* ps, + int32_t w, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx, + pixman_bool_t fully_transparent_src) +{ + uint32_t s, d; + const uint32_t* pm = NULL; + + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_alpha_lo, xmm_alpha_hi; + + if (fully_transparent_src) + return; + + /* Align dst on a 16-byte boundary */ + while (w && ((unsigned long)pd & 15)) + { + d = *pd; + s = combine1 (ps + (vx >> 16), pm); + vx += unit_x; + + *pd++ = core_combine_over_u_pixel_sse2 (s, d); + if (pm) + pm++; + w--; + } + + while (w >= 4) + { + __m128i tmp; + uint32_t tmp1, tmp2, tmp3, tmp4; + + tmp1 = ps[vx >> 16]; + vx += unit_x; + tmp2 = ps[vx >> 16]; + vx += unit_x; + tmp3 = ps[vx >> 16]; + vx += unit_x; + tmp4 = ps[vx >> 16]; + vx += unit_x; + + tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); + + xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm); + + if (is_opaque (xmm_src_hi)) + { + 
save_128_aligned ((__m128i*)pd, xmm_src_hi); + } + else if (!is_zero (xmm_src_hi)) + { + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 ( + xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); + + over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst_lo, &xmm_dst_hi); + + /* rebuid the 4 pixel data and save*/ + save_128_aligned ((__m128i*)pd, + pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + w -= 4; + pd += 4; + if (pm) + pm += 4; + } + + while (w) + { + d = *pd; + s = combine1 (ps + (vx >> 16), pm); + vx += unit_x; + + *pd++ = core_combine_over_u_pixel_sse2 (s, d); + if (pm) + pm++; + + w--; + } +} + +FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER, + scaled_nearest_scanline_sse2_8888_8888_OVER, + uint32_t, uint32_t, COVER) +FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER, + scaled_nearest_scanline_sse2_8888_8888_OVER, + uint32_t, uint32_t, NONE) +FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER, + scaled_nearest_scanline_sse2_8888_8888_OVER, + uint32_t, uint32_t, PAD) + +static force_inline void +scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask, + uint32_t * dst, + const uint32_t * src, + int32_t w, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx, + pixman_bool_t zero_src) +{ + __m128i xmm_mask; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_lo, xmm_alpha_hi; + + if (zero_src || (*mask >> 24) == 0) + return; + + xmm_mask = create_mask_16_128 (*mask >> 24); + + while (w && (unsigned long)dst & 15) + { + uint32_t s = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + + if (s) + { + uint32_t d = *dst; + + __m128i ms = unpack_32_1x128 (s); + __m128i alpha = expand_alpha_1x128 (ms); + __m128i dest = xmm_mask; + __m128i alpha_dst = unpack_32_1x128 (d); + + *dst = pack_1x128_32 ( + in_over_1x128 (&ms, 
&alpha, &dest, &alpha_dst)); + } + dst++; + w--; + } + + while (w >= 4) + { + uint32_t tmp1, tmp2, tmp3, tmp4; + + tmp1 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp2 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp3 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp4 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + + xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); + + if (!is_zero (xmm_src)) + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask, &xmm_mask, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + dst += 4; + w -= 4; + } + + while (w) + { + uint32_t s = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + + if (s) + { + uint32_t d = *dst; + + __m128i ms = unpack_32_1x128 (s); + __m128i alpha = expand_alpha_1x128 (ms); + __m128i mask = xmm_mask; + __m128i dest = unpack_32_1x128 (d); + + *dst = pack_1x128_32 ( + in_over_1x128 (&ms, &alpha, &mask, &dest)); + } + + dst++; + w--; + } + +} + +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER, + scaled_nearest_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE) +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER, + scaled_nearest_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE) +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, + scaled_nearest_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE) + +static void +bilinear_interpolate_line_sse2 (uint32_t * out, + const uint32_t * top, + const uint32_t * bottom, + int wt, + int wb, + pixman_fixed_t x, + pixman_fixed_t ux, + int width) +{ + const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, 
wt, wt, wt); + const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); + const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff); + const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1); + const __m128i xmm_ux = _mm_set_epi16 (ux, ux, ux, ux, ux, ux, ux, ux); + const __m128i xmm_zero = _mm_setzero_si128 (); + __m128i xmm_x = _mm_set_epi16 (x, x, x, x, x, x, x, x); + uint32_t pix1, pix2, pix3, pix4; + + #define INTERPOLATE_ONE_PIXEL(pix) \ + do { \ + __m128i xmm_wh, xmm_lo, xmm_hi, a; \ + /* fetch 2x2 pixel block into sse2 register */ \ + uint32_t tl = top [pixman_fixed_to_int (x)]; \ + uint32_t tr = top [pixman_fixed_to_int (x) + 1]; \ + uint32_t bl = bottom [pixman_fixed_to_int (x)]; \ + uint32_t br = bottom [pixman_fixed_to_int (x) + 1]; \ + a = _mm_set_epi32 (tr, tl, br, bl); \ + x += ux; \ + /* vertical interpolation */ \ + a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero), \ + xmm_wt), \ + _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero), \ + xmm_wb)); \ + /* calculate horizontal weights */ \ + xmm_wh = _mm_add_epi16 (xmm_addc, \ + _mm_xor_si128 (xmm_xorc, \ + _mm_srli_epi16 (xmm_x, 8))); \ + xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ + /* horizontal interpolation */ \ + xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \ + xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \ + a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \ + _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \ + /* shift and pack the result */ \ + a = _mm_srli_epi32 (a, 16); \ + a = _mm_packs_epi32 (a, a); \ + a = _mm_packus_epi16 (a, a); \ + pix = _mm_cvtsi128_si32 (a); \ + } while (0) + + while ((width -= 4) >= 0) + { + INTERPOLATE_ONE_PIXEL (pix1); + INTERPOLATE_ONE_PIXEL (pix2); + INTERPOLATE_ONE_PIXEL (pix3); + INTERPOLATE_ONE_PIXEL (pix4); + *out++ = pix1; + *out++ = pix2; + *out++ = pix3; + *out++ = pix4; + } + if (width & 2) + { + INTERPOLATE_ONE_PIXEL (pix1); + INTERPOLATE_ONE_PIXEL (pix2); + *out++ = pix1; + *out++ = pix2; + } + if (width & 1) + { + 
INTERPOLATE_ONE_PIXEL (pix1); + *out = pix1; + } + + #undef INTERPOLATE_ONE_PIXEL +} + +static force_inline void +scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst, + const uint32_t * mask, + const uint32_t * src_top, + const uint32_t * src_bottom, + int32_t w, + int wt, + int wb, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx, + pixman_bool_t zero_src) +{ + bilinear_interpolate_line_sse2 (dst, src_top, src_bottom, + wt, wb, vx, unit_x, w); +} + +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC, + scaled_bilinear_scanline_sse2_8888_8888_SRC, + uint32_t, uint32_t, uint32_t, + COVER, FALSE, FALSE) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC, + scaled_bilinear_scanline_sse2_8888_8888_SRC, + uint32_t, uint32_t, uint32_t, + PAD, FALSE, FALSE) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC, + scaled_bilinear_scanline_sse2_8888_8888_SRC, + uint32_t, uint32_t, uint32_t, + NONE, FALSE, FALSE) + +static const pixman_fast_path_t sse2_fast_paths[] = +{ + /* PIXMAN_OP_OVER */ + PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565), + PIXMAN_STD_FAST_PATH (OVER, 
solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, 
sse2_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca), + PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888), + PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888), + PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888), + PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888), + PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565), + PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), + + /* PIXMAN_OP_OVER_REVERSE */ + PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888), + PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888), + + /* PIXMAN_OP_ADD */ + PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8), + PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888), + PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888), + PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8), + PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8), + + /* PIXMAN_OP_SRC */ + PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, solid, 
a8, x8r8g8b8, sse2_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area), + + /* PIXMAN_OP_IN */ + PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8), + PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8), + PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8), + + SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, 
sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), + + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), + + SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888), + SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888), + SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888), + + { PIXMAN_OP_NONE }, +}; + +static pixman_bool_t +sse2_blt (pixman_implementation_t *imp, + uint32_t * src_bits, + uint32_t * dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int height) +{ + if (!pixman_blt_sse2 ( + src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, + src_x, src_y, dst_x, dst_y, width, height)) + + { + return _pixman_implementation_blt ( + imp->delegate, + src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, + src_x, src_y, dst_x, dst_y, width, height); + } + + return TRUE; +} + +#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) +__attribute__((__force_align_arg_pointer__)) +#endif +static pixman_bool_t +sse2_fill (pixman_implementation_t *imp, + uint32_t * bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor)) + { + return _pixman_implementation_fill ( + imp->delegate, bits, stride, bpp, x, y, width, height, xor); + } + + return TRUE; +} + +static uint32_t * +sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask) +{ + int w = iter->width; + __m128i ff000000 = mask_ff000000; + uint32_t *dst 
= iter->buffer; + uint32_t *src = (uint32_t *)iter->bits; + + iter->bits += iter->stride; + + while (w && ((unsigned long)dst) & 0x0f) + { + *dst++ = (*src++) | 0xff000000; + w--; + } + + while (w >= 4) + { + save_128_aligned ( + (__m128i *)dst, _mm_or_si128 ( + load_128_unaligned ((__m128i *)src), ff000000)); + + dst += 4; + src += 4; + w -= 4; + } + + while (w) + { + *dst++ = (*src++) | 0xff000000; + w--; + } + + return iter->buffer; +} + +static uint32_t * +sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask) +{ + int w = iter->width; + uint32_t *dst = iter->buffer; + uint16_t *src = (uint16_t *)iter->bits; + __m128i ff000000 = mask_ff000000; + + iter->bits += iter->stride; + + while (w && ((unsigned long)dst) & 0x0f) + { + uint16_t s = *src++; + + *dst++ = CONVERT_0565_TO_8888 (s); + w--; + } + + while (w >= 8) + { + __m128i lo, hi, s; + + s = _mm_loadu_si128 ((__m128i *)src); + + lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ())); + hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ())); + + save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000)); + save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000)); + + dst += 8; + src += 8; + w -= 8; + } + + while (w) + { + uint16_t s = *src++; + + *dst++ = CONVERT_0565_TO_8888 (s); + w--; + } + + return iter->buffer; +} + +static uint32_t * +sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask) +{ + int w = iter->width; + uint32_t *dst = iter->buffer; + uint8_t *src = iter->bits; + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6; + + iter->bits += iter->stride; + + while (w && (((unsigned long)dst) & 15)) + { + *dst++ = *(src++) << 24; + w--; + } + + while (w >= 16) + { + xmm0 = _mm_loadu_si128((__m128i *)src); + + xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128(), xmm0); + xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128(), xmm0); + xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1); + xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1); + xmm5 = 
_mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2); + xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2); + + _mm_store_si128(((__m128i *)(dst + 0)), xmm3); + _mm_store_si128(((__m128i *)(dst + 4)), xmm4); + _mm_store_si128(((__m128i *)(dst + 8)), xmm5); + _mm_store_si128(((__m128i *)(dst + 12)), xmm6); + + dst += 16; + src += 16; + w -= 16; + } + + while (w) + { + *dst++ = *(src++) << 24; + w--; + } + + return iter->buffer; +} + +typedef struct +{ + pixman_format_code_t format; + pixman_iter_get_scanline_t get_scanline; +} fetcher_info_t; + +static const fetcher_info_t fetchers[] = +{ + { PIXMAN_x8r8g8b8, sse2_fetch_x8r8g8b8 }, + { PIXMAN_r5g6b5, sse2_fetch_r5g6b5 }, + { PIXMAN_a8, sse2_fetch_a8 }, + { PIXMAN_null } +}; + +static void +sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) +{ + pixman_image_t *image = iter->image; + int x = iter->x; + int y = iter->y; + int width = iter->width; + int height = iter->height; + +#define FLAGS \ + (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM) + + if ((iter->flags & ITER_NARROW) && + (image->common.flags & FLAGS) == FLAGS && + x >= 0 && y >= 0 && + x + width <= image->bits.width && + y + height <= image->bits.height) + { + const fetcher_info_t *f; + + for (f = &fetchers[0]; f->format != PIXMAN_null; f++) + { + if (image->common.extended_format_code == f->format) + { + uint8_t *b = (uint8_t *)image->bits.bits; + int s = image->bits.rowstride * 4; + + iter->bits = b + s * iter->y + x * PIXMAN_FORMAT_BPP (f->format) / 8; + iter->stride = s; + + iter->get_scanline = f->get_scanline; + return; + } + } + } + + imp->delegate->src_iter_init (imp->delegate, iter); +} + +#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) +__attribute__((__force_align_arg_pointer__)) +#endif +pixman_implementation_t * +_pixman_implementation_create_sse2 (pixman_implementation_t *fallback) +{ + pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths); + + /* SSE2 constants */ + 
mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000); + mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000); + mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0); + mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f); + mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000); + mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00); + mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8); + mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0); + mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000); + mask_0080 = create_mask_16_128 (0x0080); + mask_00ff = create_mask_16_128 (0x00ff); + mask_0101 = create_mask_16_128 (0x0101); + mask_ffff = create_mask_16_128 (0xffff); + mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000); + mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000); + + /* Set up function pointers */ + imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u; + imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u; + imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u; + imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u; + imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u; + imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u; + imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u; + imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u; + imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u; + imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u; + + imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u; + + imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca; + imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca; + imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca; + imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca; + 
imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca; + imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca; + imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca; + + imp->blt = sse2_blt; + imp->fill = sse2_fill; + + imp->src_iter_init = sse2_src_iter_init; + + return imp; +} -- cgit v1.2.3