diff options
Diffstat (limited to 'pixman')
-rw-r--r-- | pixman/configure.ac | 2 | ||||
-rw-r--r-- | pixman/pixman/pixman-sse2.c | 1601 |
2 files changed, 572 insertions, 1031 deletions
diff --git a/pixman/configure.ac b/pixman/configure.ac index 5242799bb..8d96647f9 100644 --- a/pixman/configure.ac +++ b/pixman/configure.ac @@ -326,7 +326,7 @@ if test "x$SSE2_CFLAGS" = "x" ; then SSE2_CFLAGS="-xarch=sse2" fi else - SSE2_CFLAGS="-mmmx -msse2 -Winline" + SSE2_CFLAGS="-msse2 -Winline" fi fi diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c index 2e135e2fe..88287b453 100644 --- a/pixman/pixman/pixman-sse2.c +++ b/pixman/pixman/pixman-sse2.c @@ -30,36 +30,12 @@ #include <config.h> #endif -#include <mmintrin.h> #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */ #include <emmintrin.h> /* for SSE2 intrinsics */ #include "pixman-private.h" #include "pixman-combine32.h" #include "pixman-fast-path.h" -#if defined(_MSC_VER) && defined(_M_AMD64) -/* Windows 64 doesn't allow MMX to be used, so - * the pixman-x64-mmx-emulation.h file contains - * implementations of those MMX intrinsics that - * are used in the SSE2 implementation. - */ -# include "pixman-x64-mmx-emulation.h" -#endif - -#ifdef USE_SSE2 - -/* -------------------------------------------------------------------- - * Locals - */ - -static __m64 mask_x0080; -static __m64 mask_x00ff; -static __m64 mask_x0101; -static __m64 mask_x_alpha; - -static __m64 mask_x565_rgb; -static __m64 mask_x565_unpack; - static __m128i mask_0080; static __m128i mask_00ff; static __m128i mask_0101; @@ -77,9 +53,6 @@ static __m128i mask_blue; static __m128i mask_565_fix_rb; static __m128i mask_565_fix_g; -/* ---------------------------------------------------------------------- - * SSE2 Inlines - */ static force_inline __m128i unpack_32_1x128 (uint32_t data) { @@ -397,146 +370,104 @@ save_128_unaligned (__m128i* dst, _mm_storeu_si128 (dst, data); } -/* ------------------------------------------------------------------ - * MMX inlines - */ - -static force_inline __m64 -load_32_1x64 (uint32_t data) -{ - return _mm_cvtsi32_si64 (data); -} - -static force_inline __m64 -unpack_32_1x64 (uint32_t data) -{ - return _mm_unpacklo_pi8 (load_32_1x64 (data), _mm_setzero_si64 ()); -} - -static force_inline __m64 -expand_alpha_1x64 (__m64 data) +static force_inline __m128i +load_32_1x128 (uint32_t data) { - return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3)); + return _mm_cvtsi32_si128 (data); } -static force_inline __m64 -expand_alpha_rev_1x64 (__m64 data) +static force_inline __m128i +expand_alpha_rev_1x128 (__m128i data) { - return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0)); + return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0)); } -static force_inline __m64 -expand_pixel_8_1x64 (uint8_t data) +static force_inline __m128i +expand_pixel_8_1x128 (uint8_t data) { - return _mm_shuffle_pi16 ( - unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0)); + return _mm_shufflelo_epi16 ( + unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0)); } -static force_inline __m64 -pix_multiply_1x64 (__m64 data, - __m64 alpha) +static force_inline __m128i +pix_multiply_1x128 (__m128i data, + __m128i alpha) { - return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha), - mask_x0080), - mask_x0101); + return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha), + mask_0080), + mask_0101); } -static force_inline __m64 -pix_add_multiply_1x64 (__m64* src, - __m64* alpha_dst, - __m64* dst, - __m64* alpha_src) +static force_inline __m128i +pix_add_multiply_1x128 (__m128i* src, + __m128i* alpha_dst, + __m128i* dst, + __m128i* alpha_src) { - __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst); - __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src); + __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst); + __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src); - return _mm_adds_pu8 (t1, t2); + return _mm_adds_epu8 (t1, t2); } -static force_inline __m64 -negate_1x64 (__m64 data) +static force_inline __m128i +negate_1x128 (__m128i data) { - return _mm_xor_si64 (data, mask_x00ff); + return _mm_xor_si128 (data, mask_00ff); } -static force_inline __m64 -invert_colors_1x64 (__m64 data) +static force_inline __m128i +invert_colors_1x128 (__m128i data) { - return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2)); + return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2)); } -static force_inline __m64 -over_1x64 (__m64 src, __m64 alpha, __m64 dst) +static force_inline __m128i +over_1x128 (__m128i src, __m128i alpha, __m128i dst) { - return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha))); + return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha))); } -static force_inline __m64 -in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst) +static force_inline __m128i +in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst) { - return over_1x64 (pix_multiply_1x64 (*src, *mask), - pix_multiply_1x64 (*alpha, *mask), - *dst); + return over_1x128 (pix_multiply_1x128 (*src, *mask), + pix_multiply_1x128 (*alpha, *mask), + *dst); } -static force_inline __m64 -over_rev_non_pre_1x64 (__m64 src, __m64 dst) +static force_inline __m128i +over_rev_non_pre_1x128 (__m128i src, __m128i dst) { - __m64 alpha = expand_alpha_1x64 (src); + __m128i alpha = expand_alpha_1x128 (src); - return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src), - _mm_or_si64 (alpha, mask_x_alpha)), - alpha, - dst); + return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src), + _mm_or_si128 (alpha, mask_alpha)), + alpha, + dst); } static force_inline uint32_t -pack_1x64_32 (__m64 data) +pack_1x128_32 (__m128i data) { - return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ())); + return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ())); } -/* Expand 16 bits positioned at @pos (0-3) of a mmx register into - * - * 00RR00GG00BB - * - * --- Expanding 565 in the low word --- - * - * m = (m << (32 - 3)) | (m << (16 - 5)) | m; - * m = m & (01f0003f001f); - * m = m * (008404100840); - * m = m >> 8; - * - * Note the trick here - the top word is shifted by another nibble to - * avoid it bumping into the middle word - */ -static force_inline __m64 -expand565_16_1x64 (uint16_t pixel) +static force_inline __m128i +expand565_16_1x128 (uint16_t pixel) { - __m64 p; - __m64 t1, t2; + __m128i m = _mm_cvtsi32_si128 (pixel); - p = _mm_cvtsi32_si64 ((uint32_t) pixel); + m = unpack_565_to_8888 (m); - t1 = _mm_slli_si64 (p, 36 - 11); - t2 = _mm_slli_si64 (p, 16 - 5); - - p = _mm_or_si64 (t1, p); - p = _mm_or_si64 (t2, p); - p = _mm_and_si64 (p, mask_x565_rgb); - p = _mm_mullo_pi16 (p, mask_x565_unpack); - - return _mm_srli_pi16 (p, 8); + return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ()); } -/* ---------------------------------------------------------------------------- - * Compose Core transformations - */ static force_inline uint32_t core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst) { uint8_t a; - __m64 ms; + __m128i xmms; a = src >> 24; @@ -546,9 +477,10 @@ core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst) } else if (src) { - ms = unpack_32_1x64 (src); - return pack_1x64_32 ( - over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst))); + xmms = unpack_32_1x128 (src); + return pack_1x128_32 ( + over_1x128 (xmms, expand_alpha_1x128 (xmms), + unpack_32_1x128 (dst))); } return dst; @@ -561,15 +493,15 @@ combine1 (const uint32_t *ps, const uint32_t *pm) if (pm) { - __m64 ms, mm; + __m128i ms, mm; - mm = unpack_32_1x64 (*pm); - mm = expand_alpha_1x64 (mm); + mm = unpack_32_1x128 (*pm); + mm = expand_alpha_1x128 (mm); - ms = unpack_32_1x64 (s); - ms = pix_multiply_1x64 (ms, mm); + ms = unpack_32_1x128 (s); + ms = pix_multiply_1x128 (ms, mm); - s = pack_1x64_32 (ms); + s = pack_1x128_32 (ms); } return s; @@ -766,10 +698,12 @@ core_combine_over_u_sse2_no_mask (uint32_t * pd, } static force_inline void -core_combine_over_u_sse2 (uint32_t* pd, - const uint32_t* ps, - const uint32_t* pm, - int w) +sse2_combine_over_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { if (pm) core_combine_over_u_sse2_mask (pd, ps, pm, w); @@ -777,11 +711,13 @@ core_combine_over_u_sse2 (uint32_t* pd, core_combine_over_u_sse2_no_mask (pd, ps, w); } -static force_inline void -core_combine_over_reverse_u_sse2 (uint32_t* pd, - const uint32_t* ps, - const uint32_t* pm, - int w) +static void +sse2_combine_over_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, d; @@ -847,7 +783,7 @@ core_combine_over_reverse_u_sse2 (uint32_t* pd, } static force_inline uint32_t -core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst) +core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst) { uint32_t maska = src >> 24; @@ -857,19 +793,21 @@ core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst) } else if (maska != 0xff) { - return pack_1x64_32 ( - pix_multiply_1x64 (unpack_32_1x64 (dst), - expand_alpha_1x64 (unpack_32_1x64 (src)))); + return pack_1x128_32 ( + pix_multiply_1x128 (unpack_32_1x128 (dst), + expand_alpha_1x128 (unpack_32_1x128 (src)))); } return dst; } -static force_inline void -core_combine_in_u_sse2 (uint32_t* pd, - const uint32_t* ps, - const uint32_t* pm, - int w) +static void +sse2_combine_in_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, d; @@ -881,7 +819,7 @@ core_combine_in_u_sse2 (uint32_t* pd, s = combine1 (ps, pm); d = *pd; - *pd++ = core_combine_in_u_pixelsse2 (d, s); + *pd++ = core_combine_in_u_pixel_sse2 (d, s); w--; ps++; if (pm) @@ -916,7 +854,7 @@ core_combine_in_u_sse2 (uint32_t* pd, s = combine1 (ps, pm); d = *pd; - *pd++ = core_combine_in_u_pixelsse2 (d, s); + *pd++ = core_combine_in_u_pixel_sse2 (d, s); w--; ps++; if (pm) @@ -924,11 +862,13 @@ core_combine_in_u_sse2 (uint32_t* pd, } } -static force_inline void -core_combine_reverse_in_u_sse2 (uint32_t* pd, - const uint32_t* ps, - const uint32_t *pm, - int w) +static void +sse2_combine_in_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, d; @@ -940,7 +880,7 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd, s = combine1 (ps, pm); d = *pd; - *pd++ = core_combine_in_u_pixelsse2 (s, d); + *pd++ = core_combine_in_u_pixel_sse2 (s, d); ps++; w--; if (pm) @@ -975,7 +915,7 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd, s = combine1 (ps, pm); d = *pd; - *pd++ = core_combine_in_u_pixelsse2 (s, d); + *pd++ = core_combine_in_u_pixel_sse2 (s, d); w--; ps++; if (pm) @@ -983,21 +923,23 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd, } } -static force_inline void -core_combine_reverse_out_u_sse2 (uint32_t* pd, - const uint32_t* ps, - const uint32_t* pm, - int w) +static void +sse2_combine_out_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { while (w && ((unsigned long) pd & 15)) { uint32_t s = combine1 (ps, pm); uint32_t d = *pd; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 ( - unpack_32_1x64 (d), negate_1x64 ( - expand_alpha_1x64 (unpack_32_1x64 (s))))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (d), negate_1x128 ( + expand_alpha_1x128 (unpack_32_1x128 (s))))); if (pm) pm++; @@ -1039,10 +981,10 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd, uint32_t s = combine1 (ps, pm); uint32_t d = *pd; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 ( - unpack_32_1x64 (d), negate_1x64 ( - expand_alpha_1x64 (unpack_32_1x64 (s))))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (d), negate_1x128 ( + expand_alpha_1x128 (unpack_32_1x128 (s))))); ps++; if (pm) pm++; @@ -1050,21 +992,23 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd, } } -static force_inline void -core_combine_out_u_sse2 (uint32_t* pd, - const uint32_t* ps, - const uint32_t* pm, - int w) +static void +sse2_combine_out_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { while (w && ((unsigned long) pd & 15)) { uint32_t s = combine1 (ps, pm); uint32_t d = *pd; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 ( - unpack_32_1x64 (s), negate_1x64 ( - expand_alpha_1x64 (unpack_32_1x64 (d))))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (s), negate_1x128 ( + expand_alpha_1x128 (unpack_32_1x128 (d))))); w--; ps++; if (pm) @@ -1104,10 +1048,10 @@ core_combine_out_u_sse2 (uint32_t* pd, uint32_t s = combine1 (ps, pm); uint32_t d = *pd; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 ( - unpack_32_1x64 (s), negate_1x64 ( - expand_alpha_1x64 (unpack_32_1x64 (d))))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (s), negate_1x128 ( + expand_alpha_1x128 (unpack_32_1x128 (d))))); w--; ps++; if (pm) @@ -1119,20 +1063,22 @@ static force_inline uint32_t core_combine_atop_u_pixel_sse2 (uint32_t src, uint32_t dst) { - __m64 s = unpack_32_1x64 (src); - __m64 d = unpack_32_1x64 (dst); + __m128i s = unpack_32_1x128 (src); + __m128i d = unpack_32_1x128 (dst); - __m64 sa = negate_1x64 (expand_alpha_1x64 (s)); - __m64 da = expand_alpha_1x64 (d); + __m128i sa = negate_1x128 (expand_alpha_1x128 (s)); + __m128i da = expand_alpha_1x128 (d); - return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa)); + return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa)); } -static force_inline void -core_combine_atop_u_sse2 (uint32_t* pd, - const uint32_t* ps, - const uint32_t* pm, - int w) +static void +sse2_combine_atop_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, d; @@ -1201,20 +1147,22 @@ static force_inline uint32_t core_combine_reverse_atop_u_pixel_sse2 (uint32_t src, uint32_t dst) { - __m64 s = unpack_32_1x64 (src); - __m64 d = unpack_32_1x64 (dst); + __m128i s = unpack_32_1x128 (src); + __m128i d = unpack_32_1x128 (dst); - __m64 sa = expand_alpha_1x64 (s); - __m64 da = negate_1x64 (expand_alpha_1x64 (d)); + __m128i sa = expand_alpha_1x128 (s); + __m128i da = negate_1x128 (expand_alpha_1x128 (d)); - return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa)); + return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa)); } -static force_inline void -core_combine_reverse_atop_u_sse2 (uint32_t* pd, - const uint32_t* ps, - const uint32_t* pm, - int w) +static void +sse2_combine_atop_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, d; @@ -1283,20 +1231,22 @@ static force_inline uint32_t core_combine_xor_u_pixel_sse2 (uint32_t src, uint32_t dst) { - __m64 s = unpack_32_1x64 (src); - __m64 d = unpack_32_1x64 (dst); + __m128i s = unpack_32_1x128 (src); + __m128i d = unpack_32_1x128 (dst); - __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d)); - __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s)); + __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d)); + __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s)); - return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s)); + return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s)); } -static force_inline void -core_combine_xor_u_sse2 (uint32_t* dst, - const uint32_t* src, - const uint32_t *mask, - int width) +static void +sse2_combine_xor_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) { int w = width; uint32_t s, d; @@ -1368,10 +1318,12 @@ core_combine_xor_u_sse2 (uint32_t* dst, } static force_inline void -core_combine_add_u_sse2 (uint32_t* dst, - const uint32_t* src, - const uint32_t* mask, - int width) +sse2_combine_add_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) { int w = width; uint32_t s, d; @@ -1387,8 +1339,8 @@ core_combine_add_u_sse2 (uint32_t* dst, ps++; if (pm) pm++; - *pd++ = _mm_cvtsi64_si32 ( - _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d))); + *pd++ = _mm_cvtsi128_si32 ( + _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d))); w--; } @@ -1414,8 +1366,8 @@ core_combine_add_u_sse2 (uint32_t* dst, d = *pd; ps++; - *pd++ = _mm_cvtsi64_si32 ( - _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d))); + *pd++ = _mm_cvtsi128_si32 ( + _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d))); if (pm) pm++; } @@ -1425,25 +1377,27 @@ static force_inline uint32_t core_combine_saturate_u_pixel_sse2 (uint32_t src, uint32_t dst) { - __m64 ms = unpack_32_1x64 (src); - __m64 md = unpack_32_1x64 (dst); + __m128i ms = unpack_32_1x128 (src); + __m128i md = unpack_32_1x128 (dst); uint32_t sa = src >> 24; uint32_t da = ~dst >> 24; if (sa > da) { - ms = pix_multiply_1x64 ( - ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24))); + ms = pix_multiply_1x128 ( + ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24))); } - return pack_1x64_32 (_mm_adds_pu16 (md, ms)); + return pack_1x128_32 (_mm_adds_epu16 (md, ms)); } -static force_inline void -core_combine_saturate_u_sse2 (uint32_t * pd, - const uint32_t *ps, - const uint32_t *pm, - int w) +static void +sse2_combine_saturate_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, d; @@ -1524,11 +1478,13 @@ core_combine_saturate_u_sse2 (uint32_t * pd, } } -static force_inline void -core_combine_src_ca_sse2 (uint32_t* pd, - const uint32_t* ps, - const uint32_t *pm, - int w) +static void +sse2_combine_src_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, m; @@ -1540,8 +1496,8 @@ core_combine_src_ca_sse2 (uint32_t* pd, { s = *ps++; m = *pm++; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m))); w--; } @@ -1570,8 +1526,8 @@ core_combine_src_ca_sse2 (uint32_t* pd, { s = *ps++; m = *pm++; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m))); w--; } } @@ -1581,19 +1537,21 @@ core_combine_over_ca_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst) { - __m64 s = unpack_32_1x64 (src); - __m64 expAlpha = expand_alpha_1x64 (s); - __m64 unpk_mask = unpack_32_1x64 (mask); - __m64 unpk_dst = unpack_32_1x64 (dst); + __m128i s = unpack_32_1x128 (src); + __m128i expAlpha = expand_alpha_1x128 (s); + __m128i unpk_mask = unpack_32_1x128 (mask); + __m128i unpk_dst = unpack_32_1x128 (dst); - return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst)); + return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst)); } -static force_inline void -core_combine_over_ca_sse2 (uint32_t* pd, - const uint32_t* ps, - const uint32_t *pm, - int w) +static void +sse2_combine_over_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, m, d; @@ -1655,19 +1613,21 @@ core_combine_over_reverse_ca_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst) { - __m64 d = unpack_32_1x64 (dst); + __m128i d = unpack_32_1x128 (dst); - return pack_1x64_32 ( - over_1x64 (d, expand_alpha_1x64 (d), - pix_multiply_1x64 (unpack_32_1x64 (src), - unpack_32_1x64 (mask)))); + return pack_1x128_32 ( + over_1x128 (d, expand_alpha_1x128 (d), + pix_multiply_1x128 (unpack_32_1x128 (src), + unpack_32_1x128 (mask)))); } -static force_inline void -core_combine_over_reverse_ca_sse2 (uint32_t* pd, - const uint32_t* ps, - const uint32_t *pm, - int w) +static void +sse2_combine_over_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, m, d; @@ -1726,11 +1686,13 @@ core_combine_over_reverse_ca_sse2 (uint32_t* pd, } } -static force_inline void -core_combine_in_ca_sse2 (uint32_t * pd, - const uint32_t *ps, - const uint32_t *pm, - int w) +static void +sse2_combine_in_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, m, d; @@ -1745,10 +1707,10 @@ core_combine_in_ca_sse2 (uint32_t * pd, m = *pm++; d = *pd; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 ( - pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)), - expand_alpha_1x64 (unpack_32_1x64 (d)))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)), + expand_alpha_1x128 (unpack_32_1x128 (d)))); w--; } @@ -1789,21 +1751,23 @@ core_combine_in_ca_sse2 (uint32_t * pd, m = *pm++; d = *pd; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 ( - pix_multiply_1x64 ( - unpack_32_1x64 (s), unpack_32_1x64 (m)), - expand_alpha_1x64 (unpack_32_1x64 (d)))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + pix_multiply_1x128 ( + unpack_32_1x128 (s), unpack_32_1x128 (m)), + expand_alpha_1x128 (unpack_32_1x128 (d)))); w--; } } -static force_inline void -core_combine_in_reverse_ca_sse2 (uint32_t * pd, - const uint32_t *ps, - const uint32_t *pm, - int w) +static void +sse2_combine_in_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, m, d; @@ -1818,11 +1782,11 @@ core_combine_in_reverse_ca_sse2 (uint32_t * pd, m = *pm++; d = *pd; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 ( - unpack_32_1x64 (d), - pix_multiply_1x64 (unpack_32_1x64 (m), - expand_alpha_1x64 (unpack_32_1x64 (s))))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (d), + pix_multiply_1x128 (unpack_32_1x128 (m), + expand_alpha_1x128 (unpack_32_1x128 (s))))); w--; } @@ -1861,20 +1825,22 @@ core_combine_in_reverse_ca_sse2 (uint32_t * pd, m = *pm++; d = *pd; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 ( - unpack_32_1x64 (d), - pix_multiply_1x64 (unpack_32_1x64 (m), - expand_alpha_1x64 (unpack_32_1x64 (s))))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (d), + pix_multiply_1x128 (unpack_32_1x128 (m), + expand_alpha_1x128 (unpack_32_1x128 (s))))); w--; } } -static force_inline void -core_combine_out_ca_sse2 (uint32_t * pd, - const uint32_t *ps, - const uint32_t *pm, - int w) +static void +sse2_combine_out_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, m, d; @@ -1889,11 +1855,11 @@ core_combine_out_ca_sse2 (uint32_t * pd, m = *pm++; d = *pd; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 ( - pix_multiply_1x64 ( - unpack_32_1x64 (s), unpack_32_1x64 (m)), - negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d))))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + pix_multiply_1x128 ( + unpack_32_1x128 (s), unpack_32_1x128 (m)), + negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d))))); w--; } @@ -1934,21 +1900,23 @@ core_combine_out_ca_sse2 (uint32_t * pd, m = *pm++; d = *pd; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 ( - pix_multiply_1x64 ( - unpack_32_1x64 (s), unpack_32_1x64 (m)), - negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d))))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + pix_multiply_1x128 ( + unpack_32_1x128 (s), unpack_32_1x128 (m)), + negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d))))); w--; } } -static force_inline void -core_combine_out_reverse_ca_sse2 (uint32_t * pd, - const uint32_t *ps, - const uint32_t *pm, - int w) +static void +sse2_combine_out_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, m, d; @@ -1963,12 +1931,12 @@ core_combine_out_reverse_ca_sse2 (uint32_t * pd, m = *pm++; d = *pd; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 ( - unpack_32_1x64 (d), - negate_1x64 (pix_multiply_1x64 ( - unpack_32_1x64 (m), - expand_alpha_1x64 (unpack_32_1x64 (s)))))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (d), + negate_1x128 (pix_multiply_1x128 ( + unpack_32_1x128 (m), + expand_alpha_1x128 (unpack_32_1x128 (s)))))); w--; } @@ -2011,12 +1979,12 @@ core_combine_out_reverse_ca_sse2 (uint32_t * pd, m = *pm++; d = *pd; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 ( - unpack_32_1x64 (d), - negate_1x64 (pix_multiply_1x64 ( - unpack_32_1x64 (m), - expand_alpha_1x64 (unpack_32_1x64 (s)))))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (d), + negate_1x128 (pix_multiply_1x128 ( + unpack_32_1x128 (m), + expand_alpha_1x128 (unpack_32_1x128 (s)))))); w--; } } @@ -2026,23 +1994,25 @@ core_combine_atop_ca_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst) { - __m64 m = unpack_32_1x64 (mask); - __m64 s = unpack_32_1x64 (src); - __m64 d = unpack_32_1x64 (dst); - __m64 sa = expand_alpha_1x64 (s); - __m64 da = expand_alpha_1x64 (d); + __m128i m = unpack_32_1x128 (mask); + __m128i s = unpack_32_1x128 (src); + __m128i d = unpack_32_1x128 (dst); + __m128i sa = expand_alpha_1x128 (s); + __m128i da = expand_alpha_1x128 (d); - s = pix_multiply_1x64 (s, m); - m = negate_1x64 (pix_multiply_1x64 (m, sa)); + s = pix_multiply_1x128 (s, m); + m = negate_1x128 (pix_multiply_1x128 (m, sa)); - return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da)); + return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da)); } -static force_inline void -core_combine_atop_ca_sse2 (uint32_t * pd, - const uint32_t *ps, - const uint32_t *pm, - int w) +static void +sse2_combine_atop_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, m, d; @@ -2116,24 +2086,26 @@ core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst) { - __m64 m = unpack_32_1x64 (mask); - __m64 s = unpack_32_1x64 (src); - __m64 d = unpack_32_1x64 (dst); + __m128i m = unpack_32_1x128 (mask); + __m128i s = unpack_32_1x128 (src); + __m128i d = unpack_32_1x128 (dst); - __m64 da = negate_1x64 (expand_alpha_1x64 (d)); - __m64 sa = expand_alpha_1x64 (s); + __m128i da = negate_1x128 (expand_alpha_1x128 (d)); + __m128i sa = expand_alpha_1x128 (s); - s = pix_multiply_1x64 (s, m); - m = pix_multiply_1x64 (m, sa); + s = pix_multiply_1x128 (s, m); + m = pix_multiply_1x128 (m, sa); - return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da)); + return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da)); } -static force_inline void -core_combine_reverse_atop_ca_sse2 (uint32_t * pd, - const uint32_t *ps, - const uint32_t *pm, - int w) +static void +sse2_combine_atop_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, m, d; @@ -2208,26 +2180,28 @@ core_combine_xor_ca_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst) { - __m64 a = unpack_32_1x64 (mask); - __m64 s = unpack_32_1x64 (src); - __m64 d = unpack_32_1x64 (dst); + __m128i a = unpack_32_1x128 (mask); + __m128i s = unpack_32_1x128 (src); + __m128i d = unpack_32_1x128 (dst); - __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 ( - a, expand_alpha_1x64 (s))); - __m64 dest = pix_multiply_1x64 (s, a); - __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d)); + __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 ( + a, expand_alpha_1x128 (s))); + __m128i dest = pix_multiply_1x128 (s, a); + __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d)); - return pack_1x64_32 (pix_add_multiply_1x64 (&d, + return pack_1x128_32 (pix_add_multiply_1x128 (&d, &alpha_dst, &dest, &alpha_src)); } -static force_inline void -core_combine_xor_ca_sse2 (uint32_t * pd, - const uint32_t *ps, - const uint32_t *pm, - int w) +static void +sse2_combine_xor_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, m, d; @@ -2299,11 +2273,13 @@ core_combine_xor_ca_sse2 (uint32_t * pd, } } -static force_inline void -core_combine_add_ca_sse2 (uint32_t * pd, - const uint32_t *ps, - const uint32_t *pm, - int w) +static void +sse2_combine_add_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, m, d; @@ -2317,10 +2293,10 @@ core_combine_add_ca_sse2 (uint32_t * pd, m = *pm++; d = *pd; - *pd++ = pack_1x64_32 ( - _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s), - unpack_32_1x64 (m)), - unpack_32_1x64 (d))); + *pd++ = pack_1x128_32 ( + _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s), + unpack_32_1x128 (m)), + unpack_32_1x128 (d))); w--; } @@ -2355,36 +2331,20 @@ core_combine_add_ca_sse2 (uint32_t * pd, m = *pm++; d = *pd; - *pd++ = pack_1x64_32 ( - _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s), - unpack_32_1x64 (m)), - unpack_32_1x64 (d))); + *pd++ = pack_1x128_32 ( + _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s), + unpack_32_1x128 (m)), + unpack_32_1x128 (d))); w--; } } -/* --------------------------------------------------- - * fb_compose_setup_sSE2 - */ -static force_inline __m64 -create_mask_16_64 (uint16_t mask) -{ - return _mm_set1_pi16 (mask); -} - static force_inline __m128i create_mask_16_128 (uint16_t mask) { return _mm_set1_epi16 (mask); } -static force_inline __m64 -create_mask_2x32_64 (uint32_t mask0, - uint32_t mask1) -{ - return _mm_set_pi32 (mask0, mask1); -} - /* Work around a code generation bug in Sun Studio 12. */ #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590) # define create_mask_2x32_128(mask0, mask1) \ @@ -2398,276 +2358,6 @@ create_mask_2x32_128 (uint32_t mask0, } #endif -/* SSE2 code patch for fbcompose.c */ - -static void -sse2_combine_over_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_over_u_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_over_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_over_reverse_u_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_in_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_in_u_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_in_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_reverse_in_u_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_out_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_out_u_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_out_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_reverse_out_u_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_atop_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_atop_u_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_atop_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_reverse_atop_u_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_xor_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_xor_u_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_add_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_add_u_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_saturate_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_saturate_u_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_src_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_src_ca_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_over_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_over_ca_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_over_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_over_reverse_ca_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_in_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_in_ca_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_in_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_in_reverse_ca_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_out_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_out_ca_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_out_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_out_reverse_ca_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_atop_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_atop_ca_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_atop_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_reverse_atop_ca_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_xor_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_xor_ca_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_add_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_add_ca_sse2 (dst, src, mask, width); - _mm_empty (); -} - -/* ------------------------------------------------------------------- - * composite_over_n_8888 - */ - static void sse2_composite_over_n_8888 (pixman_implementation_t *imp, pixman_op_t op, @@ -2711,9 +2401,9 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp, while (w && (unsigned long)dst & 15) { d = *dst; - *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src), - _mm_movepi64_pi64 (xmm_alpha), - unpack_32_1x64 (d))); + *dst++ = pack_1x128_32 (over_1x128 (xmm_src, + xmm_alpha, + unpack_32_1x128 (d))); w--; } @@ -2738,19 +2428,15 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp, while (w) { d = *dst; - *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src), - _mm_movepi64_pi64 (xmm_alpha), - unpack_32_1x64 (d))); + *dst++ = pack_1x128_32 (over_1x128 (xmm_src, + xmm_alpha, + unpack_32_1x128 (d))); w--; } } - _mm_empty (); } -/* --------------------------------------------------------------------- - * composite_over_n_0565 - */ static void sse2_composite_over_n_0565 (pixman_implementation_t *imp, pixman_op_t op, @@ -2796,9 +2482,9 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp, d = *dst; *dst++ = pack_565_32_16 ( - pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src), - _mm_movepi64_pi64 (xmm_alpha), - expand565_16_1x64 (d)))); + pack_1x128_32 (over_1x128 (xmm_src, + xmm_alpha, + expand565_16_1x128 (d)))); w--; } @@ -2829,18 +2515,13 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp, { d = *dst; *dst++ = pack_565_32_16 ( - pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src), - _mm_movepi64_pi64 (xmm_alpha), - expand565_16_1x64 (d)))); + pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha, + expand565_16_1x128 (d)))); } } - _mm_empty (); } -/* ------------------------------ - * composite_add_n_8888_8888_ca - */ static void sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, pixman_op_t op, @@ -2866,7 +2547,7 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, __m128i xmm_dst; __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; - __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; + __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); srca = src >> 24; @@ -2882,8 +2563,8 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, xmm_src = _mm_unpacklo_epi8 ( create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); xmm_alpha = expand_alpha_1x128 (xmm_src); - mmx_src = _mm_movepi64_pi64 (xmm_src); - mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); + mmx_src = xmm_src; + mmx_alpha = xmm_alpha; while (height--) { @@ -2902,11 +2583,12 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, { d = *pd; - mmx_mask = unpack_32_1x64 (m); - mmx_dest = unpack_32_1x64 (d); + mmx_mask = unpack_32_1x128 (m); + mmx_dest = unpack_32_1x128 (d); - *pd = pack_1x64_32 ( - _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest)); + *pd = pack_1x128_32 ( + _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), + mmx_dest)); } pd++; @@ -2950,11 +2632,12 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, { d = *pd; - mmx_mask = unpack_32_1x64 (m); - mmx_dest = unpack_32_1x64 (d); + mmx_mask = unpack_32_1x128 (m); + mmx_dest = unpack_32_1x128 (d); - *pd = pack_1x64_32 ( - _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest)); + *pd = pack_1x128_32 ( + _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), + mmx_dest)); } pd++; @@ -2962,13 +2645,8 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, } } - _mm_empty (); } -/* --------------------------------------------------------------------------- - * composite_over_n_8888_8888_ca - */ - static void sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, pixman_op_t op, @@ -2994,7 +2672,7 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; - __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; + __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); @@ -3009,8 +2687,8 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, xmm_src = _mm_unpacklo_epi8 ( create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); xmm_alpha = expand_alpha_1x128 (xmm_src); - mmx_src = _mm_movepi64_pi64 (xmm_src); - mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); + mmx_src = xmm_src; + mmx_alpha = xmm_alpha; while (height--) { @@ -3028,10 +2706,10 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, if (m) { d = *pd; - mmx_mask = unpack_32_1x64 (m); - mmx_dest = unpack_32_1x64 (d); + mmx_mask = unpack_32_1x128 (m); + mmx_dest = unpack_32_1x128 (d); - *pd = pack_1x64_32 (in_over_1x64 (&mmx_src, + *pd = pack_1x128_32 (in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); @@ -3078,11 +2756,11 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, if (m) { d = *pd; - mmx_mask = unpack_32_1x64 (m); - mmx_dest = unpack_32_1x64 (d); + mmx_mask = unpack_32_1x128 (m); + mmx_dest = unpack_32_1x128 (d); - *pd = pack_1x64_32 ( - in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); + *pd = pack_1x128_32 ( + in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); } pd++; @@ -3090,13 +2768,8 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, } } - _mm_empty (); } -/*--------------------------------------------------------------------- - * composite_over_8888_n_8888 - */ - static void sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, pixman_op_t op, @@ -3148,13 +2821,13 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, { uint32_t d = *dst; - __m64 ms = unpack_32_1x64 (s); - __m64 alpha = expand_alpha_1x64 (ms); - __m64 dest = _mm_movepi64_pi64 (xmm_mask); - __m64 alpha_dst = unpack_32_1x64 (d); + __m128i ms = unpack_32_1x128 (s); + __m128i alpha = expand_alpha_1x128 (ms); + __m128i dest = xmm_mask; + __m128i alpha_dst = unpack_32_1x128 (d); - *dst = pack_1x64_32 ( - in_over_1x64 (&ms, &alpha, &dest, &alpha_dst)); + *dst = pack_1x128_32 ( + in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); } dst++; w--; @@ -3195,13 +2868,13 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, { uint32_t d = *dst; - __m64 ms = unpack_32_1x64 (s); - __m64 alpha = expand_alpha_1x64 (ms); - __m64 mask = _mm_movepi64_pi64 (xmm_mask); - __m64 dest = unpack_32_1x64 (d); + __m128i ms = unpack_32_1x128 (s); + __m128i alpha = expand_alpha_1x128 (ms); + __m128i mask = xmm_mask; + __m128i dest = unpack_32_1x128 (d); - *dst = pack_1x64_32 ( - in_over_1x64 (&ms, &alpha, &mask, &dest)); + *dst = pack_1x128_32 ( + in_over_1x128 (&ms, &alpha, &mask, &dest)); } dst++; @@ -3209,13 +2882,8 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, } } - _mm_empty (); } -/*--------------------------------------------------------------------- - * composite_over_8888_n_8888 - */ - static void sse2_composite_src_x888_8888 (pixman_implementation_t *imp, pixman_op_t op, @@ -3282,12 +2950,8 @@ sse2_composite_src_x888_8888 (pixman_implementation_t *imp, } } - _mm_empty (); } -/* --------------------------------------------------------------------- - * composite_over_x888_n_8888 - */ static void sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp, pixman_op_t op, @@ -3336,13 +3000,13 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp, uint32_t s = (*src++) | 0xff000000; uint32_t d = *dst; - __m64 src = unpack_32_1x64 (s); - __m64 alpha = _mm_movepi64_pi64 (xmm_alpha); - __m64 mask = _mm_movepi64_pi64 (xmm_mask); - __m64 dest = unpack_32_1x64 (d); + __m128i src = unpack_32_1x128 (s); + __m128i alpha = xmm_alpha; + __m128i mask = xmm_mask; + __m128i dest = unpack_32_1x128 (d); - *dst++ = pack_1x64_32 ( - in_over_1x64 (&src, &alpha, &mask, &dest)); + *dst++ = pack_1x128_32 ( + in_over_1x128 (&src, &alpha, &mask, &dest)); w--; } @@ -3375,24 +3039,20 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp, uint32_t s = (*src++) | 0xff000000; uint32_t d = *dst; - __m64 src = unpack_32_1x64 (s); - __m64 alpha = _mm_movepi64_pi64 (xmm_alpha); - __m64 mask = _mm_movepi64_pi64 (xmm_mask); - __m64 dest = unpack_32_1x64 (d); + __m128i src = unpack_32_1x128 (s); + __m128i alpha = xmm_alpha; + __m128i mask = xmm_mask; + __m128i dest = unpack_32_1x128 (d); - *dst++ = pack_1x64_32 ( - in_over_1x64 (&src, &alpha, &mask, &dest)); + *dst++ = pack_1x128_32 ( + in_over_1x128 (&src, &alpha, &mask, &dest)); w--; } } - _mm_empty (); } -/* -------------------------------------------------------------------- - * composite_over_8888_8888 - */ static void sse2_composite_over_8888_8888 (pixman_implementation_t *imp, pixman_op_t op, @@ -3422,27 +3082,23 @@ sse2_composite_over_8888_8888 (pixman_implementation_t *imp, while (height--) { - core_combine_over_u_sse2 (dst, src, NULL, width); + sse2_combine_over_u (imp, op, dst, src, NULL, width); dst += dst_stride; src += src_stride; } - _mm_empty (); } -/* ------------------------------------------------------------------ - * composite_over_8888_0565 - */ static force_inline uint16_t composite_over_8888_0565pixel (uint32_t src, uint16_t dst) { - __m64 ms; + __m128i ms; - ms = unpack_32_1x64 (src); + ms = unpack_32_1x128 (src); return pack_565_32_16 ( - pack_1x64_32 ( - over_1x64 ( - ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst)))); + pack_1x128_32 ( + over_1x128 ( + ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst)))); } static void @@ -3474,15 +3130,6 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp, PIXMAN_IMAGE_GET_LINE ( src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); -#if 0 - /* FIXME - * - * I copy the code from MMX one and keep the fixme. - * If it's a problem there, probably is a problem here. - */ - assert (src_image->drawable == mask_image->drawable); -#endif - while (height--) { dst = dst_line; @@ -3555,13 +3202,8 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp, } } - _mm_empty (); } -/* ----------------------------------------------------------------- - * composite_over_n_8_8888 - */ - static void sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, pixman_op_t op, @@ -3588,7 +3230,7 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; - __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; + __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); @@ -3604,8 +3246,8 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, xmm_def = create_mask_2x32_128 (src, src); xmm_src = expand_pixel_32_1x128 (src); xmm_alpha = expand_alpha_1x128 (xmm_src); - mmx_src = _mm_movepi64_pi64 (xmm_src); - mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); + mmx_src = xmm_src; + mmx_alpha = xmm_alpha; while (height--) { @@ -3622,10 +3264,10 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, if (m) { d = *dst; - mmx_mask = expand_pixel_8_1x64 (m); - mmx_dest = unpack_32_1x64 (d); + mmx_mask = expand_pixel_8_1x128 (m); + mmx_dest = unpack_32_1x128 (d); - *dst = pack_1x64_32 (in_over_1x64 (&mmx_src, + *dst = pack_1x128_32 (in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); @@ -3677,10 +3319,10 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, if (m) { d = *dst; - mmx_mask = expand_pixel_8_1x64 (m); - mmx_dest = unpack_32_1x64 (d); + mmx_mask = expand_pixel_8_1x128 (m); + mmx_dest = unpack_32_1x128 (d); - *dst = pack_1x64_32 (in_over_1x64 (&mmx_src, + *dst = pack_1x128_32 (in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); @@ -3691,14 +3333,9 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, } } - _mm_empty (); } -/* ---------------------------------------------------------------- - * composite_over_n_8_8888 - */ - -pixman_bool_t +static pixman_bool_t pixman_fill_sse2 (uint32_t *bits, int stride, int bpp, @@ -3845,7 +3482,6 @@ pixman_fill_sse2 (uint32_t *bits, } } - _mm_empty (); return TRUE; } @@ -3907,9 +3543,8 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, if (m) { - *dst = pack_1x64_32 ( - pix_multiply_1x64 ( - _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m))); + *dst = pack_1x128_32 ( + pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m))); } else { @@ -3962,9 +3597,9 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, if (m) { - *dst = pack_1x64_32 ( - pix_multiply_1x64 ( - _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m))); + *dst = pack_1x128_32 ( + pix_multiply_1x128 ( + xmm_src, expand_pixel_8_1x128 (m))); } else { @@ -3976,13 +3611,8 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, } } - _mm_empty (); } -/*----------------------------------------------------------------------- - * composite_over_n_8_0565 - */ - static void sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, pixman_op_t op, @@ -4004,7 +3634,7 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, int dst_stride, mask_stride; int32_t w; uint32_t m; - __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; + __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; __m128i xmm_src, xmm_alpha; __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; @@ -4023,8 +3653,8 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, xmm_src = expand_pixel_32_1x128 (src); xmm_alpha = expand_alpha_1x128 (xmm_src); - mmx_src = _mm_movepi64_pi64 (xmm_src); - mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); + mmx_src = xmm_src; + mmx_alpha = xmm_alpha; while (height--) { @@ -4041,12 +3671,12 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, if (m) { d = *dst; - mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m)); - mmx_dest = expand565_16_1x64 (d); + mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); + mmx_dest = expand565_16_1x128 (d); *dst = pack_565_32_16 ( - pack_1x64_32 ( - in_over_1x64 ( + pack_1x128_32 ( + in_over_1x128 ( &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); } @@ -4114,12 +3744,12 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, if (m) { d = *dst; - mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m)); - mmx_dest = expand565_16_1x64 (d); + mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); + mmx_dest = expand565_16_1x128 (d); *dst = pack_565_32_16 ( - pack_1x64_32 ( - in_over_1x64 ( + pack_1x128_32 ( + in_over_1x128 ( &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); } @@ -4128,13 +3758,8 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, } } - _mm_empty (); } -/* ----------------------------------------------------------------------- - * composite_over_pixbuf_0565 - */ - static void sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, pixman_op_t op, @@ -4156,7 +3781,7 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, int32_t w; uint32_t opaque, zero; - __m64 ms; + __m128i ms; __m128i xmm_src, xmm_src_lo, xmm_src_hi; __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; @@ -4165,15 +3790,6 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, PIXMAN_IMAGE_GET_LINE ( src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); -#if 0 - /* FIXME - * - * I copy the code from MMX one and keep the fixme. - * If it's a problem there, probably is a problem here. - */ - assert (src_image->drawable == mask_image->drawable); -#endif - while (height--) { dst = dst_line; @@ -4187,11 +3803,11 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, s = *src++; d = *dst; - ms = unpack_32_1x64 (s); + ms = unpack_32_1x128 (s); *dst++ = pack_565_32_16 ( - pack_1x64_32 ( - over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d)))); + pack_1x128_32 ( + over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d)))); w--; } @@ -4253,22 +3869,17 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, s = *src++; d = *dst; - ms = unpack_32_1x64 (s); + ms = unpack_32_1x128 (s); *dst++ = pack_565_32_16 ( - pack_1x64_32 ( - over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d)))); + pack_1x128_32 ( + over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d)))); w--; } } - _mm_empty (); } -/* ------------------------------------------------------------------------- - * composite_over_pixbuf_8888 - */ - static void sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp, pixman_op_t op, @@ -4298,15 +3909,6 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp, PIXMAN_IMAGE_GET_LINE ( src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); -#if 0 - /* FIXME - * - * I copy the code from MMX one and keep the fixme. - * If it's a problem there, probably is a problem here. - */ - assert (src_image->drawable == mask_image->drawable); -#endif - while (height--) { dst = dst_line; @@ -4320,9 +3922,9 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp, s = *src++; d = *dst; - *dst++ = pack_1x64_32 ( - over_rev_non_pre_1x64 ( - unpack_32_1x64 (s), unpack_32_1x64 (d))); + *dst++ = pack_1x128_32 ( + over_rev_non_pre_1x128 ( + unpack_32_1x128 (s), unpack_32_1x128 (d))); w--; } @@ -4367,21 +3969,16 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp, s = *src++; d = *dst; - *dst++ = pack_1x64_32 ( - over_rev_non_pre_1x64 ( - unpack_32_1x64 (s), unpack_32_1x64 (d))); + *dst++ = pack_1x128_32 ( + over_rev_non_pre_1x128 ( + unpack_32_1x128 (s), unpack_32_1x128 (d))); w--; } } - _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * composite_over_n_8888_0565_ca - */ - static void sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, pixman_op_t op, @@ -4408,7 +4005,7 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; - __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; + __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); @@ -4422,8 +4019,8 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, xmm_src = expand_pixel_32_1x128 (src); xmm_alpha = expand_alpha_1x128 (xmm_src); - mmx_src = _mm_movepi64_pi64 (xmm_src); - mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); + mmx_src = xmm_src; + mmx_alpha = xmm_alpha; while (height--) { @@ -4440,12 +4037,12 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, if (m) { d = *dst; - mmx_mask = unpack_32_1x64 (m); - mmx_dest = expand565_16_1x64 (d); + mmx_mask = unpack_32_1x128 (m); + mmx_dest = expand565_16_1x128 (d); *dst = pack_565_32_16 ( - pack_1x64_32 ( - in_over_1x64 ( + pack_1x128_32 ( + in_over_1x128 ( &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); } @@ -4509,12 +4106,12 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, if (m) { d = *dst; - mmx_mask = unpack_32_1x64 (m); - mmx_dest = expand565_16_1x64 (d); + mmx_mask = unpack_32_1x128 (m); + mmx_dest = expand565_16_1x128 (d); *dst = pack_565_32_16 ( - pack_1x64_32 ( - in_over_1x64 ( + pack_1x128_32 ( + in_over_1x128 ( &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); } @@ -4524,13 +4121,8 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, } } - _mm_empty (); } -/* ----------------------------------------------------------------------- - * composite_in_n_8_8 - */ - static void sse2_composite_in_n_8_8 (pixman_implementation_t *imp, pixman_op_t op, @@ -4582,11 +4174,11 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp, m = (uint32_t) *mask++; d = (uint32_t) *dst; - *dst++ = (uint8_t) pack_1x64_32 ( - pix_multiply_1x64 ( - pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha), - unpack_32_1x64 (m)), - unpack_32_1x64 (d))); + *dst++ = (uint8_t) pack_1x128_32 ( + pix_multiply_1x128 ( + pix_multiply_1x128 (xmm_alpha, + unpack_32_1x128 (m)), + unpack_32_1x128 (d))); w--; } @@ -4619,22 +4211,17 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp, m = (uint32_t) *mask++; d = (uint32_t) *dst; - *dst++ = (uint8_t) pack_1x64_32 ( - pix_multiply_1x64 ( - pix_multiply_1x64 ( - _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)), - unpack_32_1x64 (d))); + *dst++ = (uint8_t) pack_1x128_32 ( + pix_multiply_1x128 ( + pix_multiply_1x128 ( + xmm_alpha, unpack_32_1x128 (m)), + unpack_32_1x128 (d))); w--; } } - _mm_empty (); } -/* ----------------------------------------------------------------------- - * composite_in_n_8 - */ - static void sse2_composite_in_n_8 (pixman_implementation_t *imp, pixman_op_t op, @@ -4689,10 +4276,10 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp, { d = (uint32_t) *dst; - *dst++ = (uint8_t) pack_1x64_32 ( - pix_multiply_1x64 ( - _mm_movepi64_pi64 (xmm_alpha), - unpack_32_1x64 (d))); + *dst++ = (uint8_t) pack_1x128_32 ( + pix_multiply_1x128 ( + xmm_alpha, + unpack_32_1x128 (d))); w--; } @@ -4717,21 +4304,16 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp, { d = (uint32_t) *dst; - *dst++ = (uint8_t) pack_1x64_32 ( - pix_multiply_1x64 ( - _mm_movepi64_pi64 (xmm_alpha), - unpack_32_1x64 (d))); + *dst++ = (uint8_t) pack_1x128_32 ( + pix_multiply_1x128 ( + xmm_alpha, + unpack_32_1x128 (d))); w--; } } - _mm_empty (); } -/* --------------------------------------------------------------------------- - * composite_in_8_8 - */ - static void sse2_composite_in_8_8 (pixman_implementation_t *imp, pixman_op_t op, @@ -4774,9 +4356,9 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp, s = (uint32_t) *src++; d = (uint32_t) *dst; - *dst++ = (uint8_t) pack_1x64_32 ( - pix_multiply_1x64 ( - unpack_32_1x64 (s), unpack_32_1x64 (d))); + *dst++ = (uint8_t) pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (s), unpack_32_1x128 (d))); w--; } @@ -4805,19 +4387,14 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp, s = (uint32_t) *src++; d = (uint32_t) *dst; - *dst++ = (uint8_t) pack_1x64_32 ( - pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d))); + *dst++ = (uint8_t) pack_1x128_32 ( + pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d))); w--; } } - _mm_empty (); } -/* ------------------------------------------------------------------------- - * composite_add_n_8_8 - */ - static void sse2_composite_add_n_8_8 (pixman_implementation_t *imp, pixman_op_t op, @@ -4869,11 +4446,11 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp, m = (uint32_t) *mask++; d = (uint32_t) *dst; - *dst++ = (uint8_t) pack_1x64_32 ( - _mm_adds_pu16 ( - pix_multiply_1x64 ( - _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)), - unpack_32_1x64 (d))); + *dst++ = (uint8_t) pack_1x128_32 ( + _mm_adds_epu16 ( + pix_multiply_1x128 ( + xmm_alpha, unpack_32_1x128 (m)), + unpack_32_1x128 (d))); w--; } @@ -4905,23 +4482,18 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp, m = (uint32_t) *mask++; d = (uint32_t) *dst; - *dst++ = (uint8_t) pack_1x64_32 ( - _mm_adds_pu16 ( - pix_multiply_1x64 ( - _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)), - unpack_32_1x64 (d))); + *dst++ = (uint8_t) pack_1x128_32 ( + _mm_adds_epu16 ( + pix_multiply_1x128 ( + xmm_alpha, unpack_32_1x128 (m)), + unpack_32_1x128 (d))); w--; } } - _mm_empty (); } -/* ------------------------------------------------------------------------- - * composite_add_n_8_8 - */ - static void sse2_composite_add_n_8 (pixman_implementation_t *imp, pixman_op_t op, @@ -4973,10 +4545,10 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp, while (w && ((unsigned long)dst & 15)) { - *dst = (uint8_t)_mm_cvtsi64_si32 ( - _mm_adds_pu8 ( - _mm_movepi64_pi64 (xmm_src), - _mm_cvtsi32_si64 (*dst))); + *dst = (uint8_t)_mm_cvtsi128_si32 ( + _mm_adds_epu8 ( + xmm_src, + _mm_cvtsi32_si128 (*dst))); w--; dst++; @@ -4993,23 +4565,18 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp, while (w) { - *dst = (uint8_t)_mm_cvtsi64_si32 ( - _mm_adds_pu8 ( - _mm_movepi64_pi64 (xmm_src), - _mm_cvtsi32_si64 (*dst))); + *dst = (uint8_t)_mm_cvtsi128_si32 ( + _mm_adds_epu8 ( + xmm_src, + _mm_cvtsi32_si128 (*dst))); w--; dst++; } } - _mm_empty (); } -/* ---------------------------------------------------------------------- - * composite_add_8_8 - */ - static void sse2_composite_add_8_8 (pixman_implementation_t *imp, pixman_op_t op, @@ -5053,7 +4620,8 @@ sse2_composite_add_8_8 (pixman_implementation_t *imp, w--; } - core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2); + sse2_combine_add_u (imp, op, + (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2); /* Small tail */ dst += w & 0xfffc; @@ -5069,12 +4637,8 @@ sse2_composite_add_8_8 (pixman_implementation_t *imp, } } - _mm_empty (); } -/* --------------------------------------------------------------------- - * composite_add_8888_8888 - */ static void sse2_composite_add_8888_8888 (pixman_implementation_t *imp, pixman_op_t op, @@ -5106,16 +4670,11 @@ sse2_composite_add_8888_8888 (pixman_implementation_t *imp, src = src_line; src_line += src_stride; - core_combine_add_u_sse2 (dst, src, NULL, width); + sse2_combine_add_u (imp, op, dst, src, NULL, width); } - _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * sse2_composite_copy_area - */ - static pixman_bool_t pixman_blt_sse2 (uint32_t *src_bits, uint32_t *dst_bits, @@ -5234,7 +4793,6 @@ pixman_blt_sse2 (uint32_t *src_bits, } } - _mm_empty (); return TRUE; } @@ -5284,7 +4842,7 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, uint32_t m; int src_stride, mask_stride, dst_stride; int32_t w; - __m64 ms; + __m128i ms; __m128i xmm_src, xmm_src_lo, xmm_src_hi; __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; @@ -5313,24 +4871,25 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, s = 0xff000000 | *src++; m = (uint32_t) *mask++; d = *dst; - ms = unpack_32_1x64 (s); + ms = unpack_32_1x128 (s); if (m != 0xff) { - __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m)); - __m64 md = unpack_32_1x64 (d); + __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); + __m128i md = unpack_32_1x128 (d); - ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md); + ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md); } - *dst++ = pack_1x64_32 (ms); + *dst++ = pack_1x128_32 (ms); w--; } while (w >= 4) { m = *(uint32_t*) mask; - xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000); + xmm_src = _mm_or_si128 ( + load_128_unaligned ((__m128i*)src), mask_ff000000); if (m == 0xffffffff) { @@ -5346,9 +4905,12 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); - expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + expand_alpha_rev_2x128 ( + xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); - in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); } @@ -5373,15 +4935,15 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, } else { - __m64 ma, md, ms; + __m128i ma, md, ms; d = *dst; - ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m)); - md = unpack_32_1x64 (d); - ms = unpack_32_1x64 (s); + ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); + md = unpack_32_1x128 (d); + ms = unpack_32_1x128 (s); - *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md)); + *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md)); } } @@ -5392,7 +4954,6 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, } } - _mm_empty (); } static void @@ -5457,15 +5018,15 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp, } else { - __m64 ms, md, ma, msa; + __m128i ms, md, ma, msa; - ma = expand_alpha_rev_1x64 (load_32_1x64 (m)); - ms = unpack_32_1x64 (s); - md = unpack_32_1x64 (d); + ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); + ms = unpack_32_1x128 (s); + md = unpack_32_1x128 (d); - msa = expand_alpha_rev_1x64 (load_32_1x64 (sa)); + msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); - *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md)); + *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); } } @@ -5529,15 +5090,15 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp, } else { - __m64 ms, md, ma, msa; + __m128i ms, md, ma, msa; - ma = expand_alpha_rev_1x64 (load_32_1x64 (m)); - ms = unpack_32_1x64 (s); - md = unpack_32_1x64 (d); + ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); + ms = unpack_32_1x128 (s); + md = unpack_32_1x128 (d); - msa = expand_alpha_rev_1x64 (load_32_1x64 (sa)); + msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); - *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md)); + *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); } } @@ -5546,7 +5107,6 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp, } } - _mm_empty (); } static void @@ -5591,12 +5151,12 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp, while (w && (unsigned long)dst & 15) { - __m64 vd; + __m128i vd; - vd = unpack_32_1x64 (*dst); + vd = unpack_32_1x128 (*dst); - *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd), - _mm_movepi64_pi64 (xmm_src))); + *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd), + xmm_src)); w--; dst++; } @@ -5626,19 +5186,18 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp, while (w) { - __m64 vd; + __m128i vd; - vd = unpack_32_1x64 (*dst); + vd = unpack_32_1x128 (*dst); - *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd), - _mm_movepi64_pi64 (xmm_src))); + *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd), + xmm_src)); w--; dst++; } } - _mm_empty (); } static void @@ -5703,15 +5262,15 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp, } else { - __m64 ms, md, ma, msa; + __m128i ms, md, ma, msa; - ma = expand_alpha_rev_1x64 (load_32_1x64 (m)); - ms = unpack_32_1x64 (s); - md = unpack_32_1x64 (d); + ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); + ms = unpack_32_1x128 (s); + md = unpack_32_1x128 (d); - msa = expand_alpha_rev_1x64 (load_32_1x64 (sa)); + msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); - *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md)); + *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); } } @@ -5773,15 +5332,15 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp, } else { - __m64 ms, md, ma, msa; + __m128i ms, md, ma, msa; - ma = expand_alpha_rev_1x64 (load_32_1x64 (m)); - ms = unpack_32_1x64 (s); - md = unpack_32_1x64 (d); + ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); + ms = unpack_32_1x128 (s); + md = unpack_32_1x128 (d); - msa = expand_alpha_rev_1x64 (load_32_1x64 (sa)); + msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); - *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md)); + *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); } } @@ -5790,10 +5349,9 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp, } } - _mm_empty (); } -/* A variant of 'core_combine_over_u_sse2' with minor tweaks */ +/* A variant of 'sse2_combine_over_u' with minor tweaks */ static force_inline void scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, const uint32_t* ps, @@ -5885,7 +5443,6 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, w--; } - _mm_empty (); } FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER, @@ -5927,13 +5484,13 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask, { uint32_t d = *dst; - __m64 ms = unpack_32_1x64 (s); - __m64 alpha = expand_alpha_1x64 (ms); - __m64 dest = _mm_movepi64_pi64 (xmm_mask); - __m64 alpha_dst = unpack_32_1x64 (d); + __m128i ms = unpack_32_1x128 (s); + __m128i alpha = expand_alpha_1x128 (ms); + __m128i dest = xmm_mask; + __m128i alpha_dst = unpack_32_1x128 (d); - *dst = pack_1x64_32 ( - in_over_1x64 (&ms, &alpha, &dest, &alpha_dst)); + *dst = pack_1x128_32 ( + in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); } dst++; w--; @@ -5985,20 +5542,19 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask, { uint32_t d = *dst; - __m64 ms = unpack_32_1x64 (s); - __m64 alpha = expand_alpha_1x64 (ms); - __m64 mask = _mm_movepi64_pi64 (xmm_mask); - __m64 dest = unpack_32_1x64 (d); + __m128i ms = unpack_32_1x128 (s); + __m128i alpha = expand_alpha_1x128 (ms); + __m128i mask = xmm_mask; + __m128i dest = unpack_32_1x128 (d); - *dst = pack_1x64_32 ( - in_over_1x64 (&ms, &alpha, &mask, &dest)); + *dst = pack_1x128_32 ( + in_over_1x128 (&ms, &alpha, &mask, &dest)); } dst++; w--; } - _mm_empty (); } FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER, @@ -6374,20 +5930,7 @@ _pixman_implementation_create_sse2 (pixman_implementation_t *fallback) mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000); mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000); - /* MMX constants */ - mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f); - mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840); - - mask_x0080 = create_mask_16_64 (0x0080); - mask_x00ff = create_mask_16_64 (0x00ff); - mask_x0101 = create_mask_16_64 (0x0101); - mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000); - - _mm_empty (); - /* Set up function pointers */ - - /* SSE code patch for fbcompose.c */ imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u; imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u; imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u; @@ -6420,5 +5963,3 @@ _pixman_implementation_create_sse2 (pixman_implementation_t *fallback) return imp; } - -#endif /* USE_SSE2 */ |