Diffstat (limited to 'pixman/pixman/pixman-sse2.c')
-rw-r--r-- | pixman/pixman/pixman-sse2.c | 7856
1 file changed, 4419 insertions, 3437 deletions
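The patch below is a coding-style and interface overhaul rather than an algorithmic change: camelCase names become underscore names (pixMultiply_2x128 becomes pix_multiply_2x128 and so on), the per-compiler `inline` overrides are replaced by pixman's force_inline, and the combiners gain an optional component-alpha mask pointer (pm) that the new combine1/combine4 helpers apply to the source before compositing. The channel arithmetic underneath is unchanged: 8-bit channels are widened into 16-bit lanes, multiplied, and divided by 255 exactly via the mask_0080/mask_0101 constants with _mm_adds_epu16 and _mm_mulhi_epu16. As a reference point, here is a minimal scalar sketch of that divide-by-255 trick; the helper name mul_un8 is ours, pixman's own equivalent being the MUL_UN8 macro from the pixman-combine32.h header this patch starts including:

#include <stdint.h>

/* Exact, rounded x*a/255: bias by 0x80, then use t/255 == (t + (t >> 8)) >> 8
 * for this range.  The SSE2 code computes the same thing in eight 16-bit
 * lanes at once: _mm_adds_epu16 (t, mask_0080) adds the bias, and
 * _mm_mulhi_epu16 (t, mask_0101) evaluates (t * 0x0101) >> 16, which is
 * identical to (t + (t >> 8)) >> 8.
 */
static uint8_t
mul_un8 (uint8_t x, uint8_t a)
{
    uint16_t t = (uint16_t) x * a + 0x80;
    return (uint8_t) ((t + (t >> 8)) >> 8);
}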
diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c
index 0f36436b6..2fa956e21 100644
--- a/pixman/pixman/pixman-sse2.c
+++ b/pixman/pixman/pixman-sse2.c
@@ -23,7 +23,7 @@
  *
  * Author:  Rodrigo Kumpera (kumpera@gmail.com)
  *          André Tupinambá (andrelrt@gmail.com)
- *
+ *
  * Based on work by Owen Taylor and Søren Sandmann
  */
 #ifdef HAVE_CONFIG_H
@@ -33,405 +33,478 @@
 #include <mmintrin.h>
 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
 #include <emmintrin.h> /* for SSE2 intrinsics */
-
-#include "pixman-sse2.h"
-
-#ifdef USE_SSE2
-
-#ifdef _MSC_VER
-#undef inline
-#define inline __forceinline
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+
+#if defined(_MSC_VER) && defined(_M_AMD64)
+/* Windows 64 doesn't allow MMX to be used, so
+ * the pixman-x64-mmx-emulation.h file contains
+ * implementations of those MMX intrinsics that
+ * are used in the SSE2 implementation.
+ */
+# include "pixman-x64-mmx-emulation.h"
 #endif

-#ifdef __GNUC__
-# define inline __inline__ __attribute__ ((__always_inline__))
-#endif
+#ifdef USE_SSE2

-/* -------------------------------------------------------------------------------------------------
+/* --------------------------------------------------------------------
  * Locals
  */

-static __m64 xMask0080;
-static __m64 xMask00ff;
-static __m64 xMask0101;
-static __m64 xMaskAlpha;
+static __m64 mask_x0080;
+static __m64 mask_x00ff;
+static __m64 mask_x0101;
+static __m64 mask_x_alpha;

-static __m64 xMask565rgb;
-static __m64 xMask565Unpack;
+static __m64 mask_x565_rgb;
+static __m64 mask_x565_unpack;

-static __m128i Mask0080;
-static __m128i Mask00ff;
-static __m128i Mask0101;
-static __m128i Maskffff;
-static __m128i Maskff000000;
-static __m128i MaskAlpha;
+static __m128i mask_0080;
+static __m128i mask_00ff;
+static __m128i mask_0101;
+static __m128i mask_ffff;
+static __m128i mask_ff000000;
+static __m128i mask_alpha;

-static __m128i Mask565r;
-static __m128i Mask565g1, Mask565g2;
-static __m128i Mask565b;
-static __m128i MaskRed;
-static __m128i MaskGreen;
-static __m128i MaskBlue;
+static __m128i mask_565_r;
+static __m128i mask_565_g1, mask_565_g2;
+static __m128i mask_565_b;
+static __m128i mask_red;
+static __m128i mask_green;
+static __m128i mask_blue;

-static __m128i Mask565FixRB;
-static __m128i Mask565FixG;
+static __m128i mask_565_fix_rb;
+static __m128i mask_565_fix_g;

-/* -------------------------------------------------------------------------------------------------
+/* ----------------------------------------------------------------------
  * SSE2 Inlines
  */
-static inline __m128i
+static force_inline __m128i
 unpack_32_1x128 (uint32_t data)
 {
-    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128());
+    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
 }

-static inline void
-unpack_128_2x128 (__m128i data, __m128i* dataLo, __m128i* dataHi)
+static force_inline void
+unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
 {
-    *dataLo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
-    *dataHi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
+    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
+    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
 }

-static inline __m128i
-unpack565to8888 (__m128i lo)
+static force_inline __m128i
+unpack_565_to_8888 (__m128i lo)
 {
     __m128i r, g, b, rb, t;
-
-    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), MaskRed);
-    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), MaskGreen);
-    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), MaskBlue);
+
+    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
+    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
+    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

     rb = _mm_or_si128 (r, b);
-    t = _mm_and_si128 (rb, Mask565FixRB);
+    t = _mm_and_si128 (rb, mask_565_fix_rb);
     t = _mm_srli_epi32 (t, 5);
     rb = _mm_or_si128 (rb, t);

-    t = _mm_and_si128 (g, Mask565FixG);
+    t = _mm_and_si128 (g, mask_565_fix_g);
     t = _mm_srli_epi32 (t, 6);
     g = _mm_or_si128 (g, t);
-
+
     return _mm_or_si128 (rb, g);
 }

-static inline void
-unpack565_128_4x128 (__m128i data, __m128i* data0, __m128i* data1, __m128i* data2, __m128i* data3)
+static force_inline void
+unpack_565_128_4x128 (__m128i  data,
+                      __m128i* data0,
+                      __m128i* data1,
+                      __m128i* data2,
+                      __m128i* data3)
 {
     __m128i lo, hi;

     lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
     hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

-    lo = unpack565to8888 (lo);
-    hi = unpack565to8888 (hi);
+    lo = unpack_565_to_8888 (lo);
+    hi = unpack_565_to_8888 (hi);

     unpack_128_2x128 (lo, data0, data1);
     unpack_128_2x128 (hi, data2, data3);
 }

-static inline uint16_t
-pack565_32_16 (uint32_t pixel)
+static force_inline uint16_t
+pack_565_32_16 (uint32_t pixel)
 {
-    return (uint16_t) (((pixel>>8) & 0xf800) | ((pixel>>5) & 0x07e0) | ((pixel>>3) & 0x001f));
+    return (uint16_t) (((pixel >> 8) & 0xf800) |
+                       ((pixel >> 5) & 0x07e0) |
+                       ((pixel >> 3) & 0x001f));
 }

-static inline __m128i
+static force_inline __m128i
 pack_2x128_128 (__m128i lo, __m128i hi)
 {
     return _mm_packus_epi16 (lo, hi);
 }

-static inline __m128i
-pack565_2x128_128 (__m128i lo, __m128i hi)
+static force_inline __m128i
+pack_565_2x128_128 (__m128i lo, __m128i hi)
 {
     __m128i data;
     __m128i r, g1, g2, b;

-    data = pack_2x128_128 ( lo, hi );
+    data = pack_2x128_128 (lo, hi);

-    r  = _mm_and_si128 (data , Mask565r);
-    g1 = _mm_and_si128 (_mm_slli_epi32 (data , 3), Mask565g1);
-    g2 = _mm_and_si128 (_mm_srli_epi32 (data , 5), Mask565g2);
-    b  = _mm_and_si128 (_mm_srli_epi32 (data , 3), Mask565b);
+    r  = _mm_and_si128 (data, mask_565_r);
+    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
+    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
+    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);

     return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
 }

-static inline __m128i
-pack565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
+static force_inline __m128i
+pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
 {
-    return _mm_packus_epi16 (pack565_2x128_128 (*xmm0, *xmm1), pack565_2x128_128 (*xmm2, *xmm3));
+    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
+                             pack_565_2x128_128 (*xmm2, *xmm3));
 }

-static inline uint32_t
-packAlpha (__m128i x)
+static force_inline int
+is_opaque (__m128i x)
 {
-    return _mm_cvtsi128_si32 (_mm_packus_epi16 (_mm_packus_epi16 (_mm_srli_epi32 (x, 24),
-                                                                  _mm_setzero_si128 ()),
-                                                _mm_setzero_si128 ()));
+    __m128i ffs = _mm_cmpeq_epi8 (x, x);
+
+    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
 }

-static inline __m128i
-expandPixel_32_1x128 (uint32_t data)
+static force_inline int
+is_zero (__m128i x)
 {
-    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE(1, 0, 1, 0));
+    return _mm_movemask_epi8 (
+	_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
 }

-static inline __m128i
-expandAlpha_1x128 (__m128i data)
+static force_inline int
+is_transparent (__m128i x)
 {
-    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
+    return (_mm_movemask_epi8 (
+	_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
 }

-static inline void
-expandAlpha_2x128 (__m128i dataLo, __m128i dataHi, __m128i* alphaLo, __m128i* alphaHi)
+static force_inline __m128i
+expand_pixel_32_1x128 (uint32_t data)
 {
-    __m128i lo, hi;
+    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
+}

-    lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(3, 3, 3, 3));
-    hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(3, 3, 3, 3));
-    *alphaLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 3, 3, 3));
-    *alphaHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 3, 3, 3));
+static force_inline __m128i
+expand_alpha_1x128 (__m128i data)
+{
+    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
+						     _MM_SHUFFLE (3, 3, 3, 3)),
+				_MM_SHUFFLE (3, 3, 3, 3));
 }

-static inline void
-expandAlphaRev_2x128 (__m128i dataLo, __m128i dataHi, __m128i* alphaLo, __m128i* alphaHi)
+static force_inline void
+expand_alpha_2x128 (__m128i  data_lo,
+                    __m128i  data_hi,
+                    __m128i* alpha_lo,
+                    __m128i* alpha_hi)
 {
     __m128i lo, hi;

-    lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(0, 0, 0, 0));
-    hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(0, 0, 0, 0));
-    *alphaLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(0, 0, 0, 0));
-    *alphaHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(0, 0, 0, 0));
+    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
+    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
+
+    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
+    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
 }

-static inline void
-pixMultiply_2x128 (__m128i* dataLo, __m128i* dataHi, __m128i* alphaLo, __m128i* alphaHi, __m128i* retLo, __m128i* retHi)
+static force_inline void
+expand_alpha_rev_2x128 (__m128i  data_lo,
+                        __m128i  data_hi,
+                        __m128i* alpha_lo,
+                        __m128i* alpha_hi)
 {
     __m128i lo, hi;

-    lo = _mm_mullo_epi16 (*dataLo, *alphaLo);
-    hi = _mm_mullo_epi16 (*dataHi, *alphaHi);
-    lo = _mm_adds_epu16 (lo, Mask0080);
-    hi = _mm_adds_epu16 (hi, Mask0080);
-    *retLo = _mm_mulhi_epu16 (lo, Mask0101);
-    *retHi = _mm_mulhi_epu16 (hi, Mask0101);
+    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
+    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
+    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
+    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
 }

-static inline void
-pixAddMultiply_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaDstLo, __m128i* alphaDstHi,
-                      __m128i* dstLo, __m128i* dstHi, __m128i* alphaSrcLo, __m128i* alphaSrcHi,
-                      __m128i* retLo, __m128i* retHi)
+static force_inline void
+pix_multiply_2x128 (__m128i* data_lo,
+                    __m128i* data_hi,
+                    __m128i* alpha_lo,
+                    __m128i* alpha_hi,
+                    __m128i* ret_lo,
+                    __m128i* ret_hi)
 {
     __m128i lo, hi;
-    __m128i mulLo, mulHi;

-    lo = _mm_mullo_epi16 (*srcLo, *alphaDstLo);
-    hi = _mm_mullo_epi16 (*srcHi, *alphaDstHi);
-    mulLo = _mm_mullo_epi16 (*dstLo, *alphaSrcLo);
-    mulHi = _mm_mullo_epi16 (*dstHi, *alphaSrcHi);
-    lo = _mm_adds_epu16 (lo, Mask0080);
-    hi = _mm_adds_epu16 (hi, Mask0080);
-    lo = _mm_adds_epu16 (lo, mulLo);
-    hi = _mm_adds_epu16 (hi, mulHi);
-    *retLo = _mm_mulhi_epu16 (lo, Mask0101);
-    *retHi = _mm_mulhi_epu16 (hi, Mask0101);
+    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
+    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
+    lo = _mm_adds_epu16 (lo, mask_0080);
+    hi = _mm_adds_epu16 (hi, mask_0080);
+    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
+    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
+}
+
+static force_inline void
+pix_add_multiply_2x128 (__m128i* src_lo,
+                        __m128i* src_hi,
+                        __m128i* alpha_dst_lo,
+                        __m128i* alpha_dst_hi,
+                        __m128i* dst_lo,
+                        __m128i* dst_hi,
+                        __m128i* alpha_src_lo,
+                        __m128i* alpha_src_hi,
+                        __m128i* ret_lo,
+                        __m128i* ret_hi)
+{
+    __m128i lo, hi;
+    __m128i mul_lo, mul_hi;
+
+    lo = _mm_mullo_epi16 (*src_lo, *alpha_dst_lo);
+    hi = _mm_mullo_epi16 (*src_hi, *alpha_dst_hi);
+    mul_lo = _mm_mullo_epi16 (*dst_lo, *alpha_src_lo);
+    mul_hi = _mm_mullo_epi16 (*dst_hi, *alpha_src_hi);
+    lo = _mm_adds_epu16 (lo, mask_0080);
+    hi = _mm_adds_epu16 (hi, mask_0080);
+    lo = _mm_adds_epu16 (lo, mul_lo);
+    hi = _mm_adds_epu16 (hi, mul_hi);
+    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
+    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
 }

-static inline void
-negate_2x128 (__m128i dataLo, __m128i dataHi, __m128i* negLo, __m128i* negHi)
+static force_inline void
+negate_2x128 (__m128i  data_lo,
+              __m128i  data_hi,
+              __m128i* neg_lo,
+              __m128i* neg_hi)
 {
-    *negLo = _mm_xor_si128 (dataLo, Mask00ff);
-    *negHi = _mm_xor_si128 (dataHi, Mask00ff);
+    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
+    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
 }

-static inline void
-invertColors_2x128 (__m128i dataLo, __m128i dataHi, __m128i* invLo, __m128i* invHi)
+static force_inline void
+invert_colors_2x128 (__m128i  data_lo,
+                     __m128i  data_hi,
+                     __m128i* inv_lo,
+                     __m128i* inv_hi)
 {
     __m128i lo, hi;

-    lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(3, 0, 1, 2));
-    hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(3, 0, 1, 2));
-    *invLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 0, 1, 2));
-    *invHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 0, 1, 2));
+    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
+    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
+    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
+    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
 }

-static inline void
-over_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaLo, __m128i* alphaHi, __m128i* dstLo, __m128i* dstHi)
+static force_inline void
+over_2x128 (__m128i* src_lo,
+            __m128i* src_hi,
+            __m128i* alpha_lo,
+            __m128i* alpha_hi,
+            __m128i* dst_lo,
+            __m128i* dst_hi)
 {
     __m128i t1, t2;

-    negate_2x128 (*alphaLo, *alphaHi, &t1, &t2);
+    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

-    pixMultiply_2x128 (dstLo, dstHi, &t1, &t2, dstLo, dstHi);
+    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

-    *dstLo = _mm_adds_epu8 (*srcLo, *dstLo);
-    *dstHi = _mm_adds_epu8 (*srcHi, *dstHi);
+    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
+    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
 }

-static inline void
-overRevNonPre_2x128 (__m128i srcLo, __m128i srcHi, __m128i* dstLo, __m128i* dstHi)
+static force_inline void
+over_rev_non_pre_2x128 (__m128i  src_lo,
+                        __m128i  src_hi,
+                        __m128i* dst_lo,
+                        __m128i* dst_hi)
 {
     __m128i lo, hi;
-    __m128i alphaLo, alphaHi;
+    __m128i alpha_lo, alpha_hi;

-    expandAlpha_2x128 (srcLo, srcHi, &alphaLo, &alphaHi);
+    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

-    lo = _mm_or_si128 (alphaLo, MaskAlpha);
-    hi = _mm_or_si128 (alphaHi, MaskAlpha);
+    lo = _mm_or_si128 (alpha_lo, mask_alpha);
+    hi = _mm_or_si128 (alpha_hi, mask_alpha);

-    invertColors_2x128 (srcLo, srcHi, &srcLo, &srcHi);
+    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

-    pixMultiply_2x128 (&srcLo, &srcHi, &lo, &hi, &lo, &hi);
+    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

-    over_2x128 (&lo, &hi, &alphaLo, &alphaHi, dstLo, dstHi);
+    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
 }

-static inline void
-inOver_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaLo, __m128i* alphaHi,
-              __m128i* maskLo, __m128i* maskHi, __m128i* dstLo, __m128i* dstHi)
+static force_inline void
+in_over_2x128 (__m128i* src_lo,
+               __m128i* src_hi,
+               __m128i* alpha_lo,
+               __m128i* alpha_hi,
+               __m128i* mask_lo,
+               __m128i* mask_hi,
+               __m128i* dst_lo,
+               __m128i* dst_hi)
 {
-    __m128i sLo, sHi;
-    __m128i aLo, aHi;
+    __m128i s_lo, s_hi;
+    __m128i a_lo, a_hi;

-    pixMultiply_2x128 (  srcLo,   srcHi, maskLo, maskHi, &sLo, &sHi);
-    pixMultiply_2x128 (alphaLo, alphaHi, maskLo, maskHi, &aLo, &aHi);
+    pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
+    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);

-    over_2x128 (&sLo, &sHi, &aLo, &aHi, dstLo, dstHi);
+    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
 }

-static inline void
-cachePrefetch (__m128i* addr)
+static force_inline void
+cache_prefetch (__m128i* addr)
 {
     _mm_prefetch (addr, _MM_HINT_T0);
 }

-static inline void
-cachePrefetchNext (__m128i* addr)
+static force_inline void
+cache_prefetch_next (__m128i* addr)
 {
-    _mm_prefetch (addr + 4, _MM_HINT_T0); // 64 bytes ahead
+    _mm_prefetch (addr + 4, _MM_HINT_T0); /* 64 bytes ahead */
 }

 /* load 4 pixels from a 16-byte boundary aligned address */
-static inline __m128i
-load128Aligned (__m128i* src)
+static force_inline __m128i
+load_128_aligned (__m128i* src)
 {
     return _mm_load_si128 (src);
 }

 /* load 4 pixels from an unaligned address */
-static inline __m128i
-load128Unaligned (__m128i* src)
+static force_inline __m128i
+load_128_unaligned (const __m128i* src)
 {
     return _mm_loadu_si128 (src);
 }

-/* save 4 pixels using Write Combining memory on a 16-byte boundary aligned address */
-static inline void
-save128WriteCombining (__m128i* dst, __m128i data)
+/* save 4 pixels using Write Combining memory on a 16-byte
+ * boundary aligned address
+ */
+static force_inline void
+save_128_write_combining (__m128i* dst,
+                          __m128i  data)
 {
     _mm_stream_si128 (dst, data);
 }

 /* save 4 pixels on a 16-byte boundary aligned address */
-static inline void
-save128Aligned (__m128i* dst, __m128i data)
+static force_inline void
+save_128_aligned (__m128i* dst,
+                  __m128i  data)
 {
     _mm_store_si128 (dst, data);
 }

 /* save 4 pixels on an unaligned address */
-static inline void
-save128Unaligned (__m128i* dst, __m128i data)
+static force_inline void
+save_128_unaligned (__m128i* dst,
+                    __m128i  data)
 {
     _mm_storeu_si128 (dst, data);
 }

-/* -------------------------------------------------------------------------------------------------
+/* ------------------------------------------------------------------
  * MMX inlines
  */

-static inline __m64
+static force_inline __m64
 unpack_32_1x64 (uint32_t data)
 {
-    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64());
+    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64 ());
 }

-static inline __m64
-expandAlpha_1x64 (__m64 data)
+static force_inline __m64
+expand_alpha_1x64 (__m64 data)
 {
-    return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 3, 3, 3));
+    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
 }

-static inline __m64
-expandAlphaRev_1x64 (__m64 data)
+static force_inline __m64
+expand_alpha_rev_1x64 (__m64 data)
 {
-    return _mm_shuffle_pi16 (data, _MM_SHUFFLE(0, 0, 0, 0));
+    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
 }

-static inline __m64
-expandPixel_8_1x64 (uint8_t data)
+static force_inline __m64
+expand_pixel_8_1x64 (uint8_t data)
 {
-    return _mm_shuffle_pi16 (unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE(0, 0, 0, 0));
+    return _mm_shuffle_pi16 (
+	unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
 }

-static inline __m64
-pixMultiply_1x64 (__m64 data, __m64 alpha)
+static force_inline __m64
+pix_multiply_1x64 (__m64 data,
+                   __m64 alpha)
 {
     return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
-                                          xMask0080),
-                           xMask0101);
+                                          mask_x0080),
+                           mask_x0101);
 }

-static inline __m64
-pixAddMultiply_1x64 (__m64* src, __m64* alphaDst, __m64* dst, __m64* alphaSrc)
+static force_inline __m64
+pix_add_multiply_1x64 (__m64* src,
+                       __m64* alpha_dst,
+                       __m64* dst,
+                       __m64* alpha_src)
 {
-    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (*src, *alphaDst),
-                                                         xMask0080),
-                                          _mm_mullo_pi16 (*dst, *alphaSrc)),
-                           xMask0101);
+    return _mm_mulhi_pu16 (
+	_mm_adds_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (*src, *alpha_dst),
+				      mask_x0080),
+		       _mm_mullo_pi16 (*dst, *alpha_src)),
+	mask_x0101);
 }

-static inline __m64
+static force_inline __m64
 negate_1x64 (__m64 data)
 {
-    return _mm_xor_si64 (data, xMask00ff);
+    return _mm_xor_si64 (data, mask_x00ff);
 }

-static inline __m64
-invertColors_1x64 (__m64 data)
+static force_inline __m64
+invert_colors_1x64 (__m64 data)
 {
-    return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 0, 1, 2));
+    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
 }

-static inline __m64
+static force_inline __m64
 over_1x64 (__m64 src, __m64 alpha, __m64 dst)
 {
-    return _mm_adds_pu8 (src, pixMultiply_1x64 (dst, negate_1x64 (alpha)));
+    return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
 }

-static inline __m64
-inOver_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
+static force_inline __m64
+in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
 {
-    return over_1x64 (pixMultiply_1x64 (*src, *mask),
-                      pixMultiply_1x64 (*alpha, *mask),
+    return over_1x64 (pix_multiply_1x64 (*src, *mask),
+                      pix_multiply_1x64 (*alpha, *mask),
                       *dst);
 }

-static inline __m64
-overRevNonPre_1x64 (__m64 src, __m64 dst)
+static force_inline __m64
+over_rev_non_pre_1x64 (__m64 src, __m64 dst)
 {
-    __m64 alpha = expandAlpha_1x64 (src);
+    __m64 alpha = expand_alpha_1x64 (src);

-    return over_1x64 (pixMultiply_1x64 (invertColors_1x64 (src),
-                                        _mm_or_si64 (alpha, xMaskAlpha)),
+    return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
+                                         _mm_or_si64 (alpha, mask_x_alpha)),
                       alpha,
                       dst);
 }

-static inline uint32_t
-pack_1x64_32( __m64 data )
+static force_inline uint32_t
+pack_1x64_32 (__m64 data)
 {
-    return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64()));
+    return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
 }

 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
@@ -448,7 +521,7 @@ pack_1x64_32( __m64 data )
  * Note the trick here - the top word is shifted by another nibble to
  * avoid it bumping into the middle word
  */
-static inline __m64
+static force_inline __m64
 expand565_16_1x64 (uint16_t pixel)
 {
     __m64 p;
@@ -461,702 +534,965 @@ expand565_16_1x64 (uint16_t pixel)
     p = _mm_or_si64 (t1, p);
     p = _mm_or_si64 (t2, p);

-    p = _mm_and_si64 (p, xMask565rgb);
-    p = _mm_mullo_pi16 (p, xMask565Unpack);
+    p = _mm_and_si64 (p, mask_x565_rgb);
+    p = _mm_mullo_pi16 (p, mask_x565_unpack);

     return _mm_srli_pi16 (p, 8);
 }

-/* -------------------------------------------------------------------------------------------------
+/* ----------------------------------------------------------------------------
  * Compose Core transformations
  */
-static inline uint32_t
-coreCombineOverUPixelsse2 (uint32_t src, uint32_t dst)
+static force_inline uint32_t
+core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
 {
-    uint8_t a;
-    __m64 ms;
+    uint8_t a;
+    __m64 ms;

     a = src >> 24;

     if (a == 0xff)
     {
-        return src;
+	return src;
     }
-    else if (a)
+    else if (src)
     {
-        ms = unpack_32_1x64 (src);
-        return pack_1x64_32 (over_1x64 (ms, expandAlpha_1x64 (ms), unpack_32_1x64 (dst)));
+	ms = unpack_32_1x64 (src);
+	return pack_1x64_32 (
+	    over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
     }

     return dst;
 }

-static inline void
-coreCombineOverUsse2 (uint32_t* pd, const uint32_t* ps, int w)
+static force_inline uint32_t
+combine1 (const uint32_t *ps, const uint32_t *pm)
 {
-    uint32_t pa;
-    uint32_t s, d;
+    uint32_t s = *ps;

-    __m128i xmmDstLo, xmmDstHi;
-    __m128i xmmSrcLo, xmmSrcHi;
-    __m128i xmmAlphaLo, xmmAlphaHi;
+    if (pm)
+    {
+	__m64 ms, mm;

-    /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
+	mm = unpack_32_1x64 (*pm);
+	mm = expand_alpha_1x64 (mm);

-    /* Align dst on a 16-byte boundary */
-    while (w &&
-           ((unsigned long)pd & 15))
+	ms = unpack_32_1x64 (s);
+	ms = pix_multiply_1x64 (ms, mm);
+
+	s = pack_1x64_32 (ms);
+    }
+
+    return s;
+}
+
+static force_inline __m128i
+combine4 (const __m128i *ps, const __m128i *pm)
+{
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_msk_lo, xmm_msk_hi;
+    __m128i s;
+
+    if (pm)
     {
-        d = *pd;
-        s = *ps++;
+	xmm_msk_lo = load_128_unaligned (pm);

-        *pd++ = coreCombineOverUPixelsse2 (s, d);
-        w--;
+	if (is_transparent (xmm_msk_lo))
+	    return _mm_setzero_si128 ();
     }

-    /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
+    s = load_128_unaligned (ps);

-    while (w >= 4)
+    if (pm)
     {
-        /* fill cache line with next memory */
-        cachePrefetchNext ((__m128i*)ps);
-        cachePrefetchNext ((__m128i*)pd);
+	unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);

-        /* I'm loading unaligned because I'm not sure about the address alignment. */
-        xmmSrcHi = load128Unaligned ((__m128i*) ps);
+	expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);

-        /* Check the alpha channel */
-        pa = packAlpha (xmmSrcHi);
+	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+			    &xmm_msk_lo, &xmm_msk_hi,
+			    &xmm_src_lo, &xmm_src_hi);

-        if (pa == 0xffffffff)
-        {
-            save128Aligned ((__m128i*)pd, xmmSrcHi);
-        }
-        else if (pa)
-        {
-            xmmDstHi = load128Aligned ((__m128i*) pd);
+	s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
+    }

-            unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
-            unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+    return s;
+}

-            expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
+static force_inline void
+core_combine_over_u_sse2 (uint32_t*       pd,
+                          const uint32_t* ps,
+                          const uint32_t* pm,
+                          int             w)
+{
+    uint32_t s, d;

-            over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_alpha_lo, xmm_alpha_hi;

-            /* rebuild the 4 pixel data and save */
-            save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-        }
+    /* call prefetch hint to optimize cache load*/
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

-        w -= 4;
-        ps += 4;
-        pd += 4;
+    /* Align dst on a 16-byte boundary */
+    while (w && ((unsigned long)pd & 15))
+    {
+	d = *pd;
+	s = combine1 (ps, pm);
+
+	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
+	ps++;
+	if (pm)
+	    pm++;
+	w--;
+    }
+
+    /* call prefetch hint to optimize cache load*/
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);
+
+    while (w >= 4)
+    {
+	/* fill cache line with next memory */
+	cache_prefetch_next ((__m128i*)ps);
+	cache_prefetch_next ((__m128i*)pd);
+	cache_prefetch_next ((__m128i*)pm);
+
+	/* I'm loading unaligned because I'm not sure about
+	 * the address alignment.
+	 */
+	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+
+	if (is_opaque (xmm_src_hi))
+	{
+	    save_128_aligned ((__m128i*)pd, xmm_src_hi);
+	}
+	else if (!is_zero (xmm_src_hi))
+	{
+	    xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	    unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+	    expand_alpha_2x128 (
+		xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+
+	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
+			&xmm_alpha_lo, &xmm_alpha_hi,
+			&xmm_dst_lo, &xmm_dst_hi);
+
+	    /* rebuild the 4 pixel data and save */
+	    save_128_aligned ((__m128i*)pd,
+			      pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	}
+
+	w -= 4;
+	ps += 4;
+	pd += 4;
+	if (pm)
+	    pm += 4;
     }

     while (w)
     {
-        d = *pd;
-        s = *ps++;
+	d = *pd;
+	s = combine1 (ps, pm);
+
+	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
+	ps++;
+	if (pm)
+	    pm++;

-        *pd++ = coreCombineOverUPixelsse2 (s, d);
-        w--;
+	w--;
     }
 }

-static inline void
-coreCombineOverReverseUsse2 (uint32_t* pd, const uint32_t* ps, int w)
+static force_inline void
+core_combine_over_reverse_u_sse2 (uint32_t*       pd,
+                                  const uint32_t* ps,
+                                  const uint32_t* pm,
+                                  int             w)
 {
     uint32_t s, d;

-    __m128i xmmDstLo, xmmDstHi;
-    __m128i xmmSrcLo, xmmSrcHi;
-    __m128i xmmAlphaLo, xmmAlphaHi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_alpha_lo, xmm_alpha_hi;

     /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

     /* Align dst on a 16-byte boundary */
     while (w &&
            ((unsigned long)pd & 15))
     {
-        d = *pd;
-        s = *ps++;
-
-        *pd++ = coreCombineOverUPixelsse2 (d, s);
-        w--;
+	d = *pd;
+	s = combine1 (ps, pm);
+
+	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
     }

     /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cachePrefetchNext ((__m128i*)ps);
-        cachePrefetchNext ((__m128i*)pd);
+	/* fill cache line with next memory */
+	cache_prefetch_next ((__m128i*)ps);
+	cache_prefetch_next ((__m128i*)pd);
+	cache_prefetch_next ((__m128i*)pm);

-        /* I'm loading unaligned because I'm not sure about the address alignment. */
-        xmmSrcHi = load128Unaligned ((__m128i*) ps);
-        xmmDstHi = load128Aligned ((__m128i*) pd);
+	/* I'm loading unaligned because I'm not sure
+	 * about the address alignment.
+	 */
+	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+	xmm_dst_hi = load_128_aligned ((__m128i*) pd);

-        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
-        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

-        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+			    &xmm_alpha_lo, &xmm_alpha_hi);

-        over_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmSrcLo, &xmmSrcHi);
+	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+		    &xmm_alpha_lo, &xmm_alpha_hi,
+		    &xmm_src_lo, &xmm_src_hi);

-        /* rebuild the 4 pixel data and save */
-        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmSrcLo, xmmSrcHi));
+	/* rebuild the 4 pixel data and save */
+	save_128_aligned ((__m128i*)pd,
+			  pack_2x128_128 (xmm_src_lo, xmm_src_hi));

-        w -= 4;
-        ps += 4;
-        pd += 4;
+	w -= 4;
+	ps += 4;
+	pd += 4;
+
+	if (pm)
+	    pm += 4;
     }

     while (w)
     {
-        d = *pd;
-        s = *ps++;
-
-        *pd++ = coreCombineOverUPixelsse2 (d, s);
-        w--;
+	d = *pd;
+	s = combine1 (ps, pm);
+
+	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
+	ps++;
+	w--;
+	if (pm)
+	    pm++;
     }
 }

-static inline uint32_t
-coreCombineInUPixelsse2 (uint32_t src, uint32_t dst)
+static force_inline uint32_t
+core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
 {
     uint32_t maska = src >> 24;

     if (maska == 0)
     {
-        return 0;
+	return 0;
     }
     else if (maska != 0xff)
     {
-        return pack_1x64_32(pixMultiply_1x64 (unpack_32_1x64 (dst), expandAlpha_1x64 (unpack_32_1x64 (src))));
+	return pack_1x64_32 (
+	    pix_multiply_1x64 (unpack_32_1x64 (dst),
+			       expand_alpha_1x64 (unpack_32_1x64 (src))));
     }

     return dst;
 }

-static inline void
-coreCombineInUsse2 (uint32_t* pd, const uint32_t* ps, int w)
+static force_inline void
+core_combine_in_u_sse2 (uint32_t*       pd,
+                        const uint32_t* ps,
+                        const uint32_t* pm,
+                        int             w)
 {
     uint32_t s, d;

-    __m128i xmmSrcLo, xmmSrcHi;
-    __m128i xmmDstLo, xmmDstHi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;

     /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

     while (w && ((unsigned long) pd & 15))
     {
-        s = *ps++;
-        d = *pd;
-
-        *pd++ = coreCombineInUPixelsse2 (d, s);
-        w--;
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_in_u_pixelsse2 (d, s);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
     }

     /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cachePrefetchNext ((__m128i*)ps);
-        cachePrefetchNext ((__m128i*)pd);
-
-        xmmDstHi = load128Aligned ((__m128i*) pd);
-        xmmSrcHi = load128Unaligned ((__m128i*) ps);
-
-        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
-        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);
-
-        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
-        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);
-
-        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
-        ps += 4;
-        pd += 4;
-        w -= 4;
+	/* fill cache line with next memory */
+	cache_prefetch_next ((__m128i*)ps);
+	cache_prefetch_next ((__m128i*)pd);
+	cache_prefetch_next ((__m128i*)pm);
+
+	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
+
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+			    &xmm_dst_lo, &xmm_dst_hi,
+			    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned ((__m128i*)pd,
+			  pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	w -= 4;
+	if (pm)
+	    pm += 4;
     }

     while (w)
     {
-        s = *ps++;
-        d = *pd;
-
-        *pd++ = coreCombineInUPixelsse2 (d, s);
-        w--;
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_in_u_pixelsse2 (d, s);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
     }
 }

-static inline void
-coreCombineReverseInUsse2 (uint32_t* pd, const uint32_t* ps, int w)
+static force_inline void
+core_combine_reverse_in_u_sse2 (uint32_t*       pd,
+                                const uint32_t* ps,
+                                const uint32_t* pm,
+                                int             w)
 {
     uint32_t s, d;

-    __m128i xmmSrcLo, xmmSrcHi;
-    __m128i xmmDstLo, xmmDstHi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;

     /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

     while (w && ((unsigned long) pd & 15))
     {
-        s = *ps++;
-        d = *pd;
-
-        *pd++ = coreCombineInUPixelsse2 (s, d);
-        w--;
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_in_u_pixelsse2 (s, d);
+	ps++;
+	w--;
+	if (pm)
+	    pm++;
     }

     /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cachePrefetchNext ((__m128i*)ps);
-        cachePrefetchNext ((__m128i*)pd);
-
-        xmmDstHi = load128Aligned ((__m128i*) pd);
-        xmmSrcHi = load128Unaligned ((__m128i*) ps);
-
-        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
-        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
-
-        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
-        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi);
-
-        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
-        ps += 4;
-        pd += 4;
-        w -= 4;
+	/* fill cache line with next memory */
+	cache_prefetch_next ((__m128i*)ps);
+	cache_prefetch_next ((__m128i*)pd);
+	cache_prefetch_next ((__m128i*)pm);
+
+	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
+
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+			    &xmm_src_lo, &xmm_src_hi,
+			    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	w -= 4;
+	if (pm)
+	    pm += 4;
     }

     while (w)
     {
-        s = *ps++;
-        d = *pd;
-
-        *pd++ = coreCombineInUPixelsse2 (s, d);
-        w--;
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_in_u_pixelsse2 (s, d);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
     }
 }

-static inline void
-coreCombineReverseOutUsse2 (uint32_t* pd, const uint32_t* ps, int w)
+static force_inline void
+core_combine_reverse_out_u_sse2 (uint32_t*       pd,
+                                 const uint32_t* ps,
+                                 const uint32_t* pm,
+                                 int             w)
 {
     /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

     while (w && ((unsigned long) pd & 15))
     {
-        uint32_t s = *ps++;
-        uint32_t d = *pd;
-
-        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (s)))));
-        w--;
+	uint32_t s = combine1 (ps, pm);
+	uint32_t d = *pd;
+
+	*pd++ = pack_1x64_32 (
+	    pix_multiply_1x64 (
+		unpack_32_1x64 (d), negate_1x64 (
+		    expand_alpha_1x64 (unpack_32_1x64 (s)))));
+
+	if (pm)
+	    pm++;
+	ps++;
+	w--;
     }

     /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

     while (w >= 4)
     {
-        __m128i xmmSrcLo, xmmSrcHi;
-        __m128i xmmDstLo, xmmDstHi;
+	__m128i xmm_src_lo, xmm_src_hi;
+	__m128i xmm_dst_lo, xmm_dst_hi;
+
+	/* fill cache line with next memory */
+	cache_prefetch_next ((__m128i*)ps);
+	cache_prefetch_next ((__m128i*)pd);
+	cache_prefetch_next ((__m128i*)pm);

-        /* fill cache line with next memory */
-        cachePrefetchNext ((__m128i*)ps);
-        cachePrefetchNext ((__m128i*)pd);
+	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+	xmm_dst_hi = load_128_aligned ((__m128i*) pd);

-        xmmSrcHi = load128Unaligned ((__m128i*) ps);
-        xmmDstHi = load128Aligned ((__m128i*) pd);
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

-        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
-        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

-        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
-        negate_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
+	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+			    &xmm_src_lo, &xmm_src_hi,
+			    &xmm_dst_lo, &xmm_dst_hi);

-        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi);
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

-        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+	ps += 4;
+	pd += 4;
+	if (pm)
+	    pm += 4;

-        ps += 4;
-        pd += 4;
-        w -= 4;
+	w -= 4;
     }

     while (w)
     {
-        uint32_t s = *ps++;
-        uint32_t d = *pd;
-
-        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (s)))));
-        w--;
+	uint32_t s = combine1 (ps, pm);
+	uint32_t d = *pd;
+
+	*pd++ = pack_1x64_32 (
+	    pix_multiply_1x64 (
+		unpack_32_1x64 (d), negate_1x64 (
+		    expand_alpha_1x64 (unpack_32_1x64 (s)))));
+	ps++;
+	if (pm)
+	    pm++;
+	w--;
     }
 }

-static inline void
-coreCombineOutUsse2 (uint32_t* pd, const uint32_t* ps, int w)
+static force_inline void
+core_combine_out_u_sse2 (uint32_t*       pd,
+                         const uint32_t* ps,
+                         const uint32_t* pm,
+                         int             w)
 {
     /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

     while (w && ((unsigned long) pd & 15))
     {
-        uint32_t s = *ps++;
-        uint32_t d = *pd;
-
-        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
-        w--;
+	uint32_t s = combine1 (ps, pm);
+	uint32_t d = *pd;
+
+	*pd++ = pack_1x64_32 (
+	    pix_multiply_1x64 (
+		unpack_32_1x64 (s), negate_1x64 (
+		    expand_alpha_1x64 (unpack_32_1x64 (d)))));
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
     }

     /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

     while (w >= 4)
     {
-        __m128i xmmSrcLo, xmmSrcHi;
-        __m128i xmmDstLo, xmmDstHi;
+	__m128i xmm_src_lo, xmm_src_hi;
+	__m128i xmm_dst_lo, xmm_dst_hi;

-        /* fill cache line with next memory */
-        cachePrefetchNext ((__m128i*)ps);
-        cachePrefetchNext ((__m128i*)pd);
+	/* fill cache line with next memory */
+	cache_prefetch_next ((__m128i*)ps);
+	cache_prefetch_next ((__m128i*)pd);
+	cache_prefetch_next ((__m128i*)pm);

-        xmmSrcHi = load128Unaligned ((__m128i*) ps);
-        xmmDstHi = load128Aligned ((__m128i*) pd);
+	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
+	xmm_dst_hi = load_128_aligned ((__m128i*) pd);

-        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
-        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

-        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);
-        negate_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+	negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

-        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);
+	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+			    &xmm_dst_lo, &xmm_dst_hi,
+			    &xmm_dst_lo, &xmm_dst_hi);

-        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

-        ps += 4;
-        pd += 4;
-        w -= 4;
+	ps += 4;
+	pd += 4;
+	w -= 4;
+	if (pm)
+	    pm += 4;
     }

     while (w)
     {
-        uint32_t s = *ps++;
-        uint32_t d = *pd;
-
-        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
-        w--;
+	uint32_t s = combine1 (ps, pm);
+	uint32_t d = *pd;
+
+	*pd++ = pack_1x64_32 (
+	    pix_multiply_1x64 (
+		unpack_32_1x64 (s), negate_1x64 (
+		    expand_alpha_1x64 (unpack_32_1x64 (d)))));
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
     }
 }

-static inline uint32_t
-coreCombineAtopUPixelsse2 (uint32_t src, uint32_t dst)
+static force_inline uint32_t
+core_combine_atop_u_pixel_sse2 (uint32_t src,
+                                uint32_t dst)
 {
     __m64 s = unpack_32_1x64 (src);
     __m64 d = unpack_32_1x64 (dst);

-    __m64 sa = negate_1x64 (expandAlpha_1x64 (s));
-    __m64 da = expandAlpha_1x64 (d);
+    __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
+    __m64 da = expand_alpha_1x64 (d);

-    return pack_1x64_32 (pixAddMultiply_1x64 (&s, &da, &d, &sa));
+    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
 }

-static inline void
-coreCombineAtopUsse2 (uint32_t* pd, const uint32_t* ps, int w)
+static force_inline void
+core_combine_atop_u_sse2 (uint32_t*       pd,
+                          const uint32_t* ps,
+                          const uint32_t* pm,
+                          int             w)
 {
     uint32_t s, d;

-    __m128i xmmSrcLo, xmmSrcHi;
-    __m128i xmmDstLo, xmmDstHi;
-    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
-    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

     /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

     while (w && ((unsigned long) pd & 15))
     {
-        s = *ps++;
-        d = *pd;
-
-        *pd++ = coreCombineAtopUPixelsse2 (s, d);
-        w--;
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
     }

     /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cachePrefetchNext ((__m128i*)ps);
-        cachePrefetchNext ((__m128i*)pd);
-
-        xmmSrcHi = load128Unaligned ((__m128i*) ps);
-        xmmDstHi = load128Aligned ((__m128i*) pd);
-
-        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
-        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
-
-        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
-        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
-
-        negate_2x128 (xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
-
-        pixAddMultiply_2x128 ( &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
-                               &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
-                               &xmmDstLo, &xmmDstHi );
-
-        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
-        ps += 4;
-        pd += 4;
-        w -= 4;
+	/* fill cache line with next memory */
+	cache_prefetch_next ((__m128i*)ps);
+	cache_prefetch_next ((__m128i*)pd);
+	cache_prefetch_next ((__m128i*)pm);
+
+	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
+		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+
+	pix_add_multiply_2x128 (
+	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+	    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	w -= 4;
+	if (pm)
+	    pm += 4;
     }

     while (w)
     {
-        s = *ps++;
-        d = *pd;
-
-        *pd++ = coreCombineAtopUPixelsse2 (s, d);
-        w--;
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
     }
 }

-static inline uint32_t
-coreCombineReverseAtopUPixelsse2 (uint32_t src, uint32_t dst)
+static force_inline uint32_t
+core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
+                                        uint32_t dst)
 {
     __m64 s = unpack_32_1x64 (src);
     __m64 d = unpack_32_1x64 (dst);

-    __m64 sa = expandAlpha_1x64 (s);
-    __m64 da = negate_1x64 (expandAlpha_1x64 (d));
+    __m64 sa = expand_alpha_1x64 (s);
+    __m64 da = negate_1x64 (expand_alpha_1x64 (d));

-    return pack_1x64_32 (pixAddMultiply_1x64 (&s, &da, &d, &sa));
+    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
 }

-static inline void
-coreCombineReverseAtopUsse2 (uint32_t* pd, const uint32_t* ps, int w)
+static force_inline void
+core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
+                                  const uint32_t* ps,
+                                  const uint32_t* pm,
+                                  int             w)
 {
     uint32_t s, d;

-    __m128i xmmSrcLo, xmmSrcHi;
-    __m128i xmmDstLo, xmmDstHi;
-    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
-    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

     /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

     while (w && ((unsigned long) pd & 15))
     {
-        s = *ps++;
-        d = *pd;
-
-        *pd++ = coreCombineReverseAtopUPixelsse2 (s, d);
-        w--;
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
+	ps++;
+	w--;
+	if (pm)
+	    pm++;
     }

     /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cachePrefetchNext ((__m128i*)ps);
-        cachePrefetchNext ((__m128i*)pd);
-
-        xmmSrcHi = load128Unaligned ((__m128i*) ps);
-        xmmDstHi = load128Aligned ((__m128i*) pd);
-
-        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
-        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
-
-        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
-        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
-
-        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
-
-        pixAddMultiply_2x128 ( &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
-                               &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
-                               &xmmDstLo, &xmmDstHi );
-
-        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
-        ps += 4;
-        pd += 4;
-        w -= 4;
+	/* fill cache line with next memory */
+	cache_prefetch_next ((__m128i*)ps);
+	cache_prefetch_next ((__m128i*)pd);
+	cache_prefetch_next ((__m128i*)pm);
+
+	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+	pix_add_multiply_2x128 (
+	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+	    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	w -= 4;
+	if (pm)
+	    pm += 4;
     }

     while (w)
     {
-        s = *ps++;
-        d = *pd;
-
-        *pd++ = coreCombineReverseAtopUPixelsse2 (s, d);
-        w--;
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
+	ps++;
+	w--;
+	if (pm)
+	    pm++;
     }
 }

-static inline uint32_t
-coreCombineXorUPixelsse2 (uint32_t src, uint32_t dst)
+static force_inline uint32_t
+core_combine_xor_u_pixel_sse2 (uint32_t src,
+                               uint32_t dst)
 {
     __m64 s = unpack_32_1x64 (src);
     __m64 d = unpack_32_1x64 (dst);

-    __m64 negD = negate_1x64 (expandAlpha_1x64 (d));
-    __m64 negS = negate_1x64 (expandAlpha_1x64 (s));
+    __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
+    __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));

-    return pack_1x64_32 (pixAddMultiply_1x64 (&s, &negD, &d, &negS));
+    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
 }

-static inline void
-coreCombineXorUsse2 (uint32_t* dst, const uint32_t* src, int width)
+static force_inline void
+core_combine_xor_u_sse2 (uint32_t*       dst,
+                         const uint32_t* src,
+                         const uint32_t* mask,
+                         int             width)
 {
     int w = width;
     uint32_t s, d;
     uint32_t* pd = dst;
     const uint32_t* ps = src;
+    const uint32_t* pm = mask;

-    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
-    __m128i xmmDst, xmmDstLo, xmmDstHi;
-    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
-    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

     /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

     while (w && ((unsigned long) pd & 15))
     {
-        s = *ps++;
-        d = *pd;
-
-        *pd++ = coreCombineXorUPixelsse2 (s, d);
-        w--;
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
     }

     /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cachePrefetchNext ((__m128i*)ps);
-        cachePrefetchNext ((__m128i*)pd);
-
-        xmmSrc = load128Unaligned ((__m128i*) ps);
-        xmmDst = load128Aligned ((__m128i*) pd);
-
-        unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
-        unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
-
-        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
-        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
-
-        negate_2x128 (xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
-        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
-
-        pixAddMultiply_2x128 ( &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
-                               &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
-                               &xmmDstLo, &xmmDstHi );
-
-        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
-        ps += 4;
-        pd += 4;
-        w -= 4;
+	/* fill cache line with next memory */
+	cache_prefetch_next ((__m128i*)ps);
+	cache_prefetch_next ((__m128i*)pd);
+	cache_prefetch_next ((__m128i*)pm);
+
+	xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
+	xmm_dst = load_128_aligned ((__m128i*) pd);
+
+	unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
+		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+	pix_add_multiply_2x128 (
+	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+	    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	w -= 4;
+	if (pm)
+	    pm += 4;
     }

     while (w)
     {
-        s = *ps++;
-        d = *pd;
-
-        *pd++ = coreCombineXorUPixelsse2 (s, d);
-        w--;
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
     }
 }

-static inline void
-coreCombineAddUsse2 (uint32_t* dst, const uint32_t* src, int width)
+static force_inline void
+core_combine_add_u_sse2 (uint32_t*       dst,
+                         const uint32_t* src,
+                         const uint32_t* mask,
+                         int             width)
 {
     int w = width;
-    uint32_t s,d;
+    uint32_t s, d;
     uint32_t* pd = dst;
     const uint32_t* ps = src;
+    const uint32_t* pm = mask;

     /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

     while (w && (unsigned long)pd & 15)
     {
-        s = *ps++;
-        d = *pd;
-        *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
-        w--;
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	ps++;
+	if (pm)
+	    pm++;
+	*pd++ = _mm_cvtsi64_si32 (
+	    _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
+	w--;
     }

     /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cachePrefetchNext ((__m128i*)ps);
-        cachePrefetchNext ((__m128i*)pd);
-
-        save128Aligned( (__m128i*)pd,
-                        _mm_adds_epu8( load128Unaligned((__m128i*)ps),
-                                       load128Aligned  ((__m128i*)pd)) );
-        pd += 4;
-        ps += 4;
-        w -= 4;
+	__m128i s;
+
+	/* fill cache line with next memory */
+	cache_prefetch_next ((__m128i*)ps);
+	cache_prefetch_next ((__m128i*)pd);
+	cache_prefetch_next ((__m128i*)pm);
+
+	s = combine4 ((__m128i*)ps, (__m128i*)pm);
+
+	save_128_aligned (
+	    (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));
+
+	pd += 4;
+	ps += 4;
+	if (pm)
+	    pm += 4;
+	w -= 4;
     }

     while (w--)
     {
-        s = *ps++;
-        d = *pd;
-        *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	ps++;
+	*pd++ = _mm_cvtsi64_si32 (
+	    _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
+	if (pm)
+	    pm++;
     }
 }

-static inline uint32_t
-coreCombineSaturateUPixelsse2 (uint32_t src, uint32_t dst)
+static force_inline uint32_t
+core_combine_saturate_u_pixel_sse2 (uint32_t src,
+                                    uint32_t dst)
 {
     __m64 ms = unpack_32_1x64 (src);
     __m64 md = unpack_32_1x64 (dst);
@@ -1165,1816 +1501,2061 @@ coreCombineSaturateUPixelsse2 (uint32_t src, uint32_t dst)

     if (sa > da)
     {
-        ms = pixMultiply_1x64 (ms, expandAlpha_1x64 (unpack_32_1x64 (FbIntDiv(da, sa) << 24)));
+	ms = pix_multiply_1x64 (
+	    ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
     }

     return pack_1x64_32 (_mm_adds_pu16 (md, ms));
 }

-static inline void
-coreCombineSaturateUsse2 (uint32_t *pd, const uint32_t *ps, int w)
+static force_inline void
+core_combine_saturate_u_sse2 (uint32_t *      pd,
+                              const uint32_t *ps,
+                              const uint32_t *pm,
+                              int             w)
 {
-    uint32_t s,d;
+    uint32_t s, d;

-    uint32_t packCmp;
-    __m128i xmmSrc, xmmDst;
+    uint32_t pack_cmp;
+    __m128i xmm_src, xmm_dst;

     /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

     while (w && (unsigned long)pd & 15)
     {
-        s = *ps++;
-        d = *pd;
-        *pd++ = coreCombineSaturateUPixelsse2 (s, d);
-        w--;
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
     }

     /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cachePrefetchNext ((__m128i*)ps);
-        cachePrefetchNext ((__m128i*)pd);
-
-        xmmDst = load128Aligned  ((__m128i*)pd);
-        xmmSrc = load128Unaligned((__m128i*)ps);
-
-        packCmp = _mm_movemask_epi8 (_mm_cmpgt_epi32 (_mm_srli_epi32 (xmmSrc, 24),
-                                                      _mm_srli_epi32 (_mm_xor_si128 (xmmDst, Maskff000000), 24)));
-
-        /* if some alpha src is greater than respective ~alpha dst */
-        if (packCmp)
-        {
-            s = *ps++;
-            d = *pd;
-            *pd++ = coreCombineSaturateUPixelsse2 (s, d);
-
-            s = *ps++;
-            d = *pd;
-            *pd++ = coreCombineSaturateUPixelsse2 (s, d);
-
-            s = *ps++;
-            d = *pd;
-            *pd++ = coreCombineSaturateUPixelsse2 (s, d);
-
-            s = *ps++;
-            d = *pd;
-            *pd++ = coreCombineSaturateUPixelsse2 (s, d);
-        }
-        else
-        {
-            save128Aligned ((__m128i*)pd, _mm_adds_epu8 (xmmDst, xmmSrc));
-
-            pd += 4;
-            ps += 4;
-        }
-
-        w -= 4;
+	/* fill cache line with next memory */
+	cache_prefetch_next ((__m128i*)ps);
+	cache_prefetch_next ((__m128i*)pd);
+	cache_prefetch_next ((__m128i*)pm);
+
+	xmm_dst = load_128_aligned ((__m128i*)pd);
+	xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
+
+	pack_cmp = _mm_movemask_epi8 (
+	    _mm_cmpgt_epi32 (
+		_mm_srli_epi32 (xmm_src, 24),
+		_mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
+
+	/* if some alpha src is greater than respective ~alpha dst */
+	if (pack_cmp)
+	{
+	    s = combine1 (ps++, pm);
+	    d = *pd;
+	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+	    if (pm)
+		pm++;
+
+	    s = combine1 (ps++, pm);
+	    d = *pd;
+	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+	    if (pm)
+		pm++;
+
+	    s = combine1 (ps++, pm);
+	    d = *pd;
+	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+	    if (pm)
+		pm++;
+
+	    s = combine1 (ps++, pm);
+	    d = *pd;
+	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+	    if (pm)
+		pm++;
+	}
+	else
+	{
+	    save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
+
+	    pd += 4;
+	    ps += 4;
+	    if (pm)
+		pm += 4;
+	}
+
+	w -= 4;
     }

     while (w--)
     {
-        s = *ps++;
-        d = *pd;
-        *pd++ = coreCombineSaturateUPixelsse2 (s, d);
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+	ps++;
+	if (pm)
+	    pm++;
     }
 }

-static inline void
-coreCombineSrcCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
+static force_inline void
+core_combine_src_ca_sse2 (uint32_t*       pd,
+                          const uint32_t* ps,
+                          const uint32_t *pm,
+                          int             w)
 {
     uint32_t s, m;

-    __m128i xmmSrcLo, xmmSrcHi;
-    __m128i xmmMaskLo, xmmMaskHi;
-    __m128i xmmDstLo, xmmDstHi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_mask_lo, xmm_mask_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;

     /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
-    cachePrefetch ((__m128i*)pm);
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

     while (w && (unsigned long)pd & 15)
     {
-        s = *ps++;
-        m = *pm++;
-        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
-        w--;
+	s = *ps++;
+	m = *pm++;
+	*pd++ = pack_1x64_32 (
+	    pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
+	w--;
     }

     /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
-    cachePrefetch ((__m128i*)pm);
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cachePrefetchNext ((__m128i*)ps);
-        cachePrefetchNext ((__m128i*)pd);
-        cachePrefetchNext ((__m128i*)pm);
+	/* fill cache line with next memory */
+	cache_prefetch_next ((__m128i*)ps);
+	cache_prefetch_next ((__m128i*)pd);
+	cache_prefetch_next ((__m128i*)pm);

-        xmmSrcHi = load128Unaligned ((__m128i*)ps);
-        xmmMaskHi = load128Unaligned ((__m128i*)pm);
+	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

-        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
-        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

-        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
+	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+			    &xmm_mask_lo, &xmm_mask_hi,
+			    &xmm_dst_lo, &xmm_dst_hi);

-        save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

-        ps += 4;
-        pd += 4;
-        pm += 4;
-        w -= 4;
+	ps += 4;
+	pd += 4;
+	pm += 4;
+	w -= 4;
     }

     while (w)
     {
-        s = *ps++;
-        m = *pm++;
-        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
-        w--;
+	s = *ps++;
+	m = *pm++;
+	*pd++ = pack_1x64_32 (
+	    pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
+	w--;
     }
 }

-static inline uint32_t
-coreCombineOverCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
+static force_inline uint32_t
+core_combine_over_ca_pixel_sse2 (uint32_t src,
+                                 uint32_t mask,
+                                 uint32_t dst)
 {
     __m64 s = unpack_32_1x64 (src);
-    __m64 expAlpha = expandAlpha_1x64 (s);
-    __m64 unpkMask = unpack_32_1x64 (mask);
-    __m64 unpkDst  = unpack_32_1x64 (dst);
+    __m64 expAlpha = expand_alpha_1x64 (s);
+    __m64 unpk_mask = unpack_32_1x64 (mask);
+    __m64 unpk_dst  = unpack_32_1x64 (dst);

-    return pack_1x64_32 (inOver_1x64 (&s, &expAlpha, &unpkMask, &unpkDst));
+    return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
 }

-static inline void
-coreCombineOverCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
+static force_inline void
+core_combine_over_ca_sse2 (uint32_t*       pd,
+                           const uint32_t* ps,
+                           const uint32_t *pm,
+                           int             w)
 {
     uint32_t s, m, d;

-    __m128i xmmAlphaLo, xmmAlphaHi;
-    __m128i xmmSrcLo, xmmSrcHi;
-    __m128i xmmDstLo, xmmDstHi;
-    __m128i xmmMaskLo, xmmMaskHi;
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask_lo, xmm_mask_hi;

     /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
-    cachePrefetch ((__m128i*)pm);
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

     while (w && (unsigned long)pd & 15)
     {
-        s = *ps++;
-        m = *pm++;
-        d = *pd;
+	s = *ps++;
+	m = *pm++;
+	d = *pd;

-        *pd++ = coreCombineOverCPixelsse2 (s, m, d);
-        w--;
+	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
+	w--;
     }

     /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
-    cachePrefetch ((__m128i*)pm);
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cachePrefetchNext ((__m128i*)ps);
-        cachePrefetchNext ((__m128i*)pd);
-        cachePrefetchNext ((__m128i*)pm);
-
-        xmmDstHi = load128Aligned ((__m128i*)pd);
-        xmmSrcHi = load128Unaligned ((__m128i*)ps);
-        xmmMaskHi = load128Unaligned ((__m128i*)pm);
-
-        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
-        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
-        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
-        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
-
-        inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
-
-        save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
-        ps += 4;
-        pd += 4;
-        pm += 4;
-        w -= 4;
+	/* fill cache line with next memory */
+	cache_prefetch_next ((__m128i*)ps);
+	cache_prefetch_next ((__m128i*)pd);
+	cache_prefetch_next ((__m128i*)pm);
+
+	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+			    &xmm_alpha_lo, &xmm_alpha_hi);
+
+	in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+		       &xmm_alpha_lo, &xmm_alpha_hi,
+		       &xmm_mask_lo, &xmm_mask_hi,
+		       &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	pm += 4;
+	w -= 4;
     }

     while (w)
     {
-        s = *ps++;
-        m = *pm++;
-        d = *pd;
+	s = *ps++;
+	m = *pm++;
+	d = *pd;

-        *pd++ = coreCombineOverCPixelsse2 (s, m, d);
-        w--;
+	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
+	w--;
     }
 }

-static inline uint32_t
-coreCombineOverReverseCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
+static force_inline uint32_t
+core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
+                                         uint32_t mask,
+                                         uint32_t dst)
 {
     __m64 d = unpack_32_1x64 (dst);

-    return pack_1x64_32(over_1x64 (d, expandAlpha_1x64 (d), pixMultiply_1x64 (unpack_32_1x64 (src), unpack_32_1x64 (mask))));
+    return pack_1x64_32 (
+	over_1x64 (d, expand_alpha_1x64 (d),
+		   pix_multiply_1x64 (unpack_32_1x64 (src),
+				      unpack_32_1x64 (mask))));
 }

-static inline void
-coreCombineOverReverseCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
+static force_inline void
+core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
+                                   const uint32_t* ps,
+                                   const uint32_t *pm,
+                                   int             w)
 {
     uint32_t s, m, d;

-    __m128i xmmAlphaLo, xmmAlphaHi;
-    __m128i xmmSrcLo, xmmSrcHi;
-    __m128i xmmDstLo, xmmDstHi;
-    __m128i xmmMaskLo, xmmMaskHi;
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask_lo, xmm_mask_hi;

     /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
-    cachePrefetch ((__m128i*)pm);
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

     while (w && (unsigned long)pd & 15)
     {
-        s = *ps++;
-        m = *pm++;
-        d = *pd;
+	s = *ps++;
+	m = *pm++;
+	d = *pd;

-        *pd++ = coreCombineOverReverseCPixelsse2 (s, m, d);
-        w--;
+	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
+	w--;
     }

     /* call prefetch hint to optimize cache load*/
-    cachePrefetch ((__m128i*)ps);
-    cachePrefetch ((__m128i*)pd);
-    cachePrefetch ((__m128i*)pm);
+    cache_prefetch ((__m128i*)ps);
+    cache_prefetch ((__m128i*)pd);
+    cache_prefetch ((__m128i*)pm);

     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cachePrefetchNext ((__m128i*)ps);
-        cachePrefetchNext ((__m128i*)pd);
-        cachePrefetchNext ((__m128i*)pm);
-
-        xmmDstHi = load128Aligned ((__m128i*)pd);
-        xmmSrcHi = load128Unaligned ((__m128i*)ps);
-        xmmMaskHi = load128Unaligned ((__m128i*)pm);
-
-        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
-        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
-        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
-        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
-        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
-        over_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi);
-
-        save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmMaskLo, xmmMaskHi));
-
-        ps += 4;
-        pd += 4;
-        pm += 4;
-        w -= 4;
+	/* fill cache line with next memory */
+	cache_prefetch_next ((__m128i*)ps);
cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); + + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + over_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask_lo, &xmm_mask_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; } while (w) { - s = *ps++; - m = *pm++; - d = *pd; + s = *ps++; + m = *pm++; + d = *pd; - *pd++ = coreCombineOverReverseCPixelsse2 (s, m, d); - w--; + *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d); + w--; } } -static inline void -coreCombineInCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w) +static force_inline void +core_combine_in_ca_sse2 (uint32_t * pd, + const uint32_t *ps, + const uint32_t *pm, + int w) { uint32_t s, m, d; - __m128i xmmAlphaLo, xmmAlphaHi; - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmDstLo, xmmDstHi; - __m128i xmmMaskLo, xmmMaskHi; + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { - s = *ps++; - m = *pm++; - d = *pd; + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)), + expand_alpha_1x64 (unpack_32_1x64 (d)))); - *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)), - expandAlpha_1x64 (unpack_32_1x64 (d)))); - w--; + w--; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); - xmmDstHi = load128Aligned ((__m128i*)pd); - xmmSrcHi = load128Unaligned ((__m128i*)ps); - xmmMaskHi = load128Unaligned ((__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); - expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi); - pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, 
&xmmDstHi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi); - pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi); + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); - save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst_lo, &xmm_dst_hi); - ps += 4; - pd += 4; - pm += 4; - w -= 4; + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; } while (w) { - s = *ps++; - m = *pm++; - d = *pd; + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + pix_multiply_1x64 ( + unpack_32_1x64 (s), unpack_32_1x64 (m)), + expand_alpha_1x64 (unpack_32_1x64 (d)))); - *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)), - expandAlpha_1x64 (unpack_32_1x64 (d)))); - w--; + w--; } } -static inline void -coreCombineInReverseCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w) +static force_inline void +core_combine_in_reverse_ca_sse2 (uint32_t * pd, + const uint32_t *ps, + const uint32_t *pm, + int w) { uint32_t s, m, d; - __m128i xmmAlphaLo, xmmAlphaHi; - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmDstLo, xmmDstHi; - __m128i xmmMaskLo, xmmMaskHi; + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), - pixMultiply_1x64 (unpack_32_1x64 (m), - expandAlpha_1x64 (unpack_32_1x64 (s))))); - w--; + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + unpack_32_1x64 (d), + pix_multiply_1x64 (unpack_32_1x64 (m), + expand_alpha_1x64 (unpack_32_1x64 (s))))); + w--; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); - - xmmDstHi = load128Aligned ((__m128i*)pd); - xmmSrcHi = load128Unaligned ((__m128i*)ps); - xmmMaskHi = load128Unaligned ((__m128i*)pm); - - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi); - pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaLo, &xmmAlphaHi, &xmmAlphaLo, &xmmAlphaHi); - - pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi); - - save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); - - ps += 4; - pd += 4; - pm += 4; - w -= 4; + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); + + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = 
load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; } while (w) { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), - pixMultiply_1x64 (unpack_32_1x64 (m), - expandAlpha_1x64 (unpack_32_1x64 (s))))); - w--; + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + unpack_32_1x64 (d), + pix_multiply_1x64 (unpack_32_1x64 (m), + expand_alpha_1x64 (unpack_32_1x64 (s))))); + w--; } } -static inline void -coreCombineOutCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w) +static force_inline void +core_combine_out_ca_sse2 (uint32_t * pd, + const uint32_t *ps, + const uint32_t *pm, + int w) { uint32_t s, m, d; - __m128i xmmAlphaLo, xmmAlphaHi; - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmDstLo, xmmDstHi; - __m128i xmmMaskLo, xmmMaskHi; + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)), - negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d))))); - w--; + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + pix_multiply_1x64 ( + unpack_32_1x64 (s), unpack_32_1x64 (m)), + negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d))))); + w--; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); - - xmmDstHi = load128Aligned ((__m128i*)pd); - xmmSrcHi = load128Unaligned ((__m128i*)ps); - xmmMaskHi = load128Unaligned ((__m128i*)pm); - - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi); - negate_2x128 (xmmAlphaLo, xmmAlphaHi, &xmmAlphaLo, &xmmAlphaHi); - - pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi); - pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi); - - save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); - - ps += 4; - pd += 4; - pm += 4; - w -= 4; + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + 
cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); + + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + negate_2x128 (xmm_alpha_lo, xmm_alpha_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; } while (w) { - s = *ps++; - m = *pm++; - d = *pd; + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + pix_multiply_1x64 ( + unpack_32_1x64 (s), unpack_32_1x64 (m)), + negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d))))); - *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)), - negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d))))); - w--; + w--; } } -static inline void -coreCombineOutReverseCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w) +static force_inline void +core_combine_out_reverse_ca_sse2 (uint32_t * pd, + const uint32_t *ps, + const uint32_t *pm, + int w) { uint32_t s, m, d; - __m128i xmmAlphaLo, xmmAlphaHi; - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmDstLo, xmmDstHi; - __m128i xmmMaskLo, xmmMaskHi; + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), - negate_1x64 (pixMultiply_1x64 (unpack_32_1x64 (m), - expandAlpha_1x64 (unpack_32_1x64 (s)))))); - w--; + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + unpack_32_1x64 (d), + negate_1x64 (pix_multiply_1x64 ( + unpack_32_1x64 (m), + expand_alpha_1x64 (unpack_32_1x64 (s)))))); + w--; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); - xmmDstHi = load128Aligned ((__m128i*)pd); - xmmSrcHi = load128Unaligned ((__m128i*)ps); - xmmMaskHi = load128Unaligned ((__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmMaskHi, 
&xmmMaskLo, &xmmMaskHi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); - expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); - pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi); + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask_lo, &xmm_mask_hi); - negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); + negate_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); - pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi); + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); - save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - ps += 4; - pd += 4; - pm += 4; - w -= 4; + ps += 4; + pd += 4; + pm += 4; + w -= 4; } while (w) { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), - negate_1x64 (pixMultiply_1x64 (unpack_32_1x64 (m), - expandAlpha_1x64 (unpack_32_1x64 (s)))))); - w--; + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + unpack_32_1x64 (d), + negate_1x64 (pix_multiply_1x64 ( + unpack_32_1x64 (m), + expand_alpha_1x64 (unpack_32_1x64 (s)))))); + w--; } } -static inline uint32_t -coreCombineAtopCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst) +static force_inline uint32_t +core_combine_atop_ca_pixel_sse2 (uint32_t src, + uint32_t mask, + uint32_t dst) { __m64 m = unpack_32_1x64 (mask); __m64 s = unpack_32_1x64 (src); __m64 d = unpack_32_1x64 (dst); - __m64 sa = expandAlpha_1x64 (s); - __m64 da = expandAlpha_1x64 (d); + __m64 sa = expand_alpha_1x64 (s); + __m64 da = expand_alpha_1x64 (d); - s = pixMultiply_1x64 (s, m); - m = negate_1x64 (pixMultiply_1x64 (m, sa)); + s = pix_multiply_1x64 (s, m); + m = negate_1x64 (pix_multiply_1x64 (m, sa)); - return pack_1x64_32 (pixAddMultiply_1x64 (&d, &m, &s, &da)); + return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da)); } -static inline void -coreCombineAtopCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w) +static force_inline void +core_combine_atop_ca_sse2 (uint32_t * pd, + const uint32_t *ps, + const uint32_t *pm, + int w) { uint32_t s, m, d; - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmDstLo, xmmDstHi; - __m128i xmmAlphaSrcLo, xmmAlphaSrcHi; - __m128i xmmAlphaDstLo, xmmAlphaDstHi; - __m128i xmmMaskLo, xmmMaskHi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { - s = *ps++; - m = *pm++; - d = *pd; + s = *ps++; + m = *pm++; + d = *pd; - *pd++ = coreCombineAtopCPixelsse2 (s, m, d); - w--; + *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); + w--; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + 
cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); - - xmmDstHi = load128Aligned ((__m128i*)pd); - xmmSrcHi = load128Unaligned ((__m128i*)ps); - xmmMaskHi = load128Unaligned ((__m128i*)pm); - - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi); - expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); - - pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi); - pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi); - - negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi, - &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi, - &xmmDstLo, &xmmDstHi); - - save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); - - ps += 4; - pd += 4; - pm += 4; - w -= 4; + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); + + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi); + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_mask_lo, &xmm_mask_hi); + + negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + pix_add_multiply_2x128 ( + &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; } while (w) { - s = *ps++; - m = *pm++; - d = *pd; + s = *ps++; + m = *pm++; + d = *pd; - *pd++ = coreCombineAtopCPixelsse2 (s, m, d); - w--; + *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); + w--; } } -static inline uint32_t -coreCombineReverseAtopCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst) +static force_inline uint32_t +core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src, + uint32_t mask, + uint32_t dst) { __m64 m = unpack_32_1x64 (mask); __m64 s = unpack_32_1x64 (src); __m64 d = unpack_32_1x64 (dst); - __m64 da = negate_1x64 (expandAlpha_1x64 (d)); - __m64 sa = expandAlpha_1x64 (s); + __m64 da = negate_1x64 (expand_alpha_1x64 (d)); + __m64 sa = expand_alpha_1x64 (s); - s = pixMultiply_1x64 (s, m); - m = pixMultiply_1x64 (m, sa); + s = pix_multiply_1x64 (s, m); + m = pix_multiply_1x64 (m, sa); - return pack_1x64_32 (pixAddMultiply_1x64 (&d, &m, &s, &da)); + return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da)); } -static inline void -coreCombineReverseAtopCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w) +static 
force_inline void +core_combine_reverse_atop_ca_sse2 (uint32_t * pd, + const uint32_t *ps, + const uint32_t *pm, + int w) { uint32_t s, m, d; - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmDstLo, xmmDstHi; - __m128i xmmAlphaSrcLo, xmmAlphaSrcHi; - __m128i xmmAlphaDstLo, xmmAlphaDstHi; - __m128i xmmMaskLo, xmmMaskHi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { - s = *ps++; - m = *pm++; - d = *pd; + s = *ps++; + m = *pm++; + d = *pd; - *pd++ = coreCombineReverseAtopCPixelsse2 (s, m, d); - w--; + *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); + w--; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); - - xmmDstHi = load128Aligned ((__m128i*)pd); - xmmSrcHi = load128Unaligned ((__m128i*)ps); - xmmMaskHi = load128Unaligned ((__m128i*)pm); - - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi); - expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); - - pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi); - pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi); - - negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); - - pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi, - &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi, - &xmmDstLo, &xmmDstHi); - - save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); - - ps += 4; - pd += 4; - pm += 4; - w -= 4; + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); + + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi); + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_mask_lo, &xmm_mask_hi); + + negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_add_multiply_2x128 ( + &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + 
&xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; } while (w) { - s = *ps++; - m = *pm++; - d = *pd; + s = *ps++; + m = *pm++; + d = *pd; - *pd++ = coreCombineReverseAtopCPixelsse2 (s, m, d); - w--; + *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); + w--; } } -static inline uint32_t -coreCombineXorCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst) +static force_inline uint32_t +core_combine_xor_ca_pixel_sse2 (uint32_t src, + uint32_t mask, + uint32_t dst) { __m64 a = unpack_32_1x64 (mask); __m64 s = unpack_32_1x64 (src); __m64 d = unpack_32_1x64 (dst); - __m64 alphaDst = negate_1x64 (pixMultiply_1x64 (a, expandAlpha_1x64 (s))); - __m64 dest = pixMultiply_1x64 (s, a); - __m64 alphaSrc = negate_1x64 (expandAlpha_1x64 (d)); + __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 ( + a, expand_alpha_1x64 (s))); + __m64 dest = pix_multiply_1x64 (s, a); + __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d)); - return pack_1x64_32 (pixAddMultiply_1x64 (&d, - &alphaDst, - &dest, - &alphaSrc)); + return pack_1x64_32 (pix_add_multiply_1x64 (&d, + &alpha_dst, + &dest, + &alpha_src)); } -static inline void -coreCombineXorCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w) +static force_inline void +core_combine_xor_ca_sse2 (uint32_t * pd, + const uint32_t *ps, + const uint32_t *pm, + int w) { uint32_t s, m, d; - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmDstLo, xmmDstHi; - __m128i xmmAlphaSrcLo, xmmAlphaSrcHi; - __m128i xmmAlphaDstLo, xmmAlphaDstHi; - __m128i xmmMaskLo, xmmMaskHi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { - s = *ps++; - m = *pm++; - d = *pd; + s = *ps++; + m = *pm++; + d = *pd; - *pd++ = coreCombineXorCPixelsse2 (s, m, d); - w--; + *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); + w--; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); - - xmmDstHi = load128Aligned ((__m128i*)pd); - xmmSrcHi = load128Unaligned ((__m128i*)ps); - xmmMaskHi = load128Unaligned ((__m128i*)pm); - - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi); - expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); - - pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi); - pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi); - - negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); - negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi, - 
&xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi, - &xmmDstLo, &xmmDstHi); - - save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); - - ps += 4; - pd += 4; - pm += 4; - w -= 4; + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); + + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi); + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_mask_lo, &xmm_mask_hi); + + negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + negate_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + pix_add_multiply_2x128 ( + &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; } while (w) { - s = *ps++; - m = *pm++; - d = *pd; + s = *ps++; + m = *pm++; + d = *pd; - *pd++ = coreCombineXorCPixelsse2 (s, m, d); - w--; + *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); + w--; } } -static inline void -coreCombineAddCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w) +static force_inline void +core_combine_add_ca_sse2 (uint32_t * pd, + const uint32_t *ps, + const uint32_t *pm, + int w) { uint32_t s, m, d; - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmDstLo, xmmDstHi; - __m128i xmmMaskLo, xmmMaskHi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = pack_1x64_32 (_mm_adds_pu8 (pixMultiply_1x64 (unpack_32_1x64 (s), - unpack_32_1x64 (m)), - unpack_32_1x64 (d))); - w--; + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s), + unpack_32_1x64 (m)), + unpack_32_1x64 (d))); + w--; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); - - xmmSrcHi = load128Unaligned ((__m128i*)ps); - xmmMaskHi = load128Unaligned ((__m128i*)pm); - xmmDstHi = load128Aligned ((__m128i*)pd); - - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); - - pixMultiply_2x128 
(&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
-
-        save128Aligned( (__m128i*)pd, pack_2x128_128 (_mm_adds_epu8 (xmmSrcLo, xmmDstLo),
-                                                      _mm_adds_epu8 (xmmSrcHi, xmmDstHi)));
-
-        ps += 4;
-        pd += 4;
-        pm += 4;
-        w -= 4;
+        /* fill cache line with next memory */
+        cache_prefetch_next ((__m128i*)ps);
+        cache_prefetch_next ((__m128i*)pd);
+        cache_prefetch_next ((__m128i*)pm);
+
+        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+
+        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+                            &xmm_mask_lo, &xmm_mask_hi,
+                            &xmm_src_lo, &xmm_src_hi);
+
+        save_128_aligned (
+            (__m128i*)pd, pack_2x128_128 (
+                _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
+                _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
+
+        ps += 4;
+        pd += 4;
+        pm += 4;
+        w -= 4;
    }

    while (w)
    {
-        s = *ps++;
-        m = *pm++;
-        d = *pd;
-
-        *pd++ = pack_1x64_32 (_mm_adds_pu8 (pixMultiply_1x64 (unpack_32_1x64 (s),
-                                                              unpack_32_1x64 (m)),
-                                            unpack_32_1x64 (d)));
-        w--;
+        s = *ps++;
+        m = *pm++;
+        d = *pd;
+
+        *pd++ = pack_1x64_32 (
+            _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
+                                             unpack_32_1x64 (m)),
+                          unpack_32_1x64 (d)));
+        w--;
    }
}

-/* -------------------------------------------------------------------------------------------------
- * fbComposeSetupSSE2
+/* ---------------------------------------------------
+ * fb_compose_setup_sse2
  */

-static inline __m64
-createMask_16_64 (uint16_t mask)
+static force_inline __m64
+create_mask_16_64 (uint16_t mask)
{
    return _mm_set1_pi16 (mask);
}

-static inline __m128i
-createMask_16_128 (uint16_t mask)
+static force_inline __m128i
+create_mask_16_128 (uint16_t mask)
{
    return _mm_set1_epi16 (mask);
}

-static inline __m64
-createMask_2x32_64 (uint32_t mask0, uint32_t mask1)
+static force_inline __m64
+create_mask_2x32_64 (uint32_t mask0,
+                     uint32_t mask1)
{
    return _mm_set_pi32 (mask0, mask1);
}

-static inline __m128i
-createMask_2x32_128 (uint32_t mask0, uint32_t mask1)
+static force_inline __m128i
+create_mask_2x32_128 (uint32_t mask0,
+                      uint32_t mask1)
{
    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
}

/* SSE2 code patch for fbcompose.c */

-static FASTCALL void
-sse2CombineMaskU (uint32_t *dst, const uint32_t *src, int width)
+static void
+sse2_combine_over_u (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               dst,
+                     const uint32_t *         src,
+                     const uint32_t *         mask,
+                     int                      width)
{
-    coreCombineReverseInUsse2 (dst, src, width);
-    _mm_empty();
+    core_combine_over_u_sse2 (dst, src, mask, width);
+    _mm_empty ();
}

-static FASTCALL void
-sse2CombineOverU (uint32_t *dst, const uint32_t *src, int width)
+static void
+sse2_combine_over_reverse_u (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               dst,
+                             const uint32_t *         src,
+                             const uint32_t *         mask,
+                             int                      width)
{
-    coreCombineOverUsse2 (dst, src, width);
-    _mm_empty();
+    core_combine_over_reverse_u_sse2 (dst, src, mask, width);
+    _mm_empty ();
}

-static FASTCALL void
-sse2CombineOverReverseU (uint32_t *dst, const uint32_t *src, int width)
+static void
+sse2_combine_in_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dst,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
{
-    coreCombineOverReverseUsse2 (dst, src, width);
-    _mm_empty();
+    core_combine_in_u_sse2 (dst, src, mask, width);
+    _mm_empty ();
}

-static FASTCALL void 
-sse2CombineInU (uint32_t *dst, const uint32_t *src, int width) +static void +sse2_combine_in_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) { - coreCombineInUsse2 (dst, src, width); - _mm_empty(); + core_combine_reverse_in_u_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineInReverseU (uint32_t *dst, const uint32_t *src, int width) +static void +sse2_combine_out_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) { - coreCombineReverseInUsse2 (dst, src, width); - _mm_empty(); + core_combine_out_u_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineOutU (uint32_t *dst, const uint32_t *src, int width) +static void +sse2_combine_out_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) { - coreCombineOutUsse2 (dst, src, width); - _mm_empty(); + core_combine_reverse_out_u_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineOutReverseU (uint32_t *dst, const uint32_t *src, int width) +static void +sse2_combine_atop_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) { - coreCombineReverseOutUsse2 (dst, src, width); - _mm_empty(); + core_combine_atop_u_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineAtopU (uint32_t *dst, const uint32_t *src, int width) +static void +sse2_combine_atop_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) { - coreCombineAtopUsse2 (dst, src, width); - _mm_empty(); + core_combine_reverse_atop_u_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineAtopReverseU (uint32_t *dst, const uint32_t *src, int width) +static void +sse2_combine_xor_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) { - coreCombineReverseAtopUsse2 (dst, src, width); - _mm_empty(); + core_combine_xor_u_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineXorU (uint32_t *dst, const uint32_t *src, int width) +static void +sse2_combine_add_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) { - coreCombineXorUsse2 (dst, src, width); - _mm_empty(); + core_combine_add_u_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineAddU (uint32_t *dst, const uint32_t *src, int width) +static void +sse2_combine_saturate_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) { - coreCombineAddUsse2 (dst, src, width); - _mm_empty(); + core_combine_saturate_u_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineSaturateU (uint32_t *dst, const uint32_t *src, int width) +static void +sse2_combine_src_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) { - coreCombineSaturateUsse2 (dst, src, width); - _mm_empty(); + core_combine_src_ca_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineSrcC (uint32_t *dst, uint32_t *src, uint32_t 
*mask, int width) +static void +sse2_combine_over_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) { - coreCombineSrcCsse2 (dst, src, mask, width); - _mm_empty(); + core_combine_over_ca_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineOverC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) +static void +sse2_combine_over_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) { - coreCombineOverCsse2 (dst, src, mask, width); - _mm_empty(); + core_combine_over_reverse_ca_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineOverReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) +static void +sse2_combine_in_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) { - coreCombineOverReverseCsse2 (dst, src, mask, width); - _mm_empty(); + core_combine_in_ca_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineInC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) +static void +sse2_combine_in_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) { - coreCombineInCsse2 (dst, src, mask, width); - _mm_empty(); + core_combine_in_reverse_ca_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineInReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) +static void +sse2_combine_out_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) { - coreCombineInReverseCsse2 (dst, src, mask, width); - _mm_empty(); + core_combine_out_ca_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineOutC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) +static void +sse2_combine_out_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) { - coreCombineOutCsse2 (dst, src, mask, width); - _mm_empty(); + core_combine_out_reverse_ca_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineOutReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) +static void +sse2_combine_atop_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) { - coreCombineOutReverseCsse2 (dst, src, mask, width); - _mm_empty(); + core_combine_atop_ca_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineAtopC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) +static void +sse2_combine_atop_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) { - coreCombineAtopCsse2 (dst, src, mask, width); - _mm_empty(); + core_combine_reverse_atop_ca_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineAtopReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) +static void +sse2_combine_xor_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) { - coreCombineReverseAtopCsse2 (dst, src, mask, width); - _mm_empty(); + core_combine_xor_ca_sse2 
(dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineXorC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) +static void +sse2_combine_add_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) { - coreCombineXorCsse2 (dst, src, mask, width); - _mm_empty(); + core_combine_add_ca_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineAddC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) -{ - coreCombineAddCsse2 (dst, src, mask, width); - _mm_empty(); -} +/* ------------------------------------------------------------------- + * composite_over_n_8888 + */ -void -fbComposeSetupSSE2(void) +static void +sse2_composite_over_n_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - static pixman_bool_t initialized = FALSE; - - if (initialized) - return; - - /* check if we have SSE2 support and initialize accordingly */ - if (pixman_have_sse2()) - { - /* SSE2 constants */ - Mask565r = createMask_2x32_128 (0x00f80000, 0x00f80000); - Mask565g1 = createMask_2x32_128 (0x00070000, 0x00070000); - Mask565g2 = createMask_2x32_128 (0x000000e0, 0x000000e0); - Mask565b = createMask_2x32_128 (0x0000001f, 0x0000001f); - MaskRed = createMask_2x32_128 (0x00f80000, 0x00f80000); - MaskGreen = createMask_2x32_128 (0x0000fc00, 0x0000fc00); - MaskBlue = createMask_2x32_128 (0x000000f8, 0x000000f8); - Mask565FixRB = createMask_2x32_128 (0x00e000e0, 0x00e000e0); - Mask565FixG = createMask_2x32_128 (0x0000c000, 0x0000c000); - Mask0080 = createMask_16_128 (0x0080); - Mask00ff = createMask_16_128 (0x00ff); - Mask0101 = createMask_16_128 (0x0101); - Maskffff = createMask_16_128 (0xffff); - Maskff000000 = createMask_2x32_128 (0xff000000, 0xff000000); - MaskAlpha = createMask_2x32_128 (0x00ff0000, 0x00000000); - - /* MMX constants */ - xMask565rgb = createMask_2x32_64 (0x000001f0, 0x003f001f); - xMask565Unpack = createMask_2x32_64 (0x00000084, 0x04100840); - - xMask0080 = createMask_16_64 (0x0080); - xMask00ff = createMask_16_64 (0x00ff); - xMask0101 = createMask_16_64 (0x0101); - xMaskAlpha = createMask_2x32_64 (0x00ff0000, 0x00000000); - - /* SSE code patch for fbcompose.c */ - pixman_composeFunctions.combineU[PIXMAN_OP_OVER] = sse2CombineOverU; - pixman_composeFunctions.combineU[PIXMAN_OP_OVER_REVERSE] = sse2CombineOverReverseU; - pixman_composeFunctions.combineU[PIXMAN_OP_IN] = sse2CombineInU; - pixman_composeFunctions.combineU[PIXMAN_OP_IN_REVERSE] = sse2CombineInReverseU; - pixman_composeFunctions.combineU[PIXMAN_OP_OUT] = sse2CombineOutU; - - pixman_composeFunctions.combineU[PIXMAN_OP_OUT_REVERSE] = sse2CombineOutReverseU; - pixman_composeFunctions.combineU[PIXMAN_OP_ATOP] = sse2CombineAtopU; - pixman_composeFunctions.combineU[PIXMAN_OP_ATOP_REVERSE] = sse2CombineAtopReverseU; - pixman_composeFunctions.combineU[PIXMAN_OP_XOR] = sse2CombineXorU; - pixman_composeFunctions.combineU[PIXMAN_OP_ADD] = sse2CombineAddU; - - pixman_composeFunctions.combineU[PIXMAN_OP_SATURATE] = sse2CombineSaturateU; - - pixman_composeFunctions.combineC[PIXMAN_OP_SRC] = sse2CombineSrcC; - pixman_composeFunctions.combineC[PIXMAN_OP_OVER] = sse2CombineOverC; - pixman_composeFunctions.combineC[PIXMAN_OP_OVER_REVERSE] = sse2CombineOverReverseC; - 
pixman_composeFunctions.combineC[PIXMAN_OP_IN] = sse2CombineInC;
-        pixman_composeFunctions.combineC[PIXMAN_OP_IN_REVERSE] = sse2CombineInReverseC;
-        pixman_composeFunctions.combineC[PIXMAN_OP_OUT] = sse2CombineOutC;
-        pixman_composeFunctions.combineC[PIXMAN_OP_OUT_REVERSE] = sse2CombineOutReverseC;
-        pixman_composeFunctions.combineC[PIXMAN_OP_ATOP] = sse2CombineAtopC;
-        pixman_composeFunctions.combineC[PIXMAN_OP_ATOP_REVERSE] = sse2CombineAtopReverseC;
-        pixman_composeFunctions.combineC[PIXMAN_OP_XOR] = sse2CombineXorC;
-        pixman_composeFunctions.combineC[PIXMAN_OP_ADD] = sse2CombineAddC;
-
-        pixman_composeFunctions.combineMaskU = sse2CombineMaskU;
-    }
-
-    initialized = TRUE;
-
-    _mm_empty();
-}
+    uint32_t src;
+    uint32_t *dst_line, *dst, d;
+    uint16_t w;
+    int dst_stride;
+    __m128i xmm_src, xmm_alpha;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

-/* -------------------------------------------------------------------------------------------------
- * fbCompositeSolid_nx8888
- */
-
-void
-fbCompositeSolid_nx8888sse2 (pixman_op_t op,
-                             pixman_image_t * pSrc,
-                             pixman_image_t * pMask,
-                             pixman_image_t * pDst,
-                             int16_t xSrc,
-                             int16_t ySrc,
-                             int16_t xMask,
-                             int16_t yMask,
-                             int16_t xDst,
-                             int16_t yDst,
-                             uint16_t width,
-                             uint16_t height)
-{
-    uint32_t src;
-    uint32_t *dstLine, *dst, d;
-    uint16_t w;
-    int dstStride;
-    __m128i xmmSrc, xmmAlpha;
-    __m128i xmmDst, xmmDstLo, xmmDstHi;
-
-    fbComposeGetSolid(pSrc, src, pDst->bits.format);
-
-    if (src >> 24 == 0)
+    if (src == 0)
        return;

-    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
+    PIXMAN_IMAGE_GET_LINE (
+        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

-    xmmSrc = expandPixel_32_1x128 (src);
-    xmmAlpha = expandAlpha_1x128 (xmmSrc);
+    xmm_src = expand_pixel_32_1x128 (src);
+    xmm_alpha = expand_alpha_1x128 (xmm_src);

    while (height--)
    {
-        dst = dstLine;
+        dst = dst_line;

-        /* call prefetch hint to optimize cache load*/
-        cachePrefetch ((__m128i*)dst);
+        /* call prefetch hint to optimize cache load*/
+        cache_prefetch ((__m128i*)dst);

-        dstLine += dstStride;
-        w = width;
+        dst_line += dst_stride;
+        w = width;

-        while (w && (unsigned long)dst & 15)
-        {
-            d = *dst;
-            *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
-                                              _mm_movepi64_pi64 (xmmAlpha),
-                                              unpack_32_1x64 (d)));
-            w--;
-        }
+        while (w && (unsigned long)dst & 15)
+        {
+            d = *dst;
+            *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
+                                              _mm_movepi64_pi64 (xmm_alpha),
+                                              unpack_32_1x64 (d)));
+            w--;
+        }

-        cachePrefetch ((__m128i*)dst);
+        cache_prefetch ((__m128i*)dst);

-        while (w >= 4)
-        {
-            /* fill cache line with next memory */
-            cachePrefetchNext ((__m128i*)dst);
+        while (w >= 4)
+        {
+            /* fill cache line with next memory */
+            cache_prefetch_next ((__m128i*)dst);

-            xmmDst = load128Aligned ((__m128i*)dst);
+            xmm_dst = load_128_aligned ((__m128i*)dst);

-            unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
+            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

-            over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDstLo, &xmmDstHi);
+            over_2x128 (&xmm_src, &xmm_src,
+                        &xmm_alpha, &xmm_alpha,
+                        &xmm_dst_lo, &xmm_dst_hi);

-            /* rebuid the 4 pixel data and save*/
-            save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
+            /* rebuild the 4 pixel data and save */
+            save_128_aligned (
+                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

-            w -= 4;
-            dst += 4;
-        }
+            w -= 4;
+            dst += 4;
+        }

-        while (w)
-        {
-            d = *dst;
-            *dst++ = pack_1x64_32 (over_1x64 
(_mm_movepi64_pi64 (xmmSrc), - _mm_movepi64_pi64 (xmmAlpha), - unpack_32_1x64 (d))); - w--; - } + while (w) + { + d = *dst; + *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src), + _mm_movepi64_pi64 (xmm_alpha), + unpack_32_1x64 (d))); + w--; + } } - _mm_empty(); -} - -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSolid_nx0565 - */ -void -fbCompositeSolid_nx0565sse2 (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - uint32_t src; - uint16_t *dstLine, *dst, d; - uint16_t w; - int dstStride; - __m128i xmmSrc, xmmAlpha; - __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3; - - fbComposeGetSolid(pSrc, src, pDst->bits.format); - - if (src >> 24 == 0) - return; - - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); - - xmmSrc = expandPixel_32_1x128 (src); - xmmAlpha = expandAlpha_1x128 (xmmSrc); - - while (height--) - { - dst = dstLine; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)dst); - - dstLine += dstStride; - w = width; - - while (w && (unsigned long)dst & 15) - { - d = *dst; - - *dst++ = pack565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc), - _mm_movepi64_pi64 (xmmAlpha), - expand565_16_1x64 (d)))); - w--; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)dst); - - while (w >= 8) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)dst); - - xmmDst = load128Aligned ((__m128i*)dst); - - unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3); - - over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDst0, &xmmDst1); - over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDst2, &xmmDst3); - - xmmDst = pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3); - save128Aligned ((__m128i*)dst, xmmDst); - - dst += 8; - w -= 8; - } - - while (w--) - { - d = *dst; - *dst++ = pack565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc), - _mm_movepi64_pi64 (xmmAlpha), - expand565_16_1x64 (d)))); - } - } - - _mm_empty(); + _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSolidMask_nx8888x8888C +/* --------------------------------------------------------------------- + * composite_over_n_0565 */ +static void +sse2_composite_over_n_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + uint16_t *dst_line, *dst, d; + uint16_t w; + int dst_stride; + __m128i xmm_src, xmm_alpha; + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; -void -fbCompositeSolidMask_nx8888x8888Csse2 (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - uint32_t src, srca; - uint32_t *dstLine, d; - uint32_t *maskLine, m; - uint32_t packCmp; - int dstStride, maskStride; - - __m128i xmmSrc, xmmAlpha; - __m128i xmmDst, xmmDstLo, xmmDstHi; - __m128i xmmMask, xmmMaskLo, xmmMaskHi; - - __m64 mmxSrc, mmxAlpha, mmxMask, 
mmxDst; - - fbComposeGetSolid(pSrc, src, pDst->bits.format); + src = _pixman_image_get_solid (src_image, dst_image->bits.format); - srca = src >> 24; - if (srca == 0) + if (src == 0) return; - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); - xmmSrc = _mm_unpacklo_epi8 (createMask_2x32_128 (src, src), _mm_setzero_si128 ()); - xmmAlpha = expandAlpha_1x128 (xmmSrc); - mmxSrc = _mm_movepi64_pi64 (xmmSrc); - mmxAlpha = _mm_movepi64_pi64 (xmmAlpha); + xmm_src = expand_pixel_32_1x128 (src); + xmm_alpha = expand_alpha_1x128 (xmm_src); while (height--) { - int w = width; - uint32_t *pm = (uint32_t *)maskLine; - uint32_t *pd = (uint32_t *)dstLine; - - dstLine += dstStride; - maskLine += maskStride; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); - - while (w && (unsigned long)pd & 15) - { - m = *pm++; - - if (m) - { - d = *pd; - mmxMask = unpack_32_1x64 (m); - mmxDst = unpack_32_1x64 (d); - - *pd = pack_1x64_32 (inOver_1x64 (&mmxSrc, - &mmxAlpha, - &mmxMask, - &mmxDst)); - } - - pd++; - w--; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); - - while (w >= 4) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); - - xmmMask = load128Unaligned ((__m128i*)pm); - - packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128())); - - /* if all bits in mask are zero, packCmp are equal to 0xffff */ - if (packCmp != 0xffff) - { - xmmDst = load128Aligned ((__m128i*)pd); - - unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); - unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); - - inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi); - - save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); - } - - pd += 4; - pm += 4; - w -= 4; - } - - while (w) - { - m = *pm++; - - if (m) - { - d = *pd; - mmxMask = unpack_32_1x64 (m); - mmxDst = unpack_32_1x64 (d); - - *pd = pack_1x64_32 (inOver_1x64 (&mmxSrc, - &mmxAlpha, - &mmxMask, - &mmxDst)); - } - - pd++; - w--; - } + dst = dst_line; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)dst); + + dst_line += dst_stride; + w = width; + + while (w && (unsigned long)dst & 15) + { + d = *dst; + + *dst++ = pack_565_32_16 ( + pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src), + _mm_movepi64_pi64 (xmm_alpha), + expand565_16_1x64 (d)))); + w--; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)dst); + + while (w >= 8) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)dst); + + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_565_128_4x128 (xmm_dst, + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + + over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_dst0, &xmm_dst1); + over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_dst2, &xmm_dst3); + + xmm_dst = pack_565_4x128_128 ( + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + + save_128_aligned ((__m128i*)dst, xmm_dst); + + dst += 8; + w -= 8; + } + + while (w--) + { + d = *dst; + *dst++ = pack_565_32_16 ( + pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src), + _mm_movepi64_pi64 (xmm_alpha), + expand565_16_1x64 (d)))); + } } - _mm_empty(); + _mm_empty (); 
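/* For reference, a minimal scalar model of one iteration of the loop
 * above. Helper names are illustrative, not pixman API; mul_un8 uses the
 * usual X render rounding, and the source is assumed premultiplied so
 * the per-channel sums cannot overflow:
 *
 *     static uint8_t mul_un8 (uint8_t a, uint8_t b)
 *     {
 *         unsigned t = (unsigned) a * b + 0x80;
 *         return (uint8_t) ((t + (t >> 8)) >> 8);
 *     }
 *
 *     static uint16_t over_n_0565_pixel (uint32_t src, uint16_t dst)
 *     {
 *         unsigned ia = 255 - (src >> 24);
 *         uint8_t  dr = ((dst >> 11) << 3) | ((dst >> 13) & 7);
 *         uint8_t  dg = (((dst >> 5) & 0x3f) << 2) | ((dst >> 9) & 3);
 *         uint8_t  db = ((dst & 0x1f) << 3) | ((dst >> 2) & 7);
 *         uint8_t  r = ((src >> 16) & 0xff) + mul_un8 (dr, ia);
 *         uint8_t  g = ((src >> 8) & 0xff) + mul_un8 (dg, ia);
 *         uint8_t  b = (src & 0xff) + mul_un8 (db, ia);
 *         return ((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3);
 *     }
 *
 * expand565_16_1x64 and pack_565_32_16 do the same bit replication and
 * truncation for all four channels at once.
 */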
} - -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSrc_8888x8x8888 +/* --------------------------------------------------------------------------- + * composite_over_n_8888_8888_ca */ -void -fbCompositeSrc_8888x8x8888sse2 (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - uint32_t *dstLine, *dst; - uint32_t *srcLine, *src; - uint32_t mask; - uint16_t w; - int dstStride, srcStride; - - __m128i xmmMask; - __m128i xmmSrc, xmmSrcLo, xmmSrcHi; - __m128i xmmDst, xmmDstLo, xmmDstHi; - __m128i xmmAlphaLo, xmmAlphaHi; - - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); - fbComposeGetSolid (pMask, mask, pDst->bits.format); - - xmmMask = createMask_16_128 (mask >> 24); - - while (height--) - { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; - w = width; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)dst); - cachePrefetch ((__m128i*)src); - - while (w && (unsigned long)dst & 15) - { - uint32_t s = *src++; - uint32_t d = *dst; - - __m64 ms = unpack_32_1x64 (s); - __m64 alpha = expandAlpha_1x64 (ms); - __m64 dest = _mm_movepi64_pi64 (xmmMask); - __m64 alphaDst = unpack_32_1x64 (d); - - *dst++ = pack_1x64_32 (inOver_1x64 (&ms, - &alpha, - &dest, - &alphaDst)); - - w--; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)dst); - cachePrefetch ((__m128i*)src); - - while (w >= 4) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)dst); - cachePrefetchNext ((__m128i*)src); - - xmmSrc = load128Unaligned ((__m128i*)src); - xmmDst = load128Aligned ((__m128i*)dst); - - unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); - expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi); +static void +sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + uint32_t *dst_line, d; + uint32_t *mask_line, m; + uint32_t pack_cmp; + int dst_stride, mask_stride; - inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMask, &xmmMask, &xmmDstLo, &xmmDstHi); + __m128i xmm_src, xmm_alpha; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; - save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); + __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; - dst += 4; - src += 4; - w -= 4; - } + src = _pixman_image_get_solid (src_image, dst_image->bits.format); - while (w) - { - uint32_t s = *src++; - uint32_t d = *dst; + if (src == 0) + return; - __m64 ms = unpack_32_1x64 (s); - __m64 alpha = expandAlpha_1x64 (ms); - __m64 mask = _mm_movepi64_pi64 (xmmMask); - __m64 dest = unpack_32_1x64 (d); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); - *dst++ = pack_1x64_32 (inOver_1x64 (&ms, - &alpha, - &mask, - &dest)); + xmm_src = _mm_unpacklo_epi8 ( + 
create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); + xmm_alpha = expand_alpha_1x128 (xmm_src); + mmx_src = _mm_movepi64_pi64 (xmm_src); + mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); - w--; - } + while (height--) + { + int w = width; + const uint32_t *pm = (uint32_t *)mask_line; + uint32_t *pd = (uint32_t *)dst_line; + + dst_line += dst_stride; + mask_line += mask_stride; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); + + while (w && (unsigned long)pd & 15) + { + m = *pm++; + + if (m) + { + d = *pd; + mmx_mask = unpack_32_1x64 (m); + mmx_dest = unpack_32_1x64 (d); + + *pd = pack_1x64_32 (in_over_1x64 (&mmx_src, + &mmx_alpha, + &mmx_mask, + &mmx_dest)); + } + + pd++; + w--; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); + + while (w >= 4) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); + + xmm_mask = load_128_unaligned ((__m128i*)pm); + + pack_cmp = + _mm_movemask_epi8 ( + _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); + + /* if all bits in mask are zero, pack_cmp are equal to 0xffff */ + if (pack_cmp != 0xffff) + { + xmm_dst = load_128_aligned ((__m128i*)pd); + + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + m = *pm++; + + if (m) + { + d = *pd; + mmx_mask = unpack_32_1x64 (m); + mmx_dest = unpack_32_1x64 (d); + + *pd = pack_1x64_32 ( + in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); + } + + pd++; + w--; + } } - _mm_empty(); + _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSrc_x888xnx8888 +/*--------------------------------------------------------------------- + * composite_over_8888_n_8888 */ -void -fbCompositeSrc_x888xnx8888sse2 (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - uint32_t *dstLine, *dst; - uint32_t *srcLine, *src; - uint32_t mask; - int dstStride, srcStride; - uint16_t w; - - __m128i xmmMask, xmmAlpha; - __m128i xmmSrc, xmmSrcLo, xmmSrcHi; - __m128i xmmDst, xmmDstLo, xmmDstHi; - - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); - fbComposeGetSolid (pMask, mask, pDst->bits.format); - - xmmMask = createMask_16_128 (mask >> 24); - xmmAlpha = Mask00ff; - while (height--) - { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; - w = width; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)dst); - cachePrefetch ((__m128i*)src); - - while (w && (unsigned long)dst & 15) - { - uint32_t s = (*src++) | 0xff000000; - uint32_t d = *dst; - - __m64 src = unpack_32_1x64 (s); - __m64 alpha = _mm_movepi64_pi64 (xmmAlpha); - __m64 mask = _mm_movepi64_pi64 (xmmMask); - __m64 dest = unpack_32_1x64 (d); - - *dst++ = pack_1x64_32 (inOver_1x64 (&src, - &alpha, - &mask, - &dest)); - - w--; - } +static void 
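/* For reference: the pack_cmp test above is a transparency early-out.
 * _mm_cmpeq_epi32 turns every all-zero mask pixel into an all-ones
 * lane, so the byte-level movemask is 0xffff exactly when all four mask
 * pixels are zero and the block can be skipped; a scalar equivalent is
 *
 *     int block_is_transparent = !(pm[0] | pm[1] | pm[2] | pm[3]);
 *
 * When the block is not skipped, each channel gets the component-alpha
 * operator dst = src * m + dst * (1 - alpha(src) * m), with values in
 * [0, 1], which is what in_over_1x64 and in_over_2x128 compute in
 * 16-bit lanes.
 */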
+sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + uint32_t mask; + uint16_t w; + int dst_stride, src_stride; - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)dst); - cachePrefetch ((__m128i*)src); + __m128i xmm_mask; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_lo, xmm_alpha_hi; - while (w >= 4) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)dst); - cachePrefetchNext ((__m128i*)src); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - xmmSrc = _mm_or_si128 (load128Unaligned ((__m128i*)src), Maskff000000); - xmmDst = load128Aligned ((__m128i*)dst); + mask = _pixman_image_get_solid (mask_image, dst_image->bits.format); - unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); + xmm_mask = create_mask_16_128 (mask >> 24); - inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlpha, &xmmAlpha, &xmmMask, &xmmMask, &xmmDstLo, &xmmDstHi); + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)dst); + cache_prefetch ((__m128i*)src); + + while (w && (unsigned long)dst & 15) + { + uint32_t s = *src++; + uint32_t d = *dst; + + __m64 ms = unpack_32_1x64 (s); + __m64 alpha = expand_alpha_1x64 (ms); + __m64 dest = _mm_movepi64_pi64 (xmm_mask); + __m64 alpha_dst = unpack_32_1x64 (d); + + *dst++ = pack_1x64_32 ( + in_over_1x64 (&ms, &alpha, &dest, &alpha_dst)); + + w--; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)dst); + cache_prefetch ((__m128i*)src); + + while (w >= 4) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)dst); + cache_prefetch_next ((__m128i*)src); + + xmm_src = load_128_unaligned ((__m128i*)src); + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask, &xmm_mask, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + dst += 4; + src += 4; + w -= 4; + } + + while (w) + { + uint32_t s = *src++; + uint32_t d = *dst; + + __m64 ms = unpack_32_1x64 (s); + __m64 alpha = expand_alpha_1x64 (ms); + __m64 mask = _mm_movepi64_pi64 (xmm_mask); + __m64 dest = unpack_32_1x64 (d); + + *dst++ = pack_1x64_32 ( + in_over_1x64 (&ms, &alpha, &mask, &dest)); + + w--; + } + } - save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); + _mm_empty (); +} - dst += 4; - src += 4; - w -= 4; +/* --------------------------------------------------------------------- + * composite_over_x888_n_8888 + */ +static void +sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * 
dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + uint32_t mask; + int dst_stride, src_stride; + uint16_t w; - } + __m128i xmm_mask, xmm_alpha; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; - while (w) - { - uint32_t s = (*src++) | 0xff000000; - uint32_t d = *dst; + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - __m64 src = unpack_32_1x64 (s); - __m64 alpha = _mm_movepi64_pi64 (xmmAlpha); - __m64 mask = _mm_movepi64_pi64 (xmmMask); - __m64 dest = unpack_32_1x64 (d); + mask = _pixman_image_get_solid (mask_image, dst_image->bits.format); - *dst++ = pack_1x64_32 (inOver_1x64 (&src, - &alpha, - &mask, - &dest)); + xmm_mask = create_mask_16_128 (mask >> 24); + xmm_alpha = mask_00ff; - w--; - } + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)dst); + cache_prefetch ((__m128i*)src); + + while (w && (unsigned long)dst & 15) + { + uint32_t s = (*src++) | 0xff000000; + uint32_t d = *dst; + + __m64 src = unpack_32_1x64 (s); + __m64 alpha = _mm_movepi64_pi64 (xmm_alpha); + __m64 mask = _mm_movepi64_pi64 (xmm_mask); + __m64 dest = unpack_32_1x64 (d); + + *dst++ = pack_1x64_32 ( + in_over_1x64 (&src, &alpha, &mask, &dest)); + + w--; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)dst); + cache_prefetch ((__m128i*)src); + + while (w >= 4) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)dst); + cache_prefetch_next ((__m128i*)src); + + xmm_src = _mm_or_si128 ( + load_128_unaligned ((__m128i*)src), mask_ff000000); + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha, &xmm_alpha, + &xmm_mask, &xmm_mask, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + dst += 4; + src += 4; + w -= 4; + + } + + while (w) + { + uint32_t s = (*src++) | 0xff000000; + uint32_t d = *dst; + + __m64 src = unpack_32_1x64 (s); + __m64 alpha = _mm_movepi64_pi64 (xmm_alpha); + __m64 mask = _mm_movepi64_pi64 (xmm_mask); + __m64 dest = unpack_32_1x64 (d); + + *dst++ = pack_1x64_32 ( + in_over_1x64 (&src, &alpha, &mask, &dest)); + + w--; + } } - _mm_empty(); + _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSrc_8888x8888 +/* -------------------------------------------------------------------- + * composite_over_8888_8888 */ -void -fbCompositeSrc_8888x8888sse2 (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - int dstStride, srcStride; - uint32_t *dstLine, *dst; - uint32_t *srcLine, *src; - - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); - - dst = dstLine; - src = srcLine; +static void 
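/* For reference: an x8r8g8b8 source carries an undefined alpha byte, so
 * the function above pins it to 0xff (s | 0xff000000 in the scalar head
 * and tail, _mm_or_si128 with mask_ff000000 in the vector body). With
 * alpha forced to 255, in_over reduces per channel to
 *
 *     dst = src * m / 255 + dst * (255 - m) / 255
 *
 * where m is the single alpha value taken from the solid mask.
 */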
+sse2_composite_over_8888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + int dst_stride, src_stride; + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + dst = dst_line; + src = src_line; while (height--) { - coreCombineOverUsse2 (dst, src, width); + core_combine_over_u_sse2 (dst, src, NULL, width); - dst += dstStride; - src += srcStride; + dst += dst_stride; + src += src_stride; } - _mm_empty(); + _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSrc_8888x0565 +/* ------------------------------------------------------------------ + * composite_over_8888_0565 */ -static inline uint16_t -fbCompositeSrc_8888x0565pixel (uint32_t src, uint16_t dst) +static force_inline uint16_t +composite_over_8888_0565pixel (uint32_t src, uint16_t dst) { - __m64 ms; + __m64 ms; ms = unpack_32_1x64 (src); - return pack565_32_16( pack_1x64_32 (over_1x64 (ms, - expandAlpha_1x64 (ms), - expand565_16_1x64 (dst)))); + return pack_565_32_16 ( + pack_1x64_32 ( + over_1x64 ( + ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst)))); } -void -fbCompositeSrc_8888x0565sse2 (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - uint16_t *dstLine, *dst, d; - uint32_t *srcLine, *src, s; - int dstStride, srcStride; - uint16_t w; - - __m128i xmmAlphaLo, xmmAlphaHi; - __m128i xmmSrc, xmmSrcLo, xmmSrcHi; - __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3; - - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); +static void +sse2_composite_over_8888_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint16_t *dst_line, *dst, d; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + uint16_t w; + + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); #if 0 /* FIXME @@ -2982,241 +3563,262 @@ fbCompositeSrc_8888x0565sse2 (pixman_op_t op, * I copy the code from MMX one and keep the fixme. * If it's a problem there, probably is a problem here. 
*/ - assert (pSrc->pDrawable == pMask->pDrawable); + assert (src_image->drawable == mask_image->drawable); #endif while (height--) { - dst = dstLine; - src = srcLine; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)src); - cachePrefetch ((__m128i*)dst); - - dstLine += dstStride; - srcLine += srcStride; - w = width; - - /* Align dst on a 16-byte boundary */ - while (w && - ((unsigned long)dst & 15)) - { - s = *src++; - d = *dst; - - *dst++ = fbCompositeSrc_8888x0565pixel (s, d); - w--; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)src); - cachePrefetch ((__m128i*)dst); - - /* It's a 8 pixel loop */ - while (w >= 8) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)src); - cachePrefetchNext ((__m128i*)dst); - - /* I'm loading unaligned because I'm not sure about the address alignment. */ - xmmSrc = load128Unaligned ((__m128i*) src); - xmmDst = load128Aligned ((__m128i*) dst); - - /* Unpacking */ - unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); - unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3); - expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi); - - /* I'm loading next 4 pixels from memory before to optimze the memory read. */ - xmmSrc = load128Unaligned ((__m128i*) (src+4)); - - over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDst0, &xmmDst1); - - /* Unpacking */ - unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); - expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi); - - over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDst2, &xmmDst3); - - save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3)); - - w -= 8; - dst += 8; - src += 8; - } - - while (w--) - { - s = *src++; - d = *dst; - - *dst++ = fbCompositeSrc_8888x0565pixel (s, d); - } + dst = dst_line; + src = src_line; + + /* call prefetch hint to optimize cache load */ + cache_prefetch ((__m128i*)src); + cache_prefetch ((__m128i*)dst); + + dst_line += dst_stride; + src_line += src_stride; + w = width; + + /* Align dst on a 16-byte boundary */ + while (w && + ((unsigned long)dst & 15)) + { + s = *src++; + d = *dst; + + *dst++ = composite_over_8888_0565pixel (s, d); + w--; + } + + /* call prefetch hint to optimize cache load */ + cache_prefetch ((__m128i*)src); + cache_prefetch ((__m128i*)dst); + + /* It's an 8-pixel loop */ + while (w >= 8) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)src); + cache_prefetch_next ((__m128i*)dst); + + /* I'm loading unaligned because I'm not sure + * about the address alignment. + */ + xmm_src = load_128_unaligned ((__m128i*) src); + xmm_dst = load_128_aligned ((__m128i*) dst); + + /* Unpacking */ + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_565_128_4x128 (xmm_dst, + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + /* I'm loading the next 4 pixels from memory + * ahead of time, to optimize the memory read. 
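 * This is manual software pipelining: the load for the next group of
 * four pixels is issued before the current group is blended, so the
 * load latency overlaps the ALU work, and the second "Unpacking" block
 * below consumes the preloaded value.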
+ */ + xmm_src = load_128_unaligned ((__m128i*) (src + 4)); + + over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst0, &xmm_dst1); + + /* Unpacking */ + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst2, &xmm_dst3); + + save_128_aligned ( + (__m128i*)dst, pack_565_4x128_128 ( + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); + + w -= 8; + dst += 8; + src += 8; + } + + while (w--) + { + s = *src++; + d = *dst; + + *dst++ = composite_over_8888_0565pixel (s, d); + } } - _mm_empty(); + _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSolidMask_nx8x8888 +/* ----------------------------------------------------------------- + * composite_over_n_8_8888 */ -void -fbCompositeSolidMask_nx8x8888sse2 (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - uint32_t src, srca; - uint32_t *dstLine, *dst; - uint8_t *maskLine, *mask; - int dstStride, maskStride; - uint16_t w; +static void +sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint32_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + uint16_t w; uint32_t m, d; - __m128i xmmSrc, xmmAlpha, xmmDef; - __m128i xmmDst, xmmDstLo, xmmDstHi; - __m128i xmmMask, xmmMaskLo, xmmMaskHi; + __m128i xmm_src, xmm_alpha, xmm_def; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; - __m64 mmxSrc, mmxAlpha, mmxMask, mmxDest; + __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; - fbComposeGetSolid(pSrc, src, pDst->bits.format); + src = _pixman_image_get_solid (src_image, dst_image->bits.format); srca = src >> 24; - if (srca == 0) + if (src == 0) return; - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - xmmDef = createMask_2x32_128 (src, src); - xmmSrc = expandPixel_32_1x128 (src); - xmmAlpha = expandAlpha_1x128 (xmmSrc); - mmxSrc = _mm_movepi64_pi64 (xmmSrc); - mmxAlpha = _mm_movepi64_pi64 (xmmAlpha); + xmm_def = create_mask_2x32_128 (src, src); + xmm_src = expand_pixel_32_1x128 (src); + xmm_alpha = expand_alpha_1x128 (xmm_src); + mmx_src = _mm_movepi64_pi64 (xmm_src); + mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; - w = width; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)mask); - cachePrefetch ((__m128i*)dst); - - while (w && (unsigned long)dst & 15) - { - uint8_t m = *mask++; - - if (m) - { - d = *dst; - mmxMask = expandPixel_8_1x64 (m); - mmxDest = unpack_32_1x64 (d); - - *dst = pack_1x64_32 (inOver_1x64 (&mmxSrc, - &mmxAlpha, - &mmxMask, - &mmxDest)); - } - - 
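/* For reference: with an a8 mask the single coverage value m applies to
 * every channel, so per channel this function computes
 *
 *     dst = mul_un8 (src, m) + mul_un8 (dst, 255 - mul_un8 (srca, m));
 *
 * with mul_un8 the usual rounded t = a * b + 0x80 multiply;
 * expand_pixel_8_1x64 merely replicates m into all lanes so
 * in_over_1x64 can do that arithmetic in parallel. */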
w--; - dst++; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)mask); - cachePrefetch ((__m128i*)dst); - - while (w >= 4) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)mask); - cachePrefetchNext ((__m128i*)dst); - - m = *((uint32_t*)mask); - - if (srca == 0xff && m == 0xffffffff) - { - save128Aligned ((__m128i*)dst, xmmDef); - } - else if (m) - { - xmmDst = load128Aligned ((__m128i*) dst); - xmmMask = unpack_32_1x128 (m); - xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128()); - - /* Unpacking */ - unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); - unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); - - expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi); - - save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); - } - - w -= 4; - dst += 4; - mask += 4; - } - - while (w) - { - uint8_t m = *mask++; - - if (m) - { - d = *dst; - mmxMask = expandPixel_8_1x64 (m); - mmxDest = unpack_32_1x64 (d); - - *dst = pack_1x64_32 (inOver_1x64 (&mmxSrc, - &mmxAlpha, - &mmxMask, - &mmxDest)); - } - - w--; - dst++; - } + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + /* call prefetch hint to optimize cache load */ + cache_prefetch ((__m128i*)mask); + cache_prefetch ((__m128i*)dst); + + while (w && (unsigned long)dst & 15) + { + uint8_t m = *mask++; + + if (m) + { + d = *dst; + mmx_mask = expand_pixel_8_1x64 (m); + mmx_dest = unpack_32_1x64 (d); + + *dst = pack_1x64_32 (in_over_1x64 (&mmx_src, + &mmx_alpha, + &mmx_mask, + &mmx_dest)); + } + + w--; + dst++; + } + + /* call prefetch hint to optimize cache load */ + cache_prefetch ((__m128i*)mask); + cache_prefetch ((__m128i*)dst); + + while (w >= 4) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)mask); + cache_prefetch_next ((__m128i*)dst); + + m = *((uint32_t*)mask); + + if (srca == 0xff && m == 0xffffffff) + { + save_128_aligned ((__m128i*)dst, xmm_def); + } + else if (m) + { + xmm_dst = load_128_aligned ((__m128i*) dst); + xmm_mask = unpack_32_1x128 (m); + xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); + + /* Unpacking */ + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + w -= 4; + dst += 4; + mask += 4; + } + + while (w) + { + uint8_t m = *mask++; + + if (m) + { + d = *dst; + mmx_mask = expand_pixel_8_1x64 (m); + mmx_dest = unpack_32_1x64 (d); + + *dst = pack_1x64_32 (in_over_1x64 (&mmx_src, + &mmx_alpha, + &mmx_mask, + &mmx_dest)); + } + + w--; + dst++; + } } - _mm_empty(); + _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSolidMask_nx8x8888 +/* ---------------------------------------------------------------- + * pixman_fill_sse2 + */ pixman_bool_t -pixmanFillsse2 (uint32_t *bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t data) +pixman_fill_sse2 (uint32_t *bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t data) { - uint32_t 
byte_width; - uint8_t *byte_line; + uint32_t byte_width; + uint8_t *byte_line; - __m128i xmmDef; + __m128i xmm_def; if (bpp == 16 && (data >> 16 != (data & 0xffff))) return FALSE; @@ -3226,430 +3828,459 @@ pixmanFillsse2 (uint32_t *bits, if (bpp == 16) { - stride = stride * (int) sizeof (uint32_t) / 2; - byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); - byte_width = 2 * width; - stride *= 2; + stride = stride * (int) sizeof (uint32_t) / 2; + byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); + byte_width = 2 * width; + stride *= 2; } else { - stride = stride * (int) sizeof (uint32_t) / 4; - byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); - byte_width = 4 * width; - stride *= 4; + stride = stride * (int) sizeof (uint32_t) / 4; + byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); + byte_width = 4 * width; + stride *= 4; } - cachePrefetch ((__m128i*)byte_line); - xmmDef = createMask_2x32_128 (data, data); + cache_prefetch ((__m128i*)byte_line); + xmm_def = create_mask_2x32_128 (data, data); while (height--) { - int w; - uint8_t *d = byte_line; - byte_line += stride; - w = byte_width; - - - cachePrefetchNext ((__m128i*)d); - - while (w >= 2 && ((unsigned long)d & 3)) - { - *(uint16_t *)d = data; - w -= 2; - d += 2; - } - - while (w >= 4 && ((unsigned long)d & 15)) - { - *(uint32_t *)d = data; - - w -= 4; - d += 4; - } - - cachePrefetchNext ((__m128i*)d); - - while (w >= 128) - { - cachePrefetch (((__m128i*)d) + 12); - - save128Aligned ((__m128i*)(d), xmmDef); - save128Aligned ((__m128i*)(d+16), xmmDef); - save128Aligned ((__m128i*)(d+32), xmmDef); - save128Aligned ((__m128i*)(d+48), xmmDef); - save128Aligned ((__m128i*)(d+64), xmmDef); - save128Aligned ((__m128i*)(d+80), xmmDef); - save128Aligned ((__m128i*)(d+96), xmmDef); - save128Aligned ((__m128i*)(d+112), xmmDef); - - d += 128; - w -= 128; - } - - if (w >= 64) - { - cachePrefetch (((__m128i*)d) + 8); - - save128Aligned ((__m128i*)(d), xmmDef); - save128Aligned ((__m128i*)(d+16), xmmDef); - save128Aligned ((__m128i*)(d+32), xmmDef); - save128Aligned ((__m128i*)(d+48), xmmDef); - - d += 64; - w -= 64; - } - - cachePrefetchNext ((__m128i*)d); - - if (w >= 32) - { - save128Aligned ((__m128i*)(d), xmmDef); - save128Aligned ((__m128i*)(d+16), xmmDef); - - d += 32; - w -= 32; - } - - if (w >= 16) - { - save128Aligned ((__m128i*)(d), xmmDef); - - d += 16; - w -= 16; - } - - cachePrefetchNext ((__m128i*)d); - - while (w >= 4) - { - *(uint32_t *)d = data; - - w -= 4; - d += 4; - } - - if (w >= 2) - { - *(uint16_t *)d = data; - w -= 2; - d += 2; - } + int w; + uint8_t *d = byte_line; + byte_line += stride; + w = byte_width; + + + cache_prefetch_next ((__m128i*)d); + + while (w >= 2 && ((unsigned long)d & 3)) + { + *(uint16_t *)d = data; + w -= 2; + d += 2; + } + + while (w >= 4 && ((unsigned long)d & 15)) + { + *(uint32_t *)d = data; + + w -= 4; + d += 4; + } + + cache_prefetch_next ((__m128i*)d); + + while (w >= 128) + { + cache_prefetch (((__m128i*)d) + 12); + + save_128_aligned ((__m128i*)(d), xmm_def); + save_128_aligned ((__m128i*)(d + 16), xmm_def); + save_128_aligned ((__m128i*)(d + 32), xmm_def); + save_128_aligned ((__m128i*)(d + 48), xmm_def); + save_128_aligned ((__m128i*)(d + 64), xmm_def); + save_128_aligned ((__m128i*)(d + 80), xmm_def); + save_128_aligned ((__m128i*)(d + 96), xmm_def); + save_128_aligned ((__m128i*)(d + 112), xmm_def); + + d += 128; + w -= 128; + } + + if (w >= 64) + { + cache_prefetch (((__m128i*)d) + 8); + + save_128_aligned ((__m128i*)(d), xmm_def); + 
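/* For reference: the fill first peels 2- and 4-byte stores until the
 * pointer is 16-byte aligned, then walks down power-of-two block sizes
 * (128 bytes, then the 64-byte block here, then 32 and 16) with aligned
 * stores, and finally mops up the tail with 4- and 2-byte writes, so
 * any width costs at most one pass per block size. */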
save_128_aligned ((__m128i*)(d + 16), xmm_def); + save_128_aligned ((__m128i*)(d + 32), xmm_def); + save_128_aligned ((__m128i*)(d + 48), xmm_def); + + d += 64; + w -= 64; + } + + cache_prefetch_next ((__m128i*)d); + + if (w >= 32) + { + save_128_aligned ((__m128i*)(d), xmm_def); + save_128_aligned ((__m128i*)(d + 16), xmm_def); + + d += 32; + w -= 32; + } + + if (w >= 16) + { + save_128_aligned ((__m128i*)(d), xmm_def); + + d += 16; + w -= 16; + } + + cache_prefetch_next ((__m128i*)d); + + while (w >= 4) + { + *(uint32_t *)d = data; + + w -= 4; + d += 4; + } + + if (w >= 2) + { + *(uint16_t *)d = data; + w -= 2; + d += 2; + } } - _mm_empty(); + _mm_empty (); return TRUE; } -void -fbCompositeSolidMaskSrc_nx8x8888sse2 (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - uint32_t src, srca; - uint32_t *dstLine, *dst; - uint8_t *maskLine, *mask; - int dstStride, maskStride; - uint16_t w; - uint32_t m; - - __m128i xmmSrc, xmmDef; - __m128i xmmMask, xmmMaskLo, xmmMaskHi; - - fbComposeGetSolid(pSrc, src, pDst->bits.format); +static void +sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint32_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + uint16_t w; + uint32_t m; + + __m128i xmm_src, xmm_def; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); srca = src >> 24; - if (srca == 0) + if (src == 0) { - pixmanFillsse2 (pDst->bits.bits, pDst->bits.rowstride, - PIXMAN_FORMAT_BPP (pDst->bits.format), - xDst, yDst, width, height, 0); - return; + pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride, + PIXMAN_FORMAT_BPP (dst_image->bits.format), + dest_x, dest_y, width, height, 0); + return; } - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - xmmDef = createMask_2x32_128 (src, src); - xmmSrc = expandPixel_32_1x128 (src); + xmm_def = create_mask_2x32_128 (src, src); + xmm_src = expand_pixel_32_1x128 (src); while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; - w = width; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)mask); - cachePrefetch ((__m128i*)dst); - - while (w && (unsigned long)dst & 15) - { - uint8_t m = *mask++; - - if (m) - { - *dst = pack_1x64_32 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmSrc), expandPixel_8_1x64 (m))); - } - else - { - *dst = 0; - } - - w--; - dst++; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)mask); - cachePrefetch ((__m128i*)dst); - - while (w >= 4) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)mask); - cachePrefetchNext ((__m128i*)dst); - - m = *((uint32_t*)mask); - - if (srca == 0xff && m == 0xffffffff) - { - save128Aligned ((__m128i*)dst, xmmDef); - } - else if (m) - { - 
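/* For reference: this is the SRC operator, so the destination is
 * overwritten rather than blended. A fully covered block (srca == 0xff
 * and a 32-bit mask read of 0xffffffff) stores the precomputed solid
 * color outright, a partially covered block stores the source scaled by
 * the mask (per channel, dst = mul_un8 (src, m)), and a zero mask
 * stores zero without reading dst at all. */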
xmmMask = unpack_32_1x128 (m); - xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128()); - - /* Unpacking */ - unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); - - expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - pixMultiply_2x128 (&xmmSrc, &xmmSrc, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmMaskLo, xmmMaskHi)); - } - else - { - save128Aligned ((__m128i*)dst, _mm_setzero_si128()); - } - - w -= 4; - dst += 4; - mask += 4; - } - - while (w) - { - uint8_t m = *mask++; - - if (m) - { - *dst = pack_1x64_32 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmSrc), expandPixel_8_1x64 (m))); - } - else - { - *dst = 0; - } - - w--; - dst++; - } + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)mask); + cache_prefetch ((__m128i*)dst); + + while (w && (unsigned long)dst & 15) + { + uint8_t m = *mask++; + + if (m) + { + *dst = pack_1x64_32 ( + pix_multiply_1x64 ( + _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m))); + } + else + { + *dst = 0; + } + + w--; + dst++; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)mask); + cache_prefetch ((__m128i*)dst); + + while (w >= 4) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)mask); + cache_prefetch_next ((__m128i*)dst); + + m = *((uint32_t*)mask); + + if (srca == 0xff && m == 0xffffffff) + { + save_128_aligned ((__m128i*)dst, xmm_def); + } + else if (m) + { + xmm_mask = unpack_32_1x128 (m); + xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); + + /* Unpacking */ + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + pix_multiply_2x128 (&xmm_src, &xmm_src, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); + } + else + { + save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ()); + } + + w -= 4; + dst += 4; + mask += 4; + } + + while (w) + { + uint8_t m = *mask++; + + if (m) + { + *dst = pack_1x64_32 ( + pix_multiply_1x64 ( + _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m))); + } + else + { + *dst = 0; + } + + w--; + dst++; + } } - _mm_empty(); + _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSolidMask_nx8x0565 +/*----------------------------------------------------------------------- + * composite_over_n_8_0565 */ -void -fbCompositeSolidMask_nx8x0565sse2 (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - uint32_t src, srca; - uint16_t *dstLine, *dst, d; - uint8_t *maskLine, *mask; - int dstStride, maskStride; - uint16_t w; +static void +sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint16_t *dst_line, *dst, d; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + uint16_t w; uint32_t m; - __m64 mmxSrc, mmxAlpha, mmxMask, 
mmxDest; + __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; - __m128i xmmSrc, xmmAlpha; - __m128i xmmMask, xmmMaskLo, xmmMaskHi; - __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3; + __m128i xmm_src, xmm_alpha; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; - fbComposeGetSolid(pSrc, src, pDst->bits.format); + src = _pixman_image_get_solid (src_image, dst_image->bits.format); srca = src >> 24; - if (srca == 0) + if (src == 0) return; - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - xmmSrc = expandPixel_32_1x128 (src); - xmmAlpha = expandAlpha_1x128 (xmmSrc); - mmxSrc = _mm_movepi64_pi64 (xmmSrc); - mmxAlpha = _mm_movepi64_pi64 (xmmAlpha); + xmm_src = expand_pixel_32_1x128 (src); + xmm_alpha = expand_alpha_1x128 (xmm_src); + mmx_src = _mm_movepi64_pi64 (xmm_src); + mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; - w = width; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)mask); - cachePrefetch ((__m128i*)dst); - - while (w && (unsigned long)dst & 15) - { - m = *mask++; - - if (m) - { - d = *dst; - mmxMask = expandAlphaRev_1x64 (unpack_32_1x64 (m)); - mmxDest = expand565_16_1x64 (d); - - *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc, - &mmxAlpha, - &mmxMask, - &mmxDest))); - } - - w--; - dst++; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)mask); - cachePrefetch ((__m128i*)dst); - - while (w >= 8) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)mask); - cachePrefetchNext ((__m128i*)dst); - - xmmDst = load128Aligned ((__m128i*) dst); - unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3); - - m = *((uint32_t*)mask); - mask += 4; - - if (m) - { - xmmMask = unpack_32_1x128 (m); - xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128()); - - /* Unpacking */ - unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); - - expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst0, &xmmDst1); - } - - m = *((uint32_t*)mask); - mask += 4; - - if (m) - { - xmmMask = unpack_32_1x128 (m); - xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128()); - - /* Unpacking */ - unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); - - expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst2, &xmmDst3); - } - - save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3)); - - w -= 8; - dst += 8; - } - - while (w) - { - m = *mask++; - - if (m) - { - d = *dst; - mmxMask = expandAlphaRev_1x64 (unpack_32_1x64 (m)); - mmxDest = expand565_16_1x64 (d); - - *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc, - &mmxAlpha, - &mmxMask, - &mmxDest))); - } - - w--; - dst++; - } + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)mask); + cache_prefetch ((__m128i*)dst); + + while (w && (unsigned long)dst & 15) + { + 
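/* For reference: this head loop runs one pixel at a time until dst
 * reaches a 16-byte boundary; ((unsigned long)dst & 15) tests the low
 * four address bits, so the main loop can then use aligned 128-bit
 * loads and stores instead of slower unaligned accesses. */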
m = *mask++; + + if (m) + { + d = *dst; + mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m)); + mmx_dest = expand565_16_1x64 (d); + + *dst = pack_565_32_16 ( + pack_1x64_32 ( + in_over_1x64 ( + &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); + } + + w--; + dst++; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)mask); + cache_prefetch ((__m128i*)dst); + + while (w >= 8) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)mask); + cache_prefetch_next ((__m128i*)dst); + + xmm_dst = load_128_aligned ((__m128i*) dst); + unpack_565_128_4x128 (xmm_dst, + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + + m = *((uint32_t*)mask); + mask += 4; + + if (m) + { + xmm_mask = unpack_32_1x128 (m); + xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); + + /* Unpacking */ + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst0, &xmm_dst1); + } + + m = *((uint32_t*)mask); + mask += 4; + + if (m) + { + xmm_mask = unpack_32_1x128 (m); + xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); + + /* Unpacking */ + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst2, &xmm_dst3); + } + + save_128_aligned ( + (__m128i*)dst, pack_565_4x128_128 ( + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); + + w -= 8; + dst += 8; + } + + while (w) + { + m = *mask++; + + if (m) + { + d = *dst; + mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m)); + mmx_dest = expand565_16_1x64 (d); + + *dst = pack_565_32_16 ( + pack_1x64_32 ( + in_over_1x64 ( + &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); + } + + w--; + dst++; + } } - _mm_empty(); + _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSrc_8888RevNPx0565 +/* ----------------------------------------------------------------------- + * composite_over_pixbuf_0565 */ -void -fbCompositeSrc_8888RevNPx0565sse2 (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - uint16_t *dstLine, *dst, d; - uint32_t *srcLine, *src, s; - int dstStride, srcStride; - uint16_t w; - uint32_t packCmp; +static void +sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint16_t *dst_line, *dst, d; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + uint16_t w; + uint32_t opaque, zero; __m64 ms; - __m128i xmmSrc, xmmSrcLo, xmmSrcHi; - __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + 
PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); #if 0 /* FIXME @@ -3657,131 +4288,144 @@ fbCompositeSrc_8888RevNPx0565sse2 (pixman_op_t op, * I copy the code from MMX one and keep the fixme. * If it's a problem there, probably is a problem here. */ - assert (pSrc->pDrawable == pMask->pDrawable); + assert (src_image->drawable == mask_image->drawable); #endif while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; - w = width; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)src); - cachePrefetch ((__m128i*)dst); - - while (w && (unsigned long)dst & 15) - { - s = *src++; - d = *dst; - - ms = unpack_32_1x64 (s); - - *dst++ = pack565_32_16 (pack_1x64_32 (overRevNonPre_1x64(ms, expand565_16_1x64 (d)))); - w--; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)src); - cachePrefetch ((__m128i*)dst); - - while (w >= 8) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)src); - cachePrefetchNext ((__m128i*)dst); - - /* First round */ - xmmSrc = load128Unaligned((__m128i*)src); - xmmDst = load128Aligned ((__m128i*)dst); - - packCmp = packAlpha (xmmSrc); - - unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3); - unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); - - /* preload next round*/ - xmmSrc = load128Unaligned((__m128i*)(src+4)); - /* preload next round*/ - - if (packCmp == 0xffffffff) - { - invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst0, &xmmDst1); - } - else if (packCmp) - { - overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst0, &xmmDst1); - } - - /* Second round */ - packCmp = packAlpha (xmmSrc); - - unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); - - if (packCmp == 0xffffffff) - { - invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst2, &xmmDst3); - } - else if (packCmp) - { - overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst2, &xmmDst3); - } - - save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3)); - - w -= 8; - src += 8; - dst += 8; - } - - while (w) - { - s = *src++; - d = *dst; - - ms = unpack_32_1x64 (s); - - *dst++ = pack565_32_16 (pack_1x64_32 (overRevNonPre_1x64(ms, expand565_16_1x64 (d)))); - w--; - } + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)src); + cache_prefetch ((__m128i*)dst); + + while (w && (unsigned long)dst & 15) + { + s = *src++; + d = *dst; + + ms = unpack_32_1x64 (s); + + *dst++ = pack_565_32_16 ( + pack_1x64_32 ( + over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d)))); + w--; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)src); + cache_prefetch ((__m128i*)dst); + + while (w >= 8) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)src); + cache_prefetch_next ((__m128i*)dst); + + /* First round */ + xmm_src = load_128_unaligned ((__m128i*)src); + xmm_dst = load_128_aligned ((__m128i*)dst); + + opaque = is_opaque (xmm_src); + zero = is_zero (xmm_src); + + unpack_565_128_4x128 (xmm_dst, + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + + /* preload next round*/ + xmm_src = load_128_unaligned ((__m128i*)(src + 4)); + + if (opaque) + { + invert_colors_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst0, &xmm_dst1); + } + else if (!zero) + { + over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst0, &xmm_dst1); + } + 
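/* For reference: the source here is GdkPixbuf-style non-premultiplied
 * ABGR, so each 4-pixel group is classified by its alpha bytes first.
 * is_opaque (all 0xff) means only the R/B swap of invert_colors_2x128
 * is needed, is_zero (all 0x00) leaves the destination untouched, and
 * mixed alpha takes over_rev_non_pre_2x128: swap R and B, premultiply
 * by alpha, then the ordinary OVER against the expanded 565 pixels. */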
+ /* Second round */ + opaque = is_opaque (xmm_src); + zero = is_zero (xmm_src); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + + if (opaque) + { + invert_colors_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst2, &xmm_dst3); + } + else if (!zero) + { + over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst2, &xmm_dst3); + } + + save_128_aligned ( + (__m128i*)dst, pack_565_4x128_128 ( + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); + + w -= 8; + src += 8; + dst += 8; + } + + while (w) + { + s = *src++; + d = *dst; + + ms = unpack_32_1x64 (s); + + *dst++ = pack_565_32_16 ( + pack_1x64_32 ( + over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d)))); + w--; + } } - _mm_empty(); + _mm_empty (); } -/* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */ - -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSrc_8888RevNPx8888 +/* ------------------------------------------------------------------------- + * composite_over_pixbuf_8888 */ -void -fbCompositeSrc_8888RevNPx8888sse2 (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - uint32_t *dstLine, *dst, d; - uint32_t *srcLine, *src, s; - int dstStride, srcStride; - uint16_t w; - uint32_t packCmp; - - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmDstLo, xmmDstHi; - - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); +static void +sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst, d; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + uint16_t w; + uint32_t opaque, zero; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); #if 0 /* FIXME @@ -3789,938 +4433,1276 @@ fbCompositeSrc_8888RevNPx8888sse2 (pixman_op_t op, * I copy the code from MMX one and keep the fixme. * If it's a problem there, probably is a problem here. 
*/ - assert (pSrc->pDrawable == pMask->pDrawable); + assert (src_image->drawable == mask_image->drawable); #endif while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; - w = width; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)src); - cachePrefetch ((__m128i*)dst); - - while (w && (unsigned long)dst & 15) - { - s = *src++; - d = *dst; - - *dst++ = pack_1x64_32 (overRevNonPre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d))); - - w--; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)src); - cachePrefetch ((__m128i*)dst); - - while (w >= 4) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)src); - cachePrefetchNext ((__m128i*)dst); - - xmmSrcHi = load128Unaligned((__m128i*)src); - - packCmp = packAlpha (xmmSrcHi); - - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - - if (packCmp == 0xffffffff) - { - invertColors_2x128( xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi); - - save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); - } - else if (packCmp) - { - xmmDstHi = load128Aligned ((__m128i*)dst); - - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); - - overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi); - - save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); - } - - w -= 4; - dst += 4; - src += 4; - } - - while (w) - { - s = *src++; - d = *dst; - - *dst++ = pack_1x64_32 (overRevNonPre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d))); - - w--; - } + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)src); + cache_prefetch ((__m128i*)dst); + + while (w && (unsigned long)dst & 15) + { + s = *src++; + d = *dst; + + *dst++ = pack_1x64_32 ( + over_rev_non_pre_1x64 ( + unpack_32_1x64 (s), unpack_32_1x64 (d))); + + w--; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)src); + cache_prefetch ((__m128i*)dst); + + while (w >= 4) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)src); + cache_prefetch_next ((__m128i*)dst); + + xmm_src_hi = load_128_unaligned ((__m128i*)src); + + opaque = is_opaque (xmm_src_hi); + zero = is_zero (xmm_src_hi); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + + if (opaque) + { + invert_colors_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + else if (!zero) + { + xmm_dst_hi = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + w -= 4; + dst += 4; + src += 4; + } + + while (w) + { + s = *src++; + d = *dst; + + *dst++ = pack_1x64_32 ( + over_rev_non_pre_1x64 ( + unpack_32_1x64 (s), unpack_32_1x64 (d))); + + w--; + } } - _mm_empty(); + _mm_empty (); } /* ------------------------------------------------------------------------------------------------- - * fbCompositeSolidMask_nx8888x0565C + * composite_over_n_8888_0565_ca */ -void -fbCompositeSolidMask_nx8888x0565Csse2 (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - 
uint32_t src, srca; - uint16_t *dstLine, *dst, d; - uint32_t *maskLine, *mask, m; - int dstStride, maskStride; +static void +sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + uint16_t *dst_line, *dst, d; + uint32_t *mask_line, *mask, m; + int dst_stride, mask_stride; int w; - uint32_t packCmp; + uint32_t pack_cmp; - __m128i xmmSrc, xmmAlpha; - __m128i xmmMask, xmmMaskLo, xmmMaskHi; - __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3; + __m128i xmm_src, xmm_alpha; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; - __m64 mmxSrc, mmxAlpha, mmxMask, mmxDest; + __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; - fbComposeGetSolid(pSrc, src, pDst->bits.format); + src = _pixman_image_get_solid (src_image, dst_image->bits.format); - srca = src >> 24; - if (srca == 0) - return; + if (src == 0) + return; - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); - xmmSrc = expandPixel_32_1x128 (src); - xmmAlpha = expandAlpha_1x128 (xmmSrc); - mmxSrc = _mm_movepi64_pi64 (xmmSrc); - mmxAlpha = _mm_movepi64_pi64 (xmmAlpha); + xmm_src = expand_pixel_32_1x128 (src); + xmm_alpha = expand_alpha_1x128 (xmm_src); + mmx_src = _mm_movepi64_pi64 (xmm_src); + mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); while (height--) { - w = width; - mask = maskLine; - dst = dstLine; - maskLine += maskStride; - dstLine += dstStride; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)mask); - cachePrefetch ((__m128i*)dst); - - while (w && ((unsigned long)dst & 15)) - { - m = *(uint32_t *) mask; - - if (m) - { - d = *dst; - mmxMask = unpack_32_1x64 (m); - mmxDest = expand565_16_1x64 (d); - - *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc, - &mmxAlpha, - &mmxMask, - &mmxDest))); - } - - w--; - dst++; - mask++; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)mask); - cachePrefetch ((__m128i*)dst); - - while (w >= 8) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)mask); - cachePrefetchNext ((__m128i*)dst); - - /* First round */ - xmmMask = load128Unaligned((__m128i*)mask); - xmmDst = load128Aligned((__m128i*)dst); - - packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128())); - - unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3); - unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); - - /* preload next round*/ - xmmMask = load128Unaligned((__m128i*)(mask+4)); - /* preload next round*/ - - if (packCmp != 0xffff) - { - inOver_2x128(&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst0, &xmmDst1); - } - - /* Second round */ - packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128())); - - unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); - - if (packCmp != 0xffff) - { - inOver_2x128(&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst2, &xmmDst3); - } - - save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, 
&xmmDst3)); - - w -= 8; - dst += 8; - mask += 8; - } - - while (w) - { - m = *(uint32_t *) mask; - - if (m) - { - d = *dst; - mmxMask = unpack_32_1x64 (m); - mmxDest = expand565_16_1x64 (d); - - *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc, - &mmxAlpha, - &mmxMask, - &mmxDest))); - } - - w--; - dst++; - mask++; - } + w = width; + mask = mask_line; + dst = dst_line; + mask_line += mask_stride; + dst_line += dst_stride; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)mask); + cache_prefetch ((__m128i*)dst); + + while (w && ((unsigned long)dst & 15)) + { + m = *(uint32_t *) mask; + + if (m) + { + d = *dst; + mmx_mask = unpack_32_1x64 (m); + mmx_dest = expand565_16_1x64 (d); + + *dst = pack_565_32_16 ( + pack_1x64_32 ( + in_over_1x64 ( + &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); + } + + w--; + dst++; + mask++; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)mask); + cache_prefetch ((__m128i*)dst); + + while (w >= 8) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)mask); + cache_prefetch_next ((__m128i*)dst); + + /* First round */ + xmm_mask = load_128_unaligned ((__m128i*)mask); + xmm_dst = load_128_aligned ((__m128i*)dst); + + pack_cmp = _mm_movemask_epi8 ( + _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); + + unpack_565_128_4x128 (xmm_dst, + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + /* preload next round */ + xmm_mask = load_128_unaligned ((__m128i*)(mask + 4)); + + /* preload next round */ + if (pack_cmp != 0xffff) + { + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst0, &xmm_dst1); + } + + /* Second round */ + pack_cmp = _mm_movemask_epi8 ( + _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); + + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + if (pack_cmp != 0xffff) + { + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst2, &xmm_dst3); + } + + save_128_aligned ( + (__m128i*)dst, pack_565_4x128_128 ( + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); + + w -= 8; + dst += 8; + mask += 8; + } + + while (w) + { + m = *(uint32_t *) mask; + + if (m) + { + d = *dst; + mmx_mask = unpack_32_1x64 (m); + mmx_dest = expand565_16_1x64 (d); + + *dst = pack_565_32_16 ( + pack_1x64_32 ( + in_over_1x64 ( + &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); + } + + w--; + dst++; + mask++; + } } _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeIn_nx8x8 +/* ----------------------------------------------------------------------- + * composite_in_n_8_8 */ -void -fbCompositeIn_nx8x8sse2 (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - uint8_t *dstLine, *dst; - uint8_t *maskLine, *mask; - int dstStride, maskStride; - uint16_t w, d, m; - uint32_t src; - uint8_t sa; - - __m128i xmmAlpha; - __m128i xmmMask, xmmMaskLo, xmmMaskHi; - __m128i xmmDst, xmmDstLo, xmmDstHi; - - fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); - - fbComposeGetSolid(pSrc, src, pDst->bits.format); +static void +sse2_composite_in_n_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + 
pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + uint16_t w, d, m; + uint32_t src; + uint8_t sa; + + __m128i xmm_alpha; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); sa = src >> 24; if (sa == 0) - return; + return; - xmmAlpha = expandAlpha_1x128 (expandPixel_32_1x128 (src)); + xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; - w = width; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)mask); - cachePrefetch ((__m128i*)dst); - - while (w && ((unsigned long)dst & 15)) - { - m = (uint32_t) *mask++; - d = (uint32_t) *dst; - - *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)), - unpack_32_1x64 (d))); - w--; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)mask); - cachePrefetch ((__m128i*)dst); - - while (w >= 16) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)mask); - cachePrefetchNext ((__m128i*)dst); - - xmmMask = load128Unaligned((__m128i*)mask); - xmmDst = load128Aligned((__m128i*)dst); - - unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); - unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); - - pixMultiply_2x128 (&xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi); - - save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); - - mask += 16; - dst += 16; - w -= 16; - } - - while (w) - { - m = (uint32_t) *mask++; - d = (uint32_t) *dst; - - *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)), - unpack_32_1x64 (d))); - w--; - } + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)mask); + cache_prefetch ((__m128i*)dst); + + while (w && ((unsigned long)dst & 15)) + { + m = (uint32_t) *mask++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x64_32 ( + pix_multiply_1x64 ( + pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha), + unpack_32_1x64 (m)), + unpack_32_1x64 (d))); + w--; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)mask); + cache_prefetch ((__m128i*)dst); + + while (w >= 16) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)mask); + cache_prefetch_next ((__m128i*)dst); + + xmm_mask = load_128_unaligned ((__m128i*)mask); + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi, + 
&xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + mask += 16; + dst += 16; + w -= 16; + } + + while (w) + { + m = (uint32_t) *mask++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x64_32 ( + pix_multiply_1x64 ( + pix_multiply_1x64 ( + _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)), + unpack_32_1x64 (d))); + w--; + } } - _mm_empty(); + _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeIn_8x8 +/* --------------------------------------------------------------------------- + * composite_in_8_8 */ -void -fbCompositeIn_8x8sse2 (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - uint8_t *dstLine, *dst; - uint8_t *srcLine, *src; - int srcStride, dstStride; - uint16_t w; - uint32_t s, d; - - __m128i xmmSrc, xmmSrcLo, xmmSrcHi; - __m128i xmmDst, xmmDstLo, xmmDstHi; - - fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1); +static void +sse2_composite_in_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int src_stride, dst_stride; + uint16_t w; + uint32_t s, d; + + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; - w = width; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)src); - cachePrefetch ((__m128i*)dst); - - while (w && ((unsigned long)dst & 15)) - { - s = (uint32_t) *src++; - d = (uint32_t) *dst; - - *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s),unpack_32_1x64 (d))); - w--; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)src); - cachePrefetch ((__m128i*)dst); - - while (w >= 16) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)src); - cachePrefetchNext ((__m128i*)dst); - - xmmSrc = load128Unaligned((__m128i*)src); - xmmDst = load128Aligned((__m128i*)dst); - - unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); - - pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi); - - save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); - - src += 16; - dst += 16; - w -= 16; - } - - while (w) - { - s = (uint32_t) *src++; - d = (uint32_t) *dst; - - *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s),unpack_32_1x64 (d))); - w--; - } + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)src); + cache_prefetch ((__m128i*)dst); + + while (w && ((unsigned long)dst & 15)) + { + s = (uint32_t) *src++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) 
pack_1x64_32 ( + pix_multiply_1x64 ( + unpack_32_1x64 (s), unpack_32_1x64 (d))); + w--; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)src); + cache_prefetch ((__m128i*)dst); + + while (w >= 16) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)src); + cache_prefetch_next ((__m128i*)dst); + + xmm_src = load_128_unaligned ((__m128i*)src); + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + src += 16; + dst += 16; + w -= 16; + } + + while (w) + { + s = (uint32_t) *src++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x64_32 ( + pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d))); + w--; + } } _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSrcAdd_8888x8x8 +/* ------------------------------------------------------------------------- + * composite_add_8888_8_8 */ -void -fbCompositeSrcAdd_8888x8x8sse2 (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - uint8_t *dstLine, *dst; - uint8_t *maskLine, *mask; - int dstStride, maskStride; - uint16_t w; - uint32_t src; - uint8_t sa; +static void +sse2_composite_add_8888_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + uint16_t w; + uint32_t src; + uint8_t sa; uint32_t m, d; - __m128i xmmAlpha; - __m128i xmmMask, xmmMaskLo, xmmMaskHi; - __m128i xmmDst, xmmDstLo, xmmDstHi; + __m128i xmm_alpha; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; - fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - fbComposeGetSolid(pSrc, src, pDst->bits.format); + src = _pixman_image_get_solid (src_image, dst_image->bits.format); sa = src >> 24; if (sa == 0) - return; + return; - xmmAlpha = expandAlpha_1x128 (expandPixel_32_1x128 (src)); + xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; - w = width; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)mask); - cachePrefetch ((__m128i*)dst); - - while (w && ((unsigned long)dst & 15)) - { - m = (uint32_t) *mask++; - d = (uint32_t) *dst; - - *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)), - unpack_32_1x64 (d))); - w--; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)mask); - cachePrefetch ((__m128i*)dst); - - while (w 
>= 16) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)mask); - cachePrefetchNext ((__m128i*)dst); - - xmmMask = load128Unaligned((__m128i*)mask); - xmmDst = load128Aligned((__m128i*)dst); - - unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); - unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); - - pixMultiply_2x128 (&xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - xmmDstLo = _mm_adds_epu16 (xmmMaskLo, xmmDstLo); - xmmDstHi = _mm_adds_epu16 (xmmMaskHi, xmmDstHi); - - save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); - - mask += 16; - dst += 16; - w -= 16; - } - - while (w) - { - m = (uint32_t) *mask++; - d = (uint32_t) *dst; - - *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)), - unpack_32_1x64 (d))); - w--; - } + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)mask); + cache_prefetch ((__m128i*)dst); + + while (w && ((unsigned long)dst & 15)) + { + m = (uint32_t) *mask++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x64_32 ( + _mm_adds_pu16 ( + pix_multiply_1x64 ( + _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)), + unpack_32_1x64 (d))); + w--; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)mask); + cache_prefetch ((__m128i*)dst); + + while (w >= 16) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)mask); + cache_prefetch_next ((__m128i*)dst); + + xmm_mask = load_128_unaligned ((__m128i*)mask); + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo); + xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + mask += 16; + dst += 16; + w -= 16; + } + + while (w) + { + m = (uint32_t) *mask++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x64_32 ( + _mm_adds_pu16 ( + pix_multiply_1x64 ( + _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)), + unpack_32_1x64 (d))); + + w--; + } } - _mm_empty(); + _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSrcAdd_8000x8000 +/* ---------------------------------------------------------------------- + * composite_add_8000_8000 */ -void -fbCompositeSrcAdd_8000x8000sse2 (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - uint8_t *dstLine, *dst; - uint8_t *srcLine, *src; - int dstStride, srcStride; - uint16_t w; - uint16_t t; - - fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1); - fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); +static void +sse2_composite_add_8000_8000 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t 
*dst_line, *dst; + uint8_t *src_line, *src; + int dst_stride, src_stride; + uint16_t w; + uint16_t t; + + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); while (height--) { - dst = dstLine; - src = srcLine; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)src); - cachePrefetch ((__m128i*)dst); - - dstLine += dstStride; - srcLine += srcStride; - w = width; - - /* Small head */ - while (w && (unsigned long)dst & 3) - { - t = (*dst) + (*src++); - *dst++ = t | (0 - (t >> 8)); - w--; - } - - coreCombineAddUsse2 ((uint32_t*)dst, (uint32_t*)src, w >> 2); - - /* Small tail */ - dst += w & 0xfffc; - src += w & 0xfffc; - - w &= 3; - - while (w) - { - t = (*dst) + (*src++); - *dst++ = t | (0 - (t >> 8)); - w--; - } + dst = dst_line; + src = src_line; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)src); + cache_prefetch ((__m128i*)dst); + + dst_line += dst_stride; + src_line += src_stride; + w = width; + + /* Small head */ + while (w && (unsigned long)dst & 3) + { + t = (*dst) + (*src++); + *dst++ = t | (0 - (t >> 8)); + w--; + } + + core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2); + + /* Small tail */ + dst += w & 0xfffc; + src += w & 0xfffc; + + w &= 3; + + while (w) + { + t = (*dst) + (*src++); + *dst++ = t | (0 - (t >> 8)); + w--; + } } - _mm_empty(); + _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSrcAdd_8888x8888 +/* --------------------------------------------------------------------- + * composite_add_8888_8888 */ -void -fbCompositeSrcAdd_8888x8888sse2 (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - uint32_t *dstLine, *dst; - uint32_t *srcLine, *src; - int dstStride, srcStride; - - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); +static void +sse2_composite_add_8888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; - coreCombineAddUsse2 (dst, src, width); + core_combine_add_u_sse2 (dst, src, NULL, width); } - _mm_empty(); + _mm_empty (); } /* ------------------------------------------------------------------------------------------------- - * fbCompositeCopyAreasse2 + * sse2_composite_copy_area */ -pixman_bool_t -pixmanBltsse2 (uint32_t *src_bits, - uint32_t *dst_bits, - int src_stride, - int dst_stride, - int src_bpp, - int dst_bpp, - int src_x, int src_y, - int dst_x, int dst_y, - int width, int height) -{ - uint8_t * src_bytes; - uint8_t * 
dst_bytes; - int byte_width; +static pixman_bool_t +pixman_blt_sse2 (uint32_t *src_bits, + uint32_t *dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int height) +{ + uint8_t * src_bytes; + uint8_t * dst_bytes; + int byte_width; if (src_bpp != dst_bpp) - return FALSE; + return FALSE; if (src_bpp == 16) { - src_stride = src_stride * (int) sizeof (uint32_t) / 2; - dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; - src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); - dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); - byte_width = 2 * width; - src_stride *= 2; - dst_stride *= 2; + src_stride = src_stride * (int) sizeof (uint32_t) / 2; + dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; + src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); + dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); + byte_width = 2 * width; + src_stride *= 2; + dst_stride *= 2; } else if (src_bpp == 32) { - src_stride = src_stride * (int) sizeof (uint32_t) / 4; - dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; - src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); - dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); - byte_width = 4 * width; - src_stride *= 4; - dst_stride *= 4; + src_stride = src_stride * (int) sizeof (uint32_t) / 4; + dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; + src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); + dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); + byte_width = 4 * width; + src_stride *= 4; + dst_stride *= 4; } else { - return FALSE; + return FALSE; } - cachePrefetch ((__m128i*)src_bytes); - cachePrefetch ((__m128i*)dst_bytes); + cache_prefetch ((__m128i*)src_bytes); + cache_prefetch ((__m128i*)dst_bytes); while (height--) { - int w; - uint8_t *s = src_bytes; - uint8_t *d = dst_bytes; - src_bytes += src_stride; - dst_bytes += dst_stride; - w = byte_width; - - cachePrefetchNext ((__m128i*)s); - cachePrefetchNext ((__m128i*)d); - - while (w >= 2 && ((unsigned long)d & 3)) - { - *(uint16_t *)d = *(uint16_t *)s; - w -= 2; - s += 2; - d += 2; - } - - while (w >= 4 && ((unsigned long)d & 15)) - { - *(uint32_t *)d = *(uint32_t *)s; - - w -= 4; - s += 4; - d += 4; - } - - cachePrefetchNext ((__m128i*)s); - cachePrefetchNext ((__m128i*)d); - - while (w >= 64) - { - __m128i xmm0, xmm1, xmm2, xmm3; - - /* 128 bytes ahead */ - cachePrefetch (((__m128i*)s) + 8); - cachePrefetch (((__m128i*)d) + 8); - - xmm0 = load128Unaligned ((__m128i*)(s)); - xmm1 = load128Unaligned ((__m128i*)(s+16)); - xmm2 = load128Unaligned ((__m128i*)(s+32)); - xmm3 = load128Unaligned ((__m128i*)(s+48)); - - save128Aligned ((__m128i*)(d), xmm0); - save128Aligned ((__m128i*)(d+16), xmm1); - save128Aligned ((__m128i*)(d+32), xmm2); - save128Aligned ((__m128i*)(d+48), xmm3); - - s += 64; - d += 64; - w -= 64; - } - - cachePrefetchNext ((__m128i*)s); - cachePrefetchNext ((__m128i*)d); - - while (w >= 16) - { - save128Aligned ((__m128i*)d, load128Unaligned ((__m128i*)s) ); - - w -= 16; - d += 16; - s += 16; - } - - cachePrefetchNext ((__m128i*)s); - cachePrefetchNext ((__m128i*)d); - - while (w >= 4) - { - *(uint32_t *)d = *(uint32_t *)s; - - w -= 4; - s += 4; - d += 4; - } - - if (w >= 2) - { - *(uint16_t *)d = *(uint16_t *)s; - w -= 2; - s += 2; - d += 2; - } + 
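/* Per scanline, the copy below first aligns the destination (2- and 4-byte head copies until dst is 16-byte aligned), then streams 64 bytes per iteration through four unaligned loads and four aligned stores while prefetching 128 bytes ahead, and finally drains the remainder with 16-, 4- and 2-byte tail copies. */ +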
int w; + uint8_t *s = src_bytes; + uint8_t *d = dst_bytes; + src_bytes += src_stride; + dst_bytes += dst_stride; + w = byte_width; + + cache_prefetch_next ((__m128i*)s); + cache_prefetch_next ((__m128i*)d); + + while (w >= 2 && ((unsigned long)d & 3)) + { + *(uint16_t *)d = *(uint16_t *)s; + w -= 2; + s += 2; + d += 2; + } + + while (w >= 4 && ((unsigned long)d & 15)) + { + *(uint32_t *)d = *(uint32_t *)s; + + w -= 4; + s += 4; + d += 4; + } + + cache_prefetch_next ((__m128i*)s); + cache_prefetch_next ((__m128i*)d); + + while (w >= 64) + { + __m128i xmm0, xmm1, xmm2, xmm3; + + /* 128 bytes ahead */ + cache_prefetch (((__m128i*)s) + 8); + cache_prefetch (((__m128i*)d) + 8); + + xmm0 = load_128_unaligned ((__m128i*)(s)); + xmm1 = load_128_unaligned ((__m128i*)(s + 16)); + xmm2 = load_128_unaligned ((__m128i*)(s + 32)); + xmm3 = load_128_unaligned ((__m128i*)(s + 48)); + + save_128_aligned ((__m128i*)(d), xmm0); + save_128_aligned ((__m128i*)(d + 16), xmm1); + save_128_aligned ((__m128i*)(d + 32), xmm2); + save_128_aligned ((__m128i*)(d + 48), xmm3); + + s += 64; + d += 64; + w -= 64; + } + + cache_prefetch_next ((__m128i*)s); + cache_prefetch_next ((__m128i*)d); + + while (w >= 16) + { + save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s)); + + w -= 16; + d += 16; + s += 16; + } + + cache_prefetch_next ((__m128i*)s); + cache_prefetch_next ((__m128i*)d); + + while (w >= 4) + { + *(uint32_t *)d = *(uint32_t *)s; + + w -= 4; + s += 4; + d += 4; + } + + if (w >= 2) + { + *(uint16_t *)d = *(uint16_t *)s; + w -= 2; + s += 2; + d += 2; + } } - _mm_empty(); + _mm_empty (); return TRUE; } -void -fbCompositeCopyAreasse2 (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - pixmanBltsse2 (pSrc->bits.bits, - pDst->bits.bits, - pSrc->bits.rowstride, - pDst->bits.rowstride, - PIXMAN_FORMAT_BPP (pSrc->bits.format), - PIXMAN_FORMAT_BPP (pDst->bits.format), - xSrc, ySrc, xDst, yDst, width, height); +static void +sse2_composite_copy_area (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + pixman_blt_sse2 (src_image->bits.bits, + dst_image->bits.bits, + src_image->bits.rowstride, + dst_image->bits.rowstride, + PIXMAN_FORMAT_BPP (src_image->bits.format), + PIXMAN_FORMAT_BPP (dst_image->bits.format), + src_x, src_y, dest_x, dest_y, width, height); } #if 0 /* This code is buggy in the MMX version, and the bug was carried over to the SSE2 version */ void -fbCompositeOver_x888x8x8888sse2 (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - uint32_t *src, *srcLine, s; - uint32_t *dst, *dstLine, d; - uint8_t *mask, *maskLine; - uint32_t m; - int srcStride, maskStride, dstStride; +sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *src, *src_line, s; + uint32_t 
*dst, *dst_line, d; + uint8_t *mask, *mask_line; + uint32_t m; + int src_stride, mask_stride, dst_stride; uint16_t w; - __m128i xmmSrc, xmmSrcLo, xmmSrcHi; - __m128i xmmDst, xmmDstLo, xmmDstHi; - __m128i xmmMask, xmmMaskLo, xmmMaskHi; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); while (height--) { - src = srcLine; - srcLine += srcStride; - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; - - w = width; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)src); - cachePrefetch ((__m128i*)dst); - cachePrefetch ((__m128i*)mask); - - while (w && (unsigned long)dst & 15) - { - s = 0xff000000 | *src++; - m = (uint32_t) *mask++; - d = *dst; - - __m64 ms = unpack_32_1x64 (s); - - if (m != 0xff) - { - ms = inOver_1x64 (ms, - xMask00ff, - expandAlphaRev_1x64 (unpack_32_1x64 (m)), - unpack_32_1x64 (d)); - } - - *dst++ = pack_1x64_32 (ms); - w--; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)src); - cachePrefetch ((__m128i*)dst); - cachePrefetch ((__m128i*)mask); - - while (w >= 4) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)src); - cachePrefetchNext ((__m128i*)dst); - cachePrefetchNext ((__m128i*)mask); - - m = *(uint32_t*) mask; - xmmSrc = _mm_or_si128 (load128Unaligned ((__m128i*)src), Maskff000000); - - if (m == 0xffffffff) - { - save128Aligned ((__m128i*)dst, xmmSrc); - } - else - { - xmmDst = load128Aligned ((__m128i*)dst); - - xmmMask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); - - unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); - unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); - - expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - inOver_2x128 (xmmSrcLo, xmmSrcHi, Mask00ff, Mask00ff, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi); - - save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); - } - - src += 4; - dst += 4; - mask += 4; - w -= 4; - } - - while (w) - { - m = (uint32_t) *mask++; - - if (m) - { - s = 0xff000000 | *src; - - if (m == 0xff) - { - *dst = s; - } - else - { - d = *dst; - - *dst = pack_1x64_32 (inOver_1x64 (unpack_32_1x64 (s), - xMask00ff, - expandAlphaRev_1x64 (unpack_32_1x64 (m)), - unpack_32_1x64 (d))); - } - - } - - src++; - dst++; - w--; - } + src = src_line; + src_line += src_stride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + w = width; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)src); + cache_prefetch ((__m128i*)dst); + cache_prefetch ((__m128i*)mask); + + while (w && (unsigned long)dst & 15) + { + s = 0xff000000 | *src++; + m = (uint32_t) *mask++; + d = *dst; + + __m64 ms = unpack_32_1x64 (s); + + if (m != 0xff) + { + ms = in_over_1x64 (ms, + mask_x00ff, + expand_alpha_rev_1x64 (unpack_32_1x64 (m)), + unpack_32_1x64 (d)); + } + + *dst++ = pack_1x64_32 (ms); + w--; + } + + /* 
call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)src); + cache_prefetch ((__m128i*)dst); + cache_prefetch ((__m128i*)mask); + + while (w >= 4) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)src); + cache_prefetch_next ((__m128i*)dst); + cache_prefetch_next ((__m128i*)mask); + + m = *(uint32_t*) mask; + xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000); + + if (m == 0xffffffff) + { + save_128_aligned ((__m128i*)dst, xmm_src); + } + else + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + xmm_mask = _mm_unpacklo_epi16 ( + unpack_32_1x128 (m), _mm_setzero_si128 ()); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + in_over_2x128 (xmm_src_lo, xmm_src_hi, + mask_00ff, mask_00ff, + xmm_mask_lo, xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + src += 4; + dst += 4; + mask += 4; + w -= 4; + } + + while (w) + { + m = (uint32_t) *mask++; + + if (m) + { + s = 0xff000000 | *src; + + if (m == 0xff) + { + *dst = s; + } + else + { + d = *dst; + + *dst = pack_1x64_32 ( + in_over_1x64 ( + unpack_32_1x64 (s), + mask_x00ff, + expand_alpha_rev_1x64 (unpack_32_1x64 (m)), + unpack_32_1x64 (d))); + } + + } + + src++; + dst++; + w--; + } + } + + _mm_empty (); +} + +#endif + +static const pixman_fast_path_t sse2_fast_paths[] = +{ + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, sse2_composite_over_n_8_0565, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, sse2_composite_over_n_8_0565, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_n_0565, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_8888_0565, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_over_8888_0565, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8_8888, 0 }, +#if 0 + /* FIXME: This code is buggy in the MMX version, and the bug was carried over to the SSE2 version */ + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_x888_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_x888_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, 
PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888, 0 }, +#endif + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 }, + + { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_add_8000_8000, 0 }, + { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, 
sse2_composite_add_8888_8888, 0 }, + { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_add_8888_8888, 0 }, + { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_composite_add_8888_8_8, 0 }, + + { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_src_n_8_8888, 0 }, + { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_src_n_8_8888, 0 }, + { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_src_n_8_8888, 0 }, + { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_src_n_8_8888, 0 }, + { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_copy_area, 0 }, + { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_copy_area, 0 }, + { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 }, + { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 }, + { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 }, + { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 }, + { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_copy_area, 0 }, + { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_copy_area, 0 }, + + { PIXMAN_OP_IN, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_in_8_8, 0 }, + { PIXMAN_OP_IN, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_composite_in_n_8_8, 0 }, + + { PIXMAN_OP_NONE }, +}; + +/* + * Work around GCC bug causing crashes in Mozilla with SSE2 + * + * When using -msse, gcc generates movdqa instructions assuming that + * the stack is 16 byte aligned. Unfortunately some applications, such + * as Mozilla and Mono, end up aligning the stack to 4 bytes, which + * causes the movdqa instructions to fail. + * + * The __force_align_arg_pointer__ makes gcc generate a prologue that + * realigns the stack pointer to 16 bytes. + * + * On x86-64 this is not necessary because the standard ABI already + * calls for a 16 byte aligned stack. 
+ * + * See https://bugs.freedesktop.org/show_bug.cgi?id=15693 + */ +#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) +__attribute__((__force_align_arg_pointer__)) +#endif +static void +sse2_composite (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src, + pixman_image_t * mask, + pixman_image_t * dest, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + if (_pixman_run_fast_path (sse2_fast_paths, imp, + op, src, mask, dest, + src_x, src_y, + mask_x, mask_y, + dest_x, dest_y, + width, height)) + { + return; + } + + _pixman_implementation_composite (imp->delegate, op, + src, mask, dest, + src_x, src_y, + mask_x, mask_y, + dest_x, dest_y, + width, height); +} + +#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) +__attribute__((__force_align_arg_pointer__)) +#endif +static pixman_bool_t +sse2_blt (pixman_implementation_t *imp, + uint32_t * src_bits, + uint32_t * dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int height) +{ + if (!pixman_blt_sse2 ( + src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, + src_x, src_y, dst_x, dst_y, width, height)) + + { + return _pixman_implementation_blt ( + imp->delegate, + src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, + src_x, src_y, dst_x, dst_y, width, height); } - _mm_empty(); + return TRUE; +} + +#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) +__attribute__((__force_align_arg_pointer__)) +#endif +static pixman_bool_t +sse2_fill (pixman_implementation_t *imp, + uint32_t * bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor)) + { + return _pixman_implementation_fill ( + imp->delegate, bits, stride, bpp, x, y, width, height, xor); + } + + return TRUE; +} + +#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) +__attribute__((__force_align_arg_pointer__)) +#endif +pixman_implementation_t * +_pixman_implementation_create_sse2 (void) +{ + pixman_implementation_t *mmx = _pixman_implementation_create_mmx (); + pixman_implementation_t *imp = _pixman_implementation_create (mmx); + + /* SSE2 constants */ + mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000); + mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000); + mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0); + mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f); + mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000); + mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00); + mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8); + mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0); + mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000); + mask_0080 = create_mask_16_128 (0x0080); + mask_00ff = create_mask_16_128 (0x00ff); + mask_0101 = create_mask_16_128 (0x0101); + mask_ffff = create_mask_16_128 (0xffff); + mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000); + mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000); + + /* MMX constants */ + mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f); + mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840); + + mask_x0080 = create_mask_16_64 (0x0080); + mask_x00ff = create_mask_16_64 (0x00ff); + mask_x0101 = create_mask_16_64 (0x0101); + mask_x_alpha = 
create_mask_2x32_64 (0x00ff0000, 0x00000000); + + _mm_empty (); + + /* Set up function pointers */ + + /* SSE code patch for fbcompose.c */ + imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u; + imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u; + imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u; + imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u; + imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u; + imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u; + imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u; + imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u; + imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u; + imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u; + + imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u; + + imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca; + imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca; + imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca; + imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca; + imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca; + imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca; + imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca; + + imp->composite = sse2_composite; + imp->blt = sse2_blt; + imp->fill = sse2_fill; + + return imp; } -#endif /* #if 0 */ #endif /* USE_SSE2 */
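The scalar head and tail loops of sse2_composite_add_8000_8000 above rely on the branch-free saturating byte add t | (0 - (t >> 8)). Since t is at most 255 + 255 = 510, t >> 8 is 1 exactly when the sum overflows a byte; 0 - 1 then wraps to 0xffff, the OR forces every bit on, and the final truncation to uint8_t yields 0xff, while an in-range sum passes through unchanged. A minimal standalone restatement (saturating_add_u8 is an illustrative name, not a pixman symbol):

#include <assert.h>
#include <stdint.h>

static uint8_t
saturating_add_u8 (uint8_t a, uint8_t b)
{
    uint16_t t = (uint16_t) a + b;  /* at most 510, so only bit 8 can carry out */

    /* (0 - (t >> 8)) is 0x0000 when t <= 255 and 0xffff when t > 255 */
    return (uint8_t) (t | (uint16_t) (0 - (t >> 8)));
}

int
main (void)
{
    assert (saturating_add_u8 (100, 100) == 200); /* in range: unchanged */
    assert (saturating_add_u8 (200, 200) == 255); /* overflow: clamped   */
    assert (saturating_add_u8 (255,   1) == 255);
    return 0;
}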
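The sse2_fast_paths table, sse2_composite and _pixman_implementation_create_sse2 together form a table-plus-delegate dispatch: the SSE2 implementation owns a list of (operator, source format, mask format, destination format) entries plus a delegate pointer to the slower MMX implementation, and a composite request either hits a specialized routine or is handed down the chain. Below is a compilable miniature of that shape; every tiny_* name is a hypothetical stand-in for the corresponding pixman type, not real pixman API:

#include <stdio.h>

typedef enum { TINY_OP_OVER, TINY_OP_ADD, TINY_OP_NONE } tiny_op_t;
typedef enum { TINY_a8r8g8b8, TINY_r5g6b5, TINY_a8, TINY_null } tiny_format_t;

typedef struct tiny_impl tiny_impl_t;
typedef void (*tiny_composite_func_t) (tiny_impl_t *imp);

typedef struct
{
    tiny_op_t             op;
    tiny_format_t         src, mask, dest;
    tiny_composite_func_t func;
} tiny_fast_path_t;

struct tiny_impl
{
    tiny_impl_t            *delegate;   /* slower implementation to fall back on */
    const tiny_fast_path_t *fast_paths; /* terminated by a TINY_OP_NONE entry    */
};

static void fast_over_8888 (tiny_impl_t *imp) { (void) imp; printf ("fast OVER path\n"); }
static void generic_path   (tiny_impl_t *imp) { (void) imp; printf ("generic path\n"); }

/* Scan the table; on a miss, hand the request down the delegate chain. */
static void
tiny_composite (tiny_impl_t *imp, tiny_op_t op,
                tiny_format_t src, tiny_format_t mask, tiny_format_t dest)
{
    const tiny_fast_path_t *p;

    for (p = imp->fast_paths; p && p->op != TINY_OP_NONE; ++p)
    {
        if (p->op == op && p->src == src && p->mask == mask && p->dest == dest)
        {
            p->func (imp);
            return;
        }
    }

    if (imp->delegate)
        tiny_composite (imp->delegate, op, src, mask, dest);
    else
        generic_path (imp);
}

static const tiny_fast_path_t simd_fast_paths[] =
{
    { TINY_OP_OVER, TINY_a8r8g8b8, TINY_null, TINY_a8r8g8b8, fast_over_8888 },
    { TINY_OP_NONE, TINY_null,     TINY_null, TINY_null,     NULL           },
};

int
main (void)
{
    tiny_impl_t general = { NULL, NULL };                /* like the general paths */
    tiny_impl_t simd    = { &general, simd_fast_paths }; /* like the SSE2 layer    */

    tiny_composite (&simd, TINY_OP_OVER, TINY_a8r8g8b8, TINY_null, TINY_a8r8g8b8);
    tiny_composite (&simd, TINY_OP_ADD,  TINY_a8r8g8b8, TINY_null, TINY_r5g6b5);
    return 0;
}

The real table additionally carries per-entry flags (NEED_SOLID_MASK, NEED_COMPONENT_ALPHA, NEED_PIXBUF) so the matcher can require a particular mask configuration before taking a fast path.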