Diffstat (limited to 'pixman/pixman/pixman-sse2.c')
-rw-r--r--  pixman/pixman/pixman-sse2.c  7856
1 file changed, 4419 insertions, 3437 deletions
diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c
index 0f36436b6..2fa956e21 100644
--- a/pixman/pixman/pixman-sse2.c
+++ b/pixman/pixman/pixman-sse2.c
@@ -23,7 +23,7 @@
*
* Author: Rodrigo Kumpera (kumpera@gmail.com)
* André Tupinambá (andrelrt@gmail.com)
- *
+ *
* Based on work by Owen Taylor and Søren Sandmann
*/
#ifdef HAVE_CONFIG_H
@@ -33,405 +33,478 @@
#include <mmintrin.h>
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
-
-#include "pixman-sse2.h"
-
-#ifdef USE_SSE2
-
-#ifdef _MSC_VER
-#undef inline
-#define inline __forceinline
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+
+#if defined(_MSC_VER) && defined(_M_AMD64)
+/* Windows 64 doesn't allow MMX to be used, so
+ * the pixman-x64-mmx-emulation.h file contains
+ * implementations of those MMX intrinsics that
+ * are used in the SSE2 implementation.
+ */
+# include "pixman-x64-mmx-emulation.h"
#endif
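/* A minimal sketch of the kind of emulation such a header has to provide
 * (hypothetical names only, not the actual contents of
 * pixman-x64-mmx-emulation.h): with __m64 unavailable on Win64, each MMX
 * intrinsic used below can be re-implemented over a plain 64-bit integer.
 */
#if 0
typedef unsigned long long m64_emul;

static m64_emul
emul_adds_pu8 (m64_emul a, m64_emul b)	/* per-byte saturating add */
{
    m64_emul r = 0;
    int i;

    for (i = 0; i < 8; i++)
    {
	unsigned s = ((a >> (i * 8)) & 0xff) + ((b >> (i * 8)) & 0xff);

	r |= (m64_emul) (s > 0xff ? 0xff : s) << (i * 8);
    }

    return r;
}
#endif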
-#ifdef __GNUC__
-# define inline __inline__ __attribute__ ((__always_inline__))
-#endif
+#ifdef USE_SSE2
-/* -------------------------------------------------------------------------------------------------
+/* --------------------------------------------------------------------
* Locals
*/
-static __m64 xMask0080;
-static __m64 xMask00ff;
-static __m64 xMask0101;
-static __m64 xMaskAlpha;
+static __m64 mask_x0080;
+static __m64 mask_x00ff;
+static __m64 mask_x0101;
+static __m64 mask_x_alpha;
-static __m64 xMask565rgb;
-static __m64 xMask565Unpack;
+static __m64 mask_x565_rgb;
+static __m64 mask_x565_unpack;
-static __m128i Mask0080;
-static __m128i Mask00ff;
-static __m128i Mask0101;
-static __m128i Maskffff;
-static __m128i Maskff000000;
-static __m128i MaskAlpha;
+static __m128i mask_0080;
+static __m128i mask_00ff;
+static __m128i mask_0101;
+static __m128i mask_ffff;
+static __m128i mask_ff000000;
+static __m128i mask_alpha;
-static __m128i Mask565r;
-static __m128i Mask565g1, Mask565g2;
-static __m128i Mask565b;
-static __m128i MaskRed;
-static __m128i MaskGreen;
-static __m128i MaskBlue;
+static __m128i mask_565_r;
+static __m128i mask_565_g1, mask_565_g2;
+static __m128i mask_565_b;
+static __m128i mask_red;
+static __m128i mask_green;
+static __m128i mask_blue;
-static __m128i Mask565FixRB;
-static __m128i Mask565FixG;
+static __m128i mask_565_fix_rb;
+static __m128i mask_565_fix_g;
-/* -------------------------------------------------------------------------------------------------
+/* ----------------------------------------------------------------------
* SSE2 Inlines
*/
-static inline __m128i
+static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
- return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128());
+ return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}
-static inline void
-unpack_128_2x128 (__m128i data, __m128i* dataLo, __m128i* dataHi)
+static force_inline void
+unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
- *dataLo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
- *dataHi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
+ *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
+ *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}
-static inline __m128i
-unpack565to8888 (__m128i lo)
+static force_inline __m128i
+unpack_565_to_8888 (__m128i lo)
{
__m128i r, g, b, rb, t;
-
- r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), MaskRed);
- g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), MaskGreen);
- b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), MaskBlue);
+
+ r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
+ g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
+ b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
rb = _mm_or_si128 (r, b);
- t = _mm_and_si128 (rb, Mask565FixRB);
+ t = _mm_and_si128 (rb, mask_565_fix_rb);
t = _mm_srli_epi32 (t, 5);
rb = _mm_or_si128 (rb, t);
- t = _mm_and_si128 (g, Mask565FixG);
+ t = _mm_and_si128 (g, mask_565_fix_g);
t = _mm_srli_epi32 (t, 6);
g = _mm_or_si128 (g, t);
-
+
return _mm_or_si128 (rb, g);
}
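/* The shifts above park each 565 field in the top bits of its 8-bit
 * channel; the extra >> 5 and >> 6 steps then replicate the top bits of
 * every field into the freshly opened low bits (assuming the
 * mask_565_fix_rb / mask_565_fix_g constants select those top bits, which
 * is what the shift amounts imply).  Per channel this is the usual
 * widening
 *
 *     r8 = (r5 << 3) | (r5 >> 2)
 *     g8 = (g6 << 2) | (g6 >> 4)
 *     b8 = (b5 << 3) | (b5 >> 2)
 *
 * so that 0x1f expands to 0xff rather than 0xf8.
 */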
-static inline void
-unpack565_128_4x128 (__m128i data, __m128i* data0, __m128i* data1, __m128i* data2, __m128i* data3)
+static force_inline void
+unpack_565_128_4x128 (__m128i data,
+ __m128i* data0,
+ __m128i* data1,
+ __m128i* data2,
+ __m128i* data3)
{
__m128i lo, hi;
lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
- lo = unpack565to8888 (lo);
- hi = unpack565to8888 (hi);
+ lo = unpack_565_to_8888 (lo);
+ hi = unpack_565_to_8888 (hi);
unpack_128_2x128 (lo, data0, data1);
unpack_128_2x128 (hi, data2, data3);
}
-static inline uint16_t
-pack565_32_16 (uint32_t pixel)
+static force_inline uint16_t
+pack_565_32_16 (uint32_t pixel)
{
- return (uint16_t) (((pixel>>8) & 0xf800) | ((pixel>>5) & 0x07e0) | ((pixel>>3) & 0x001f));
+ return (uint16_t) (((pixel >> 8) & 0xf800) |
+ ((pixel >> 5) & 0x07e0) |
+ ((pixel >> 3) & 0x001f));
}
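/* Worked example of the packing above: for the x8r8g8b8 pixel 0x00ff8040
 * (r = 0xff, g = 0x80, b = 0x40) the three shifted masks keep the top
 * 5, 6 and 5 bits of each channel:
 *
 *     (0x00ff8040 >> 8) & 0xf800 = 0xf800
 *     (0x00ff8040 >> 5) & 0x07e0 = 0x0400
 *     (0x00ff8040 >> 3) & 0x001f = 0x0008
 *
 * which OR together to the r5g6b5 value 0xfc08.
 */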
-static inline __m128i
+static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
return _mm_packus_epi16 (lo, hi);
}
-static inline __m128i
-pack565_2x128_128 (__m128i lo, __m128i hi)
+static force_inline __m128i
+pack_565_2x128_128 (__m128i lo, __m128i hi)
{
__m128i data;
__m128i r, g1, g2, b;
- data = pack_2x128_128 ( lo, hi );
+ data = pack_2x128_128 (lo, hi);
- r = _mm_and_si128 (data , Mask565r);
- g1 = _mm_and_si128 (_mm_slli_epi32 (data , 3), Mask565g1);
- g2 = _mm_and_si128 (_mm_srli_epi32 (data , 5), Mask565g2);
- b = _mm_and_si128 (_mm_srli_epi32 (data , 3), Mask565b);
+ r = _mm_and_si128 (data, mask_565_r);
+ g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
+ g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
+ b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}
-static inline __m128i
-pack565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
+static force_inline __m128i
+pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
- return _mm_packus_epi16 (pack565_2x128_128 (*xmm0, *xmm1), pack565_2x128_128 (*xmm2, *xmm3));
+ return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
+ pack_565_2x128_128 (*xmm2, *xmm3));
}
-static inline uint32_t
-packAlpha (__m128i x)
+static force_inline int
+is_opaque (__m128i x)
{
- return _mm_cvtsi128_si32 (_mm_packus_epi16 (_mm_packus_epi16 (_mm_srli_epi32 (x, 24),
- _mm_setzero_si128 ()),
- _mm_setzero_si128 ()));
+ __m128i ffs = _mm_cmpeq_epi8 (x, x);
+
+ return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}
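/* Note on the 0x8888 constant used here and in is_transparent () below:
 * after _mm_cmpeq_epi8 every byte is either 0x00 or 0xff, and
 * _mm_movemask_epi8 gathers the top bit of each of the 16 bytes into a
 * 16-bit mask.  In a packed a8r8g8b8 quad the four alpha bytes sit at
 * byte offsets 3, 7, 11 and 15, so their mask bits are
 * 0x8 | 0x80 | 0x800 | 0x8000 = 0x8888; all four alphas compare equal
 * exactly when the masked result is 0x8888.
 */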
-static inline __m128i
-expandPixel_32_1x128 (uint32_t data)
+static force_inline int
+is_zero (__m128i x)
{
- return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE(1, 0, 1, 0));
+ return _mm_movemask_epi8 (
+ _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}
-static inline __m128i
-expandAlpha_1x128 (__m128i data)
+static force_inline int
+is_transparent (__m128i x)
{
- return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
+ return (_mm_movemask_epi8 (
+ _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}
-static inline void
-expandAlpha_2x128 (__m128i dataLo, __m128i dataHi, __m128i* alphaLo, __m128i* alphaHi)
+static force_inline __m128i
+expand_pixel_32_1x128 (uint32_t data)
{
- __m128i lo, hi;
+ return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
+}
- lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(3, 3, 3, 3));
- hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(3, 3, 3, 3));
- *alphaLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 3, 3, 3));
- *alphaHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 3, 3, 3));
+static force_inline __m128i
+expand_alpha_1x128 (__m128i data)
+{
+ return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
+ _MM_SHUFFLE (3, 3, 3, 3)),
+ _MM_SHUFFLE (3, 3, 3, 3));
}
-static inline void
-expandAlphaRev_2x128 (__m128i dataLo, __m128i dataHi, __m128i* alphaLo, __m128i* alphaHi)
+static force_inline void
+expand_alpha_2x128 (__m128i data_lo,
+ __m128i data_hi,
+ __m128i* alpha_lo,
+ __m128i* alpha_hi)
{
__m128i lo, hi;
- lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(0, 0, 0, 0));
- hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(0, 0, 0, 0));
- *alphaLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(0, 0, 0, 0));
- *alphaHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(0, 0, 0, 0));
+ lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
+ hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
+
+ *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
+ *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}
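/* Layout reminder for the shuffles above: each unpacked half holds two
 * pixels as eight 16-bit words, [b0 g0 r0 a0 b1 g1 r1 a1].  The shufflelo
 * with _MM_SHUFFLE (3, 3, 3, 3) broadcasts word 3 (a0) across words 0-3
 * and the shufflehi broadcasts word 7 (a1) across words 4-7, yielding
 * [a0 a0 a0 a0 a1 a1 a1 a1] -- the per-channel alpha the multiplies
 * below expect.
 */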
-static inline void
-pixMultiply_2x128 (__m128i* dataLo, __m128i* dataHi, __m128i* alphaLo, __m128i* alphaHi, __m128i* retLo, __m128i* retHi)
+static force_inline void
+expand_alpha_rev_2x128 (__m128i data_lo,
+ __m128i data_hi,
+ __m128i* alpha_lo,
+ __m128i* alpha_hi)
{
__m128i lo, hi;
- lo = _mm_mullo_epi16 (*dataLo, *alphaLo);
- hi = _mm_mullo_epi16 (*dataHi, *alphaHi);
- lo = _mm_adds_epu16 (lo, Mask0080);
- hi = _mm_adds_epu16 (hi, Mask0080);
- *retLo = _mm_mulhi_epu16 (lo, Mask0101);
- *retHi = _mm_mulhi_epu16 (hi, Mask0101);
+ lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
+ hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
+ *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
+ *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}
-static inline void
-pixAddMultiply_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaDstLo, __m128i* alphaDstHi,
- __m128i* dstLo, __m128i* dstHi, __m128i* alphaSrcLo, __m128i* alphaSrcHi,
- __m128i* retLo, __m128i* retHi)
+static force_inline void
+pix_multiply_2x128 (__m128i* data_lo,
+ __m128i* data_hi,
+ __m128i* alpha_lo,
+ __m128i* alpha_hi,
+ __m128i* ret_lo,
+ __m128i* ret_hi)
{
__m128i lo, hi;
- __m128i mulLo, mulHi;
- lo = _mm_mullo_epi16 (*srcLo, *alphaDstLo);
- hi = _mm_mullo_epi16 (*srcHi, *alphaDstHi);
- mulLo = _mm_mullo_epi16 (*dstLo, *alphaSrcLo);
- mulHi = _mm_mullo_epi16 (*dstHi, *alphaSrcHi);
- lo = _mm_adds_epu16 (lo, Mask0080);
- hi = _mm_adds_epu16 (hi, Mask0080);
- lo = _mm_adds_epu16 (lo, mulLo);
- hi = _mm_adds_epu16 (hi, mulHi);
- *retLo = _mm_mulhi_epu16 (lo, Mask0101);
- *retHi = _mm_mulhi_epu16 (hi, Mask0101);
+ lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
+ hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
+ lo = _mm_adds_epu16 (lo, mask_0080);
+ hi = _mm_adds_epu16 (hi, mask_0080);
+ *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
+ *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
+}
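/* The adds/mulhi pair above is the exact divide-by-255 trick applied to
 * eight channels at once: with t = a * b + 0x80, the unsigned high
 * multiply by 0x0101 computes (t + (t >> 8)) >> 8, i.e. a * b / 255
 * rounded to nearest.  A scalar sketch of the same computation, for
 * illustration only (not part of this file):
 */
#if 0
static uint8_t
mul_un8_scalar (uint8_t a, uint8_t b)
{
    uint16_t t = (uint16_t) (a * b) + 0x80;

    return (uint8_t) ((t + (t >> 8)) >> 8);
}
#endif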
+
+static force_inline void
+pix_add_multiply_2x128 (__m128i* src_lo,
+ __m128i* src_hi,
+ __m128i* alpha_dst_lo,
+ __m128i* alpha_dst_hi,
+ __m128i* dst_lo,
+ __m128i* dst_hi,
+ __m128i* alpha_src_lo,
+ __m128i* alpha_src_hi,
+ __m128i* ret_lo,
+ __m128i* ret_hi)
+{
+ __m128i lo, hi;
+ __m128i mul_lo, mul_hi;
+
+ lo = _mm_mullo_epi16 (*src_lo, *alpha_dst_lo);
+ hi = _mm_mullo_epi16 (*src_hi, *alpha_dst_hi);
+ mul_lo = _mm_mullo_epi16 (*dst_lo, *alpha_src_lo);
+ mul_hi = _mm_mullo_epi16 (*dst_hi, *alpha_src_hi);
+ lo = _mm_adds_epu16 (lo, mask_0080);
+ hi = _mm_adds_epu16 (hi, mask_0080);
+ lo = _mm_adds_epu16 (lo, mul_lo);
+ hi = _mm_adds_epu16 (hi, mul_hi);
+ *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
+ *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}
-static inline void
-negate_2x128 (__m128i dataLo, __m128i dataHi, __m128i* negLo, __m128i* negHi)
+static force_inline void
+negate_2x128 (__m128i data_lo,
+ __m128i data_hi,
+ __m128i* neg_lo,
+ __m128i* neg_hi)
{
- *negLo = _mm_xor_si128 (dataLo, Mask00ff);
- *negHi = _mm_xor_si128 (dataHi, Mask00ff);
+ *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
+ *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}
-static inline void
-invertColors_2x128 (__m128i dataLo, __m128i dataHi, __m128i* invLo, __m128i* invHi)
+static force_inline void
+invert_colors_2x128 (__m128i data_lo,
+ __m128i data_hi,
+ __m128i* inv_lo,
+ __m128i* inv_hi)
{
__m128i lo, hi;
- lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(3, 0, 1, 2));
- hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(3, 0, 1, 2));
- *invLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 0, 1, 2));
- *invHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 0, 1, 2));
+ lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
+ hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
+ *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
+ *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}
-static inline void
-over_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaLo, __m128i* alphaHi, __m128i* dstLo, __m128i* dstHi)
+static force_inline void
+over_2x128 (__m128i* src_lo,
+ __m128i* src_hi,
+ __m128i* alpha_lo,
+ __m128i* alpha_hi,
+ __m128i* dst_lo,
+ __m128i* dst_hi)
{
__m128i t1, t2;
- negate_2x128 (*alphaLo, *alphaHi, &t1, &t2);
+ negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
- pixMultiply_2x128 (dstLo, dstHi, &t1, &t2, dstLo, dstHi);
+ pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
- *dstLo = _mm_adds_epu8 (*srcLo, *dstLo);
- *dstHi = _mm_adds_epu8 (*srcHi, *dstHi);
+ *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
+ *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}
-static inline void
-overRevNonPre_2x128 (__m128i srcLo, __m128i srcHi, __m128i* dstLo, __m128i* dstHi)
+static force_inline void
+over_rev_non_pre_2x128 (__m128i src_lo,
+ __m128i src_hi,
+ __m128i* dst_lo,
+ __m128i* dst_hi)
{
__m128i lo, hi;
- __m128i alphaLo, alphaHi;
+ __m128i alpha_lo, alpha_hi;
- expandAlpha_2x128 (srcLo, srcHi, &alphaLo, &alphaHi);
+ expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
- lo = _mm_or_si128 (alphaLo, MaskAlpha);
- hi = _mm_or_si128 (alphaHi, MaskAlpha);
+ lo = _mm_or_si128 (alpha_lo, mask_alpha);
+ hi = _mm_or_si128 (alpha_hi, mask_alpha);
- invertColors_2x128 (srcLo, srcHi, &srcLo, &srcHi);
+ invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
- pixMultiply_2x128 (&srcLo, &srcHi, &lo, &hi, &lo, &hi);
+ pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
- over_2x128 (&lo, &hi, &alphaLo, &alphaHi, dstLo, dstHi);
+ over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}
-static inline void
-inOver_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaLo, __m128i* alphaHi,
- __m128i* maskLo, __m128i* maskHi, __m128i* dstLo, __m128i* dstHi)
+static force_inline void
+in_over_2x128 (__m128i* src_lo,
+ __m128i* src_hi,
+ __m128i* alpha_lo,
+ __m128i* alpha_hi,
+ __m128i* mask_lo,
+ __m128i* mask_hi,
+ __m128i* dst_lo,
+ __m128i* dst_hi)
{
- __m128i sLo, sHi;
- __m128i aLo, aHi;
+ __m128i s_lo, s_hi;
+ __m128i a_lo, a_hi;
- pixMultiply_2x128 ( srcLo, srcHi, maskLo, maskHi, &sLo, &sHi);
- pixMultiply_2x128 (alphaLo, alphaHi, maskLo, maskHi, &aLo, &aHi);
+ pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
+ pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
- over_2x128 (&sLo, &sHi, &aLo, &aHi, dstLo, dstHi);
+ over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}
-static inline void
-cachePrefetch (__m128i* addr)
+static force_inline void
+cache_prefetch (__m128i* addr)
{
_mm_prefetch (addr, _MM_HINT_T0);
}
-static inline void
-cachePrefetchNext (__m128i* addr)
+static force_inline void
+cache_prefetch_next (__m128i* addr)
{
- _mm_prefetch (addr + 4, _MM_HINT_T0); // 64 bytes ahead
+ _mm_prefetch (addr + 4, _MM_HINT_T0); /* 64 bytes ahead */
}
/* load 4 pixels from a 16-byte boundary aligned address */
-static inline __m128i
-load128Aligned (__m128i* src)
+static force_inline __m128i
+load_128_aligned (__m128i* src)
{
return _mm_load_si128 (src);
}
/* load 4 pixels from an unaligned address */
-static inline __m128i
-load128Unaligned (__m128i* src)
+static force_inline __m128i
+load_128_unaligned (const __m128i* src)
{
return _mm_loadu_si128 (src);
}
-/* save 4 pixels using Write Combining memory on a 16-byte boundary aligned address */
-static inline void
-save128WriteCombining (__m128i* dst, __m128i data)
+/* save 4 pixels using Write Combining memory on a 16-byte
+ * boundary aligned address
+ */
+static force_inline void
+save_128_write_combining (__m128i* dst,
+ __m128i data)
{
_mm_stream_si128 (dst, data);
}
/* save 4 pixels on a 16-byte boundary aligned address */
-static inline void
-save128Aligned (__m128i* dst, __m128i data)
+static force_inline void
+save_128_aligned (__m128i* dst,
+ __m128i data)
{
_mm_store_si128 (dst, data);
}
/* save 4 pixels on an unaligned address */
-static inline void
-save128Unaligned (__m128i* dst, __m128i data)
+static force_inline void
+save_128_unaligned (__m128i* dst,
+ __m128i data)
{
_mm_storeu_si128 (dst, data);
}
-/* -------------------------------------------------------------------------------------------------
+/* ------------------------------------------------------------------
* MMX inlines
*/
-static inline __m64
+static force_inline __m64
unpack_32_1x64 (uint32_t data)
{
- return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64());
+ return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64 ());
}
-static inline __m64
-expandAlpha_1x64 (__m64 data)
+static force_inline __m64
+expand_alpha_1x64 (__m64 data)
{
- return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 3, 3, 3));
+ return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
}
-static inline __m64
-expandAlphaRev_1x64 (__m64 data)
+static force_inline __m64
+expand_alpha_rev_1x64 (__m64 data)
{
- return _mm_shuffle_pi16 (data, _MM_SHUFFLE(0, 0, 0, 0));
+ return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}
-static inline __m64
-expandPixel_8_1x64 (uint8_t data)
+static force_inline __m64
+expand_pixel_8_1x64 (uint8_t data)
{
- return _mm_shuffle_pi16 (unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE(0, 0, 0, 0));
+ return _mm_shuffle_pi16 (
+ unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}
-static inline __m64
-pixMultiply_1x64 (__m64 data, __m64 alpha)
+static force_inline __m64
+pix_multiply_1x64 (__m64 data,
+ __m64 alpha)
{
return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
- xMask0080),
- xMask0101);
+ mask_x0080),
+ mask_x0101);
}
-static inline __m64
-pixAddMultiply_1x64 (__m64* src, __m64* alphaDst, __m64* dst, __m64* alphaSrc)
+static force_inline __m64
+pix_add_multiply_1x64 (__m64* src,
+ __m64* alpha_dst,
+ __m64* dst,
+ __m64* alpha_src)
{
- return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (*src, *alphaDst),
- xMask0080),
- _mm_mullo_pi16 (*dst, *alphaSrc)),
- xMask0101);
+ return _mm_mulhi_pu16 (
+ _mm_adds_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (*src, *alpha_dst),
+ mask_x0080),
+ _mm_mullo_pi16 (*dst, *alpha_src)),
+ mask_x0101);
}
-static inline __m64
+static force_inline __m64
negate_1x64 (__m64 data)
{
- return _mm_xor_si64 (data, xMask00ff);
+ return _mm_xor_si64 (data, mask_x00ff);
}
-static inline __m64
-invertColors_1x64 (__m64 data)
+static force_inline __m64
+invert_colors_1x64 (__m64 data)
{
- return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 0, 1, 2));
+ return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}
-static inline __m64
+static force_inline __m64
over_1x64 (__m64 src, __m64 alpha, __m64 dst)
{
- return _mm_adds_pu8 (src, pixMultiply_1x64 (dst, negate_1x64 (alpha)));
+ return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
}
-static inline __m64
-inOver_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
+static force_inline __m64
+in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
{
- return over_1x64 (pixMultiply_1x64 (*src, *mask),
- pixMultiply_1x64 (*alpha, *mask),
+ return over_1x64 (pix_multiply_1x64 (*src, *mask),
+ pix_multiply_1x64 (*alpha, *mask),
*dst);
}
-static inline __m64
-overRevNonPre_1x64 (__m64 src, __m64 dst)
+static force_inline __m64
+over_rev_non_pre_1x64 (__m64 src, __m64 dst)
{
- __m64 alpha = expandAlpha_1x64 (src);
+ __m64 alpha = expand_alpha_1x64 (src);
- return over_1x64 (pixMultiply_1x64 (invertColors_1x64 (src),
- _mm_or_si64 (alpha, xMaskAlpha)),
+ return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
+ _mm_or_si64 (alpha, mask_x_alpha)),
alpha,
dst);
}
-static inline uint32_t
-pack_1x64_32( __m64 data )
+static force_inline uint32_t
+pack_1x64_32 (__m64 data)
{
- return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64()));
+ return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
}
/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
@@ -448,7 +521,7 @@ pack_1x64_32( __m64 data )
* Note the trick here - the top word is shifted by another nibble to
* avoid it bumping into the middle word
*/
-static inline __m64
+static force_inline __m64
expand565_16_1x64 (uint16_t pixel)
{
__m64 p;
@@ -461,702 +534,965 @@ expand565_16_1x64 (uint16_t pixel)
p = _mm_or_si64 (t1, p);
p = _mm_or_si64 (t2, p);
- p = _mm_and_si64 (p, xMask565rgb);
- p = _mm_mullo_pi16 (p, xMask565Unpack);
+ p = _mm_and_si64 (p, mask_x565_rgb);
+ p = _mm_mullo_pi16 (p, mask_x565_unpack);
return _mm_srli_pi16 (p, 8);
}
-/* -------------------------------------------------------------------------------------------------
+/* ----------------------------------------------------------------------------
* Compose Core transformations
*/
-static inline uint32_t
-coreCombineOverUPixelsse2 (uint32_t src, uint32_t dst)
+static force_inline uint32_t
+core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
- uint8_t a;
- __m64 ms;
+ uint8_t a;
+ __m64 ms;
a = src >> 24;
if (a == 0xff)
{
- return src;
+ return src;
}
- else if (a)
+ else if (src)
{
- ms = unpack_32_1x64 (src);
- return pack_1x64_32 (over_1x64 (ms, expandAlpha_1x64 (ms), unpack_32_1x64 (dst)));
+ ms = unpack_32_1x64 (src);
+ return pack_1x64_32 (
+ over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
}
return dst;
}
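/* The helper above is the premultiplied Porter-Duff OVER operator with
 * two shortcuts: an opaque source replaces the destination outright and
 * an all-zero source leaves it untouched.  Otherwise, per channel,
 *
 *     dest = src + dest * (255 - src_alpha) / 255
 *
 * which is exactly what over_1x64 () evaluates on the unpacked pixels.
 */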
-static inline void
-coreCombineOverUsse2 (uint32_t* pd, const uint32_t* ps, int w)
+static force_inline uint32_t
+combine1 (const uint32_t *ps, const uint32_t *pm)
{
- uint32_t pa;
- uint32_t s, d;
+ uint32_t s = *ps;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmAlphaLo, xmmAlphaHi;
+ if (pm)
+ {
+ __m64 ms, mm;
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
+ mm = unpack_32_1x64 (*pm);
+ mm = expand_alpha_1x64 (mm);
- /* Align dst on a 16-byte boundary */
- while (w &&
- ((unsigned long)pd & 15))
+ ms = unpack_32_1x64 (s);
+ ms = pix_multiply_1x64 (ms, mm);
+
+ s = pack_1x64_32 (ms);
+ }
+
+ return s;
+}
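/* combine1 () applies the optional mask to one source pixel: every
 * channel of *ps (alpha included) is scaled by the alpha of *pm,
 * i.e. s.c = s.c * m.a / 255 -- the same scaling combine4 () below
 * performs on four pixels at a time.
 */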
+
+static force_inline __m128i
+combine4 (const __m128i *ps, const __m128i *pm)
+{
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_msk_lo, xmm_msk_hi;
+ __m128i s;
+
+ if (pm)
{
- d = *pd;
- s = *ps++;
+ xmm_msk_lo = load_128_unaligned (pm);
- *pd++ = coreCombineOverUPixelsse2 (s, d);
- w--;
+ if (is_transparent (xmm_msk_lo))
+ return _mm_setzero_si128 ();
}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
+ s = load_128_unaligned (ps);
- while (w >= 4)
+ if (pm)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
+ unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
- /* I'm loading unaligned because I'm not sure about the address alignment. */
- xmmSrcHi = load128Unaligned ((__m128i*) ps);
+ expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
- /* Check the alpha channel */
- pa = packAlpha (xmmSrcHi);
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_msk_lo, &xmm_msk_hi,
+ &xmm_src_lo, &xmm_src_hi);
- if (pa == 0xffffffff)
- {
- save128Aligned ((__m128i*)pd, xmmSrcHi);
- }
- else if (pa)
- {
- xmmDstHi = load128Aligned ((__m128i*) pd);
+ s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
+ }
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+ return s;
+}
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
+static force_inline void
+core_combine_over_u_sse2 (uint32_t* pd,
+ const uint32_t* ps,
+ const uint32_t* pm,
+ int w)
+{
+ uint32_t s, d;
- over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
- /* rebuid the 4 pixel data and save*/
- save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
- }
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
- w -= 4;
- ps += 4;
- pd += 4;
+ /* Align dst on a 16-byte boundary */
+ while (w && ((unsigned long)pd & 15))
+ {
+ d = *pd;
+ s = combine1 (ps, pm);
+
+ *pd++ = core_combine_over_u_pixel_sse2 (s, d);
+ ps++;
+ if (pm)
+ pm++;
+ w--;
+ }
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
+ while (w >= 4)
+ {
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
+ /* I'm loading unaligned because I'm not sure about
+ * the address alignment.
+ */
+ xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+
+ if (is_opaque (xmm_src_hi))
+ {
+ save_128_aligned ((__m128i*)pd, xmm_src_hi);
+ }
+ else if (!is_zero (xmm_src_hi))
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (
+ xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+
+ over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+            /* rebuild the 4 pixel data and save */
+ save_128_aligned ((__m128i*)pd,
+ pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ w -= 4;
+ ps += 4;
+ pd += 4;
+ if (pm)
+ pm += 4;
}
while (w)
{
- d = *pd;
- s = *ps++;
+ d = *pd;
+ s = combine1 (ps, pm);
+
+ *pd++ = core_combine_over_u_pixel_sse2 (s, d);
+ ps++;
+ if (pm)
+ pm++;
- *pd++ = coreCombineOverUPixelsse2 (s, d);
- w--;
+ w--;
}
}
-static inline void
-coreCombineOverReverseUsse2 (uint32_t* pd, const uint32_t* ps, int w)
+static force_inline void
+core_combine_over_reverse_u_sse2 (uint32_t* pd,
+ const uint32_t* ps,
+ const uint32_t* pm,
+ int w)
{
uint32_t s, d;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmAlphaLo, xmmAlphaHi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
/* Align dst on a 16-byte boundary */
while (w &&
((unsigned long)pd & 15))
{
- d = *pd;
- s = *ps++;
-
- *pd++ = coreCombineOverUPixelsse2 (d, s);
- w--;
+ d = *pd;
+ s = combine1 (ps, pm);
+
+ *pd++ = core_combine_over_u_pixel_sse2 (d, s);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
}
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
- /* I'm loading unaligned because I'm not sure about the address alignment. */
- xmmSrcHi = load128Unaligned ((__m128i*) ps);
- xmmDstHi = load128Aligned ((__m128i*) pd);
+ /* I'm loading unaligned because I'm not sure
+ * about the address alignment.
+ */
+ xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
- over_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmSrcLo, &xmmSrcHi);
+ over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_src_lo, &xmm_src_hi);
- /* rebuid the 4 pixel data and save*/
- save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmSrcLo, xmmSrcHi));
+	/* rebuild the 4 pixel data and save */
+ save_128_aligned ((__m128i*)pd,
+ pack_2x128_128 (xmm_src_lo, xmm_src_hi));
- w -= 4;
- ps += 4;
- pd += 4;
+ w -= 4;
+ ps += 4;
+ pd += 4;
+
+ if (pm)
+ pm += 4;
}
while (w)
{
- d = *pd;
- s = *ps++;
-
- *pd++ = coreCombineOverUPixelsse2 (d, s);
- w--;
+ d = *pd;
+ s = combine1 (ps, pm);
+
+ *pd++ = core_combine_over_u_pixel_sse2 (d, s);
+ ps++;
+ w--;
+ if (pm)
+ pm++;
}
}
-static inline uint32_t
-coreCombineInUPixelsse2 (uint32_t src, uint32_t dst)
+static force_inline uint32_t
+core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
{
uint32_t maska = src >> 24;
if (maska == 0)
{
- return 0;
+ return 0;
}
else if (maska != 0xff)
{
- return pack_1x64_32(pixMultiply_1x64 (unpack_32_1x64 (dst), expandAlpha_1x64 (unpack_32_1x64 (src))));
+ return pack_1x64_32 (
+ pix_multiply_1x64 (unpack_32_1x64 (dst),
+ expand_alpha_1x64 (unpack_32_1x64 (src))));
}
return dst;
}
-static inline void
-coreCombineInUsse2 (uint32_t* pd, const uint32_t* ps, int w)
+static force_inline void
+core_combine_in_u_sse2 (uint32_t* pd,
+ const uint32_t* ps,
+ const uint32_t* pm,
+ int w)
{
uint32_t s, d;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w && ((unsigned long) pd & 15))
{
- s = *ps++;
- d = *pd;
-
- *pd++ = coreCombineInUPixelsse2 (d, s);
- w--;
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_in_u_pixelsse2 (d, s);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
}
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
-
- xmmDstHi = load128Aligned ((__m128i*) pd);
- xmmSrcHi = load128Unaligned ((__m128i*) ps);
-
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
- expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);
-
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);
-
- save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
- ps += 4;
- pd += 4;
- w -= 4;
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+ xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned ((__m128i*)pd,
+ pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ w -= 4;
+ if (pm)
+ pm += 4;
}
while (w)
{
- s = *ps++;
- d = *pd;
-
- *pd++ = coreCombineInUPixelsse2 (d, s);
- w--;
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_in_u_pixelsse2 (d, s);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
}
}
-static inline void
-coreCombineReverseInUsse2 (uint32_t* pd, const uint32_t* ps, int w)
+static force_inline void
+core_combine_reverse_in_u_sse2 (uint32_t* pd,
+ const uint32_t* ps,
+ const uint32_t *pm,
+ int w)
{
uint32_t s, d;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w && ((unsigned long) pd & 15))
{
- s = *ps++;
- d = *pd;
-
- *pd++ = coreCombineInUPixelsse2 (s, d);
- w--;
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_in_u_pixelsse2 (s, d);
+ ps++;
+ w--;
+ if (pm)
+ pm++;
}
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
-
- xmmDstHi = load128Aligned ((__m128i*) pd);
- xmmSrcHi = load128Unaligned ((__m128i*) ps);
-
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
-
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
- pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi);
-
- save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
- ps += 4;
- pd += 4;
- w -= 4;
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+ xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_src_lo, &xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ w -= 4;
+ if (pm)
+ pm += 4;
}
while (w)
{
- s = *ps++;
- d = *pd;
-
- *pd++ = coreCombineInUPixelsse2 (s, d);
- w--;
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_in_u_pixelsse2 (s, d);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
}
}
-static inline void
-coreCombineReverseOutUsse2 (uint32_t* pd, const uint32_t* ps, int w)
+static force_inline void
+core_combine_reverse_out_u_sse2 (uint32_t* pd,
+ const uint32_t* ps,
+ const uint32_t* pm,
+ int w)
{
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w && ((unsigned long) pd & 15))
{
- uint32_t s = *ps++;
- uint32_t d = *pd;
-
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (s)))));
- w--;
+ uint32_t s = combine1 (ps, pm);
+ uint32_t d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (
+ unpack_32_1x64 (d), negate_1x64 (
+ expand_alpha_1x64 (unpack_32_1x64 (s)))));
+
+ if (pm)
+ pm++;
+ ps++;
+ w--;
}
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w >= 4)
{
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
+ xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
- xmmSrcHi = load128Unaligned ((__m128i*) ps);
- xmmDstHi = load128Aligned ((__m128i*) pd);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- negate_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
+ pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_src_lo, &xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
- pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi);
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+ ps += 4;
+ pd += 4;
+ if (pm)
+ pm += 4;
- ps += 4;
- pd += 4;
- w -= 4;
+ w -= 4;
}
while (w)
{
- uint32_t s = *ps++;
- uint32_t d = *pd;
-
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (s)))));
- w--;
+ uint32_t s = combine1 (ps, pm);
+ uint32_t d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (
+ unpack_32_1x64 (d), negate_1x64 (
+ expand_alpha_1x64 (unpack_32_1x64 (s)))));
+ ps++;
+ if (pm)
+ pm++;
+ w--;
}
}
-static inline void
-coreCombineOutUsse2 (uint32_t* pd, const uint32_t* ps, int w)
+static force_inline void
+core_combine_out_u_sse2 (uint32_t* pd,
+ const uint32_t* ps,
+ const uint32_t* pm,
+ int w)
{
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w && ((unsigned long) pd & 15))
{
- uint32_t s = *ps++;
- uint32_t d = *pd;
-
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
- w--;
+ uint32_t s = combine1 (ps, pm);
+ uint32_t d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (
+ unpack_32_1x64 (s), negate_1x64 (
+ expand_alpha_1x64 (unpack_32_1x64 (d)))));
+ w--;
+ ps++;
+ if (pm)
+ pm++;
}
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w >= 4)
{
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
- xmmSrcHi = load128Unaligned ((__m128i*) ps);
- xmmDstHi = load128Aligned ((__m128i*) pd);
+ xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);
- negate_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
- save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- ps += 4;
- pd += 4;
- w -= 4;
+ ps += 4;
+ pd += 4;
+ w -= 4;
+ if (pm)
+ pm += 4;
}
while (w)
{
- uint32_t s = *ps++;
- uint32_t d = *pd;
-
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
- w--;
+ uint32_t s = combine1 (ps, pm);
+ uint32_t d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (
+ unpack_32_1x64 (s), negate_1x64 (
+ expand_alpha_1x64 (unpack_32_1x64 (d)))));
+ w--;
+ ps++;
+ if (pm)
+ pm++;
}
}
-static inline uint32_t
-coreCombineAtopUPixelsse2 (uint32_t src, uint32_t dst)
+static force_inline uint32_t
+core_combine_atop_u_pixel_sse2 (uint32_t src,
+ uint32_t dst)
{
__m64 s = unpack_32_1x64 (src);
__m64 d = unpack_32_1x64 (dst);
- __m64 sa = negate_1x64 (expandAlpha_1x64 (s));
- __m64 da = expandAlpha_1x64 (d);
+ __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
+ __m64 da = expand_alpha_1x64 (d);
- return pack_1x64_32 (pixAddMultiply_1x64 (&s, &da, &d, &sa));
+ return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
}
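/* Premultiplied Porter-Duff ATOP, per channel:
 *
 *     dest = (src * dest_alpha + dest * (255 - src_alpha)) / 255
 *
 * computed here as a single pix_add_multiply with the negated source
 * alpha.
 */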
-static inline void
-coreCombineAtopUsse2 (uint32_t* pd, const uint32_t* ps, int w)
+static force_inline void
+core_combine_atop_u_sse2 (uint32_t* pd,
+ const uint32_t* ps,
+ const uint32_t* pm,
+ int w)
{
uint32_t s, d;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
- __m128i xmmAlphaDstLo, xmmAlphaDstHi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+ __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w && ((unsigned long) pd & 15))
{
- s = *ps++;
- d = *pd;
-
- *pd++ = coreCombineAtopUPixelsse2 (s, d);
- w--;
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
}
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
-
- xmmSrcHi = load128Unaligned ((__m128i*) ps);
- xmmDstHi = load128Aligned ((__m128i*) pd);
-
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
-
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
- expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
-
- negate_2x128 (xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
-
- pixAddMultiply_2x128 ( &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
- &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
- &xmmDstLo, &xmmDstHi );
-
- save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
- ps += 4;
- pd += 4;
- w -= 4;
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
+ xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+
+ pix_add_multiply_2x128 (
+ &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ w -= 4;
+ if (pm)
+ pm += 4;
}
while (w)
{
- s = *ps++;
- d = *pd;
-
- *pd++ = coreCombineAtopUPixelsse2 (s, d);
- w--;
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
}
}
-static inline uint32_t
-coreCombineReverseAtopUPixelsse2 (uint32_t src, uint32_t dst)
+static force_inline uint32_t
+core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
+ uint32_t dst)
{
__m64 s = unpack_32_1x64 (src);
__m64 d = unpack_32_1x64 (dst);
- __m64 sa = expandAlpha_1x64 (s);
- __m64 da = negate_1x64 (expandAlpha_1x64 (d));
+ __m64 sa = expand_alpha_1x64 (s);
+ __m64 da = negate_1x64 (expand_alpha_1x64 (d));
- return pack_1x64_32 (pixAddMultiply_1x64 (&s, &da, &d, &sa));
+ return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
}
-static inline void
-coreCombineReverseAtopUsse2 (uint32_t* pd, const uint32_t* ps, int w)
+static force_inline void
+core_combine_reverse_atop_u_sse2 (uint32_t* pd,
+ const uint32_t* ps,
+ const uint32_t* pm,
+ int w)
{
uint32_t s, d;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
- __m128i xmmAlphaDstLo, xmmAlphaDstHi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+ __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w && ((unsigned long) pd & 15))
{
- s = *ps++;
- d = *pd;
-
- *pd++ = coreCombineReverseAtopUPixelsse2 (s, d);
- w--;
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
+ ps++;
+ w--;
+ if (pm)
+ pm++;
}
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
-
- xmmSrcHi = load128Unaligned ((__m128i*) ps);
- xmmDstHi = load128Aligned ((__m128i*) pd);
-
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
-
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
- expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
-
- negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
-
- pixAddMultiply_2x128 ( &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
- &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
- &xmmDstLo, &xmmDstHi );
-
- save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
- ps += 4;
- pd += 4;
- w -= 4;
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
+ xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ pix_add_multiply_2x128 (
+ &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ w -= 4;
+ if (pm)
+ pm += 4;
}
while (w)
{
- s = *ps++;
- d = *pd;
-
- *pd++ = coreCombineReverseAtopUPixelsse2 (s, d);
- w--;
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
+ ps++;
+ w--;
+ if (pm)
+ pm++;
}
}
-static inline uint32_t
-coreCombineXorUPixelsse2 (uint32_t src, uint32_t dst)
+static force_inline uint32_t
+core_combine_xor_u_pixel_sse2 (uint32_t src,
+ uint32_t dst)
{
__m64 s = unpack_32_1x64 (src);
__m64 d = unpack_32_1x64 (dst);
- __m64 negD = negate_1x64 (expandAlpha_1x64 (d));
- __m64 negS = negate_1x64 (expandAlpha_1x64 (s));
+ __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
+ __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
- return pack_1x64_32 (pixAddMultiply_1x64 (&s, &negD, &d, &negS));
+ return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
}
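/* Premultiplied Porter-Duff XOR, per channel:
 *
 *     dest = (src * (255 - dest_alpha) + dest * (255 - src_alpha)) / 255
 *
 * i.e. each operand only contributes where the other one is transparent.
 */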
-static inline void
-coreCombineXorUsse2 (uint32_t* dst, const uint32_t* src, int width)
+static force_inline void
+core_combine_xor_u_sse2 (uint32_t* dst,
+ const uint32_t* src,
+ const uint32_t *mask,
+ int width)
{
int w = width;
uint32_t s, d;
uint32_t* pd = dst;
const uint32_t* ps = src;
+ const uint32_t* pm = mask;
- __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
- __m128i xmmDst, xmmDstLo, xmmDstHi;
- __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
- __m128i xmmAlphaDstLo, xmmAlphaDstHi;
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+ __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w && ((unsigned long) pd & 15))
{
- s = *ps++;
- d = *pd;
-
- *pd++ = coreCombineXorUPixelsse2 (s, d);
- w--;
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
}
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
-
- xmmSrc = load128Unaligned ((__m128i*) ps);
- xmmDst = load128Aligned ((__m128i*) pd);
-
- unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
-
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
- expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
-
- negate_2x128 (xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
- negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
-
- pixAddMultiply_2x128 ( &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
- &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
- &xmmDstLo, &xmmDstHi );
-
- save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
- ps += 4;
- pd += 4;
- w -= 4;
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
+ xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
+ xmm_dst = load_128_aligned ((__m128i*) pd);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ pix_add_multiply_2x128 (
+ &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ w -= 4;
+ if (pm)
+ pm += 4;
}
while (w)
{
- s = *ps++;
- d = *pd;
-
- *pd++ = coreCombineXorUPixelsse2 (s, d);
- w--;
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
}
}
-static inline void
-coreCombineAddUsse2 (uint32_t* dst, const uint32_t* src, int width)
+static force_inline void
+core_combine_add_u_sse2 (uint32_t* dst,
+ const uint32_t* src,
+ const uint32_t* mask,
+ int width)
{
int w = width;
- uint32_t s,d;
+ uint32_t s, d;
uint32_t* pd = dst;
const uint32_t* ps = src;
+ const uint32_t* pm = mask;
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w && (unsigned long)pd & 15)
{
- s = *ps++;
- d = *pd;
- *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
- w--;
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ ps++;
+ if (pm)
+ pm++;
+ *pd++ = _mm_cvtsi64_si32 (
+ _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
+ w--;
}
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
-
- save128Aligned( (__m128i*)pd,
- _mm_adds_epu8( load128Unaligned((__m128i*)ps),
- load128Aligned ((__m128i*)pd)) );
- pd += 4;
- ps += 4;
- w -= 4;
+ __m128i s;
+
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
+ s = combine4 ((__m128i*)ps, (__m128i*)pm);
+
+ save_128_aligned (
+ (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));
+
+ pd += 4;
+ ps += 4;
+ if (pm)
+ pm += 4;
+ w -= 4;
}
while (w--)
{
- s = *ps++;
- d = *pd;
- *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ ps++;
+ *pd++ = _mm_cvtsi64_si32 (
+ _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
+ if (pm)
+ pm++;
}
}
-static inline uint32_t
-coreCombineSaturateUPixelsse2 (uint32_t src, uint32_t dst)
+static force_inline uint32_t
+core_combine_saturate_u_pixel_sse2 (uint32_t src,
+ uint32_t dst)
{
__m64 ms = unpack_32_1x64 (src);
__m64 md = unpack_32_1x64 (dst);
@@ -1165,1816 +1501,2061 @@ coreCombineSaturateUPixelsse2 (uint32_t src, uint32_t dst)
if (sa > da)
{
- ms = pixMultiply_1x64 (ms, expandAlpha_1x64 (unpack_32_1x64 (FbIntDiv(da, sa) << 24)));
+ ms = pix_multiply_1x64 (
+ ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
}
return pack_1x64_32 (_mm_adds_pu16 (md, ms));
}
-static inline void
-coreCombineSaturateUsse2 (uint32_t *pd, const uint32_t *ps, int w)
+static force_inline void
+core_combine_saturate_u_sse2 (uint32_t * pd,
+ const uint32_t *ps,
+ const uint32_t *pm,
+ int w)
{
- uint32_t s,d;
+ uint32_t s, d;
- uint32_t packCmp;
- __m128i xmmSrc, xmmDst;
+ uint32_t pack_cmp;
+ __m128i xmm_src, xmm_dst;
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w && (unsigned long)pd & 15)
{
- s = *ps++;
- d = *pd;
- *pd++ = coreCombineSaturateUPixelsse2 (s, d);
- w--;
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
}
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
-
- xmmDst = load128Aligned ((__m128i*)pd);
- xmmSrc = load128Unaligned((__m128i*)ps);
-
- packCmp = _mm_movemask_epi8 (_mm_cmpgt_epi32 (_mm_srli_epi32 (xmmSrc, 24),
- _mm_srli_epi32 (_mm_xor_si128 (xmmDst, Maskff000000), 24)));
-
- /* if some alpha src is grater than respective ~alpha dst */
- if (packCmp)
- {
- s = *ps++;
- d = *pd;
- *pd++ = coreCombineSaturateUPixelsse2 (s, d);
-
- s = *ps++;
- d = *pd;
- *pd++ = coreCombineSaturateUPixelsse2 (s, d);
-
- s = *ps++;
- d = *pd;
- *pd++ = coreCombineSaturateUPixelsse2 (s, d);
-
- s = *ps++;
- d = *pd;
- *pd++ = coreCombineSaturateUPixelsse2 (s, d);
- }
- else
- {
- save128Aligned ((__m128i*)pd, _mm_adds_epu8 (xmmDst, xmmSrc));
-
- pd += 4;
- ps += 4;
- }
-
- w -= 4;
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
+ xmm_dst = load_128_aligned ((__m128i*)pd);
+ xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
+
+ pack_cmp = _mm_movemask_epi8 (
+ _mm_cmpgt_epi32 (
+ _mm_srli_epi32 (xmm_src, 24),
+ _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
+
+      /* if some alpha src is greater than respective ~alpha dst */
+ if (pack_cmp)
+ {
+ s = combine1 (ps++, pm);
+ d = *pd;
+ *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+ if (pm)
+ pm++;
+
+ s = combine1 (ps++, pm);
+ d = *pd;
+ *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+ if (pm)
+ pm++;
+
+ s = combine1 (ps++, pm);
+ d = *pd;
+ *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+ if (pm)
+ pm++;
+
+ s = combine1 (ps++, pm);
+ d = *pd;
+ *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+ if (pm)
+ pm++;
+ }
+ else
+ {
+ save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
+
+ pd += 4;
+ ps += 4;
+ if (pm)
+ pm += 4;
+ }
+
+ w -= 4;
}
while (w--)
{
- s = *ps++;
- d = *pd;
- *pd++ = coreCombineSaturateUPixelsse2 (s, d);
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+ ps++;
+ if (pm)
+ pm++;
}
}
-static inline void
-coreCombineSrcCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
+static force_inline void
+core_combine_src_ca_sse2 (uint32_t* pd,
+ const uint32_t* ps,
+ const uint32_t *pm,
+ int w)
{
uint32_t s, m;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmMaskLo, xmmMaskHi;
- __m128i xmmDstLo, xmmDstHi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w && (unsigned long)pd & 15)
{
- s = *ps++;
- m = *pm++;
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
- w--;
+ s = *ps++;
+ m = *pm++;
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
+ w--;
}
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
- xmmSrcHi = load128Unaligned ((__m128i*)ps);
- xmmMaskHi = load128Unaligned ((__m128i*)pm);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
- pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
- save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
}
while (w)
{
- s = *ps++;
- m = *pm++;
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
- w--;
+ s = *ps++;
+ m = *pm++;
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
+ w--;
}
}
-static inline uint32_t
-coreCombineOverCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
+static force_inline uint32_t
+core_combine_over_ca_pixel_sse2 (uint32_t src,
+ uint32_t mask,
+ uint32_t dst)
{
__m64 s = unpack_32_1x64 (src);
- __m64 expAlpha = expandAlpha_1x64 (s);
- __m64 unpkMask = unpack_32_1x64 (mask);
- __m64 unpkDst = unpack_32_1x64 (dst);
+ __m64 expAlpha = expand_alpha_1x64 (s);
+ __m64 unpk_mask = unpack_32_1x64 (mask);
+ __m64 unpk_dst = unpack_32_1x64 (dst);
- return pack_1x64_32 (inOver_1x64 (&s, &expAlpha, &unpkMask, &unpkDst));
+ return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
}
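The in_over_1x64() call above performs the component-alpha OVER per channel: dst = src*mask + dst*(1 - mask*alpha(src)). A hedged scalar equivalent, reusing mul_un8() from the first sketch (the function name is illustrative only):

static uint32_t
over_ca_pixel_model (uint32_t src, uint32_t mask, uint32_t dst)
{
    uint8_t  sa = src >> 24;
    uint32_t result = 0;
    int      shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        uint8_t  s = (src  >> shift) & 0xff;
        uint8_t  m = (mask >> shift) & 0xff;
        uint8_t  d = (dst  >> shift) & 0xff;
        uint32_t c = mul_un8 (s, m) + mul_un8 (d, 255 - mul_un8 (m, sa));

        result |= (c > 0xff ? 0xff : c) << shift;
    }

    return result;
}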
-static inline void
-coreCombineOverCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
+static force_inline void
+core_combine_over_ca_sse2 (uint32_t* pd,
+ const uint32_t* ps,
+ const uint32_t *pm,
+ int w)
{
uint32_t s, m, d;
- __m128i xmmAlphaLo, xmmAlphaHi;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmMaskLo, xmmMaskHi;
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w && (unsigned long)pd & 15)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
- *pd++ = coreCombineOverCPixelsse2 (s, m, d);
- w--;
+ *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
+ w--;
}
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
-
- xmmDstHi = load128Aligned ((__m128i*)pd);
- xmmSrcHi = load128Unaligned ((__m128i*)ps);
- xmmMaskHi = load128Unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
-
- inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
-
- save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
}
while (w)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
- *pd++ = coreCombineOverCPixelsse2 (s, m, d);
- w--;
+ *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
+ w--;
}
}
-static inline uint32_t
-coreCombineOverReverseCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
+static force_inline uint32_t
+core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
+ uint32_t mask,
+ uint32_t dst)
{
__m64 d = unpack_32_1x64 (dst);
- return pack_1x64_32(over_1x64 (d, expandAlpha_1x64 (d), pixMultiply_1x64 (unpack_32_1x64 (src), unpack_32_1x64 (mask))));
+ return pack_1x64_32 (
+ over_1x64 (d, expand_alpha_1x64 (d),
+ pix_multiply_1x64 (unpack_32_1x64 (src),
+ unpack_32_1x64 (mask))));
}
-static inline void
-coreCombineOverReverseCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
+static force_inline void
+core_combine_over_reverse_ca_sse2 (uint32_t* pd,
+ const uint32_t* ps,
+ const uint32_t *pm,
+ int w)
{
uint32_t s, m, d;
- __m128i xmmAlphaLo, xmmAlphaHi;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmMaskLo, xmmMaskHi;
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w && (unsigned long)pd & 15)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
- *pd++ = coreCombineOverReverseCPixelsse2 (s, m, d);
- w--;
+ *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
+ w--;
}
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
-
- xmmDstHi = load128Aligned ((__m128i*)pd);
- xmmSrcHi = load128Unaligned ((__m128i*)ps);
- xmmMaskHi = load128Unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
- expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
- pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
- over_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi);
-
- save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmMaskLo, xmmMaskHi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
}
while (w)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
- *pd++ = coreCombineOverReverseCPixelsse2 (s, m, d);
- w--;
+ *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
+ w--;
}
}
-static inline void
-coreCombineInCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
+static force_inline void
+core_combine_in_ca_sse2 (uint32_t * pd,
+ const uint32_t *ps,
+ const uint32_t *pm,
+ int w)
{
uint32_t s, m, d;
- __m128i xmmAlphaLo, xmmAlphaHi;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmMaskLo, xmmMaskHi;
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w && (unsigned long)pd & 15)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (
+ pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
+ expand_alpha_1x64 (unpack_32_1x64 (d))));
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
- expandAlpha_1x64 (unpack_32_1x64 (d))));
- w--;
+ w--;
}
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
- xmmDstHi = load128Aligned ((__m128i*)pd);
- xmmSrcHi = load128Unaligned ((__m128i*)ps);
- xmmMaskHi = load128Unaligned ((__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
- expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
- pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
- pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
- save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+ pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
}
while (w)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (
+ pix_multiply_1x64 (
+ unpack_32_1x64 (s), unpack_32_1x64 (m)),
+ expand_alpha_1x64 (unpack_32_1x64 (d))));
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
- expandAlpha_1x64 (unpack_32_1x64 (d))));
- w--;
+ w--;
}
}
-static inline void
-coreCombineInReverseCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
+static force_inline void
+core_combine_in_reverse_ca_sse2 (uint32_t * pd,
+ const uint32_t *ps,
+ const uint32_t *pm,
+ int w)
{
uint32_t s, m, d;
- __m128i xmmAlphaLo, xmmAlphaHi;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmMaskLo, xmmMaskHi;
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w && (unsigned long)pd & 15)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
- pixMultiply_1x64 (unpack_32_1x64 (m),
- expandAlpha_1x64 (unpack_32_1x64 (s)))));
- w--;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (
+ unpack_32_1x64 (d),
+ pix_multiply_1x64 (unpack_32_1x64 (m),
+ expand_alpha_1x64 (unpack_32_1x64 (s)))));
+ w--;
}
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
-
- xmmDstHi = load128Aligned ((__m128i*)pd);
- xmmSrcHi = load128Unaligned ((__m128i*)ps);
- xmmMaskHi = load128Unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
- pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaLo, &xmmAlphaHi, &xmmAlphaLo, &xmmAlphaHi);
-
- pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);
-
- save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+ pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
}
while (w)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
- pixMultiply_1x64 (unpack_32_1x64 (m),
- expandAlpha_1x64 (unpack_32_1x64 (s)))));
- w--;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (
+ unpack_32_1x64 (d),
+ pix_multiply_1x64 (unpack_32_1x64 (m),
+ expand_alpha_1x64 (unpack_32_1x64 (s)))));
+ w--;
}
}
-static inline void
-coreCombineOutCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
+static force_inline void
+core_combine_out_ca_sse2 (uint32_t * pd,
+ const uint32_t *ps,
+ const uint32_t *pm,
+ int w)
{
uint32_t s, m, d;
- __m128i xmmAlphaLo, xmmAlphaHi;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmMaskLo, xmmMaskHi;
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w && (unsigned long)pd & 15)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
- negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
- w--;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (
+ pix_multiply_1x64 (
+ unpack_32_1x64 (s), unpack_32_1x64 (m)),
+ negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
+ w--;
}
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
-
- xmmDstHi = load128Aligned ((__m128i*)pd);
- xmmSrcHi = load128Unaligned ((__m128i*)ps);
- xmmMaskHi = load128Unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
- expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
- negate_2x128 (xmmAlphaLo, xmmAlphaHi, &xmmAlphaLo, &xmmAlphaHi);
-
- pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
- pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);
-
- save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+ negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+ pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
}
while (w)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (
+ pix_multiply_1x64 (
+ unpack_32_1x64 (s), unpack_32_1x64 (m)),
+ negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
- negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
- w--;
+ w--;
}
}
-static inline void
-coreCombineOutReverseCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
+static force_inline void
+core_combine_out_reverse_ca_sse2 (uint32_t * pd,
+ const uint32_t *ps,
+ const uint32_t *pm,
+ int w)
{
uint32_t s, m, d;
- __m128i xmmAlphaLo, xmmAlphaHi;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmMaskLo, xmmMaskHi;
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w && (unsigned long)pd & 15)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
- negate_1x64 (pixMultiply_1x64 (unpack_32_1x64 (m),
- expandAlpha_1x64 (unpack_32_1x64 (s))))));
- w--;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (
+ unpack_32_1x64 (d),
+ negate_1x64 (pix_multiply_1x64 (
+ unpack_32_1x64 (m),
+ expand_alpha_1x64 (unpack_32_1x64 (s))))));
+ w--;
}
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
- xmmDstHi = load128Aligned ((__m128i*)pd);
- xmmSrcHi = load128Unaligned ((__m128i*)ps);
- xmmMaskHi = load128Unaligned ((__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
- pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi);
+ pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
- negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+ negate_2x128 (xmm_mask_lo, xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
- pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
+ pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
- save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
}
while (w)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
- negate_1x64 (pixMultiply_1x64 (unpack_32_1x64 (m),
- expandAlpha_1x64 (unpack_32_1x64 (s))))));
- w--;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (
+ unpack_32_1x64 (d),
+ negate_1x64 (pix_multiply_1x64 (
+ unpack_32_1x64 (m),
+ expand_alpha_1x64 (unpack_32_1x64 (s))))));
+ w--;
}
}
-static inline uint32_t
-coreCombineAtopCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
+static force_inline uint32_t
+core_combine_atop_ca_pixel_sse2 (uint32_t src,
+ uint32_t mask,
+ uint32_t dst)
{
__m64 m = unpack_32_1x64 (mask);
__m64 s = unpack_32_1x64 (src);
__m64 d = unpack_32_1x64 (dst);
- __m64 sa = expandAlpha_1x64 (s);
- __m64 da = expandAlpha_1x64 (d);
+ __m64 sa = expand_alpha_1x64 (s);
+ __m64 da = expand_alpha_1x64 (d);
- s = pixMultiply_1x64 (s, m);
- m = negate_1x64 (pixMultiply_1x64 (m, sa));
+ s = pix_multiply_1x64 (s, m);
+ m = negate_1x64 (pix_multiply_1x64 (m, sa));
- return pack_1x64_32 (pixAddMultiply_1x64 (&d, &m, &s, &da));
+ return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
}
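The sequence above spells out the component-alpha ATOP math: the source is multiplied by the mask, the mask is multiplied by the source alpha and negated, and pix_add_multiply_1x64() then yields dst*(1 - mask*alpha(src)) + (src*mask)*alpha(dst). A scalar sketch of the same computation, reusing mul_un8() from the first sketch (the function name is illustrative only):

static uint32_t
atop_ca_pixel_model (uint32_t src, uint32_t mask, uint32_t dst)
{
    uint8_t  sa = src >> 24;
    uint8_t  da = dst >> 24;
    uint32_t result = 0;
    int      shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        uint8_t  s = (src  >> shift) & 0xff;
        uint8_t  m = (mask >> shift) & 0xff;
        uint8_t  d = (dst  >> shift) & 0xff;
        uint32_t c = mul_un8 (d, 255 - mul_un8 (m, sa))
                   + mul_un8 (mul_un8 (s, m), da);

        result |= (c > 0xff ? 0xff : c) << shift;
    }

    return result;
}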
-static inline void
-coreCombineAtopCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
+static force_inline void
+core_combine_atop_ca_sse2 (uint32_t * pd,
+ const uint32_t *ps,
+ const uint32_t *pm,
+ int w)
{
uint32_t s, m, d;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
- __m128i xmmAlphaDstLo, xmmAlphaDstHi;
- __m128i xmmMaskLo, xmmMaskHi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+ __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w && (unsigned long)pd & 15)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
- *pd++ = coreCombineAtopCPixelsse2 (s, m, d);
- w--;
+ *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
+ w--;
}
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
-
- xmmDstHi = load128Aligned ((__m128i*)pd);
- xmmSrcHi = load128Unaligned ((__m128i*)ps);
- xmmMaskHi = load128Unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
- expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
-
- pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
- pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);
-
- negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
- pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi,
- &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
- &xmmDstLo, &xmmDstHi);
-
- save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi);
+ pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ pix_add_multiply_2x128 (
+ &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
}
while (w)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
- *pd++ = coreCombineAtopCPixelsse2 (s, m, d);
- w--;
+ *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
+ w--;
}
}
-static inline uint32_t
-coreCombineReverseAtopCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
+static force_inline uint32_t
+core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
+ uint32_t mask,
+ uint32_t dst)
{
__m64 m = unpack_32_1x64 (mask);
__m64 s = unpack_32_1x64 (src);
__m64 d = unpack_32_1x64 (dst);
- __m64 da = negate_1x64 (expandAlpha_1x64 (d));
- __m64 sa = expandAlpha_1x64 (s);
+ __m64 da = negate_1x64 (expand_alpha_1x64 (d));
+ __m64 sa = expand_alpha_1x64 (s);
- s = pixMultiply_1x64 (s, m);
- m = pixMultiply_1x64 (m, sa);
+ s = pix_multiply_1x64 (s, m);
+ m = pix_multiply_1x64 (m, sa);
- return pack_1x64_32 (pixAddMultiply_1x64 (&d, &m, &s, &da));
+ return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
}
-static inline void
-coreCombineReverseAtopCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
+static force_inline void
+core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
+ const uint32_t *ps,
+ const uint32_t *pm,
+ int w)
{
uint32_t s, m, d;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
- __m128i xmmAlphaDstLo, xmmAlphaDstHi;
- __m128i xmmMaskLo, xmmMaskHi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+ __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w && (unsigned long)pd & 15)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
- *pd++ = coreCombineReverseAtopCPixelsse2 (s, m, d);
- w--;
+ *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
+ w--;
}
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
-
- xmmDstHi = load128Aligned ((__m128i*)pd);
- xmmSrcHi = load128Unaligned ((__m128i*)ps);
- xmmMaskHi = load128Unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
- expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
-
- pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
- pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);
-
- negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
-
- pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi,
- &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
- &xmmDstLo, &xmmDstHi);
-
- save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi);
+ pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ pix_add_multiply_2x128 (
+ &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
}
while (w)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
- *pd++ = coreCombineReverseAtopCPixelsse2 (s, m, d);
- w--;
+ *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
+ w--;
}
}
-static inline uint32_t
-coreCombineXorCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
+static force_inline uint32_t
+core_combine_xor_ca_pixel_sse2 (uint32_t src,
+ uint32_t mask,
+ uint32_t dst)
{
__m64 a = unpack_32_1x64 (mask);
__m64 s = unpack_32_1x64 (src);
__m64 d = unpack_32_1x64 (dst);
- __m64 alphaDst = negate_1x64 (pixMultiply_1x64 (a, expandAlpha_1x64 (s)));
- __m64 dest = pixMultiply_1x64 (s, a);
- __m64 alphaSrc = negate_1x64 (expandAlpha_1x64 (d));
+ __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
+ a, expand_alpha_1x64 (s)));
+ __m64 dest = pix_multiply_1x64 (s, a);
+ __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
- return pack_1x64_32 (pixAddMultiply_1x64 (&d,
- &alphaDst,
- &dest,
- &alphaSrc));
+ return pack_1x64_32 (pix_add_multiply_1x64 (&d,
+ &alpha_dst,
+ &dest,
+ &alpha_src));
}
-static inline void
-coreCombineXorCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
+static force_inline void
+core_combine_xor_ca_sse2 (uint32_t * pd,
+ const uint32_t *ps,
+ const uint32_t *pm,
+ int w)
{
uint32_t s, m, d;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
- __m128i xmmAlphaDstLo, xmmAlphaDstHi;
- __m128i xmmMaskLo, xmmMaskHi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+ __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w && (unsigned long)pd & 15)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
- *pd++ = coreCombineXorCPixelsse2 (s, m, d);
- w--;
+ *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
+ w--;
}
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
-
- xmmDstHi = load128Aligned ((__m128i*)pd);
- xmmSrcHi = load128Unaligned ((__m128i*)ps);
- xmmMaskHi = load128Unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
- expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
-
- pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
- pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);
-
- negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
- negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
- pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi,
- &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
- &xmmDstLo, &xmmDstHi);
-
- save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi);
+ pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+ negate_2x128 (xmm_mask_lo, xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ pix_add_multiply_2x128 (
+ &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
}
while (w)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
- *pd++ = coreCombineXorCPixelsse2 (s, m, d);
- w--;
+ *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
+ w--;
}
}
-static inline void
-coreCombineAddCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
+static force_inline void
+core_combine_add_ca_sse2 (uint32_t * pd,
+ const uint32_t *ps,
+ const uint32_t *pm,
+ int w)
{
uint32_t s, m, d;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmMaskLo, xmmMaskHi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w && (unsigned long)pd & 15)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (_mm_adds_pu8 (pixMultiply_1x64 (unpack_32_1x64 (s),
- unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
- w--;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
+ unpack_32_1x64 (m)),
+ unpack_32_1x64 (d)));
+ w--;
}
/* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ cache_prefetch ((__m128i*)ps);
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
-
- xmmSrcHi = load128Unaligned ((__m128i*)ps);
- xmmMaskHi = load128Unaligned ((__m128i*)pm);
- xmmDstHi = load128Aligned ((__m128i*)pd);
-
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
-
- pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
-
- save128Aligned( (__m128i*)pd, pack_2x128_128 (_mm_adds_epu8 (xmmSrcLo, xmmDstLo),
- _mm_adds_epu8 (xmmSrcHi, xmmDstHi)));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)ps);
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (
+ _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
+ _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
}
while (w)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (_mm_adds_pu8 (pixMultiply_1x64 (unpack_32_1x64 (s),
- unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
- w--;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
+ unpack_32_1x64 (m)),
+ unpack_32_1x64 (d)));
+ w--;
}
}
-/* -------------------------------------------------------------------------------------------------
- * fbComposeSetupSSE2
+/* ---------------------------------------------------
+ * fb_compose_setup_sse2
*/
-static inline __m64
-createMask_16_64 (uint16_t mask)
+static force_inline __m64
+create_mask_16_64 (uint16_t mask)
{
return _mm_set1_pi16 (mask);
}
-static inline __m128i
-createMask_16_128 (uint16_t mask)
+static force_inline __m128i
+create_mask_16_128 (uint16_t mask)
{
return _mm_set1_epi16 (mask);
}
-static inline __m64
-createMask_2x32_64 (uint32_t mask0, uint32_t mask1)
+static force_inline __m64
+create_mask_2x32_64 (uint32_t mask0,
+ uint32_t mask1)
{
return _mm_set_pi32 (mask0, mask1);
}
-static inline __m128i
-createMask_2x32_128 (uint32_t mask0, uint32_t mask1)
+static force_inline __m128i
+create_mask_2x32_128 (uint32_t mask0,
+ uint32_t mask1)
{
return _mm_set_epi32 (mask0, mask1, mask0, mask1);
}
/* SSE2 code patch for fbcompose.c */
-static FASTCALL void
-sse2CombineMaskU (uint32_t *dst, const uint32_t *src, int width)
+static void
+sse2_combine_over_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
{
- coreCombineReverseInUsse2 (dst, src, width);
- _mm_empty();
+ core_combine_over_u_sse2 (dst, src, mask, width);
+ _mm_empty ();
}
-static FASTCALL void
-sse2CombineOverU (uint32_t *dst, const uint32_t *src, int width)
+static void
+sse2_combine_over_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
{
- coreCombineOverUsse2 (dst, src, width);
- _mm_empty();
+ core_combine_over_reverse_u_sse2 (dst, src, mask, width);
+ _mm_empty ();
}
-static FASTCALL void
-sse2CombineOverReverseU (uint32_t *dst, const uint32_t *src, int width)
+static void
+sse2_combine_in_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
{
- coreCombineOverReverseUsse2 (dst, src, width);
- _mm_empty();
+ core_combine_in_u_sse2 (dst, src, mask, width);
+ _mm_empty ();
}
-static FASTCALL void
-sse2CombineInU (uint32_t *dst, const uint32_t *src, int width)
+static void
+sse2_combine_in_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
{
- coreCombineInUsse2 (dst, src, width);
- _mm_empty();
+ core_combine_reverse_in_u_sse2 (dst, src, mask, width);
+ _mm_empty ();
}
-static FASTCALL void
-sse2CombineInReverseU (uint32_t *dst, const uint32_t *src, int width)
+static void
+sse2_combine_out_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
{
- coreCombineReverseInUsse2 (dst, src, width);
- _mm_empty();
+ core_combine_out_u_sse2 (dst, src, mask, width);
+ _mm_empty ();
}
-static FASTCALL void
-sse2CombineOutU (uint32_t *dst, const uint32_t *src, int width)
+static void
+sse2_combine_out_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
{
- coreCombineOutUsse2 (dst, src, width);
- _mm_empty();
+ core_combine_reverse_out_u_sse2 (dst, src, mask, width);
+ _mm_empty ();
}
-static FASTCALL void
-sse2CombineOutReverseU (uint32_t *dst, const uint32_t *src, int width)
+static void
+sse2_combine_atop_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
{
- coreCombineReverseOutUsse2 (dst, src, width);
- _mm_empty();
+ core_combine_atop_u_sse2 (dst, src, mask, width);
+ _mm_empty ();
}
-static FASTCALL void
-sse2CombineAtopU (uint32_t *dst, const uint32_t *src, int width)
+static void
+sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
{
- coreCombineAtopUsse2 (dst, src, width);
- _mm_empty();
+ core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
+ _mm_empty ();
}
-static FASTCALL void
-sse2CombineAtopReverseU (uint32_t *dst, const uint32_t *src, int width)
+static void
+sse2_combine_xor_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
{
- coreCombineReverseAtopUsse2 (dst, src, width);
- _mm_empty();
+ core_combine_xor_u_sse2 (dst, src, mask, width);
+ _mm_empty ();
}
-static FASTCALL void
-sse2CombineXorU (uint32_t *dst, const uint32_t *src, int width)
+static void
+sse2_combine_add_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
{
- coreCombineXorUsse2 (dst, src, width);
- _mm_empty();
+ core_combine_add_u_sse2 (dst, src, mask, width);
+ _mm_empty ();
}
-static FASTCALL void
-sse2CombineAddU (uint32_t *dst, const uint32_t *src, int width)
+static void
+sse2_combine_saturate_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
{
- coreCombineAddUsse2 (dst, src, width);
- _mm_empty();
+ core_combine_saturate_u_sse2 (dst, src, mask, width);
+ _mm_empty ();
}
-static FASTCALL void
-sse2CombineSaturateU (uint32_t *dst, const uint32_t *src, int width)
+static void
+sse2_combine_src_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
{
- coreCombineSaturateUsse2 (dst, src, width);
- _mm_empty();
+ core_combine_src_ca_sse2 (dst, src, mask, width);
+ _mm_empty ();
}
-static FASTCALL void
-sse2CombineSrcC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
+static void
+sse2_combine_over_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
{
- coreCombineSrcCsse2 (dst, src, mask, width);
- _mm_empty();
+ core_combine_over_ca_sse2 (dst, src, mask, width);
+ _mm_empty ();
}
-static FASTCALL void
-sse2CombineOverC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
+static void
+sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
{
- coreCombineOverCsse2 (dst, src, mask, width);
- _mm_empty();
+ core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
+ _mm_empty ();
}
-static FASTCALL void
-sse2CombineOverReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
+static void
+sse2_combine_in_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
{
- coreCombineOverReverseCsse2 (dst, src, mask, width);
- _mm_empty();
+ core_combine_in_ca_sse2 (dst, src, mask, width);
+ _mm_empty ();
}
-static FASTCALL void
-sse2CombineInC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
+static void
+sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
{
- coreCombineInCsse2 (dst, src, mask, width);
- _mm_empty();
+ core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
+ _mm_empty ();
}
-static FASTCALL void
-sse2CombineInReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
+static void
+sse2_combine_out_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
{
- coreCombineInReverseCsse2 (dst, src, mask, width);
- _mm_empty();
+ core_combine_out_ca_sse2 (dst, src, mask, width);
+ _mm_empty ();
}
-static FASTCALL void
-sse2CombineOutC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
+static void
+sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
{
- coreCombineOutCsse2 (dst, src, mask, width);
- _mm_empty();
+ core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
+ _mm_empty ();
}
-static FASTCALL void
-sse2CombineOutReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
+static void
+sse2_combine_atop_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
{
- coreCombineOutReverseCsse2 (dst, src, mask, width);
- _mm_empty();
+ core_combine_atop_ca_sse2 (dst, src, mask, width);
+ _mm_empty ();
}
-static FASTCALL void
-sse2CombineAtopC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
+static void
+sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
{
- coreCombineAtopCsse2 (dst, src, mask, width);
- _mm_empty();
+ core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
+ _mm_empty ();
}
-static FASTCALL void
-sse2CombineAtopReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
+static void
+sse2_combine_xor_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
{
- coreCombineReverseAtopCsse2 (dst, src, mask, width);
- _mm_empty();
+ core_combine_xor_ca_sse2 (dst, src, mask, width);
+ _mm_empty ();
}
-static FASTCALL void
-sse2CombineXorC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
+static void
+sse2_combine_add_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
{
- coreCombineXorCsse2 (dst, src, mask, width);
- _mm_empty();
+ core_combine_add_ca_sse2 (dst, src, mask, width);
+ _mm_empty ();
}
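The wrappers above switch from the FASTCALL fbcompose hooks to the new combiner signature: each receives the pixman_implementation_t, the operator, and a mask row that may be NULL on the unified path. The table registration formerly done by fbComposeSetupSSE2 (removed below) is assumed to move into the SSE2 implementation constructor elsewhere in the file; a hedged sketch of that registration follows, with a hypothetical helper name.

/* Hedged sketch only: sse2_register_combiners is a hypothetical name,
 * and the combine_32 / combine_32_ca members are the
 * pixman_implementation_t combiner tables assumed to replace the
 * global pixman_composeFunctions arrays used by fbComposeSetupSSE2. */
static void
sse2_register_combiners (pixman_implementation_t *imp)
{
    imp->combine_32[PIXMAN_OP_OVER]     = sse2_combine_over_u;
    imp->combine_32[PIXMAN_OP_ADD]      = sse2_combine_add_u;
    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC]   = sse2_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER]  = sse2_combine_over_ca;
    /* ... one entry per operator, mirroring the old combineU/combineC lists */
}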
-static FASTCALL void
-sse2CombineAddC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
-{
- coreCombineAddCsse2 (dst, src, mask, width);
- _mm_empty();
-}
+/* -------------------------------------------------------------------
+ * composite_over_n_8888
+ */
-void
-fbComposeSetupSSE2(void)
+static void
+sse2_composite_over_n_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
{
- static pixman_bool_t initialized = FALSE;
-
- if (initialized)
- return;
-
- /* check if we have SSE2 support and initialize accordingly */
- if (pixman_have_sse2())
- {
- /* SSE2 constants */
- Mask565r = createMask_2x32_128 (0x00f80000, 0x00f80000);
- Mask565g1 = createMask_2x32_128 (0x00070000, 0x00070000);
- Mask565g2 = createMask_2x32_128 (0x000000e0, 0x000000e0);
- Mask565b = createMask_2x32_128 (0x0000001f, 0x0000001f);
- MaskRed = createMask_2x32_128 (0x00f80000, 0x00f80000);
- MaskGreen = createMask_2x32_128 (0x0000fc00, 0x0000fc00);
- MaskBlue = createMask_2x32_128 (0x000000f8, 0x000000f8);
- Mask565FixRB = createMask_2x32_128 (0x00e000e0, 0x00e000e0);
- Mask565FixG = createMask_2x32_128 (0x0000c000, 0x0000c000);
- Mask0080 = createMask_16_128 (0x0080);
- Mask00ff = createMask_16_128 (0x00ff);
- Mask0101 = createMask_16_128 (0x0101);
- Maskffff = createMask_16_128 (0xffff);
- Maskff000000 = createMask_2x32_128 (0xff000000, 0xff000000);
- MaskAlpha = createMask_2x32_128 (0x00ff0000, 0x00000000);
-
- /* MMX constants */
- xMask565rgb = createMask_2x32_64 (0x000001f0, 0x003f001f);
- xMask565Unpack = createMask_2x32_64 (0x00000084, 0x04100840);
-
- xMask0080 = createMask_16_64 (0x0080);
- xMask00ff = createMask_16_64 (0x00ff);
- xMask0101 = createMask_16_64 (0x0101);
- xMaskAlpha = createMask_2x32_64 (0x00ff0000, 0x00000000);
-
- /* SSE code patch for fbcompose.c */
- pixman_composeFunctions.combineU[PIXMAN_OP_OVER] = sse2CombineOverU;
- pixman_composeFunctions.combineU[PIXMAN_OP_OVER_REVERSE] = sse2CombineOverReverseU;
- pixman_composeFunctions.combineU[PIXMAN_OP_IN] = sse2CombineInU;
- pixman_composeFunctions.combineU[PIXMAN_OP_IN_REVERSE] = sse2CombineInReverseU;
- pixman_composeFunctions.combineU[PIXMAN_OP_OUT] = sse2CombineOutU;
-
- pixman_composeFunctions.combineU[PIXMAN_OP_OUT_REVERSE] = sse2CombineOutReverseU;
- pixman_composeFunctions.combineU[PIXMAN_OP_ATOP] = sse2CombineAtopU;
- pixman_composeFunctions.combineU[PIXMAN_OP_ATOP_REVERSE] = sse2CombineAtopReverseU;
- pixman_composeFunctions.combineU[PIXMAN_OP_XOR] = sse2CombineXorU;
- pixman_composeFunctions.combineU[PIXMAN_OP_ADD] = sse2CombineAddU;
-
- pixman_composeFunctions.combineU[PIXMAN_OP_SATURATE] = sse2CombineSaturateU;
-
- pixman_composeFunctions.combineC[PIXMAN_OP_SRC] = sse2CombineSrcC;
- pixman_composeFunctions.combineC[PIXMAN_OP_OVER] = sse2CombineOverC;
- pixman_composeFunctions.combineC[PIXMAN_OP_OVER_REVERSE] = sse2CombineOverReverseC;
- pixman_composeFunctions.combineC[PIXMAN_OP_IN] = sse2CombineInC;
- pixman_composeFunctions.combineC[PIXMAN_OP_IN_REVERSE] = sse2CombineInReverseC;
- pixman_composeFunctions.combineC[PIXMAN_OP_OUT] = sse2CombineOutC;
- pixman_composeFunctions.combineC[PIXMAN_OP_OUT_REVERSE] = sse2CombineOutReverseC;
- pixman_composeFunctions.combineC[PIXMAN_OP_ATOP] = sse2CombineAtopC;
- pixman_composeFunctions.combineC[PIXMAN_OP_ATOP_REVERSE] = sse2CombineAtopReverseC;
- pixman_composeFunctions.combineC[PIXMAN_OP_XOR] = sse2CombineXorC;
- pixman_composeFunctions.combineC[PIXMAN_OP_ADD] = sse2CombineAddC;
-
- pixman_composeFunctions.combineMaskU = sse2CombineMaskU;
- }
-
- initialized = TRUE;
-
- _mm_empty();
-}
+ uint32_t src;
+ uint32_t *dst_line, *dst, d;
+ uint16_t w;
+ int dst_stride;
+ __m128i xmm_src, xmm_alpha;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-/* -------------------------------------------------------------------------------------------------
- * fbCompositeSolid_nx8888
- */
-
-void
-fbCompositeSolid_nx8888sse2 (pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int16_t xSrc,
- int16_t ySrc,
- int16_t xMask,
- int16_t yMask,
- int16_t xDst,
- int16_t yDst,
- uint16_t width,
- uint16_t height)
-{
- uint32_t src;
- uint32_t *dstLine, *dst, d;
- uint16_t w;
- int dstStride;
- __m128i xmmSrc, xmmAlpha;
- __m128i xmmDst, xmmDstLo, xmmDstHi;
-
- fbComposeGetSolid(pSrc, src, pDst->bits.format);
-
- if (src >> 24 == 0)
+ if (src == 0)
return;
- fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- xmmSrc = expandPixel_32_1x128 (src);
- xmmAlpha = expandAlpha_1x128 (xmmSrc);
+ xmm_src = expand_pixel_32_1x128 (src);
+ xmm_alpha = expand_alpha_1x128 (xmm_src);
while (height--)
{
- dst = dstLine;
+ dst = dst_line;
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)dst);
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)dst);
- dstLine += dstStride;
- w = width;
+ dst_line += dst_stride;
+ w = width;
- while (w && (unsigned long)dst & 15)
- {
- d = *dst;
- *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
- _mm_movepi64_pi64 (xmmAlpha),
- unpack_32_1x64 (d)));
- w--;
- }
+ while (w && (unsigned long)dst & 15)
+ {
+ d = *dst;
+ *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
+ _mm_movepi64_pi64 (xmm_alpha),
+ unpack_32_1x64 (d)));
+ w--;
+ }
- cachePrefetch ((__m128i*)dst);
+ cache_prefetch ((__m128i*)dst);
- while (w >= 4)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)dst);
+ while (w >= 4)
+ {
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)dst);
- xmmDst = load128Aligned ((__m128i*)dst);
+ xmm_dst = load_128_aligned ((__m128i*)dst);
- unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
- over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDstLo, &xmmDstHi);
+ over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_dst_lo, &xmm_dst_hi);
- /* rebuid the 4 pixel data and save*/
- save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
+	    /* rebuild the 4 pixel data and save */
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- w -= 4;
- dst += 4;
- }
+ w -= 4;
+ dst += 4;
+ }
- while (w)
- {
- d = *dst;
- *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
- _mm_movepi64_pi64 (xmmAlpha),
- unpack_32_1x64 (d)));
- w--;
- }
+ while (w)
+ {
+ d = *dst;
+ *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
+ _mm_movepi64_pi64 (xmm_alpha),
+ unpack_32_1x64 (d)));
+ w--;
+ }
}
- _mm_empty();
-}
-
-/* -------------------------------------------------------------------------------------------------
- * fbCompositeSolid_nx0565
- */
-void
-fbCompositeSolid_nx0565sse2 (pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int16_t xSrc,
- int16_t ySrc,
- int16_t xMask,
- int16_t yMask,
- int16_t xDst,
- int16_t yDst,
- uint16_t width,
- uint16_t height)
-{
- uint32_t src;
- uint16_t *dstLine, *dst, d;
- uint16_t w;
- int dstStride;
- __m128i xmmSrc, xmmAlpha;
- __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;
-
- fbComposeGetSolid(pSrc, src, pDst->bits.format);
-
- if (src >> 24 == 0)
- return;
-
- fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
-
- xmmSrc = expandPixel_32_1x128 (src);
- xmmAlpha = expandAlpha_1x128 (xmmSrc);
-
- while (height--)
- {
- dst = dstLine;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)dst);
-
- dstLine += dstStride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- d = *dst;
-
- *dst++ = pack565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
- _mm_movepi64_pi64 (xmmAlpha),
- expand565_16_1x64 (d))));
- w--;
- }
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)dst);
-
- while (w >= 8)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)dst);
-
- xmmDst = load128Aligned ((__m128i*)dst);
-
- unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
-
- over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDst0, &xmmDst1);
- over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDst2, &xmmDst3);
-
- xmmDst = pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
- save128Aligned ((__m128i*)dst, xmmDst);
-
- dst += 8;
- w -= 8;
- }
-
- while (w--)
- {
- d = *dst;
- *dst++ = pack565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
- _mm_movepi64_pi64 (xmmAlpha),
- expand565_16_1x64 (d))));
- }
- }
-
- _mm_empty();
+ _mm_empty ();
}
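
For reference, what over_1x64 ()/over_2x128 () compute in this fast path is ordinary Porter-Duff OVER with a premultiplied source: dst = src + dst * (255 - alpha(src)) / 255, per channel. Below is a minimal scalar sketch of the same arithmetic, assuming the usual 0x80 / "x + (x >> 8)" rounding trick and a saturating add; over_scalar is a made-up reference helper, not pixman API.

#include <stdint.h>

/* Hypothetical scalar reference for the OVER step that the SSE2 code
 * performs on unpacked 16-bit lanes: blend one a8r8g8b8 source pixel
 * over one a8r8g8b8 destination pixel.
 */
static uint32_t
over_scalar (uint32_t src, uint32_t dst)
{
    uint32_t ia = 255 - (src >> 24);   /* inverse source alpha */
    uint32_t result = 0;
    int shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        uint32_t s = (src >> shift) & 0xff;
        uint32_t d = (dst >> shift) & 0xff;
        uint32_t t = d * ia + 0x80;

        t = (t + (t >> 8)) >> 8;       /* divide by 255 with rounding */
        t += s;
        if (t > 0xff)
            t = 0xff;                  /* saturate, like the SIMD add */

        result |= t << shift;
    }

    return result;
}

The solid-source path above simply reuses this with a constant src (and its alpha pre-expanded), which is why xmm_src and xmm_alpha are computed once outside the loops.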
-/* -------------------------------------------------------------------------------------------------
- * fbCompositeSolidMask_nx8888x8888C
+/* ---------------------------------------------------------------------
+ * composite_over_n_0565
*/
+static void
+sse2_composite_over_n_0565 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src;
+ uint16_t *dst_line, *dst, d;
+ uint16_t w;
+ int dst_stride;
+ __m128i xmm_src, xmm_alpha;
+ __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
-void
-fbCompositeSolidMask_nx8888x8888Csse2 (pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int16_t xSrc,
- int16_t ySrc,
- int16_t xMask,
- int16_t yMask,
- int16_t xDst,
- int16_t yDst,
- uint16_t width,
- uint16_t height)
-{
- uint32_t src, srca;
- uint32_t *dstLine, d;
- uint32_t *maskLine, m;
- uint32_t packCmp;
- int dstStride, maskStride;
-
- __m128i xmmSrc, xmmAlpha;
- __m128i xmmDst, xmmDstLo, xmmDstHi;
- __m128i xmmMask, xmmMaskLo, xmmMaskHi;
-
- __m64 mmxSrc, mmxAlpha, mmxMask, mmxDst;
-
- fbComposeGetSolid(pSrc, src, pDst->bits.format);
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
- srca = src >> 24;
- if (srca == 0)
+ if (src == 0)
return;
- fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
- fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
- xmmSrc = _mm_unpacklo_epi8 (createMask_2x32_128 (src, src), _mm_setzero_si128 ());
- xmmAlpha = expandAlpha_1x128 (xmmSrc);
- mmxSrc = _mm_movepi64_pi64 (xmmSrc);
- mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);
+ xmm_src = expand_pixel_32_1x128 (src);
+ xmm_alpha = expand_alpha_1x128 (xmm_src);
while (height--)
{
- int w = width;
- uint32_t *pm = (uint32_t *)maskLine;
- uint32_t *pd = (uint32_t *)dstLine;
-
- dstLine += dstStride;
- maskLine += maskStride;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
- while (w && (unsigned long)pd & 15)
- {
- m = *pm++;
-
- if (m)
- {
- d = *pd;
- mmxMask = unpack_32_1x64 (m);
- mmxDst = unpack_32_1x64 (d);
-
- *pd = pack_1x64_32 (inOver_1x64 (&mmxSrc,
- &mmxAlpha,
- &mmxMask,
- &mmxDst));
- }
-
- pd++;
- w--;
- }
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
- while (w >= 4)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
-
- xmmMask = load128Unaligned ((__m128i*)pm);
-
- packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128()));
-
- /* if all bits in mask are zero, packCmp are equal to 0xffff */
- if (packCmp != 0xffff)
- {
- xmmDst = load128Aligned ((__m128i*)pd);
-
- unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
- unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
-
- inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
-
- save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
- }
-
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- m = *pm++;
-
- if (m)
- {
- d = *pd;
- mmxMask = unpack_32_1x64 (m);
- mmxDst = unpack_32_1x64 (d);
-
- *pd = pack_1x64_32 (inOver_1x64 (&mmxSrc,
- &mmxAlpha,
- &mmxMask,
- &mmxDst));
- }
-
- pd++;
- w--;
- }
+ dst = dst_line;
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)dst);
+
+ dst_line += dst_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ d = *dst;
+
+ *dst++ = pack_565_32_16 (
+ pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
+ _mm_movepi64_pi64 (xmm_alpha),
+ expand565_16_1x64 (d))));
+ w--;
+ }
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)dst);
+
+ while (w >= 8)
+ {
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)dst);
+
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_565_128_4x128 (xmm_dst,
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+
+ over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_dst0, &xmm_dst1);
+ over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_dst2, &xmm_dst3);
+
+ xmm_dst = pack_565_4x128_128 (
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+
+ save_128_aligned ((__m128i*)dst, xmm_dst);
+
+ dst += 8;
+ w -= 8;
+ }
+
+ while (w--)
+ {
+ d = *dst;
+ *dst++ = pack_565_32_16 (
+ pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
+ _mm_movepi64_pi64 (xmm_alpha),
+ expand565_16_1x64 (d))));
+ }
}
- _mm_empty();
+ _mm_empty ();
}
-
-/* -------------------------------------------------------------------------------------------------
- * fbCompositeSrc_8888x8x8888
+/* ---------------------------------------------------------------------------
+ * composite_over_n_8888_8888_ca
*/
-void
-fbCompositeSrc_8888x8x8888sse2 (pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int16_t xSrc,
- int16_t ySrc,
- int16_t xMask,
- int16_t yMask,
- int16_t xDst,
- int16_t yDst,
- uint16_t width,
- uint16_t height)
-{
- uint32_t *dstLine, *dst;
- uint32_t *srcLine, *src;
- uint32_t mask;
- uint16_t w;
- int dstStride, srcStride;
-
- __m128i xmmMask;
- __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
- __m128i xmmDst, xmmDstLo, xmmDstHi;
- __m128i xmmAlphaLo, xmmAlphaHi;
-
- fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
- fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
- fbComposeGetSolid (pMask, mask, pDst->bits.format);
-
- xmmMask = createMask_16_128 (mask >> 24);
-
- while (height--)
- {
- dst = dstLine;
- dstLine += dstStride;
- src = srcLine;
- srcLine += srcStride;
- w = width;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)dst);
- cachePrefetch ((__m128i*)src);
-
- while (w && (unsigned long)dst & 15)
- {
- uint32_t s = *src++;
- uint32_t d = *dst;
-
- __m64 ms = unpack_32_1x64 (s);
- __m64 alpha = expandAlpha_1x64 (ms);
- __m64 dest = _mm_movepi64_pi64 (xmmMask);
- __m64 alphaDst = unpack_32_1x64 (d);
-
- *dst++ = pack_1x64_32 (inOver_1x64 (&ms,
- &alpha,
- &dest,
- &alphaDst));
-
- w--;
- }
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)dst);
- cachePrefetch ((__m128i*)src);
-
- while (w >= 4)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)dst);
- cachePrefetchNext ((__m128i*)src);
-
- xmmSrc = load128Unaligned ((__m128i*)src);
- xmmDst = load128Aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
+static void
+sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src;
+ uint32_t *dst_line, d;
+ uint32_t *mask_line, m;
+ uint32_t pack_cmp;
+ int dst_stride, mask_stride;
- inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMask, &xmmMask, &xmmDstLo, &xmmDstHi);
+ __m128i xmm_src, xmm_alpha;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
+ __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
- dst += 4;
- src += 4;
- w -= 4;
- }
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
- while (w)
- {
- uint32_t s = *src++;
- uint32_t d = *dst;
+ if (src == 0)
+ return;
- __m64 ms = unpack_32_1x64 (s);
- __m64 alpha = expandAlpha_1x64 (ms);
- __m64 mask = _mm_movepi64_pi64 (xmmMask);
- __m64 dest = unpack_32_1x64 (d);
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
- *dst++ = pack_1x64_32 (inOver_1x64 (&ms,
- &alpha,
- &mask,
- &dest));
+ xmm_src = _mm_unpacklo_epi8 (
+ create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
+ xmm_alpha = expand_alpha_1x128 (xmm_src);
+ mmx_src = _mm_movepi64_pi64 (xmm_src);
+ mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
- w--;
- }
+ while (height--)
+ {
+ int w = width;
+ const uint32_t *pm = (uint32_t *)mask_line;
+ uint32_t *pd = (uint32_t *)dst_line;
+
+ dst_line += dst_stride;
+ mask_line += mask_stride;
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
+ while (w && (unsigned long)pd & 15)
+ {
+ m = *pm++;
+
+ if (m)
+ {
+ d = *pd;
+ mmx_mask = unpack_32_1x64 (m);
+ mmx_dest = unpack_32_1x64 (d);
+
+ *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
+ &mmx_alpha,
+ &mmx_mask,
+ &mmx_dest));
+ }
+
+ pd++;
+ w--;
+ }
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)pd);
+ cache_prefetch ((__m128i*)pm);
+
+ while (w >= 4)
+ {
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)pd);
+ cache_prefetch_next ((__m128i*)pm);
+
+ xmm_mask = load_128_unaligned ((__m128i*)pm);
+
+ pack_cmp =
+ _mm_movemask_epi8 (
+ _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+	    /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
+ if (pack_cmp != 0xffff)
+ {
+ xmm_dst = load_128_aligned ((__m128i*)pd);
+
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ in_over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ m = *pm++;
+
+ if (m)
+ {
+ d = *pd;
+ mmx_mask = unpack_32_1x64 (m);
+ mmx_dest = unpack_32_1x64 (d);
+
+ *pd = pack_1x64_32 (
+ in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
+ }
+
+ pd++;
+ w--;
+ }
}
- _mm_empty();
+ _mm_empty ();
}
-/* -------------------------------------------------------------------------------------------------
- * fbCompositeSrc_x888xnx8888
+/* --------------------------------------------------------------------
+ * composite_over_8888_n_8888
*/
-void
-fbCompositeSrc_x888xnx8888sse2 (pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int16_t xSrc,
- int16_t ySrc,
- int16_t xMask,
- int16_t yMask,
- int16_t xDst,
- int16_t yDst,
- uint16_t width,
- uint16_t height)
-{
- uint32_t *dstLine, *dst;
- uint32_t *srcLine, *src;
- uint32_t mask;
- int dstStride, srcStride;
- uint16_t w;
-
- __m128i xmmMask, xmmAlpha;
- __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
- __m128i xmmDst, xmmDstLo, xmmDstHi;
-
- fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
- fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
- fbComposeGetSolid (pMask, mask, pDst->bits.format);
-
- xmmMask = createMask_16_128 (mask >> 24);
- xmmAlpha = Mask00ff;
- while (height--)
- {
- dst = dstLine;
- dstLine += dstStride;
- src = srcLine;
- srcLine += srcStride;
- w = width;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)dst);
- cachePrefetch ((__m128i*)src);
-
- while (w && (unsigned long)dst & 15)
- {
- uint32_t s = (*src++) | 0xff000000;
- uint32_t d = *dst;
-
- __m64 src = unpack_32_1x64 (s);
- __m64 alpha = _mm_movepi64_pi64 (xmmAlpha);
- __m64 mask = _mm_movepi64_pi64 (xmmMask);
- __m64 dest = unpack_32_1x64 (d);
-
- *dst++ = pack_1x64_32 (inOver_1x64 (&src,
- &alpha,
- &mask,
- &dest));
-
- w--;
- }
+static void
+sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ uint32_t mask;
+ uint16_t w;
+ int dst_stride, src_stride;
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)dst);
- cachePrefetch ((__m128i*)src);
+ __m128i xmm_mask;
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
- while (w >= 4)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)dst);
- cachePrefetchNext ((__m128i*)src);
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
- xmmSrc = _mm_or_si128 (load128Unaligned ((__m128i*)src), Maskff000000);
- xmmDst = load128Aligned ((__m128i*)dst);
+ mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
- unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
+ xmm_mask = create_mask_16_128 (mask >> 24);
- inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlpha, &xmmAlpha, &xmmMask, &xmmMask, &xmmDstLo, &xmmDstHi);
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)dst);
+ cache_prefetch ((__m128i*)src);
+
+ while (w && (unsigned long)dst & 15)
+ {
+ uint32_t s = *src++;
+ uint32_t d = *dst;
+
+ __m64 ms = unpack_32_1x64 (s);
+ __m64 alpha = expand_alpha_1x64 (ms);
+ __m64 dest = _mm_movepi64_pi64 (xmm_mask);
+ __m64 alpha_dst = unpack_32_1x64 (d);
+
+ *dst++ = pack_1x64_32 (
+ in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
+
+ w--;
+ }
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)dst);
+ cache_prefetch ((__m128i*)src);
+
+ while (w >= 4)
+ {
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)dst);
+ cache_prefetch_next ((__m128i*)src);
+
+ xmm_src = load_128_unaligned ((__m128i*)src);
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_mask, &xmm_mask,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ dst += 4;
+ src += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ uint32_t s = *src++;
+ uint32_t d = *dst;
+
+ __m64 ms = unpack_32_1x64 (s);
+ __m64 alpha = expand_alpha_1x64 (ms);
+ __m64 mask = _mm_movepi64_pi64 (xmm_mask);
+ __m64 dest = unpack_32_1x64 (d);
+
+ *dst++ = pack_1x64_32 (
+ in_over_1x64 (&ms, &alpha, &mask, &dest));
+
+ w--;
+ }
+ }
- save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
+ _mm_empty ();
+}
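
All of these fast paths share the same loop skeleton: a scalar head loop runs until the destination pointer is 16-byte aligned, an SSE2 body then processes four (or, for 16 bpp destinations, eight) pixels per iteration with aligned stores, and a scalar tail handles whatever is left. A stripped-down sketch of that structure, using a plain pixel copy as the per-pixel work; copy_span_sse2 is a hypothetical helper, not pixman code.

#include <emmintrin.h>
#include <stdint.h>

static void
copy_span_sse2 (uint32_t *dst, const uint32_t *src, int w)
{
    while (w && ((uintptr_t) dst & 15))   /* head: align destination */
    {
        *dst++ = *src++;
        w--;
    }

    while (w >= 4)                        /* body: 4 pixels, aligned store */
    {
        _mm_store_si128 ((__m128i *) dst,
                         _mm_loadu_si128 ((const __m128i *) src));
        dst += 4;
        src += 4;
        w -= 4;
    }

    while (w)                             /* tail: leftover pixels */
    {
        *dst++ = *src++;
        w--;
    }
}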
- dst += 4;
- src += 4;
- w -= 4;
+/* ---------------------------------------------------------------------
+ * composite_over_x888_n_8888
+ */
+static void
+sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ uint32_t mask;
+ int dst_stride, src_stride;
+ uint16_t w;
- }
+ __m128i xmm_mask, xmm_alpha;
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- while (w)
- {
- uint32_t s = (*src++) | 0xff000000;
- uint32_t d = *dst;
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
- __m64 src = unpack_32_1x64 (s);
- __m64 alpha = _mm_movepi64_pi64 (xmmAlpha);
- __m64 mask = _mm_movepi64_pi64 (xmmMask);
- __m64 dest = unpack_32_1x64 (d);
+ mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
- *dst++ = pack_1x64_32 (inOver_1x64 (&src,
- &alpha,
- &mask,
- &dest));
+ xmm_mask = create_mask_16_128 (mask >> 24);
+ xmm_alpha = mask_00ff;
- w--;
- }
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)dst);
+ cache_prefetch ((__m128i*)src);
+
+ while (w && (unsigned long)dst & 15)
+ {
+ uint32_t s = (*src++) | 0xff000000;
+ uint32_t d = *dst;
+
+ __m64 src = unpack_32_1x64 (s);
+ __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
+ __m64 mask = _mm_movepi64_pi64 (xmm_mask);
+ __m64 dest = unpack_32_1x64 (d);
+
+ *dst++ = pack_1x64_32 (
+ in_over_1x64 (&src, &alpha, &mask, &dest));
+
+ w--;
+ }
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)dst);
+ cache_prefetch ((__m128i*)src);
+
+ while (w >= 4)
+ {
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)dst);
+ cache_prefetch_next ((__m128i*)src);
+
+ xmm_src = _mm_or_si128 (
+ load_128_unaligned ((__m128i*)src), mask_ff000000);
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask, &xmm_mask,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ dst += 4;
+ src += 4;
+ w -= 4;
+
+ }
+
+ while (w)
+ {
+ uint32_t s = (*src++) | 0xff000000;
+ uint32_t d = *dst;
+
+ __m64 src = unpack_32_1x64 (s);
+ __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
+ __m64 mask = _mm_movepi64_pi64 (xmm_mask);
+ __m64 dest = unpack_32_1x64 (d);
+
+ *dst++ = pack_1x64_32 (
+ in_over_1x64 (&src, &alpha, &mask, &dest));
+
+ w--;
+ }
}
- _mm_empty();
+ _mm_empty ();
}
-/* -------------------------------------------------------------------------------------------------
- * fbCompositeSrc_8888x8888
+/* --------------------------------------------------------------------
+ * composite_over_8888_8888
*/
-void
-fbCompositeSrc_8888x8888sse2 (pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int16_t xSrc,
- int16_t ySrc,
- int16_t xMask,
- int16_t yMask,
- int16_t xDst,
- int16_t yDst,
- uint16_t width,
- uint16_t height)
-{
- int dstStride, srcStride;
- uint32_t *dstLine, *dst;
- uint32_t *srcLine, *src;
-
- fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
- fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
-
- dst = dstLine;
- src = srcLine;
+static void
+sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ int dst_stride, src_stride;
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ dst = dst_line;
+ src = src_line;
while (height--)
{
- coreCombineOverUsse2 (dst, src, width);
+ core_combine_over_u_sse2 (dst, src, NULL, width);
- dst += dstStride;
- src += srcStride;
+ dst += dst_stride;
+ src += src_stride;
}
- _mm_empty();
+ _mm_empty ();
}
-/* -------------------------------------------------------------------------------------------------
- * fbCompositeSrc_8888x0565
+/* ------------------------------------------------------------------
+ * composite_over_8888_0565
*/
-static inline uint16_t
-fbCompositeSrc_8888x0565pixel (uint32_t src, uint16_t dst)
+static force_inline uint16_t
+composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
{
- __m64 ms;
+ __m64 ms;
ms = unpack_32_1x64 (src);
- return pack565_32_16( pack_1x64_32 (over_1x64 (ms,
- expandAlpha_1x64 (ms),
- expand565_16_1x64 (dst))));
+ return pack_565_32_16 (
+ pack_1x64_32 (
+ over_1x64 (
+ ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
}
-void
-fbCompositeSrc_8888x0565sse2 (pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int16_t xSrc,
- int16_t ySrc,
- int16_t xMask,
- int16_t yMask,
- int16_t xDst,
- int16_t yDst,
- uint16_t width,
- uint16_t height)
-{
- uint16_t *dstLine, *dst, d;
- uint32_t *srcLine, *src, s;
- int dstStride, srcStride;
- uint16_t w;
-
- __m128i xmmAlphaLo, xmmAlphaHi;
- __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
- __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;
-
- fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
- fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
+static void
+sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint16_t *dst_line, *dst, d;
+ uint32_t *src_line, *src, s;
+ int dst_stride, src_stride;
+ uint16_t w;
+
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
#if 0
/* FIXME
@@ -2982,241 +3563,262 @@ fbCompositeSrc_8888x0565sse2 (pixman_op_t op,
* I copy the code from MMX one and keep the fixme.
* If it's a problem there, probably is a problem here.
*/
- assert (pSrc->pDrawable == pMask->pDrawable);
+ assert (src_image->drawable == mask_image->drawable);
#endif
while (height--)
{
- dst = dstLine;
- src = srcLine;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)src);
- cachePrefetch ((__m128i*)dst);
-
- dstLine += dstStride;
- srcLine += srcStride;
- w = width;
-
- /* Align dst on a 16-byte boundary */
- while (w &&
- ((unsigned long)dst & 15))
- {
- s = *src++;
- d = *dst;
-
- *dst++ = fbCompositeSrc_8888x0565pixel (s, d);
- w--;
- }
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)src);
- cachePrefetch ((__m128i*)dst);
-
- /* It's a 8 pixel loop */
- while (w >= 8)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)src);
- cachePrefetchNext ((__m128i*)dst);
-
- /* I'm loading unaligned because I'm not sure about the address alignment. */
- xmmSrc = load128Unaligned ((__m128i*) src);
- xmmDst = load128Aligned ((__m128i*) dst);
-
- /* Unpacking */
- unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
- unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
-
- /* I'm loading next 4 pixels from memory before to optimze the memory read. */
- xmmSrc = load128Unaligned ((__m128i*) (src+4));
-
- over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDst0, &xmmDst1);
-
- /* Unpacking */
- unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
-
- over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDst2, &xmmDst3);
-
- save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));
-
- w -= 8;
- dst += 8;
- src += 8;
- }
-
- while (w--)
- {
- s = *src++;
- d = *dst;
-
- *dst++ = fbCompositeSrc_8888x0565pixel (s, d);
- }
+ dst = dst_line;
+ src = src_line;
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)src);
+ cache_prefetch ((__m128i*)dst);
+
+ dst_line += dst_stride;
+ src_line += src_stride;
+ w = width;
+
+ /* Align dst on a 16-byte boundary */
+ while (w &&
+ ((unsigned long)dst & 15))
+ {
+ s = *src++;
+ d = *dst;
+
+ *dst++ = composite_over_8888_0565pixel (s, d);
+ w--;
+ }
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)src);
+ cache_prefetch ((__m128i*)dst);
+
+	/* It's an 8-pixel loop */
+ while (w >= 8)
+ {
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)src);
+ cache_prefetch_next ((__m128i*)dst);
+
+ /* I'm loading unaligned because I'm not sure
+ * about the address alignment.
+ */
+ xmm_src = load_128_unaligned ((__m128i*) src);
+ xmm_dst = load_128_aligned ((__m128i*) dst);
+
+ /* Unpacking */
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_565_128_4x128 (xmm_dst,
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+	    /* I'm loading the next 4 pixels from memory
+	     * ahead of time to optimize the memory read.
+	     */
+ xmm_src = load_128_unaligned ((__m128i*) (src + 4));
+
+ over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst0, &xmm_dst1);
+
+ /* Unpacking */
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst2, &xmm_dst3);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_565_4x128_128 (
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+ w -= 8;
+ dst += 8;
+ src += 8;
+ }
+
+ while (w--)
+ {
+ s = *src++;
+ d = *dst;
+
+ *dst++ = composite_over_8888_0565pixel (s, d);
+ }
}
- _mm_empty();
+ _mm_empty ();
}
-/* -------------------------------------------------------------------------------------------------
- * fbCompositeSolidMask_nx8x8888
+/* -----------------------------------------------------------------
+ * composite_over_n_8_8888
*/
-void
-fbCompositeSolidMask_nx8x8888sse2 (pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int16_t xSrc,
- int16_t ySrc,
- int16_t xMask,
- int16_t yMask,
- int16_t xDst,
- int16_t yDst,
- uint16_t width,
- uint16_t height)
-{
- uint32_t src, srca;
- uint32_t *dstLine, *dst;
- uint8_t *maskLine, *mask;
- int dstStride, maskStride;
- uint16_t w;
+static void
+sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca;
+ uint32_t *dst_line, *dst;
+ uint8_t *mask_line, *mask;
+ int dst_stride, mask_stride;
+ uint16_t w;
uint32_t m, d;
- __m128i xmmSrc, xmmAlpha, xmmDef;
- __m128i xmmDst, xmmDstLo, xmmDstHi;
- __m128i xmmMask, xmmMaskLo, xmmMaskHi;
+ __m128i xmm_src, xmm_alpha, xmm_def;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- __m64 mmxSrc, mmxAlpha, mmxMask, mmxDest;
+ __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
- fbComposeGetSolid(pSrc, src, pDst->bits.format);
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
srca = src >> 24;
- if (srca == 0)
+ if (src == 0)
return;
- fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
- fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
- xmmDef = createMask_2x32_128 (src, src);
- xmmSrc = expandPixel_32_1x128 (src);
- xmmAlpha = expandAlpha_1x128 (xmmSrc);
- mmxSrc = _mm_movepi64_pi64 (xmmSrc);
- mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);
+ xmm_def = create_mask_2x32_128 (src, src);
+ xmm_src = expand_pixel_32_1x128 (src);
+ xmm_alpha = expand_alpha_1x128 (xmm_src);
+ mmx_src = _mm_movepi64_pi64 (xmm_src);
+ mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
while (height--)
{
- dst = dstLine;
- dstLine += dstStride;
- mask = maskLine;
- maskLine += maskStride;
- w = width;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)mask);
- cachePrefetch ((__m128i*)dst);
-
- while (w && (unsigned long)dst & 15)
- {
- uint8_t m = *mask++;
-
- if (m)
- {
- d = *dst;
- mmxMask = expandPixel_8_1x64 (m);
- mmxDest = unpack_32_1x64 (d);
-
- *dst = pack_1x64_32 (inOver_1x64 (&mmxSrc,
- &mmxAlpha,
- &mmxMask,
- &mmxDest));
- }
-
- w--;
- dst++;
- }
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)mask);
- cachePrefetch ((__m128i*)dst);
-
- while (w >= 4)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)mask);
- cachePrefetchNext ((__m128i*)dst);
-
- m = *((uint32_t*)mask);
-
- if (srca == 0xff && m == 0xffffffff)
- {
- save128Aligned ((__m128i*)dst, xmmDef);
- }
- else if (m)
- {
- xmmDst = load128Aligned ((__m128i*) dst);
- xmmMask = unpack_32_1x128 (m);
- xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());
-
- /* Unpacking */
- unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
- unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
-
- expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
- inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
-
- save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
- }
-
- w -= 4;
- dst += 4;
- mask += 4;
- }
-
- while (w)
- {
- uint8_t m = *mask++;
-
- if (m)
- {
- d = *dst;
- mmxMask = expandPixel_8_1x64 (m);
- mmxDest = unpack_32_1x64 (d);
-
- *dst = pack_1x64_32 (inOver_1x64 (&mmxSrc,
- &mmxAlpha,
- &mmxMask,
- &mmxDest));
- }
-
- w--;
- dst++;
- }
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)mask);
+ cache_prefetch ((__m128i*)dst);
+
+ while (w && (unsigned long)dst & 15)
+ {
+ uint8_t m = *mask++;
+
+ if (m)
+ {
+ d = *dst;
+ mmx_mask = expand_pixel_8_1x64 (m);
+ mmx_dest = unpack_32_1x64 (d);
+
+ *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
+ &mmx_alpha,
+ &mmx_mask,
+ &mmx_dest));
+ }
+
+ w--;
+ dst++;
+ }
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)mask);
+ cache_prefetch ((__m128i*)dst);
+
+ while (w >= 4)
+ {
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)mask);
+ cache_prefetch_next ((__m128i*)dst);
+
+ m = *((uint32_t*)mask);
+
+ if (srca == 0xff && m == 0xffffffff)
+ {
+ save_128_aligned ((__m128i*)dst, xmm_def);
+ }
+ else if (m)
+ {
+ xmm_dst = load_128_aligned ((__m128i*) dst);
+ xmm_mask = unpack_32_1x128 (m);
+ xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+ /* Unpacking */
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ in_over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ w -= 4;
+ dst += 4;
+ mask += 4;
+ }
+
+ while (w)
+ {
+ uint8_t m = *mask++;
+
+ if (m)
+ {
+ d = *dst;
+ mmx_mask = expand_pixel_8_1x64 (m);
+ mmx_dest = unpack_32_1x64 (d);
+
+ *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
+ &mmx_alpha,
+ &mmx_mask,
+ &mmx_dest));
+ }
+
+ w--;
+ dst++;
+ }
}
- _mm_empty();
+ _mm_empty ();
}
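
With an a8 mask, the 4-pixel loop above reads four mask bytes as a single uint32_t and distinguishes two cheap block types, fully opaque (solid aligned store of the pre-built colour) and fully transparent (skip), before falling back to the general unpack-and-in_over path. A sketch of just that classification; the enum and helper are invented for illustration.

#include <stdint.h>

typedef enum
{
    BLOCK_SOLID,     /* store xmm_def directly                 */
    BLOCK_SKIP,      /* destination untouched                  */
    BLOCK_GENERAL    /* unpack mask and run the in_over path   */
} block_kind_t;

static block_kind_t
classify_a8_block (uint32_t four_mask_bytes, uint32_t srca)
{
    if (srca == 0xff && four_mask_bytes == 0xffffffff)
        return BLOCK_SOLID;

    if (four_mask_bytes == 0)
        return BLOCK_SKIP;

    return BLOCK_GENERAL;
}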
-/* -------------------------------------------------------------------------------------------------
- * fbCompositeSolidMask_nx8x8888
+/* ----------------------------------------------------------------
+ * pixman_fill_sse2
*/
pixman_bool_t
-pixmanFillsse2 (uint32_t *bits,
- int stride,
- int bpp,
- int x,
- int y,
- int width,
- int height,
- uint32_t data)
+pixman_fill_sse2 (uint32_t *bits,
+ int stride,
+ int bpp,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t data)
{
- uint32_t byte_width;
- uint8_t *byte_line;
+ uint32_t byte_width;
+ uint8_t *byte_line;
- __m128i xmmDef;
+ __m128i xmm_def;
if (bpp == 16 && (data >> 16 != (data & 0xffff)))
return FALSE;
@@ -3226,430 +3828,459 @@ pixmanFillsse2 (uint32_t *bits,
if (bpp == 16)
{
- stride = stride * (int) sizeof (uint32_t) / 2;
- byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
- byte_width = 2 * width;
- stride *= 2;
+ stride = stride * (int) sizeof (uint32_t) / 2;
+ byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
+ byte_width = 2 * width;
+ stride *= 2;
}
else
{
- stride = stride * (int) sizeof (uint32_t) / 4;
- byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
- byte_width = 4 * width;
- stride *= 4;
+ stride = stride * (int) sizeof (uint32_t) / 4;
+ byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
+ byte_width = 4 * width;
+ stride *= 4;
}
- cachePrefetch ((__m128i*)byte_line);
- xmmDef = createMask_2x32_128 (data, data);
+ cache_prefetch ((__m128i*)byte_line);
+ xmm_def = create_mask_2x32_128 (data, data);
while (height--)
{
- int w;
- uint8_t *d = byte_line;
- byte_line += stride;
- w = byte_width;
-
-
- cachePrefetchNext ((__m128i*)d);
-
- while (w >= 2 && ((unsigned long)d & 3))
- {
- *(uint16_t *)d = data;
- w -= 2;
- d += 2;
- }
-
- while (w >= 4 && ((unsigned long)d & 15))
- {
- *(uint32_t *)d = data;
-
- w -= 4;
- d += 4;
- }
-
- cachePrefetchNext ((__m128i*)d);
-
- while (w >= 128)
- {
- cachePrefetch (((__m128i*)d) + 12);
-
- save128Aligned ((__m128i*)(d), xmmDef);
- save128Aligned ((__m128i*)(d+16), xmmDef);
- save128Aligned ((__m128i*)(d+32), xmmDef);
- save128Aligned ((__m128i*)(d+48), xmmDef);
- save128Aligned ((__m128i*)(d+64), xmmDef);
- save128Aligned ((__m128i*)(d+80), xmmDef);
- save128Aligned ((__m128i*)(d+96), xmmDef);
- save128Aligned ((__m128i*)(d+112), xmmDef);
-
- d += 128;
- w -= 128;
- }
-
- if (w >= 64)
- {
- cachePrefetch (((__m128i*)d) + 8);
-
- save128Aligned ((__m128i*)(d), xmmDef);
- save128Aligned ((__m128i*)(d+16), xmmDef);
- save128Aligned ((__m128i*)(d+32), xmmDef);
- save128Aligned ((__m128i*)(d+48), xmmDef);
-
- d += 64;
- w -= 64;
- }
-
- cachePrefetchNext ((__m128i*)d);
-
- if (w >= 32)
- {
- save128Aligned ((__m128i*)(d), xmmDef);
- save128Aligned ((__m128i*)(d+16), xmmDef);
-
- d += 32;
- w -= 32;
- }
-
- if (w >= 16)
- {
- save128Aligned ((__m128i*)(d), xmmDef);
-
- d += 16;
- w -= 16;
- }
-
- cachePrefetchNext ((__m128i*)d);
-
- while (w >= 4)
- {
- *(uint32_t *)d = data;
-
- w -= 4;
- d += 4;
- }
-
- if (w >= 2)
- {
- *(uint16_t *)d = data;
- w -= 2;
- d += 2;
- }
+ int w;
+ uint8_t *d = byte_line;
+ byte_line += stride;
+ w = byte_width;
+
+
+ cache_prefetch_next ((__m128i*)d);
+
+ while (w >= 2 && ((unsigned long)d & 3))
+ {
+ *(uint16_t *)d = data;
+ w -= 2;
+ d += 2;
+ }
+
+ while (w >= 4 && ((unsigned long)d & 15))
+ {
+ *(uint32_t *)d = data;
+
+ w -= 4;
+ d += 4;
+ }
+
+ cache_prefetch_next ((__m128i*)d);
+
+ while (w >= 128)
+ {
+ cache_prefetch (((__m128i*)d) + 12);
+
+ save_128_aligned ((__m128i*)(d), xmm_def);
+ save_128_aligned ((__m128i*)(d + 16), xmm_def);
+ save_128_aligned ((__m128i*)(d + 32), xmm_def);
+ save_128_aligned ((__m128i*)(d + 48), xmm_def);
+ save_128_aligned ((__m128i*)(d + 64), xmm_def);
+ save_128_aligned ((__m128i*)(d + 80), xmm_def);
+ save_128_aligned ((__m128i*)(d + 96), xmm_def);
+ save_128_aligned ((__m128i*)(d + 112), xmm_def);
+
+ d += 128;
+ w -= 128;
+ }
+
+ if (w >= 64)
+ {
+ cache_prefetch (((__m128i*)d) + 8);
+
+ save_128_aligned ((__m128i*)(d), xmm_def);
+ save_128_aligned ((__m128i*)(d + 16), xmm_def);
+ save_128_aligned ((__m128i*)(d + 32), xmm_def);
+ save_128_aligned ((__m128i*)(d + 48), xmm_def);
+
+ d += 64;
+ w -= 64;
+ }
+
+ cache_prefetch_next ((__m128i*)d);
+
+ if (w >= 32)
+ {
+ save_128_aligned ((__m128i*)(d), xmm_def);
+ save_128_aligned ((__m128i*)(d + 16), xmm_def);
+
+ d += 32;
+ w -= 32;
+ }
+
+ if (w >= 16)
+ {
+ save_128_aligned ((__m128i*)(d), xmm_def);
+
+ d += 16;
+ w -= 16;
+ }
+
+ cache_prefetch_next ((__m128i*)d);
+
+ while (w >= 4)
+ {
+ *(uint32_t *)d = data;
+
+ w -= 4;
+ d += 4;
+ }
+
+ if (w >= 2)
+ {
+ *(uint16_t *)d = data;
+ w -= 2;
+ d += 2;
+ }
}
- _mm_empty();
+ _mm_empty ();
return TRUE;
}
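
A usage sketch for pixman_fill_sse2 () as defined above. The stride argument is in uint32_t units (pixman's rowstride convention), and for bpp == 16 the function requires both halves of data to carry the same 16-bit value or it returns FALSE. The buffer setup below is illustrative only; the prototype is repeated from the definition, since it normally comes from pixman's private headers.

#include <stdint.h>
#include <stdlib.h>
#include "pixman.h"   /* for pixman_bool_t */

pixman_bool_t pixman_fill_sse2 (uint32_t *bits, int stride, int bpp,
                                int x, int y, int width, int height,
                                uint32_t data);

int
main (void)
{
    enum { WIDTH = 256, HEIGHT = 128 };
    uint32_t *bits = calloc (WIDTH * HEIGHT, sizeof (uint32_t));

    if (!bits)
        return 1;

    /* Fill a 100x50 rectangle at (10, 20) of a 32 bpp buffer with
     * opaque red; stride is given in uint32_t units.
     */
    pixman_fill_sse2 (bits, WIDTH, 32, 10, 20, 100, 50, 0xffff0000);

    free (bits);
    return 0;
}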
-void
-fbCompositeSolidMaskSrc_nx8x8888sse2 (pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int16_t xSrc,
- int16_t ySrc,
- int16_t xMask,
- int16_t yMask,
- int16_t xDst,
- int16_t yDst,
- uint16_t width,
- uint16_t height)
-{
- uint32_t src, srca;
- uint32_t *dstLine, *dst;
- uint8_t *maskLine, *mask;
- int dstStride, maskStride;
- uint16_t w;
- uint32_t m;
-
- __m128i xmmSrc, xmmDef;
- __m128i xmmMask, xmmMaskLo, xmmMaskHi;
-
- fbComposeGetSolid(pSrc, src, pDst->bits.format);
+static void
+sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca;
+ uint32_t *dst_line, *dst;
+ uint8_t *mask_line, *mask;
+ int dst_stride, mask_stride;
+ uint16_t w;
+ uint32_t m;
+
+ __m128i xmm_src, xmm_def;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
srca = src >> 24;
- if (srca == 0)
+ if (src == 0)
{
- pixmanFillsse2 (pDst->bits.bits, pDst->bits.rowstride,
- PIXMAN_FORMAT_BPP (pDst->bits.format),
- xDst, yDst, width, height, 0);
- return;
+ pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
+ PIXMAN_FORMAT_BPP (dst_image->bits.format),
+ dest_x, dest_y, width, height, 0);
+ return;
}
- fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
- fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
- xmmDef = createMask_2x32_128 (src, src);
- xmmSrc = expandPixel_32_1x128 (src);
+ xmm_def = create_mask_2x32_128 (src, src);
+ xmm_src = expand_pixel_32_1x128 (src);
while (height--)
{
- dst = dstLine;
- dstLine += dstStride;
- mask = maskLine;
- maskLine += maskStride;
- w = width;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)mask);
- cachePrefetch ((__m128i*)dst);
-
- while (w && (unsigned long)dst & 15)
- {
- uint8_t m = *mask++;
-
- if (m)
- {
- *dst = pack_1x64_32 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmSrc), expandPixel_8_1x64 (m)));
- }
- else
- {
- *dst = 0;
- }
-
- w--;
- dst++;
- }
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)mask);
- cachePrefetch ((__m128i*)dst);
-
- while (w >= 4)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)mask);
- cachePrefetchNext ((__m128i*)dst);
-
- m = *((uint32_t*)mask);
-
- if (srca == 0xff && m == 0xffffffff)
- {
- save128Aligned ((__m128i*)dst, xmmDef);
- }
- else if (m)
- {
- xmmMask = unpack_32_1x128 (m);
- xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());
-
- /* Unpacking */
- unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
-
- expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
- pixMultiply_2x128 (&xmmSrc, &xmmSrc, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
- save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmMaskLo, xmmMaskHi));
- }
- else
- {
- save128Aligned ((__m128i*)dst, _mm_setzero_si128());
- }
-
- w -= 4;
- dst += 4;
- mask += 4;
- }
-
- while (w)
- {
- uint8_t m = *mask++;
-
- if (m)
- {
- *dst = pack_1x64_32 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmSrc), expandPixel_8_1x64 (m)));
- }
- else
- {
- *dst = 0;
- }
-
- w--;
- dst++;
- }
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)mask);
+ cache_prefetch ((__m128i*)dst);
+
+ while (w && (unsigned long)dst & 15)
+ {
+ uint8_t m = *mask++;
+
+ if (m)
+ {
+ *dst = pack_1x64_32 (
+ pix_multiply_1x64 (
+ _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
+ }
+ else
+ {
+ *dst = 0;
+ }
+
+ w--;
+ dst++;
+ }
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)mask);
+ cache_prefetch ((__m128i*)dst);
+
+ while (w >= 4)
+ {
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)mask);
+ cache_prefetch_next ((__m128i*)dst);
+
+ m = *((uint32_t*)mask);
+
+ if (srca == 0xff && m == 0xffffffff)
+ {
+ save_128_aligned ((__m128i*)dst, xmm_def);
+ }
+ else if (m)
+ {
+ xmm_mask = unpack_32_1x128 (m);
+ xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+ /* Unpacking */
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ pix_multiply_2x128 (&xmm_src, &xmm_src,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
+ }
+ else
+ {
+ save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
+ }
+
+ w -= 4;
+ dst += 4;
+ mask += 4;
+ }
+
+ while (w)
+ {
+ uint8_t m = *mask++;
+
+ if (m)
+ {
+ *dst = pack_1x64_32 (
+ pix_multiply_1x64 (
+ _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
+ }
+ else
+ {
+ *dst = 0;
+ }
+
+ w--;
+ dst++;
+ }
}
- _mm_empty();
+ _mm_empty ();
}
-/* -------------------------------------------------------------------------------------------------
- * fbCompositeSolidMask_nx8x0565
+/* ----------------------------------------------------------------------
+ * composite_over_n_8_0565
*/
-void
-fbCompositeSolidMask_nx8x0565sse2 (pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int16_t xSrc,
- int16_t ySrc,
- int16_t xMask,
- int16_t yMask,
- int16_t xDst,
- int16_t yDst,
- uint16_t width,
- uint16_t height)
-{
- uint32_t src, srca;
- uint16_t *dstLine, *dst, d;
- uint8_t *maskLine, *mask;
- int dstStride, maskStride;
- uint16_t w;
+static void
+sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca;
+ uint16_t *dst_line, *dst, d;
+ uint8_t *mask_line, *mask;
+ int dst_stride, mask_stride;
+ uint16_t w;
uint32_t m;
- __m64 mmxSrc, mmxAlpha, mmxMask, mmxDest;
+ __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
- __m128i xmmSrc, xmmAlpha;
- __m128i xmmMask, xmmMaskLo, xmmMaskHi;
- __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;
+ __m128i xmm_src, xmm_alpha;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+ __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
- fbComposeGetSolid(pSrc, src, pDst->bits.format);
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
srca = src >> 24;
- if (srca == 0)
+ if (src == 0)
return;
- fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
- fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
- xmmSrc = expandPixel_32_1x128 (src);
- xmmAlpha = expandAlpha_1x128 (xmmSrc);
- mmxSrc = _mm_movepi64_pi64 (xmmSrc);
- mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);
+ xmm_src = expand_pixel_32_1x128 (src);
+ xmm_alpha = expand_alpha_1x128 (xmm_src);
+ mmx_src = _mm_movepi64_pi64 (xmm_src);
+ mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
while (height--)
{
- dst = dstLine;
- dstLine += dstStride;
- mask = maskLine;
- maskLine += maskStride;
- w = width;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)mask);
- cachePrefetch ((__m128i*)dst);
-
- while (w && (unsigned long)dst & 15)
- {
- m = *mask++;
-
- if (m)
- {
- d = *dst;
- mmxMask = expandAlphaRev_1x64 (unpack_32_1x64 (m));
- mmxDest = expand565_16_1x64 (d);
-
- *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
- &mmxAlpha,
- &mmxMask,
- &mmxDest)));
- }
-
- w--;
- dst++;
- }
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)mask);
- cachePrefetch ((__m128i*)dst);
-
- while (w >= 8)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)mask);
- cachePrefetchNext ((__m128i*)dst);
-
- xmmDst = load128Aligned ((__m128i*) dst);
- unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
-
- m = *((uint32_t*)mask);
- mask += 4;
-
- if (m)
- {
- xmmMask = unpack_32_1x128 (m);
- xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());
-
- /* Unpacking */
- unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
-
- expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
- inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst0, &xmmDst1);
- }
-
- m = *((uint32_t*)mask);
- mask += 4;
-
- if (m)
- {
- xmmMask = unpack_32_1x128 (m);
- xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());
-
- /* Unpacking */
- unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
-
- expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
- inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst2, &xmmDst3);
- }
-
- save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));
-
- w -= 8;
- dst += 8;
- }
-
- while (w)
- {
- m = *mask++;
-
- if (m)
- {
- d = *dst;
- mmxMask = expandAlphaRev_1x64 (unpack_32_1x64 (m));
- mmxDest = expand565_16_1x64 (d);
-
- *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
- &mmxAlpha,
- &mmxMask,
- &mmxDest)));
- }
-
- w--;
- dst++;
- }
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)mask);
+ cache_prefetch ((__m128i*)dst);
+
+ while (w && (unsigned long)dst & 15)
+ {
+ m = *mask++;
+
+ if (m)
+ {
+ d = *dst;
+ mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
+ mmx_dest = expand565_16_1x64 (d);
+
+ *dst = pack_565_32_16 (
+ pack_1x64_32 (
+ in_over_1x64 (
+ &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+ }
+
+ w--;
+ dst++;
+ }
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)mask);
+ cache_prefetch ((__m128i*)dst);
+
+ while (w >= 8)
+ {
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)mask);
+ cache_prefetch_next ((__m128i*)dst);
+
+ xmm_dst = load_128_aligned ((__m128i*) dst);
+ unpack_565_128_4x128 (xmm_dst,
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+
+ m = *((uint32_t*)mask);
+ mask += 4;
+
+ if (m)
+ {
+ xmm_mask = unpack_32_1x128 (m);
+ xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+ /* Unpacking */
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ in_over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst0, &xmm_dst1);
+ }
+
+ m = *((uint32_t*)mask);
+ mask += 4;
+
+ if (m)
+ {
+ xmm_mask = unpack_32_1x128 (m);
+ xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+ /* Unpacking */
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+ in_over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst2, &xmm_dst3);
+ }
+
+ save_128_aligned (
+ (__m128i*)dst, pack_565_4x128_128 (
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+ w -= 8;
+ dst += 8;
+ }
+
+ while (w)
+ {
+ m = *mask++;
+
+ if (m)
+ {
+ d = *dst;
+ mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
+ mmx_dest = expand565_16_1x64 (d);
+
+ *dst = pack_565_32_16 (
+ pack_1x64_32 (
+ in_over_1x64 (
+ &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+ }
+
+ w--;
+ dst++;
+ }
}
- _mm_empty();
+ _mm_empty ();
}
-/* -------------------------------------------------------------------------------------------------
- * fbCompositeSrc_8888RevNPx0565
+/* -----------------------------------------------------------------------
+ * composite_over_pixbuf_0565
*/
-void
-fbCompositeSrc_8888RevNPx0565sse2 (pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int16_t xSrc,
- int16_t ySrc,
- int16_t xMask,
- int16_t yMask,
- int16_t xDst,
- int16_t yDst,
- uint16_t width,
- uint16_t height)
-{
- uint16_t *dstLine, *dst, d;
- uint32_t *srcLine, *src, s;
- int dstStride, srcStride;
- uint16_t w;
- uint32_t packCmp;
+static void
+sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint16_t *dst_line, *dst, d;
+ uint32_t *src_line, *src, s;
+ int dst_stride, src_stride;
+ uint16_t w;
+ uint32_t opaque, zero;
__m64 ms;
- __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
- __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
- fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
- fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
#if 0
/* FIXME
@@ -3657,131 +4288,144 @@ fbCompositeSrc_8888RevNPx0565sse2 (pixman_op_t op,
* I copy the code from MMX one and keep the fixme.
* If it's a problem there, probably is a problem here.
*/
- assert (pSrc->pDrawable == pMask->pDrawable);
+ assert (src_image->drawable == mask_image->drawable);
#endif
while (height--)
{
- dst = dstLine;
- dstLine += dstStride;
- src = srcLine;
- srcLine += srcStride;
- w = width;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)src);
- cachePrefetch ((__m128i*)dst);
-
- while (w && (unsigned long)dst & 15)
- {
- s = *src++;
- d = *dst;
-
- ms = unpack_32_1x64 (s);
-
- *dst++ = pack565_32_16 (pack_1x64_32 (overRevNonPre_1x64(ms, expand565_16_1x64 (d))));
- w--;
- }
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)src);
- cachePrefetch ((__m128i*)dst);
-
- while (w >= 8)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)src);
- cachePrefetchNext ((__m128i*)dst);
-
- /* First round */
- xmmSrc = load128Unaligned((__m128i*)src);
- xmmDst = load128Aligned ((__m128i*)dst);
-
- packCmp = packAlpha (xmmSrc);
-
- unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
- unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
-
- /* preload next round*/
- xmmSrc = load128Unaligned((__m128i*)(src+4));
- /* preload next round*/
-
- if (packCmp == 0xffffffff)
- {
- invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst0, &xmmDst1);
- }
- else if (packCmp)
- {
- overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst0, &xmmDst1);
- }
-
- /* Second round */
- packCmp = packAlpha (xmmSrc);
-
- unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
-
- if (packCmp == 0xffffffff)
- {
- invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst2, &xmmDst3);
- }
- else if (packCmp)
- {
- overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst2, &xmmDst3);
- }
-
- save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));
-
- w -= 8;
- src += 8;
- dst += 8;
- }
-
- while (w)
- {
- s = *src++;
- d = *dst;
-
- ms = unpack_32_1x64 (s);
-
- *dst++ = pack565_32_16 (pack_1x64_32 (overRevNonPre_1x64(ms, expand565_16_1x64 (d))));
- w--;
- }
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)src);
+ cache_prefetch ((__m128i*)dst);
+
+ while (w && (unsigned long)dst & 15)
+ {
+ s = *src++;
+ d = *dst;
+
+ ms = unpack_32_1x64 (s);
+
+ *dst++ = pack_565_32_16 (
+ pack_1x64_32 (
+ over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
+ w--;
+ }
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)src);
+ cache_prefetch ((__m128i*)dst);
+
+ while (w >= 8)
+ {
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)src);
+ cache_prefetch_next ((__m128i*)dst);
+
+ /* First round */
+ xmm_src = load_128_unaligned ((__m128i*)src);
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ opaque = is_opaque (xmm_src);
+ zero = is_zero (xmm_src);
+
+ unpack_565_128_4x128 (xmm_dst,
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+
+ /* preload next round*/
+ xmm_src = load_128_unaligned ((__m128i*)(src + 4));
+
+ if (opaque)
+ {
+ invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_dst0, &xmm_dst1);
+ }
+ else if (!zero)
+ {
+ over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_dst0, &xmm_dst1);
+ }
+
+ /* Second round */
+ opaque = is_opaque (xmm_src);
+ zero = is_zero (xmm_src);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+
+ if (opaque)
+ {
+ invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_dst2, &xmm_dst3);
+ }
+ else if (!zero)
+ {
+ over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_dst2, &xmm_dst3);
+ }
+
+ save_128_aligned (
+ (__m128i*)dst, pack_565_4x128_128 (
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+ w -= 8;
+ src += 8;
+ dst += 8;
+ }
+
+ while (w)
+ {
+ s = *src++;
+ d = *dst;
+
+ ms = unpack_32_1x64 (s);
+
+ *dst++ = pack_565_32_16 (
+ pack_1x64_32 (
+ over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
+ w--;
+ }
}
- _mm_empty();
+ _mm_empty ();
}
-/* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */
-
-/* -------------------------------------------------------------------------------------------------
- * fbCompositeSrc_8888RevNPx8888
+/* -------------------------------------------------------------------------
+ * composite_over_pixbuf_8888
*/
-void
-fbCompositeSrc_8888RevNPx8888sse2 (pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int16_t xSrc,
- int16_t ySrc,
- int16_t xMask,
- int16_t yMask,
- int16_t xDst,
- int16_t yDst,
- uint16_t width,
- uint16_t height)
-{
- uint32_t *dstLine, *dst, d;
- uint32_t *srcLine, *src, s;
- int dstStride, srcStride;
- uint16_t w;
- uint32_t packCmp;
-
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
-
- fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
- fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
+static void
+sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *dst_line, *dst, d;
+ uint32_t *src_line, *src, s;
+ int dst_stride, src_stride;
+ uint16_t w;
+ uint32_t opaque, zero;
+
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
#if 0
/* FIXME
@@ -3789,938 +4433,1276 @@ fbCompositeSrc_8888RevNPx8888sse2 (pixman_op_t op,
 * I copied the code from the MMX version and kept the FIXME.
 * If it's a problem there, it's probably a problem here.
*/
- assert (pSrc->pDrawable == pMask->pDrawable);
+ assert (src_image->drawable == mask_image->drawable);
#endif
while (height--)
{
- dst = dstLine;
- dstLine += dstStride;
- src = srcLine;
- srcLine += srcStride;
- w = width;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)src);
- cachePrefetch ((__m128i*)dst);
-
- while (w && (unsigned long)dst & 15)
- {
- s = *src++;
- d = *dst;
-
- *dst++ = pack_1x64_32 (overRevNonPre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
-
- w--;
- }
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)src);
- cachePrefetch ((__m128i*)dst);
-
- while (w >= 4)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)src);
- cachePrefetchNext ((__m128i*)dst);
-
- xmmSrcHi = load128Unaligned((__m128i*)src);
-
- packCmp = packAlpha (xmmSrcHi);
-
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
-
- if (packCmp == 0xffffffff)
- {
- invertColors_2x128( xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi);
-
- save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
- }
- else if (packCmp)
- {
- xmmDstHi = load128Aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
-
- overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi);
-
- save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
- }
-
- w -= 4;
- dst += 4;
- src += 4;
- }
-
- while (w)
- {
- s = *src++;
- d = *dst;
-
- *dst++ = pack_1x64_32 (overRevNonPre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
-
- w--;
- }
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)src);
+ cache_prefetch ((__m128i*)dst);
+
+ while (w && (unsigned long)dst & 15)
+ {
+ s = *src++;
+ d = *dst;
+
+ *dst++ = pack_1x64_32 (
+ over_rev_non_pre_1x64 (
+ unpack_32_1x64 (s), unpack_32_1x64 (d)));
+
+ w--;
+ }
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)src);
+ cache_prefetch ((__m128i*)dst);
+
+ while (w >= 4)
+ {
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)src);
+ cache_prefetch_next ((__m128i*)dst);
+
+ xmm_src_hi = load_128_unaligned ((__m128i*)src);
+
+ opaque = is_opaque (xmm_src_hi);
+ zero = is_zero (xmm_src_hi);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+
+ if (opaque)
+ {
+ invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+ else if (!zero)
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ w -= 4;
+ dst += 4;
+ src += 4;
+ }
+
+ while (w)
+ {
+ s = *src++;
+ d = *dst;
+
+ *dst++ = pack_1x64_32 (
+ over_rev_non_pre_1x64 (
+ unpack_32_1x64 (s), unpack_32_1x64 (d)));
+
+ w--;
+ }
}
- _mm_empty();
+ _mm_empty ();
}
/* -------------------------------------------------------------------------------------------------
- * fbCompositeSolidMask_nx8888x0565C
+ * composite_over_n_8888_0565_ca
*/
-void
-fbCompositeSolidMask_nx8888x0565Csse2 (pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int16_t xSrc,
- int16_t ySrc,
- int16_t xMask,
- int16_t yMask,
- int16_t xDst,
- int16_t yDst,
- uint16_t width,
- uint16_t height)
-{
- uint32_t src, srca;
- uint16_t *dstLine, *dst, d;
- uint32_t *maskLine, *mask, m;
- int dstStride, maskStride;
+static void
+sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src;
+ uint16_t *dst_line, *dst, d;
+ uint32_t *mask_line, *mask, m;
+ int dst_stride, mask_stride;
int w;
- uint32_t packCmp;
+ uint32_t pack_cmp;
- __m128i xmmSrc, xmmAlpha;
- __m128i xmmMask, xmmMaskLo, xmmMaskHi;
- __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;
+ __m128i xmm_src, xmm_alpha;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+ __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
- __m64 mmxSrc, mmxAlpha, mmxMask, mmxDest;
+ __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
- fbComposeGetSolid(pSrc, src, pDst->bits.format);
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
- srca = src >> 24;
- if (srca == 0)
- return;
+ if (src == 0)
+ return;
- fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
- fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
- xmmSrc = expandPixel_32_1x128 (src);
- xmmAlpha = expandAlpha_1x128 (xmmSrc);
- mmxSrc = _mm_movepi64_pi64 (xmmSrc);
- mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);
+ xmm_src = expand_pixel_32_1x128 (src);
+ xmm_alpha = expand_alpha_1x128 (xmm_src);
+ mmx_src = _mm_movepi64_pi64 (xmm_src);
+ mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
while (height--)
{
- w = width;
- mask = maskLine;
- dst = dstLine;
- maskLine += maskStride;
- dstLine += dstStride;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)mask);
- cachePrefetch ((__m128i*)dst);
-
- while (w && ((unsigned long)dst & 15))
- {
- m = *(uint32_t *) mask;
-
- if (m)
- {
- d = *dst;
- mmxMask = unpack_32_1x64 (m);
- mmxDest = expand565_16_1x64 (d);
-
- *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
- &mmxAlpha,
- &mmxMask,
- &mmxDest)));
- }
-
- w--;
- dst++;
- mask++;
- }
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)mask);
- cachePrefetch ((__m128i*)dst);
-
- while (w >= 8)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)mask);
- cachePrefetchNext ((__m128i*)dst);
-
- /* First round */
- xmmMask = load128Unaligned((__m128i*)mask);
- xmmDst = load128Aligned((__m128i*)dst);
-
- packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128()));
-
- unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
- unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
-
- /* preload next round*/
- xmmMask = load128Unaligned((__m128i*)(mask+4));
- /* preload next round*/
-
- if (packCmp != 0xffff)
- {
- inOver_2x128(&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst0, &xmmDst1);
- }
-
- /* Second round */
- packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128()));
-
- unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
-
- if (packCmp != 0xffff)
- {
- inOver_2x128(&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst2, &xmmDst3);
- }
-
- save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));
-
- w -= 8;
- dst += 8;
- mask += 8;
- }
-
- while (w)
- {
- m = *(uint32_t *) mask;
-
- if (m)
- {
- d = *dst;
- mmxMask = unpack_32_1x64 (m);
- mmxDest = expand565_16_1x64 (d);
-
- *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
- &mmxAlpha,
- &mmxMask,
- &mmxDest)));
- }
-
- w--;
- dst++;
- mask++;
- }
+ w = width;
+ mask = mask_line;
+ dst = dst_line;
+ mask_line += mask_stride;
+ dst_line += dst_stride;
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)mask);
+ cache_prefetch ((__m128i*)dst);
+
+ while (w && ((unsigned long)dst & 15))
+ {
+ m = *(uint32_t *) mask;
+
+ if (m)
+ {
+ d = *dst;
+ mmx_mask = unpack_32_1x64 (m);
+ mmx_dest = expand565_16_1x64 (d);
+
+ *dst = pack_565_32_16 (
+ pack_1x64_32 (
+ in_over_1x64 (
+ &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+ }
+
+ w--;
+ dst++;
+ mask++;
+ }
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)mask);
+ cache_prefetch ((__m128i*)dst);
+
+ while (w >= 8)
+ {
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)mask);
+ cache_prefetch_next ((__m128i*)dst);
+
+ /* First round */
+ xmm_mask = load_128_unaligned ((__m128i*)mask);
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ pack_cmp = _mm_movemask_epi8 (
+ _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+ unpack_565_128_4x128 (xmm_dst,
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+ /* preload next round */
+ xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
+
+ if (pack_cmp != 0xffff)
+ {
+ in_over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst0, &xmm_dst1);
+ }
+
+ /* Second round */
+ pack_cmp = _mm_movemask_epi8 (
+ _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+ if (pack_cmp != 0xffff)
+ {
+ in_over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst2, &xmm_dst3);
+ }
+
+ save_128_aligned (
+ (__m128i*)dst, pack_565_4x128_128 (
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+ w -= 8;
+ dst += 8;
+ mask += 8;
+ }
+
+ while (w)
+ {
+ m = *(uint32_t *) mask;
+
+ if (m)
+ {
+ d = *dst;
+ mmx_mask = unpack_32_1x64 (m);
+ mmx_dest = expand565_16_1x64 (d);
+
+ *dst = pack_565_32_16 (
+ pack_1x64_32 (
+ in_over_1x64 (
+ &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+ }
+
+ w--;
+ dst++;
+ mask++;
+ }
}
_mm_empty ();
}
-/* -------------------------------------------------------------------------------------------------
- * fbCompositeIn_nx8x8
+/* -----------------------------------------------------------------------
+ * composite_in_n_8_8
*/
-void
-fbCompositeIn_nx8x8sse2 (pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int16_t xSrc,
- int16_t ySrc,
- int16_t xMask,
- int16_t yMask,
- int16_t xDst,
- int16_t yDst,
- uint16_t width,
- uint16_t height)
-{
- uint8_t *dstLine, *dst;
- uint8_t *maskLine, *mask;
- int dstStride, maskStride;
- uint16_t w, d, m;
- uint32_t src;
- uint8_t sa;
-
- __m128i xmmAlpha;
- __m128i xmmMask, xmmMaskLo, xmmMaskHi;
- __m128i xmmDst, xmmDstLo, xmmDstHi;
-
- fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
- fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
-
- fbComposeGetSolid(pSrc, src, pDst->bits.format);
+static void
+sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_line, *dst;
+ uint8_t *mask_line, *mask;
+ int dst_stride, mask_stride;
+ uint16_t w, d, m;
+ uint32_t src;
+ uint8_t sa;
+
+ __m128i xmm_alpha;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
sa = src >> 24;
if (sa == 0)
- return;
+ return;
- xmmAlpha = expandAlpha_1x128 (expandPixel_32_1x128 (src));
+ xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
while (height--)
{
- dst = dstLine;
- dstLine += dstStride;
- mask = maskLine;
- maskLine += maskStride;
- w = width;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)mask);
- cachePrefetch ((__m128i*)dst);
-
- while (w && ((unsigned long)dst & 15))
- {
- m = (uint32_t) *mask++;
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
- w--;
- }
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)mask);
- cachePrefetch ((__m128i*)dst);
-
- while (w >= 16)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)mask);
- cachePrefetchNext ((__m128i*)dst);
-
- xmmMask = load128Unaligned((__m128i*)mask);
- xmmDst = load128Aligned((__m128i*)dst);
-
- unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
- unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
-
- pixMultiply_2x128 (&xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
- pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);
-
- save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
- mask += 16;
- dst += 16;
- w -= 16;
- }
-
- while (w)
- {
- m = (uint32_t) *mask++;
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
- w--;
- }
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)mask);
+ cache_prefetch ((__m128i*)dst);
+
+ while (w && ((unsigned long)dst & 15))
+ {
+ m = (uint32_t) *mask++;
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x64_32 (
+ pix_multiply_1x64 (
+ pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
+ unpack_32_1x64 (m)),
+ unpack_32_1x64 (d)));
+ w--;
+ }
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)mask);
+ cache_prefetch ((__m128i*)dst);
+
+ while (w >= 16)
+ {
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)mask);
+ cache_prefetch_next ((__m128i*)dst);
+
+ xmm_mask = load_128_unaligned ((__m128i*)mask);
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ mask += 16;
+ dst += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ m = (uint32_t) *mask++;
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x64_32 (
+ pix_multiply_1x64 (
+ pix_multiply_1x64 (
+ _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
+ unpack_32_1x64 (d)));
+ w--;
+ }
}
- _mm_empty();
+ _mm_empty ();
}
-/* -------------------------------------------------------------------------------------------------
- * fbCompositeIn_8x8
+/* ---------------------------------------------------------------------------
+ * composite_in_8_8
*/
-void
-fbCompositeIn_8x8sse2 (pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int16_t xSrc,
- int16_t ySrc,
- int16_t xMask,
- int16_t yMask,
- int16_t xDst,
- int16_t yDst,
- uint16_t width,
- uint16_t height)
-{
- uint8_t *dstLine, *dst;
- uint8_t *srcLine, *src;
- int srcStride, dstStride;
- uint16_t w;
- uint32_t s, d;
-
- __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
- __m128i xmmDst, xmmDstLo, xmmDstHi;
-
- fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
- fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
+static void
+sse2_composite_in_8_8 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_line, *dst;
+ uint8_t *src_line, *src;
+ int src_stride, dst_stride;
+ uint16_t w;
+ uint32_t s, d;
+
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
while (height--)
{
- dst = dstLine;
- dstLine += dstStride;
- src = srcLine;
- srcLine += srcStride;
- w = width;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)src);
- cachePrefetch ((__m128i*)dst);
-
- while (w && ((unsigned long)dst & 15))
- {
- s = (uint32_t) *src++;
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s),unpack_32_1x64 (d)));
- w--;
- }
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)src);
- cachePrefetch ((__m128i*)dst);
-
- while (w >= 16)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)src);
- cachePrefetchNext ((__m128i*)dst);
-
- xmmSrc = load128Unaligned((__m128i*)src);
- xmmDst = load128Aligned((__m128i*)dst);
-
- unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
-
- pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);
-
- save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
- src += 16;
- dst += 16;
- w -= 16;
- }
-
- while (w)
- {
- s = (uint32_t) *src++;
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s),unpack_32_1x64 (d)));
- w--;
- }
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)src);
+ cache_prefetch ((__m128i*)dst);
+
+ while (w && ((unsigned long)dst & 15))
+ {
+ s = (uint32_t) *src++;
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x64_32 (
+ pix_multiply_1x64 (
+ unpack_32_1x64 (s), unpack_32_1x64 (d)));
+ w--;
+ }
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)src);
+ cache_prefetch ((__m128i*)dst);
+
+ while (w >= 16)
+ {
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)src);
+ cache_prefetch_next ((__m128i*)dst);
+
+ xmm_src = load_128_unaligned ((__m128i*)src);
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ src += 16;
+ dst += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ s = (uint32_t) *src++;
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x64_32 (
+ pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
+ w--;
+ }
}
_mm_empty ();
}
-/* -------------------------------------------------------------------------------------------------
- * fbCompositeSrcAdd_8888x8x8
+/* -------------------------------------------------------------------------
+ * composite_add_8888_8_8
*/
-void
-fbCompositeSrcAdd_8888x8x8sse2 (pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int16_t xSrc,
- int16_t ySrc,
- int16_t xMask,
- int16_t yMask,
- int16_t xDst,
- int16_t yDst,
- uint16_t width,
- uint16_t height)
-{
- uint8_t *dstLine, *dst;
- uint8_t *maskLine, *mask;
- int dstStride, maskStride;
- uint16_t w;
- uint32_t src;
- uint8_t sa;
+static void
+sse2_composite_add_8888_8_8 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_line, *dst;
+ uint8_t *mask_line, *mask;
+ int dst_stride, mask_stride;
+ uint16_t w;
+ uint32_t src;
+ uint8_t sa;
uint32_t m, d;
- __m128i xmmAlpha;
- __m128i xmmMask, xmmMaskLo, xmmMaskHi;
- __m128i xmmDst, xmmDstLo, xmmDstHi;
+ __m128i xmm_alpha;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
- fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
- fbComposeGetSolid(pSrc, src, pDst->bits.format);
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
sa = src >> 24;
if (sa == 0)
- return;
+ return;
- xmmAlpha = expandAlpha_1x128 (expandPixel_32_1x128 (src));
+ xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
while (height--)
{
- dst = dstLine;
- dstLine += dstStride;
- mask = maskLine;
- maskLine += maskStride;
- w = width;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)mask);
- cachePrefetch ((__m128i*)dst);
-
- while (w && ((unsigned long)dst & 15))
- {
- m = (uint32_t) *mask++;
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
- w--;
- }
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)mask);
- cachePrefetch ((__m128i*)dst);
-
- while (w >= 16)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)mask);
- cachePrefetchNext ((__m128i*)dst);
-
- xmmMask = load128Unaligned((__m128i*)mask);
- xmmDst = load128Aligned((__m128i*)dst);
-
- unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
- unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
-
- pixMultiply_2x128 (&xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
- xmmDstLo = _mm_adds_epu16 (xmmMaskLo, xmmDstLo);
- xmmDstHi = _mm_adds_epu16 (xmmMaskHi, xmmDstHi);
-
- save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
- mask += 16;
- dst += 16;
- w -= 16;
- }
-
- while (w)
- {
- m = (uint32_t) *mask++;
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
- w--;
- }
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)mask);
+ cache_prefetch ((__m128i*)dst);
+
+ while (w && ((unsigned long)dst & 15))
+ {
+ m = (uint32_t) *mask++;
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x64_32 (
+ _mm_adds_pu16 (
+ pix_multiply_1x64 (
+ _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
+ unpack_32_1x64 (d)));
+ w--;
+ }
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)mask);
+ cache_prefetch ((__m128i*)dst);
+
+ while (w >= 16)
+ {
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)mask);
+ cache_prefetch_next ((__m128i*)dst);
+
+ xmm_mask = load_128_unaligned ((__m128i*)mask);
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
+ xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ mask += 16;
+ dst += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ m = (uint32_t) *mask++;
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x64_32 (
+ _mm_adds_pu16 (
+ pix_multiply_1x64 (
+ _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
+ unpack_32_1x64 (d)));
+
+ w--;
+ }
}
- _mm_empty();
+ _mm_empty ();
}
-/* -------------------------------------------------------------------------------------------------
- * fbCompositeSrcAdd_8000x8000
+/* ----------------------------------------------------------------------
+ * composite_add_8000_8000
*/
-void
-fbCompositeSrcAdd_8000x8000sse2 (pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int16_t xSrc,
- int16_t ySrc,
- int16_t xMask,
- int16_t yMask,
- int16_t xDst,
- int16_t yDst,
- uint16_t width,
- uint16_t height)
-{
- uint8_t *dstLine, *dst;
- uint8_t *srcLine, *src;
- int dstStride, srcStride;
- uint16_t w;
- uint16_t t;
-
- fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
- fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
+static void
+sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_line, *dst;
+ uint8_t *src_line, *src;
+ int dst_stride, src_stride;
+ uint16_t w;
+ uint16_t t;
+
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
while (height--)
{
- dst = dstLine;
- src = srcLine;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)src);
- cachePrefetch ((__m128i*)dst);
-
- dstLine += dstStride;
- srcLine += srcStride;
- w = width;
-
- /* Small head */
- while (w && (unsigned long)dst & 3)
- {
- t = (*dst) + (*src++);
- *dst++ = t | (0 - (t >> 8));
- w--;
- }
-
- coreCombineAddUsse2 ((uint32_t*)dst, (uint32_t*)src, w >> 2);
-
- /* Small tail */
- dst += w & 0xfffc;
- src += w & 0xfffc;
-
- w &= 3;
-
- while (w)
- {
- t = (*dst) + (*src++);
- *dst++ = t | (0 - (t >> 8));
- w--;
- }
+ dst = dst_line;
+ src = src_line;
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)src);
+ cache_prefetch ((__m128i*)dst);
+
+ dst_line += dst_stride;
+ src_line += src_stride;
+ w = width;
+
+ /* Small head */
+ while (w && (unsigned long)dst & 3)
+ {
+ t = (*dst) + (*src++);
+ *dst++ = t | (0 - (t >> 8));
+ w--;
+ }
+
+ core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
+
+ /* Small tail */
+ dst += w & 0xfffc;
+ src += w & 0xfffc;
+
+ w &= 3;
+
+ while (w)
+ {
+ t = (*dst) + (*src++);
+ *dst++ = t | (0 - (t >> 8));
+ w--;
+ }
}
- _mm_empty();
+ _mm_empty ();
}
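
The byte-at-a-time head and tail loops in sse2_composite_add_8000_8000 clamp their 9-bit sum with the branch-free expression t | (0 - (t >> 8)): when the sum exceeds 255, t >> 8 is 1, 0 - 1 becomes an all-ones mask, and the OR saturates the stored byte to 0xff; otherwise the mask is zero and the sum passes through unchanged. A small standalone check of that identity (plain C, written for illustration and not part of the patch):

/* Standalone check of the branch-free saturating add used in the
 * per-byte head/tail loops above. */
#include <assert.h>
#include <stdint.h>

static uint8_t
saturating_add_u8 (uint8_t a, uint8_t b)
{
    uint16_t t = (uint16_t)a + b;                /* 9-bit sum */

    /* If t > 255, t >> 8 is 1 and (0 - 1) is an all-ones mask, so the
     * OR forces the low byte to 0xff; otherwise the mask is zero and
     * t is stored as-is. */
    return (uint8_t)(t | (uint16_t)(0 - (t >> 8)));
}

int
main (void)
{
    assert (saturating_add_u8 (100, 100) == 200);
    assert (saturating_add_u8 (200, 100) == 255);
    assert (saturating_add_u8 (255, 255) == 255);
    return 0;
}
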
-/* -------------------------------------------------------------------------------------------------
- * fbCompositeSrcAdd_8888x8888
+/* ---------------------------------------------------------------------
+ * composite_add_8888_8888
*/
-void
-fbCompositeSrcAdd_8888x8888sse2 (pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int16_t xSrc,
- int16_t ySrc,
- int16_t xMask,
- int16_t yMask,
- int16_t xDst,
- int16_t yDst,
- uint16_t width,
- uint16_t height)
-{
- uint32_t *dstLine, *dst;
- uint32_t *srcLine, *src;
- int dstStride, srcStride;
-
- fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
- fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
+static void
+sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ int dst_stride, src_stride;
+
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
while (height--)
{
- dst = dstLine;
- dstLine += dstStride;
- src = srcLine;
- srcLine += srcStride;
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
- coreCombineAddUsse2 (dst, src, width);
+ core_combine_add_u_sse2 (dst, src, NULL, width);
}
- _mm_empty();
+ _mm_empty ();
}
/* -------------------------------------------------------------------------------------------------
- * fbCompositeCopyAreasse2
+ * sse2_composite_copy_area
*/
-pixman_bool_t
-pixmanBltsse2 (uint32_t *src_bits,
- uint32_t *dst_bits,
- int src_stride,
- int dst_stride,
- int src_bpp,
- int dst_bpp,
- int src_x, int src_y,
- int dst_x, int dst_y,
- int width, int height)
-{
- uint8_t * src_bytes;
- uint8_t * dst_bytes;
- int byte_width;
+static pixman_bool_t
+pixman_blt_sse2 (uint32_t *src_bits,
+ uint32_t *dst_bits,
+ int src_stride,
+ int dst_stride,
+ int src_bpp,
+ int dst_bpp,
+ int src_x,
+ int src_y,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height)
+{
+ uint8_t * src_bytes;
+ uint8_t * dst_bytes;
+ int byte_width;
if (src_bpp != dst_bpp)
- return FALSE;
+ return FALSE;
if (src_bpp == 16)
{
- src_stride = src_stride * (int) sizeof (uint32_t) / 2;
- dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
- src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
- dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
- byte_width = 2 * width;
- src_stride *= 2;
- dst_stride *= 2;
+ src_stride = src_stride * (int) sizeof (uint32_t) / 2;
+ dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
+	src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
+ dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
+ byte_width = 2 * width;
+ src_stride *= 2;
+ dst_stride *= 2;
}
else if (src_bpp == 32)
{
- src_stride = src_stride * (int) sizeof (uint32_t) / 4;
- dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
- src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
- dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
- byte_width = 4 * width;
- src_stride *= 4;
- dst_stride *= 4;
+ src_stride = src_stride * (int) sizeof (uint32_t) / 4;
+ dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
+ src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
+ dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
+ byte_width = 4 * width;
+ src_stride *= 4;
+ dst_stride *= 4;
}
else
{
- return FALSE;
+ return FALSE;
}
- cachePrefetch ((__m128i*)src_bytes);
- cachePrefetch ((__m128i*)dst_bytes);
+ cache_prefetch ((__m128i*)src_bytes);
+ cache_prefetch ((__m128i*)dst_bytes);
while (height--)
{
- int w;
- uint8_t *s = src_bytes;
- uint8_t *d = dst_bytes;
- src_bytes += src_stride;
- dst_bytes += dst_stride;
- w = byte_width;
-
- cachePrefetchNext ((__m128i*)s);
- cachePrefetchNext ((__m128i*)d);
-
- while (w >= 2 && ((unsigned long)d & 3))
- {
- *(uint16_t *)d = *(uint16_t *)s;
- w -= 2;
- s += 2;
- d += 2;
- }
-
- while (w >= 4 && ((unsigned long)d & 15))
- {
- *(uint32_t *)d = *(uint32_t *)s;
-
- w -= 4;
- s += 4;
- d += 4;
- }
-
- cachePrefetchNext ((__m128i*)s);
- cachePrefetchNext ((__m128i*)d);
-
- while (w >= 64)
- {
- __m128i xmm0, xmm1, xmm2, xmm3;
-
- /* 128 bytes ahead */
- cachePrefetch (((__m128i*)s) + 8);
- cachePrefetch (((__m128i*)d) + 8);
-
- xmm0 = load128Unaligned ((__m128i*)(s));
- xmm1 = load128Unaligned ((__m128i*)(s+16));
- xmm2 = load128Unaligned ((__m128i*)(s+32));
- xmm3 = load128Unaligned ((__m128i*)(s+48));
-
- save128Aligned ((__m128i*)(d), xmm0);
- save128Aligned ((__m128i*)(d+16), xmm1);
- save128Aligned ((__m128i*)(d+32), xmm2);
- save128Aligned ((__m128i*)(d+48), xmm3);
-
- s += 64;
- d += 64;
- w -= 64;
- }
-
- cachePrefetchNext ((__m128i*)s);
- cachePrefetchNext ((__m128i*)d);
-
- while (w >= 16)
- {
- save128Aligned ((__m128i*)d, load128Unaligned ((__m128i*)s) );
-
- w -= 16;
- d += 16;
- s += 16;
- }
-
- cachePrefetchNext ((__m128i*)s);
- cachePrefetchNext ((__m128i*)d);
-
- while (w >= 4)
- {
- *(uint32_t *)d = *(uint32_t *)s;
-
- w -= 4;
- s += 4;
- d += 4;
- }
-
- if (w >= 2)
- {
- *(uint16_t *)d = *(uint16_t *)s;
- w -= 2;
- s += 2;
- d += 2;
- }
+ int w;
+ uint8_t *s = src_bytes;
+ uint8_t *d = dst_bytes;
+ src_bytes += src_stride;
+ dst_bytes += dst_stride;
+ w = byte_width;
+
+ cache_prefetch_next ((__m128i*)s);
+ cache_prefetch_next ((__m128i*)d);
+
+ while (w >= 2 && ((unsigned long)d & 3))
+ {
+ *(uint16_t *)d = *(uint16_t *)s;
+ w -= 2;
+ s += 2;
+ d += 2;
+ }
+
+ while (w >= 4 && ((unsigned long)d & 15))
+ {
+ *(uint32_t *)d = *(uint32_t *)s;
+
+ w -= 4;
+ s += 4;
+ d += 4;
+ }
+
+ cache_prefetch_next ((__m128i*)s);
+ cache_prefetch_next ((__m128i*)d);
+
+ while (w >= 64)
+ {
+ __m128i xmm0, xmm1, xmm2, xmm3;
+
+ /* 128 bytes ahead */
+ cache_prefetch (((__m128i*)s) + 8);
+ cache_prefetch (((__m128i*)d) + 8);
+
+ xmm0 = load_128_unaligned ((__m128i*)(s));
+ xmm1 = load_128_unaligned ((__m128i*)(s + 16));
+ xmm2 = load_128_unaligned ((__m128i*)(s + 32));
+ xmm3 = load_128_unaligned ((__m128i*)(s + 48));
+
+ save_128_aligned ((__m128i*)(d), xmm0);
+ save_128_aligned ((__m128i*)(d + 16), xmm1);
+ save_128_aligned ((__m128i*)(d + 32), xmm2);
+ save_128_aligned ((__m128i*)(d + 48), xmm3);
+
+ s += 64;
+ d += 64;
+ w -= 64;
+ }
+
+ cache_prefetch_next ((__m128i*)s);
+ cache_prefetch_next ((__m128i*)d);
+
+ while (w >= 16)
+ {
+ save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
+
+ w -= 16;
+ d += 16;
+ s += 16;
+ }
+
+ cache_prefetch_next ((__m128i*)s);
+ cache_prefetch_next ((__m128i*)d);
+
+ while (w >= 4)
+ {
+ *(uint32_t *)d = *(uint32_t *)s;
+
+ w -= 4;
+ s += 4;
+ d += 4;
+ }
+
+ if (w >= 2)
+ {
+ *(uint16_t *)d = *(uint16_t *)s;
+ w -= 2;
+ s += 2;
+ d += 2;
+ }
}
- _mm_empty();
+ _mm_empty ();
return TRUE;
}
-void
-fbCompositeCopyAreasse2 (pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int16_t xSrc,
- int16_t ySrc,
- int16_t xMask,
- int16_t yMask,
- int16_t xDst,
- int16_t yDst,
- uint16_t width,
- uint16_t height)
-{
- pixmanBltsse2 (pSrc->bits.bits,
- pDst->bits.bits,
- pSrc->bits.rowstride,
- pDst->bits.rowstride,
- PIXMAN_FORMAT_BPP (pSrc->bits.format),
- PIXMAN_FORMAT_BPP (pDst->bits.format),
- xSrc, ySrc, xDst, yDst, width, height);
+static void
+sse2_composite_copy_area (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ pixman_blt_sse2 (src_image->bits.bits,
+ dst_image->bits.bits,
+ src_image->bits.rowstride,
+ dst_image->bits.rowstride,
+ PIXMAN_FORMAT_BPP (src_image->bits.format),
+ PIXMAN_FORMAT_BPP (dst_image->bits.format),
+ src_x, src_y, dest_x, dest_y, width, height);
}
#if 0
/* This code is buggy in the MMX version; the bug was carried over to the SSE2 version */
void
-fbCompositeOver_x888x8x8888sse2 (pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int16_t xSrc,
- int16_t ySrc,
- int16_t xMask,
- int16_t yMask,
- int16_t xDst,
- int16_t yDst,
- uint16_t width,
- uint16_t height)
-{
- uint32_t *src, *srcLine, s;
- uint32_t *dst, *dstLine, d;
- uint8_t *mask, *maskLine;
- uint32_t m;
- int srcStride, maskStride, dstStride;
+sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *src, *src_line, s;
+ uint32_t *dst, *dst_line, d;
+ uint8_t *mask, *mask_line;
+ uint32_t m;
+ int src_stride, mask_stride, dst_stride;
uint16_t w;
- __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
- __m128i xmmDst, xmmDstLo, xmmDstHi;
- __m128i xmmMask, xmmMaskLo, xmmMaskHi;
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
- fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
- fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
while (height--)
{
- src = srcLine;
- srcLine += srcStride;
- dst = dstLine;
- dstLine += dstStride;
- mask = maskLine;
- maskLine += maskStride;
-
- w = width;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)src);
- cachePrefetch ((__m128i*)dst);
- cachePrefetch ((__m128i*)mask);
-
- while (w && (unsigned long)dst & 15)
- {
- s = 0xff000000 | *src++;
- m = (uint32_t) *mask++;
- d = *dst;
-
- __m64 ms = unpack_32_1x64 (s);
-
- if (m != 0xff)
- {
- ms = inOver_1x64 (ms,
- xMask00ff,
- expandAlphaRev_1x64 (unpack_32_1x64 (m)),
- unpack_32_1x64 (d));
- }
-
- *dst++ = pack_1x64_32 (ms);
- w--;
- }
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)src);
- cachePrefetch ((__m128i*)dst);
- cachePrefetch ((__m128i*)mask);
-
- while (w >= 4)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)src);
- cachePrefetchNext ((__m128i*)dst);
- cachePrefetchNext ((__m128i*)mask);
-
- m = *(uint32_t*) mask;
- xmmSrc = _mm_or_si128 (load128Unaligned ((__m128i*)src), Maskff000000);
-
- if (m == 0xffffffff)
- {
- save128Aligned ((__m128i*)dst, xmmSrc);
- }
- else
- {
- xmmDst = load128Aligned ((__m128i*)dst);
-
- xmmMask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
-
- unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
- unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
-
- expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
- inOver_2x128 (xmmSrcLo, xmmSrcHi, Mask00ff, Mask00ff, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi);
-
- save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
- }
-
- src += 4;
- dst += 4;
- mask += 4;
- w -= 4;
- }
-
- while (w)
- {
- m = (uint32_t) *mask++;
-
- if (m)
- {
- s = 0xff000000 | *src;
-
- if (m == 0xff)
- {
- *dst = s;
- }
- else
- {
- d = *dst;
-
- *dst = pack_1x64_32 (inOver_1x64 (unpack_32_1x64 (s),
- xMask00ff,
- expandAlphaRev_1x64 (unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
- }
-
- }
-
- src++;
- dst++;
- w--;
- }
+ src = src_line;
+ src_line += src_stride;
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+
+ w = width;
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)src);
+ cache_prefetch ((__m128i*)dst);
+ cache_prefetch ((__m128i*)mask);
+
+ while (w && (unsigned long)dst & 15)
+ {
+ s = 0xff000000 | *src++;
+ m = (uint32_t) *mask++;
+ d = *dst;
+
+ __m64 ms = unpack_32_1x64 (s);
+
+ if (m != 0xff)
+ {
+ ms = in_over_1x64 (ms,
+ mask_x00ff,
+ expand_alpha_rev_1x64 (unpack_32_1x64 (m)),
+ unpack_32_1x64 (d));
+ }
+
+ *dst++ = pack_1x64_32 (ms);
+ w--;
+ }
+
+ /* call prefetch hint to optimize cache load*/
+ cache_prefetch ((__m128i*)src);
+ cache_prefetch ((__m128i*)dst);
+ cache_prefetch ((__m128i*)mask);
+
+ while (w >= 4)
+ {
+ /* fill cache line with next memory */
+ cache_prefetch_next ((__m128i*)src);
+ cache_prefetch_next ((__m128i*)dst);
+ cache_prefetch_next ((__m128i*)mask);
+
+ m = *(uint32_t*) mask;
+ xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
+
+ if (m == 0xffffffff)
+ {
+ save_128_aligned ((__m128i*)dst, xmm_src);
+ }
+ else
+ {
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ xmm_mask = _mm_unpacklo_epi16 (
+ unpack_32_1x128 (m), _mm_setzero_si128 ());
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ in_over_2x128 (xmm_src_lo, xmm_src_hi,
+ mask_00ff, mask_00ff,
+ xmm_mask_lo, xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ src += 4;
+ dst += 4;
+ mask += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ m = (uint32_t) *mask++;
+
+ if (m)
+ {
+ s = 0xff000000 | *src;
+
+ if (m == 0xff)
+ {
+ *dst = s;
+ }
+ else
+ {
+ d = *dst;
+
+ *dst = pack_1x64_32 (
+ in_over_1x64 (
+ unpack_32_1x64 (s),
+ mask_x00ff,
+ expand_alpha_rev_1x64 (unpack_32_1x64 (m)),
+ unpack_32_1x64 (d)));
+ }
+
+ }
+
+ src++;
+ dst++;
+ w--;
+ }
+ }
+
+ _mm_empty ();
+}
+
+#endif
+
+static const pixman_fast_path_t sse2_fast_paths[] =
+{
+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, sse2_composite_over_n_8_0565, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, sse2_composite_over_n_8_0565, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_n_0565, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_8888_8888, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_8888_8888, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_over_8888_8888, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_over_8888_8888, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_8888_0565, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_over_8888_0565, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8_8888, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8_8888, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8_8888, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8_8888, 0 },
+#if 0
+    /* FIXME: This code is buggy in the MMX version; the bug was carried over to the SSE2 version */
+ { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_x888_8_8888, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_x888_8_8888, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888, 0 },
+#endif
+ { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
+ { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
+ { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
+ { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK },
+ { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
+ { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
+ { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
+ { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
+ { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
+ { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
+ { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
+ { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
+ { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
+ { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
+ { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
+ { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
+ { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
+ { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
+ { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
+ { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
+ { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
+
+ { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_add_8000_8000, 0 },
+ { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_add_8888_8888, 0 },
+ { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_add_8888_8888, 0 },
+ { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_composite_add_8888_8_8, 0 },
+
+ { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_src_n_8_8888, 0 },
+ { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_src_n_8_8888, 0 },
+ { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_src_n_8_8888, 0 },
+ { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_src_n_8_8888, 0 },
+ { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_copy_area, 0 },
+ { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_copy_area, 0 },
+ { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
+ { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
+ { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
+ { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
+ { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_copy_area, 0 },
+ { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_copy_area, 0 },
+
+ { PIXMAN_OP_IN, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_in_8_8, 0 },
+ { PIXMAN_OP_IN, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_composite_in_n_8_8, 0 },
+
+ { PIXMAN_OP_NONE },
+};
+
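Each row of sse2_fast_paths maps an operator plus a (source, mask, destination) format triple to the specialized routine that handles it, with a flags word qualifying the match (e.g. NEED_SOLID_MASK, NEED_COMPONENT_ALPHA, NEED_PIXBUF); the PIXMAN_OP_NONE row terminates the scan. A minimal sketch of how such a table is consulted, using hypothetical enums and struct fields rather than pixman's real pixman_fast_path_t, and ignoring the flags for brevity:

/* Illustrative sketch of a fast-path table lookup; the types and names
 * here are hypothetical, not pixman's actual API. */
#include <stddef.h>
#include <stdio.h>

typedef enum { OP_NONE, OP_OVER, OP_ADD, OP_SRC, OP_IN } op_t;
typedef enum { FMT_NULL, FMT_SOLID, FMT_A8, FMT_R5G6B5 } fmt_t;
typedef void (*composite_func_t) (void);

typedef struct
{
    op_t             op;
    fmt_t            src, mask, dest;
    composite_func_t func;
} fast_path_t;

static void over_n_8_0565 (void) { puts ("over_n_8_0565"); }

static const fast_path_t fast_paths[] =
{
    { OP_OVER, FMT_SOLID, FMT_A8, FMT_R5G6B5, over_n_8_0565 },
    { OP_NONE },                 /* sentinel row ends the scan */
};

static composite_func_t
lookup_fast_path (op_t op, fmt_t src, fmt_t mask, fmt_t dest)
{
    const fast_path_t *p;

    for (p = fast_paths; p->op != OP_NONE; ++p)
    {
	if (p->op == op && p->src == src && p->mask == mask && p->dest == dest)
	    return p->func;
    }
    return NULL;                 /* no fast path: caller falls back */
}

int
main (void)
{
    composite_func_t f = lookup_fast_path (OP_OVER, FMT_SOLID, FMT_A8, FMT_R5G6B5);

    if (f)
	f ();
    return 0;
}

In the patch itself the lookup is done by _pixman_run_fast_path inside sse2_composite below, which falls back to the delegate implementation when no row matches.
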
+/*
+ * Work around GCC bug causing crashes in Mozilla with SSE2
+ *
+ * When using -msse, gcc generates movdqa instructions assuming that
+ * the stack is 16 byte aligned. Unfortunately some applications, such
+ * as Mozilla and Mono, end up aligning the stack to 4 bytes, which
+ * causes the movdqa instructions to fail.
+ *
+ * The __force_align_arg_pointer__ makes gcc generate a prologue that
+ * realigns the stack pointer to 16 bytes.
+ *
+ * On x86-64 this is not necessary because the standard ABI already
+ * calls for a 16 byte aligned stack.
+ *
+ * See https://bugs.freedesktop.org/show_bug.cgi?id=15693
+ */
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+static void
+sse2_composite (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src,
+ pixman_image_t * mask,
+ pixman_image_t * dest,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ if (_pixman_run_fast_path (sse2_fast_paths, imp,
+ op, src, mask, dest,
+ src_x, src_y,
+ mask_x, mask_y,
+ dest_x, dest_y,
+ width, height))
+ {
+ return;
+ }
+
+ _pixman_implementation_composite (imp->delegate, op,
+ src, mask, dest,
+ src_x, src_y,
+ mask_x, mask_y,
+ dest_x, dest_y,
+ width, height);
+}
+
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+static pixman_bool_t
+sse2_blt (pixman_implementation_t *imp,
+ uint32_t * src_bits,
+ uint32_t * dst_bits,
+ int src_stride,
+ int dst_stride,
+ int src_bpp,
+ int dst_bpp,
+ int src_x,
+ int src_y,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height)
+{
+ if (!pixman_blt_sse2 (
+ src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+ src_x, src_y, dst_x, dst_y, width, height))
+
+ {
+ return _pixman_implementation_blt (
+ imp->delegate,
+ src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+ src_x, src_y, dst_x, dst_y, width, height);
}
- _mm_empty();
+ return TRUE;
+}
+
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+static pixman_bool_t
+sse2_fill (pixman_implementation_t *imp,
+ uint32_t * bits,
+ int stride,
+ int bpp,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t xor)
+{
+ if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
+ {
+ return _pixman_implementation_fill (
+ imp->delegate, bits, stride, bpp, x, y, width, height, xor);
+ }
+
+ return TRUE;
+}
+
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+pixman_implementation_t *
+_pixman_implementation_create_sse2 (void)
+{
+ pixman_implementation_t *mmx = _pixman_implementation_create_mmx ();
+ pixman_implementation_t *imp = _pixman_implementation_create (mmx);
+
+ /* SSE2 constants */
+ mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
+ mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
+ mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
+ mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
+ mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
+ mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
+ mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
+ mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
+ mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
+ mask_0080 = create_mask_16_128 (0x0080);
+ mask_00ff = create_mask_16_128 (0x00ff);
+ mask_0101 = create_mask_16_128 (0x0101);
+ mask_ffff = create_mask_16_128 (0xffff);
+ mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
+ mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
+
+ /* MMX constants */
+ mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
+ mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
+
+ mask_x0080 = create_mask_16_64 (0x0080);
+ mask_x00ff = create_mask_16_64 (0x00ff);
+ mask_x0101 = create_mask_16_64 (0x0101);
+ mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
+
+ _mm_empty ();
+
+ /* Set up function pointers */
+
+ /* SSE code patch for fbcompose.c */
+ imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
+ imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
+ imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
+ imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
+ imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
+ imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
+ imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
+ imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
+ imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
+ imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
+
+ imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
+
+ imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
+ imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
+ imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
+ imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
+ imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
+ imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
+ imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
+
+ imp->composite = sse2_composite;
+ imp->blt = sse2_blt;
+ imp->fill = sse2_fill;
+
+ return imp;
}
-#endif /* #if 0 */
#endif /* USE_SSE2 */
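
_pixman_implementation_create_sse2 builds the SSE2 implementation on top of the MMX one: the MMX implementation becomes imp->delegate, and sse2_composite, sse2_blt and sse2_fill each try an SSE2 path first and hand anything they cannot handle down the chain. A minimal standalone sketch of that delegation pattern, using hypothetical types rather than pixman's real ones:

/* Minimal sketch of the delegate-chain idea; types and names here are
 * hypothetical and not pixman's actual API. */
#include <stdbool.h>
#include <stdio.h>

typedef struct impl impl_t;
typedef bool (*fill_func_t) (impl_t *self, int value);

struct impl
{
    const char  *name;
    impl_t      *delegate;   /* next, more general implementation */
    fill_func_t  fill;
};

/* A specialized implementation handles only what it is good at and
 * forwards everything else down the chain. */
static bool
fast_fill (impl_t *self, int value)
{
    if (value % 2 == 0)      /* pretend only even values have a fast path */
    {
	printf ("%s handled %d\n", self->name, value);
	return true;
    }
    return self->delegate->fill (self->delegate, value);
}

static bool
general_fill (impl_t *self, int value)
{
    printf ("%s handled %d\n", self->name, value);
    return true;
}

int
main (void)
{
    impl_t general = { "general", NULL,     general_fill };
    impl_t sse2    = { "sse2",    &general, fast_fill    };

    sse2.fill (&sse2, 4);    /* taken by the specialized layer */
    sse2.fill (&sse2, 5);    /* falls through to the delegate  */
    return 0;
}

Chaining implementations this way keeps each backend small: it implements only the cases it can accelerate and inherits everything else from the layer below.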