diff options
Diffstat (limited to 'pixman/pixman/pixman-sse2.c')
-rw-r--r-- | pixman/pixman/pixman-sse2.c | 172 |
1 file changed, 151 insertions, 21 deletions
diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c index 2fa956e21..727ad42ec 100644 --- a/pixman/pixman/pixman-sse2.c +++ b/pixman/pixman/pixman-sse2.c @@ -264,19 +264,14 @@ pix_add_multiply_2x128 (__m128i* src_lo, __m128i* ret_lo, __m128i* ret_hi) { - __m128i lo, hi; - __m128i mul_lo, mul_hi; + __m128i t1_lo, t1_hi; + __m128i t2_lo, t2_hi; - lo = _mm_mullo_epi16 (*src_lo, *alpha_dst_lo); - hi = _mm_mullo_epi16 (*src_hi, *alpha_dst_hi); - mul_lo = _mm_mullo_epi16 (*dst_lo, *alpha_src_lo); - mul_hi = _mm_mullo_epi16 (*dst_hi, *alpha_src_hi); - lo = _mm_adds_epu16 (lo, mask_0080); - hi = _mm_adds_epu16 (hi, mask_0080); - lo = _mm_adds_epu16 (lo, mul_lo); - hi = _mm_adds_epu16 (hi, mul_hi); - *ret_lo = _mm_mulhi_epu16 (lo, mask_0101); - *ret_hi = _mm_mulhi_epu16 (hi, mask_0101); + pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi); + pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi); + + *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo); + *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi); } static force_inline void @@ -457,11 +452,10 @@ pix_add_multiply_1x64 (__m64* src, __m64* dst, __m64* alpha_src) { - return _mm_mulhi_pu16 ( - _mm_adds_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (*src, *alpha_dst), - mask_x0080), - _mm_mullo_pi16 (*dst, *alpha_src)), - mask_x0101); + __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst); + __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src); + + return _mm_adds_pu8 (t1, t2); } static force_inline __m64 @@ -3098,6 +3092,145 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp, _mm_empty (); } +/* ------------------------------ + * composite_add_n_8888_8888_ca + */ +static void +sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t 
height) +{ + uint32_t src, srca; + uint32_t *dst_line, d; + uint32_t *mask_line, m; + uint32_t pack_cmp; + int dst_stride, mask_stride; + + __m128i xmm_src, xmm_alpha; + __m128i xmm_dst; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + srca = src >> 24; + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + + xmm_src = _mm_unpacklo_epi8 ( + create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); + xmm_alpha = expand_alpha_1x128 (xmm_src); + mmx_src = _mm_movepi64_pi64 (xmm_src); + mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); + + while (height--) + { + int w = width; + const uint32_t *pm = (uint32_t *)mask_line; + uint32_t *pd = (uint32_t *)dst_line; + + dst_line += dst_stride; + mask_line += mask_stride; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); + + while (w && (unsigned long)pd & 15) + { + m = *pm++; + + if (m) + { + d = *pd; + + mmx_mask = unpack_32_1x64 (m); + mmx_dest = unpack_32_1x64 (d); + + *pd = pack_1x64_32 ( + _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest)); + } + + pd++; + w--; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); + + while (w >= 4) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); + + xmm_mask = load_128_unaligned ((__m128i*)pm); + + pack_cmp = + _mm_movemask_epi8 ( + _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); + + /* if all bits in mask are zero, pack_cmp are equal to 0xffff */ + if (pack_cmp != 0xffff) + { + xmm_dst = load_128_aligned ((__m128i*)pd); + + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + 
pix_multiply_2x128 (&xmm_src, &xmm_src, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi); + + save_128_aligned ( + (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst)); + } + + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + m = *pm++; + + if (m) + { + d = *pd; + + mmx_mask = unpack_32_1x64 (m); + mmx_dest = unpack_32_1x64 (d); + + *pd = pack_1x64_32 ( + _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest)); + } + + pd++; + w--; + } + } + + _mm_empty (); +} + /* --------------------------------------------------------------------------- * composite_over_n_8888_8888_ca */ @@ -4718,8 +4851,6 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp, src = _pixman_image_get_solid (src_image, dst_image->bits.format); sa = src >> 24; - if (sa == 0) - return; xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); @@ -4934,8 +5065,6 @@ sse2_composite_add_8888_8_8 (pixman_implementation_t *imp, src = _pixman_image_get_solid (src_image, dst_image->bits.format); sa = src >> 24; - if (sa == 0) - return; xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); @@ -5502,6 +5631,7 @@ static const pixman_fast_path_t sse2_fast_paths[] = { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 }, { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 }, + { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_add_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_add_8000_8000, 0 }, { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_add_8888_8888, 0 }, { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_add_8888_8888, 0 }, |