diff options
Diffstat (limited to 'pixman/pixman/pixman-mmx.c')
-rw-r--r-- | pixman/pixman/pixman-mmx.c | 69 |
1 files changed, 48 insertions, 21 deletions
diff --git a/pixman/pixman/pixman-mmx.c b/pixman/pixman/pixman-mmx.c index 1e6dbe8aa..3cb3131ec 100644 --- a/pixman/pixman/pixman-mmx.c +++ b/pixman/pixman/pixman-mmx.c @@ -61,7 +61,7 @@ _mm_empty (void) } #endif -#ifdef USE_X86_MMX +#if defined __GNUC__ && defined USE_X86_MMX # if (defined(__SUNPRO_C) || defined(_MSC_VER)) # include <xmmintrin.h> # else @@ -118,13 +118,17 @@ _mm_shuffle_pi16 (__m64 __A, int8_t const __N) }) # endif # endif -#endif #ifndef _MSC_VER #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) #endif +#else +#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */ +#include <emmintrin.h> /* for SSE2 intrinsics */ +#endif + /* Notes about writing mmx code * * give memory operands as the second operand. If you give it as the @@ -270,6 +274,9 @@ to_m64 (uint64_t x) #endif } +#ifdef _MSC_VER +#define to_uint64(arg) arg.M64_MEMBER +#else static force_inline uint64_t to_uint64 (__m64 x) { @@ -284,6 +291,7 @@ to_uint64 (__m64 x) return (uint64_t)x; #endif } +#endif static force_inline __m64 shift (__m64 v, @@ -447,6 +455,11 @@ pack8888 (__m64 lo, __m64 hi) return _mm_packs_pu16 (lo, hi); } +#ifdef _MSC_VER +#define store8888(dest,v) *(dest)=_mm_cvtsi64_si32 (pack8888 (v, _mm_setzero_si64 ())) +#define store(dest,v) *(dest) = _mm_cvtsi64_si32 (v) +#else + static force_inline void store (uint32_t *dest, __m64 v) { @@ -467,6 +480,7 @@ store8888 (uint32_t *dest, __m64 v) v = pack8888 (v, _mm_setzero_si64 ()); store (dest, v); } +#endif static force_inline pixman_bool_t is_equal (__m64 a, __m64 b) @@ -475,7 +489,9 @@ is_equal (__m64 a, __m64 b) /* __m64 is double, we can compare directly. */ return a == b; #else - return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff; + pixman_bool_t ret = _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff; + _mm_empty(); + return ret; #endif } @@ -486,15 +502,21 @@ is_opaque (__m64 v) return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha)); #else __m64 ffs = _mm_cmpeq_pi8 (v, v); - return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40); + pixman_bool_t ret = (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40); + _mm_empty(); + return ret; #endif } +#ifdef _MSC_VER +#define is_zero(v) is_equal (v, _mm_setzero_si64 ()) +#else static force_inline pixman_bool_t is_zero (__m64 v) { return is_equal (v, _mm_setzero_si64 ()); } +#endif /* Expand 16 bits positioned at @pos (0-3) of a mmx register into * @@ -562,6 +584,7 @@ expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha) *vout0 = _mm_unpacklo_pi16 (t0, t1); /* A1 R1 G1 B1 A0 R0 G0 B0 */ *vout1 = _mm_unpackhi_pi16 (t0, t1); /* A3 R3 G3 B3 A2 R2 G2 B2 */ + _mm_empty(); } static force_inline __m64 @@ -588,6 +611,7 @@ expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, *vout1 = expand8888 (v0, 1); *vout2 = expand8888 (v1, 0); *vout3 = expand8888 (v1, 1); + _mm_empty(); } static force_inline __m64 @@ -1890,7 +1914,7 @@ mmx_composite_over_8888_0565 (pixman_implementation_t *imp, vdest = pack_565 ( over (vsrc, expand_alpha (vsrc), vdest), vdest, 0); - *dst = to_uint64 (vdest); + *dst = to_uint64 (vdest)&0xffff; w--; dst++; @@ -1934,7 +1958,7 @@ mmx_composite_over_8888_0565 (pixman_implementation_t *imp, vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0); - *dst = to_uint64 (vdest); + *dst = to_uint64 (vdest) & 0xffff; w--; dst++; @@ -2136,7 +2160,7 @@ mmx_fill (pixman_implementation_t *imp, if (w >= 2 && ((unsigned long)d & 3)) { - *(uint16_t *)d = xor; + *(uint16_t *)d = (xor & 0xffff); w -= 2; d += 2; } @@ -2189,7 +2213,7 @@ mmx_fill (pixman_implementation_t *imp, } if (w >= 2) { - *(uint16_t *)d = xor; + *(uint16_t *)d = (xor & 0xffff); w -= 2; d += 2; } @@ -2473,7 +2497,7 @@ mmx_composite_over_n_8_0565 (pixman_implementation_t *imp, vm3 = to_m64 (m3); v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3); - *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);; + *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); } w -= 4; @@ -3549,41 +3573,44 @@ mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp, #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \ do { \ + __m64 t_hi, t_lo, b_hi, b_lo, hi, lo; \ /* fetch 2x2 pixel block into 2 mmx registers */ \ __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]); \ __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]); \ /* vertical interpolation */ \ - __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt); \ - __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt); \ - __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb); \ - __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb); \ - __m64 hi = _mm_add_pi16 (t_hi, b_hi); \ - __m64 lo = _mm_add_pi16 (t_lo, b_lo); \ + t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt); \ + t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt); \ + b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb); \ + b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb); \ + hi = _mm_add_pi16 (t_hi, b_hi); \ + lo = _mm_add_pi16 (t_lo, b_lo); \ vx += unit_x; \ if (BILINEAR_INTERPOLATION_BITS < 8) \ { \ + __m64 p, q; \ /* calculate horizontal weights */ \ __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7, \ _mm_srli_pi16 (mm_x, \ 16 - BILINEAR_INTERPOLATION_BITS))); \ /* horizontal interpolation */ \ - __m64 p = _mm_unpacklo_pi16 (lo, hi); \ - __m64 q = _mm_unpackhi_pi16 (lo, hi); \ + p = _mm_unpacklo_pi16 (lo, hi); \ + q = _mm_unpackhi_pi16 (lo, hi); \ lo = _mm_madd_pi16 (p, mm_wh); \ hi = _mm_madd_pi16 (q, mm_wh); \ } \ else \ { \ + __m64 mm_lo_lo, mm_lo_hi, mm_hi_lo, mm_hi_hi; \ /* calculate horizontal weights */ \ __m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x, \ 16 - BILINEAR_INTERPOLATION_BITS)); \ __m64 mm_wh_hi = _mm_srli_pi16 (mm_x, \ 16 - BILINEAR_INTERPOLATION_BITS); \ /* horizontal interpolation */ \ - __m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo); \ - __m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi); \ - __m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo); \ - __m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi); \ + mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo); \ + mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi); \ + mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo); \ + mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi); \ lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo), \ _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi)); \ hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo), \ |