aboutsummaryrefslogtreecommitdiff
path: root/pixman/pixman/pixman-sse2.c
diff options
context:
space:
mode:
Diffstat (limited to 'pixman/pixman/pixman-sse2.c')
-rwxr-xr-x[-rw-r--r--]pixman/pixman/pixman-sse2.c210
1 files changed, 145 insertions, 65 deletions
diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c
index 29ae623c5..430e455fd 100644..100755
--- a/pixman/pixman/pixman-sse2.c
+++ b/pixman/pixman/pixman-sse2.c
@@ -30,6 +30,9 @@
#include <config.h>
#endif
+/* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */
+#define PSHUFD_IS_FAST 0
+
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
@@ -5554,50 +5557,134 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
scaled_nearest_scanline_sse2_8888_n_8888_OVER,
uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
+#if PSHUFD_IS_FAST
+
+/***********************************************************************************/
+
# define BILINEAR_DECLARE_VARIABLES \
const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \
const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \
const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \
- const __m128i xmm_ux = _mm_set_epi16 (unit_x&0xffff, (-unit_x)&0xffff, unit_x&0xffff, (-unit_x)&0xffff, \
- unit_x&0xffff, (-unit_x)&0xffff, unit_x&0xffff, (-unit_x)&0xffff); \
+ const __m128i xmm_ux1 = _mm_set_epi16 ((unit_x)&0xffff, (-unit_x)&0xffff, (unit_x)&0xffff, (-unit_x)&0xffff, \
+ (unit_x)&0xffff, (-unit_x)&0xffff, (unit_x)&0xffff, (-unit_x)&0xffff); \
+ const __m128i xmm_ux4 = _mm_set_epi16 ((unit_x * 4)&0xffff, (-unit_x * 4)&0xffff, \
+ (unit_x * 4)&0xffff, (-unit_x * 4)&0xffff, \
+ (unit_x * 4)&0xffff, (-unit_x * 4)&0xffff, \
+ (unit_x * 4)&0xffff, (-unit_x * 4)&0xffff); \
+ const __m128i xmm_zero = _mm_setzero_si128 (); \
+ __m128i xmm_x = _mm_set_epi16 (vx + unit_x * 3, -(vx + 1) - unit_x * 3, \
+ vx + unit_x * 2, -(vx + 1) - unit_x * 2, \
+ vx + unit_x * 1, -(vx + 1) - unit_x * 1, \
+ vx + unit_x * 0, -(vx + 1) - unit_x * 0); \
+ __m128i xmm_wh_state;
+
+#define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase_) \
+do { \
+ int phase = phase_; \
+ __m128i xmm_wh, xmm_a, xmm_b; \
+ /* fetch 2x2 pixel block into sse2 registers */ \
+ __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]); \
+ __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]); \
+ vx += unit_x; \
+ /* vertical interpolation */ \
+ xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt); \
+ xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb); \
+ xmm_a = _mm_add_epi16 (xmm_a, xmm_b); \
+ /* calculate horizontal weights */ \
+ if (phase <= 0) \
+ { \
+ xmm_wh_state = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x, \
+ 16 - BILINEAR_INTERPOLATION_BITS)); \
+ xmm_x = _mm_add_epi16 (xmm_x, (phase < 0) ? xmm_ux1 : xmm_ux4); \
+ phase = 0; \
+ } \
+ xmm_wh = _mm_shuffle_epi32 (xmm_wh_state, _MM_SHUFFLE (phase, phase, \
+ phase, phase)); \
+ /* horizontal interpolation */ \
+ xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \
+ xmm_a, _MM_SHUFFLE (1, 0, 3, 2)), xmm_a), xmm_wh); \
+ /* shift the result */ \
+ pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2); \
+} while (0)
+
+#else /************************************************************************/
+
+# define BILINEAR_DECLARE_VARIABLES \
+ const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \
+ const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \
+ const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \
+ const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x, \
+ unit_x, -unit_x, unit_x, -unit_x); \
+ const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4, \
+ unit_x * 4, -unit_x * 4, \
+ unit_x * 4, -unit_x * 4, \
+ unit_x * 4, -unit_x * 4); \
const __m128i xmm_zero = _mm_setzero_si128 (); \
__m128i xmm_x = _mm_set_epi16 (vx&0xffff, (-(vx + 1))&0xffff, vx&0xffff, (-(vx + 1))&0xffff, \
vx&0xffff, (-(vx + 1))&0xffff, vx&0xffff, (-(vx + 1))&0xffff)
-#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \
+#define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase) \
do { \
- __m128i xmm_wh, a; \
+ __m128i xmm_wh, xmm_a, xmm_b; \
+ (void)xmm_ux4; /* suppress warning: unused variable 'xmm_ux4' */ \
/* fetch 2x2 pixel block into sse2 registers */ \
- __m128i tltr = _mm_loadl_epi64 ( \
- (__m128i *)&src_top[pixman_fixed_to_int (vx)]); \
- __m128i blbr = _mm_loadl_epi64 ( \
- (__m128i *)&src_bottom[pixman_fixed_to_int (vx)]); \
+ __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]); \
+ __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]); \
vx += unit_x; \
/* vertical interpolation */ \
- a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), \
- xmm_wt), \
- _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), \
- xmm_wb)); \
+ xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt); \
+ xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb); \
+ xmm_a = _mm_add_epi16 (xmm_a, xmm_b); \
/* calculate horizontal weights */ \
xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x, \
- 16 - BILINEAR_INTERPOLATION_BITS)); \
- xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
+ 16 - BILINEAR_INTERPOLATION_BITS)); \
+ xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1); \
/* horizontal interpolation */ \
- a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \
- a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh); \
- /* shift and pack the result */ \
- a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2); \
- a = _mm_packs_epi32 (a, a); \
- a = _mm_packus_epi16 (a, a); \
- pix = _mm_cvtsi128_si32 (a); \
+ xmm_b = _mm_unpacklo_epi64 (/* any value is fine here */ xmm_b, xmm_a); \
+ xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (xmm_b, xmm_a), xmm_wh); \
+ /* shift the result */ \
+ pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2); \
} while (0)
+/***********************************************************************************/
+
+#endif
+
+#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix); \
+do { \
+ __m128i xmm_pix; \
+ BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix, -1); \
+ xmm_pix = _mm_packs_epi32 (xmm_pix, xmm_pix); \
+ xmm_pix = _mm_packus_epi16 (xmm_pix, xmm_pix); \
+ pix = _mm_cvtsi128_si32 (xmm_pix); \
+} while(0)
+
+#define BILINEAR_INTERPOLATE_FOUR_PIXELS(pix); \
+do { \
+ __m128i xmm_pix1, xmm_pix2, xmm_pix3, xmm_pix4; \
+ BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix1, 0); \
+ BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix2, 1); \
+ BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix3, 2); \
+ BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix4, 3); \
+ xmm_pix1 = _mm_packs_epi32 (xmm_pix1, xmm_pix2); \
+ xmm_pix3 = _mm_packs_epi32 (xmm_pix3, xmm_pix4); \
+ pix = _mm_packus_epi16 (xmm_pix1, xmm_pix3); \
+} while(0)
+
#define BILINEAR_SKIP_ONE_PIXEL() \
do { \
vx += unit_x; \
- xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
+ xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1); \
} while(0)
+#define BILINEAR_SKIP_FOUR_PIXELS() \
+do { \
+ vx += unit_x * 4; \
+ xmm_x = _mm_add_epi16 (xmm_x, xmm_ux4); \
+} while(0)
+
+/***********************************************************************************/
+
static force_inline void
scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst,
const uint32_t * mask,
@@ -5606,24 +5693,28 @@ scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst,
int32_t w,
int wt,
int wb,
- pixman_fixed_t vx,
- pixman_fixed_t unit_x,
+ pixman_fixed_t vx_,
+ pixman_fixed_t unit_x_,
pixman_fixed_t max_vx,
pixman_bool_t zero_src)
{
+ intptr_t vx = vx_;
+ intptr_t unit_x = unit_x_;
BILINEAR_DECLARE_VARIABLES;
- uint32_t pix1, pix2, pix3, pix4;
+ uint32_t pix1, pix2;
- while ((w -= 4) >= 0)
+ while (w && ((uintptr_t)dst & 15))
{
BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
*dst++ = pix1;
- *dst++ = pix2;
- *dst++ = pix3;
- *dst++ = pix4;
+ w--;
+ }
+
+ while ((w -= 4) >= 0) {
+ __m128i xmm_src;
+ BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
+ _mm_store_si128 ((__m128i *)dst, xmm_src);
+ dst += 4;
}
if (w & 2)
@@ -5667,13 +5758,15 @@ scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst,
int32_t w,
int wt,
int wb,
- pixman_fixed_t vx,
- pixman_fixed_t unit_x,
+ pixman_fixed_t vx_,
+ pixman_fixed_t unit_x_,
pixman_fixed_t max_vx,
pixman_bool_t zero_src)
{
+ intptr_t vx = vx_;
+ intptr_t unit_x = unit_x_;
BILINEAR_DECLARE_VARIABLES;
- uint32_t pix1, pix2, pix3, pix4;
+ uint32_t pix1, pix2;
while (w && ((uintptr_t)dst & 15))
{
@@ -5695,12 +5788,7 @@ scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst,
__m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
__m128i xmm_alpha_hi, xmm_alpha_lo;
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
-
- xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
+ BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
if (!is_zero (xmm_src))
{
@@ -5767,13 +5855,15 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst,
int32_t w,
int wt,
int wb,
- pixman_fixed_t vx,
- pixman_fixed_t unit_x,
+ pixman_fixed_t vx_,
+ pixman_fixed_t unit_x_,
pixman_fixed_t max_vx,
pixman_bool_t zero_src)
{
+ intptr_t vx = vx_;
+ intptr_t unit_x = unit_x_;
BILINEAR_DECLARE_VARIABLES;
- uint32_t pix1, pix2, pix3, pix4;
+ uint32_t pix1, pix2;
uint32_t m;
while (w && ((uintptr_t)dst & 15))
@@ -5824,12 +5914,7 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst,
if (m)
{
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
-
- xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
+ BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
if (m == 0xffffffff && is_opaque (xmm_src))
{
@@ -5856,10 +5941,7 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst,
}
else
{
- BILINEAR_SKIP_ONE_PIXEL ();
- BILINEAR_SKIP_ONE_PIXEL ();
- BILINEAR_SKIP_ONE_PIXEL ();
- BILINEAR_SKIP_ONE_PIXEL ();
+ BILINEAR_SKIP_FOUR_PIXELS ();
}
w -= 4;
@@ -5931,13 +6013,15 @@ scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t * dst,
int32_t w,
int wt,
int wb,
- pixman_fixed_t vx,
- pixman_fixed_t unit_x,
+ pixman_fixed_t vx_,
+ pixman_fixed_t unit_x_,
pixman_fixed_t max_vx,
pixman_bool_t zero_src)
{
+ intptr_t vx = vx_;
+ intptr_t unit_x = unit_x_;
BILINEAR_DECLARE_VARIABLES;
- uint32_t pix1, pix2, pix3, pix4;
+ uint32_t pix1;
__m128i xmm_mask;
if (zero_src || (*mask >> 24) == 0)
@@ -5967,19 +6051,15 @@ scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t * dst,
while (w >= 4)
{
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+ __m128i xmm_src;
+ BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
- if (pix1 | pix2 | pix3 | pix4)
+ if (!is_zero (xmm_src))
{
- __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
__m128i xmm_alpha_lo, xmm_alpha_hi;
- xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
-
xmm_dst = load_128_aligned ((__m128i*)dst);
unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);