diff options
Diffstat (limited to 'pixman')
| -rw-r--r-- | pixman/configure.ac | 45 | ||||
| -rw-r--r-- | pixman/pixman/Makefile.am | 12 | ||||
| -rw-r--r-- | pixman/pixman/pixman-general.c | 22 | ||||
| -rw-r--r-- | pixman/pixman/pixman-private.h | 8 | ||||
| -rwxr-xr-x[-rw-r--r--] | pixman/pixman/pixman-sse2.c | 210 | ||||
| -rw-r--r-- | pixman/pixman/pixman-ssse3.c | 362 | ||||
| -rw-r--r-- | pixman/pixman/pixman-utils.c | 9 | ||||
| -rw-r--r-- | pixman/pixman/pixman-x86.c | 15 | ||||
| -rw-r--r-- | pixman/test/scaling-bench.c | 29 | 
9 files changed, 629 insertions, 83 deletions
| diff --git a/pixman/configure.ac b/pixman/configure.ac index daf4062b6..263c63edf 100644 --- a/pixman/configure.ac +++ b/pixman/configure.ac @@ -437,6 +437,50 @@ fi  AM_CONDITIONAL(USE_SSE2, test $have_sse2_intrinsics = yes)  dnl =========================================================================== +dnl Check for SSSE3 + +if test "x$SSSE3_CFLAGS" = "x" ; then +    SSSE3_CFLAGS="-mssse3 -Winline" +fi + +have_ssse3_intrinsics=no +AC_MSG_CHECKING(whether to use SSSE3 intrinsics) +xserver_save_CFLAGS=$CFLAGS +CFLAGS="$SSSE3_CFLAGS $CFLAGS" + +AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ +#include <mmintrin.h> +#include <xmmintrin.h> +#include <emmintrin.h> +#include <tmmintrin.h> +int main () { +    __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c; +    c = _mm_maddubs_epi16 (a, b); +    return 0; +}]])], have_ssse3_intrinsics=yes) +CFLAGS=$xserver_save_CFLAGS + +AC_ARG_ENABLE(ssse3, +   [AC_HELP_STRING([--disable-ssse3], +                   [disable SSSE3 fast paths])], +   [enable_ssse3=$enableval], [enable_ssse3=auto]) + +if test $enable_ssse3 = no ; then +   have_ssse3_intrinsics=disabled +fi + +if test $have_ssse3_intrinsics = yes ; then +   AC_DEFINE(USE_SSSE3, 1, [use SSSE3 compiler intrinsics]) +fi + +AC_MSG_RESULT($have_ssse3_intrinsics) +if test $enable_ssse3 = yes && test $have_ssse3_intrinsics = no ; then +   AC_MSG_ERROR([SSSE3 intrinsics not detected]) +fi + +AM_CONDITIONAL(USE_SSSE3, test $have_ssse3_intrinsics = yes) + +dnl ===========================================================================  dnl Other special flags needed when building code using MMX or SSE instructions  case $host_os in     solaris*) @@ -471,6 +515,7 @@ AC_SUBST(MMX_CFLAGS)  AC_SUBST(MMX_LDFLAGS)  AC_SUBST(SSE2_CFLAGS)  AC_SUBST(SSE2_LDFLAGS) +AC_SUBST(SSSE3_CFLAGS)  dnl ===========================================================================  dnl Check for VMX/Altivec diff --git a/pixman/pixman/Makefile.am b/pixman/pixman/Makefile.am index b9ea75424..b376d9aeb 
100644 --- a/pixman/pixman/Makefile.am +++ b/pixman/pixman/Makefile.am @@ -52,6 +52,18 @@ libpixman_1_la_LIBADD += libpixman-sse2.la  ASM_CFLAGS_sse2=$(SSE2_CFLAGS)  endif +# ssse3 code +if USE_SSSE3 +noinst_LTLIBRARIES += libpixman-ssse3.la +libpixman_ssse3_la_SOURCES = \ +	pixman-ssse3.c +libpixman_ssse3_la_CFLAGS = $(SSSE3_CFLAGS) +libpixman_1_la_LDFLAGS += $(SSSE3_LDFLAGS) +libpixman_1_la_LIBADD += libpixman-ssse3.la + +ASM_CFLAGS_ssse3=$(SSSE3_CFLAGS) +endif +  # arm simd code  if USE_ARM_SIMD  noinst_LTLIBRARIES += libpixman-arm-simd.la diff --git a/pixman/pixman/pixman-general.c b/pixman/pixman/pixman-general.c index 6310bff9d..a653fa71a 100644 --- a/pixman/pixman/pixman-general.c +++ b/pixman/pixman/pixman-general.c @@ -114,7 +114,7 @@ general_composite_rect  (pixman_implementation_t *imp,                           pixman_composite_info_t *info)  {      PIXMAN_COMPOSITE_ARGS (info); -    uint64_t stack_scanline_buffer[(SCANLINE_BUFFER_LENGTH * 3 + 7) / 8]; +    uint8_t stack_scanline_buffer[3 * SCANLINE_BUFFER_LENGTH];      uint8_t *scanline_buffer = (uint8_t *) stack_scanline_buffer;      uint8_t *src_buffer, *mask_buffer, *dest_buffer;      pixman_iter_t src_iter, mask_iter, dest_iter; @@ -137,17 +137,25 @@ general_composite_rect  (pixman_implementation_t *imp,  	Bpp = 16;      } -    if (width * Bpp > SCANLINE_BUFFER_LENGTH) +#define ALIGN(addr)							\ +    ((uint8_t *)((((uintptr_t)(addr)) + 15) & (~15))) + +    src_buffer = ALIGN (scanline_buffer); +    mask_buffer = ALIGN (src_buffer + width * Bpp); +    dest_buffer = ALIGN (mask_buffer + width * Bpp); + +    if (ALIGN (dest_buffer + width * Bpp) > +	    scanline_buffer + sizeof (stack_scanline_buffer))      { -	scanline_buffer = pixman_malloc_abc (width, 3, Bpp); +	scanline_buffer = pixman_malloc_ab_plus_c (width, Bpp * 3, 32 * 3);  	if (!scanline_buffer)  	    return; -    } -    src_buffer = scanline_buffer; -    mask_buffer = src_buffer + width * Bpp; -    dest_buffer = mask_buffer + width * Bpp; 
+	src_buffer = ALIGN (scanline_buffer); +	mask_buffer = ALIGN (src_buffer + width * Bpp); +	dest_buffer = ALIGN (mask_buffer + width * Bpp); +    }      if (width_flag == ITER_WIDE)      { diff --git a/pixman/pixman/pixman-private.h b/pixman/pixman/pixman-private.h index 120196ddf..6ca13b216 100644 --- a/pixman/pixman/pixman-private.h +++ b/pixman/pixman/pixman-private.h @@ -593,6 +593,11 @@ pixman_implementation_t *  _pixman_implementation_create_sse2 (pixman_implementation_t *fallback);  #endif +#ifdef USE_SSSE3 +pixman_implementation_t * +_pixman_implementation_create_ssse3 (pixman_implementation_t *fallback); +#endif +  #ifdef USE_ARM_SIMD  pixman_implementation_t *  _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback); @@ -787,6 +792,9 @@ pixman_malloc_ab (unsigned int n, unsigned int b);  void *  pixman_malloc_abc (unsigned int a, unsigned int b, unsigned int c); +void * +pixman_malloc_ab_plus_c (unsigned int a, unsigned int b, unsigned int c); +  pixman_bool_t  _pixman_multiply_overflows_size (size_t a, size_t b); diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c index 29ae623c5..430e455fd 100644..100755 --- a/pixman/pixman/pixman-sse2.c +++ b/pixman/pixman/pixman-sse2.c @@ -30,6 +30,9 @@  #include <config.h>  #endif +/* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */ +#define PSHUFD_IS_FAST 0 +  #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */  #include <emmintrin.h> /* for SSE2 intrinsics */  #include "pixman-private.h" @@ -5554,50 +5557,134 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,  			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,  			      uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE) +#if PSHUFD_IS_FAST + +/***********************************************************************************/ +  # define BILINEAR_DECLARE_VARIABLES						\      const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\      const __m128i 
xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\      const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\ -    const __m128i xmm_ux = _mm_set_epi16 (unit_x&0xffff, (-unit_x)&0xffff, unit_x&0xffff, (-unit_x)&0xffff,	\ -					  unit_x&0xffff, (-unit_x)&0xffff, unit_x&0xffff, (-unit_x)&0xffff);	\ +    const __m128i xmm_ux1 = _mm_set_epi16 ((unit_x)&0xffff, (-unit_x)&0xffff, (unit_x)&0xffff, (-unit_x)&0xffff,	\ +					   (unit_x)&0xffff, (-unit_x)&0xffff, (unit_x)&0xffff, (-unit_x)&0xffff);	\ +    const __m128i xmm_ux4 = _mm_set_epi16 ((unit_x * 4)&0xffff, (-unit_x * 4)&0xffff,		\ +					   (unit_x * 4)&0xffff, (-unit_x * 4)&0xffff,		\ +					   (unit_x * 4)&0xffff, (-unit_x * 4)&0xffff,		\ +					   (unit_x * 4)&0xffff, (-unit_x * 4)&0xffff);		\ +    const __m128i xmm_zero = _mm_setzero_si128 ();				\ +    __m128i xmm_x = _mm_set_epi16 (vx + unit_x * 3, -(vx + 1) - unit_x * 3,	\ +				   vx + unit_x * 2, -(vx + 1) - unit_x * 2,	\ +				   vx + unit_x * 1, -(vx + 1) - unit_x * 1,	\ +				   vx + unit_x * 0, -(vx + 1) - unit_x * 0);	\ +    __m128i xmm_wh_state; + +#define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase_)			\ +do {										\ +    int phase = phase_;								\ +    __m128i xmm_wh, xmm_a, xmm_b;						\ +    /* fetch 2x2 pixel block into sse2 registers */				\ +    __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]);		\ +    __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]);		\ +    vx += unit_x;								\ +    /* vertical interpolation */						\ +    xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt);	\ +    xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb);	\ +    xmm_a = _mm_add_epi16 (xmm_a, xmm_b);						\ +    /* calculate horizontal weights */						\ +    if (phase <= 0)								\ +    {										\ +	xmm_wh_state = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,		\ +					16 - BILINEAR_INTERPOLATION_BITS));	\ +	xmm_x = _mm_add_epi16 (xmm_x, (phase < 0) ? 
xmm_ux1 : xmm_ux4);		\ +	phase = 0;								\ +    }										\ +    xmm_wh = _mm_shuffle_epi32 (xmm_wh_state, _MM_SHUFFLE (phase, phase,	\ +							   phase, phase));	\ +    /* horizontal interpolation */						\ +    xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (		\ +		xmm_a, _MM_SHUFFLE (1, 0, 3, 2)), xmm_a), xmm_wh);		\ +    /* shift the result */							\ +    pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2);		\ +} while (0) + +#else /************************************************************************/ + +# define BILINEAR_DECLARE_VARIABLES						\ +    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\ +    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\ +    const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\ +    const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,	\ +					  unit_x, -unit_x, unit_x, -unit_x);	\ +    const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4,		\ +					   unit_x * 4, -unit_x * 4,		\ +					   unit_x * 4, -unit_x * 4,		\ +					   unit_x * 4, -unit_x * 4);		\      const __m128i xmm_zero = _mm_setzero_si128 ();				\      __m128i xmm_x = _mm_set_epi16 (vx&0xffff, (-(vx + 1))&0xffff, vx&0xffff, (-(vx + 1))&0xffff,		\  				   vx&0xffff, (-(vx + 1))&0xffff, vx&0xffff, (-(vx + 1))&0xffff) -#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\ +#define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase)			\  do {										\ -    __m128i xmm_wh, a;								\ +    __m128i xmm_wh, xmm_a, xmm_b;						\ +    (void)xmm_ux4; /* suppress warning: unused variable 'xmm_ux4' */		\      /* fetch 2x2 pixel block into sse2 registers */				\ -    __m128i tltr = _mm_loadl_epi64 (						\ -			    (__m128i *)&src_top[pixman_fixed_to_int (vx)]);	\ -    __m128i blbr = _mm_loadl_epi64 (						\ -			    (__m128i *)&src_bottom[pixman_fixed_to_int (vx)]);	\ +    __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]);		\ +    __m128i blbr = 
_mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]);		\      vx += unit_x;								\      /* vertical interpolation */						\ -    a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero),	\ -					xmm_wt),				\ -		       _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero),	\ -					xmm_wb));				\ +    xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt);	\ +    xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb);	\ +    xmm_a = _mm_add_epi16 (xmm_a, xmm_b);					\      /* calculate horizontal weights */						\      xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,			\ -			      16 - BILINEAR_INTERPOLATION_BITS));		\ -    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\ +					16 - BILINEAR_INTERPOLATION_BITS));	\ +    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1);					\      /* horizontal interpolation */						\ -    a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (			\ -			       a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh);	\ -    /* shift and pack the result */						\ -    a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2);			\ -    a = _mm_packs_epi32 (a, a);							\ -    a = _mm_packus_epi16 (a, a);						\ -    pix = _mm_cvtsi128_si32 (a);						\ +    xmm_b = _mm_unpacklo_epi64 (/* any value is fine here */ xmm_b, xmm_a);	\ +    xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (xmm_b, xmm_a), xmm_wh);		\ +    /* shift the result */							\ +    pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2);		\  } while (0) +/***********************************************************************************/ + +#endif + +#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix);					\ +do {										\ +	__m128i xmm_pix;							\ +	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix, -1);			\ +	xmm_pix = _mm_packs_epi32 (xmm_pix, xmm_pix);				\ +	xmm_pix = _mm_packus_epi16 (xmm_pix, xmm_pix);				\ +	pix = _mm_cvtsi128_si32 (xmm_pix);					\ +} while(0) + +#define BILINEAR_INTERPOLATE_FOUR_PIXELS(pix);					\ +do {										\ +	__m128i 
xmm_pix1, xmm_pix2, xmm_pix3, xmm_pix4;				\ +	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix1, 0);			\ +	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix2, 1);			\ +	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix3, 2);			\ +	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix4, 3);			\ +	xmm_pix1 = _mm_packs_epi32 (xmm_pix1, xmm_pix2);			\ +	xmm_pix3 = _mm_packs_epi32 (xmm_pix3, xmm_pix4);			\ +	pix = _mm_packus_epi16 (xmm_pix1, xmm_pix3);				\ +} while(0) +  #define BILINEAR_SKIP_ONE_PIXEL()						\  do {										\      vx += unit_x;								\ -    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\ +    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1);					\  } while(0) +#define BILINEAR_SKIP_FOUR_PIXELS()						\ +do {										\ +    vx += unit_x * 4;								\ +    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux4);					\ +} while(0) + +/***********************************************************************************/ +  static force_inline void  scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,  					     const uint32_t * mask, @@ -5606,24 +5693,28 @@ scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,  					     int32_t          w,  					     int              wt,  					     int              wb, -					     pixman_fixed_t   vx, -					     pixman_fixed_t   unit_x, +					     pixman_fixed_t   vx_, +					     pixman_fixed_t   unit_x_,  					     pixman_fixed_t   max_vx,  					     pixman_bool_t    zero_src)  { +    intptr_t vx = vx_; +    intptr_t unit_x = unit_x_;      BILINEAR_DECLARE_VARIABLES; -    uint32_t pix1, pix2, pix3, pix4; +    uint32_t pix1, pix2; -    while ((w -= 4) >= 0) +    while (w && ((uintptr_t)dst & 15))      {  	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); -	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); -	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); -	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);  	*dst++ = pix1; -	*dst++ = pix2; -	*dst++ = pix3; -	*dst++ = pix4; +	w--; +    } + +    while ((w -= 4) >= 0) { +	__m128i xmm_src; +	BILINEAR_INTERPOLATE_FOUR_PIXELS 
(xmm_src); +	_mm_store_si128 ((__m128i *)dst, xmm_src); +	dst += 4;      }      if (w & 2) @@ -5667,13 +5758,15 @@ scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,  					      int32_t          w,  					      int              wt,  					      int              wb, -					      pixman_fixed_t   vx, -					      pixman_fixed_t   unit_x, +					      pixman_fixed_t   vx_, +					      pixman_fixed_t   unit_x_,  					      pixman_fixed_t   max_vx,  					      pixman_bool_t    zero_src)  { +    intptr_t vx = vx_; +    intptr_t unit_x = unit_x_;      BILINEAR_DECLARE_VARIABLES; -    uint32_t pix1, pix2, pix3, pix4; +    uint32_t pix1, pix2;      while (w && ((uintptr_t)dst & 15))      { @@ -5695,12 +5788,7 @@ scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,  	__m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;  	__m128i xmm_alpha_hi, xmm_alpha_lo; -	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); -	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); -	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); -	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); - -	xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1); +	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);  	if (!is_zero (xmm_src))  	{ @@ -5767,13 +5855,15 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,  						int32_t          w,  						int              wt,  						int              wb, -						pixman_fixed_t   vx, -						pixman_fixed_t   unit_x, +						pixman_fixed_t   vx_, +						pixman_fixed_t   unit_x_,  						pixman_fixed_t   max_vx,  						pixman_bool_t    zero_src)  { +    intptr_t vx = vx_; +    intptr_t unit_x = unit_x_;      BILINEAR_DECLARE_VARIABLES; -    uint32_t pix1, pix2, pix3, pix4; +    uint32_t pix1, pix2;      uint32_t m;      while (w && ((uintptr_t)dst & 15)) @@ -5824,12 +5914,7 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,  	if (m)  	{ -	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); -	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); -	    BILINEAR_INTERPOLATE_ONE_PIXEL 
(pix3); -	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); - -	    xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1); +	    BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);  	    if (m == 0xffffffff && is_opaque (xmm_src))  	    { @@ -5856,10 +5941,7 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,  	}  	else  	{ -	    BILINEAR_SKIP_ONE_PIXEL (); -	    BILINEAR_SKIP_ONE_PIXEL (); -	    BILINEAR_SKIP_ONE_PIXEL (); -	    BILINEAR_SKIP_ONE_PIXEL (); +	    BILINEAR_SKIP_FOUR_PIXELS ();  	}  	w -= 4; @@ -5931,13 +6013,15 @@ scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst,  						int32_t          w,  						int              wt,  						int              wb, -						pixman_fixed_t   vx, -						pixman_fixed_t   unit_x, +						pixman_fixed_t   vx_, +						pixman_fixed_t   unit_x_,  						pixman_fixed_t   max_vx,  						pixman_bool_t    zero_src)  { +    intptr_t vx = vx_; +    intptr_t unit_x = unit_x_;      BILINEAR_DECLARE_VARIABLES; -    uint32_t pix1, pix2, pix3, pix4; +    uint32_t pix1;      __m128i xmm_mask;      if (zero_src || (*mask >> 24) == 0) @@ -5967,19 +6051,15 @@ scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst,      while (w >= 4)      { -	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); -	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); -	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); -	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); +	__m128i xmm_src; +	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src); -	if (pix1 | pix2 | pix3 | pix4) +	if (!is_zero (xmm_src))  	{ -	    __m128i xmm_src, xmm_src_lo, xmm_src_hi; +	    __m128i xmm_src_lo, xmm_src_hi;  	    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;  	    __m128i xmm_alpha_lo, xmm_alpha_hi; -	    xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1); -  	    xmm_dst = load_128_aligned ((__m128i*)dst);  	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); diff --git a/pixman/pixman/pixman-ssse3.c b/pixman/pixman/pixman-ssse3.c new file mode 100644 index 000000000..34763e20b --- /dev/null +++ 
b/pixman/pixman/pixman-ssse3.c @@ -0,0 +1,362 @@ +/* + * Copyright © 2013 Soren Sandmann Pedersen + * Copyright © 2013 Red Hat, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ * + * Author: Soren Sandmann (soren.sandmann@gmail.com) + */ +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <stdlib.h> +#include <mmintrin.h> +#include <xmmintrin.h> +#include <emmintrin.h> +#include <tmmintrin.h> +#include "pixman-private.h" +#include "pixman-inlines.h" + +typedef struct +{ +    int		y; +    uint64_t *	buffer; +} line_t; + +typedef struct +{ +    line_t		line0; +    line_t		line1; +    pixman_fixed_t	y; +    pixman_fixed_t	x; +    uint64_t		data[1]; +} bilinear_info_t; + +static void +ssse3_fetch_horizontal (bits_image_t *image, line_t *line, +			int y, pixman_fixed_t x, pixman_fixed_t ux, int n) +{ +    uint32_t *bits = image->bits + y * image->rowstride; +    __m128i vx = _mm_set_epi16 ( +	- (x + 1), x, - (x + 1), x, +	- (x + ux + 1), x + ux,  - (x + ux + 1), x + ux); +    __m128i vux = _mm_set_epi16 ( +	- 2 * ux, 2 * ux, - 2 * ux, 2 * ux, +	- 2 * ux, 2 * ux, - 2 * ux, 2 * ux); +    __m128i vaddc = _mm_set_epi16 (1, 0, 1, 0, 1, 0, 1, 0); +    __m128i *b = (__m128i *)line->buffer; +    __m128i vrl0, vrl1; + +    while ((n -= 2) >= 0) +    { +	__m128i vw, vr, s; + +	vrl1 = _mm_loadl_epi64 ( +	    (__m128i *)(bits + pixman_fixed_to_int (x + ux))); +	/* vrl1: R1, L1 */ + +    final_pixel: +	vrl0 = _mm_loadl_epi64 ( +	    (__m128i *)(bits + pixman_fixed_to_int (x))); +	/* vrl0: R0, L0 */ + +	/* The weights are based on vx which is a vector of  +	 * +	 *    - (x + 1), x, - (x + 1), x, +	 *          - (x + ux + 1), x + ux, - (x + ux + 1), x + ux +	 * +	 * so the 16 bit weights end up like this: +	 * +	 *    iw0, w0, iw0, w0, iw1, w1, iw1, w1 +	 * +	 * and after shifting and packing, we get these bytes: +	 * +	 *    iw0, w0, iw0, w0, iw1, w1, iw1, w1, +	 *        iw0, w0, iw0, w0, iw1, w1, iw1, w1, +	 * +	 * which means the first and the second input pixel  +	 * have to be interleaved like this: +	 * +	 *    la0, ra0, lr0, rr0, la1, ra1, lr1, rr1, +	 *        lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1 +	 * +	 * before maddubsw can be used. 
+	 */ + +	vw = _mm_add_epi16 ( +	    vaddc, _mm_srli_epi16 (vx, 16 - BILINEAR_INTERPOLATION_BITS)); +	/* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1 +	 */ + +	vw = _mm_packus_epi16 (vw, vw); +	/* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1, +	 *         iw0, w0, iw0, w0, iw1, w1, iw1, w1 +	 */ +	vx = _mm_add_epi16 (vx, vux); + +	x += 2 * ux; + +	vr = _mm_unpacklo_epi16 (vrl1, vrl0); +	/* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */ + +	s = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (1, 0, 3, 2)); +	/* s:  lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */ + +	vr = _mm_unpackhi_epi8 (vr, s); +	/* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1, +	 *         lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1 +	 */ + +	vr = _mm_maddubs_epi16 (vr, vw); + +	/* When the weight is 0, the inverse weight is +	 * 128 which can't be represented in a signed byte. +	 * As a result maddubsw computes the following: +	 * +	 *     r = l * -128 + r * 0 +	 * +	 * rather than the desired +	 * +	 *     r = l * 128 + r * 0 +	 * +	 * We fix this by taking the absolute value of the +	 * result. 
+	 */ +	vr = _mm_abs_epi16 (vr); + +	/* vr: A0, R0, A1, R1, G0, B0, G1, B1 */ +	_mm_store_si128 (b++, vr); +    } + +    if (n == -1) +    { +	vrl1 = _mm_setzero_si128(); +	goto final_pixel; +    } + +    line->y = y; +} + +static uint32_t * +ssse3_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask) +{ +    pixman_fixed_t fx, ux; +    bilinear_info_t *info = iter->data; +    line_t *line0, *line1; +    int y0, y1; +    int32_t dist_y; +    __m128i vw; +    int i; + +    fx = info->x; +    ux = iter->image->common.transform->matrix[0][0]; + +    y0 = pixman_fixed_to_int (info->y); +    y1 = y0 + 1; + +    line0 = &info->line0; +    line1 = &info->line1; + +    if (line0->y != y0 || line1->y != y1) +    { +	if (line0->y == y1 || line1->y == y0) +	{ +	    line_t tmp = *line0; +	    *line0 = *line1; +	    *line1 = tmp; +	} + +	if (line0->y != y0) +	{ +	    ssse3_fetch_horizontal ( +		&iter->image->bits, line0, y0, fx, ux, iter->width); +	} + +	if (line1->y != y1) +	{ +	    ssse3_fetch_horizontal ( +		&iter->image->bits, line1, y1, fx, ux, iter->width); +	} +    } + +    dist_y = pixman_fixed_to_bilinear_weight (info->y); +    dist_y <<= (16 - BILINEAR_INTERPOLATION_BITS); + +    vw = _mm_set_epi16 ( +	dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y); + +    for (i = 0; i + 3 < iter->width; i += 4) +    { +	__m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i)); +	__m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i)); +	__m128i top1 = _mm_load_si128 ((__m128i *)(line0->buffer + i + 2)); +	__m128i bot1 = _mm_load_si128 ((__m128i *)(line1->buffer + i + 2)); +	__m128i r0, r1, tmp, p; + +	r0 = _mm_mulhi_epu16 ( +	    _mm_sub_epi16 (bot0, top0), vw); +	tmp = _mm_cmplt_epi16 (bot0, top0); +	tmp = _mm_and_si128 (tmp, vw); +	r0 = _mm_sub_epi16 (r0, tmp); +	r0 = _mm_add_epi16 (r0, top0); +	r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS); +	/* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */ +	r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 
1)); +	/* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */ + +	r1 = _mm_mulhi_epu16 ( +	    _mm_sub_epi16 (bot1, top1), vw); +	tmp = _mm_cmplt_epi16 (bot1, top1); +	tmp = _mm_and_si128 (tmp, vw); +	r1 = _mm_sub_epi16 (r1, tmp); +	r1 = _mm_add_epi16 (r1, top1); +	r1 = _mm_srli_epi16 (r1, BILINEAR_INTERPOLATION_BITS); +	r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1)); +	/* r1: A3 R3 G3 B3 A2 R2 G2 B2 */ + +	p = _mm_packus_epi16 (r0, r1); + +	_mm_storeu_si128 ((__m128i *)(iter->buffer + i), p); +    } + +    while (i < iter->width) +    { +	__m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i)); +	__m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i)); +	__m128i r0, tmp, p; + +	r0 = _mm_mulhi_epu16 ( +	    _mm_sub_epi16 (bot0, top0), vw); +	tmp = _mm_cmplt_epi16 (bot0, top0); +	tmp = _mm_and_si128 (tmp, vw); +	r0 = _mm_sub_epi16 (r0, tmp); +	r0 = _mm_add_epi16 (r0, top0); +	r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS); +	/* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */ +	r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1)); +	/* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */ + +	p = _mm_packus_epi16 (r0, r0); + +	if (iter->width - i == 1) +	{ +	    *(uint32_t *)(iter->buffer + i) = _mm_cvtsi128_si32 (p); +	    i++; +	} +	else +	{ +	    _mm_storel_epi64 ((__m128i *)(iter->buffer + i), p); +	    i += 2; +	} +    } +     +    info->y += iter->image->common.transform->matrix[1][1]; + +    return iter->buffer; +} + +static void +ssse3_bilinear_cover_iter_fini (pixman_iter_t *iter) +{ +    free (iter->data); +} + +static void +ssse3_bilinear_cover_iter_init (pixman_iter_t *iter, const pixman_iter_info_t *iter_info) +{ +    int width = iter->width; +    bilinear_info_t *info; +    pixman_vector_t v; + +    /* Reference point is the center of the pixel */ +    v.vector[0] = pixman_int_to_fixed (iter->x) + pixman_fixed_1 / 2; +    v.vector[1] = pixman_int_to_fixed (iter->y) + pixman_fixed_1 / 2; +    v.vector[2] = pixman_fixed_1; + +    if (!pixman_transform_point_3d 
(iter->image->common.transform, &v)) +	goto fail; + +    info = malloc (sizeof (*info) + (2 * width - 1) * sizeof (uint64_t) + 64); +    if (!info) +	goto fail; + +    info->x = v.vector[0] - pixman_fixed_1 / 2; +    info->y = v.vector[1] - pixman_fixed_1 / 2; + +#define ALIGN(addr)							\ +    ((void *)((((uintptr_t)(addr)) + 15) & (~15))) + +    /* It is safe to set the y coordinates to -1 initially +     * because COVER_CLIP_BILINEAR ensures that we will only +     * be asked to fetch lines in the [0, height) interval +     */ +    info->line0.y = -1; +    info->line0.buffer = ALIGN (&(info->data[0])); +    info->line1.y = -1; +    info->line1.buffer = ALIGN (info->line0.buffer + width); + +    iter->get_scanline = ssse3_fetch_bilinear_cover; +    iter->fini = ssse3_bilinear_cover_iter_fini; + +    iter->data = info; +    return; + +fail: +    /* Something went wrong, either a bad matrix or OOM; in such cases, +     * we don't guarantee any particular rendering. +     */ +    _pixman_log_error ( +	FUNC, "Allocation failure or bad matrix, skipping rendering\n"); +     +    iter->get_scanline = _pixman_iter_get_scanline_noop; +    iter->fini = NULL; +} + +static const pixman_iter_info_t ssse3_iters[] =  +{ +    { PIXMAN_a8r8g8b8, +      (FAST_PATH_STANDARD_FLAGS			| +       FAST_PATH_SCALE_TRANSFORM		| +       FAST_PATH_BILINEAR_FILTER		| +       FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR), +      ITER_NARROW | ITER_SRC, +      ssse3_bilinear_cover_iter_init, +      NULL, NULL +    }, + +    { PIXMAN_null }, +}; + +static const pixman_fast_path_t ssse3_fast_paths[] = +{ +    { PIXMAN_OP_NONE }, +}; + +pixman_implementation_t * +_pixman_implementation_create_ssse3 (pixman_implementation_t *fallback) +{ +    pixman_implementation_t *imp = +	_pixman_implementation_create (fallback, ssse3_fast_paths); + +    imp->iter_info = ssse3_iters; + +    return imp; +} diff --git a/pixman/pixman/pixman-utils.c b/pixman/pixman/pixman-utils.c index 98723a800..4a3a835c4 100644 --- 
a/pixman/pixman/pixman-utils.c +++ b/pixman/pixman/pixman-utils.c @@ -49,6 +49,15 @@ _pixman_addition_overflows_int (unsigned int a, unsigned int b)  }  void * +pixman_malloc_ab_plus_c (unsigned int a, unsigned int b, unsigned int c) +{ +    if (!b || a >= INT32_MAX / b || (a * b) > INT32_MAX - c) +	return NULL; + +    return malloc (a * b + c); +} + +void *  pixman_malloc_ab (unsigned int a,                    unsigned int b)  { diff --git a/pixman/pixman/pixman-x86.c b/pixman/pixman/pixman-x86.c index 57e4d1f35..652776021 100644 --- a/pixman/pixman/pixman-x86.c +++ b/pixman/pixman/pixman-x86.c @@ -25,7 +25,7 @@  #include "pixman-private.h" -#if defined(USE_X86_MMX) || defined (USE_SSE2) +#if defined(USE_X86_MMX) || defined (USE_SSE2) || defined (USE_SSSE3)  /* The CPU detection code needs to be in a file not compiled with   * "-mmmx -msse", as gcc would generate CMOV instructions otherwise @@ -39,7 +39,8 @@ typedef enum      X86_MMX_EXTENSIONS		= (1 << 1),      X86_SSE			= (1 << 2) | X86_MMX_EXTENSIONS,      X86_SSE2			= (1 << 3), -    X86_CMOV			= (1 << 4) +    X86_CMOV			= (1 << 4), +    X86_SSSE3			= (1 << 5)  } cpu_features_t;  #ifdef HAVE_GETISAX @@ -64,6 +65,8 @@ detect_cpu_features (void)  	    features |= X86_SSE;  	if (result & AV_386_SSE2)  	    features |= X86_SSE2; +	if (result & AV_386_SSSE3) +	    features |= X86_SSSE3;      }      return features; @@ -167,6 +170,8 @@ detect_cpu_features (void)  	features |= X86_SSE;      if (d & (1 << 26))  	features |= X86_SSE2; +    if (c & (1 << 9)) /* SSSE3 is CPUID.1 ECX bit 9; EDX bit 9 is APIC */ +	features |= X86_SSSE3;      /* Check for AMD specific features */      if ((features & X86_MMX) && !(features & X86_SSE)) @@ -222,6 +227,7 @@ _pixman_x86_get_implementations (pixman_implementation_t *imp)  {  #define MMX_BITS  (X86_MMX | X86_MMX_EXTENSIONS)  #define SSE2_BITS (X86_MMX | X86_MMX_EXTENSIONS | X86_SSE | X86_SSE2) +#define SSSE3_BITS (X86_SSE | X86_SSE2 | X86_SSSE3)  #ifdef USE_X86_MMX      if (!_pixman_disabled ("mmx") && have_feature (MMX_BITS)) @@ 
-233,5 +239,10 @@ _pixman_x86_get_implementations (pixman_implementation_t *imp)  	imp = _pixman_implementation_create_sse2 (imp);  #endif +#ifdef USE_SSSE3 +    if (!_pixman_disabled ("ssse3") && have_feature (SSSE3_BITS)) +	imp = _pixman_implementation_create_ssse3 (imp); +#endif +      return imp;  } diff --git a/pixman/test/scaling-bench.c b/pixman/test/scaling-bench.c index b39adeff5..365e79850 100644 --- a/pixman/test/scaling-bench.c +++ b/pixman/test/scaling-bench.c @@ -3,6 +3,7 @@  #define SOURCE_WIDTH 320  #define SOURCE_HEIGHT 240 +#define TEST_REPEATS 3  static pixman_image_t *  make_source (void) @@ -39,30 +40,40 @@ main ()  	    "time per pixel / ns");      for (scale = 0.1; scale < 10.005; scale += 0.01)      { +	int i;  	int dest_width = SOURCE_WIDTH * scale + 0.5;  	int dest_height = SOURCE_HEIGHT * scale + 0.5; +	int dest_byte_stride = (dest_width * 4 + 15) & ~15;  	pixman_fixed_t s = (1 / scale) * 65536.0 + 0.5;  	pixman_transform_t transform;  	pixman_image_t *dest; -	double t1, t2; +	double t1, t2, t = -1; +	uint32_t *dest_buf = aligned_malloc (16, dest_byte_stride * dest_height); +	memset (dest_buf, 0, dest_byte_stride * dest_height);  	pixman_transform_init_scale (&transform, s, s);  	pixman_image_set_transform (src, &transform);  	dest = pixman_image_create_bits ( -	    PIXMAN_a8r8g8b8, dest_width, dest_height, NULL, -1); +	    PIXMAN_a8r8g8b8, dest_width, dest_height, dest_buf, dest_byte_stride); + +	for (i = 0; i < TEST_REPEATS; i++) +	{ +	    t1 = gettime(); +	    pixman_image_composite ( +		PIXMAN_OP_OVER, src, NULL, dest, +		scale, scale, 0, 0, 0, 0, dest_width, dest_height); +	    t2 = gettime(); +	    if (t < 0 || t2 - t1 < t) +		t = t2 - t1; +	} -	t1 = gettime(); -	pixman_image_composite ( -	    PIXMAN_OP_OVER, src, NULL, dest, -	    scale, scale, 0, 0, 0, 0, dest_width, dest_height); -	t2 = gettime(); -	  	printf ("%6.2f : %4dx%-4d => %4dx%-4d : %12.4f : %12.4f\n",  		scale, SOURCE_WIDTH, SOURCE_HEIGHT, dest_width, dest_height, -		
(t2 - t1) * 1000, ((t2 - t1) / (dest_width * dest_height)) * 1000000000); +		t * 1000, (t / (dest_width * dest_height)) * 1000000000);  	pixman_image_unref (dest); +	free (dest_buf);      }      return 0; | 
