From 7f669a708bd35bdf8e842f762ec68f9ad0ec0486 Mon Sep 17 00:00:00 2001 From: marha Date: Wed, 18 Sep 2013 08:03:04 +0200 Subject: libX11 mesa pixman xserver git update 18 Sep 2013 xserver commit 8010d3a48bd0b224dcb0883e39c2351ad364d846 libX11 commit 5dcb40f28d59587597d2ff6e6ac64c71cfe6ff7b pixman commit 58a79dfe6d1fd62c2b66c69fdb64f6b8ecf61da5 mesa commit a3b51a22f71819236baa6bbda9d0d050914b149d --- pixman/configure.ac | 45 +++++ pixman/pixman/Makefile.am | 12 ++ pixman/pixman/pixman-general.c | 22 ++- pixman/pixman/pixman-private.h | 8 + pixman/pixman/pixman-sse2.c | 208 +++++++++++++++-------- pixman/pixman/pixman-ssse3.c | 362 +++++++++++++++++++++++++++++++++++++++++ pixman/pixman/pixman-utils.c | 9 + pixman/pixman/pixman-x86.c | 15 +- pixman/test/scaling-bench.c | 29 +++- 9 files changed, 628 insertions(+), 82 deletions(-) create mode 100644 pixman/pixman/pixman-ssse3.c (limited to 'pixman') diff --git a/pixman/configure.ac b/pixman/configure.ac index daf4062b6..263c63edf 100644 --- a/pixman/configure.ac +++ b/pixman/configure.ac @@ -436,6 +436,50 @@ fi AM_CONDITIONAL(USE_SSE2, test $have_sse2_intrinsics = yes) +dnl =========================================================================== +dnl Check for SSSE3 + +if test "x$SSSE3_CFLAGS" = "x" ; then + SSSE3_CFLAGS="-mssse3 -Winline" +fi + +have_ssse3_intrinsics=no +AC_MSG_CHECKING(whether to use SSSE3 intrinsics) +xserver_save_CFLAGS=$CFLAGS +CFLAGS="$SSSE3_CFLAGS $CFLAGS" + +AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ +#include +#include +#include +#include +int main () { + __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c; + c = _mm_maddubs_epi16 (a, b); + return 0; +}]])], have_ssse3_intrinsics=yes) +CFLAGS=$xserver_save_CFLAGS + +AC_ARG_ENABLE(ssse3, + [AC_HELP_STRING([--disable-ssse3], + [disable SSSE3 fast paths])], + [enable_ssse3=$enableval], [enable_ssse3=auto]) + +if test $enable_ssse3 = no ; then + have_ssse3_intrinsics=disabled +fi + +if test $have_ssse3_intrinsics = yes ; then + AC_DEFINE(USE_SSSE3, 1, [use SSSE3 compiler intrinsics]) +fi + +AC_MSG_RESULT($have_ssse3_intrinsics) +if test $enable_ssse3 = yes && test $have_ssse3_intrinsics = no ; then + AC_MSG_ERROR([SSSE3 intrinsics not detected]) +fi + +AM_CONDITIONAL(USE_SSSE3, test $have_ssse3_intrinsics = yes) + dnl =========================================================================== dnl Other special flags needed when building code using MMX or SSE instructions case $host_os in @@ -471,6 +515,7 @@ AC_SUBST(MMX_CFLAGS) AC_SUBST(MMX_LDFLAGS) AC_SUBST(SSE2_CFLAGS) AC_SUBST(SSE2_LDFLAGS) +AC_SUBST(SSSE3_CFLAGS) dnl =========================================================================== dnl Check for VMX/Altivec diff --git a/pixman/pixman/Makefile.am b/pixman/pixman/Makefile.am index b9ea75424..b376d9aeb 100644 --- a/pixman/pixman/Makefile.am +++ b/pixman/pixman/Makefile.am @@ -52,6 +52,18 @@ libpixman_1_la_LIBADD += libpixman-sse2.la ASM_CFLAGS_sse2=$(SSE2_CFLAGS) endif +# ssse3 code +if USE_SSSE3 +noinst_LTLIBRARIES += libpixman-ssse3.la +libpixman_ssse3_la_SOURCES = \ + pixman-ssse3.c +libpixman_ssse3_la_CFLAGS = $(SSSE3_CFLAGS) +libpixman_1_la_LDFLAGS += $(SSSE3_LDFLAGS) +libpixman_1_la_LIBADD += libpixman-ssse3.la + +ASM_CFLAGS_ssse3=$(SSSE3_CFLAGS) +endif + # arm simd code if USE_ARM_SIMD noinst_LTLIBRARIES += libpixman-arm-simd.la diff --git a/pixman/pixman/pixman-general.c b/pixman/pixman/pixman-general.c index 6310bff9d..a653fa71a 100644 --- a/pixman/pixman/pixman-general.c +++ b/pixman/pixman/pixman-general.c @@ -114,7 +114,7 @@ general_composite_rect (pixman_implementation_t *imp, pixman_composite_info_t *info) { PIXMAN_COMPOSITE_ARGS (info); - uint64_t stack_scanline_buffer[(SCANLINE_BUFFER_LENGTH * 3 + 7) / 8]; + uint8_t stack_scanline_buffer[3 * SCANLINE_BUFFER_LENGTH]; uint8_t *scanline_buffer = (uint8_t *) stack_scanline_buffer; uint8_t *src_buffer, *mask_buffer, *dest_buffer; pixman_iter_t src_iter, mask_iter, dest_iter; @@ -137,17 +137,25 @@ general_composite_rect (pixman_implementation_t *imp, Bpp = 16; } - if (width * Bpp > SCANLINE_BUFFER_LENGTH) +#define ALIGN(addr) \ + ((uint8_t *)((((uintptr_t)(addr)) + 15) & (~15))) + + src_buffer = ALIGN (scanline_buffer); + mask_buffer = ALIGN (src_buffer + width * Bpp); + dest_buffer = ALIGN (mask_buffer + width * Bpp); + + if (ALIGN (dest_buffer + width * Bpp) > + scanline_buffer + sizeof (stack_scanline_buffer)) { - scanline_buffer = pixman_malloc_abc (width, 3, Bpp); + scanline_buffer = pixman_malloc_ab_plus_c (width, Bpp * 3, 32 * 3); if (!scanline_buffer) return; - } - src_buffer = scanline_buffer; - mask_buffer = src_buffer + width * Bpp; - dest_buffer = mask_buffer + width * Bpp; + src_buffer = ALIGN (scanline_buffer); + mask_buffer = ALIGN (src_buffer + width * Bpp); + dest_buffer = ALIGN (mask_buffer + width * Bpp); + } if (width_flag == ITER_WIDE) { diff --git a/pixman/pixman/pixman-private.h b/pixman/pixman/pixman-private.h index 120196ddf..6ca13b216 100644 --- a/pixman/pixman/pixman-private.h +++ b/pixman/pixman/pixman-private.h @@ -593,6 +593,11 @@ pixman_implementation_t * _pixman_implementation_create_sse2 (pixman_implementation_t *fallback); #endif +#ifdef USE_SSSE3 +pixman_implementation_t * +_pixman_implementation_create_ssse3 (pixman_implementation_t *fallback); +#endif + #ifdef USE_ARM_SIMD pixman_implementation_t * _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback); @@ -787,6 +792,9 @@ pixman_malloc_ab (unsigned int n, unsigned int b); void * pixman_malloc_abc (unsigned int a, unsigned int b, unsigned int c); +void * +pixman_malloc_ab_plus_c (unsigned int a, unsigned int b, unsigned int c); + pixman_bool_t _pixman_multiply_overflows_size (size_t a, size_t b); diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c index a629565ef..42c720938 100644 --- a/pixman/pixman/pixman-sse2.c +++ b/pixman/pixman/pixman-sse2.c @@ -30,6 +30,9 @@ #include #endif +/* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */ +#define PSHUFD_IS_FAST 0 + #include /* for _mm_shuffle_pi16 and _MM_SHUFFLE */ #include /* for SSE2 intrinsics */ #include "pixman-private.h" @@ -5554,50 +5557,134 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER, scaled_nearest_scanline_sse2_8888_n_8888_OVER, uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE) +#if PSHUFD_IS_FAST + +/***********************************************************************************/ + # define BILINEAR_DECLARE_VARIABLES \ const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \ const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \ const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \ - const __m128i xmm_ux = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x, \ + const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x, \ + unit_x, -unit_x, unit_x, -unit_x); \ + const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4, \ + unit_x * 4, -unit_x * 4, \ + unit_x * 4, -unit_x * 4, \ + unit_x * 4, -unit_x * 4); \ + const __m128i xmm_zero = _mm_setzero_si128 (); \ + __m128i xmm_x = _mm_set_epi16 (vx + unit_x * 3, -(vx + 1) - unit_x * 3, \ + vx + unit_x * 2, -(vx + 1) - unit_x * 2, \ + vx + unit_x * 1, -(vx + 1) - unit_x * 1, \ + vx + unit_x * 0, -(vx + 1) - unit_x * 0); \ + __m128i xmm_wh_state; + +#define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase_) \ +do { \ + int phase = phase_; \ + __m128i xmm_wh, xmm_a, xmm_b; \ + /* fetch 2x2 pixel block into sse2 registers */ \ + __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]); \ + __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]); \ + vx += unit_x; \ + /* vertical interpolation */ \ + xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt); \ + xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb); \ + xmm_a = _mm_add_epi16 (xmm_a, xmm_b); \ + /* calculate horizontal weights */ \ + if (phase <= 0) \ + { \ + xmm_wh_state = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x, \ + 16 - BILINEAR_INTERPOLATION_BITS)); \ + xmm_x = _mm_add_epi16 (xmm_x, (phase < 0) ? xmm_ux1 : xmm_ux4); \ + phase = 0; \ + } \ + xmm_wh = _mm_shuffle_epi32 (xmm_wh_state, _MM_SHUFFLE (phase, phase, \ + phase, phase)); \ + /* horizontal interpolation */ \ + xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \ + xmm_a, _MM_SHUFFLE (1, 0, 3, 2)), xmm_a), xmm_wh); \ + /* shift the result */ \ + pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2); \ +} while (0) + +#else /************************************************************************/ + +# define BILINEAR_DECLARE_VARIABLES \ + const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \ + const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \ + const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \ + const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x, \ unit_x, -unit_x, unit_x, -unit_x); \ + const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4, \ + unit_x * 4, -unit_x * 4, \ + unit_x * 4, -unit_x * 4, \ + unit_x * 4, -unit_x * 4); \ const __m128i xmm_zero = _mm_setzero_si128 (); \ __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1), \ vx, -(vx + 1), vx, -(vx + 1)) -#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \ +#define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase) \ do { \ - __m128i xmm_wh, a; \ + __m128i xmm_wh, xmm_a, xmm_b; \ + (void)xmm_ux4; /* suppress warning: unused variable 'xmm_ux4' */ \ /* fetch 2x2 pixel block into sse2 registers */ \ - __m128i tltr = _mm_loadl_epi64 ( \ - (__m128i *)&src_top[pixman_fixed_to_int (vx)]); \ - __m128i blbr = _mm_loadl_epi64 ( \ - (__m128i *)&src_bottom[pixman_fixed_to_int (vx)]); \ + __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]); \ + __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]); \ vx += unit_x; \ /* vertical interpolation */ \ - a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), \ - xmm_wt), \ - _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), \ - xmm_wb)); \ + xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt); \ + xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb); \ + xmm_a = _mm_add_epi16 (xmm_a, xmm_b); \ /* calculate horizontal weights */ \ xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x, \ - 16 - BILINEAR_INTERPOLATION_BITS)); \ - xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ + 16 - BILINEAR_INTERPOLATION_BITS)); \ + xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1); \ /* horizontal interpolation */ \ - a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \ - a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh); \ - /* shift and pack the result */ \ - a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2); \ - a = _mm_packs_epi32 (a, a); \ - a = _mm_packus_epi16 (a, a); \ - pix = _mm_cvtsi128_si32 (a); \ + xmm_b = _mm_unpacklo_epi64 (/* any value is fine here */ xmm_b, xmm_a); \ + xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (xmm_b, xmm_a), xmm_wh); \ + /* shift the result */ \ + pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2); \ } while (0) +/***********************************************************************************/ + +#endif + +#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix); \ +do { \ + __m128i xmm_pix; \ + BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix, -1); \ + xmm_pix = _mm_packs_epi32 (xmm_pix, xmm_pix); \ + xmm_pix = _mm_packus_epi16 (xmm_pix, xmm_pix); \ + pix = _mm_cvtsi128_si32 (xmm_pix); \ +} while(0) + +#define BILINEAR_INTERPOLATE_FOUR_PIXELS(pix); \ +do { \ + __m128i xmm_pix1, xmm_pix2, xmm_pix3, xmm_pix4; \ + BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix1, 0); \ + BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix2, 1); \ + BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix3, 2); \ + BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix4, 3); \ + xmm_pix1 = _mm_packs_epi32 (xmm_pix1, xmm_pix2); \ + xmm_pix3 = _mm_packs_epi32 (xmm_pix3, xmm_pix4); \ + pix = _mm_packus_epi16 (xmm_pix1, xmm_pix3); \ +} while(0) + #define BILINEAR_SKIP_ONE_PIXEL() \ do { \ vx += unit_x; \ - xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ + xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1); \ } while(0) +#define BILINEAR_SKIP_FOUR_PIXELS() \ +do { \ + vx += unit_x * 4; \ + xmm_x = _mm_add_epi16 (xmm_x, xmm_ux4); \ +} while(0) + +/***********************************************************************************/ + static force_inline void scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst, const uint32_t * mask, @@ -5606,24 +5693,28 @@ scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst, int32_t w, int wt, int wb, - pixman_fixed_t vx, - pixman_fixed_t unit_x, + pixman_fixed_t vx_, + pixman_fixed_t unit_x_, pixman_fixed_t max_vx, pixman_bool_t zero_src) { + intptr_t vx = vx_; + intptr_t unit_x = unit_x_; BILINEAR_DECLARE_VARIABLES; - uint32_t pix1, pix2, pix3, pix4; + uint32_t pix1, pix2; - while ((w -= 4) >= 0) + while (w && ((uintptr_t)dst & 15)) { BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); - BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); - BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); - BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); *dst++ = pix1; - *dst++ = pix2; - *dst++ = pix3; - *dst++ = pix4; + w--; + } + + while ((w -= 4) >= 0) { + __m128i xmm_src; + BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src); + _mm_store_si128 ((__m128i *)dst, xmm_src); + dst += 4; } if (w & 2) @@ -5667,13 +5758,15 @@ scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst, int32_t w, int wt, int wb, - pixman_fixed_t vx, - pixman_fixed_t unit_x, + pixman_fixed_t vx_, + pixman_fixed_t unit_x_, pixman_fixed_t max_vx, pixman_bool_t zero_src) { + intptr_t vx = vx_; + intptr_t unit_x = unit_x_; BILINEAR_DECLARE_VARIABLES; - uint32_t pix1, pix2, pix3, pix4; + uint32_t pix1, pix2; while (w && ((uintptr_t)dst & 15)) { @@ -5695,12 +5788,7 @@ scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst, __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo; __m128i xmm_alpha_hi, xmm_alpha_lo; - BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); - BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); - BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); - BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); - - xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1); + BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src); if (!is_zero (xmm_src)) { @@ -5767,13 +5855,15 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst, int32_t w, int wt, int wb, - pixman_fixed_t vx, - pixman_fixed_t unit_x, + pixman_fixed_t vx_, + pixman_fixed_t unit_x_, pixman_fixed_t max_vx, pixman_bool_t zero_src) { + intptr_t vx = vx_; + intptr_t unit_x = unit_x_; BILINEAR_DECLARE_VARIABLES; - uint32_t pix1, pix2, pix3, pix4; + uint32_t pix1, pix2; uint32_t m; while (w && ((uintptr_t)dst & 15)) @@ -5824,12 +5914,7 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst, if (m) { - BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); - BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); - BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); - BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); - - xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1); + BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src); if (m == 0xffffffff && is_opaque (xmm_src)) { @@ -5856,10 +5941,7 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst, } else { - BILINEAR_SKIP_ONE_PIXEL (); - BILINEAR_SKIP_ONE_PIXEL (); - BILINEAR_SKIP_ONE_PIXEL (); - BILINEAR_SKIP_ONE_PIXEL (); + BILINEAR_SKIP_FOUR_PIXELS (); } w -= 4; @@ -5931,13 +6013,15 @@ scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t * dst, int32_t w, int wt, int wb, - pixman_fixed_t vx, - pixman_fixed_t unit_x, + pixman_fixed_t vx_, + pixman_fixed_t unit_x_, pixman_fixed_t max_vx, pixman_bool_t zero_src) { + intptr_t vx = vx_; + intptr_t unit_x = unit_x_; BILINEAR_DECLARE_VARIABLES; - uint32_t pix1, pix2, pix3, pix4; + uint32_t pix1; __m128i xmm_mask; if (zero_src || (*mask >> 24) == 0) @@ -5967,19 +6051,15 @@ scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t * dst, while (w >= 4) { - BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); - BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); - BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); - BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); + __m128i xmm_src; + BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src); - if (pix1 | pix2 | pix3 | pix4) + if (!is_zero (xmm_src)) { - __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; __m128i xmm_alpha_lo, xmm_alpha_hi; - xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1); - xmm_dst = load_128_aligned ((__m128i*)dst); unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); diff --git a/pixman/pixman/pixman-ssse3.c b/pixman/pixman/pixman-ssse3.c new file mode 100644 index 000000000..34763e20b --- /dev/null +++ b/pixman/pixman/pixman-ssse3.c @@ -0,0 +1,362 @@ +/* + * Copyright © 2013 Soren Sandmann Pedersen + * Copyright © 2013 Red Hat, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Author: Soren Sandmann (soren.sandmann@gmail.com) + */ +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include +#include "pixman-private.h" +#include "pixman-inlines.h" + +typedef struct +{ + int y; + uint64_t * buffer; +} line_t; + +typedef struct +{ + line_t line0; + line_t line1; + pixman_fixed_t y; + pixman_fixed_t x; + uint64_t data[1]; +} bilinear_info_t; + +static void +ssse3_fetch_horizontal (bits_image_t *image, line_t *line, + int y, pixman_fixed_t x, pixman_fixed_t ux, int n) +{ + uint32_t *bits = image->bits + y * image->rowstride; + __m128i vx = _mm_set_epi16 ( + - (x + 1), x, - (x + 1), x, + - (x + ux + 1), x + ux, - (x + ux + 1), x + ux); + __m128i vux = _mm_set_epi16 ( + - 2 * ux, 2 * ux, - 2 * ux, 2 * ux, + - 2 * ux, 2 * ux, - 2 * ux, 2 * ux); + __m128i vaddc = _mm_set_epi16 (1, 0, 1, 0, 1, 0, 1, 0); + __m128i *b = (__m128i *)line->buffer; + __m128i vrl0, vrl1; + + while ((n -= 2) >= 0) + { + __m128i vw, vr, s; + + vrl1 = _mm_loadl_epi64 ( + (__m128i *)(bits + pixman_fixed_to_int (x + ux))); + /* vrl1: R1, L1 */ + + final_pixel: + vrl0 = _mm_loadl_epi64 ( + (__m128i *)(bits + pixman_fixed_to_int (x))); + /* vrl0: R0, L0 */ + + /* The weights are based on vx which is a vector of + * + * - (x + 1), x, - (x + 1), x, + * - (x + ux + 1), x + ux, - (x + ux + 1), x + ux + * + * so the 16 bit weights end up like this: + * + * iw0, w0, iw0, w0, iw1, w1, iw1, w1 + * + * and after shifting and packing, we get these bytes: + * + * iw0, w0, iw0, w0, iw1, w1, iw1, w1, + * iw0, w0, iw0, w0, iw1, w1, iw1, w1, + * + * which means the first and the second input pixel + * have to be interleaved like this: + * + * la0, ra0, lr0, rr0, la1, ra1, lr1, rr1, + * lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1 + * + * before maddubsw can be used. + */ + + vw = _mm_add_epi16 ( + vaddc, _mm_srli_epi16 (vx, 16 - BILINEAR_INTERPOLATION_BITS)); + /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1 + */ + + vw = _mm_packus_epi16 (vw, vw); + /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1, + * iw0, w0, iw0, w0, iw1, w1, iw1, w1 + */ + vx = _mm_add_epi16 (vx, vux); + + x += 2 * ux; + + vr = _mm_unpacklo_epi16 (vrl1, vrl0); + /* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */ + + s = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (1, 0, 3, 2)); + /* s: lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */ + + vr = _mm_unpackhi_epi8 (vr, s); + /* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1, + * lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1 + */ + + vr = _mm_maddubs_epi16 (vr, vw); + + /* When the weight is 0, the inverse weight is + * 128 which can't be represented in a signed byte. + * As a result maddubsw computes the following: + * + * r = l * -128 + r * 0 + * + * rather than the desired + * + * r = l * 128 + r * 0 + * + * We fix this by taking the absolute value of the + * result. + */ + vr = _mm_abs_epi16 (vr); + + /* vr: A0, R0, A1, R1, G0, B0, G1, B1 */ + _mm_store_si128 (b++, vr); + } + + if (n == -1) + { + vrl1 = _mm_setzero_si128(); + goto final_pixel; + } + + line->y = y; +} + +static uint32_t * +ssse3_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask) +{ + pixman_fixed_t fx, ux; + bilinear_info_t *info = iter->data; + line_t *line0, *line1; + int y0, y1; + int32_t dist_y; + __m128i vw; + int i; + + fx = info->x; + ux = iter->image->common.transform->matrix[0][0]; + + y0 = pixman_fixed_to_int (info->y); + y1 = y0 + 1; + + line0 = &info->line0; + line1 = &info->line1; + + if (line0->y != y0 || line1->y != y1) + { + if (line0->y == y1 || line1->y == y0) + { + line_t tmp = *line0; + *line0 = *line1; + *line1 = tmp; + } + + if (line0->y != y0) + { + ssse3_fetch_horizontal ( + &iter->image->bits, line0, y0, fx, ux, iter->width); + } + + if (line1->y != y1) + { + ssse3_fetch_horizontal ( + &iter->image->bits, line1, y1, fx, ux, iter->width); + } + } + + dist_y = pixman_fixed_to_bilinear_weight (info->y); + dist_y <<= (16 - BILINEAR_INTERPOLATION_BITS); + + vw = _mm_set_epi16 ( + dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y); + + for (i = 0; i + 3 < iter->width; i += 4) + { + __m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i)); + __m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i)); + __m128i top1 = _mm_load_si128 ((__m128i *)(line0->buffer + i + 2)); + __m128i bot1 = _mm_load_si128 ((__m128i *)(line1->buffer + i + 2)); + __m128i r0, r1, tmp, p; + + r0 = _mm_mulhi_epu16 ( + _mm_sub_epi16 (bot0, top0), vw); + tmp = _mm_cmplt_epi16 (bot0, top0); + tmp = _mm_and_si128 (tmp, vw); + r0 = _mm_sub_epi16 (r0, tmp); + r0 = _mm_add_epi16 (r0, top0); + r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS); + /* r0: A0 R0 A1 R1 G0 B0 G1 B1 */ + r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1)); + /* r0: A1 R1 G1 B1 A0 R0 G0 B0 */ + + r1 = _mm_mulhi_epu16 ( + _mm_sub_epi16 (bot1, top1), vw); + tmp = _mm_cmplt_epi16 (bot1, top1); + tmp = _mm_and_si128 (tmp, vw); + r1 = _mm_sub_epi16 (r1, tmp); + r1 = _mm_add_epi16 (r1, top1); + r1 = _mm_srli_epi16 (r1, BILINEAR_INTERPOLATION_BITS); + r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1)); + /* r1: A3 R3 G3 B3 A2 R2 G2 B2 */ + + p = _mm_packus_epi16 (r0, r1); + + _mm_storeu_si128 ((__m128i *)(iter->buffer + i), p); + } + + while (i < iter->width) + { + __m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i)); + __m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i)); + __m128i r0, tmp, p; + + r0 = _mm_mulhi_epu16 ( + _mm_sub_epi16 (bot0, top0), vw); + tmp = _mm_cmplt_epi16 (bot0, top0); + tmp = _mm_and_si128 (tmp, vw); + r0 = _mm_sub_epi16 (r0, tmp); + r0 = _mm_add_epi16 (r0, top0); + r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS); + /* r0: A0 R0 A1 R1 G0 B0 G1 B1 */ + r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1)); + /* r0: A1 R1 G1 B1 A0 R0 G0 B0 */ + + p = _mm_packus_epi16 (r0, r0); + + if (iter->width - i == 1) + { + *(uint32_t *)(iter->buffer + i) = _mm_cvtsi128_si32 (p); + i++; + } + else + { + _mm_storel_epi64 ((__m128i *)(iter->buffer + i), p); + i += 2; + } + } + + info->y += iter->image->common.transform->matrix[1][1]; + + return iter->buffer; +} + +static void +ssse3_bilinear_cover_iter_fini (pixman_iter_t *iter) +{ + free (iter->data); +} + +static void +ssse3_bilinear_cover_iter_init (pixman_iter_t *iter, const pixman_iter_info_t *iter_info) +{ + int width = iter->width; + bilinear_info_t *info; + pixman_vector_t v; + + /* Reference point is the center of the pixel */ + v.vector[0] = pixman_int_to_fixed (iter->x) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (iter->y) + pixman_fixed_1 / 2; + v.vector[2] = pixman_fixed_1; + + if (!pixman_transform_point_3d (iter->image->common.transform, &v)) + goto fail; + + info = malloc (sizeof (*info) + (2 * width - 1) * sizeof (uint64_t) + 64); + if (!info) + goto fail; + + info->x = v.vector[0] - pixman_fixed_1 / 2; + info->y = v.vector[1] - pixman_fixed_1 / 2; + +#define ALIGN(addr) \ + ((void *)((((uintptr_t)(addr)) + 15) & (~15))) + + /* It is safe to set the y coordinates to -1 initially + * because COVER_CLIP_BILINEAR ensures that we will only + * be asked to fetch lines in the [0, height) interval + */ + info->line0.y = -1; + info->line0.buffer = ALIGN (&(info->data[0])); + info->line1.y = -1; + info->line1.buffer = ALIGN (info->line0.buffer + width); + + iter->get_scanline = ssse3_fetch_bilinear_cover; + iter->fini = ssse3_bilinear_cover_iter_fini; + + iter->data = info; + return; + +fail: + /* Something went wrong, either a bad matrix or OOM; in such cases, + * we don't guarantee any particular rendering. + */ + _pixman_log_error ( + FUNC, "Allocation failure or bad matrix, skipping rendering\n"); + + iter->get_scanline = _pixman_iter_get_scanline_noop; + iter->fini = NULL; +} + +static const pixman_iter_info_t ssse3_iters[] = +{ + { PIXMAN_a8r8g8b8, + (FAST_PATH_STANDARD_FLAGS | + FAST_PATH_SCALE_TRANSFORM | + FAST_PATH_BILINEAR_FILTER | + FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR), + ITER_NARROW | ITER_SRC, + ssse3_bilinear_cover_iter_init, + NULL, NULL + }, + + { PIXMAN_null }, +}; + +static const pixman_fast_path_t ssse3_fast_paths[] = +{ + { PIXMAN_OP_NONE }, +}; + +pixman_implementation_t * +_pixman_implementation_create_ssse3 (pixman_implementation_t *fallback) +{ + pixman_implementation_t *imp = + _pixman_implementation_create (fallback, ssse3_fast_paths); + + imp->iter_info = ssse3_iters; + + return imp; +} diff --git a/pixman/pixman/pixman-utils.c b/pixman/pixman/pixman-utils.c index 98723a800..4a3a835c4 100644 --- a/pixman/pixman/pixman-utils.c +++ b/pixman/pixman/pixman-utils.c @@ -48,6 +48,15 @@ _pixman_addition_overflows_int (unsigned int a, unsigned int b) return a > INT32_MAX - b; } +void * +pixman_malloc_ab_plus_c (unsigned int a, unsigned int b, unsigned int c) +{ + if (!b || a >= INT32_MAX / b || (a * b) > INT32_MAX - c) + return NULL; + + return malloc (a * b + c); +} + void * pixman_malloc_ab (unsigned int a, unsigned int b) diff --git a/pixman/pixman/pixman-x86.c b/pixman/pixman/pixman-x86.c index 57e4d1f35..652776021 100644 --- a/pixman/pixman/pixman-x86.c +++ b/pixman/pixman/pixman-x86.c @@ -25,7 +25,7 @@ #include "pixman-private.h" -#if defined(USE_X86_MMX) || defined (USE_SSE2) +#if defined(USE_X86_MMX) || defined (USE_SSE2) || defined (USE_SSSE3) /* The CPU detection code needs to be in a file not compiled with * "-mmmx -msse", as gcc would generate CMOV instructions otherwise @@ -39,7 +39,8 @@ typedef enum X86_MMX_EXTENSIONS = (1 << 1), X86_SSE = (1 << 2) | X86_MMX_EXTENSIONS, X86_SSE2 = (1 << 3), - X86_CMOV = (1 << 4) + X86_CMOV = (1 << 4), + X86_SSSE3 = (1 << 5) } cpu_features_t; #ifdef HAVE_GETISAX @@ -64,6 +65,8 @@ detect_cpu_features (void) features |= X86_SSE; if (result & AV_386_SSE2) features |= X86_SSE2; + if (result & AV_386_SSSE3) + features |= X86_SSSE3; } return features; @@ -167,6 +170,8 @@ detect_cpu_features (void) features |= X86_SSE; if (d & (1 << 26)) features |= X86_SSE2; + if (d & (1 << 9)) + features |= X86_SSSE3; /* Check for AMD specific features */ if ((features & X86_MMX) && !(features & X86_SSE)) @@ -222,6 +227,7 @@ _pixman_x86_get_implementations (pixman_implementation_t *imp) { #define MMX_BITS (X86_MMX | X86_MMX_EXTENSIONS) #define SSE2_BITS (X86_MMX | X86_MMX_EXTENSIONS | X86_SSE | X86_SSE2) +#define SSSE3_BITS (X86_SSE | X86_SSE2 | X86_SSSE3) #ifdef USE_X86_MMX if (!_pixman_disabled ("mmx") && have_feature (MMX_BITS)) @@ -233,5 +239,10 @@ _pixman_x86_get_implementations (pixman_implementation_t *imp) imp = _pixman_implementation_create_sse2 (imp); #endif +#ifdef USE_SSSE3 + if (!_pixman_disabled ("ssse3") && have_feature (SSSE3_BITS)) + imp = _pixman_implementation_create_ssse3 (imp); +#endif + return imp; } diff --git a/pixman/test/scaling-bench.c b/pixman/test/scaling-bench.c index b39adeff5..365e79850 100644 --- a/pixman/test/scaling-bench.c +++ b/pixman/test/scaling-bench.c @@ -3,6 +3,7 @@ #define SOURCE_WIDTH 320 #define SOURCE_HEIGHT 240 +#define TEST_REPEATS 3 static pixman_image_t * make_source (void) @@ -39,30 +40,40 @@ main () "time per pixel / ns"); for (scale = 0.1; scale < 10.005; scale += 0.01) { + int i; int dest_width = SOURCE_WIDTH * scale + 0.5; int dest_height = SOURCE_HEIGHT * scale + 0.5; + int dest_byte_stride = (dest_width * 4 + 15) & ~15; pixman_fixed_t s = (1 / scale) * 65536.0 + 0.5; pixman_transform_t transform; pixman_image_t *dest; - double t1, t2; + double t1, t2, t = -1; + uint32_t *dest_buf = aligned_malloc (16, dest_byte_stride * dest_height); + memset (dest_buf, 0, dest_byte_stride * dest_height); pixman_transform_init_scale (&transform, s, s); pixman_image_set_transform (src, &transform); dest = pixman_image_create_bits ( - PIXMAN_a8r8g8b8, dest_width, dest_height, NULL, -1); + PIXMAN_a8r8g8b8, dest_width, dest_height, dest_buf, dest_byte_stride); + + for (i = 0; i < TEST_REPEATS; i++) + { + t1 = gettime(); + pixman_image_composite ( + PIXMAN_OP_OVER, src, NULL, dest, + scale, scale, 0, 0, 0, 0, dest_width, dest_height); + t2 = gettime(); + if (t < 0 || t2 - t1 < t) + t = t2 - t1; + } - t1 = gettime(); - pixman_image_composite ( - PIXMAN_OP_OVER, src, NULL, dest, - scale, scale, 0, 0, 0, 0, dest_width, dest_height); - t2 = gettime(); - printf ("%6.2f : %4dx%-4d => %4dx%-4d : %12.4f : %12.4f\n", scale, SOURCE_WIDTH, SOURCE_HEIGHT, dest_width, dest_height, - (t2 - t1) * 1000, ((t2 - t1) / (dest_width * dest_height)) * 1000000000); + t * 1000, (t / (dest_width * dest_height)) * 1000000000); pixman_image_unref (dest); + free (dest_buf); } return 0; -- cgit v1.2.3