aboutsummaryrefslogtreecommitdiff
path: root/pixman/pixman/pixman-mmx.c
diff options
context:
space:
mode:
Diffstat (limited to 'pixman/pixman/pixman-mmx.c')
-rw-r--r--pixman/pixman/pixman-mmx.c280
1 files changed, 274 insertions, 6 deletions
diff --git a/pixman/pixman/pixman-mmx.c b/pixman/pixman/pixman-mmx.c
index d869c04c6..5441d6bc2 100644
--- a/pixman/pixman/pixman-mmx.c
+++ b/pixman/pixman/pixman-mmx.c
@@ -42,6 +42,7 @@
#endif
#include "pixman-private.h"
#include "pixman-combine32.h"
+#include "pixman-inlines.h"
#define no_vERBOSE
@@ -694,6 +695,24 @@ combine (const uint32_t *src, const uint32_t *mask)
return vsrc;
}
+static force_inline __m64
+core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
+{
+ vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());
+
+ if (is_opaque (vsrc))
+ {
+ return vsrc;
+ }
+ else if (!is_zero (vsrc))
+ {
+ return over (vsrc, expand_alpha (vsrc),
+ _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
+ }
+
+ return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
+}
+
static void
mmx_combine_over_u (pixman_implementation_t *imp,
pixman_op_t op,
@@ -1599,9 +1618,7 @@ mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
- mask &= 0xff000000;
- mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
- vmask = load8888 (&mask);
+ vmask = expand_alpha (load8888 (&mask));
while (height--)
{
@@ -1670,9 +1687,7 @@ mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
- mask &= 0xff000000;
- mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
- vmask = load8888 (&mask);
+ vmask = expand_alpha (load8888 (&mask));
srca = MC (4x00ff);
while (height--)
@@ -3506,6 +3521,242 @@ mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
_mm_empty ();
}
+#define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
+#define BMSK (BSHIFT - 1)
+
+#define BILINEAR_DECLARE_VARIABLES \
+ const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt); \
+ const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb); \
+ const __m64 mm_BSHIFT = _mm_set_pi16 (BSHIFT, BSHIFT, BSHIFT, BSHIFT); \
+ const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1); \
+ const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK); \
+ const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x); \
+ const __m64 mm_zero = _mm_setzero_si64 (); \
+ __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)
+
+#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \
+do { \
+ /* fetch 2x2 pixel block into 2 mmx registers */ \
+ __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]); \
+ __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]); \
+ vx += unit_x; \
+ /* vertical interpolation */ \
+ __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt); \
+ __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt); \
+ __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb); \
+ __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb); \
+ __m64 hi = _mm_add_pi16 (t_hi, b_hi); \
+ __m64 lo = _mm_add_pi16 (t_lo, b_lo); \
+ if (BILINEAR_INTERPOLATION_BITS < 8) \
+ { \
+ /* calculate horizontal weights */ \
+ __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7, \
+ _mm_srli_pi16 (mm_x, \
+ 16 - BILINEAR_INTERPOLATION_BITS))); \
+ mm_x = _mm_add_pi16 (mm_x, mm_ux); \
+ /* horizontal interpolation */ \
+ __m64 p = _mm_unpacklo_pi16 (lo, hi); \
+ __m64 q = _mm_unpackhi_pi16 (lo, hi); \
+ lo = _mm_madd_pi16 (p, mm_wh); \
+ hi = _mm_madd_pi16 (q, mm_wh); \
+ } \
+ else \
+ { \
+ /* calculate horizontal weights */ \
+ __m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x, \
+ 16 - BILINEAR_INTERPOLATION_BITS)); \
+ __m64 mm_wh_hi = _mm_srli_pi16 (mm_x, \
+ 16 - BILINEAR_INTERPOLATION_BITS); \
+ mm_x = _mm_add_pi16 (mm_x, mm_ux); \
+ /* horizontal interpolation */ \
+ __m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo); \
+ __m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi); \
+ __m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo); \
+ __m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi); \
+ lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo), \
+ _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi)); \
+ hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo), \
+ _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi)); \
+ } \
+ /* shift and pack the result */ \
+ hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2); \
+ lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2); \
+ lo = _mm_packs_pi32 (lo, hi); \
+ lo = _mm_packs_pu16 (lo, lo); \
+ pix = lo; \
+} while (0)
+
+#define BILINEAR_SKIP_ONE_PIXEL() \
+do { \
+ vx += unit_x; \
+ mm_x = _mm_add_pi16 (mm_x, mm_ux); \
+} while(0)
+
+static force_inline void
+scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t * dst,
+ const uint32_t * mask,
+ const uint32_t * src_top,
+ const uint32_t * src_bottom,
+ int32_t w,
+ int wt,
+ int wb,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx,
+ pixman_bool_t zero_src)
+{
+ BILINEAR_DECLARE_VARIABLES;
+ __m64 pix;
+
+ while (w--)
+ {
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
+ store (dst, pix);
+ dst++;
+ }
+
+ _mm_empty ();
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
+ scaled_bilinear_scanline_mmx_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
+ scaled_bilinear_scanline_mmx_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
+ scaled_bilinear_scanline_mmx_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
+ scaled_bilinear_scanline_mmx_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ NORMAL, FLAG_NONE)
+
+static force_inline void
+scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t * dst,
+ const uint32_t * mask,
+ const uint32_t * src_top,
+ const uint32_t * src_bottom,
+ int32_t w,
+ int wt,
+ int wb,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx,
+ pixman_bool_t zero_src)
+{
+ BILINEAR_DECLARE_VARIABLES;
+ __m64 pix1, pix2;
+
+ while (w)
+ {
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+
+ if (!is_zero (pix1))
+ {
+ pix2 = load (dst);
+ store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
+ }
+
+ w--;
+ dst++;
+ }
+
+ _mm_empty ();
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
+ scaled_bilinear_scanline_mmx_8888_8888_OVER,
+ uint32_t, uint32_t, uint32_t,
+ COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
+ scaled_bilinear_scanline_mmx_8888_8888_OVER,
+ uint32_t, uint32_t, uint32_t,
+ PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
+ scaled_bilinear_scanline_mmx_8888_8888_OVER,
+ uint32_t, uint32_t, uint32_t,
+ NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
+ scaled_bilinear_scanline_mmx_8888_8888_OVER,
+ uint32_t, uint32_t, uint32_t,
+ NORMAL, FLAG_NONE)
+
+static force_inline void
+scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t * dst,
+ const uint8_t * mask,
+ const uint32_t * src_top,
+ const uint32_t * src_bottom,
+ int32_t w,
+ int wt,
+ int wb,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx,
+ pixman_bool_t zero_src)
+{
+ BILINEAR_DECLARE_VARIABLES;
+ __m64 pix1, pix2;
+ uint32_t m;
+
+ while (w)
+ {
+ m = (uint32_t) *mask++;
+
+ if (m)
+ {
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+
+ if (m == 0xff && is_opaque (pix1))
+ {
+ store (dst, pix1);
+ }
+ else
+ {
+ __m64 ms, md, ma, msa;
+
+ pix2 = load (dst);
+ ma = expand_alpha_rev (to_m64 (m));
+ ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
+ md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());
+
+ msa = expand_alpha (ms);
+
+ store8888 (dst, (in_over (ms, msa, ma, md)));
+ }
+ }
+ else
+ {
+ BILINEAR_SKIP_ONE_PIXEL ();
+ }
+
+ w--;
+ dst++;
+ }
+
+ _mm_empty ();
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
+ scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
+ uint32_t, uint8_t, uint32_t,
+ COVER, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
+ scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
+ uint32_t, uint8_t, uint32_t,
+ PAD, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
+ scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
+ uint32_t, uint8_t, uint32_t,
+ NONE, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
+ scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
+ uint32_t, uint8_t, uint32_t,
+ NORMAL, FLAG_HAVE_NON_SOLID_MASK)
+
static uint32_t *
mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
{
@@ -3761,6 +4012,23 @@ static const pixman_fast_path_t mmx_fast_paths[] =
PIXMAN_STD_FAST_PATH (IN, a8, null, a8, mmx_composite_in_8_8 ),
PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, mmx_composite_in_n_8_8 ),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
+
+ SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
+ SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
+ SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
+ SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ),
+
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888 ),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888 ),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888 ),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888 ),
+
{ PIXMAN_OP_NONE },
};