diff options
author | marha <marha@users.sourceforge.net> | 2011-01-17 09:16:52 +0000 |
---|---|---|
committer | marha <marha@users.sourceforge.net> | 2011-01-17 09:16:52 +0000 |
commit | 260ff71e2ae7eac8d6856523bf4d92ac8e2c507f (patch) | |
tree | a41d3070b8230419f4e60e1725f00dd8a9245317 | |
parent | a690eedbb713ab50331ca068dde4a2f530c3ad55 (diff) | |
parent | 9845f4671b0f731f868ea0f39844114ab0f70a3c (diff) | |
download | vcxsrv-260ff71e2ae7eac8d6856523bf4d92ac8e2c507f.tar.gz vcxsrv-260ff71e2ae7eac8d6856523bf4d92ac8e2c507f.tar.bz2 vcxsrv-260ff71e2ae7eac8d6856523bf4d92ac8e2c507f.zip |
svn merge ^/branches/released .
-rw-r--r-- | pixman/configure.ac | 1580 | ||||
-rw-r--r-- | pixman/pixman/pixman-bits-image.c | 2884 | ||||
-rw-r--r-- | pixman/pixman/pixman-compiler.h | 431 | ||||
-rw-r--r-- | pixman/pixman/pixman-cpu.c | 1201 | ||||
-rw-r--r-- | pixman/pixman/pixman-fast-path.c | 3874 | ||||
-rw-r--r-- | pixman/pixman/pixman-fast-path.h | 894 | ||||
-rw-r--r-- | pixman/pixman/pixman-matrix.c | 3 | ||||
-rw-r--r-- | pixman/pixman/pixman-sse2.c | 12062 | ||||
-rw-r--r-- | pixman/test/Makefile.am | 21 |
9 files changed, 11496 insertions, 11454 deletions
diff --git a/pixman/configure.ac b/pixman/configure.ac index ac0d16158..6552f1270 100644 --- a/pixman/configure.ac +++ b/pixman/configure.ac @@ -1,783 +1,797 @@ -dnl Copyright 2005 Red Hat, Inc.
-dnl
-dnl Permission to use, copy, modify, distribute, and sell this software and its
-dnl documentation for any purpose is hereby granted without fee, provided that
-dnl the above copyright notice appear in all copies and that both that
-dnl copyright notice and this permission notice appear in supporting
-dnl documentation, and that the name of Red Hat not be used in
-dnl advertising or publicity pertaining to distribution of the software without
-dnl specific, written prior permission. Red Hat makes no
-dnl representations about the suitability of this software for any purpose. It
-dnl is provided "as is" without express or implied warranty.
-dnl
-dnl RED HAT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
-dnl INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
-dnl EVENT SHALL RED HAT BE LIABLE FOR ANY SPECIAL, INDIRECT OR
-dnl CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
-dnl DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-dnl TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-dnl PERFORMANCE OF THIS SOFTWARE.
-dnl
-dnl Process this file with autoconf to create configure.
-
-AC_PREREQ([2.57])
-
-# Pixman versioning scheme
-#
-# - The version in git has an odd MICRO version number
-#
-# - Released versions both development and stable have an even MICRO
-# version number
-#
-# - Released development versions have an odd MINOR number
-#
-# - Released stable versions have an even MINOR number
-#
-# - Versions that break ABI must have a new MAJOR number
-#
-# - If you break the ABI, then at least this must be done:
-#
-# - increment MAJOR
-#
-# - In the first development release where you break ABI, find
-# all instances of "pixman-n" and change them to pixman-(n+1)
-#
-# This needs to be done at least in
-# configure.ac
-# all Makefile.am's
-# pixman-n.pc.in
-#
-# This ensures that binary incompatible versions can be installed
-# in parallel. See http://www106.pair.com/rhp/parallel.html for
-# more information
-#
-
-m4_define([pixman_major], 0)
-m4_define([pixman_minor], 21)
-m4_define([pixman_micro], 3)
-
-m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
-
-AC_INIT(pixman, pixman_version, [pixman@lists.freedesktop.org], pixman)
-AM_INIT_AUTOMAKE([foreign dist-bzip2])
-
-# Suppress verbose compile lines
-m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
-
-AM_CONFIG_HEADER(config.h)
-
-AC_CANONICAL_HOST
-
-test_CFLAGS=${CFLAGS+set} # We may override autoconf default CFLAGS.
-
-AC_PROG_CC
-AM_PROG_AS
-AC_PROG_LIBTOOL
-AC_CHECK_FUNCS([getisax])
-AC_C_BIGENDIAN
-AC_C_INLINE
-
-dnl PIXMAN_LINK_WITH_ENV(env-setup, program, true-action, false-action)
-dnl
-dnl Compiles and links the given program in the environment setup by env-setup
-dnl and executes true-action on success and false-action on failure.
-AC_DEFUN([PIXMAN_LINK_WITH_ENV],[dnl
- save_CFLAGS="$CFLAGS"
- save_LDFLAGS="$LDFLAGS"
- save_LIBS="$LIBS"
- CFLAGS=""
- LDFLAGS=""
- LIBS=""
- $1
- AC_LINK_IFELSE(
- [$2],
- [pixman_cc_stderr=`test -f conftest.err && cat conftest.err`
- pixman_cc_flag=yes],
- [pixman_cc_stderr=`test -f conftest.err && cat conftest.err`
- pixman_cc_flag=no])
-
- if test "x$pixman_cc_stderr" != "x"; then
- pixman_cc_flag=no
- fi
-
- if test "x$pixman_cc_flag" = "xyes"; then
- ifelse([$3], , :, [$3])
- else
- ifelse([$4], , :, [$4])
- fi
- CFLAGS="$save_CFLAGS"
- LDFLAGS="$save_LDFLAGS"
- LIBS="$save_LIBS"
-])
-
-dnl Find a -Werror for catching warnings.
-WERROR=
-for w in -Werror -errwarn; do
- if test "z$WERROR" = "z"; then
- AC_MSG_CHECKING([whether the compiler supports $w])
- PIXMAN_LINK_WITH_ENV(
- [CFLAGS=$w],
- [int main(int c, char **v) { (void)c; (void)v; return 0; }],
- [WERROR=$w; yesno=yes], [yesno=no])
- AC_MSG_RESULT($_yesno)
- fi
-done
-
-dnl PIXMAN_CHECK_CFLAG(flag, [program])
-dnl Adds flag to CFLAGS if the given program links without warnings or errors.
-AC_DEFUN([PIXMAN_CHECK_CFLAG], [dnl
- AC_MSG_CHECKING([whether the compiler supports $1])
- PIXMAN_LINK_WITH_ENV(
- [CFLAGS="$WERROR $1"],
- [$2
- int main(int c, char **v) { (void)c; (void)v; return 0; }
- ],
- [_yesno=yes],
- [_yesno=no])
- if test "x$_yesno" = xyes; then
- CFLAGS="$CFLAGS $1"
- fi
- AC_MSG_RESULT($_yesno)
-])
-
-AC_CHECK_SIZEOF(long)
-
-# Checks for Sun Studio compilers
-AC_CHECK_DECL([__SUNPRO_C], [SUNCC="yes"], [SUNCC="no"])
-AC_CHECK_DECL([__amd64], [AMD64_ABI="yes"], [AMD64_ABI="no"])
-
-# Default CFLAGS to -O -g rather than just the -g from AC_PROG_CC
-# if we're using Sun Studio and neither the user nor a config.site
-# has set CFLAGS.
-if test $SUNCC = yes && \
- test "$test_CFLAGS" == "" && \
- test "$CFLAGS" = "-g"
-then
- CFLAGS="-O -g"
-fi
-
-#
-# We ignore pixman_major in the version here because the major version should
-# always be encoded in the actual library name. Ie., the soname is:
-#
-# pixman-$(pixman_major).0.minor.micro
-#
-m4_define([lt_current], [pixman_minor])
-m4_define([lt_revision], [pixman_micro])
-m4_define([lt_age], [pixman_minor])
-
-LT_VERSION_INFO="lt_current:lt_revision:lt_age"
-
-PIXMAN_VERSION_MAJOR=pixman_major()
-AC_SUBST(PIXMAN_VERSION_MAJOR)
-PIXMAN_VERSION_MINOR=pixman_minor()
-AC_SUBST(PIXMAN_VERSION_MINOR)
-PIXMAN_VERSION_MICRO=pixman_micro()
-AC_SUBST(PIXMAN_VERSION_MICRO)
-
-AC_SUBST(LT_VERSION_INFO)
-
-# Check for dependencies
-
-PIXMAN_CHECK_CFLAG([-Wall])
-PIXMAN_CHECK_CFLAG([-fno-strict-aliasing])
-
-AC_PATH_PROG(PERL, perl, no)
-if test "x$PERL" = xno; then
- AC_MSG_ERROR([Perl is required to build pixman.])
-fi
-AC_SUBST(PERL)
-
-dnl =========================================================================
-dnl OpenMP for the test suite?
-dnl
-
-# Check for OpenMP support (only supported by autoconf >=2.62)
-OPENMP_CFLAGS=
-m4_ifdef([AC_OPENMP], [AC_OPENMP])
-
-m4_define([openmp_test_program],[dnl
-#include <stdio.h>
-
-extern unsigned int lcg_seed;
-#pragma omp threadprivate(lcg_seed)
-unsigned int lcg_seed;
-
-unsigned function(unsigned a, unsigned b)
-{
- lcg_seed ^= b;
- return ((a + b) ^ a ) + lcg_seed;
-}
-
-int main(int argc, char **argv)
-{
- int i;
- int n1 = 0, n2 = argc;
- unsigned checksum = 0;
- int verbose = argv != NULL;
- unsigned (*test_function)(unsigned, unsigned);
- test_function = function;
- #pragma omp parallel for reduction(+:checksum) default(none) \
- shared(n1, n2, test_function, verbose)
- for (i = n1; i < n2; i++)
- {
- unsigned crc = test_function (i, 0);
- if (verbose)
- printf ("%d: %08X\n", i, crc);
- checksum += crc;
- }
- printf("%u\n", checksum);
- return 0;
-}
-])
-
-PIXMAN_LINK_WITH_ENV(
- [CFLAGS="$OPENMP_CFLAGS" LDFLAGS="$OPENMP_CFLAGS"],
- [openmp_test_program],
- [have_openmp=yes],
- [have_openmp=no])
-if test "x$have_openmp" = "xyes"; then
- AC_DEFINE(USE_OPENMP, 1, [use OpenMP in the test suite])
-else
- OPENMP_CFLAGS=""
-fi
-AC_SUBST(OPENMP_CFLAGS)
-
-dnl =========================================================================
-dnl -fvisibility stuff
-
-PIXMAN_CHECK_CFLAG([-fvisibility=hidden], [dnl
-#if defined(__GNUC__) && (__GNUC__ >= 4)
-#ifdef _WIN32
-#error Have -fvisibility but it is ignored and generates a warning
-#endif
-#else
-error Need GCC 4.0 for visibility
-#endif
-])
-
-PIXMAN_CHECK_CFLAG([-xldscope=hidden], [dnl
-#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x550)
-#else
-error Need Sun Studio 8 for visibility
-#endif
-])
-
-dnl ===========================================================================
-dnl Check for MMX
-
-if test "x$MMX_CFLAGS" = "x" ; then
- if test "x$SUNCC" = "xyes"; then
- # Sun Studio doesn't have an -xarch=mmx flag, so we have to use sse
- # but if we're building 64-bit, mmx & sse support is on by default and
- # -xarch=sse throws an error instead
- if test "$AMD64_ABI" = "no" ; then
- MMX_CFLAGS="-xarch=sse"
- fi
- else
- MMX_CFLAGS="-mmmx -Winline"
- fi
-fi
-
-have_mmx_intrinsics=no
-AC_MSG_CHECKING(whether to use MMX intrinsics)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$MMX_CFLAGS $CFLAGS"
-AC_COMPILE_IFELSE([
-#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
-error "Need GCC >= 3.4 for MMX intrinsics"
-#endif
-#include <mmintrin.h>
-int main () {
- __m64 v = _mm_cvtsi32_si64 (1);
- return _mm_cvtsi64_si32 (v);
-}], have_mmx_intrinsics=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(mmx,
- [AC_HELP_STRING([--disable-mmx],
- [disable MMX fast paths])],
- [enable_mmx=$enableval], [enable_mmx=auto])
-
-if test $enable_mmx = no ; then
- have_mmx_intrinsics=disabled
-fi
-
-if test $have_mmx_intrinsics = yes ; then
- AC_DEFINE(USE_MMX, 1, [use MMX compiler intrinsics])
-else
- MMX_CFLAGS=
-fi
-
-AC_MSG_RESULT($have_mmx_intrinsics)
-if test $enable_mmx = yes && test $have_mmx_intrinsics = no ; then
- AC_MSG_ERROR([MMX intrinsics not detected])
-fi
-
-AM_CONDITIONAL(USE_MMX, test $have_mmx_intrinsics = yes)
-
-dnl ===========================================================================
-dnl Check for SSE2
-
-if test "x$SSE2_CFLAGS" = "x" ; then
- if test "x$SUNCC" = "xyes"; then
- # SSE2 is enabled by default in the Sun Studio 64-bit environment
- if test "$AMD64_ABI" = "no" ; then
- SSE2_CFLAGS="-xarch=sse2"
- fi
- else
- SSE2_CFLAGS="-mmmx -msse2 -Winline"
- fi
-fi
-
-have_sse2_intrinsics=no
-AC_MSG_CHECKING(whether to use SSE2 intrinsics)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$SSE2_CFLAGS $CFLAGS"
-
-AC_COMPILE_IFELSE([
-#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
-# if !defined(__amd64__) && !defined(__x86_64__)
-# error "Need GCC >= 4.2 for SSE2 intrinsics on x86"
-# endif
-#endif
-#include <mmintrin.h>
-#include <xmmintrin.h>
-#include <emmintrin.h>
-int main () {
- __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c;
- c = _mm_xor_si128 (a, b);
- return 0;
-}], have_sse2_intrinsics=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(sse2,
- [AC_HELP_STRING([--disable-sse2],
- [disable SSE2 fast paths])],
- [enable_sse2=$enableval], [enable_sse2=auto])
-
-if test $enable_sse2 = no ; then
- have_sse2_intrinsics=disabled
-fi
-
-if test $have_sse2_intrinsics = yes ; then
- AC_DEFINE(USE_SSE2, 1, [use SSE2 compiler intrinsics])
-fi
-
-AC_MSG_RESULT($have_sse2_intrinsics)
-if test $enable_sse2 = yes && test $have_sse2_intrinsics = no ; then
- AC_MSG_ERROR([SSE2 intrinsics not detected])
-fi
-
-AM_CONDITIONAL(USE_SSE2, test $have_sse2_intrinsics = yes)
-
-dnl ===========================================================================
-dnl Other special flags needed when building code using MMX or SSE instructions
-case $host_os in
- solaris*)
- # When building 32-bit binaries, apply a mapfile to ensure that the
- # binaries aren't flagged as only able to run on MMX+SSE capable CPUs
- # since they check at runtime before using those instructions.
- # Not all linkers grok the mapfile format so we check for that first.
- if test "$AMD64_ABI" = "no" ; then
- use_hwcap_mapfile=no
- AC_MSG_CHECKING(whether to use a hardware capability map file)
- hwcap_save_LDFLAGS="$LDFLAGS"
- HWCAP_LDFLAGS='-Wl,-M,$(srcdir)/solaris-hwcap.mapfile'
- LDFLAGS="$LDFLAGS -Wl,-M,pixman/solaris-hwcap.mapfile"
- AC_LINK_IFELSE([int main() { return 0; }],
- use_hwcap_mapfile=yes,
- HWCAP_LDFLAGS="")
- LDFLAGS="$hwcap_save_LDFLAGS"
- AC_MSG_RESULT($use_hwcap_mapfile)
- fi
- if test "x$MMX_LDFLAGS" = "x" ; then
- MMX_LDFLAGS="$HWCAP_LDFLAGS"
- fi
- if test "x$SSE2_LDFLAGS" = "x" ; then
- SSE2_LDFLAGS="$HWCAP_LDFLAGS"
- fi
- ;;
-esac
-
-AC_SUBST(MMX_CFLAGS)
-AC_SUBST(MMX_LDFLAGS)
-AC_SUBST(SSE2_CFLAGS)
-AC_SUBST(SSE2_LDFLAGS)
-
-dnl ===========================================================================
-dnl Check for VMX/Altivec
-if test -n "`$CC -v 2>&1 | grep version | grep Apple`"; then
- VMX_CFLAGS="-faltivec"
-else
- VMX_CFLAGS="-maltivec -mabi=altivec"
-fi
-
-have_vmx_intrinsics=no
-AC_MSG_CHECKING(whether to use VMX/Altivec intrinsics)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$VMX_CFLAGS $CFLAGS"
-AC_COMPILE_IFELSE([
-#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
-error "Need GCC >= 3.4 for sane altivec support"
-#endif
-#include <altivec.h>
-int main () {
- vector unsigned int v = vec_splat_u32 (1);
- v = vec_sub (v, v);
- return 0;
-}], have_vmx_intrinsics=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(vmx,
- [AC_HELP_STRING([--disable-vmx],
- [disable VMX fast paths])],
- [enable_vmx=$enableval], [enable_vmx=auto])
-
-if test $enable_vmx = no ; then
- have_vmx_intrinsics=disabled
-fi
-
-if test $have_vmx_intrinsics = yes ; then
- AC_DEFINE(USE_VMX, 1, [use VMX compiler intrinsics])
-else
- VMX_CFLAGS=
-fi
-
-AC_MSG_RESULT($have_vmx_intrinsics)
-if test $enable_vmx = yes && test $have_vmx_intrinsics = no ; then
- AC_MSG_ERROR([VMX intrinsics not detected])
-fi
-
-AC_SUBST(VMX_CFLAGS)
-
-AM_CONDITIONAL(USE_VMX, test $have_vmx_intrinsics = yes)
-
-dnl ==========================================================================
-dnl Check if assembler is gas compatible and supports ARM SIMD instructions
-have_arm_simd=no
-AC_MSG_CHECKING(whether to use ARM SIMD assembler)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="-x assembler-with-cpp $CFLAGS"
-AC_COMPILE_IFELSE([[
-.text
-.arch armv6
-.object_arch armv4
-.arm
-.altmacro
-#ifndef __ARM_EABI__
-#error EABI is required (to be sure that calling conventions are compatible)
-#endif
-pld [r0]
-uqadd8 r0, r0, r0]], have_arm_simd=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(arm-simd,
- [AC_HELP_STRING([--disable-arm-simd],
- [disable ARM SIMD fast paths])],
- [enable_arm_simd=$enableval], [enable_arm_simd=auto])
-
-if test $enable_arm_simd = no ; then
- have_arm_simd=disabled
-fi
-
-if test $have_arm_simd = yes ; then
- AC_DEFINE(USE_ARM_SIMD, 1, [use ARM SIMD assembly optimizations])
-fi
-
-AM_CONDITIONAL(USE_ARM_SIMD, test $have_arm_simd = yes)
-
-AC_MSG_RESULT($have_arm_simd)
-if test $enable_arm_simd = yes && test $have_arm_simd = no ; then
- AC_MSG_ERROR([ARM SIMD intrinsics not detected])
-fi
-
-dnl ==========================================================================
-dnl Check if assembler is gas compatible and supports NEON instructions
-have_arm_neon=no
-AC_MSG_CHECKING(whether to use ARM NEON assembler)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="-x assembler-with-cpp $CFLAGS"
-AC_COMPILE_IFELSE([[
-.text
-.fpu neon
-.arch armv7a
-.object_arch armv4
-.eabi_attribute 10, 0
-.arm
-.altmacro
-#ifndef __ARM_EABI__
-#error EABI is required (to be sure that calling conventions are compatible)
-#endif
-pld [r0]
-vmovn.u16 d0, q0]], have_arm_neon=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(arm-neon,
- [AC_HELP_STRING([--disable-arm-neon],
- [disable ARM NEON fast paths])],
- [enable_arm_neon=$enableval], [enable_arm_neon=auto])
-
-if test $enable_arm_neon = no ; then
- have_arm_neon=disabled
-fi
-
-if test $have_arm_neon = yes ; then
- AC_DEFINE(USE_ARM_NEON, 1, [use ARM NEON assembly optimizations])
-fi
-
-AM_CONDITIONAL(USE_ARM_NEON, test $have_arm_neon = yes)
-
-AC_MSG_RESULT($have_arm_neon)
-if test $enable_arm_neon = yes && test $have_arm_neon = no ; then
- AC_MSG_ERROR([ARM NEON intrinsics not detected])
-fi
-
-dnl =========================================================================================
-dnl Check for GNU-style inline assembly support
-
-have_gcc_inline_asm=no
-AC_MSG_CHECKING(whether to use GNU-style inline assembler)
-AC_COMPILE_IFELSE([
-int main () {
- /* Most modern architectures have a NOP instruction, so this is a fairly generic test. */
- asm volatile ( "\tnop\n" : : : "cc", "memory" );
- return 0;
-}], have_gcc_inline_asm=yes)
-
-AC_ARG_ENABLE(gcc-inline-asm,
- [AC_HELP_STRING([--disable-gcc-inline-asm],
- [disable GNU-style inline assembler])],
- [enable_gcc_inline_asm=$enableval], [enable_gcc_inline_asm=auto])
-
-if test $enable_gcc_inline_asm = no ; then
- have_gcc_inline_asm=disabled
-fi
-
-if test $have_gcc_inline_asm = yes ; then
- AC_DEFINE(USE_GCC_INLINE_ASM, 1, [use GNU-style inline assembler])
-fi
-
-AC_MSG_RESULT($have_gcc_inline_asm)
-if test $enable_gcc_inline_asm = yes && test $have_gcc_inline_asm = no ; then
- AC_MSG_ERROR([GNU-style inline assembler not detected])
-fi
-
-AM_CONDITIONAL(USE_GCC_INLINE_ASM, test $have_gcc_inline_asm = yes)
-
-dnl ==============================================
-dnl Timers
-
-AC_ARG_ENABLE(timers,
- [AC_HELP_STRING([--enable-timers],
- [enable TIMER_BEGIN and TIMER_END macros [default=no]])],
- [enable_timers=$enableval], [enable_timers=no])
-
-if test $enable_timers = yes ; then
- AC_DEFINE(PIXMAN_TIMERS, 1, [enable TIMER_BEGIN/TIMER_END macros])
-fi
-AC_SUBST(PIXMAN_TIMERS)
-
-dnl ===================================
-dnl GTK+
-
-AC_ARG_ENABLE(gtk,
- [AC_HELP_STRING([--enable-gtk],
- [enable tests using GTK+ [default=auto]])],
- [enable_gtk=$enableval], [enable_gtk=auto])
-
-PKG_PROG_PKG_CONFIG
-
-if test $enable_gtk = yes ; then
- AC_CHECK_LIB([pixman-1], [pixman_version_string])
- PKG_CHECK_MODULES(GTK, [gtk+-2.0 pixman-1])
-fi
-
-if test $enable_gtk = auto ; then
- AC_CHECK_LIB([pixman-1], [pixman_version_string], [enable_gtk=auto], [enable_gtk=no])
-fi
-
-if test $enable_gtk = auto ; then
- PKG_CHECK_MODULES(GTK, [gtk+-2.0 pixman-1], [enable_gtk=yes], [enable_gtk=no])
-fi
-
-AM_CONDITIONAL(HAVE_GTK, [test "x$enable_gtk" = xyes])
-
-AC_SUBST(GTK_CFLAGS)
-AC_SUBST(GTK_LIBS)
-AC_SUBST(DEP_CFLAGS)
-AC_SUBST(DEP_LIBS)
-
-dnl =====================================
-dnl posix_memalign, sigaction, alarm, gettimeofday
-
-AC_CHECK_FUNC(posix_memalign, have_posix_memalign=yes, have_posix_memalign=no)
-if test x$have_posix_memalign = xyes; then
- AC_DEFINE(HAVE_POSIX_MEMALIGN, 1, [Whether we have posix_memalign()])
-fi
-
-AC_CHECK_FUNC(sigaction, have_sigaction=yes, have_sigaction=no)
-if test x$have_sigaction = xyes; then
- AC_DEFINE(HAVE_SIGACTION, 1, [Whether we have sigaction()])
-fi
-
-AC_CHECK_FUNC(alarm, have_alarm=yes, have_alarm=no)
-if test x$have_alarm = xyes; then
- AC_DEFINE(HAVE_ALARM, 1, [Whether we have alarm()])
-fi
-
-AC_CHECK_HEADER([sys/mman.h],
- [AC_DEFINE(HAVE_SYS_MMAN_H, [1], [Define to 1 if we have <sys/mman.h>])])
-
-AC_CHECK_FUNC(mprotect, have_mprotect=yes, have_mprotect=no)
-if test x$have_mprotect = xyes; then
- AC_DEFINE(HAVE_MPROTECT, 1, [Whether we have mprotect()])
-fi
-
-AC_CHECK_FUNC(getpagesize, have_getpagesize=yes, have_getpagesize=no)
-if test x$have_getpagesize = xyes; then
- AC_DEFINE(HAVE_GETPAGESIZE, 1, [Whether we have getpagesize()])
-fi
-
-AC_CHECK_HEADER([fenv.h],
- [AC_DEFINE(HAVE_FENV_H, [1], [Define to 1 if we have <fenv.h>])])
-
-AC_CHECK_LIB(m, feenableexcept, have_feenableexcept=yes, have_feenableexcept=no)
-if test x$have_feenableexcept = xyes; then
- AC_DEFINE(HAVE_FEENABLEEXCEPT, 1, [Whether we have feenableexcept()])
-fi
-
-AC_CHECK_FUNC(gettimeofday, have_gettimeofday=yes, have_gettimeofday=no)
-AC_CHECK_HEADER(sys/time.h, have_sys_time_h=yes, have_sys_time_h=no)
-if test x$have_gettimeofday = xyes && test x$have_sys_time_h = xyes; then
- AC_DEFINE(HAVE_GETTIMEOFDAY, 1, [Whether we have gettimeofday()])
-fi
-
-dnl =====================================
-dnl Thread local storage
-
-support_for__thread=no
-
-AC_MSG_CHECKING(for __thread)
-AC_LINK_IFELSE([
-#if defined(__MINGW32__) && !(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))
-#error This MinGW version has broken __thread support
-#endif
-#ifdef __OpenBSD__
-#error OpenBSD has broken __thread support
-#endif
-static __thread int x ;
-int main () { x = 123; return x; }
-], support_for__thread=yes)
-
-if test $support_for__thread = yes; then
- AC_DEFINE([TOOLCHAIN_SUPPORTS__THREAD],[],[Whether the tool chain supports __thread])
-fi
-
-AC_MSG_RESULT($support_for__thread)
-
-dnl
-dnl posix tls
-dnl
-
-m4_define([pthread_test_program],[dnl
-#include <stdlib.h>
-#include <pthread.h>
-
-static pthread_once_t once_control = PTHREAD_ONCE_INIT;
-static pthread_key_t key;
-
-static void
-make_key (void)
-{
- pthread_key_create (&key, NULL);
-}
-
-int
-main ()
-{
- void *value = NULL;
-
- if (pthread_once (&once_control, make_key) != 0)
- {
- value = NULL;
- }
- else
- {
- value = pthread_getspecific (key);
- if (!value)
- {
- value = malloc (100);
- pthread_setspecific (key, value);
- }
- }
- return 0;
-}
-])
-
-AC_DEFUN([PIXMAN_CHECK_PTHREAD],[dnl
- if test "z$support_for_pthread_setspecific" != "zyes"; then
- PIXMAN_LINK_WITH_ENV(
- [$1], [pthread_test_program],
- [PTHREAD_CFLAGS="$CFLAGS"
- PTHREAD_LIBS="$LIBS"
- PTHREAD_LDFLAGS="$LDFLAGS"
- support_for_pthread_setspecific=yes])
- fi
-])
-
-if test $support_for__thread = no; then
- support_for_pthread_setspecific=no
-
- AC_MSG_CHECKING(for pthread_setspecific)
-
- PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LIBS="-lpthread"])
- PIXMAN_CHECK_PTHREAD([CFLAGS="-pthread"; LDFLAGS="-pthread"])
- PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LDFLAGS="-lroot"])
-
- if test $support_for_pthread_setspecific = yes; then
- CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
- AC_DEFINE([HAVE_PTHREAD_SETSPECIFIC], [], [Whether pthread_setspecific() is supported])
- fi
-
- AC_MSG_RESULT($support_for_pthread_setspecific);
-fi
-
-AC_SUBST(TOOLCHAIN_SUPPORTS__THREAD)
-AC_SUBST(HAVE_PTHREAD_SETSPECIFIC)
-AC_SUBST(PTHREAD_LDFLAGS)
-AC_SUBST(PTHREAD_LIBS)
-
-dnl =====================================
-dnl __attribute__((constructor))
-
-support_for_attribute_constructor=no
-
-AC_MSG_CHECKING(for __attribute__((constructor)))
-AC_LINK_IFELSE([
-#if defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7))
-/* attribute 'constructor' is supported since gcc 2.7, but some compilers
- * may only pretend to be gcc, so let's try to actually use it
- */
-static int x = 1;
-static void __attribute__((constructor)) constructor_function () { x = 0; }
-int main (void) { return x; }
-#else
-#error not gcc or gcc version is older than 2.7
-#endif
-], support_for_attribute_constructor=yes)
-
-if test x$support_for_attribute_constructor = xyes; then
- AC_DEFINE([TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR],
- [],[Whether the tool chain supports __attribute__((constructor))])
-fi
-
-AC_MSG_RESULT($support_for_attribute_constructor)
-AC_SUBST(TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR)
-
-AC_OUTPUT([pixman-1.pc
- pixman-1-uninstalled.pc
- Makefile
- pixman/Makefile
- pixman/pixman-version.h
- test/Makefile])
+dnl Copyright 2005 Red Hat, Inc. +dnl +dnl Permission to use, copy, modify, distribute, and sell this software and its +dnl documentation for any purpose is hereby granted without fee, provided that +dnl the above copyright notice appear in all copies and that both that +dnl copyright notice and this permission notice appear in supporting +dnl documentation, and that the name of Red Hat not be used in +dnl advertising or publicity pertaining to distribution of the software without +dnl specific, written prior permission. Red Hat makes no +dnl representations about the suitability of this software for any purpose. It +dnl is provided "as is" without express or implied warranty. +dnl +dnl RED HAT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, +dnl INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO +dnl EVENT SHALL RED HAT BE LIABLE FOR ANY SPECIAL, INDIRECT OR +dnl CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, +dnl DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER +dnl TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +dnl PERFORMANCE OF THIS SOFTWARE. +dnl +dnl Process this file with autoconf to create configure. 
+ +AC_PREREQ([2.57]) + +# Pixman versioning scheme +# +# - The version in git has an odd MICRO version number +# +# - Released versions both development and stable have an even MICRO +# version number +# +# - Released development versions have an odd MINOR number +# +# - Released stable versions have an even MINOR number +# +# - Versions that break ABI must have a new MAJOR number +# +# - If you break the ABI, then at least this must be done: +# +# - increment MAJOR +# +# - In the first development release where you break ABI, find +# all instances of "pixman-n" and change them to pixman-(n+1) +# +# This needs to be done at least in +# configure.ac +# all Makefile.am's +# pixman-n.pc.in +# +# This ensures that binary incompatible versions can be installed +# in parallel. See http://www106.pair.com/rhp/parallel.html for +# more information +# + +m4_define([pixman_major], 0) +m4_define([pixman_minor], 21) +m4_define([pixman_micro], 3) + +m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro]) + +AC_INIT(pixman, pixman_version, [pixman@lists.freedesktop.org], pixman) +AM_INIT_AUTOMAKE([foreign dist-bzip2]) + +# Suppress verbose compile lines +m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) + +AM_CONFIG_HEADER(config.h) + +AC_CANONICAL_HOST + +test_CFLAGS=${CFLAGS+set} # We may override autoconf default CFLAGS. + +AC_PROG_CC +AM_PROG_AS +AC_PROG_LIBTOOL +AC_CHECK_FUNCS([getisax]) +AC_C_BIGENDIAN +AC_C_INLINE + +dnl PIXMAN_LINK_WITH_ENV(env-setup, program, true-action, false-action) +dnl +dnl Compiles and links the given program in the environment setup by env-setup +dnl and executes true-action on success and false-action on failure. 
+AC_DEFUN([PIXMAN_LINK_WITH_ENV],[dnl + save_CFLAGS="$CFLAGS" + save_LDFLAGS="$LDFLAGS" + save_LIBS="$LIBS" + CFLAGS="" + LDFLAGS="" + LIBS="" + $1 + AC_LINK_IFELSE( + [$2], + [pixman_cc_stderr=`test -f conftest.err && cat conftest.err` + pixman_cc_flag=yes], + [pixman_cc_stderr=`test -f conftest.err && cat conftest.err` + pixman_cc_flag=no]) + + if test "x$pixman_cc_stderr" != "x"; then + pixman_cc_flag=no + fi + + if test "x$pixman_cc_flag" = "xyes"; then + ifelse([$3], , :, [$3]) + else + ifelse([$4], , :, [$4]) + fi + CFLAGS="$save_CFLAGS" + LDFLAGS="$save_LDFLAGS" + LIBS="$save_LIBS" +]) + +dnl Find a -Werror for catching warnings. +WERROR= +for w in -Werror -errwarn; do + if test "z$WERROR" = "z"; then + AC_MSG_CHECKING([whether the compiler supports $w]) + PIXMAN_LINK_WITH_ENV( + [CFLAGS=$w], + [int main(int c, char **v) { (void)c; (void)v; return 0; }], + [WERROR=$w; yesno=yes], [yesno=no]) + AC_MSG_RESULT($_yesno) + fi +done + +dnl PIXMAN_CHECK_CFLAG(flag, [program]) +dnl Adds flag to CFLAGS if the given program links without warnings or errors. +AC_DEFUN([PIXMAN_CHECK_CFLAG], [dnl + AC_MSG_CHECKING([whether the compiler supports $1]) + PIXMAN_LINK_WITH_ENV( + [CFLAGS="$WERROR $1"], + [$2 + int main(int c, char **v) { (void)c; (void)v; return 0; } + ], + [_yesno=yes], + [_yesno=no]) + if test "x$_yesno" = xyes; then + CFLAGS="$CFLAGS $1" + fi + AC_MSG_RESULT($_yesno) +]) + +AC_CHECK_SIZEOF(long) + +# Checks for Sun Studio compilers +AC_CHECK_DECL([__SUNPRO_C], [SUNCC="yes"], [SUNCC="no"]) +AC_CHECK_DECL([__amd64], [AMD64_ABI="yes"], [AMD64_ABI="no"]) + +# Default CFLAGS to -O -g rather than just the -g from AC_PROG_CC +# if we're using Sun Studio and neither the user nor a config.site +# has set CFLAGS. +if test $SUNCC = yes && \ + test "$test_CFLAGS" == "" && \ + test "$CFLAGS" = "-g" +then + CFLAGS="-O -g" +fi + +# +# We ignore pixman_major in the version here because the major version should +# always be encoded in the actual library name. 
Ie., the soname is: +# +# pixman-$(pixman_major).0.minor.micro +# +m4_define([lt_current], [pixman_minor]) +m4_define([lt_revision], [pixman_micro]) +m4_define([lt_age], [pixman_minor]) + +LT_VERSION_INFO="lt_current:lt_revision:lt_age" + +PIXMAN_VERSION_MAJOR=pixman_major() +AC_SUBST(PIXMAN_VERSION_MAJOR) +PIXMAN_VERSION_MINOR=pixman_minor() +AC_SUBST(PIXMAN_VERSION_MINOR) +PIXMAN_VERSION_MICRO=pixman_micro() +AC_SUBST(PIXMAN_VERSION_MICRO) + +AC_SUBST(LT_VERSION_INFO) + +# Check for dependencies + +PIXMAN_CHECK_CFLAG([-Wall]) +PIXMAN_CHECK_CFLAG([-fno-strict-aliasing]) + +AC_PATH_PROG(PERL, perl, no) +if test "x$PERL" = xno; then + AC_MSG_ERROR([Perl is required to build pixman.]) +fi +AC_SUBST(PERL) + +dnl ========================================================================= +dnl OpenMP for the test suite? +dnl + +# Check for OpenMP support (only supported by autoconf >=2.62) +OPENMP_CFLAGS= +m4_ifdef([AC_OPENMP], [AC_OPENMP]) + +m4_define([openmp_test_program],[dnl +#include <stdio.h> + +extern unsigned int lcg_seed; +#pragma omp threadprivate(lcg_seed) +unsigned int lcg_seed; + +unsigned function(unsigned a, unsigned b) +{ + lcg_seed ^= b; + return ((a + b) ^ a ) + lcg_seed; +} + +int main(int argc, char **argv) +{ + int i; + int n1 = 0, n2 = argc; + unsigned checksum = 0; + int verbose = argv != NULL; + unsigned (*test_function)(unsigned, unsigned); + test_function = function; + #pragma omp parallel for reduction(+:checksum) default(none) \ + shared(n1, n2, test_function, verbose) + for (i = n1; i < n2; i++) + { + unsigned crc = test_function (i, 0); + if (verbose) + printf ("%d: %08X\n", i, crc); + checksum += crc; + } + printf("%u\n", checksum); + return 0; +} +]) + +PIXMAN_LINK_WITH_ENV( + [CFLAGS="$OPENMP_CFLAGS" LDFLAGS="$OPENMP_CFLAGS"], + [openmp_test_program], + [have_openmp=yes], + [have_openmp=no]) +if test "x$have_openmp" = "xyes"; then + AC_DEFINE(USE_OPENMP, 1, [use OpenMP in the test suite]) +else + OPENMP_CFLAGS="" +fi 
+AC_SUBST(OPENMP_CFLAGS) + +dnl ========================================================================= +dnl -fvisibility stuff + +PIXMAN_CHECK_CFLAG([-fvisibility=hidden], [dnl +#if defined(__GNUC__) && (__GNUC__ >= 4) +#ifdef _WIN32 +#error Have -fvisibility but it is ignored and generates a warning +#endif +#else +error Need GCC 4.0 for visibility +#endif +]) + +PIXMAN_CHECK_CFLAG([-xldscope=hidden], [dnl +#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x550) +#else +error Need Sun Studio 8 for visibility +#endif +]) + +dnl =========================================================================== +dnl Check for MMX + +if test "x$MMX_CFLAGS" = "x" ; then + if test "x$SUNCC" = "xyes"; then + # Sun Studio doesn't have an -xarch=mmx flag, so we have to use sse + # but if we're building 64-bit, mmx & sse support is on by default and + # -xarch=sse throws an error instead + if test "$AMD64_ABI" = "no" ; then + MMX_CFLAGS="-xarch=sse" + fi + else + MMX_CFLAGS="-mmmx -Winline" + fi +fi + +have_mmx_intrinsics=no +AC_MSG_CHECKING(whether to use MMX intrinsics) +xserver_save_CFLAGS=$CFLAGS +CFLAGS="$MMX_CFLAGS $CFLAGS" +AC_COMPILE_IFELSE([ +#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4)) +error "Need GCC >= 3.4 for MMX intrinsics" +#endif +#include <mmintrin.h> +int main () { + __m64 v = _mm_cvtsi32_si64 (1); + return _mm_cvtsi64_si32 (v); +}], have_mmx_intrinsics=yes) +CFLAGS=$xserver_save_CFLAGS + +AC_ARG_ENABLE(mmx, + [AC_HELP_STRING([--disable-mmx], + [disable MMX fast paths])], + [enable_mmx=$enableval], [enable_mmx=auto]) + +if test $enable_mmx = no ; then + have_mmx_intrinsics=disabled +fi + +if test $have_mmx_intrinsics = yes ; then + AC_DEFINE(USE_MMX, 1, [use MMX compiler intrinsics]) +else + MMX_CFLAGS= +fi + +AC_MSG_RESULT($have_mmx_intrinsics) +if test $enable_mmx = yes && test $have_mmx_intrinsics = no ; then + AC_MSG_ERROR([MMX intrinsics not detected]) +fi + +AM_CONDITIONAL(USE_MMX, test $have_mmx_intrinsics = yes) + +dnl 
=========================================================================== +dnl Check for SSE2 + +if test "x$SSE2_CFLAGS" = "x" ; then + if test "x$SUNCC" = "xyes"; then + # SSE2 is enabled by default in the Sun Studio 64-bit environment + if test "$AMD64_ABI" = "no" ; then + SSE2_CFLAGS="-xarch=sse2" + fi + else + SSE2_CFLAGS="-mmmx -msse2 -Winline" + fi +fi + +have_sse2_intrinsics=no +AC_MSG_CHECKING(whether to use SSE2 intrinsics) +xserver_save_CFLAGS=$CFLAGS +CFLAGS="$SSE2_CFLAGS $CFLAGS" + +AC_COMPILE_IFELSE([ +#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2)) +# if !defined(__amd64__) && !defined(__x86_64__) +# error "Need GCC >= 4.2 for SSE2 intrinsics on x86" +# endif +#endif +#include <mmintrin.h> +#include <xmmintrin.h> +#include <emmintrin.h> +int main () { + __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c; + c = _mm_xor_si128 (a, b); + return 0; +}], have_sse2_intrinsics=yes) +CFLAGS=$xserver_save_CFLAGS + +AC_ARG_ENABLE(sse2, + [AC_HELP_STRING([--disable-sse2], + [disable SSE2 fast paths])], + [enable_sse2=$enableval], [enable_sse2=auto]) + +if test $enable_sse2 = no ; then + have_sse2_intrinsics=disabled +fi + +if test $have_sse2_intrinsics = yes ; then + AC_DEFINE(USE_SSE2, 1, [use SSE2 compiler intrinsics]) +fi + +AC_MSG_RESULT($have_sse2_intrinsics) +if test $enable_sse2 = yes && test $have_sse2_intrinsics = no ; then + AC_MSG_ERROR([SSE2 intrinsics not detected]) +fi + +AM_CONDITIONAL(USE_SSE2, test $have_sse2_intrinsics = yes) + +dnl =========================================================================== +dnl Other special flags needed when building code using MMX or SSE instructions +case $host_os in + solaris*) + # When building 32-bit binaries, apply a mapfile to ensure that the + # binaries aren't flagged as only able to run on MMX+SSE capable CPUs + # since they check at runtime before using those instructions. + # Not all linkers grok the mapfile format so we check for that first. 
+ if test "$AMD64_ABI" = "no" ; then + use_hwcap_mapfile=no + AC_MSG_CHECKING(whether to use a hardware capability map file) + hwcap_save_LDFLAGS="$LDFLAGS" + HWCAP_LDFLAGS='-Wl,-M,$(srcdir)/solaris-hwcap.mapfile' + LDFLAGS="$LDFLAGS -Wl,-M,pixman/solaris-hwcap.mapfile" + AC_LINK_IFELSE([int main() { return 0; }], + use_hwcap_mapfile=yes, + HWCAP_LDFLAGS="") + LDFLAGS="$hwcap_save_LDFLAGS" + AC_MSG_RESULT($use_hwcap_mapfile) + fi + if test "x$MMX_LDFLAGS" = "x" ; then + MMX_LDFLAGS="$HWCAP_LDFLAGS" + fi + if test "x$SSE2_LDFLAGS" = "x" ; then + SSE2_LDFLAGS="$HWCAP_LDFLAGS" + fi + ;; +esac + +AC_SUBST(MMX_CFLAGS) +AC_SUBST(MMX_LDFLAGS) +AC_SUBST(SSE2_CFLAGS) +AC_SUBST(SSE2_LDFLAGS) + +dnl =========================================================================== +dnl Check for VMX/Altivec +if test -n "`$CC -v 2>&1 | grep version | grep Apple`"; then + VMX_CFLAGS="-faltivec" +else + VMX_CFLAGS="-maltivec -mabi=altivec" +fi + +have_vmx_intrinsics=no +AC_MSG_CHECKING(whether to use VMX/Altivec intrinsics) +xserver_save_CFLAGS=$CFLAGS +CFLAGS="$VMX_CFLAGS $CFLAGS" +AC_COMPILE_IFELSE([ +#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4)) +error "Need GCC >= 3.4 for sane altivec support" +#endif +#include <altivec.h> +int main () { + vector unsigned int v = vec_splat_u32 (1); + v = vec_sub (v, v); + return 0; +}], have_vmx_intrinsics=yes) +CFLAGS=$xserver_save_CFLAGS + +AC_ARG_ENABLE(vmx, + [AC_HELP_STRING([--disable-vmx], + [disable VMX fast paths])], + [enable_vmx=$enableval], [enable_vmx=auto]) + +if test $enable_vmx = no ; then + have_vmx_intrinsics=disabled +fi + +if test $have_vmx_intrinsics = yes ; then + AC_DEFINE(USE_VMX, 1, [use VMX compiler intrinsics]) +else + VMX_CFLAGS= +fi + +AC_MSG_RESULT($have_vmx_intrinsics) +if test $enable_vmx = yes && test $have_vmx_intrinsics = no ; then + AC_MSG_ERROR([VMX intrinsics not detected]) +fi + +AC_SUBST(VMX_CFLAGS) + +AM_CONDITIONAL(USE_VMX, test $have_vmx_intrinsics = yes) + +dnl 
========================================================================== +dnl Check if assembler is gas compatible and supports ARM SIMD instructions +have_arm_simd=no +AC_MSG_CHECKING(whether to use ARM SIMD assembler) +xserver_save_CFLAGS=$CFLAGS +CFLAGS="-x assembler-with-cpp $CFLAGS" +AC_COMPILE_IFELSE([[ +.text +.arch armv6 +.object_arch armv4 +.arm +.altmacro +#ifndef __ARM_EABI__ +#error EABI is required (to be sure that calling conventions are compatible) +#endif +pld [r0] +uqadd8 r0, r0, r0]], have_arm_simd=yes) +CFLAGS=$xserver_save_CFLAGS + +AC_ARG_ENABLE(arm-simd, + [AC_HELP_STRING([--disable-arm-simd], + [disable ARM SIMD fast paths])], + [enable_arm_simd=$enableval], [enable_arm_simd=auto]) + +if test $enable_arm_simd = no ; then + have_arm_simd=disabled +fi + +if test $have_arm_simd = yes ; then + AC_DEFINE(USE_ARM_SIMD, 1, [use ARM SIMD assembly optimizations]) +fi + +AM_CONDITIONAL(USE_ARM_SIMD, test $have_arm_simd = yes) + +AC_MSG_RESULT($have_arm_simd) +if test $enable_arm_simd = yes && test $have_arm_simd = no ; then + AC_MSG_ERROR([ARM SIMD intrinsics not detected]) +fi + +dnl ========================================================================== +dnl Check if assembler is gas compatible and supports NEON instructions +have_arm_neon=no +AC_MSG_CHECKING(whether to use ARM NEON assembler) +xserver_save_CFLAGS=$CFLAGS +CFLAGS="-x assembler-with-cpp $CFLAGS" +AC_COMPILE_IFELSE([[ +.text +.fpu neon +.arch armv7a +.object_arch armv4 +.eabi_attribute 10, 0 +.arm +.altmacro +#ifndef __ARM_EABI__ +#error EABI is required (to be sure that calling conventions are compatible) +#endif +pld [r0] +vmovn.u16 d0, q0]], have_arm_neon=yes) +CFLAGS=$xserver_save_CFLAGS + +AC_ARG_ENABLE(arm-neon, + [AC_HELP_STRING([--disable-arm-neon], + [disable ARM NEON fast paths])], + [enable_arm_neon=$enableval], [enable_arm_neon=auto]) + +if test $enable_arm_neon = no ; then + have_arm_neon=disabled +fi + +if test $have_arm_neon = yes ; then + AC_DEFINE(USE_ARM_NEON, 1, 
[use ARM NEON assembly optimizations]) +fi + +AM_CONDITIONAL(USE_ARM_NEON, test $have_arm_neon = yes) + +AC_MSG_RESULT($have_arm_neon) +if test $enable_arm_neon = yes && test $have_arm_neon = no ; then + AC_MSG_ERROR([ARM NEON intrinsics not detected]) +fi + +dnl ========================================================================================= +dnl Check for GNU-style inline assembly support + +have_gcc_inline_asm=no +AC_MSG_CHECKING(whether to use GNU-style inline assembler) +AC_COMPILE_IFELSE([ +int main () { + /* Most modern architectures have a NOP instruction, so this is a fairly generic test. */ + asm volatile ( "\tnop\n" : : : "cc", "memory" ); + return 0; +}], have_gcc_inline_asm=yes) + +AC_ARG_ENABLE(gcc-inline-asm, + [AC_HELP_STRING([--disable-gcc-inline-asm], + [disable GNU-style inline assembler])], + [enable_gcc_inline_asm=$enableval], [enable_gcc_inline_asm=auto]) + +if test $enable_gcc_inline_asm = no ; then + have_gcc_inline_asm=disabled +fi + +if test $have_gcc_inline_asm = yes ; then + AC_DEFINE(USE_GCC_INLINE_ASM, 1, [use GNU-style inline assembler]) +fi + +AC_MSG_RESULT($have_gcc_inline_asm) +if test $enable_gcc_inline_asm = yes && test $have_gcc_inline_asm = no ; then + AC_MSG_ERROR([GNU-style inline assembler not detected]) +fi + +AM_CONDITIONAL(USE_GCC_INLINE_ASM, test $have_gcc_inline_asm = yes) + +dnl ============================================== +dnl Static test programs + +AC_ARG_ENABLE(static-testprogs, + [AC_HELP_STRING([--enable-static-testprogs], + [build test programs as static binaries [default=no]])], + [enable_static_testprogs=$enableval], [enable_static_testprogs=no]) + +TESTPROGS_EXTRA_LDFLAGS= +if test "x$enable_static_testprogs" = "xyes" ; then + TESTPROGS_EXTRA_LDFLAGS="-all-static" +fi +AC_SUBST(TESTPROGS_EXTRA_LDFLAGS) + +dnl ============================================== +dnl Timers + +AC_ARG_ENABLE(timers, + [AC_HELP_STRING([--enable-timers], + [enable TIMER_BEGIN and TIMER_END macros [default=no]])], + 
[enable_timers=$enableval], [enable_timers=no]) + +if test $enable_timers = yes ; then + AC_DEFINE(PIXMAN_TIMERS, 1, [enable TIMER_BEGIN/TIMER_END macros]) +fi +AC_SUBST(PIXMAN_TIMERS) + +dnl =================================== +dnl GTK+ + +AC_ARG_ENABLE(gtk, + [AC_HELP_STRING([--enable-gtk], + [enable tests using GTK+ [default=auto]])], + [enable_gtk=$enableval], [enable_gtk=auto]) + +PKG_PROG_PKG_CONFIG + +if test $enable_gtk = yes ; then + AC_CHECK_LIB([pixman-1], [pixman_version_string]) + PKG_CHECK_MODULES(GTK, [gtk+-2.0 pixman-1]) +fi + +if test $enable_gtk = auto ; then + AC_CHECK_LIB([pixman-1], [pixman_version_string], [enable_gtk=auto], [enable_gtk=no]) +fi + +if test $enable_gtk = auto ; then + PKG_CHECK_MODULES(GTK, [gtk+-2.0 pixman-1], [enable_gtk=yes], [enable_gtk=no]) +fi + +AM_CONDITIONAL(HAVE_GTK, [test "x$enable_gtk" = xyes]) + +AC_SUBST(GTK_CFLAGS) +AC_SUBST(GTK_LIBS) +AC_SUBST(DEP_CFLAGS) +AC_SUBST(DEP_LIBS) + +dnl ===================================== +dnl posix_memalign, sigaction, alarm, gettimeofday + +AC_CHECK_FUNC(posix_memalign, have_posix_memalign=yes, have_posix_memalign=no) +if test x$have_posix_memalign = xyes; then + AC_DEFINE(HAVE_POSIX_MEMALIGN, 1, [Whether we have posix_memalign()]) +fi + +AC_CHECK_FUNC(sigaction, have_sigaction=yes, have_sigaction=no) +if test x$have_sigaction = xyes; then + AC_DEFINE(HAVE_SIGACTION, 1, [Whether we have sigaction()]) +fi + +AC_CHECK_FUNC(alarm, have_alarm=yes, have_alarm=no) +if test x$have_alarm = xyes; then + AC_DEFINE(HAVE_ALARM, 1, [Whether we have alarm()]) +fi + +AC_CHECK_HEADER([sys/mman.h], + [AC_DEFINE(HAVE_SYS_MMAN_H, [1], [Define to 1 if we have <sys/mman.h>])]) + +AC_CHECK_FUNC(mprotect, have_mprotect=yes, have_mprotect=no) +if test x$have_mprotect = xyes; then + AC_DEFINE(HAVE_MPROTECT, 1, [Whether we have mprotect()]) +fi + +AC_CHECK_FUNC(getpagesize, have_getpagesize=yes, have_getpagesize=no) +if test x$have_getpagesize = xyes; then + AC_DEFINE(HAVE_GETPAGESIZE, 1, [Whether we have 
getpagesize()]) +fi + +AC_CHECK_HEADER([fenv.h], + [AC_DEFINE(HAVE_FENV_H, [1], [Define to 1 if we have <fenv.h>])]) + +AC_CHECK_LIB(m, feenableexcept, have_feenableexcept=yes, have_feenableexcept=no) +if test x$have_feenableexcept = xyes; then + AC_DEFINE(HAVE_FEENABLEEXCEPT, 1, [Whether we have feenableexcept()]) +fi + +AC_CHECK_FUNC(gettimeofday, have_gettimeofday=yes, have_gettimeofday=no) +AC_CHECK_HEADER(sys/time.h, have_sys_time_h=yes, have_sys_time_h=no) +if test x$have_gettimeofday = xyes && test x$have_sys_time_h = xyes; then + AC_DEFINE(HAVE_GETTIMEOFDAY, 1, [Whether we have gettimeofday()]) +fi + +dnl ===================================== +dnl Thread local storage + +support_for__thread=no + +AC_MSG_CHECKING(for __thread) +AC_LINK_IFELSE([ +#if defined(__MINGW32__) && !(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)) +#error This MinGW version has broken __thread support +#endif +#ifdef __OpenBSD__ +#error OpenBSD has broken __thread support +#endif +static __thread int x ; +int main () { x = 123; return x; } +], support_for__thread=yes) + +if test $support_for__thread = yes; then + AC_DEFINE([TOOLCHAIN_SUPPORTS__THREAD],[],[Whether the tool chain supports __thread]) +fi + +AC_MSG_RESULT($support_for__thread) + +dnl +dnl posix tls +dnl + +m4_define([pthread_test_program],[dnl +#include <stdlib.h> +#include <pthread.h> + +static pthread_once_t once_control = PTHREAD_ONCE_INIT; +static pthread_key_t key; + +static void +make_key (void) +{ + pthread_key_create (&key, NULL); +} + +int +main () +{ + void *value = NULL; + + if (pthread_once (&once_control, make_key) != 0) + { + value = NULL; + } + else + { + value = pthread_getspecific (key); + if (!value) + { + value = malloc (100); + pthread_setspecific (key, value); + } + } + return 0; +} +]) + +AC_DEFUN([PIXMAN_CHECK_PTHREAD],[dnl + if test "z$support_for_pthread_setspecific" != "zyes"; then + PIXMAN_LINK_WITH_ENV( + [$1], [pthread_test_program], + [PTHREAD_CFLAGS="$CFLAGS" + PTHREAD_LIBS="$LIBS" 
+ PTHREAD_LDFLAGS="$LDFLAGS" + support_for_pthread_setspecific=yes]) + fi +]) + +if test $support_for__thread = no; then + support_for_pthread_setspecific=no + + AC_MSG_CHECKING(for pthread_setspecific) + + PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LIBS="-lpthread"]) + PIXMAN_CHECK_PTHREAD([CFLAGS="-pthread"; LDFLAGS="-pthread"]) + PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LDFLAGS="-lroot"]) + + if test $support_for_pthread_setspecific = yes; then + CFLAGS="$CFLAGS $PTHREAD_CFLAGS" + AC_DEFINE([HAVE_PTHREAD_SETSPECIFIC], [], [Whether pthread_setspecific() is supported]) + fi + + AC_MSG_RESULT($support_for_pthread_setspecific); +fi + +AC_SUBST(TOOLCHAIN_SUPPORTS__THREAD) +AC_SUBST(HAVE_PTHREAD_SETSPECIFIC) +AC_SUBST(PTHREAD_LDFLAGS) +AC_SUBST(PTHREAD_LIBS) + +dnl ===================================== +dnl __attribute__((constructor)) + +support_for_attribute_constructor=no + +AC_MSG_CHECKING(for __attribute__((constructor))) +AC_LINK_IFELSE([ +#if defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)) +/* attribute 'constructor' is supported since gcc 2.7, but some compilers + * may only pretend to be gcc, so let's try to actually use it + */ +static int x = 1; +static void __attribute__((constructor)) constructor_function () { x = 0; } +int main (void) { return x; } +#else +#error not gcc or gcc version is older than 2.7 +#endif +], support_for_attribute_constructor=yes) + +if test x$support_for_attribute_constructor = xyes; then + AC_DEFINE([TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR], + [],[Whether the tool chain supports __attribute__((constructor))]) +fi + +AC_MSG_RESULT($support_for_attribute_constructor) +AC_SUBST(TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR) + +AC_OUTPUT([pixman-1.pc + pixman-1-uninstalled.pc + Makefile + pixman/Makefile + pixman/pixman-version.h + test/Makefile]) diff --git a/pixman/pixman/pixman-bits-image.c b/pixman/pixman/pixman-bits-image.c index 85ff2a339..c453e0ee6 100644 --- a/pixman/pixman/pixman-bits-image.c 
+++ b/pixman/pixman/pixman-bits-image.c @@ -1,1443 +1,1441 @@ -/*
- * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
- * 2005 Lars Knoll & Zack Rusin, Trolltech
- * 2008 Aaron Plattner, NVIDIA Corporation
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007, 2009 Red Hat, Inc.
- * Copyright © 2008 André Tupinambá <andrelrt@gmail.com>
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Keith Packard not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission. Keith Packard makes no
- * representations about the suitability of this software for any purpose. It
- * is provided "as is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "pixman-private.h"
-#include "pixman-combine32.h"
-
-/* Store functions */
-/*
- * Store a scanline of 32-bit pixels into the image and, when the image
- * has an alpha map attached, store the same buffer into the alpha map at
- * the position translated by the alpha origin.
- */
-void
-_pixman_image_store_scanline_32 (bits_image_t * image,
- int x,
- int y,
- int width,
- const uint32_t *buffer)
-{
-    /* The image itself always receives the scanline first. */
-    image->store_scanline_32 (image, x, y, width, buffer);
-
-    /* Without an alpha map there is nothing else to write. */
-    if (!image->common.alpha_map)
-        return;
-
-    /* Alpha map coordinates are relative to the alpha origin. */
-    image->common.alpha_map->store_scanline_32 (
-        image->common.alpha_map,
-        x - image->common.alpha_origin_x,
-        y - image->common.alpha_origin_y,
-        width,
-        buffer);
-}
-
-/*
- * Wide counterpart of the 32-bit store: forwards the scanline to the
- * image's store_scanline_64 hook and, when an alpha map is attached,
- * to the alpha map's hook at the position translated by the alpha origin.
- */
-void
-_pixman_image_store_scanline_64 (bits_image_t * image,
- int x,
- int y,
- int width,
- const uint32_t *buffer)
-{
-    /* The image itself always receives the scanline first. */
-    image->store_scanline_64 (image, x, y, width, buffer);
-
-    /* Without an alpha map there is nothing else to write. */
-    if (!image->common.alpha_map)
-        return;
-
-    /* Alpha map coordinates are relative to the alpha origin. */
-    image->common.alpha_map->store_scanline_64 (
-        image->common.alpha_map,
-        x - image->common.alpha_origin_x,
-        y - image->common.alpha_origin_y,
-        width,
-        buffer);
-}
-
-/* Fetch functions */
-
-/*
- * Fetch a single pixel through the image's fetch_pixel_32 hook, ignoring
- * any alpha map.  When check_bounds is set, coordinates outside the image
- * yield 0 instead of being passed to the hook.
- */
-static force_inline uint32_t
-fetch_pixel_no_alpha (bits_image_t *image,
- int x, int y, pixman_bool_t check_bounds)
-{
-    if (check_bounds)
-    {
-        int inside_x = (x >= 0 && x < image->width);
-        int inside_y = (y >= 0 && y < image->height);
-
-        if (!inside_x || !inside_y)
-            return 0;
-    }
-
-    return image->fetch_pixel_32 (image, x, y);
-}
-
-typedef uint32_t (* get_pixel_t) (bits_image_t *image,
- int x, int y, pixman_bool_t check_bounds);
-
-/*
- * Map *coord into the range covered by an image dimension of the given
- * size, according to the repeat mode.  PIXMAN_REPEAT_NONE and any
- * unrecognised mode leave *coord untouched.
- */
-static force_inline void
-repeat (pixman_repeat_t repeat, int size, int *coord)
-{
-    if (repeat == PIXMAN_REPEAT_NORMAL)
-    {
-        /* Tile: wrap around the image size. */
-        *coord = MOD (*coord, size);
-    }
-    else if (repeat == PIXMAN_REPEAT_PAD)
-    {
-        /* Pad: clamp to the first/last valid index. */
-        *coord = CLIP (*coord, 0, size - 1);
-    }
-    else if (repeat == PIXMAN_REPEAT_REFLECT)
-    {
-        /* Reflect: wrap over two image sizes, then mirror the second half. */
-        *coord = MOD (*coord, size * 2);
-
-        if (*coord >= size)
-            *coord = size * 2 - *coord - 1;
-    }
-}
-
-/*
- * Nearest-neighbour sample: convert the fixed-point position (less one
- * epsilon) to integer pixel coordinates and fetch that pixel through
- * get_pixel.  With a repeat mode the coordinates are first folded into
- * the image and fetched without bounds checking; without one they are
- * fetched with bounds checking.
- */
-static force_inline uint32_t
-bits_image_fetch_pixel_nearest (bits_image_t *image,
- pixman_fixed_t x,
- pixman_fixed_t y,
- get_pixel_t get_pixel)
-{
-    int px = pixman_fixed_to_int (x - pixman_fixed_e);
-    int py = pixman_fixed_to_int (y - pixman_fixed_e);
-
-    if (image->common.repeat == PIXMAN_REPEAT_NONE)
-        return get_pixel (image, px, py, TRUE);
-
-    repeat (image->common.repeat, image->width, &px);
-    repeat (image->common.repeat, image->height, &py);
-
-    return get_pixel (image, px, py, FALSE);
-}
-
-#if SIZEOF_LONG > 4
-
-/*
- * Bilinear blend of four 32-bit pixels, 64-bit arithmetic variant
- * (compiled when SIZEOF_LONG > 4).
- *
- * tl, tr, bl, br are the top-left, top-right, bottom-left and
- * bottom-right samples; per the masks below, alpha is the top byte and
- * blue the bottom byte of each pixel.  distx and disty are the
- * horizontal and vertical blend weights; the callers in this file pass
- * values in the range 0..255.
- *
- * Returns the blended pixel in the same channel layout.
- */
-static force_inline uint32_t
-bilinear_interpolation (uint32_t tl, uint32_t tr,
- uint32_t bl, uint32_t br,
- int distx, int disty)
-{
- uint64_t distxy, distxiy, distixy, distixiy;
- uint64_t tl64, tr64, bl64, br64;
- uint64_t f, r;
-
- /* Weights of br, tr, bl and tl respectively; they sum to 256 * 256 */
- distxy = distx * disty;
- distxiy = distx * (256 - disty);
- distixy = (256 - distx) * disty;
- distixiy = (256 - distx) * (256 - disty);
-
- /* Alpha and Blue */
- tl64 = tl & 0xff0000ff;
- tr64 = tr & 0xff0000ff;
- bl64 = bl & 0xff0000ff;
- br64 = br & 0xff0000ff;
-
- /* The weighted sum carries 16 extra fraction bits, so the mask keeps
-  * the top 8 bits of each channel: blue at bits 16-23, alpha at 40-47.
-  */
- f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
- r = f & 0x0000ff0000ff0000ull;
-
- /* Red and Green */
- /* Red is moved up to bits 32-39 so that its weighted sum cannot
-  * run into the green sum; green stays at bits 8-15.
-  */
- tl64 = tl;
- tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull);
-
- tr64 = tr;
- tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull);
-
- bl64 = bl;
- bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull);
-
- br64 = br;
- br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull);
-
- /* Red is shifted back down by 16 to land at bits 32-39; green is
-  * taken in place at bits 24-31.
-  */
- f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
- r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull);
-
- /* Drop the 16 fraction bits to return to the 32-bit pixel layout */
- return (uint32_t)(r >> 16);
-}
-
-#else
-
-/*
- * Bilinear blend of four 32-bit pixels, 32-bit arithmetic variant
- * (compiled when SIZEOF_LONG > 4 does not hold).
- *
- * tl, tr, bl, br are the top-left, top-right, bottom-left and
- * bottom-right samples; distx and disty are the horizontal and vertical
- * blend weights (the callers in this file pass values in 0..255).
- * Channels are processed one at a time so that every weighted sum fits
- * in 32 bits.
- *
- * Returns the blended pixel in the same channel layout as the inputs.
- */
-static force_inline uint32_t
-bilinear_interpolation (uint32_t tl, uint32_t tr,
- uint32_t bl, uint32_t br,
- int distx, int disty)
-{
- int distxy, distxiy, distixy, distixiy;
- uint32_t f, r;
-
- /* Weights of br, tr, bl and tl respectively; they sum to 256 * 256 */
- distxy = distx * disty;
- distxiy = (distx << 8) - distxy; /* distx * (256 - disty) */
- distixy = (disty << 8) - distxy; /* disty * (256 - distx) */
- distixiy =
- 256 * 256 - (disty << 8) -
- (distx << 8) + distxy; /* (256 - distx) * (256 - disty) */
-
- /* Blue */
- /* Weighted sum carries 16 fraction bits: result sits at bits 16-23 */
- r = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
- + (bl & 0x000000ff) * distixy + (br & 0x000000ff) * distxy;
-
- /* Green */
- /* Input at bits 8-15, so the weighted sum sits at bits 24-31 */
- f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
- + (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy;
- r |= f & 0xff000000;
-
- /* Bring red/alpha down into the low half of the inputs, and drop the
-  * 16 fraction bits from the blue/green result (now at bits 0-15).
-  */
- tl >>= 16;
- tr >>= 16;
- bl >>= 16;
- br >>= 16;
- r >>= 16;
-
- /* Red */
- f = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
- + (bl & 0x000000ff) * distixy + (br & 0x000000ff) * distxy;
- r |= f & 0x00ff0000;
-
- /* Alpha */
- f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
- + (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy;
- r |= f & 0xff000000;
-
- return r;
-}
-
-#endif
-
-/*
- * Bilinear sample at fixed-point position (x, y): pick the 2x2 block of
- * pixels around the position (shifted by half a pixel), fetch them
- * through get_pixel and blend them with bilinear_interpolation, using
- * 8-bit fractional weights taken from the shifted position.
- *
- * With a repeat mode the four coordinates are folded into the image and
- * fetched without bounds checking; without one they are fetched with
- * bounds checking.
- */
-static force_inline uint32_t
-bits_image_fetch_pixel_bilinear (bits_image_t *image,
- pixman_fixed_t x,
- pixman_fixed_t y,
- get_pixel_t get_pixel)
-{
-    pixman_repeat_t mode = image->common.repeat;
-    uint32_t tl, tr, bl, br;
-    int32_t distx, disty;
-    int left, top, right, bottom;
-
-    /* Shift by half a pixel before splitting into integer/fraction. */
-    left = x - pixman_fixed_1 / 2;
-    top = y - pixman_fixed_1 / 2;
-
-    /* Top 8 bits of the 16-bit fraction are the blend weights. */
-    distx = (left >> 8) & 0xff;
-    disty = (top >> 8) & 0xff;
-
-    left = pixman_fixed_to_int (left);
-    top = pixman_fixed_to_int (top);
-    right = left + 1;
-    bottom = top + 1;
-
-    if (mode == PIXMAN_REPEAT_NONE)
-    {
-        tl = get_pixel (image, left, top, TRUE);
-        tr = get_pixel (image, right, top, TRUE);
-        bl = get_pixel (image, left, bottom, TRUE);
-        br = get_pixel (image, right, bottom, TRUE);
-    }
-    else
-    {
-        repeat (mode, image->width, &left);
-        repeat (mode, image->height, &top);
-        repeat (mode, image->width, &right);
-        repeat (mode, image->height, &bottom);
-
-        tl = get_pixel (image, left, top, FALSE);
-        bl = get_pixel (image, left, bottom, FALSE);
-        tr = get_pixel (image, right, top, FALSE);
-        br = get_pixel (image, right, bottom, FALSE);
-    }
-
-    return bilinear_interpolation (tl, tr, bl, br, distx, disty);
-}
-
-/*
- * Fetch one scanline with bilinear filtering from a non-repeating source,
- * reading 32-bit pixels straight out of the image's bits.
- *
- * ima:    source image; ima->bits.common.transform is dereferenced
- *         unconditionally, so it must not be NULL.
- * offset: x coordinate of the first destination pixel.
- * line:   y coordinate of the destination scanline.
- * width:  number of pixels to produce.
- * buffer: receives the pixels.
- * mask:   optional; when non-NULL, pixels in the main part and on the
- *         right edge whose mask entry is zero are left untouched.  The
- *         zero-fill and left-edge loops write regardless of the mask.
- *
- * Only matrix[0][0] of the transform is used as the per-pixel x step and
- * y is computed once for the whole scanline.
- * NOTE(review): presumably this fetcher is only selected for transforms
- * that scale/translate without rotation or skew - confirm at the caller.
- */
-static void
-bits_image_fetch_bilinear_no_repeat_8888 (pixman_image_t * ima,
- int offset,
- int line,
- int width,
- uint32_t * buffer,
- const uint32_t * mask)
-{
-    bits_image_t *bits = &ima->bits;
-    pixman_fixed_t x_top, x_bottom, x;
-    pixman_fixed_t ux_top, ux_bottom, ux;
-    pixman_vector_t v;
-    uint32_t top_mask, bottom_mask;
-    uint32_t *top_row;
-    uint32_t *bottom_row;
-    uint32_t *end;
-    uint32_t zero[2] = { 0, 0 };
-    /* Stand-in mask used when the caller passes none.  It lives at
-     * function scope because 'mask' may point at it for the rest of the
-     * function.
-     */
-    uint32_t mask_bits = 1;
-    int y, y1, y2;
-    int disty;
-    int mask_inc;
-    int w;
-
-    /* reference point is the center of the pixel */
-    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    if (!pixman_transform_point_3d (bits->common.transform, &v))
-        return;
-
-    ux = ux_top = ux_bottom = bits->common.transform->matrix[0][0];
-    x = x_top = x_bottom = v.vector[0] - pixman_fixed_1/2;
-
-    y = v.vector[1] - pixman_fixed_1/2;
-    disty = (y >> 8) & 0xff;
-
-    /* Load the pointers to the first and second lines of the source
-     * image that the bilinear code must read.
-     *
-     * If either line lies outside the image, its pointer is redirected
-     * to a small dummy area of zeros, and the matching position and
-     * step are set to 0 so that the index into the dummy area never
-     * moves inside the loops below.
-     */
-    y1 = pixman_fixed_to_int (y);
-    y2 = y1 + 1;
-
-    if (y1 < 0 || y1 >= bits->height)
-    {
-        top_row = zero;
-        x_top = 0;
-        ux_top = 0;
-    }
-    else
-    {
-        top_row = bits->bits + y1 * bits->rowstride;
-        x_top = x;
-        ux_top = ux;
-    }
-
-    if (y2 < 0 || y2 >= bits->height)
-    {
-        bottom_row = zero;
-        x_bottom = 0;
-        ux_bottom = 0;
-    }
-    else
-    {
-        bottom_row = bits->bits + y2 * bits->rowstride;
-        x_bottom = x;
-        ux_bottom = ux;
-    }
-
-    /* Instead of checking whether the operation uses the mask in
-     * each loop iteration, verify this only once and prepare the
-     * variables to make the code smaller inside the loop.
-     */
-    if (!mask)
-    {
-        /* No mask: point at a constant non-zero word and never advance */
-        mask_inc = 0;
-        mask = &mask_bits;
-    }
-    else
-    {
-        /* If we have a mask, step through it one entry per pixel */
-        mask_inc = 1;
-    }
-
-    /* If both are zero, then the whole thing is zero */
-    if (top_row == zero && bottom_row == zero)
-    {
-        memset (buffer, 0, width * sizeof (uint32_t));
-        return;
-    }
-    else if (bits->format == PIXMAN_x8r8g8b8)
-    {
-        /* x8r8g8b8: force alpha to 0xff for rows that are really inside
-         * the image, but keep the dummy zero row fully transparent.
-         */
-        if (top_row == zero)
-        {
-            top_mask = 0;
-            bottom_mask = 0xff000000;
-        }
-        else if (bottom_row == zero)
-        {
-            top_mask = 0xff000000;
-            bottom_mask = 0;
-        }
-        else
-        {
-            top_mask = 0xff000000;
-            bottom_mask = 0xff000000;
-        }
-    }
-    else
-    {
-        top_mask = 0;
-        bottom_mask = 0;
-    }
-
-    end = buffer + width;
-
-    /* Zero fill to the left of the image */
-    while (buffer < end && x < pixman_fixed_minus_1)
-    {
-        *buffer++ = 0;
-        x += ux;
-        x_top += ux_top;
-        x_bottom += ux_bottom;
-        mask += mask_inc;
-    }
-
-    /* Left edge: only the right-hand samples are inside the image */
-    while (buffer < end && x < 0)
-    {
-        uint32_t tr, br;
-        int32_t distx;
-
-        tr = top_row[pixman_fixed_to_int (x_top) + 1] | top_mask;
-        br = bottom_row[pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
-
-        distx = (x >> 8) & 0xff;
-
-        *buffer++ = bilinear_interpolation (0, tr, 0, br, distx, disty);
-
-        x += ux;
-        x_top += ux_top;
-        x_bottom += ux_bottom;
-        mask += mask_inc;
-    }
-
-    /* Main part: all four samples are inside the image horizontally */
-    w = pixman_int_to_fixed (bits->width - 1);
-
-    while (buffer < end && x < w)
-    {
-        if (*mask)
-        {
-            uint32_t tl, tr, bl, br;
-            int32_t distx;
-
-            tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
-            tr = top_row [pixman_fixed_to_int (x_top) + 1] | top_mask;
-            bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
-            br = bottom_row [pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
-
-            distx = (x >> 8) & 0xff;
-
-            *buffer = bilinear_interpolation (tl, tr, bl, br, distx, disty);
-        }
-
-        buffer++;
-        x += ux;
-        x_top += ux_top;
-        x_bottom += ux_bottom;
-        mask += mask_inc;
-    }
-
-    /* Right edge: only the left-hand samples are inside the image */
-    w = pixman_int_to_fixed (bits->width);
-    while (buffer < end && x < w)
-    {
-        if (*mask)
-        {
-            uint32_t tl, bl;
-            int32_t distx;
-
-            tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
-            bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
-
-            distx = (x >> 8) & 0xff;
-
-            *buffer = bilinear_interpolation (tl, 0, bl, 0, distx, disty);
-        }
-
-        buffer++;
-        x += ux;
-        x_top += ux_top;
-        x_bottom += ux_bottom;
-        mask += mask_inc;
-    }
-
-    /* Zero fill to the right of the image */
-    while (buffer < end)
-        *buffer++ = 0;
-}
-
-/*
- * Sample the image at fixed-point position (x, y) through the
- * convolution kernel stored in image->common.filter_params.
- *
- * As read by this function, filter_params starts with the kernel width
- * and height as fixed-point values, followed by one fixed-point
- * coefficient per kernel cell, one row after another.  Each non-zero
- * coefficient multiplies the 8-bit channels of the pixel fetched through
- * get_pixel; the per-channel sums are shifted down by 16 bits and
- * clamped to 0..0xff before being packed into the returned pixel.
- */
-static force_inline uint32_t
-bits_image_fetch_pixel_convolution (bits_image_t *image,
- pixman_fixed_t x,
- pixman_fixed_t y,
- get_pixel_t get_pixel)
-{
- pixman_fixed_t *params = image->common.filter_params;
- /* Offset from the sample position to the kernel's top-left corner */
- int x_off = (params[0] - pixman_fixed_1) >> 1;
- int y_off = (params[1] - pixman_fixed_1) >> 1;
- int32_t cwidth = pixman_fixed_to_int (params[0]);
- int32_t cheight = pixman_fixed_to_int (params[1]);
- int32_t srtot, sgtot, sbtot, satot;
- int32_t i, j, x1, x2, y1, y2;
- pixman_repeat_t repeat_mode = image->common.repeat;
- int width = image->width;
- int height = image->height;
-
- /* Skip the two size entries; params now walks the coefficients */
- params += 2;
-
- x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off);
- y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off);
- x2 = x1 + cwidth;
- y2 = y1 + cheight;
-
- srtot = sgtot = sbtot = satot = 0;
-
- for (i = y1; i < y2; ++i)
- {
- for (j = x1; j < x2; ++j)
- {
- int rx = j;
- int ry = i;
-
- pixman_fixed_t f = *params;
-
- /* Zero coefficients contribute nothing, so skip the fetch */
- if (f)
- {
- uint32_t pixel;
-
- if (repeat_mode != PIXMAN_REPEAT_NONE)
- {
- repeat (repeat_mode, width, &rx);
- repeat (repeat_mode, height, &ry);
-
- pixel = get_pixel (image, rx, ry, FALSE);
- }
- else
- {
- pixel = get_pixel (image, rx, ry, TRUE);
- }
-
- srtot += RED_8 (pixel) * f;
- sgtot += GREEN_8 (pixel) * f;
- sbtot += BLUE_8 (pixel) * f;
- satot += ALPHA_8 (pixel) * f;
- }
-
- /* Advance to the next coefficient whether or not it was used */
- params++;
- }
- }
-
- /* Remove the 16 fraction bits contributed by the coefficients */
- satot >>= 16;
- srtot >>= 16;
- sgtot >>= 16;
- sbtot >>= 16;
-
- satot = CLIP (satot, 0, 0xff);
- srtot = CLIP (srtot, 0, 0xff);
- sgtot = CLIP (sgtot, 0, 0xff);
- sbtot = CLIP (sbtot, 0, 0xff);
-
- return ((satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot));
-}
-
-/*
- * Dispatch a single-pixel fetch to the sampler selected by the image's
- * filter: NEAREST/FAST use the nearest sampler, BILINEAR/GOOD/BEST use
- * the bilinear sampler, CONVOLUTION uses the convolution sampler.  Any
- * other filter value yields 0.
- */
-static force_inline uint32_t
-bits_image_fetch_pixel_filtered (bits_image_t *image,
- pixman_fixed_t x,
- pixman_fixed_t y,
- get_pixel_t get_pixel)
-{
-    switch (image->common.filter)
-    {
-    case PIXMAN_FILTER_CONVOLUTION:
-        return bits_image_fetch_pixel_convolution (image, x, y, get_pixel);
-
-    case PIXMAN_FILTER_BEST:
-    case PIXMAN_FILTER_GOOD:
-    case PIXMAN_FILTER_BILINEAR:
-        return bits_image_fetch_pixel_bilinear (image, x, y, get_pixel);
-
-    case PIXMAN_FILTER_FAST:
-    case PIXMAN_FILTER_NEAREST:
-        return bits_image_fetch_pixel_nearest (image, x, y, get_pixel);
-
-    default:
-        break;
-    }
-
-    /* Unknown filter */
-    return 0;
-}
-
-/*
- * Fetch a scanline through the image's (optional) transform, stepping
- * the source position by the first column of the transform matrix for
- * each destination pixel, and sampling with the image's filter via
- * fetch_pixel_no_alpha.  Pixels whose mask entry is zero are skipped and
- * their buffer entries left untouched.  If the transform of the start
- * point fails, nothing is written.
- */
-static void
-bits_image_fetch_affine_no_alpha (pixman_image_t * image,
- int offset,
- int line,
- int width,
- uint32_t * buffer,
- const uint32_t * mask)
-{
-    pixman_vector_t v;
-    pixman_fixed_t x, y;
-    /* Identity step, used when the image has no transform */
-    pixman_fixed_t ux = pixman_fixed_1;
-    pixman_fixed_t uy = 0;
-    int i;
-
-    /* reference point is the center of the pixel */
-    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    if (image->common.transform)
-    {
-        if (!pixman_transform_point_3d (image->common.transform, &v))
-            return;
-
-        ux = image->common.transform->matrix[0][0];
-        uy = image->common.transform->matrix[1][0];
-    }
-
-    x = v.vector[0];
-    y = v.vector[1];
-
-    /* The position advances for every pixel, masked out or not */
-    for (i = 0; i < width; ++i, x += ux, y += uy)
-    {
-        if (mask && !mask[i])
-            continue;
-
-        buffer[i] = bits_image_fetch_pixel_filtered (
-            &image->bits, x, y, fetch_pixel_no_alpha);
-    }
-}
-
-/* General fetcher */
-/*
- * Fetch a single pixel, taking the image's alpha map into account.
- *
- * When check_bounds is set, coordinates outside the image yield 0.
- * If an alpha map is attached, the pixel's top byte is replaced by the
- * alpha fetched from the map at the position translated by the alpha
- * origin, or by 0 when that position falls outside the map.
- */
-static force_inline uint32_t
-fetch_pixel_general (bits_image_t *image, int x, int y, pixman_bool_t check_bounds)
-{
-    uint32_t color;
-    uint32_t alpha;
-
-    if (check_bounds &&
-        (x < 0 || x >= image->width || y < 0 || y >= image->height))
-    {
-        return 0;
-    }
-
-    color = image->fetch_pixel_32 (image, x, y);
-
-    /* No alpha map: the pixel is returned as fetched */
-    if (!image->common.alpha_map)
-        return color;
-
-    x -= image->common.alpha_origin_x;
-    y -= image->common.alpha_origin_y;
-
-    alpha = 0;
-
-    if (x >= 0 && x < image->common.alpha_map->width &&
-        y >= 0 && y < image->common.alpha_map->height)
-    {
-        alpha = image->common.alpha_map->fetch_pixel_32 (
-            image->common.alpha_map, x, y);
-
-        alpha = ALPHA_8 (alpha);
-    }
-
-    /* Keep the colour channels, substitute the alpha byte */
-    color &= 0x00ffffff;
-    color |= (alpha << 24);
-
-    return color;
-}
-
-/*
- * Fetch a scanline through the image's (optional) transform, including
- * the homogeneous coordinate: for every destination pixel the stepped
- * x and y are divided by the stepped w (or set to 0 when w is 0) before
- * sampling with the image's filter via fetch_pixel_general.  Pixels
- * whose mask entry is zero are skipped and their buffer entries left
- * untouched.  If the transform of the start point fails, nothing is
- * written.
- */
-static void
-bits_image_fetch_general (pixman_image_t * image,
- int offset,
- int line,
- int width,
- uint32_t * buffer,
- const uint32_t * mask)
-{
-    pixman_vector_t v;
-    pixman_fixed_t x, y, w;
-    /* Identity step, used when the image has no transform */
-    pixman_fixed_t ux = pixman_fixed_1;
-    pixman_fixed_t uy = 0;
-    pixman_fixed_t uw = 0;
-    int i;
-
-    /* reference point is the center of the pixel */
-    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    if (image->common.transform)
-    {
-        if (!pixman_transform_point_3d (image->common.transform, &v))
-            return;
-
-        ux = image->common.transform->matrix[0][0];
-        uy = image->common.transform->matrix[1][0];
-        uw = image->common.transform->matrix[2][0];
-    }
-
-    x = v.vector[0];
-    y = v.vector[1];
-    w = v.vector[2];
-
-    /* The position advances for every pixel, masked out or not */
-    for (i = 0; i < width; ++i, x += ux, y += uy, w += uw)
-    {
-        pixman_fixed_t x0 = 0;
-        pixman_fixed_t y0 = 0;
-
-        if (mask && !mask[i])
-            continue;
-
-        /* Perspective divide; a zero w samples position (0, 0) */
-        if (w != 0)
-        {
-            x0 = ((pixman_fixed_48_16_t)x << 16) / w;
-            y0 = ((pixman_fixed_48_16_t)y << 16) / w;
-        }
-
-        buffer[i] = bits_image_fetch_pixel_filtered (
-            &image->bits, x0, y0, fetch_pixel_general);
-    }
-}
-
-static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
-
-typedef uint32_t (* convert_pixel_t) (const uint8_t *row, int x);
-
-/* Fetch 'width' bilinearly filtered pixels into 'buffer', walking the source
- * image along the affine-transformed destination scanline that starts at
- * (offset, line).  Entries whose mask word is zero are left untouched.
- *
- * convert_pixel, format and repeat_mode are supplied by the
- * MAKE_BILINEAR_FETCHER wrappers, one wrapper per format / repeat pair.
- */
-static force_inline void
-bits_image_fetch_bilinear_affine (pixman_image_t * image,
-                                  int              offset,
-                                  int              line,
-                                  int              width,
-                                  uint32_t *       buffer,
-                                  const uint32_t * mask,
-
-                                  convert_pixel_t      convert_pixel,
-                                  pixman_format_code_t format,
-                                  pixman_repeat_t      repeat_mode)
-{
-    bits_image_t *bits = &image->bits;
-    const int src_width = image->bits.width;
-    const int src_height = image->bits.height;
-    /* OR-ed into every pixel read from a format that has no alpha bits */
-    const uint32_t opaque = PIXMAN_FORMAT_A (format) ? 0 : 0xff000000;
-    pixman_fixed_t x, y;
-    pixman_fixed_t ux, uy;
-    pixman_vector_t v;
-    int i;
-
-    /* The sample position is the centre of the destination pixel */
-    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    if (!pixman_transform_point_3d (image->common.transform, &v))
-        return;
-
-    /* Per-pixel step in source space */
-    ux = image->common.transform->matrix[0][0];
-    uy = image->common.transform->matrix[1][0];
-
-    x = v.vector[0];
-    y = v.vector[1];
-
-    for (i = 0; i < width; ++i, x += ux, y += uy)
-    {
-        const uint8_t *top_row;
-        const uint8_t *bottom_row;
-        uint32_t tl, tr, bl, br;
-        int32_t distx, disty;
-        int x1, y1, x2, y2;
-
-        if (mask && !mask[i])
-            continue;
-
-        x1 = x - pixman_fixed_1 / 2;
-        y1 = y - pixman_fixed_1 / 2;
-
-        /* Top eight fractional bits are the interpolation weights */
-        distx = (x1 >> 8) & 0xff;
-        disty = (y1 >> 8) & 0xff;
-
-        y1 = pixman_fixed_to_int (y1);
-        y2 = y1 + 1;
-        x1 = pixman_fixed_to_int (x1);
-        x2 = x1 + 1;
-
-        if (repeat_mode != PIXMAN_REPEAT_NONE)
-        {
-            /* Every coordinate is folded back into the image first */
-            repeat (repeat_mode, src_width, &x1);
-            repeat (repeat_mode, src_height, &y1);
-            repeat (repeat_mode, src_width, &x2);
-            repeat (repeat_mode, src_height, &y2);
-
-            top_row = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
-            bottom_row = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
-
-            tl = convert_pixel (top_row, x1) | opaque;
-            tr = convert_pixel (top_row, x2) | opaque;
-            bl = convert_pixel (bottom_row, x1) | opaque;
-            br = convert_pixel (bottom_row, x2) | opaque;
-        }
-        else
-        {
-            /* PIXMAN_FORMAT_BPP() yields an unsigned value, and using it
-             * directly would make the pointer arithmetic below unsigned.
-             * x1 can be negative here, which would crash on 64 bit
-             * architectures, so the value is stored in a signed int.
-             */
-            int bpp = PIXMAN_FORMAT_BPP (format);
-            uint32_t top_fill, bottom_fill;
-
-            /* All four samples are outside the image */
-            if (x1 >= src_width || x2 < 0 || y1 >= src_height || y2 < 0)
-            {
-                buffer[i] = 0;
-                continue;
-            }
-
-            if (y2 == 0)
-            {
-                /* Upper row is just above the image */
-                top_row = zero;
-                top_fill = 0;
-            }
-            else
-            {
-                top_row = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
-                top_row += bpp / 8 * x1;
-
-                top_fill = opaque;
-            }
-
-            if (y1 == src_height - 1)
-            {
-                /* Lower row is just below the image */
-                bottom_row = zero;
-                bottom_fill = 0;
-            }
-            else
-            {
-                bottom_row = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
-                bottom_row += bpp / 8 * x1;
-
-                bottom_fill = opaque;
-            }
-
-            /* The rows already point at column x1, so the left and right
-             * samples are at indices 0 and 1.
-             */
-            if (x2 == 0)
-            {
-                tl = 0;
-                bl = 0;
-            }
-            else
-            {
-                tl = convert_pixel (top_row, 0) | top_fill;
-                bl = convert_pixel (bottom_row, 0) | bottom_fill;
-            }
-
-            if (x1 == src_width - 1)
-            {
-                tr = 0;
-                br = 0;
-            }
-            else
-            {
-                tr = convert_pixel (top_row, 1) | top_fill;
-                br = convert_pixel (bottom_row, 1) | bottom_fill;
-            }
-        }
-
-        buffer[i] = bilinear_interpolation (
-            tl, tr, bl, br, distx, disty);
-    }
-}
-
-/* Fetch 'width' nearest-neighbour pixels into 'buffer', walking the source
- * image along the affine-transformed destination scanline that starts at
- * (offset, line).  Entries whose mask word is zero are left untouched.
- *
- * convert_pixel, format and repeat_mode are supplied by the
- * MAKE_NEAREST_FETCHER wrappers, one wrapper per format / repeat pair.
- */
-static force_inline void
-bits_image_fetch_nearest_affine (pixman_image_t * image,
-                                 int              offset,
-                                 int              line,
-                                 int              width,
-                                 uint32_t *       buffer,
-                                 const uint32_t * mask,
-
-                                 convert_pixel_t      convert_pixel,
-                                 pixman_format_code_t format,
-                                 pixman_repeat_t      repeat_mode)
-{
-    bits_image_t *bits = &image->bits;
-    const int src_width = image->bits.width;
-    const int src_height = image->bits.height;
-    /* OR-ed into every pixel read from a format that has no alpha bits */
-    const uint32_t opaque = PIXMAN_FORMAT_A (format) ? 0 : 0xff000000;
-    pixman_fixed_t x, y;
-    pixman_fixed_t ux, uy;
-    pixman_vector_t v;
-    int i;
-
-    /* The sample position is the centre of the destination pixel */
-    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    if (!pixman_transform_point_3d (image->common.transform, &v))
-        return;
-
-    /* Per-pixel step in source space */
-    ux = image->common.transform->matrix[0][0];
-    uy = image->common.transform->matrix[1][0];
-
-    x = v.vector[0];
-    y = v.vector[1];
-
-    for (i = 0; i < width; ++i, x += ux, y += uy)
-    {
-        const uint8_t *row;
-        int x0, y0;
-
-        if (mask && !mask[i])
-            continue;
-
-        x0 = pixman_fixed_to_int (x - pixman_fixed_e);
-        y0 = pixman_fixed_to_int (y - pixman_fixed_e);
-
-        if (repeat_mode == PIXMAN_REPEAT_NONE)
-        {
-            /* Without repeat, samples outside the image are zero */
-            if (y0 < 0 || y0 >= src_height || x0 < 0 || x0 >= src_width)
-            {
-                buffer[i] = 0;
-                continue;
-            }
-        }
-        else
-        {
-            repeat (repeat_mode, src_width, &x0);
-            repeat (repeat_mode, src_height, &y0);
-        }
-
-        row = (uint8_t *)bits->bits + bits->rowstride * 4 * y0;
-
-        buffer[i] = convert_pixel (row, x0) | opaque;
-    }
-}
-
-/* a8r8g8b8 scanline reader: the row is an array of 32-bit pixels that are
- * returned unchanged.
- */
-static force_inline uint32_t
-convert_a8r8g8b8 (const uint8_t *row, int x)
-{
-    const uint32_t *pixels = (const uint32_t *)row;
-
-    return pixels[x];
-}
-
-/* x8r8g8b8 scanline reader: the row is an array of 32-bit pixels that are
- * returned unchanged; the affine fetchers OR in 0xff000000 themselves for
- * formats without alpha bits.
- */
-static force_inline uint32_t
-convert_x8r8g8b8 (const uint8_t *row, int x)
-{
-    const uint32_t *pixels = (const uint32_t *)row;
-
-    return pixels[x];
-}
-
-/* a8 scanline reader: the row is an array of 8-bit alpha values; the value
- * at index x is returned in the top byte of the result, all other bits zero.
- */
-static force_inline uint32_t
-convert_a8 (const uint8_t *row, int x)
-{
-    /* Widen before shifting: a bare uint8_t is promoted to signed int, and
-     * shifting a value >= 0x80 left by 24 would overflow into the sign bit,
-     * which is undefined behaviour.
-     */
-    return (uint32_t) row[x] << 24;
-}
-
-/* r5g6b5 scanline reader: the row is an array of 16-bit pixels; the value at
- * index x is expanded with CONVERT_0565_TO_0888.
- */
-static force_inline uint32_t
-convert_r5g6b5 (const uint8_t *row, int x)
-{
-    const uint16_t *pixels = (const uint16_t *)row;
-
-    return CONVERT_0565_TO_0888 (pixels[x]);
-}
-
-/* Defines bits_image_fetch_bilinear_affine_<name> (), a scanline fetcher
- * taking (image, offset, line, width, buffer, mask) that forwards to
- * bits_image_fetch_bilinear_affine () with convert_<format>,
- * PIXMAN_<format> and repeat_mode filled in.
- *
- * The expansion ends in an unterminated "extern int no_such_variable"
- * declaration so that the semicolon written after the macro invocation has
- * something to terminate.
- */
-#define MAKE_BILINEAR_FETCHER(name, format, repeat_mode) \
- static void \
- bits_image_fetch_bilinear_affine_ ## name (pixman_image_t *image, \
- int offset, \
- int line, \
- int width, \
- uint32_t * buffer, \
- const uint32_t * mask) \
- { \
- bits_image_fetch_bilinear_affine (image, offset, line, \
- width, buffer, mask, \
- convert_ ## format, \
- PIXMAN_ ## format, \
- repeat_mode); \
- } \
- extern int no_such_variable
-
-/* Defines bits_image_fetch_nearest_affine_<name> (), a scanline fetcher
- * taking (image, offset, line, width, buffer, mask) that forwards to
- * bits_image_fetch_nearest_affine () with convert_<format>,
- * PIXMAN_<format> and repeat_mode filled in.
- *
- * The expansion ends in an unterminated "extern int no_such_variable"
- * declaration so that the semicolon written after the macro invocation has
- * something to terminate.
- */
-#define MAKE_NEAREST_FETCHER(name, format, repeat_mode) \
- static void \
- bits_image_fetch_nearest_affine_ ## name (pixman_image_t *image, \
- int offset, \
- int line, \
- int width, \
- uint32_t * buffer, \
- const uint32_t * mask) \
- { \
- bits_image_fetch_nearest_affine (image, offset, line, \
- width, buffer, mask, \
- convert_ ## format, \
- PIXMAN_ ## format, \
- repeat_mode); \
- } \
- extern int no_such_variable
-
-/* Instantiates both the nearest and the bilinear affine fetcher for one
- * format / repeat-mode pair.
- *
- * The last expansion deliberately carries no semicolon of its own: the one
- * written after MAKE_FETCHERS (...) terminates its trailing declaration, so
- * no stray empty declaration is left at file scope.
- */
-#define MAKE_FETCHERS(name, format, repeat_mode) \
-    MAKE_NEAREST_FETCHER (name, format, repeat_mode); \
-    MAKE_BILINEAR_FETCHER (name, format, repeat_mode)
-
-/* Instantiate the nearest and bilinear affine fetchers for each of the four
- * formats (a8r8g8b8, x8r8g8b8, a8, r5g6b5) in each of the four repeat modes.
- * The generated functions are named
- * bits_image_fetch_{nearest,bilinear}_affine_<repeat>_<format> and are
- * referenced from the fetcher_info[] table through AFFINE_FAST_PATHS.
- */
-MAKE_FETCHERS (pad_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_PAD);
-MAKE_FETCHERS (none_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_NONE);
-MAKE_FETCHERS (reflect_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_REFLECT);
-MAKE_FETCHERS (normal_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_NORMAL);
-MAKE_FETCHERS (pad_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_PAD);
-MAKE_FETCHERS (none_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_NONE);
-MAKE_FETCHERS (reflect_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_REFLECT);
-MAKE_FETCHERS (normal_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_NORMAL);
-MAKE_FETCHERS (pad_a8, a8, PIXMAN_REPEAT_PAD);
-MAKE_FETCHERS (none_a8, a8, PIXMAN_REPEAT_NONE);
-MAKE_FETCHERS (reflect_a8, a8, PIXMAN_REPEAT_REFLECT);
-MAKE_FETCHERS (normal_a8, a8, PIXMAN_REPEAT_NORMAL);
-MAKE_FETCHERS (pad_r5g6b5, r5g6b5, PIXMAN_REPEAT_PAD);
-MAKE_FETCHERS (none_r5g6b5, r5g6b5, PIXMAN_REPEAT_NONE);
-MAKE_FETCHERS (reflect_r5g6b5, r5g6b5, PIXMAN_REPEAT_REFLECT);
-MAKE_FETCHERS (normal_r5g6b5, r5g6b5, PIXMAN_REPEAT_NORMAL);
-
-/* Scanline fetcher for solid images: fills 'width' 32-bit entries of
- * 'buffer' with the pixel fetched at (0, 0).  x, y and mask are not used.
- */
-static void
-bits_image_fetch_solid_32 (pixman_image_t * image,
-                           int              x,
-                           int              y,
-                           int              width,
-                           uint32_t *       buffer,
-                           const uint32_t * mask)
-{
-    const uint32_t solid = image->bits.fetch_pixel_32 (&image->bits, 0, 0);
-    int i;
-
-    for (i = 0; i < width; i++)
-        buffer[i] = solid;
-}
-
-/* Wide variant of the solid fetcher: 'b' is treated as an array of 64-bit
- * pixels and 'width' of them are set to the value fetched at (0, 0).
- * x, y and the mask argument are not used.
- */
-static void
-bits_image_fetch_solid_64 (pixman_image_t * image,
-                           int              x,
-                           int              y,
-                           int              width,
-                           uint32_t *       b,
-                           const uint32_t * unused)
-{
-    uint64_t *wide_buffer = (uint64_t *)b;
-    const uint64_t solid = image->bits.fetch_pixel_64 (&image->bits, 0, 0);
-    int i;
-
-    for (i = 0; i < width; i++)
-        wide_buffer[i] = solid;
-}
-
-/* Fetch 'width' pixels of row y starting at column x from an untransformed
- * image with no repeat.  Everything outside the image is written as zero.
- * When 'wide' is set the buffer holds 64-bit pixels (two uint32_t each)
- * and the 64-bit scanline fetcher is used.
- */
-static void
-bits_image_fetch_untransformed_repeat_none (bits_image_t *image,
-                                            pixman_bool_t wide,
-                                            int           x,
-                                            int           y,
-                                            int           width,
-                                            uint32_t *    buffer)
-{
-    const int pixel_bytes = wide ? 8 : 4;   /* bytes per output pixel */
-    const int pixel_words = wide ? 2 : 1;   /* uint32_t units per pixel */
-    uint32_t run;
-
-    /* The whole row is above or below the image */
-    if (y < 0 || y >= image->height)
-    {
-        memset (buffer, 0, width * pixel_bytes);
-        return;
-    }
-
-    /* Part to the left of the image */
-    if (x < 0)
-    {
-        run = MIN (width, -x);
-
-        memset (buffer, 0, run * pixel_bytes);
-
-        buffer += run * pixel_words;
-        width -= run;
-        x += run;
-    }
-
-    /* Part that overlaps the image */
-    if (x < image->width)
-    {
-        run = MIN (width, image->width - x);
-
-        if (wide)
-            image->fetch_scanline_64 ((pixman_image_t *)image, x, y, run, buffer, NULL);
-        else
-            image->fetch_scanline_32 ((pixman_image_t *)image, x, y, run, buffer, NULL);
-
-        buffer += run * pixel_words;
-        width -= run;
-        x += run;
-    }
-
-    /* Whatever is left lies to the right of the image */
-    memset (buffer, 0, width * pixel_bytes);
-}
-
-/* Fetch 'width' pixels of row y starting at column x from an untransformed
- * image, wrapping both coordinates around the image size.  When 'wide' is
- * set the buffer holds 64-bit pixels (two uint32_t each) and the 64-bit
- * scanline fetcher is used.
- */
-static void
-bits_image_fetch_untransformed_repeat_normal (bits_image_t *image,
-                                              pixman_bool_t wide,
-                                              int           x,
-                                              int           y,
-                                              int           width,
-                                              uint32_t *    buffer)
-{
-    const int pixel_words = wide ? 2 : 1;   /* uint32_t units per pixel */
-    uint32_t run;
-
-    /* Bring the row into [0, height) */
-    while (y < 0)
-        y += image->height;
-
-    while (y >= image->height)
-        y -= image->height;
-
-    while (width)
-    {
-        /* Bring the column into [0, width) */
-        while (x < 0)
-            x += image->width;
-        while (x >= image->width)
-            x -= image->width;
-
-        /* Fetch up to the right edge of the image, then wrap */
-        run = MIN (width, image->width - x);
-
-        if (wide)
-            image->fetch_scanline_64 ((pixman_image_t *)image, x, y, run, buffer, NULL);
-        else
-            image->fetch_scanline_32 ((pixman_image_t *)image, x, y, run, buffer, NULL);
-
-        width -= run;
-        x += run;
-        buffer += run * pixel_words;
-    }
-}
-
-/* 32-bit scanline fetcher for untransformed images: dispatches on the
- * image's repeat mode to the zero-filling or the wrapping helper.  The mask
- * is not used.
- */
-static void
-bits_image_fetch_untransformed_32 (pixman_image_t * image,
-                                   int              x,
-                                   int              y,
-                                   int              width,
-                                   uint32_t *       buffer,
-                                   const uint32_t * mask)
-{
-    if (image->common.repeat != PIXMAN_REPEAT_NONE)
-    {
-        bits_image_fetch_untransformed_repeat_normal (
-            &image->bits, FALSE, x, y, width, buffer);
-        return;
-    }
-
-    bits_image_fetch_untransformed_repeat_none (
-        &image->bits, FALSE, x, y, width, buffer);
-}
-
-/* 64-bit scanline fetcher for untransformed images: dispatches on the
- * image's repeat mode to the zero-filling or the wrapping helper, asking
- * for wide pixels.  The last argument is not used.
- */
-static void
-bits_image_fetch_untransformed_64 (pixman_image_t * image,
-                                   int              x,
-                                   int              y,
-                                   int              width,
-                                   uint32_t *       buffer,
-                                   const uint32_t * unused)
-{
-    if (image->common.repeat != PIXMAN_REPEAT_NONE)
-    {
-        bits_image_fetch_untransformed_repeat_normal (
-            &image->bits, TRUE, x, y, width, buffer);
-        return;
-    }
-
-    bits_image_fetch_untransformed_repeat_none (
-        &image->bits, TRUE, x, y, width, buffer);
-}
-
-/* One row of the fetcher_info[] table: a pair of scanline fetchers together
- * with the conditions under which bits_image_property_changed () selects
- * them.
- */
-typedef struct
-{
- /* Format the entry applies to; PIXMAN_any matches every format and
-  * PIXMAN_null marks the end of the table */
- pixman_format_code_t format;
- /* Flags that must all be set in image->common.flags */
- uint32_t flags;
- /* Installed as image->common.get_scanline_32 */
- fetch_scanline_t fetch_32;
- /* Installed as image->common.get_scanline_64 */
- fetch_scanline_t fetch_64;
-} fetcher_info_t;
-
-/* Table of scanline fetchers.  bits_image_property_changed () walks it from
- * the top and installs the first entry whose format matches the image (or is
- * PIXMAN_any) and whose flags are all set on the image, so more specialised
- * entries must come before more general ones.  The PIXMAN_null entry
- * terminates the table.
- */
-static const fetcher_info_t fetcher_info[] =
-{
- /* Solid images */
- { PIXMAN_solid,
- FAST_PATH_NO_ALPHA_MAP,
- bits_image_fetch_solid_32,
- bits_image_fetch_solid_64
- },
-
- /* Untransformed images with repeat NONE or NORMAL */
- { PIXMAN_any,
- (FAST_PATH_NO_ALPHA_MAP |
- FAST_PATH_ID_TRANSFORM |
- FAST_PATH_NO_CONVOLUTION_FILTER |
- FAST_PATH_NO_PAD_REPEAT |
- FAST_PATH_NO_REFLECT_REPEAT),
- bits_image_fetch_untransformed_32,
- bits_image_fetch_untransformed_64
- },
-
-/* Flags required by the two bits_image_fetch_bilinear_no_repeat_8888
- * entries that follow */
-#define FAST_BILINEAR_FLAGS \
- (FAST_PATH_NO_ALPHA_MAP | \
- FAST_PATH_NO_ACCESSORS | \
- FAST_PATH_HAS_TRANSFORM | \
- FAST_PATH_AFFINE_TRANSFORM | \
- FAST_PATH_X_UNIT_POSITIVE | \
- FAST_PATH_Y_UNIT_ZERO | \
- FAST_PATH_NONE_REPEAT | \
- FAST_PATH_BILINEAR_FILTER)
-
- { PIXMAN_a8r8g8b8,
- FAST_BILINEAR_FLAGS,
- bits_image_fetch_bilinear_no_repeat_8888,
- _pixman_image_get_scanline_generic_64
- },
-
- { PIXMAN_x8r8g8b8,
- FAST_BILINEAR_FLAGS,
- bits_image_fetch_bilinear_no_repeat_8888,
- _pixman_image_get_scanline_generic_64
- },
-
-/* Flags shared by every per-format bilinear affine entry; the repeat flag
- * is OR-ed in by BILINEAR_AFFINE_FAST_PATH */
-#define GENERAL_BILINEAR_FLAGS \
- (FAST_PATH_NO_ALPHA_MAP | \
- FAST_PATH_NO_ACCESSORS | \
- FAST_PATH_HAS_TRANSFORM | \
- FAST_PATH_AFFINE_TRANSFORM | \
- FAST_PATH_BILINEAR_FILTER)
-
-/* Flags shared by every per-format nearest affine entry; the repeat flag
- * is OR-ed in by NEAREST_AFFINE_FAST_PATH */
-#define GENERAL_NEAREST_FLAGS \
- (FAST_PATH_NO_ALPHA_MAP | \
- FAST_PATH_NO_ACCESSORS | \
- FAST_PATH_HAS_TRANSFORM | \
- FAST_PATH_AFFINE_TRANSFORM | \
- FAST_PATH_NEAREST_FILTER)
-
-/* Expands to one table entry (including its trailing comma) that selects
- * bits_image_fetch_bilinear_affine_<name> for PIXMAN_<format> images with
- * FAST_PATH_<repeat>_REPEAT set */
-#define BILINEAR_AFFINE_FAST_PATH(name, format, repeat) \
- { PIXMAN_ ## format, \
- GENERAL_BILINEAR_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \
- bits_image_fetch_bilinear_affine_ ## name, \
- _pixman_image_get_scanline_generic_64 \
- },
-
-/* Expands to one table entry (including its trailing comma) that selects
- * bits_image_fetch_nearest_affine_<name> for PIXMAN_<format> images with
- * FAST_PATH_<repeat>_REPEAT set */
-#define NEAREST_AFFINE_FAST_PATH(name, format, repeat) \
- { PIXMAN_ ## format, \
- GENERAL_NEAREST_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \
- bits_image_fetch_nearest_affine_ ## name, \
- _pixman_image_get_scanline_generic_64 \
- },
-
-/* Both entries for one format / repeat-mode pair */
-#define AFFINE_FAST_PATHS(name, format, repeat) \
- BILINEAR_AFFINE_FAST_PATH(name, format, repeat) \
- NEAREST_AFFINE_FAST_PATH(name, format, repeat)
-
- /* One pair of entries per fetcher generated with MAKE_FETCHERS */
- AFFINE_FAST_PATHS (pad_a8r8g8b8, a8r8g8b8, PAD)
- AFFINE_FAST_PATHS (none_a8r8g8b8, a8r8g8b8, NONE)
- AFFINE_FAST_PATHS (reflect_a8r8g8b8, a8r8g8b8, REFLECT)
- AFFINE_FAST_PATHS (normal_a8r8g8b8, a8r8g8b8, NORMAL)
- AFFINE_FAST_PATHS (pad_x8r8g8b8, x8r8g8b8, PAD)
- AFFINE_FAST_PATHS (none_x8r8g8b8, x8r8g8b8, NONE)
- AFFINE_FAST_PATHS (reflect_x8r8g8b8, x8r8g8b8, REFLECT)
- AFFINE_FAST_PATHS (normal_x8r8g8b8, x8r8g8b8, NORMAL)
- AFFINE_FAST_PATHS (pad_a8, a8, PAD)
- AFFINE_FAST_PATHS (none_a8, a8, NONE)
- AFFINE_FAST_PATHS (reflect_a8, a8, REFLECT)
- AFFINE_FAST_PATHS (normal_a8, a8, NORMAL)
- AFFINE_FAST_PATHS (pad_r5g6b5, r5g6b5, PAD)
- AFFINE_FAST_PATHS (none_r5g6b5, r5g6b5, NONE)
- AFFINE_FAST_PATHS (reflect_r5g6b5, r5g6b5, REFLECT)
- AFFINE_FAST_PATHS (normal_r5g6b5, r5g6b5, NORMAL)
-
- /* Affine transform, no alpha map: any format, filter and repeat mode */
- { PIXMAN_any,
- (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_HAS_TRANSFORM | FAST_PATH_AFFINE_TRANSFORM),
- bits_image_fetch_affine_no_alpha,
- _pixman_image_get_scanline_generic_64
- },
-
- /* General: requires no flags, so it matches every image that reaches it */
- { PIXMAN_any, 0, bits_image_fetch_general, _pixman_image_get_scanline_generic_64 },
-
- /* End-of-table marker */
- { PIXMAN_null },
-};
-
-/* property_changed hook for bits images: refreshes the accessors and then
- * installs the scanline fetchers of the first fetcher_info[] entry whose
- * format matches the image (or is PIXMAN_any) and whose required flags are
- * all present in the image's flags.
- */
-static void
-bits_image_property_changed (pixman_image_t *image)
-{
-    const uint32_t image_flags = image->common.flags;
-    const pixman_format_code_t image_format = image->common.extended_format_code;
-    const fetcher_info_t *entry;
-
-    _pixman_bits_image_setup_accessors (&image->bits);
-
-    for (entry = fetcher_info; entry->format != PIXMAN_null; entry++)
-    {
-        if (entry->format != image_format && entry->format != PIXMAN_any)
-            continue;
-
-        /* Every flag the entry asks for must be set on the image */
-        if ((entry->flags & image_flags) != entry->flags)
-            continue;
-
-        image->common.get_scanline_32 = entry->fetch_32;
-        image->common.get_scanline_64 = entry->fetch_64;
-        return;
-    }
-}
-
-/* Allocate a zero-filled pixel buffer for a width x height image of the
- * given format.  Each row is padded to a whole number of uint32_t.
- *
- * Returns the buffer, or NULL if a size computation would overflow an int
- * or the allocation fails.  When rowstride_bytes is non-NULL it receives the
- * row stride in bytes.
- */
-static uint32_t *
-create_bits (pixman_format_code_t format,
-             int                  width,
-             int                  height,
-             int *                rowstride_bytes)
-{
-    int bits_per_pixel;
-    int row_bytes;
-    int total_bytes;
-
-    /* The steps below compute
-     *
-     *     row_bytes = ((width * bpp + 0x1f) >> 5) * sizeof (uint32_t);
-     *
-     * one operation at a time, so that each one can be checked for
-     * integer overflow first.
-     */
-
-    bits_per_pixel = PIXMAN_FORMAT_BPP (format);
-    if (pixman_multiply_overflows_int (width, bits_per_pixel))
-        return NULL;
-
-    /* Row length in bits */
-    row_bytes = width * bits_per_pixel;
-    if (pixman_addition_overflows_int (row_bytes, 0x1f))
-        return NULL;
-
-    /* Round up to whole 32-bit words, then convert words to bytes */
-    row_bytes = (row_bytes + 0x1f) >> 5;
-    row_bytes *= sizeof (uint32_t);
-
-    if (pixman_multiply_overflows_int (height, row_bytes))
-        return NULL;
-
-    total_bytes = height * row_bytes;
-
-    if (rowstride_bytes)
-        *rowstride_bytes = row_bytes;
-
-    return calloc (total_bytes, 1);
-}
-
-/* Create a bits image of the given format and size.
- *
- * If 'bits' is NULL and both dimensions are non-zero, a zero-filled buffer
- * is allocated with create_bits () and owned by the image; otherwise the
- * caller's buffer is used and rowstride_bytes must be a multiple of
- * sizeof (uint32_t).
- *
- * Returns NULL if the arguments are rejected or an allocation fails.
- */
-PIXMAN_EXPORT pixman_image_t *
-pixman_image_create_bits (pixman_format_code_t format,
-                          int                  width,
-                          int                  height,
-                          uint32_t *           bits,
-                          int                  rowstride_bytes)
-{
-    uint32_t *owned_bits = NULL;
-    pixman_image_t *image;
-
-    /* A caller-supplied stride must be a whole number of uint32_t's */
-    return_val_if_fail (
-        bits == NULL || (rowstride_bytes % sizeof (uint32_t)) == 0, NULL);
-
-    return_val_if_fail (PIXMAN_FORMAT_BPP (format) >= PIXMAN_FORMAT_DEPTH (format), NULL);
-
-    if (!bits && width && height)
-    {
-        owned_bits = create_bits (format, width, height, &rowstride_bytes);
-        if (!owned_bits)
-            return NULL;
-
-        bits = owned_bits;
-    }
-
-    image = _pixman_image_allocate ();
-
-    if (!image)
-    {
-        /* free () accepts NULL, so no check is needed when nothing was
-         * allocated here */
-        free (owned_bits);
-
-        return NULL;
-    }
-
-    image->type = BITS;
-
-    image->bits.format = format;
-    image->bits.width = width;
-    image->bits.height = height;
-    image->bits.bits = bits;
-    image->bits.free_me = owned_bits;
-    image->bits.read_func = NULL;
-    image->bits.write_func = NULL;
-    image->bits.indexed = NULL;
-
-    /* The image stores its row stride as a count of uint32_t */
-    image->bits.rowstride = rowstride_bytes / (int) sizeof (uint32_t);
-
-    image->common.property_changed = bits_image_property_changed;
-
-    _pixman_image_reset_clip_region (image);
-
-    return image;
-}
+/* + * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. + * 2005 Lars Knoll & Zack Rusin, Trolltech + * 2008 Aaron Plattner, NVIDIA Corporation + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007, 2009 Red Hat, Inc. + * Copyright © 2008 André Tupinambá <andrelrt@gmail.com> + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Keith Packard not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. Keith Packard makes no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "pixman-private.h" +#include "pixman-combine32.h" + +/* Store functions */ +void +_pixman_image_store_scanline_32 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *buffer) +{ + image->store_scanline_32 (image, x, y, width, buffer); + + if (image->common.alpha_map) + { + x -= image->common.alpha_origin_x; + y -= image->common.alpha_origin_y; + + image->common.alpha_map->store_scanline_32 ( + image->common.alpha_map, x, y, width, buffer); + } +} + +void +_pixman_image_store_scanline_64 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *buffer) +{ + image->store_scanline_64 (image, x, y, width, buffer); + + if (image->common.alpha_map) + { + x -= image->common.alpha_origin_x; + y -= image->common.alpha_origin_y; + + image->common.alpha_map->store_scanline_64 ( + image->common.alpha_map, x, y, width, buffer); + } +} + +/* Fetch functions */ + +static force_inline uint32_t +fetch_pixel_no_alpha (bits_image_t *image, + int x, int y, pixman_bool_t check_bounds) +{ + if (check_bounds && + (x < 0 || x >= image->width || y < 0 || y >= image->height)) + { + return 0; + } + + return image->fetch_pixel_32 (image, x, y); +} + +typedef uint32_t (* get_pixel_t) (bits_image_t *image, + int x, int y, pixman_bool_t check_bounds); + +static force_inline void +repeat (pixman_repeat_t repeat, int size, int *coord) +{ + switch (repeat) + { + case PIXMAN_REPEAT_NORMAL: + *coord = MOD (*coord, size); + break; + + case PIXMAN_REPEAT_PAD: + *coord = CLIP (*coord, 0, size - 1); + break; + + case PIXMAN_REPEAT_REFLECT: + *coord = MOD (*coord, size * 2); + + if (*coord >= size) + *coord = size * 2 - *coord - 1; + break; + + case PIXMAN_REPEAT_NONE: + break; + + default: + break; + } +} + +static force_inline uint32_t +bits_image_fetch_pixel_nearest (bits_image_t *image, + pixman_fixed_t x, + pixman_fixed_t y, + get_pixel_t 
get_pixel) +{ + int x0 = pixman_fixed_to_int (x - pixman_fixed_e); + int y0 = pixman_fixed_to_int (y - pixman_fixed_e); + + if (image->common.repeat != PIXMAN_REPEAT_NONE) + { + repeat (image->common.repeat, image->width, &x0); + repeat (image->common.repeat, image->height, &y0); + + return get_pixel (image, x0, y0, FALSE); + } + else + { + return get_pixel (image, x0, y0, TRUE); + } +} + +#if SIZEOF_LONG > 4 + +static force_inline uint32_t +bilinear_interpolation (uint32_t tl, uint32_t tr, + uint32_t bl, uint32_t br, + int distx, int disty) +{ + uint64_t distxy, distxiy, distixy, distixiy; + uint64_t tl64, tr64, bl64, br64; + uint64_t f, r; + + distxy = distx * disty; + distxiy = distx * (256 - disty); + distixy = (256 - distx) * disty; + distixiy = (256 - distx) * (256 - disty); + + /* Alpha and Blue */ + tl64 = tl & 0xff0000ff; + tr64 = tr & 0xff0000ff; + bl64 = bl & 0xff0000ff; + br64 = br & 0xff0000ff; + + f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy; + r = f & 0x0000ff0000ff0000ull; + + /* Red and Green */ + tl64 = tl; + tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull); + + tr64 = tr; + tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull); + + bl64 = bl; + bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull); + + br64 = br; + br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull); + + f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy; + r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull); + + return (uint32_t)(r >> 16); +} + +#else + +static force_inline uint32_t +bilinear_interpolation (uint32_t tl, uint32_t tr, + uint32_t bl, uint32_t br, + int distx, int disty) +{ + int distxy, distxiy, distixy, distixiy; + uint32_t f, r; + + distxy = distx * disty; + distxiy = (distx << 8) - distxy; /* distx * (256 - disty) */ + distixy = (disty << 8) - distxy; /* disty * (256 - distx) */ + distixiy = + 256 * 256 - (disty << 8) - + (distx << 8) 
+ distxy; /* (256 - distx) * (256 - disty) */ + + /* Blue */ + r = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy + + (bl & 0x000000ff) * distixy + (br & 0x000000ff) * distxy; + + /* Green */ + f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy + + (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy; + r |= f & 0xff000000; + + tl >>= 16; + tr >>= 16; + bl >>= 16; + br >>= 16; + r >>= 16; + + /* Red */ + f = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy + + (bl & 0x000000ff) * distixy + (br & 0x000000ff) * distxy; + r |= f & 0x00ff0000; + + /* Alpha */ + f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy + + (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy; + r |= f & 0xff000000; + + return r; +} + +#endif + +static force_inline uint32_t +bits_image_fetch_pixel_bilinear (bits_image_t *image, + pixman_fixed_t x, + pixman_fixed_t y, + get_pixel_t get_pixel) +{ + pixman_repeat_t repeat_mode = image->common.repeat; + int width = image->width; + int height = image->height; + int x1, y1, x2, y2; + uint32_t tl, tr, bl, br; + int32_t distx, disty; + + x1 = x - pixman_fixed_1 / 2; + y1 = y - pixman_fixed_1 / 2; + + distx = (x1 >> 8) & 0xff; + disty = (y1 >> 8) & 0xff; + + x1 = pixman_fixed_to_int (x1); + y1 = pixman_fixed_to_int (y1); + x2 = x1 + 1; + y2 = y1 + 1; + + if (repeat_mode != PIXMAN_REPEAT_NONE) + { + repeat (repeat_mode, width, &x1); + repeat (repeat_mode, height, &y1); + repeat (repeat_mode, width, &x2); + repeat (repeat_mode, height, &y2); + + tl = get_pixel (image, x1, y1, FALSE); + bl = get_pixel (image, x1, y2, FALSE); + tr = get_pixel (image, x2, y1, FALSE); + br = get_pixel (image, x2, y2, FALSE); + } + else + { + tl = get_pixel (image, x1, y1, TRUE); + tr = get_pixel (image, x2, y1, TRUE); + bl = get_pixel (image, x1, y2, TRUE); + br = get_pixel (image, x2, y2, TRUE); + } + + return bilinear_interpolation (tl, tr, bl, br, distx, disty); +} + +static void 
+bits_image_fetch_bilinear_no_repeat_8888 (pixman_image_t * ima, + int offset, + int line, + int width, + uint32_t * buffer, + const uint32_t * mask) +{ + bits_image_t *bits = &ima->bits; + pixman_fixed_t x_top, x_bottom, x; + pixman_fixed_t ux_top, ux_bottom, ux; + pixman_vector_t v; + uint32_t top_mask, bottom_mask; + uint32_t *top_row; + uint32_t *bottom_row; + uint32_t *end; + uint32_t zero[2] = { 0, 0 }; + int y, y1, y2; + int disty; + int mask_inc; + int w; + + /* reference point is the center of the pixel */ + v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2; + v.vector[2] = pixman_fixed_1; + + if (!pixman_transform_point_3d (bits->common.transform, &v)) + return; + + ux = ux_top = ux_bottom = bits->common.transform->matrix[0][0]; + x = x_top = x_bottom = v.vector[0] - pixman_fixed_1/2; + + y = v.vector[1] - pixman_fixed_1/2; + disty = (y >> 8) & 0xff; + + /* Load the pointers to the first and second lines from the source + * image that bilinear code must read. + * + * The main trick in this code is about the check if any line are + * outside of the image; + * + * When I realize that a line (any one) is outside, I change + * the pointer to a dummy area with zeros. Once I change this, I + * must be sure the pointer will not change, so I set the + * variables to each pointer increments inside the loop. 
+ */ + y1 = pixman_fixed_to_int (y); + y2 = y1 + 1; + + if (y1 < 0 || y1 >= bits->height) + { + top_row = zero; + x_top = 0; + ux_top = 0; + } + else + { + top_row = bits->bits + y1 * bits->rowstride; + x_top = x; + ux_top = ux; + } + + if (y2 < 0 || y2 >= bits->height) + { + bottom_row = zero; + x_bottom = 0; + ux_bottom = 0; + } + else + { + bottom_row = bits->bits + y2 * bits->rowstride; + x_bottom = x; + ux_bottom = ux; + } + + /* Instead of checking whether the operation uses the mast in + * each loop iteration, verify this only once and prepare the + * variables to make the code smaller inside the loop. + */ + if (!mask) + { + uint32_t mask_bits = 1; + + mask_inc = 0; + mask = &mask_bits; + } + else + { + /* If have a mask, prepare the variables to check it */ + mask_inc = 1; + } + + /* If both are zero, then the whole thing is zero */ + if (top_row == zero && bottom_row == zero) + { + memset (buffer, 0, width * sizeof (uint32_t)); + return; + } + else if (bits->format == PIXMAN_x8r8g8b8) + { + if (top_row == zero) + { + top_mask = 0; + bottom_mask = 0xff000000; + } + else if (bottom_row == zero) + { + top_mask = 0xff000000; + bottom_mask = 0; + } + else + { + top_mask = 0xff000000; + bottom_mask = 0xff000000; + } + } + else + { + top_mask = 0; + bottom_mask = 0; + } + + end = buffer + width; + + /* Zero fill to the left of the image */ + while (buffer < end && x < pixman_fixed_minus_1) + { + *buffer++ = 0; + x += ux; + x_top += ux_top; + x_bottom += ux_bottom; + mask += mask_inc; + } + + /* Left edge + */ + while (buffer < end && x < 0) + { + uint32_t tr, br; + int32_t distx; + + tr = top_row[pixman_fixed_to_int (x_top) + 1] | top_mask; + br = bottom_row[pixman_fixed_to_int (x_bottom) + 1] | bottom_mask; + + distx = (x >> 8) & 0xff; + + *buffer++ = bilinear_interpolation (0, tr, 0, br, distx, disty); + + x += ux; + x_top += ux_top; + x_bottom += ux_bottom; + mask += mask_inc; + } + + /* Main part */ + w = pixman_int_to_fixed (bits->width - 1); + + while 
(buffer < end && x < w) + { + if (*mask) + { + uint32_t tl, tr, bl, br; + int32_t distx; + + tl = top_row [pixman_fixed_to_int (x_top)] | top_mask; + tr = top_row [pixman_fixed_to_int (x_top) + 1] | top_mask; + bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask; + br = bottom_row [pixman_fixed_to_int (x_bottom) + 1] | bottom_mask; + + distx = (x >> 8) & 0xff; + + *buffer = bilinear_interpolation (tl, tr, bl, br, distx, disty); + } + + buffer++; + x += ux; + x_top += ux_top; + x_bottom += ux_bottom; + mask += mask_inc; + } + + /* Right Edge */ + w = pixman_int_to_fixed (bits->width); + while (buffer < end && x < w) + { + if (*mask) + { + uint32_t tl, bl; + int32_t distx; + + tl = top_row [pixman_fixed_to_int (x_top)] | top_mask; + bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask; + + distx = (x >> 8) & 0xff; + + *buffer = bilinear_interpolation (tl, 0, bl, 0, distx, disty); + } + + buffer++; + x += ux; + x_top += ux_top; + x_bottom += ux_bottom; + mask += mask_inc; + } + + /* Zero fill to the left of the image */ + while (buffer < end) + *buffer++ = 0; +} + +static force_inline uint32_t +bits_image_fetch_pixel_convolution (bits_image_t *image, + pixman_fixed_t x, + pixman_fixed_t y, + get_pixel_t get_pixel) +{ + pixman_fixed_t *params = image->common.filter_params; + int x_off = (params[0] - pixman_fixed_1) >> 1; + int y_off = (params[1] - pixman_fixed_1) >> 1; + int32_t cwidth = pixman_fixed_to_int (params[0]); + int32_t cheight = pixman_fixed_to_int (params[1]); + int32_t srtot, sgtot, sbtot, satot; + int32_t i, j, x1, x2, y1, y2; + pixman_repeat_t repeat_mode = image->common.repeat; + int width = image->width; + int height = image->height; + + params += 2; + + x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off); + y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off); + x2 = x1 + cwidth; + y2 = y1 + cheight; + + srtot = sgtot = sbtot = satot = 0; + + for (i = y1; i < y2; ++i) + { + for (j = x1; j < x2; ++j) + { + int rx = j; + int ry = 
i; + + pixman_fixed_t f = *params; + + if (f) + { + uint32_t pixel; + + if (repeat_mode != PIXMAN_REPEAT_NONE) + { + repeat (repeat_mode, width, &rx); + repeat (repeat_mode, height, &ry); + + pixel = get_pixel (image, rx, ry, FALSE); + } + else + { + pixel = get_pixel (image, rx, ry, TRUE); + } + + srtot += RED_8 (pixel) * f; + sgtot += GREEN_8 (pixel) * f; + sbtot += BLUE_8 (pixel) * f; + satot += ALPHA_8 (pixel) * f; + } + + params++; + } + } + + satot >>= 16; + srtot >>= 16; + sgtot >>= 16; + sbtot >>= 16; + + satot = CLIP (satot, 0, 0xff); + srtot = CLIP (srtot, 0, 0xff); + sgtot = CLIP (sgtot, 0, 0xff); + sbtot = CLIP (sbtot, 0, 0xff); + + return ((satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot)); +} + +static force_inline uint32_t +bits_image_fetch_pixel_filtered (bits_image_t *image, + pixman_fixed_t x, + pixman_fixed_t y, + get_pixel_t get_pixel) +{ + switch (image->common.filter) + { + case PIXMAN_FILTER_NEAREST: + case PIXMAN_FILTER_FAST: + return bits_image_fetch_pixel_nearest (image, x, y, get_pixel); + break; + + case PIXMAN_FILTER_BILINEAR: + case PIXMAN_FILTER_GOOD: + case PIXMAN_FILTER_BEST: + return bits_image_fetch_pixel_bilinear (image, x, y, get_pixel); + break; + + case PIXMAN_FILTER_CONVOLUTION: + return bits_image_fetch_pixel_convolution (image, x, y, get_pixel); + break; + + default: + break; + } + + return 0; +} + +static void +bits_image_fetch_affine_no_alpha (pixman_image_t * image, + int offset, + int line, + int width, + uint32_t * buffer, + const uint32_t * mask) +{ + pixman_fixed_t x, y; + pixman_fixed_t ux, uy; + pixman_vector_t v; + int i; + + /* reference point is the center of the pixel */ + v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2; + v.vector[2] = pixman_fixed_1; + + if (image->common.transform) + { + if (!pixman_transform_point_3d (image->common.transform, &v)) + return; + + ux = image->common.transform->matrix[0][0]; + uy = 
image->common.transform->matrix[1][0]; + } + else + { + ux = pixman_fixed_1; + uy = 0; + } + + x = v.vector[0]; + y = v.vector[1]; + + for (i = 0; i < width; ++i) + { + if (!mask || mask[i]) + { + buffer[i] = bits_image_fetch_pixel_filtered ( + &image->bits, x, y, fetch_pixel_no_alpha); + } + + x += ux; + y += uy; + } +} + +/* General fetcher */ +static force_inline uint32_t +fetch_pixel_general (bits_image_t *image, int x, int y, pixman_bool_t check_bounds) +{ + uint32_t pixel; + + if (check_bounds && + (x < 0 || x >= image->width || y < 0 || y >= image->height)) + { + return 0; + } + + pixel = image->fetch_pixel_32 (image, x, y); + + if (image->common.alpha_map) + { + uint32_t pixel_a; + + x -= image->common.alpha_origin_x; + y -= image->common.alpha_origin_y; + + if (x < 0 || x >= image->common.alpha_map->width || + y < 0 || y >= image->common.alpha_map->height) + { + pixel_a = 0; + } + else + { + pixel_a = image->common.alpha_map->fetch_pixel_32 ( + image->common.alpha_map, x, y); + + pixel_a = ALPHA_8 (pixel_a); + } + + pixel &= 0x00ffffff; + pixel |= (pixel_a << 24); + } + + return pixel; +} + +static void +bits_image_fetch_general (pixman_image_t * image, + int offset, + int line, + int width, + uint32_t * buffer, + const uint32_t * mask) +{ + pixman_fixed_t x, y, w; + pixman_fixed_t ux, uy, uw; + pixman_vector_t v; + int i; + + /* reference point is the center of the pixel */ + v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2; + v.vector[2] = pixman_fixed_1; + + if (image->common.transform) + { + if (!pixman_transform_point_3d (image->common.transform, &v)) + return; + + ux = image->common.transform->matrix[0][0]; + uy = image->common.transform->matrix[1][0]; + uw = image->common.transform->matrix[2][0]; + } + else + { + ux = pixman_fixed_1; + uy = 0; + uw = 0; + } + + x = v.vector[0]; + y = v.vector[1]; + w = v.vector[2]; + + for (i = 0; i < width; ++i) + { + pixman_fixed_t x0, 
y0; + + if (!mask || mask[i]) + { + if (w != 0) + { + x0 = ((pixman_fixed_48_16_t)x << 16) / w; + y0 = ((pixman_fixed_48_16_t)y << 16) / w; + } + else + { + x0 = 0; + y0 = 0; + } + + buffer[i] = bits_image_fetch_pixel_filtered ( + &image->bits, x0, y0, fetch_pixel_general); + } + + x += ux; + y += uy; + w += uw; + } +} + +static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + +typedef uint32_t (* convert_pixel_t) (const uint8_t *row, int x); + +static force_inline void +bits_image_fetch_bilinear_affine (pixman_image_t * image, + int offset, + int line, + int width, + uint32_t * buffer, + const uint32_t * mask, + + convert_pixel_t convert_pixel, + pixman_format_code_t format, + pixman_repeat_t repeat_mode) +{ + pixman_fixed_t x, y; + pixman_fixed_t ux, uy; + pixman_vector_t v; + bits_image_t *bits = &image->bits; + int i; + + /* reference point is the center of the pixel */ + v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2; + v.vector[2] = pixman_fixed_1; + + if (!pixman_transform_point_3d (image->common.transform, &v)) + return; + + ux = image->common.transform->matrix[0][0]; + uy = image->common.transform->matrix[1][0]; + + x = v.vector[0]; + y = v.vector[1]; + + for (i = 0; i < width; ++i) + { + int x1, y1, x2, y2; + uint32_t tl, tr, bl, br; + int32_t distx, disty; + int width = image->bits.width; + int height = image->bits.height; + const uint8_t *row1; + const uint8_t *row2; + + if (mask && !mask[i]) + goto next; + + x1 = x - pixman_fixed_1 / 2; + y1 = y - pixman_fixed_1 / 2; + + distx = (x1 >> 8) & 0xff; + disty = (y1 >> 8) & 0xff; + + y1 = pixman_fixed_to_int (y1); + y2 = y1 + 1; + x1 = pixman_fixed_to_int (x1); + x2 = x1 + 1; + + if (repeat_mode != PIXMAN_REPEAT_NONE) + { + uint32_t mask; + + mask = PIXMAN_FORMAT_A (format)? 
0 : 0xff000000; + + repeat (repeat_mode, width, &x1); + repeat (repeat_mode, height, &y1); + repeat (repeat_mode, width, &x2); + repeat (repeat_mode, height, &y2); + + row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1; + row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2; + + tl = convert_pixel (row1, x1) | mask; + tr = convert_pixel (row1, x2) | mask; + bl = convert_pixel (row2, x1) | mask; + br = convert_pixel (row2, x2) | mask; + } + else + { + uint32_t mask1, mask2; + int bpp; + + /* Note: PIXMAN_FORMAT_BPP() returns an unsigned value, + * which means if you use it in expressions, those + * expressions become unsigned themselves. Since + * the variables below can be negative in some cases, + * that will lead to crashes on 64 bit architectures. + * + * So this line makes sure bpp is signed + */ + bpp = PIXMAN_FORMAT_BPP (format); + + if (x1 >= width || x2 < 0 || y1 >= height || y2 < 0) + { + buffer[i] = 0; + goto next; + } + + if (y2 == 0) + { + row1 = zero; + mask1 = 0; + } + else + { + row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1; + row1 += bpp / 8 * x1; + + mask1 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000; + } + + if (y1 == height - 1) + { + row2 = zero; + mask2 = 0; + } + else + { + row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2; + row2 += bpp / 8 * x1; + + mask2 = PIXMAN_FORMAT_A (format)? 
0 : 0xff000000; + } + + if (x2 == 0) + { + tl = 0; + bl = 0; + } + else + { + tl = convert_pixel (row1, 0) | mask1; + bl = convert_pixel (row2, 0) | mask2; + } + + if (x1 == width - 1) + { + tr = 0; + br = 0; + } + else + { + tr = convert_pixel (row1, 1) | mask1; + br = convert_pixel (row2, 1) | mask2; + } + } + + buffer[i] = bilinear_interpolation ( + tl, tr, bl, br, distx, disty); + + next: + x += ux; + y += uy; + } +} + +static force_inline void +bits_image_fetch_nearest_affine (pixman_image_t * image, + int offset, + int line, + int width, + uint32_t * buffer, + const uint32_t * mask, + + convert_pixel_t convert_pixel, + pixman_format_code_t format, + pixman_repeat_t repeat_mode) +{ + pixman_fixed_t x, y; + pixman_fixed_t ux, uy; + pixman_vector_t v; + bits_image_t *bits = &image->bits; + int i; + + /* reference point is the center of the pixel */ + v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2; + v.vector[2] = pixman_fixed_1; + + if (!pixman_transform_point_3d (image->common.transform, &v)) + return; + + ux = image->common.transform->matrix[0][0]; + uy = image->common.transform->matrix[1][0]; + + x = v.vector[0]; + y = v.vector[1]; + + for (i = 0; i < width; ++i) + { + int width, height, x0, y0; + const uint8_t *row; + + if (mask && !mask[i]) + goto next; + + width = image->bits.width; + height = image->bits.height; + x0 = pixman_fixed_to_int (x - pixman_fixed_e); + y0 = pixman_fixed_to_int (y - pixman_fixed_e); + + if (repeat_mode == PIXMAN_REPEAT_NONE && + (y0 < 0 || y0 >= height || x0 < 0 || x0 >= width)) + { + buffer[i] = 0; + } + else + { + uint32_t mask = PIXMAN_FORMAT_A (format)? 
0 : 0xff000000; + + if (repeat_mode != PIXMAN_REPEAT_NONE) + { + repeat (repeat_mode, width, &x0); + repeat (repeat_mode, height, &y0); + } + + row = (uint8_t *)bits->bits + bits->rowstride * 4 * y0; + + buffer[i] = convert_pixel (row, x0) | mask; + } + + next: + x += ux; + y += uy; + } +} + +static force_inline uint32_t +convert_a8r8g8b8 (const uint8_t *row, int x) +{ + return *(((uint32_t *)row) + x); +} + +static force_inline uint32_t +convert_x8r8g8b8 (const uint8_t *row, int x) +{ + return *(((uint32_t *)row) + x); +} + +static force_inline uint32_t +convert_a8 (const uint8_t *row, int x) +{ + return *(row + x) << 24; +} + +static force_inline uint32_t +convert_r5g6b5 (const uint8_t *row, int x) +{ + return CONVERT_0565_TO_0888 (*((uint16_t *)row + x)); +} + +#define MAKE_BILINEAR_FETCHER(name, format, repeat_mode) \ + static void \ + bits_image_fetch_bilinear_affine_ ## name (pixman_image_t *image, \ + int offset, \ + int line, \ + int width, \ + uint32_t * buffer, \ + const uint32_t * mask) \ + { \ + bits_image_fetch_bilinear_affine (image, offset, line, \ + width, buffer, mask, \ + convert_ ## format, \ + PIXMAN_ ## format, \ + repeat_mode); \ + } + +#define MAKE_NEAREST_FETCHER(name, format, repeat_mode) \ + static void \ + bits_image_fetch_nearest_affine_ ## name (pixman_image_t *image, \ + int offset, \ + int line, \ + int width, \ + uint32_t * buffer, \ + const uint32_t * mask) \ + { \ + bits_image_fetch_nearest_affine (image, offset, line, \ + width, buffer, mask, \ + convert_ ## format, \ + PIXMAN_ ## format, \ + repeat_mode); \ + } + +#define MAKE_FETCHERS(name, format, repeat_mode) \ + MAKE_NEAREST_FETCHER (name, format, repeat_mode) \ + MAKE_BILINEAR_FETCHER (name, format, repeat_mode) + +MAKE_FETCHERS (pad_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_PAD) +MAKE_FETCHERS (none_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_NONE) +MAKE_FETCHERS (reflect_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_REFLECT) +MAKE_FETCHERS (normal_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_NORMAL) 
+MAKE_FETCHERS (pad_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_PAD) +MAKE_FETCHERS (none_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_NONE) +MAKE_FETCHERS (reflect_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_REFLECT) +MAKE_FETCHERS (normal_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_NORMAL) +MAKE_FETCHERS (pad_a8, a8, PIXMAN_REPEAT_PAD) +MAKE_FETCHERS (none_a8, a8, PIXMAN_REPEAT_NONE) +MAKE_FETCHERS (reflect_a8, a8, PIXMAN_REPEAT_REFLECT) +MAKE_FETCHERS (normal_a8, a8, PIXMAN_REPEAT_NORMAL) +MAKE_FETCHERS (pad_r5g6b5, r5g6b5, PIXMAN_REPEAT_PAD) +MAKE_FETCHERS (none_r5g6b5, r5g6b5, PIXMAN_REPEAT_NONE) +MAKE_FETCHERS (reflect_r5g6b5, r5g6b5, PIXMAN_REPEAT_REFLECT) +MAKE_FETCHERS (normal_r5g6b5, r5g6b5, PIXMAN_REPEAT_NORMAL) + +static void +bits_image_fetch_solid_32 (pixman_image_t * image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t * mask) +{ + uint32_t color; + uint32_t *end; + + color = image->bits.fetch_pixel_32 (&image->bits, 0, 0); + + end = buffer + width; + while (buffer < end) + *(buffer++) = color; +} + +static void +bits_image_fetch_solid_64 (pixman_image_t * image, + int x, + int y, + int width, + uint32_t * b, + const uint32_t * unused) +{ + uint64_t color; + uint64_t *buffer = (uint64_t *)b; + uint64_t *end; + + color = image->bits.fetch_pixel_64 (&image->bits, 0, 0); + + end = buffer + width; + while (buffer < end) + *(buffer++) = color; +} + +static void +bits_image_fetch_untransformed_repeat_none (bits_image_t *image, + pixman_bool_t wide, + int x, + int y, + int width, + uint32_t * buffer) +{ + uint32_t w; + + if (y < 0 || y >= image->height) + { + memset (buffer, 0, width * (wide? 8 : 4)); + return; + } + + if (x < 0) + { + w = MIN (width, -x); + + memset (buffer, 0, w * (wide ? 8 : 4)); + + width -= w; + buffer += w * (wide? 
2 : 1); + x += w; + } + + if (x < image->width) + { + w = MIN (width, image->width - x); + + if (wide) + image->fetch_scanline_64 ((pixman_image_t *)image, x, y, w, buffer, NULL); + else + image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL); + + width -= w; + buffer += w * (wide? 2 : 1); + x += w; + } + + memset (buffer, 0, width * (wide ? 8 : 4)); +} + +static void +bits_image_fetch_untransformed_repeat_normal (bits_image_t *image, + pixman_bool_t wide, + int x, + int y, + int width, + uint32_t * buffer) +{ + uint32_t w; + + while (y < 0) + y += image->height; + + while (y >= image->height) + y -= image->height; + + while (width) + { + while (x < 0) + x += image->width; + while (x >= image->width) + x -= image->width; + + w = MIN (width, image->width - x); + + if (wide) + image->fetch_scanline_64 ((pixman_image_t *)image, x, y, w, buffer, NULL); + else + image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL); + + buffer += w * (wide? 2 : 1); + x += w; + width -= w; + } +} + +static void +bits_image_fetch_untransformed_32 (pixman_image_t * image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t * mask) +{ + if (image->common.repeat == PIXMAN_REPEAT_NONE) + { + bits_image_fetch_untransformed_repeat_none ( + &image->bits, FALSE, x, y, width, buffer); + } + else + { + bits_image_fetch_untransformed_repeat_normal ( + &image->bits, FALSE, x, y, width, buffer); + } +} + +static void +bits_image_fetch_untransformed_64 (pixman_image_t * image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t * unused) +{ + if (image->common.repeat == PIXMAN_REPEAT_NONE) + { + bits_image_fetch_untransformed_repeat_none ( + &image->bits, TRUE, x, y, width, buffer); + } + else + { + bits_image_fetch_untransformed_repeat_normal ( + &image->bits, TRUE, x, y, width, buffer); + } +} + +typedef struct +{ + pixman_format_code_t format; + uint32_t flags; + fetch_scanline_t fetch_32; + fetch_scanline_t fetch_64; +} 
fetcher_info_t; + +static const fetcher_info_t fetcher_info[] = +{ + { PIXMAN_solid, + FAST_PATH_NO_ALPHA_MAP, + bits_image_fetch_solid_32, + bits_image_fetch_solid_64 + }, + + { PIXMAN_any, + (FAST_PATH_NO_ALPHA_MAP | + FAST_PATH_ID_TRANSFORM | + FAST_PATH_NO_CONVOLUTION_FILTER | + FAST_PATH_NO_PAD_REPEAT | + FAST_PATH_NO_REFLECT_REPEAT), + bits_image_fetch_untransformed_32, + bits_image_fetch_untransformed_64 + }, + +#define FAST_BILINEAR_FLAGS \ + (FAST_PATH_NO_ALPHA_MAP | \ + FAST_PATH_NO_ACCESSORS | \ + FAST_PATH_HAS_TRANSFORM | \ + FAST_PATH_AFFINE_TRANSFORM | \ + FAST_PATH_X_UNIT_POSITIVE | \ + FAST_PATH_Y_UNIT_ZERO | \ + FAST_PATH_NONE_REPEAT | \ + FAST_PATH_BILINEAR_FILTER) + + { PIXMAN_a8r8g8b8, + FAST_BILINEAR_FLAGS, + bits_image_fetch_bilinear_no_repeat_8888, + _pixman_image_get_scanline_generic_64 + }, + + { PIXMAN_x8r8g8b8, + FAST_BILINEAR_FLAGS, + bits_image_fetch_bilinear_no_repeat_8888, + _pixman_image_get_scanline_generic_64 + }, + +#define GENERAL_BILINEAR_FLAGS \ + (FAST_PATH_NO_ALPHA_MAP | \ + FAST_PATH_NO_ACCESSORS | \ + FAST_PATH_HAS_TRANSFORM | \ + FAST_PATH_AFFINE_TRANSFORM | \ + FAST_PATH_BILINEAR_FILTER) + +#define GENERAL_NEAREST_FLAGS \ + (FAST_PATH_NO_ALPHA_MAP | \ + FAST_PATH_NO_ACCESSORS | \ + FAST_PATH_HAS_TRANSFORM | \ + FAST_PATH_AFFINE_TRANSFORM | \ + FAST_PATH_NEAREST_FILTER) + +#define BILINEAR_AFFINE_FAST_PATH(name, format, repeat) \ + { PIXMAN_ ## format, \ + GENERAL_BILINEAR_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \ + bits_image_fetch_bilinear_affine_ ## name, \ + _pixman_image_get_scanline_generic_64 \ + }, + +#define NEAREST_AFFINE_FAST_PATH(name, format, repeat) \ + { PIXMAN_ ## format, \ + GENERAL_NEAREST_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \ + bits_image_fetch_nearest_affine_ ## name, \ + _pixman_image_get_scanline_generic_64 \ + }, + +#define AFFINE_FAST_PATHS(name, format, repeat) \ + BILINEAR_AFFINE_FAST_PATH(name, format, repeat) \ + NEAREST_AFFINE_FAST_PATH(name, format, repeat) + + AFFINE_FAST_PATHS 
(pad_a8r8g8b8, a8r8g8b8, PAD) + AFFINE_FAST_PATHS (none_a8r8g8b8, a8r8g8b8, NONE) + AFFINE_FAST_PATHS (reflect_a8r8g8b8, a8r8g8b8, REFLECT) + AFFINE_FAST_PATHS (normal_a8r8g8b8, a8r8g8b8, NORMAL) + AFFINE_FAST_PATHS (pad_x8r8g8b8, x8r8g8b8, PAD) + AFFINE_FAST_PATHS (none_x8r8g8b8, x8r8g8b8, NONE) + AFFINE_FAST_PATHS (reflect_x8r8g8b8, x8r8g8b8, REFLECT) + AFFINE_FAST_PATHS (normal_x8r8g8b8, x8r8g8b8, NORMAL) + AFFINE_FAST_PATHS (pad_a8, a8, PAD) + AFFINE_FAST_PATHS (none_a8, a8, NONE) + AFFINE_FAST_PATHS (reflect_a8, a8, REFLECT) + AFFINE_FAST_PATHS (normal_a8, a8, NORMAL) + AFFINE_FAST_PATHS (pad_r5g6b5, r5g6b5, PAD) + AFFINE_FAST_PATHS (none_r5g6b5, r5g6b5, NONE) + AFFINE_FAST_PATHS (reflect_r5g6b5, r5g6b5, REFLECT) + AFFINE_FAST_PATHS (normal_r5g6b5, r5g6b5, NORMAL) + + /* Affine, no alpha */ + { PIXMAN_any, + (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_HAS_TRANSFORM | FAST_PATH_AFFINE_TRANSFORM), + bits_image_fetch_affine_no_alpha, + _pixman_image_get_scanline_generic_64 + }, + + /* General */ + { PIXMAN_any, 0, bits_image_fetch_general, _pixman_image_get_scanline_generic_64 }, + + { PIXMAN_null }, +}; + +static void +bits_image_property_changed (pixman_image_t *image) +{ + uint32_t flags = image->common.flags; + pixman_format_code_t format = image->common.extended_format_code; + const fetcher_info_t *info; + + _pixman_bits_image_setup_accessors (&image->bits); + + info = fetcher_info; + while (info->format != PIXMAN_null) + { + if ((info->format == format || info->format == PIXMAN_any) && + (info->flags & flags) == info->flags) + { + image->common.get_scanline_32 = info->fetch_32; + image->common.get_scanline_64 = info->fetch_64; + break; + } + + info++; + } +} + +static uint32_t * +create_bits (pixman_format_code_t format, + int width, + int height, + int * rowstride_bytes) +{ + int stride; + int buf_size; + int bpp; + + /* what follows is a long-winded way, avoiding any possibility of integer + * overflows, of saying: + * stride = ((width * bpp + 0x1f) >> 5) * 
sizeof (uint32_t); + */ + + bpp = PIXMAN_FORMAT_BPP (format); + if (pixman_multiply_overflows_int (width, bpp)) + return NULL; + + stride = width * bpp; + if (pixman_addition_overflows_int (stride, 0x1f)) + return NULL; + + stride += 0x1f; + stride >>= 5; + + stride *= sizeof (uint32_t); + + if (pixman_multiply_overflows_int (height, stride)) + return NULL; + + buf_size = height * stride; + + if (rowstride_bytes) + *rowstride_bytes = stride; + + return calloc (buf_size, 1); +} + +PIXMAN_EXPORT pixman_image_t * +pixman_image_create_bits (pixman_format_code_t format, + int width, + int height, + uint32_t * bits, + int rowstride_bytes) +{ + pixman_image_t *image; + uint32_t *free_me = NULL; + + /* must be a whole number of uint32_t's + */ + return_val_if_fail ( + bits == NULL || (rowstride_bytes % sizeof (uint32_t)) == 0, NULL); + + return_val_if_fail (PIXMAN_FORMAT_BPP (format) >= PIXMAN_FORMAT_DEPTH (format), NULL); + + if (!bits && width && height) + { + free_me = bits = create_bits (format, width, height, &rowstride_bytes); + if (!bits) + return NULL; + } + + image = _pixman_image_allocate (); + + if (!image) + { + if (free_me) + free (free_me); + + return NULL; + } + + image->type = BITS; + image->bits.format = format; + image->bits.width = width; + image->bits.height = height; + image->bits.bits = bits; + image->bits.free_me = free_me; + image->bits.read_func = NULL; + image->bits.write_func = NULL; + + /* The rowstride is stored in number of uint32_t */ + image->bits.rowstride = rowstride_bytes / (int) sizeof (uint32_t); + + image->bits.indexed = NULL; + + image->common.property_changed = bits_image_property_changed; + + _pixman_image_reset_clip_region (image); + + return image; +} diff --git a/pixman/pixman/pixman-compiler.h b/pixman/pixman/pixman-compiler.h index 340b402a3..ebbffc3f3 100644 --- a/pixman/pixman/pixman-compiler.h +++ b/pixman/pixman/pixman-compiler.h @@ -1,216 +1,215 @@ -/* Pixman uses some non-standard compiler features. This file ensures
- * they exist
- *
- * The features are:
- *
- * FUNC must be defined to expand to the current function
- * PIXMAN_EXPORT should be defined to whatever is required to
- * export functions from a shared library
- * limits limits for various types must be defined
- * inline must be defined
- * force_inline must be defined
- */
-#if defined (__GNUC__)
-# define FUNC ((const char*) (__PRETTY_FUNCTION__))
-#elif defined (__sun) || (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
-# define FUNC ((const char*) (__func__))
-#else
-# define FUNC ((const char*) ("???"))
-#endif
-
-#ifndef INT16_MIN
-# define INT16_MIN (-32767-1)
-#endif
-
-#ifndef INT16_MAX
-# define INT16_MAX (32767)
-#endif
-
-#ifndef INT32_MIN
-# define INT32_MIN (-2147483647-1)
-#endif
-
-#ifndef INT32_MAX
-# define INT32_MAX (2147483647)
-#endif
-
-#ifndef UINT32_MIN
-# define UINT32_MIN (0)
-#endif
-
-#ifndef UINT32_MAX
-# define UINT32_MAX (4294967295U)
-#endif
-
-#ifndef M_PI
-# define M_PI 3.14159265358979323846
-#endif
-
-#ifdef _MSC_VER
-/* 'inline' is available only in C++ in MSVC */
-# define inline __inline
-# define force_inline __forceinline
-# define noinline __declspec(noinline)
-#elif defined __GNUC__ || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
-# define inline __inline__
-# define force_inline __inline__ __attribute__ ((__always_inline__))
-# define noinline __attribute__((noinline))
-#else
-# ifndef force_inline
-# define force_inline inline
-# endif
-# ifndef noinline
-# define noinline
-# endif
-#endif
-
-/* GCC visibility */
-#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(_WIN32)
-# define PIXMAN_EXPORT __attribute__ ((visibility("default")))
-/* Sun Studio 8 visibility */
-#elif defined(__SUNPRO_C) && (__SUNPRO_C >= 0x550)
-# define PIXMAN_EXPORT __global
-#else
-# define PIXMAN_EXPORT
-#endif
-
-/* TLS */
-#if defined(PIXMAN_NO_TLS)
-
-# define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \
- static type name
-# define PIXMAN_GET_THREAD_LOCAL(name) \
- (&name)
-
-#elif defined(TOOLCHAIN_SUPPORTS__THREAD)
-
-# define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \
- static __thread type name
-# define PIXMAN_GET_THREAD_LOCAL(name) \
- (&name)
-
-#elif defined(__MINGW32__) && !defined(__WIN64)
-
-/* We can't include <windows.h> as it causes carious clashes with
- * identifiers in pixman, sigh. So just declare the functions we need
- * here.
- */
-extern long __stdcall InterlockedCompareExchange(long volatile *, long, long);
-#define InterlockedCompareExchangePointer(d,e,c) \
- (void *)InterlockedCompareExchange((long volatile *)(d),(long)(e),(long)(c))
-extern int __stdcall TlsAlloc (void);
-extern void * __stdcall TlsGetValue (unsigned);
-extern int __stdcall TlsSetValue (unsigned, void *);
-extern void * __stdcall CreateMutexA(void *, int, char *);
-extern int __stdcall CloseHandle(void *);
-extern unsigned __stdcall WaitForSingleObject (void *, unsigned);
-extern int __stdcall ReleaseMutex (void *);
-
-# define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \
- static volatile int tls_ ## name ## _initialized = 0; \
- static void *tls_ ## name ## _mutex = NULL; \
- static unsigned tls_ ## name ## _index; \
- \
- static type * \
- tls_ ## name ## _alloc (void) \
- { \
- type *value = calloc (1, sizeof (type)); \
- if (value) \
- TlsSetValue (tls_ ## name ## _index, value); \
- return value; \
- } \
- \
- static force_inline type * \
- tls_ ## name ## _get (void) \
- { \
- type *value; \
- if (!tls_ ## name ## _initialized) \
- { \
- if (!tls_ ## name ## _mutex) \
- { \
- void *mutex = CreateMutexA (NULL, 0, NULL); \
- if (InterlockedCompareExchangePointer ( \
- &tls_ ## name ## _mutex, mutex, NULL) != NULL) \
- { \
- CloseHandle (mutex); \
- } \
- } \
- WaitForSingleObject (tls_ ## name ## _mutex, 0xFFFFFFFF); \
- if (!tls_ ## name ## _initialized) \
- { \
- tls_ ## name ## _index = TlsAlloc (); \
- tls_ ## name ## _initialized = 1; \
- } \
- ReleaseMutex (tls_ ## name ## _mutex); \
- } \
- if (tls_ ## name ## _index == 0xFFFFFFFF) \
- return NULL; \
- value = TlsGetValue (tls_ ## name ## _index); \
- if (!value) \
- value = tls_ ## name ## _alloc (); \
- return value; \
- }
-
-# define PIXMAN_GET_THREAD_LOCAL(name) \
- tls_ ## name ## _get ()
-
-#elif defined(_MSC_VER)
-
-# define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \
- static __declspec(thread) type name
-# define PIXMAN_GET_THREAD_LOCAL(name) \
- (&name)
-
-#elif defined(HAVE_PTHREAD_SETSPECIFIC)
-
-#include <pthread.h>
-
-# define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \
- static pthread_once_t tls_ ## name ## _once_control = PTHREAD_ONCE_INIT; \
- static pthread_key_t tls_ ## name ## _key; \
- \
- static void \
- tls_ ## name ## _destroy_value (void *value) \
- { \
- free (value); \
- } \
- \
- static void \
- tls_ ## name ## _make_key (void) \
- { \
- pthread_key_create (&tls_ ## name ## _key, \
- tls_ ## name ## _destroy_value); \
- } \
- \
- static type * \
- tls_ ## name ## _alloc (void) \
- { \
- type *value = calloc (1, sizeof (type)); \
- if (value) \
- pthread_setspecific (tls_ ## name ## _key, value); \
- return value; \
- } \
- \
- static force_inline type * \
- tls_ ## name ## _get (void) \
- { \
- type *value = NULL; \
- if (pthread_once (&tls_ ## name ## _once_control, \
- tls_ ## name ## _make_key) == 0) \
- { \
- value = pthread_getspecific (tls_ ## name ## _key); \
- if (!value) \
- value = tls_ ## name ## _alloc (); \
- } \
- return value; \
- } \
- extern int no_such_variable
-
-# define PIXMAN_GET_THREAD_LOCAL(name) \
- tls_ ## name ## _get ()
-
-#else
-
-# error "Unknown thread local support for this system. Pixman will not work with multiple threads. Define PIXMAN_NO_TLS to acknowledge and accept this limitation and compile pixman without thread-safety support."
-
-#endif
+/* Pixman uses some non-standard compiler features. This file ensures + * they exist + * + * The features are: + * + * FUNC must be defined to expand to the current function + * PIXMAN_EXPORT should be defined to whatever is required to + * export functions from a shared library + * limits limits for various types must be defined + * inline must be defined + * force_inline must be defined + */ +#if defined (__GNUC__) +# define FUNC ((const char*) (__PRETTY_FUNCTION__)) +#elif defined (__sun) || (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) +# define FUNC ((const char*) (__func__)) +#else +# define FUNC ((const char*) ("???")) +#endif + +#ifndef INT16_MIN +# define INT16_MIN (-32767-1) +#endif + +#ifndef INT16_MAX +# define INT16_MAX (32767) +#endif + +#ifndef INT32_MIN +# define INT32_MIN (-2147483647-1) +#endif + +#ifndef INT32_MAX +# define INT32_MAX (2147483647) +#endif + +#ifndef UINT32_MIN +# define UINT32_MIN (0) +#endif + +#ifndef UINT32_MAX +# define UINT32_MAX (4294967295U) +#endif + +#ifndef M_PI +# define M_PI 3.14159265358979323846 +#endif + +#ifdef _MSC_VER +/* 'inline' is available only in C++ in MSVC */ +# define inline __inline +# define force_inline __forceinline +# define noinline __declspec(noinline) +#elif defined __GNUC__ || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)) +# define inline __inline__ +# define force_inline __inline__ __attribute__ ((__always_inline__)) +# define noinline __attribute__((noinline)) +#else +# ifndef force_inline +# define force_inline inline +# endif +# ifndef noinline +# define noinline +# endif +#endif + +/* GCC visibility */ +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(_WIN32) +# define PIXMAN_EXPORT __attribute__ ((visibility("default"))) +/* Sun Studio 8 visibility */ +#elif defined(__SUNPRO_C) && (__SUNPRO_C >= 0x550) +# define PIXMAN_EXPORT __global +#else +# define PIXMAN_EXPORT +#endif + +/* TLS */ +#if defined(PIXMAN_NO_TLS) + +# define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \ + 
static type name +# define PIXMAN_GET_THREAD_LOCAL(name) \ + (&name) + +#elif defined(TOOLCHAIN_SUPPORTS__THREAD) + +# define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \ + static __thread type name +# define PIXMAN_GET_THREAD_LOCAL(name) \ + (&name) + +#elif defined(__MINGW32__) && !defined(__WIN64) + +/* We can't include <windows.h> as it causes carious clashes with + * identifiers in pixman, sigh. So just declare the functions we need + * here. + */ +extern long __stdcall InterlockedCompareExchange(long volatile *, long, long); +#define InterlockedCompareExchangePointer(d,e,c) \ + (void *)InterlockedCompareExchange((long volatile *)(d),(long)(e),(long)(c)) +extern int __stdcall TlsAlloc (void); +extern void * __stdcall TlsGetValue (unsigned); +extern int __stdcall TlsSetValue (unsigned, void *); +extern void * __stdcall CreateMutexA(void *, int, char *); +extern int __stdcall CloseHandle(void *); +extern unsigned __stdcall WaitForSingleObject (void *, unsigned); +extern int __stdcall ReleaseMutex (void *); + +# define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \ + static volatile int tls_ ## name ## _initialized = 0; \ + static void *tls_ ## name ## _mutex = NULL; \ + static unsigned tls_ ## name ## _index; \ + \ + static type * \ + tls_ ## name ## _alloc (void) \ + { \ + type *value = calloc (1, sizeof (type)); \ + if (value) \ + TlsSetValue (tls_ ## name ## _index, value); \ + return value; \ + } \ + \ + static force_inline type * \ + tls_ ## name ## _get (void) \ + { \ + type *value; \ + if (!tls_ ## name ## _initialized) \ + { \ + if (!tls_ ## name ## _mutex) \ + { \ + void *mutex = CreateMutexA (NULL, 0, NULL); \ + if (InterlockedCompareExchangePointer ( \ + &tls_ ## name ## _mutex, mutex, NULL) != NULL) \ + { \ + CloseHandle (mutex); \ + } \ + } \ + WaitForSingleObject (tls_ ## name ## _mutex, 0xFFFFFFFF); \ + if (!tls_ ## name ## _initialized) \ + { \ + tls_ ## name ## _index = TlsAlloc (); \ + tls_ ## name ## _initialized = 1; \ + } \ + ReleaseMutex (tls_ ## 
name ## _mutex); \ + } \ + if (tls_ ## name ## _index == 0xFFFFFFFF) \ + return NULL; \ + value = TlsGetValue (tls_ ## name ## _index); \ + if (!value) \ + value = tls_ ## name ## _alloc (); \ + return value; \ + } + +# define PIXMAN_GET_THREAD_LOCAL(name) \ + tls_ ## name ## _get () + +#elif defined(_MSC_VER) + +# define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \ + static __declspec(thread) type name +# define PIXMAN_GET_THREAD_LOCAL(name) \ + (&name) + +#elif defined(HAVE_PTHREAD_SETSPECIFIC) + +#include <pthread.h> + +# define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \ + static pthread_once_t tls_ ## name ## _once_control = PTHREAD_ONCE_INIT; \ + static pthread_key_t tls_ ## name ## _key; \ + \ + static void \ + tls_ ## name ## _destroy_value (void *value) \ + { \ + free (value); \ + } \ + \ + static void \ + tls_ ## name ## _make_key (void) \ + { \ + pthread_key_create (&tls_ ## name ## _key, \ + tls_ ## name ## _destroy_value); \ + } \ + \ + static type * \ + tls_ ## name ## _alloc (void) \ + { \ + type *value = calloc (1, sizeof (type)); \ + if (value) \ + pthread_setspecific (tls_ ## name ## _key, value); \ + return value; \ + } \ + \ + static force_inline type * \ + tls_ ## name ## _get (void) \ + { \ + type *value = NULL; \ + if (pthread_once (&tls_ ## name ## _once_control, \ + tls_ ## name ## _make_key) == 0) \ + { \ + value = pthread_getspecific (tls_ ## name ## _key); \ + if (!value) \ + value = tls_ ## name ## _alloc (); \ + } \ + return value; \ + } + +# define PIXMAN_GET_THREAD_LOCAL(name) \ + tls_ ## name ## _get () + +#else + +# error "Unknown thread local support for this system. Pixman will not work with multiple threads. Define PIXMAN_NO_TLS to acknowledge and accept this limitation and compile pixman without thread-safety support." + +#endif diff --git a/pixman/pixman/pixman-cpu.c b/pixman/pixman/pixman-cpu.c index 8032fb42c..70253d1ea 100644 --- a/pixman/pixman/pixman-cpu.c +++ b/pixman/pixman/pixman-cpu.c @@ -1,598 +1,603 @@ -/*
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007 Red Hat, Inc.
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of SuSE not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission. SuSE makes no representations about the
- * suitability of this software for any purpose. It is provided "as is"
- * without express or implied warranty.
- *
- * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
- * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#include <string.h>
-
-#if defined(USE_ARM_SIMD) && defined(_MSC_VER)
-/* Needed for EXCEPTION_ILLEGAL_INSTRUCTION */
-#include <windows.h>
-#endif
-
-#include "pixman-private.h"
-
-#ifdef USE_VMX
-
-/* The CPU detection code needs to be in a file not compiled with
- * "-maltivec -mabi=altivec", as gcc would try to save vector register
- * across function calls causing SIGILL on cpus without Altivec/vmx.
- */
-static pixman_bool_t initialized = FALSE;
-static volatile pixman_bool_t have_vmx = TRUE;
-
-#ifdef __APPLE__
-#include <sys/sysctl.h>
-
-static pixman_bool_t
-pixman_have_vmx (void)
-{
- if (!initialized)
- {
- size_t length = sizeof(have_vmx);
- int error =
- sysctlbyname ("hw.optional.altivec", &have_vmx, &length, NULL, 0);
-
- if (error)
- have_vmx = FALSE;
-
- initialized = TRUE;
- }
- return have_vmx;
-}
-
-#elif defined (__OpenBSD__)
-#include <sys/param.h>
-#include <sys/sysctl.h>
-#include <machine/cpu.h>
-
-static pixman_bool_t
-pixman_have_vmx (void)
-{
- if (!initialized)
- {
- int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC };
- size_t length = sizeof(have_vmx);
- int error =
- sysctl (mib, 2, &have_vmx, &length, NULL, 0);
-
- if (error != 0)
- have_vmx = FALSE;
-
- initialized = TRUE;
- }
- return have_vmx;
-}
-
-#elif defined (__linux__)
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <stdio.h>
-#include <linux/auxvec.h>
-#include <asm/cputable.h>
-
-static pixman_bool_t
-pixman_have_vmx (void)
-{
- if (!initialized)
- {
- char fname[64];
- unsigned long buf[64];
- ssize_t count = 0;
- pid_t pid;
- int fd, i;
-
- pid = getpid ();
- snprintf (fname, sizeof(fname) - 1, "/proc/%d/auxv", pid);
-
- fd = open (fname, O_RDONLY);
- if (fd >= 0)
- {
- for (i = 0; i <= (count / sizeof(unsigned long)); i += 2)
- {
- /* Read more if buf is empty... */
- if (i == (count / sizeof(unsigned long)))
- {
- count = read (fd, buf, sizeof(buf));
- if (count <= 0)
- break;
- i = 0;
- }
-
- if (buf[i] == AT_HWCAP)
- {
- have_vmx = !!(buf[i + 1] & PPC_FEATURE_HAS_ALTIVEC);
- initialized = TRUE;
- break;
- }
- else if (buf[i] == AT_NULL)
- {
- break;
- }
- }
- close (fd);
- }
- }
- if (!initialized)
- {
- /* Something went wrong. Assume 'no' rather than playing
- fragile tricks with catching SIGILL. */
- have_vmx = FALSE;
- initialized = TRUE;
- }
-
- return have_vmx;
-}
-
-#else /* !__APPLE__ && !__OpenBSD__ && !__linux__ */
-#include <signal.h>
-#include <setjmp.h>
-
-static jmp_buf jump_env;
-
-static void
-vmx_test (int sig,
- siginfo_t *si,
- void * unused)
-{
- longjmp (jump_env, 1);
-}
-
-static pixman_bool_t
-pixman_have_vmx (void)
-{
- struct sigaction sa, osa;
- int jmp_result;
-
- if (!initialized)
- {
- sa.sa_flags = SA_SIGINFO;
- sigemptyset (&sa.sa_mask);
- sa.sa_sigaction = vmx_test;
- sigaction (SIGILL, &sa, &osa);
- jmp_result = setjmp (jump_env);
- if (jmp_result == 0)
- {
- asm volatile ( "vor 0, 0, 0" );
- }
- sigaction (SIGILL, &osa, NULL);
- have_vmx = (jmp_result == 0);
- initialized = TRUE;
- }
- return have_vmx;
-}
-
-#endif /* __APPLE__ */
-#endif /* USE_VMX */
-
-#if defined(USE_ARM_SIMD) || defined(USE_ARM_NEON)
-
-#if defined(_MSC_VER)
-
-#if defined(USE_ARM_SIMD)
-extern int pixman_msvc_try_arm_simd_op ();
-
-pixman_bool_t
-pixman_have_arm_simd (void)
-{
- static pixman_bool_t initialized = FALSE;
- static pixman_bool_t have_arm_simd = FALSE;
-
- if (!initialized)
- {
- __try {
- pixman_msvc_try_arm_simd_op ();
- have_arm_simd = TRUE;
- } __except (GetExceptionCode () == EXCEPTION_ILLEGAL_INSTRUCTION) {
- have_arm_simd = FALSE;
- }
- initialized = TRUE;
- }
-
- return have_arm_simd;
-}
-
-#endif /* USE_ARM_SIMD */
-
-#if defined(USE_ARM_NEON)
-extern int pixman_msvc_try_arm_neon_op ();
-
-pixman_bool_t
-pixman_have_arm_neon (void)
-{
- static pixman_bool_t initialized = FALSE;
- static pixman_bool_t have_arm_neon = FALSE;
-
- if (!initialized)
- {
- __try
- {
- pixman_msvc_try_arm_neon_op ();
- have_arm_neon = TRUE;
- }
- __except (GetExceptionCode () == EXCEPTION_ILLEGAL_INSTRUCTION)
- {
- have_arm_neon = FALSE;
- }
- initialized = TRUE;
- }
-
- return have_arm_neon;
-}
-
-#endif /* USE_ARM_NEON */
-
-#else /* linux ELF */
-
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-#include <fcntl.h>
-#include <string.h>
-#include <elf.h>
-
-static pixman_bool_t arm_has_v7 = FALSE;
-static pixman_bool_t arm_has_v6 = FALSE;
-static pixman_bool_t arm_has_vfp = FALSE;
-static pixman_bool_t arm_has_neon = FALSE;
-static pixman_bool_t arm_has_iwmmxt = FALSE;
-static pixman_bool_t arm_tests_initialized = FALSE;
-
-static void
-pixman_arm_read_auxv ()
-{
- int fd;
- Elf32_auxv_t aux;
-
- fd = open ("/proc/self/auxv", O_RDONLY);
- if (fd >= 0)
- {
- while (read (fd, &aux, sizeof(Elf32_auxv_t)) == sizeof(Elf32_auxv_t))
- {
- if (aux.a_type == AT_HWCAP)
- {
- uint32_t hwcap = aux.a_un.a_val;
- /* hardcode these values to avoid depending on specific
- * versions of the hwcap header, e.g. HWCAP_NEON
- */
- arm_has_vfp = (hwcap & 64) != 0;
- arm_has_iwmmxt = (hwcap & 512) != 0;
- /* this flag is only present on kernel 2.6.29 */
- arm_has_neon = (hwcap & 4096) != 0;
- }
- else if (aux.a_type == AT_PLATFORM)
- {
- const char *plat = (const char*) aux.a_un.a_val;
- if (strncmp (plat, "v7l", 3) == 0)
- {
- arm_has_v7 = TRUE;
- arm_has_v6 = TRUE;
- }
- else if (strncmp (plat, "v6l", 3) == 0)
- {
- arm_has_v6 = TRUE;
- }
- }
- }
- close (fd);
- }
-
- arm_tests_initialized = TRUE;
-}
-
-#if defined(USE_ARM_SIMD)
-pixman_bool_t
-pixman_have_arm_simd (void)
-{
- if (!arm_tests_initialized)
- pixman_arm_read_auxv ();
-
- return arm_has_v6;
-}
-
-#endif /* USE_ARM_SIMD */
-
-#if defined(USE_ARM_NEON)
-pixman_bool_t
-pixman_have_arm_neon (void)
-{
- if (!arm_tests_initialized)
- pixman_arm_read_auxv ();
-
- return arm_has_neon;
-}
-
-#endif /* USE_ARM_NEON */
-
-#endif /* linux */
-
-#endif /* USE_ARM_SIMD || USE_ARM_NEON */
-
-#if defined(USE_MMX) || defined(USE_SSE2)
-/* The CPU detection code needs to be in a file not compiled with
- * "-mmmx -msse", as gcc would generate CMOV instructions otherwise
- * that would lead to SIGILL instructions on old CPUs that don't have
- * it.
- */
-#if !defined(__amd64__) && !defined(__x86_64__) && !defined(_M_AMD64)
-
-#ifdef HAVE_GETISAX
-#include <sys/auxv.h>
-#endif
-
-typedef enum
-{
- NO_FEATURES = 0,
- MMX = 0x1,
- MMX_EXTENSIONS = 0x2,
- SSE = 0x6,
- SSE2 = 0x8,
- CMOV = 0x10
-} cpu_features_t;
-
-
-static unsigned int
-detect_cpu_features (void)
-{
- unsigned int features = 0;
- unsigned int result = 0;
-
-#ifdef HAVE_GETISAX
- if (getisax (&result, 1))
- {
- if (result & AV_386_CMOV)
- features |= CMOV;
- if (result & AV_386_MMX)
- features |= MMX;
- if (result & AV_386_AMD_MMX)
- features |= MMX_EXTENSIONS;
- if (result & AV_386_SSE)
- features |= SSE;
- if (result & AV_386_SSE2)
- features |= SSE2;
- }
-#else
- char vendor[13];
-#ifdef _MSC_VER
- int vendor0 = 0, vendor1, vendor2;
-#endif
- vendor[0] = 0;
- vendor[12] = 0;
-
-#ifdef __GNUC__
- /* see p. 118 of amd64 instruction set manual Vol3 */
- /* We need to be careful about the handling of %ebx and
- * %esp here. We can't declare either one as clobbered
- * since they are special registers (%ebx is the "PIC
- * register" holding an offset to global data, %esp the
- * stack pointer), so we need to make sure they have their
- * original values when we access the output operands.
- */
- __asm__ (
- "pushf\n"
- "pop %%eax\n"
- "mov %%eax, %%ecx\n"
- "xor $0x00200000, %%eax\n"
- "push %%eax\n"
- "popf\n"
- "pushf\n"
- "pop %%eax\n"
- "mov $0x0, %%edx\n"
- "xor %%ecx, %%eax\n"
- "jz 1f\n"
-
- "mov $0x00000000, %%eax\n"
- "push %%ebx\n"
- "cpuid\n"
- "mov %%ebx, %%eax\n"
- "pop %%ebx\n"
- "mov %%eax, %1\n"
- "mov %%edx, %2\n"
- "mov %%ecx, %3\n"
- "mov $0x00000001, %%eax\n"
- "push %%ebx\n"
- "cpuid\n"
- "pop %%ebx\n"
- "1:\n"
- "mov %%edx, %0\n"
- : "=r" (result),
- "=m" (vendor[0]),
- "=m" (vendor[4]),
- "=m" (vendor[8])
- :
- : "%eax", "%ecx", "%edx"
- );
-
-#elif defined (_MSC_VER)
-
- _asm {
- pushfd
- pop eax
- mov ecx, eax
- xor eax, 00200000h
- push eax
- popfd
- pushfd
- pop eax
- mov edx, 0
- xor eax, ecx
- jz nocpuid
-
- mov eax, 0
- push ebx
- cpuid
- mov eax, ebx
- pop ebx
- mov vendor0, eax
- mov vendor1, edx
- mov vendor2, ecx
- mov eax, 1
- push ebx
- cpuid
- pop ebx
- nocpuid:
- mov result, edx
- }
- memmove (vendor + 0, &vendor0, 4);
- memmove (vendor + 4, &vendor1, 4);
- memmove (vendor + 8, &vendor2, 4);
-
-#else
-# error unsupported compiler
-#endif
-
- features = 0;
- if (result)
- {
- /* result now contains the standard feature bits */
- if (result & (1 << 15))
- features |= CMOV;
- if (result & (1 << 23))
- features |= MMX;
- if (result & (1 << 25))
- features |= SSE;
- if (result & (1 << 26))
- features |= SSE2;
- if ((features & MMX) && !(features & SSE) &&
- (strcmp (vendor, "AuthenticAMD") == 0 ||
- strcmp (vendor, "Geode by NSC") == 0))
- {
- /* check for AMD MMX extensions */
-#ifdef __GNUC__
- __asm__ (
- " push %%ebx\n"
- " mov $0x80000000, %%eax\n"
- " cpuid\n"
- " xor %%edx, %%edx\n"
- " cmp $0x1, %%eax\n"
- " jge 2f\n"
- " mov $0x80000001, %%eax\n"
- " cpuid\n"
- "2:\n"
- " pop %%ebx\n"
- " mov %%edx, %0\n"
- : "=r" (result)
- :
- : "%eax", "%ecx", "%edx"
- );
-#elif defined _MSC_VER
- _asm {
- push ebx
- mov eax, 80000000h
- cpuid
- xor edx, edx
- cmp eax, 1
- jge notamd
- mov eax, 80000001h
- cpuid
- notamd:
- pop ebx
- mov result, edx
- }
-#endif
- if (result & (1 << 22))
- features |= MMX_EXTENSIONS;
- }
- }
-#endif /* HAVE_GETISAX */
-
- return features;
-}
-
-static pixman_bool_t
-pixman_have_mmx (void)
-{
- static pixman_bool_t initialized = FALSE;
- static pixman_bool_t mmx_present;
-
- if (!initialized)
- {
- unsigned int features = detect_cpu_features ();
- mmx_present = (features & (MMX | MMX_EXTENSIONS)) == (MMX | MMX_EXTENSIONS);
- initialized = TRUE;
- }
-
- return mmx_present;
-}
-
-#ifdef USE_SSE2
-static pixman_bool_t
-pixman_have_sse2 (void)
-{
- static pixman_bool_t initialized = FALSE;
- static pixman_bool_t sse2_present;
-
- if (!initialized)
- {
- unsigned int features = detect_cpu_features ();
- sse2_present = (features & (MMX | MMX_EXTENSIONS | SSE | SSE2)) == (MMX | MMX_EXTENSIONS | SSE | SSE2);
- initialized = TRUE;
- }
-
- return sse2_present;
-}
-
-#endif
-
-#else /* __amd64__ */
-#ifdef USE_MMX
-#define pixman_have_mmx() TRUE
-#endif
-#ifdef USE_SSE2
-#define pixman_have_sse2() TRUE
-#endif
-#endif /* __amd64__ */
-#endif
-
-pixman_implementation_t *
-_pixman_choose_implementation (void)
-{
-#ifdef USE_SSE2
- if (pixman_have_sse2 ())
- return _pixman_implementation_create_sse2 ();
-#endif
-#ifdef USE_MMX
- if (pixman_have_mmx ())
- return _pixman_implementation_create_mmx ();
-#endif
-
-#ifdef USE_ARM_NEON
- if (pixman_have_arm_neon ())
- return _pixman_implementation_create_arm_neon ();
-#endif
-#ifdef USE_ARM_SIMD
- if (pixman_have_arm_simd ())
- return _pixman_implementation_create_arm_simd ();
-#endif
-#ifdef USE_VMX
- if (pixman_have_vmx ())
- return _pixman_implementation_create_vmx ();
-#endif
-
- return _pixman_implementation_create_fast_path ();
-}
-
+/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of SuSE not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. SuSE makes no representations about the + * suitability of this software for any purpose. It is provided "as is" + * without express or implied warranty. + * + * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE + * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <string.h> + +#if defined(USE_ARM_SIMD) && defined(_MSC_VER) +/* Needed for EXCEPTION_ILLEGAL_INSTRUCTION */ +#include <windows.h> +#endif + +#include "pixman-private.h" + +#ifdef USE_VMX + +/* The CPU detection code needs to be in a file not compiled with + * "-maltivec -mabi=altivec", as gcc would try to save vector register + * across function calls causing SIGILL on cpus without Altivec/vmx. 
+ */ +static pixman_bool_t initialized = FALSE; +static volatile pixman_bool_t have_vmx = TRUE; + +#ifdef __APPLE__ +#include <sys/sysctl.h> + +static pixman_bool_t +pixman_have_vmx (void) +{ + if (!initialized) + { + size_t length = sizeof(have_vmx); + int error = + sysctlbyname ("hw.optional.altivec", &have_vmx, &length, NULL, 0); + + if (error) + have_vmx = FALSE; + + initialized = TRUE; + } + return have_vmx; +} + +#elif defined (__OpenBSD__) +#include <sys/param.h> +#include <sys/sysctl.h> +#include <machine/cpu.h> + +static pixman_bool_t +pixman_have_vmx (void) +{ + if (!initialized) + { + int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC }; + size_t length = sizeof(have_vmx); + int error = + sysctl (mib, 2, &have_vmx, &length, NULL, 0); + + if (error != 0) + have_vmx = FALSE; + + initialized = TRUE; + } + return have_vmx; +} + +#elif defined (__linux__) +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdio.h> +#include <linux/auxvec.h> +#include <asm/cputable.h> + +static pixman_bool_t +pixman_have_vmx (void) +{ + if (!initialized) + { + char fname[64]; + unsigned long buf[64]; + ssize_t count = 0; + pid_t pid; + int fd, i; + + pid = getpid (); + snprintf (fname, sizeof(fname) - 1, "/proc/%d/auxv", pid); + + fd = open (fname, O_RDONLY); + if (fd >= 0) + { + for (i = 0; i <= (count / sizeof(unsigned long)); i += 2) + { + /* Read more if buf is empty... */ + if (i == (count / sizeof(unsigned long))) + { + count = read (fd, buf, sizeof(buf)); + if (count <= 0) + break; + i = 0; + } + + if (buf[i] == AT_HWCAP) + { + have_vmx = !!(buf[i + 1] & PPC_FEATURE_HAS_ALTIVEC); + initialized = TRUE; + break; + } + else if (buf[i] == AT_NULL) + { + break; + } + } + close (fd); + } + } + if (!initialized) + { + /* Something went wrong. Assume 'no' rather than playing + fragile tricks with catching SIGILL. 
*/ + have_vmx = FALSE; + initialized = TRUE; + } + + return have_vmx; +} + +#else /* !__APPLE__ && !__OpenBSD__ && !__linux__ */ +#include <signal.h> +#include <setjmp.h> + +static jmp_buf jump_env; + +static void +vmx_test (int sig, + siginfo_t *si, + void * unused) +{ + longjmp (jump_env, 1); +} + +static pixman_bool_t +pixman_have_vmx (void) +{ + struct sigaction sa, osa; + int jmp_result; + + if (!initialized) + { + sa.sa_flags = SA_SIGINFO; + sigemptyset (&sa.sa_mask); + sa.sa_sigaction = vmx_test; + sigaction (SIGILL, &sa, &osa); + jmp_result = setjmp (jump_env); + if (jmp_result == 0) + { + asm volatile ( "vor 0, 0, 0" ); + } + sigaction (SIGILL, &osa, NULL); + have_vmx = (jmp_result == 0); + initialized = TRUE; + } + return have_vmx; +} + +#endif /* __APPLE__ */ +#endif /* USE_VMX */ + +#if defined(USE_ARM_SIMD) || defined(USE_ARM_NEON) + +#if defined(_MSC_VER) + +#if defined(USE_ARM_SIMD) +extern int pixman_msvc_try_arm_simd_op (); + +pixman_bool_t +pixman_have_arm_simd (void) +{ + static pixman_bool_t initialized = FALSE; + static pixman_bool_t have_arm_simd = FALSE; + + if (!initialized) + { + __try { + pixman_msvc_try_arm_simd_op (); + have_arm_simd = TRUE; + } __except (GetExceptionCode () == EXCEPTION_ILLEGAL_INSTRUCTION) { + have_arm_simd = FALSE; + } + initialized = TRUE; + } + + return have_arm_simd; +} + +#endif /* USE_ARM_SIMD */ + +#if defined(USE_ARM_NEON) +extern int pixman_msvc_try_arm_neon_op (); + +pixman_bool_t +pixman_have_arm_neon (void) +{ + static pixman_bool_t initialized = FALSE; + static pixman_bool_t have_arm_neon = FALSE; + + if (!initialized) + { + __try + { + pixman_msvc_try_arm_neon_op (); + have_arm_neon = TRUE; + } + __except (GetExceptionCode () == EXCEPTION_ILLEGAL_INSTRUCTION) + { + have_arm_neon = FALSE; + } + initialized = TRUE; + } + + return have_arm_neon; +} + +#endif /* USE_ARM_NEON */ + +#elif defined (__linux__) /* linux ELF */ + +#include <stdlib.h> +#include <unistd.h> +#include <sys/types.h> +#include 
<sys/stat.h> +#include <sys/mman.h> +#include <fcntl.h> +#include <string.h> +#include <elf.h> + +static pixman_bool_t arm_has_v7 = FALSE; +static pixman_bool_t arm_has_v6 = FALSE; +static pixman_bool_t arm_has_vfp = FALSE; +static pixman_bool_t arm_has_neon = FALSE; +static pixman_bool_t arm_has_iwmmxt = FALSE; +static pixman_bool_t arm_tests_initialized = FALSE; + +static void +pixman_arm_read_auxv () +{ + int fd; + Elf32_auxv_t aux; + + fd = open ("/proc/self/auxv", O_RDONLY); + if (fd >= 0) + { + while (read (fd, &aux, sizeof(Elf32_auxv_t)) == sizeof(Elf32_auxv_t)) + { + if (aux.a_type == AT_HWCAP) + { + uint32_t hwcap = aux.a_un.a_val; + /* hardcode these values to avoid depending on specific + * versions of the hwcap header, e.g. HWCAP_NEON + */ + arm_has_vfp = (hwcap & 64) != 0; + arm_has_iwmmxt = (hwcap & 512) != 0; + /* this flag is only present on kernel 2.6.29 */ + arm_has_neon = (hwcap & 4096) != 0; + } + else if (aux.a_type == AT_PLATFORM) + { + const char *plat = (const char*) aux.a_un.a_val; + if (strncmp (plat, "v7l", 3) == 0) + { + arm_has_v7 = TRUE; + arm_has_v6 = TRUE; + } + else if (strncmp (plat, "v6l", 3) == 0) + { + arm_has_v6 = TRUE; + } + } + } + close (fd); + } + + arm_tests_initialized = TRUE; +} + +#if defined(USE_ARM_SIMD) +pixman_bool_t +pixman_have_arm_simd (void) +{ + if (!arm_tests_initialized) + pixman_arm_read_auxv (); + + return arm_has_v6; +} + +#endif /* USE_ARM_SIMD */ + +#if defined(USE_ARM_NEON) +pixman_bool_t +pixman_have_arm_neon (void) +{ + if (!arm_tests_initialized) + pixman_arm_read_auxv (); + + return arm_has_neon; +} + +#endif /* USE_ARM_NEON */ + +#else /* linux ELF */ + +#define pixman_have_arm_simd() FALSE +#define pixman_have_arm_neon() FALSE + +#endif + +#endif /* USE_ARM_SIMD || USE_ARM_NEON */ + +#if defined(USE_MMX) || defined(USE_SSE2) +/* The CPU detection code needs to be in a file not compiled with + * "-mmmx -msse", as gcc would generate CMOV instructions otherwise + * that would lead to SIGILL 
instructions on old CPUs that don't have + * it. + */ +#if !defined(__amd64__) && !defined(__x86_64__) && !defined(_M_AMD64) + +#ifdef HAVE_GETISAX +#include <sys/auxv.h> +#endif + +typedef enum +{ + NO_FEATURES = 0, + MMX = 0x1, + MMX_EXTENSIONS = 0x2, + SSE = 0x6, + SSE2 = 0x8, + CMOV = 0x10 +} cpu_features_t; + + +static unsigned int +detect_cpu_features (void) +{ + unsigned int features = 0; + unsigned int result = 0; + +#ifdef HAVE_GETISAX + if (getisax (&result, 1)) + { + if (result & AV_386_CMOV) + features |= CMOV; + if (result & AV_386_MMX) + features |= MMX; + if (result & AV_386_AMD_MMX) + features |= MMX_EXTENSIONS; + if (result & AV_386_SSE) + features |= SSE; + if (result & AV_386_SSE2) + features |= SSE2; + } +#else + char vendor[13]; +#ifdef _MSC_VER + int vendor0 = 0, vendor1, vendor2; +#endif + vendor[0] = 0; + vendor[12] = 0; + +#ifdef __GNUC__ + /* see p. 118 of amd64 instruction set manual Vol3 */ + /* We need to be careful about the handling of %ebx and + * %esp here. We can't declare either one as clobbered + * since they are special registers (%ebx is the "PIC + * register" holding an offset to global data, %esp the + * stack pointer), so we need to make sure they have their + * original values when we access the output operands. 
+ */ + __asm__ ( + "pushf\n" + "pop %%eax\n" + "mov %%eax, %%ecx\n" + "xor $0x00200000, %%eax\n" + "push %%eax\n" + "popf\n" + "pushf\n" + "pop %%eax\n" + "mov $0x0, %%edx\n" + "xor %%ecx, %%eax\n" + "jz 1f\n" + + "mov $0x00000000, %%eax\n" + "push %%ebx\n" + "cpuid\n" + "mov %%ebx, %%eax\n" + "pop %%ebx\n" + "mov %%eax, %1\n" + "mov %%edx, %2\n" + "mov %%ecx, %3\n" + "mov $0x00000001, %%eax\n" + "push %%ebx\n" + "cpuid\n" + "pop %%ebx\n" + "1:\n" + "mov %%edx, %0\n" + : "=r" (result), + "=m" (vendor[0]), + "=m" (vendor[4]), + "=m" (vendor[8]) + : + : "%eax", "%ecx", "%edx" + ); + +#elif defined (_MSC_VER) + + _asm { + pushfd + pop eax + mov ecx, eax + xor eax, 00200000h + push eax + popfd + pushfd + pop eax + mov edx, 0 + xor eax, ecx + jz nocpuid + + mov eax, 0 + push ebx + cpuid + mov eax, ebx + pop ebx + mov vendor0, eax + mov vendor1, edx + mov vendor2, ecx + mov eax, 1 + push ebx + cpuid + pop ebx + nocpuid: + mov result, edx + } + memmove (vendor + 0, &vendor0, 4); + memmove (vendor + 4, &vendor1, 4); + memmove (vendor + 8, &vendor2, 4); + +#else +# error unsupported compiler +#endif + + features = 0; + if (result) + { + /* result now contains the standard feature bits */ + if (result & (1 << 15)) + features |= CMOV; + if (result & (1 << 23)) + features |= MMX; + if (result & (1 << 25)) + features |= SSE; + if (result & (1 << 26)) + features |= SSE2; + if ((features & MMX) && !(features & SSE) && + (strcmp (vendor, "AuthenticAMD") == 0 || + strcmp (vendor, "Geode by NSC") == 0)) + { + /* check for AMD MMX extensions */ +#ifdef __GNUC__ + __asm__ ( + " push %%ebx\n" + " mov $0x80000000, %%eax\n" + " cpuid\n" + " xor %%edx, %%edx\n" + " cmp $0x1, %%eax\n" + " jge 2f\n" + " mov $0x80000001, %%eax\n" + " cpuid\n" + "2:\n" + " pop %%ebx\n" + " mov %%edx, %0\n" + : "=r" (result) + : + : "%eax", "%ecx", "%edx" + ); +#elif defined _MSC_VER + _asm { + push ebx + mov eax, 80000000h + cpuid + xor edx, edx + cmp eax, 1 + jge notamd + mov eax, 80000001h + cpuid + notamd: 
+ pop ebx + mov result, edx + } +#endif + if (result & (1 << 22)) + features |= MMX_EXTENSIONS; + } + } +#endif /* HAVE_GETISAX */ + + return features; +} + +static pixman_bool_t +pixman_have_mmx (void) +{ + static pixman_bool_t initialized = FALSE; + static pixman_bool_t mmx_present; + + if (!initialized) + { + unsigned int features = detect_cpu_features (); + mmx_present = (features & (MMX | MMX_EXTENSIONS)) == (MMX | MMX_EXTENSIONS); + initialized = TRUE; + } + + return mmx_present; +} + +#ifdef USE_SSE2 +static pixman_bool_t +pixman_have_sse2 (void) +{ + static pixman_bool_t initialized = FALSE; + static pixman_bool_t sse2_present; + + if (!initialized) + { + unsigned int features = detect_cpu_features (); + sse2_present = (features & (MMX | MMX_EXTENSIONS | SSE | SSE2)) == (MMX | MMX_EXTENSIONS | SSE | SSE2); + initialized = TRUE; + } + + return sse2_present; +} + +#endif + +#else /* __amd64__ */ +#ifdef USE_MMX +#define pixman_have_mmx() TRUE +#endif +#ifdef USE_SSE2 +#define pixman_have_sse2() TRUE +#endif +#endif /* __amd64__ */ +#endif + +pixman_implementation_t * +_pixman_choose_implementation (void) +{ +#ifdef USE_SSE2 + if (pixman_have_sse2 ()) + return _pixman_implementation_create_sse2 (); +#endif +#ifdef USE_MMX + if (pixman_have_mmx ()) + return _pixman_implementation_create_mmx (); +#endif + +#ifdef USE_ARM_NEON + if (pixman_have_arm_neon ()) + return _pixman_implementation_create_arm_neon (); +#endif +#ifdef USE_ARM_SIMD + if (pixman_have_arm_simd ()) + return _pixman_implementation_create_arm_simd (); +#endif +#ifdef USE_VMX + if (pixman_have_vmx ()) + return _pixman_implementation_create_vmx (); +#endif + + return _pixman_implementation_create_fast_path (); +} + diff --git a/pixman/pixman/pixman-fast-path.c b/pixman/pixman/pixman-fast-path.c index 01a3017c7..f103b4cf1 100644 --- a/pixman/pixman/pixman-fast-path.c +++ b/pixman/pixman/pixman-fast-path.c @@ -1,1937 +1,1937 @@ -/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; 
-*- */
-/*
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007 Red Hat, Inc.
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of SuSE not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission. SuSE makes no representations about the
- * suitability of this software for any purpose. It is provided "as is"
- * without express or implied warranty.
- *
- * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
- * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
- * Author: Keith Packard, SuSE, Inc.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include <string.h>
-#include <stdlib.h>
-#include "pixman-private.h"
-#include "pixman-combine32.h"
-#include "pixman-fast-path.h"
-
-static force_inline uint32_t
-fetch_24 (uint8_t *a)
-{
- if (((unsigned long)a) & 1)
- {
-#ifdef WORDS_BIGENDIAN
- return (*a << 16) | (*(uint16_t *)(a + 1));
-#else
- return *a | (*(uint16_t *)(a + 1) << 8);
-#endif
- }
- else
- {
-#ifdef WORDS_BIGENDIAN
- return (*(uint16_t *)a << 8) | *(a + 2);
-#else
- return *(uint16_t *)a | (*(a + 2) << 16);
-#endif
- }
-}
-
-static force_inline void
-store_24 (uint8_t *a,
- uint32_t v)
-{
- if (((unsigned long)a) & 1)
- {
-#ifdef WORDS_BIGENDIAN
- *a = (uint8_t) (v >> 16);
- *(uint16_t *)(a + 1) = (uint16_t) (v);
-#else
- *a = (uint8_t) (v);
- *(uint16_t *)(a + 1) = (uint16_t) (v >> 8);
-#endif
- }
- else
- {
-#ifdef WORDS_BIGENDIAN
- *(uint16_t *)a = (uint16_t)(v >> 8);
- *(a + 2) = (uint8_t)v;
-#else
- *(uint16_t *)a = (uint16_t)v;
- *(a + 2) = (uint8_t)(v >> 16);
-#endif
- }
-}
-
-static force_inline uint32_t
-over (uint32_t src,
- uint32_t dest)
-{
- uint32_t a = ~src >> 24;
-
- UN8x4_MUL_UN8_ADD_UN8x4 (dest, a, src);
-
- return dest;
-}
-
-static uint32_t
-in (uint32_t x,
- uint8_t y)
-{
- uint16_t a = y;
-
- UN8x4_MUL_UN8 (x, a);
-
- return x;
-}
-
-/*
- * Naming convention:
- *
- * op_src_mask_dest
- */
-static void
-fast_composite_over_x888_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *src, *src_line;
- uint32_t *dst, *dst_line;
- uint8_t *mask, *mask_line;
- int src_stride, mask_stride, dst_stride;
- uint8_t m;
- uint32_t s, d;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- src = src_line;
- src_line += src_stride;
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
-
- w = width;
- while (w--)
- {
- m = *mask++;
- if (m)
- {
- s = *src | 0xff000000;
-
- if (m == 0xff)
- {
- *dst = s;
- }
- else
- {
- d = in (s, m);
- *dst = over (d, *dst);
- }
- }
- src++;
- dst++;
- }
- }
-}
-
-static void
-fast_composite_in_n_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dest_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint8_t *dst_line, *dst;
- uint8_t *mask_line, *mask, m;
- int dst_stride, mask_stride;
- int32_t w;
- uint16_t t;
-
- src = _pixman_image_get_solid (src_image, dest_image->bits.format);
-
- srca = src >> 24;
-
- PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- if (srca == 0xff)
- {
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- m = *mask++;
-
- if (m == 0)
- *dst = 0;
- else if (m != 0xff)
- *dst = MUL_UN8 (m, *dst, t);
-
- dst++;
- }
- }
- }
- else
- {
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- m = *mask++;
- m = MUL_UN8 (m, srca, t);
-
- if (m == 0)
- *dst = 0;
- else if (m != 0xff)
- *dst = MUL_UN8 (m, *dst, t);
-
- dst++;
- }
- }
- }
-}
-
-static void
-fast_composite_in_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dest_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- uint8_t *src_line, *src;
- int dst_stride, src_stride;
- int32_t w;
- uint8_t s;
- uint16_t t;
-
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
-
- if (s == 0)
- *dst = 0;
- else if (s != 0xff)
- *dst = MUL_UN8 (s, *dst, t);
-
- dst++;
- }
- }
-}
-
-static void
-fast_composite_over_n_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint32_t *dst_line, *dst, d;
- uint8_t *mask_line, *mask, m;
- int dst_stride, mask_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- m = *mask++;
- if (m == 0xff)
- {
- if (srca == 0xff)
- *dst = src;
- else
- *dst = over (src, *dst);
- }
- else if (m)
- {
- d = in (src, m);
- *dst = over (d, *dst);
- }
- dst++;
- }
- }
-}
-
-static void
-fast_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca, s;
- uint32_t *dst_line, *dst, d;
- uint32_t *mask_line, *mask, ma;
- int dst_stride, mask_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- ma = *mask++;
-
- if (ma)
- {
- d = *dst;
- s = src;
-
- UN8x4_MUL_UN8x4_ADD_UN8x4 (s, ma, d);
-
- *dst = s;
- }
-
- dst++;
- }
- }
-}
-
-static void
-fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca, s;
- uint32_t *dst_line, *dst, d;
- uint32_t *mask_line, *mask, ma;
- int dst_stride, mask_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- ma = *mask++;
- if (ma == 0xffffffff)
- {
- if (srca == 0xff)
- *dst = src;
- else
- *dst = over (src, *dst);
- }
- else if (ma)
- {
- d = *dst;
- s = src;
-
- UN8x4_MUL_UN8x4 (s, ma);
- UN8x4_MUL_UN8 (ma, srca);
- ma = ~ma;
- UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
-
- *dst = d;
- }
-
- dst++;
- }
- }
-}
-
-static void
-fast_composite_over_n_8_0888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint8_t *dst_line, *dst;
- uint32_t d;
- uint8_t *mask_line, *mask, m;
- int dst_stride, mask_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- m = *mask++;
- if (m == 0xff)
- {
- if (srca == 0xff)
- {
- d = src;
- }
- else
- {
- d = fetch_24 (dst);
- d = over (src, d);
- }
- store_24 (dst, d);
- }
- else if (m)
- {
- d = over (in (src, m), fetch_24 (dst));
- store_24 (dst, d);
- }
- dst += 3;
- }
- }
-}
-
-static void
-fast_composite_over_n_8_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint16_t *dst_line, *dst;
- uint32_t d;
- uint8_t *mask_line, *mask, m;
- int dst_stride, mask_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- m = *mask++;
- if (m == 0xff)
- {
- if (srca == 0xff)
- {
- d = src;
- }
- else
- {
- d = *dst;
- d = over (src, CONVERT_0565_TO_0888 (d));
- }
- *dst = CONVERT_8888_TO_0565 (d);
- }
- else if (m)
- {
- d = *dst;
- d = over (in (src, m), CONVERT_0565_TO_0888 (d));
- *dst = CONVERT_8888_TO_0565 (d);
- }
- dst++;
- }
- }
-}
-
-static void
-fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca, s;
- uint16_t src16;
- uint16_t *dst_line, *dst;
- uint32_t d;
- uint32_t *mask_line, *mask, ma;
- int dst_stride, mask_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- src16 = CONVERT_8888_TO_0565 (src);
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- ma = *mask++;
- if (ma == 0xffffffff)
- {
- if (srca == 0xff)
- {
- *dst = src16;
- }
- else
- {
- d = *dst;
- d = over (src, CONVERT_0565_TO_0888 (d));
- *dst = CONVERT_8888_TO_0565 (d);
- }
- }
- else if (ma)
- {
- d = *dst;
- d = CONVERT_0565_TO_0888 (d);
-
- s = src;
-
- UN8x4_MUL_UN8x4 (s, ma);
- UN8x4_MUL_UN8 (ma, srca);
- ma = ~ma;
- UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
-
- *dst = CONVERT_8888_TO_0565 (d);
- }
- dst++;
- }
- }
-}
-
-static void
-fast_composite_over_8888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src, s;
- int dst_stride, src_stride;
- uint8_t a;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
- a = s >> 24;
- if (a == 0xff)
- *dst = s;
- else if (s)
- *dst = over (s, *dst);
- dst++;
- }
- }
-}
-
-static void
-fast_composite_src_x888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src;
- int dst_stride, src_stride;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- *dst++ = (*src++) | 0xff000000;
- }
-}
-
-#if 0
-static void
-fast_composite_over_8888_0888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- uint32_t d;
- uint32_t *src_line, *src, s;
- uint8_t a;
- int dst_stride, src_stride;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
- a = s >> 24;
- if (a)
- {
- if (a == 0xff)
- d = s;
- else
- d = over (s, fetch_24 (dst));
-
- store_24 (dst, d);
- }
- dst += 3;
- }
- }
-}
-#endif
-
-static void
-fast_composite_over_8888_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint16_t *dst_line, *dst;
- uint32_t d;
- uint32_t *src_line, *src, s;
- uint8_t a;
- int dst_stride, src_stride;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
- a = s >> 24;
- if (s)
- {
- if (a == 0xff)
- {
- d = s;
- }
- else
- {
- d = *dst;
- d = over (s, CONVERT_0565_TO_0888 (d));
- }
- *dst = CONVERT_8888_TO_0565 (d);
- }
- dst++;
- }
- }
-}
-
-static void
-fast_composite_src_x888_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint16_t *dst_line, *dst;
- uint32_t *src_line, *src, s;
- int dst_stride, src_stride;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
- *dst = CONVERT_8888_TO_0565 (s);
- dst++;
- }
- }
-}
-
-static void
-fast_composite_add_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- uint8_t *src_line, *src;
- int dst_stride, src_stride;
- int32_t w;
- uint8_t s, d;
- uint16_t t;
-
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
- if (s)
- {
- if (s != 0xff)
- {
- d = *dst;
- t = d + s;
- s = t | (0 - (t >> 8));
- }
- *dst = s;
- }
- dst++;
- }
- }
-}
-
-static void
-fast_composite_add_8888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src;
- int dst_stride, src_stride;
- int32_t w;
- uint32_t s, d;
-
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
- if (s)
- {
- if (s != 0xffffffff)
- {
- d = *dst;
- if (d)
- UN8x4_ADD_UN8x4 (s, d);
- }
- *dst = s;
- }
- dst++;
- }
- }
-}
-
-static void
-fast_composite_add_n_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- uint8_t *mask_line, *mask;
- int dst_stride, mask_stride;
- int32_t w;
- uint32_t src;
- uint8_t sa;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
- sa = (src >> 24);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- uint16_t tmp;
- uint16_t a;
- uint32_t m, d;
- uint32_t r;
-
- a = *mask++;
- d = *dst;
-
- m = MUL_UN8 (sa, a, tmp);
- r = ADD_UN8 (m, d, tmp);
-
- *dst++ = r;
- }
- }
-}
-
-#ifdef WORDS_BIGENDIAN
-#define CREATE_BITMASK(n) (0x80000000 >> (n))
-#define UPDATE_BITMASK(n) ((n) >> 1)
-#else
-#define CREATE_BITMASK(n) (1 << (n))
-#define UPDATE_BITMASK(n) ((n) << 1)
-#endif
-
-#define TEST_BIT(p, n) \
- (*((p) + ((n) >> 5)) & CREATE_BITMASK ((n) & 31))
-#define SET_BIT(p, n) \
- do { *((p) + ((n) >> 5)) |= CREATE_BITMASK ((n) & 31); } while (0);
-
-static void
-fast_composite_add_1000_1000 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src;
- int dst_stride, src_stride;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (src_image, 0, src_y, uint32_t,
- src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (dst_image, 0, dest_y, uint32_t,
- dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- /*
- * TODO: improve performance by processing uint32_t data instead
- * of individual bits
- */
- if (TEST_BIT (src, src_x + w))
- SET_BIT (dst, dest_x + w);
- }
- }
-}
-
-static void
-fast_composite_over_n_1_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint32_t *dst, *dst_line;
- uint32_t *mask, *mask_line;
- int mask_stride, dst_stride;
- uint32_t bitcache, bitmask;
- int32_t w;
-
- if (width <= 0)
- return;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t,
- dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
- mask_stride, mask_line, 1);
- mask_line += mask_x >> 5;
-
- if (srca == 0xff)
- {
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (mask_x & 31);
-
- while (w--)
- {
- if (bitmask == 0)
- {
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (0);
- }
- if (bitcache & bitmask)
- *dst = src;
- bitmask = UPDATE_BITMASK (bitmask);
- dst++;
- }
- }
- }
- else
- {
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (mask_x & 31);
-
- while (w--)
- {
- if (bitmask == 0)
- {
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (0);
- }
- if (bitcache & bitmask)
- *dst = over (src, *dst);
- bitmask = UPDATE_BITMASK (bitmask);
- dst++;
- }
- }
- }
-}
-
-static void
-fast_composite_over_n_1_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint16_t *dst, *dst_line;
- uint32_t *mask, *mask_line;
- int mask_stride, dst_stride;
- uint32_t bitcache, bitmask;
- int32_t w;
- uint32_t d;
- uint16_t src565;
-
- if (width <= 0)
- return;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t,
- dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
- mask_stride, mask_line, 1);
- mask_line += mask_x >> 5;
-
- if (srca == 0xff)
- {
- src565 = CONVERT_8888_TO_0565 (src);
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (mask_x & 31);
-
- while (w--)
- {
- if (bitmask == 0)
- {
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (0);
- }
- if (bitcache & bitmask)
- *dst = src565;
- bitmask = UPDATE_BITMASK (bitmask);
- dst++;
- }
- }
- }
- else
- {
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (mask_x & 31);
-
- while (w--)
- {
- if (bitmask == 0)
- {
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (0);
- }
- if (bitcache & bitmask)
- {
- d = over (src, CONVERT_0565_TO_0888 (*dst));
- *dst = CONVERT_8888_TO_0565 (d);
- }
- bitmask = UPDATE_BITMASK (bitmask);
- dst++;
- }
- }
- }
-}
-
-/*
- * Simple bitblt
- */
-
-static void
-fast_composite_solid_fill (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- if (dst_image->bits.format == PIXMAN_a1)
- {
- src = src >> 31;
- }
- else if (dst_image->bits.format == PIXMAN_a8)
- {
- src = src >> 24;
- }
- else if (dst_image->bits.format == PIXMAN_r5g6b5 ||
- dst_image->bits.format == PIXMAN_b5g6r5)
- {
- src = CONVERT_8888_TO_0565 (src);
- }
-
- pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
- PIXMAN_FORMAT_BPP (dst_image->bits.format),
- dest_x, dest_y,
- width, height,
- src);
-}
-
-static void
-fast_composite_src_memcpy (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- int bpp = PIXMAN_FORMAT_BPP (dst_image->bits.format) / 8;
- uint32_t n_bytes = width * bpp;
- int dst_stride, src_stride;
- uint8_t *dst;
- uint8_t *src;
-
- src_stride = src_image->bits.rowstride * 4;
- dst_stride = dst_image->bits.rowstride * 4;
-
- src = (uint8_t *)src_image->bits.bits + src_y * src_stride + src_x * bpp;
- dst = (uint8_t *)dst_image->bits.bits + dest_y * dst_stride + dest_x * bpp;
-
- while (height--)
- {
- memcpy (dst, src, n_bytes);
-
- dst += dst_stride;
- src += src_stride;
- }
-}
-
-FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER);
-FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE);
-FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD);
-FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL);
-FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER);
-FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE);
-FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD);
-FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL);
-FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER);
-FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE);
-FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD);
-FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL);
-FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL);
-FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER);
-FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE);
-FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD);
-FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL);
-
-/* Use more unrolling for src_0565_0565 because it is typically CPU bound */
-static force_inline void
-scaled_nearest_scanline_565_565_SRC (uint16_t * dst,
- uint16_t * src,
- int32_t w,
- pixman_fixed_t vx,
- pixman_fixed_t unit_x,
- pixman_fixed_t max_vx)
-{
- uint16_t tmp1, tmp2, tmp3, tmp4;
- while ((w -= 4) >= 0)
- {
- tmp1 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- tmp2 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- tmp3 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- tmp4 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- *dst++ = tmp1;
- *dst++ = tmp2;
- *dst++ = tmp3;
- *dst++ = tmp4;
- }
- if (w & 2)
- {
- tmp1 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- tmp2 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- *dst++ = tmp1;
- *dst++ = tmp2;
- }
- if (w & 1)
- *dst++ = src[pixman_fixed_to_int (vx)];
-}
-
-FAST_NEAREST_MAINLOOP (565_565_cover_SRC,
- scaled_nearest_scanline_565_565_SRC,
- uint16_t, uint16_t, COVER);
-FAST_NEAREST_MAINLOOP (565_565_none_SRC,
- scaled_nearest_scanline_565_565_SRC,
- uint16_t, uint16_t, NONE);
-FAST_NEAREST_MAINLOOP (565_565_pad_SRC,
- scaled_nearest_scanline_565_565_SRC,
- uint16_t, uint16_t, PAD);
-
-static force_inline uint32_t
-fetch_nearest (pixman_repeat_t src_repeat,
- pixman_format_code_t format,
- uint32_t *src, int x, int src_width)
-{
- if (repeat (src_repeat, &x, src_width))
- {
- if (format == PIXMAN_x8r8g8b8)
- return *(src + x) | 0xff000000;
- else
- return *(src + x);
- }
- else
- {
- return 0;
- }
-}
-
-static force_inline void
-combine_over (uint32_t s, uint32_t *dst)
-{
- if (s)
- {
- uint8_t ia = 0xff - (s >> 24);
-
- if (ia)
- UN8x4_MUL_UN8_ADD_UN8x4 (*dst, ia, s);
- else
- *dst = s;
- }
-}
-
-static force_inline void
-combine_src (uint32_t s, uint32_t *dst)
-{
- *dst = s;
-}
-
-static void
-fast_composite_scaled_nearest (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line;
- uint32_t *src_line;
- int dst_stride, src_stride;
- int src_width, src_height;
- pixman_repeat_t src_repeat;
- pixman_fixed_t unit_x, unit_y;
- pixman_format_code_t src_format;
- pixman_vector_t v;
- pixman_fixed_t vy;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- /* pass in 0 instead of src_x and src_y because src_x and src_y need to be
- * transformed from destination space to source space
- */
- PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, uint32_t, src_stride, src_line, 1);
-
- /* reference point is the center of the pixel */
- v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;
- v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;
- v.vector[2] = pixman_fixed_1;
-
- if (!pixman_transform_point_3d (src_image->common.transform, &v))
- return;
-
- unit_x = src_image->common.transform->matrix[0][0];
- unit_y = src_image->common.transform->matrix[1][1];
-
- /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */
- v.vector[0] -= pixman_fixed_e;
- v.vector[1] -= pixman_fixed_e;
-
- src_height = src_image->bits.height;
- src_width = src_image->bits.width;
- src_repeat = src_image->common.repeat;
- src_format = src_image->bits.format;
-
- vy = v.vector[1];
- while (height--)
- {
- pixman_fixed_t vx = v.vector[0];
- int y = pixman_fixed_to_int (vy);
- uint32_t *dst = dst_line;
-
- dst_line += dst_stride;
-
- /* adjust the y location by a unit vector in the y direction
- * this is equivalent to transforming y+1 of the destination point to source space */
- vy += unit_y;
-
- if (!repeat (src_repeat, &y, src_height))
- {
- if (op == PIXMAN_OP_SRC)
- memset (dst, 0, sizeof (*dst) * width);
- }
- else
- {
- int w = width;
-
- uint32_t *src = src_line + y * src_stride;
-
- while (w >= 2)
- {
- uint32_t s1, s2;
- int x1, x2;
-
- x1 = pixman_fixed_to_int (vx);
- vx += unit_x;
-
- x2 = pixman_fixed_to_int (vx);
- vx += unit_x;
-
- w -= 2;
-
- s1 = fetch_nearest (src_repeat, src_format, src, x1, src_width);
- s2 = fetch_nearest (src_repeat, src_format, src, x2, src_width);
-
- if (op == PIXMAN_OP_OVER)
- {
- combine_over (s1, dst++);
- combine_over (s2, dst++);
- }
- else
- {
- combine_src (s1, dst++);
- combine_src (s2, dst++);
- }
- }
-
- while (w--)
- {
- uint32_t s;
- int x;
-
- x = pixman_fixed_to_int (vx);
- vx += unit_x;
-
- s = fetch_nearest (src_repeat, src_format, src, x, src_width);
-
- if (op == PIXMAN_OP_OVER)
- combine_over (s, dst++);
- else
- combine_src (s, dst++);
- }
- }
- }
-}
-
-static const pixman_fast_path_t c_fast_paths[] =
-{
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, fast_composite_over_n_8_0565),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, fast_composite_over_n_8_0565),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, fast_composite_over_n_8_0888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, fast_composite_over_n_8_0888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, fast_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, fast_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, fast_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, fast_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, fast_composite_over_n_1_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, fast_composite_over_n_1_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, fast_composite_over_n_1_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, fast_composite_over_n_1_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5, fast_composite_over_n_1_0565),
- PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5, fast_composite_over_n_1_0565),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, fast_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, fast_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, fast_composite_over_n_8888_0565_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, fast_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, fast_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, fast_composite_over_n_8888_0565_ca),
- PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, fast_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, fast_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, fast_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, fast_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, fast_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, fast_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, fast_composite_over_8888_0565),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, fast_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, fast_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565),
- PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888),
- PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888),
- PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8),
- PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1000_1000),
- PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, a8r8g8b8, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, fast_composite_src_x888_8888),
- PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, fast_composite_src_x888_8888),
- PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8x8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8a8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
- PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
- PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
- PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
- PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8),
- PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8),
-
- SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, 8888_8888),
-
- SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, 8888_8888),
-
- SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, 8888_565),
- SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, 8888_565),
-
- SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565),
-
- SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, 8888_8888),
-
- SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565),
-
-#define NEAREST_FAST_PATH(op,s,d) \
- { PIXMAN_OP_ ## op, \
- PIXMAN_ ## s, SCALED_NEAREST_FLAGS, \
- PIXMAN_null, 0, \
- PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
- fast_composite_scaled_nearest, \
- }
-
- NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8),
- NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8),
- NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8),
- NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8),
-
- NEAREST_FAST_PATH (SRC, x8r8g8b8, a8r8g8b8),
- NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8),
- NEAREST_FAST_PATH (SRC, x8b8g8r8, a8b8g8r8),
- NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8),
-
- NEAREST_FAST_PATH (OVER, x8r8g8b8, x8r8g8b8),
- NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8),
- NEAREST_FAST_PATH (OVER, x8b8g8r8, x8b8g8r8),
- NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8),
-
- NEAREST_FAST_PATH (OVER, x8r8g8b8, a8r8g8b8),
- NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8),
- NEAREST_FAST_PATH (OVER, x8b8g8r8, a8b8g8r8),
- NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8),
-
- { PIXMAN_OP_NONE },
-};
-
-#ifdef WORDS_BIGENDIAN
-#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (32 - (offs) - (n)))
-#else
-#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (offs))
-#endif
-
-static force_inline void
-pixman_fill1_line (uint32_t *dst, int offs, int width, int v)
-{
- if (offs)
- {
- int leading_pixels = 32 - offs;
- if (leading_pixels >= width)
- {
- if (v)
- *dst |= A1_FILL_MASK (width, offs);
- else
- *dst &= ~A1_FILL_MASK (width, offs);
- return;
- }
- else
- {
- if (v)
- *dst++ |= A1_FILL_MASK (leading_pixels, offs);
- else
- *dst++ &= ~A1_FILL_MASK (leading_pixels, offs);
- width -= leading_pixels;
- }
- }
- while (width >= 32)
- {
- if (v)
- *dst++ = 0xFFFFFFFF;
- else
- *dst++ = 0;
- width -= 32;
- }
- if (width > 0)
- {
- if (v)
- *dst |= A1_FILL_MASK (width, 0);
- else
- *dst &= ~A1_FILL_MASK (width, 0);
- }
-}
-
-static void
-pixman_fill1 (uint32_t *bits,
- int stride,
- int x,
- int y,
- int width,
- int height,
- uint32_t xor)
-{
- uint32_t *dst = bits + y * stride + (x >> 5);
- int offs = x & 31;
-
- if (xor & 1)
- {
- while (height--)
- {
- pixman_fill1_line (dst, offs, width, 1);
- dst += stride;
- }
- }
- else
- {
- while (height--)
- {
- pixman_fill1_line (dst, offs, width, 0);
- dst += stride;
- }
- }
-}
-
-static void
-pixman_fill8 (uint32_t *bits,
- int stride,
- int x,
- int y,
- int width,
- int height,
- uint32_t xor)
-{
- int byte_stride = stride * (int) sizeof (uint32_t);
- uint8_t *dst = (uint8_t *) bits;
- uint8_t v = xor & 0xff;
- int i;
-
- dst = dst + y * byte_stride + x;
-
- while (height--)
- {
- for (i = 0; i < width; ++i)
- dst[i] = v;
-
- dst += byte_stride;
- }
-}
-
-static void
-pixman_fill16 (uint32_t *bits,
- int stride,
- int x,
- int y,
- int width,
- int height,
- uint32_t xor)
-{
- int short_stride =
- (stride * (int)sizeof (uint32_t)) / (int)sizeof (uint16_t);
- uint16_t *dst = (uint16_t *)bits;
- uint16_t v = xor & 0xffff;
- int i;
-
- dst = dst + y * short_stride + x;
-
- while (height--)
- {
- for (i = 0; i < width; ++i)
- dst[i] = v;
-
- dst += short_stride;
- }
-}
-
-/*
- * Fill a width x height rectangle of a 32 bpp buffer, top-left pixel at
- * (x, y), with the full 32-bit value `xor`.  `stride` is the distance
- * between rows in uint32_t words.
- */
-static void
-pixman_fill32 (uint32_t *bits,
-               int       stride,
-               int       x,
-               int       y,
-               int       width,
-               int       height,
-               uint32_t  xor)
-{
-    uint32_t *row = bits + y * stride + x;
-
-    for (; height--; row += stride)
-    {
-        uint32_t *p = row;
-        int       left = width;
-
-        /* Store `width` pixels; a non-positive width writes nothing. */
-        while (left-- > 0)
-            *p++ = xor;
-    }
-}
-
-/*
- * Fill hook of the fast-path implementation.
- *
- * Depths of 1, 8, 16 and 32 bits per pixel are handled by the matching
- * pixman_fillN routine and the function then returns TRUE.  Every other
- * depth is forwarded to imp->delegate through
- * _pixman_implementation_fill, and that call's result is returned as is.
- */
-static pixman_bool_t
-fast_path_fill (pixman_implementation_t *imp,
-                uint32_t *               bits,
-                int                      stride,
-                int                      bpp,
-                int                      x,
-                int                      y,
-                int                      width,
-                int                      height,
-                uint32_t                 xor)
-{
-    if (bpp == 1)
-        pixman_fill1 (bits, stride, x, y, width, height, xor);
-    else if (bpp == 8)
-        pixman_fill8 (bits, stride, x, y, width, height, xor);
-    else if (bpp == 16)
-        pixman_fill16 (bits, stride, x, y, width, height, xor);
-    else if (bpp == 32)
-        pixman_fill32 (bits, stride, x, y, width, height, xor);
-    else
-        /* No local routine for this depth: let the delegate decide. */
-        return _pixman_implementation_fill (
-            imp->delegate, bits, stride, bpp, x, y, width, height, xor);
-
-    return TRUE;
-}
-
-/*
- * Build the fast-path implementation object.
- *
- * The object is created by _pixman_implementation_create from the result
- * of _pixman_implementation_create_general () and the c_fast_paths table
- * (defined elsewhere in this file), and fast_path_fill is installed as
- * its fill hook.
- */
-pixman_implementation_t *
-_pixman_implementation_create_fast_path (void)
-{
-    pixman_implementation_t *fallback;
-    pixman_implementation_t *fast;
-
-    fallback = _pixman_implementation_create_general ();
-    fast = _pixman_implementation_create (fallback, c_fast_paths);
-    fast->fill = fast_path_fill;
-
-    return fast;
-}
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ +/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of SuSE not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. SuSE makes no representations about the + * suitability of this software for any purpose. It is provided "as is" + * without express or implied warranty. + * + * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE + * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Author: Keith Packard, SuSE, Inc. 
+ */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif +#include <string.h> +#include <stdlib.h> +#include "pixman-private.h" +#include "pixman-combine32.h" +#include "pixman-fast-path.h" + +static force_inline uint32_t +fetch_24 (uint8_t *a) +{ + if (((unsigned long)a) & 1) + { +#ifdef WORDS_BIGENDIAN + return (*a << 16) | (*(uint16_t *)(a + 1)); +#else + return *a | (*(uint16_t *)(a + 1) << 8); +#endif + } + else + { +#ifdef WORDS_BIGENDIAN + return (*(uint16_t *)a << 8) | *(a + 2); +#else + return *(uint16_t *)a | (*(a + 2) << 16); +#endif + } +} + +static force_inline void +store_24 (uint8_t *a, + uint32_t v) +{ + if (((unsigned long)a) & 1) + { +#ifdef WORDS_BIGENDIAN + *a = (uint8_t) (v >> 16); + *(uint16_t *)(a + 1) = (uint16_t) (v); +#else + *a = (uint8_t) (v); + *(uint16_t *)(a + 1) = (uint16_t) (v >> 8); +#endif + } + else + { +#ifdef WORDS_BIGENDIAN + *(uint16_t *)a = (uint16_t)(v >> 8); + *(a + 2) = (uint8_t)v; +#else + *(uint16_t *)a = (uint16_t)v; + *(a + 2) = (uint8_t)(v >> 16); +#endif + } +} + +static force_inline uint32_t +over (uint32_t src, + uint32_t dest) +{ + uint32_t a = ~src >> 24; + + UN8x4_MUL_UN8_ADD_UN8x4 (dest, a, src); + + return dest; +} + +static uint32_t +in (uint32_t x, + uint8_t y) +{ + uint16_t a = y; + + UN8x4_MUL_UN8 (x, a); + + return x; +} + +/* + * Naming convention: + * + * op_src_mask_dest + */ +static void +fast_composite_over_x888_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *src, *src_line; + uint32_t *dst, *dst_line; + uint8_t *mask, *mask_line; + int src_stride, mask_stride, dst_stride; + uint8_t m; + uint32_t s, d; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, 
mask_y, uint8_t, mask_stride, mask_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + src = src_line; + src_line += src_stride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + w = width; + while (w--) + { + m = *mask++; + if (m) + { + s = *src | 0xff000000; + + if (m == 0xff) + { + *dst = s; + } + else + { + d = in (s, m); + *dst = over (d, *dst); + } + } + src++; + dst++; + } + } +} + +static void +fast_composite_in_n_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dest_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint8_t *dst_line, *dst; + uint8_t *mask_line, *mask, m; + int dst_stride, mask_stride; + int32_t w; + uint16_t t; + + src = _pixman_image_get_solid (src_image, dest_image->bits.format); + + srca = src >> 24; + + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + if (srca == 0xff) + { + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + m = *mask++; + + if (m == 0) + *dst = 0; + else if (m != 0xff) + *dst = MUL_UN8 (m, *dst, t); + + dst++; + } + } + } + else + { + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + m = *mask++; + m = MUL_UN8 (m, srca, t); + + if (m == 0) + *dst = 0; + else if (m != 0xff) + *dst = MUL_UN8 (m, *dst, t); + + dst++; + } + } + } +} + +static void +fast_composite_in_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + 
pixman_image_t * dest_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + uint8_t s; + uint16_t t; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + + if (s == 0) + *dst = 0; + else if (s != 0xff) + *dst = MUL_UN8 (s, *dst, t); + + dst++; + } + } +} + +static void +fast_composite_over_n_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint32_t *dst_line, *dst, d; + uint8_t *mask_line, *mask, m; + int dst_stride, mask_stride; + int32_t w; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + m = *mask++; + if (m == 0xff) + { + if (srca == 0xff) + *dst = src; + else + *dst = over (src, *dst); + } + else if (m) + { + d = in (src, m); + *dst = over (d, *dst); + } + dst++; + } + } +} + +static void +fast_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + 
pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca, s; + uint32_t *dst_line, *dst, d; + uint32_t *mask_line, *mask, ma; + int dst_stride, mask_stride; + int32_t w; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + ma = *mask++; + + if (ma) + { + d = *dst; + s = src; + + UN8x4_MUL_UN8x4_ADD_UN8x4 (s, ma, d); + + *dst = s; + } + + dst++; + } + } +} + +static void +fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca, s; + uint32_t *dst_line, *dst, d; + uint32_t *mask_line, *mask, ma; + int dst_stride, mask_stride; + int32_t w; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + ma = *mask++; + if (ma == 0xffffffff) + { + if (srca == 0xff) + *dst = src; + else + *dst = over (src, *dst); + } + else if (ma) + { + d = *dst; + s = src; + + UN8x4_MUL_UN8x4 (s, ma); 
+ UN8x4_MUL_UN8 (ma, srca); + ma = ~ma; + UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s); + + *dst = d; + } + + dst++; + } + } +} + +static void +fast_composite_over_n_8_0888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint8_t *dst_line, *dst; + uint32_t d; + uint8_t *mask_line, *mask, m; + int dst_stride, mask_stride; + int32_t w; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + m = *mask++; + if (m == 0xff) + { + if (srca == 0xff) + { + d = src; + } + else + { + d = fetch_24 (dst); + d = over (src, d); + } + store_24 (dst, d); + } + else if (m) + { + d = over (in (src, m), fetch_24 (dst)); + store_24 (dst, d); + } + dst += 3; + } + } +} + +static void +fast_composite_over_n_8_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint16_t *dst_line, *dst; + uint32_t d; + uint8_t *mask_line, *mask, m; + int dst_stride, mask_stride; + int32_t w; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + 
PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + m = *mask++; + if (m == 0xff) + { + if (srca == 0xff) + { + d = src; + } + else + { + d = *dst; + d = over (src, CONVERT_0565_TO_0888 (d)); + } + *dst = CONVERT_8888_TO_0565 (d); + } + else if (m) + { + d = *dst; + d = over (in (src, m), CONVERT_0565_TO_0888 (d)); + *dst = CONVERT_8888_TO_0565 (d); + } + dst++; + } + } +} + +static void +fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca, s; + uint16_t src16; + uint16_t *dst_line, *dst; + uint32_t d; + uint32_t *mask_line, *mask, ma; + int dst_stride, mask_stride; + int32_t w; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + src16 = CONVERT_8888_TO_0565 (src); + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + ma = *mask++; + if (ma == 0xffffffff) + { + if (srca == 0xff) + { + *dst = src16; + } + else + { + d = *dst; + d = over (src, CONVERT_0565_TO_0888 (d)); + *dst = CONVERT_8888_TO_0565 (d); + } + } + else if (ma) + { + d = *dst; + d = CONVERT_0565_TO_0888 (d); + + s = src; + + UN8x4_MUL_UN8x4 (s, ma); + UN8x4_MUL_UN8 (ma, srca); + ma = ~ma; + UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s); + + *dst = CONVERT_8888_TO_0565 (d); + } + dst++; + } + } +} + 
+static void +fast_composite_over_8888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + uint8_t a; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + a = s >> 24; + if (a == 0xff) + *dst = s; + else if (s) + *dst = over (s, *dst); + dst++; + } + } +} + +static void +fast_composite_src_x888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + *dst++ = (*src++) | 0xff000000; + } +} + +#if 0 +static void +fast_composite_over_8888_0888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t 
height) +{ + uint8_t *dst_line, *dst; + uint32_t d; + uint32_t *src_line, *src, s; + uint8_t a; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + a = s >> 24; + if (a) + { + if (a == 0xff) + d = s; + else + d = over (s, fetch_24 (dst)); + + store_24 (dst, d); + } + dst += 3; + } + } +} +#endif + +static void +fast_composite_over_8888_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint16_t *dst_line, *dst; + uint32_t d; + uint32_t *src_line, *src, s; + uint8_t a; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + a = s >> 24; + if (s) + { + if (a == 0xff) + { + d = s; + } + else + { + d = *dst; + d = over (s, CONVERT_0565_TO_0888 (d)); + } + *dst = CONVERT_8888_TO_0565 (d); + } + dst++; + } + } +} + +static void +fast_composite_src_x888_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint16_t *dst_line, *dst; + uint32_t *src_line, *src, s; + 
int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + *dst = CONVERT_8888_TO_0565 (s); + dst++; + } + } +} + +static void +fast_composite_add_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + uint8_t s, d; + uint16_t t; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + if (s) + { + if (s != 0xff) + { + d = *dst; + t = d + s; + s = t | (0 - (t >> 8)); + } + *dst = s; + } + dst++; + } + } +} + +static void +fast_composite_add_8888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + uint32_t s, d; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = 
dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + if (s) + { + if (s != 0xffffffff) + { + d = *dst; + if (d) + UN8x4_ADD_UN8x4 (s, d); + } + *dst = s; + } + dst++; + } + } +} + +static void +fast_composite_add_n_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + uint32_t src; + uint8_t sa; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + sa = (src >> 24); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + uint16_t tmp; + uint16_t a; + uint32_t m, d; + uint32_t r; + + a = *mask++; + d = *dst; + + m = MUL_UN8 (sa, a, tmp); + r = ADD_UN8 (m, d, tmp); + + *dst++ = r; + } + } +} + +#ifdef WORDS_BIGENDIAN +#define CREATE_BITMASK(n) (0x80000000 >> (n)) +#define UPDATE_BITMASK(n) ((n) >> 1) +#else +#define CREATE_BITMASK(n) (1 << (n)) +#define UPDATE_BITMASK(n) ((n) << 1) +#endif + +#define TEST_BIT(p, n) \ + (*((p) + ((n) >> 5)) & CREATE_BITMASK ((n) & 31)) +#define SET_BIT(p, n) \ + do { *((p) + ((n) >> 5)) |= CREATE_BITMASK ((n) & 31); } while (0); + +static void +fast_composite_add_1000_1000 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t 
height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (src_image, 0, src_y, uint32_t, + src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, 0, dest_y, uint32_t, + dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + /* + * TODO: improve performance by processing uint32_t data instead + * of individual bits + */ + if (TEST_BIT (src, src_x + w)) + SET_BIT (dst, dest_x + w); + } + } +} + +static void +fast_composite_over_n_1_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint32_t *dst, *dst_line; + uint32_t *mask, *mask_line; + int mask_stride, dst_stride; + uint32_t bitcache, bitmask; + int32_t w; + + if (width <= 0) + return; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, + dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t, + mask_stride, mask_line, 1); + mask_line += mask_x >> 5; + + if (srca == 0xff) + { + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + bitcache = *mask++; + bitmask = CREATE_BITMASK (mask_x & 31); + + while (w--) + { + if (bitmask == 0) + { + bitcache = *mask++; + bitmask = CREATE_BITMASK (0); + } + if (bitcache & bitmask) + *dst = src; + bitmask = UPDATE_BITMASK (bitmask); + dst++; + } + } + } + else + { + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + 
+ bitcache = *mask++; + bitmask = CREATE_BITMASK (mask_x & 31); + + while (w--) + { + if (bitmask == 0) + { + bitcache = *mask++; + bitmask = CREATE_BITMASK (0); + } + if (bitcache & bitmask) + *dst = over (src, *dst); + bitmask = UPDATE_BITMASK (bitmask); + dst++; + } + } + } +} + +static void +fast_composite_over_n_1_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint16_t *dst, *dst_line; + uint32_t *mask, *mask_line; + int mask_stride, dst_stride; + uint32_t bitcache, bitmask; + int32_t w; + uint32_t d; + uint16_t src565; + + if (width <= 0) + return; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, + dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t, + mask_stride, mask_line, 1); + mask_line += mask_x >> 5; + + if (srca == 0xff) + { + src565 = CONVERT_8888_TO_0565 (src); + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + bitcache = *mask++; + bitmask = CREATE_BITMASK (mask_x & 31); + + while (w--) + { + if (bitmask == 0) + { + bitcache = *mask++; + bitmask = CREATE_BITMASK (0); + } + if (bitcache & bitmask) + *dst = src565; + bitmask = UPDATE_BITMASK (bitmask); + dst++; + } + } + } + else + { + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + bitcache = *mask++; + bitmask = CREATE_BITMASK (mask_x & 31); + + while (w--) + { + if (bitmask == 0) + { + bitcache = *mask++; + bitmask = CREATE_BITMASK (0); + } + if (bitcache & bitmask) + { + d = over (src, CONVERT_0565_TO_0888 (*dst)); 
+ *dst = CONVERT_8888_TO_0565 (d); + } + bitmask = UPDATE_BITMASK (bitmask); + dst++; + } + } + } +} + +/* + * Simple bitblt + */ + +static void +fast_composite_solid_fill (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + if (dst_image->bits.format == PIXMAN_a1) + { + src = src >> 31; + } + else if (dst_image->bits.format == PIXMAN_a8) + { + src = src >> 24; + } + else if (dst_image->bits.format == PIXMAN_r5g6b5 || + dst_image->bits.format == PIXMAN_b5g6r5) + { + src = CONVERT_8888_TO_0565 (src); + } + + pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride, + PIXMAN_FORMAT_BPP (dst_image->bits.format), + dest_x, dest_y, + width, height, + src); +} + +static void +fast_composite_src_memcpy (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + int bpp = PIXMAN_FORMAT_BPP (dst_image->bits.format) / 8; + uint32_t n_bytes = width * bpp; + int dst_stride, src_stride; + uint8_t *dst; + uint8_t *src; + + src_stride = src_image->bits.rowstride * 4; + dst_stride = dst_image->bits.rowstride * 4; + + src = (uint8_t *)src_image->bits.bits + src_y * src_stride + src_x * bpp; + dst = (uint8_t *)dst_image->bits.bits + dest_y * dst_stride + dest_x * bpp; + + while (height--) + { + memcpy (dst, src, n_bytes); + + dst += dst_stride; + src += src_stride; + } +} + +FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER) +FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE) +FAST_NEAREST 
(8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD) +FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL) +FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER) +FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE) +FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD) +FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL) +FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER) +FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE) +FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD) +FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL) +FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL) +FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER) +FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE) +FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD) +FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL) + +/* Use more unrolling for src_0565_0565 because it is typically CPU bound */ +static force_inline void +scaled_nearest_scanline_565_565_SRC (uint16_t * dst, + uint16_t * src, + int32_t w, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx) +{ + uint16_t tmp1, tmp2, tmp3, tmp4; + while ((w -= 4) >= 0) + { + tmp1 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp2 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp3 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp4 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + *dst++ = tmp1; + *dst++ = tmp2; + *dst++ = tmp3; + *dst++ = tmp4; + } + if (w & 2) + { + tmp1 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp2 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + *dst++ = tmp1; + *dst++ = tmp2; + } + if (w & 1) + *dst++ = src[pixman_fixed_to_int (vx)]; +} + +FAST_NEAREST_MAINLOOP 
(565_565_cover_SRC, + scaled_nearest_scanline_565_565_SRC, + uint16_t, uint16_t, COVER) +FAST_NEAREST_MAINLOOP (565_565_none_SRC, + scaled_nearest_scanline_565_565_SRC, + uint16_t, uint16_t, NONE) +FAST_NEAREST_MAINLOOP (565_565_pad_SRC, + scaled_nearest_scanline_565_565_SRC, + uint16_t, uint16_t, PAD) + +static force_inline uint32_t +fetch_nearest (pixman_repeat_t src_repeat, + pixman_format_code_t format, + uint32_t *src, int x, int src_width) +{ + if (repeat (src_repeat, &x, src_width)) + { + if (format == PIXMAN_x8r8g8b8) + return *(src + x) | 0xff000000; + else + return *(src + x); + } + else + { + return 0; + } +} + +static force_inline void +combine_over (uint32_t s, uint32_t *dst) +{ + if (s) + { + uint8_t ia = 0xff - (s >> 24); + + if (ia) + UN8x4_MUL_UN8_ADD_UN8x4 (*dst, ia, s); + else + *dst = s; + } +} + +static force_inline void +combine_src (uint32_t s, uint32_t *dst) +{ + *dst = s; +} + +static void +fast_composite_scaled_nearest (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line; + uint32_t *src_line; + int dst_stride, src_stride; + int src_width, src_height; + pixman_repeat_t src_repeat; + pixman_fixed_t unit_x, unit_y; + pixman_format_code_t src_format; + pixman_vector_t v; + pixman_fixed_t vy; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + /* pass in 0 instead of src_x and src_y because src_x and src_y need to be + * transformed from destination space to source space + */ + PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, uint32_t, src_stride, src_line, 1); + + /* reference point is the center of the pixel */ + v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2; + v.vector[2] = 
pixman_fixed_1; + + if (!pixman_transform_point_3d (src_image->common.transform, &v)) + return; + + unit_x = src_image->common.transform->matrix[0][0]; + unit_y = src_image->common.transform->matrix[1][1]; + + /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */ + v.vector[0] -= pixman_fixed_e; + v.vector[1] -= pixman_fixed_e; + + src_height = src_image->bits.height; + src_width = src_image->bits.width; + src_repeat = src_image->common.repeat; + src_format = src_image->bits.format; + + vy = v.vector[1]; + while (height--) + { + pixman_fixed_t vx = v.vector[0]; + int y = pixman_fixed_to_int (vy); + uint32_t *dst = dst_line; + + dst_line += dst_stride; + + /* adjust the y location by a unit vector in the y direction + * this is equivalent to transforming y+1 of the destination point to source space */ + vy += unit_y; + + if (!repeat (src_repeat, &y, src_height)) + { + if (op == PIXMAN_OP_SRC) + memset (dst, 0, sizeof (*dst) * width); + } + else + { + int w = width; + + uint32_t *src = src_line + y * src_stride; + + while (w >= 2) + { + uint32_t s1, s2; + int x1, x2; + + x1 = pixman_fixed_to_int (vx); + vx += unit_x; + + x2 = pixman_fixed_to_int (vx); + vx += unit_x; + + w -= 2; + + s1 = fetch_nearest (src_repeat, src_format, src, x1, src_width); + s2 = fetch_nearest (src_repeat, src_format, src, x2, src_width); + + if (op == PIXMAN_OP_OVER) + { + combine_over (s1, dst++); + combine_over (s2, dst++); + } + else + { + combine_src (s1, dst++); + combine_src (s2, dst++); + } + } + + while (w--) + { + uint32_t s; + int x; + + x = pixman_fixed_to_int (vx); + vx += unit_x; + + s = fetch_nearest (src_repeat, src_format, src, x, src_width); + + if (op == PIXMAN_OP_OVER) + combine_over (s, dst++); + else + combine_src (s, dst++); + } + } + } +} + +static const pixman_fast_path_t c_fast_paths[] = +{ + PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, fast_composite_over_n_8_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, fast_composite_over_n_8_0565), 
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, fast_composite_over_n_8_0888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, fast_composite_over_n_8_0888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, fast_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, fast_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, fast_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, fast_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, fast_composite_over_n_1_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, fast_composite_over_n_1_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, fast_composite_over_n_1_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, fast_composite_over_n_1_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5, fast_composite_over_n_1_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5, fast_composite_over_n_1_0565), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, fast_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, fast_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, fast_composite_over_n_8888_0565_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, fast_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, fast_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, fast_composite_over_n_8888_0565_ca), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, fast_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, fast_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, fast_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, fast_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, fast_composite_over_8888_8888), + 
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, fast_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, fast_composite_over_8888_0565), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, fast_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, fast_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565), + PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888), + PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888), + PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8), + PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1000_1000), + PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8), + PIXMAN_STD_FAST_PATH (SRC, solid, null, a8r8g8b8, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, fast_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, fast_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, 
a8b8g8r8, null, a8b8g8r8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8x8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8a8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565), + PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8), + PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8), + + SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, 8888_8888), + + SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, 8888_8888), + + SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, 8888_565), + SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, 8888_565), + + SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565), + + 
SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, 8888_8888), + + SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565), + +#define NEAREST_FAST_PATH(op,s,d) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, SCALED_NEAREST_FLAGS, \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest, \ + } + + NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8), + NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8), + NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8), + NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8), + + NEAREST_FAST_PATH (SRC, x8r8g8b8, a8r8g8b8), + NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8), + NEAREST_FAST_PATH (SRC, x8b8g8r8, a8b8g8r8), + NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8), + + NEAREST_FAST_PATH (OVER, x8r8g8b8, x8r8g8b8), + NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8), + NEAREST_FAST_PATH (OVER, x8b8g8r8, x8b8g8r8), + NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8), + + NEAREST_FAST_PATH (OVER, x8r8g8b8, a8r8g8b8), + NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8), + NEAREST_FAST_PATH (OVER, x8b8g8r8, a8b8g8r8), + NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8), + + { PIXMAN_OP_NONE }, +}; + +#ifdef WORDS_BIGENDIAN +#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (32 - (offs) - (n))) +#else +#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (offs)) +#endif + +static force_inline void +pixman_fill1_line (uint32_t *dst, int offs, int width, int v) +{ + if (offs) + { + int leading_pixels = 32 - offs; + if (leading_pixels >= width) + { + if (v) + *dst |= A1_FILL_MASK (width, offs); + else + *dst &= ~A1_FILL_MASK (width, offs); + return; + } + else + { + if (v) + *dst++ |= A1_FILL_MASK (leading_pixels, offs); + else + *dst++ &= ~A1_FILL_MASK (leading_pixels, offs); + width -= leading_pixels; + } + } + while (width >= 32) + { + if (v) + *dst++ = 
0xFFFFFFFF; + else + *dst++ = 0; + width -= 32; + } + if (width > 0) + { + if (v) + *dst |= A1_FILL_MASK (width, 0); + else + *dst &= ~A1_FILL_MASK (width, 0); + } +} + +static void +pixman_fill1 (uint32_t *bits, + int stride, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + uint32_t *dst = bits + y * stride + (x >> 5); + int offs = x & 31; + + if (xor & 1) + { + while (height--) + { + pixman_fill1_line (dst, offs, width, 1); + dst += stride; + } + } + else + { + while (height--) + { + pixman_fill1_line (dst, offs, width, 0); + dst += stride; + } + } +} + +static void +pixman_fill8 (uint32_t *bits, + int stride, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + int byte_stride = stride * (int) sizeof (uint32_t); + uint8_t *dst = (uint8_t *) bits; + uint8_t v = xor & 0xff; + int i; + + dst = dst + y * byte_stride + x; + + while (height--) + { + for (i = 0; i < width; ++i) + dst[i] = v; + + dst += byte_stride; + } +} + +static void +pixman_fill16 (uint32_t *bits, + int stride, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + int short_stride = + (stride * (int)sizeof (uint32_t)) / (int)sizeof (uint16_t); + uint16_t *dst = (uint16_t *)bits; + uint16_t v = xor & 0xffff; + int i; + + dst = dst + y * short_stride + x; + + while (height--) + { + for (i = 0; i < width; ++i) + dst[i] = v; + + dst += short_stride; + } +} + +static void +pixman_fill32 (uint32_t *bits, + int stride, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + int i; + + bits = bits + y * stride + x; + + while (height--) + { + for (i = 0; i < width; ++i) + bits[i] = xor; + + bits += stride; + } +} + +static pixman_bool_t +fast_path_fill (pixman_implementation_t *imp, + uint32_t * bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + switch (bpp) + { + case 1: + pixman_fill1 (bits, stride, x, y, width, height, xor); + break; + + case 8: + pixman_fill8 (bits, stride, x, y, width, height, xor); + 
break; + + case 16: + pixman_fill16 (bits, stride, x, y, width, height, xor); + break; + + case 32: + pixman_fill32 (bits, stride, x, y, width, height, xor); + break; + + default: + return _pixman_implementation_fill ( + imp->delegate, bits, stride, bpp, x, y, width, height, xor); + break; + } + + return TRUE; +} + +pixman_implementation_t * +_pixman_implementation_create_fast_path (void) +{ + pixman_implementation_t *general = _pixman_implementation_create_general (); + pixman_implementation_t *imp = _pixman_implementation_create (general, c_fast_paths); + + imp->fill = fast_path_fill; + + return imp; +} diff --git a/pixman/pixman/pixman-fast-path.h b/pixman/pixman/pixman-fast-path.h index 0273cd33b..b46937a28 100644 --- a/pixman/pixman/pixman-fast-path.h +++ b/pixman/pixman/pixman-fast-path.h @@ -1,445 +1,449 @@ -/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
-/*
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007 Red Hat, Inc.
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of SuSE not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission. SuSE makes no representations about the
- * suitability of this software for any purpose. It is provided "as is"
- * without express or implied warranty.
- *
- * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
- * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
- * Author: Keith Packard, SuSE, Inc.
- */
-
-#ifndef PIXMAN_FAST_PATH_H__
-#define PIXMAN_FAST_PATH_H__
-
-#include "pixman-private.h"
-
-#define PIXMAN_REPEAT_COVER -1
-/*
- * Apply the repeat mode 'repeat' to the coordinate *c on an axis that holds
- * 'size' samples.
- *
- * repeat - PIXMAN_REPEAT_NONE, PIXMAN_REPEAT_NORMAL or PIXMAN_REPEAT_PAD;
- *          any other value is handled by the final (REFLECT) branch
- * c      - in/out coordinate; rewritten in place for NORMAL, PAD and REFLECT,
- *          never modified for NONE
- * size   - number of samples along the axis
- *
- * Returns FALSE only when repeat is PIXMAN_REPEAT_NONE and *c lies outside
- * [0, size); returns TRUE in every other case.
- */
-static force_inline pixman_bool_t
-repeat (pixman_repeat_t repeat, int *c, int size)
-{
- if (repeat == PIXMAN_REPEAT_NONE)
- {
- if (*c < 0 || *c >= size) /* no sample exists outside [0, size) */
- return FALSE;
- }
- else if (repeat == PIXMAN_REPEAT_NORMAL)
- { /* tile: shift by whole periods until 0 <= *c < size; NOTE(review): both loops terminate only when size > 0 */
- while (*c >= size)
- *c -= size;
- while (*c < 0)
- *c += size;
- }
- else if (repeat == PIXMAN_REPEAT_PAD)
- {
- *c = CLIP (*c, 0, size - 1); /* presumably clamps *c to [0, size - 1]; CLIP is defined outside this view */
- }
- else /* REFLECT */
- {
- *c = MOD (*c, size * 2); /* presumably a non-negative remainder; MOD is defined outside this view */
- if (*c >= size)
- *c = size * 2 - *c - 1; /* mirror [size, 2 * size) back onto [0, size) */
- }
- return TRUE;
-}
-
-/*
- * For each scanline fetched from source image with PAD repeat:
- * - calculate how many pixels need to be padded on the left side
- * - calculate how many pixels need to be padded on the right side
- * - update width to only count pixels which are fetched from the image
- * All this information is returned via 'width', 'left_pad', 'right_pad'
- * arguments. The code is assuming that 'unit_x' is positive.
- *
- * source_image_width - width of the source image; shifted left by 16 to
- *                      form the right-hand bound that vx is measured against
- * vx                 - source x position of the first pixel of the scanline,
- *                      in the same scale as that bound
- * unit_x             - amount added to vx per destination pixel (used as a
- *                      divisor here, so it must not be zero)
- * width              - in: number of pixels in the scanline;
- *                      out: number of pixels fetched from the image
- * left_pad           - out: number of pixels whose position is negative
- * right_pad          - out: number of pixels at or beyond the right bound
- *
- * On return *left_pad + *width + *right_pad equals the value *width had on
- * entry.
- *
- * Note: 64-bit math is used in order to avoid potential overflows, which
- * is probably excessive in many cases. This particular function
- * may need its own correctness test and performance tuning.
- */
-static force_inline void
-pad_repeat_get_scanline_bounds (int32_t source_image_width,
- pixman_fixed_t vx,
- pixman_fixed_t unit_x,
- int32_t * width,
- int32_t * left_pad,
- int32_t * right_pad)
-{
- int64_t max_vx = (int64_t) source_image_width << 16; /* right-hand bound, in the scale of vx */
- int64_t tmp;
- if (vx < 0)
- {
- tmp = ((int64_t) unit_x - 1 - vx) / unit_x; /* ceil (-vx / unit_x): steps needed before vx stops being negative */
- if (tmp > *width)
- { /* the whole scanline lies left of the image */
- *left_pad = *width;
- *width = 0;
- }
- else
- {
- *left_pad = (int32_t) tmp;
- *width -= (int32_t) tmp;
- }
- }
- else
- {
- *left_pad = 0;
- }
- tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad; /* steps until vx reaches max_vx, not counting the left padding */
- if (tmp < 0)
- { /* nothing lands inside the image: all remaining pixels are right padding */
- *right_pad = *width;
- *width = 0;
- }
- else if (tmp >= *width)
- { /* the scanline ends before the right bound */
- *right_pad = 0;
- }
- else
- {
- *right_pad = *width - (int32_t) tmp;
- *width = (int32_t) tmp;
- }
-}
-
-/* A macroified version of specialized nearest scalers for some
- * common 8888 and 565 formats. It supports SRC and OVER ops.
- *
- * There are two repeat versions, one that handles repeat normal,
- * and one without repeat handling that only works if the src region
- * used is completely covered by the pre-repeated source samples.
- *
- * The loops are unrolled to process two pixels per iteration for better
- * performance on most CPU architectures (superscalar processors
- * can issue several operations simultaneously, other processors can hide
- * instruction latencies by pipelining operations). Unrolling more
- * does not make much sense because the compiler will start running out
- * of spare registers soon.
- *
- * FAST_NEAREST_SCANLINE defines a static force_inline function named
- * 'scanline_func_name' that writes 'w' destination pixels. Macro arguments:
- *   SRC_FORMAT, DST_FORMAT - format suffixes pasted into the names of the
- *                            GET_<format>_ALPHA and CONVERT_<a>_TO_<b> macros
- *   src_type_t, dst_type_t - element types of the 'src' and 'dst' pointers
- *   OP                     - SRC or OVER (pasted onto PIXMAN_OP_); any other
- *                            operator makes the generated function abort ()
- *   repeat_mode            - pasted onto PIXMAN_REPEAT_; only NORMAL changes
- *                            the generated code, by wrapping vx back below
- *                            max_vx after every step
- * For each pixel the generated function reads src[vx >> 16] and then
- * advances vx by unit_x.
- */
-
-#define GET_8888_ALPHA(s) ((s) >> 24)
- /* This is not actually used since we don't have an OVER with
- 565 source, but it is needed to build. */
-#define GET_0565_ALPHA(s) 0xff
-
-#define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT, \
- src_type_t, dst_type_t, OP, repeat_mode) \
-static force_inline void \
-scanline_func_name (dst_type_t *dst, \
- src_type_t *src, \
- int32_t w, \
- pixman_fixed_t vx, \
- pixman_fixed_t unit_x, \
- pixman_fixed_t max_vx) \
-{ \
- uint32_t d; \
- src_type_t s1, s2; \
- uint8_t a1, a2; \
- int x1, x2; \
- /* Only SRC and OVER are implemented; any other operator aborts at run time. */ \
- if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER) \
- abort(); \
- /* Main loop: two destination pixels per iteration. */ \
- while ((w -= 2) >= 0) \
- { \
- x1 = vx >> 16; \
- vx += unit_x; \
- if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
- { \
- /* This works because we know that unit_x is positive */ \
- while (vx >= max_vx) \
- vx -= max_vx; \
- } \
- s1 = src[x1]; \
- \
- x2 = vx >> 16; \
- vx += unit_x; \
- if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
- { \
- /* This works because we know that unit_x is positive */ \
- while (vx >= max_vx) \
- vx -= max_vx; \
- } \
- s2 = src[x2]; \
- \
- if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \
- { \
- a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \
- a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2); \
- \
- if (a1 == 0xff) /* opaque source: plain copy, destination is not read */ \
- { \
- *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \
- } \
- else if (s1) /* an all-zero source pixel leaves the destination untouched */ \
- { \
- d = CONVERT_ ## DST_FORMAT ## _TO_8888 (*dst); \
- s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \
- a1 ^= 0xff; /* a1 is now 255 - alpha */ \
- UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \
- *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \
- } \
- dst++; \
- \
- if (a2 == 0xff) \
- { \
- *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \
- } \
- else if (s2) \
- { \
- d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \
- s2 = CONVERT_## SRC_FORMAT ## _TO_8888 (s2); \
- a2 ^= 0xff; \
- UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2); \
- *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \
- } \
- dst++; \
- } \
- else /* PIXMAN_OP_SRC */ \
- { \
- *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \
- *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \
- } \
- } \
- /* For a non-negative initial w the loop exits with w == -1 when one pixel is left and w == -2 when none is, so bit 0 selects the tail. */ \
- if (w & 1) \
- { \
- x1 = vx >> 16; \
- s1 = src[x1]; \
- \
- if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \
- { \
- a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \
- \
- if (a1 == 0xff) \
- { \
- *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \
- } \
- else if (s1) \
- { \
- d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \
- s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \
- a1 ^= 0xff; \
- UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \
- *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \
- } \
- dst++; \
- } \
- else /* PIXMAN_OP_SRC */ \
- { \
- *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \
- } \
- } \
-}
-
-#define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func, src_type_t, dst_type_t, \
- repeat_mode) \
-static void \
-fast_composite_scaled_nearest_ ## scale_func_name (pixman_implementation_t *imp, \
- pixman_op_t op, \
- pixman_image_t * src_image, \
- pixman_image_t * mask_image, \
- pixman_image_t * dst_image, \
- int32_t src_x, \
- int32_t src_y, \
- int32_t mask_x, \
- int32_t mask_y, \
- int32_t dst_x, \
- int32_t dst_y, \
- int32_t width, \
- int32_t height) \
-{ \
- dst_type_t *dst_line; \
- src_type_t *src_first_line; \
- int y; \
- pixman_fixed_t max_vx = 0; /* suppress uninitialized variable warning */ \
- pixman_fixed_t max_vy; \
- pixman_vector_t v; \
- pixman_fixed_t vx, vy; \
- pixman_fixed_t unit_x, unit_y; \
- int32_t left_pad, right_pad; \
- \
- src_type_t *src; \
- dst_type_t *dst; \
- int src_stride, dst_stride; \
- \
- PIXMAN_IMAGE_GET_LINE (dst_image, dst_x, dst_y, dst_type_t, dst_stride, dst_line, 1); \
- /* pass in 0 instead of src_x and src_y because src_x and src_y need to be \
- * transformed from destination space to source space */ \
- PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1); \
- \
- /* reference point is the center of the pixel */ \
- v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2; \
- v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2; \
- v.vector[2] = pixman_fixed_1; \
- \
- if (!pixman_transform_point_3d (src_image->common.transform, &v)) \
- return; \
- \
- unit_x = src_image->common.transform->matrix[0][0]; \
- unit_y = src_image->common.transform->matrix[1][1]; \
- \
- /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */ \
- v.vector[0] -= pixman_fixed_e; \
- v.vector[1] -= pixman_fixed_e; \
- \
- vx = v.vector[0]; \
- vy = v.vector[1]; \
- \
- if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
- { \
- /* Clamp repeating positions inside the actual samples */ \
- max_vx = src_image->bits.width << 16; \
- max_vy = src_image->bits.height << 16; \
- \
- repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx); \
- repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \
- } \
- \
- if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD || \
- PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \
- { \
- pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x, \
- &width, &left_pad, &right_pad); \
- vx += left_pad * unit_x; \
- } \
- \
- while (--height >= 0) \
- { \
- dst = dst_line; \
- dst_line += dst_stride; \
- \
- y = vy >> 16; \
- vy += unit_y; \
- if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
- repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \
- if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD) \
- { \
- repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height); \
- src = src_first_line + src_stride * y; \
- if (left_pad > 0) \
- { \
- scanline_func (dst, src, left_pad, 0, 0, 0); \
- } \
- if (width > 0) \
- { \
- scanline_func (dst + left_pad, src, width, vx, unit_x, 0); \
- } \
- if (right_pad > 0) \
- { \
- scanline_func (dst + left_pad + width, src + src_image->bits.width - 1, \
- right_pad, 0, 0, 0); \
- } \
- } \
- else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \
- { \
- static src_type_t zero = 0; \
- if (y < 0 || y >= src_image->bits.height) \
- { \
- scanline_func (dst, &zero, left_pad + width + right_pad, 0, 0, 0); \
- continue; \
- } \
- src = src_first_line + src_stride * y; \
- if (left_pad > 0) \
- { \
- scanline_func (dst, &zero, left_pad, 0, 0, 0); \
- } \
- if (width > 0) \
- { \
- scanline_func (dst + left_pad, src, width, vx, unit_x, 0); \
- } \
- if (right_pad > 0) \
- { \
- scanline_func (dst + left_pad + width, &zero, right_pad, 0, 0, 0); \
- } \
- } \
- else \
- { \
- src = src_first_line + src_stride * y; \
- scanline_func (dst, src, width, vx, unit_x, max_vx); \
- } \
- } \
-}
-
-#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT, \
- src_type_t, dst_type_t, OP, repeat_mode) \
- FAST_NEAREST_SCANLINE(scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \
- SRC_FORMAT, DST_FORMAT, src_type_t, dst_type_t, \
- OP, repeat_mode) \
- FAST_NEAREST_MAINLOOP(scale_func_name##_##OP, \
- scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \
- src_type_t, dst_type_t, repeat_mode) \
- \
- extern int no_such_variable
-
-
-#define SCALED_NEAREST_FLAGS \
- (FAST_PATH_SCALE_TRANSFORM | \
- FAST_PATH_NO_ALPHA_MAP | \
- FAST_PATH_NEAREST_FILTER | \
- FAST_PATH_NO_ACCESSORS | \
- FAST_PATH_NARROW_FORMAT)
-
-#define SIMPLE_NEAREST_FAST_PATH_NORMAL(op,s,d,func) \
- { PIXMAN_OP_ ## op, \
- PIXMAN_ ## s, \
- (SCALED_NEAREST_FLAGS | \
- FAST_PATH_NORMAL_REPEAT | \
- FAST_PATH_X_UNIT_POSITIVE), \
- PIXMAN_null, 0, \
- PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
- fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op, \
- }
-
-#define SIMPLE_NEAREST_FAST_PATH_PAD(op,s,d,func) \
- { PIXMAN_OP_ ## op, \
- PIXMAN_ ## s, \
- (SCALED_NEAREST_FLAGS | \
- FAST_PATH_PAD_REPEAT | \
- FAST_PATH_X_UNIT_POSITIVE), \
- PIXMAN_null, 0, \
- PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
- fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op, \
- }
-
-#define SIMPLE_NEAREST_FAST_PATH_NONE(op,s,d,func) \
- { PIXMAN_OP_ ## op, \
- PIXMAN_ ## s, \
- (SCALED_NEAREST_FLAGS | \
- FAST_PATH_NONE_REPEAT | \
- FAST_PATH_X_UNIT_POSITIVE), \
- PIXMAN_null, 0, \
- PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
- fast_composite_scaled_nearest_ ## func ## _none ## _ ## op, \
- }
-
-#define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func) \
- { PIXMAN_OP_ ## op, \
- PIXMAN_ ## s, \
- SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \
- PIXMAN_null, 0, \
- PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
- fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op, \
- }
-
-/* Prefer the use of 'cover' variant, because it is faster */
-#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func) \
- SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func), \
- SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func), \
- SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func), \
- SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
-
-#endif
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ +/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of SuSE not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. SuSE makes no representations about the + * suitability of this software for any purpose. It is provided "as is" + * without express or implied warranty. + * + * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE + * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Author: Keith Packard, SuSE, Inc. 
+ */ + +#ifndef PIXMAN_FAST_PATH_H__ +#define PIXMAN_FAST_PATH_H__ + +#include "pixman-private.h" + +#define PIXMAN_REPEAT_COVER -1 + +static force_inline pixman_bool_t +repeat (pixman_repeat_t repeat, int *c, int size) +{ + if (repeat == PIXMAN_REPEAT_NONE) + { + if (*c < 0 || *c >= size) + return FALSE; + } + else if (repeat == PIXMAN_REPEAT_NORMAL) + { + while (*c >= size) + *c -= size; + while (*c < 0) + *c += size; + } + else if (repeat == PIXMAN_REPEAT_PAD) + { + *c = CLIP (*c, 0, size - 1); + } + else /* REFLECT */ + { + *c = MOD (*c, size * 2); + if (*c >= size) + *c = size * 2 - *c - 1; + } + return TRUE; +} + +/* + * For each scanline fetched from source image with PAD repeat: + * - calculate how many pixels need to be padded on the left side + * - calculate how many pixels need to be padded on the right side + * - update width to only count pixels which are fetched from the image + * All this information is returned via 'width', 'left_pad', 'right_pad' + * arguments. The code is assuming that 'unit_x' is positive. + * + * Note: 64-bit math is used in order to avoid potential overflows, which + * is probably excessive in many cases. This particular function + * may need its own correctness test and performance tuning. 
+ */ +static force_inline void +pad_repeat_get_scanline_bounds (int32_t source_image_width, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + int32_t * width, + int32_t * left_pad, + int32_t * right_pad) +{ + int64_t max_vx = (int64_t) source_image_width << 16; + int64_t tmp; + if (vx < 0) + { + tmp = ((int64_t) unit_x - 1 - vx) / unit_x; + if (tmp > *width) + { + *left_pad = *width; + *width = 0; + } + else + { + *left_pad = (int32_t) tmp; + *width -= (int32_t) tmp; + } + } + else + { + *left_pad = 0; + } + tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad; + if (tmp < 0) + { + *right_pad = *width; + *width = 0; + } + else if (tmp >= *width) + { + *right_pad = 0; + } + else + { + *right_pad = *width - (int32_t) tmp; + *width = (int32_t) tmp; + } +} + +/* A macroified version of specialized nearest scalers for some + * common 8888 and 565 formats. It supports SRC and OVER ops. + * + * There are two repeat versions, one that handles repeat normal, + * and one without repeat handling that only works if the src region + * used is completely covered by the pre-repeated source samples. + * + * The loops are unrolled to process two pixels per iteration for better + * performance on most CPU architectures (superscalar processors + * can issue several operations simultaneously, other processors can hide + * instructions latencies by pipelining operations). Unrolling more + * does not make much sense because the compiler will start running out + * of spare registers soon. + */ + +#define GET_8888_ALPHA(s) ((s) >> 24) + /* This is not actually used since we don't have an OVER with + 565 source, but it is needed to build. 
*/ +#define GET_0565_ALPHA(s) 0xff + +#define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT, \ + src_type_t, dst_type_t, OP, repeat_mode) \ +static force_inline void \ +scanline_func_name (dst_type_t *dst, \ + src_type_t *src, \ + int32_t w, \ + pixman_fixed_t vx, \ + pixman_fixed_t unit_x, \ + pixman_fixed_t max_vx) \ +{ \ + uint32_t d; \ + src_type_t s1, s2; \ + uint8_t a1, a2; \ + int x1, x2; \ + \ + if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER) \ + abort(); \ + \ + while ((w -= 2) >= 0) \ + { \ + x1 = vx >> 16; \ + vx += unit_x; \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ + { \ + /* This works because we know that unit_x is positive */ \ + while (vx >= max_vx) \ + vx -= max_vx; \ + } \ + s1 = src[x1]; \ + \ + x2 = vx >> 16; \ + vx += unit_x; \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ + { \ + /* This works because we know that unit_x is positive */ \ + while (vx >= max_vx) \ + vx -= max_vx; \ + } \ + s2 = src[x2]; \ + \ + if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \ + { \ + a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \ + a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2); \ + \ + if (a1 == 0xff) \ + { \ + *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ + } \ + else if (s1) \ + { \ + d = CONVERT_ ## DST_FORMAT ## _TO_8888 (*dst); \ + s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \ + a1 ^= 0xff; \ + UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \ + *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \ + } \ + dst++; \ + \ + if (a2 == 0xff) \ + { \ + *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \ + } \ + else if (s2) \ + { \ + d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \ + s2 = CONVERT_## SRC_FORMAT ## _TO_8888 (s2); \ + a2 ^= 0xff; \ + UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2); \ + *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \ + } \ + dst++; \ + } \ + else /* PIXMAN_OP_SRC */ \ + { \ + *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ + *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## 
DST_FORMAT (s2); \ + } \ + } \ + \ + if (w & 1) \ + { \ + x1 = vx >> 16; \ + s1 = src[x1]; \ + \ + if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \ + { \ + a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \ + \ + if (a1 == 0xff) \ + { \ + *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ + } \ + else if (s1) \ + { \ + d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \ + s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \ + a1 ^= 0xff; \ + UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \ + *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \ + } \ + dst++; \ + } \ + else /* PIXMAN_OP_SRC */ \ + { \ + *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ + } \ + } \ +} + +#define FAST_NEAREST_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, dst_type_t, \ + repeat_mode) \ +static void \ +fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp, \ + pixman_op_t op, \ + pixman_image_t * src_image, \ + pixman_image_t * mask_image, \ + pixman_image_t * dst_image, \ + int32_t src_x, \ + int32_t src_y, \ + int32_t mask_x, \ + int32_t mask_y, \ + int32_t dst_x, \ + int32_t dst_y, \ + int32_t width, \ + int32_t height) \ +{ \ + dst_type_t *dst_line; \ + src_type_t *src_first_line; \ + int y; \ + pixman_fixed_t max_vx = 0; /* suppress uninitialized variable warning */ \ + pixman_fixed_t max_vy; \ + pixman_vector_t v; \ + pixman_fixed_t vx, vy; \ + pixman_fixed_t unit_x, unit_y; \ + int32_t left_pad, right_pad; \ + \ + src_type_t *src; \ + dst_type_t *dst; \ + int src_stride, dst_stride; \ + \ + PIXMAN_IMAGE_GET_LINE (dst_image, dst_x, dst_y, dst_type_t, dst_stride, dst_line, 1); \ + /* pass in 0 instead of src_x and src_y because src_x and src_y need to be \ + * transformed from destination space to source space */ \ + PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1); \ + \ + /* reference point is the center of the pixel */ \ + v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2; \ + v.vector[1] = pixman_int_to_fixed (src_y) + 
pixman_fixed_1 / 2; \ + v.vector[2] = pixman_fixed_1; \ + \ + if (!pixman_transform_point_3d (src_image->common.transform, &v)) \ + return; \ + \ + unit_x = src_image->common.transform->matrix[0][0]; \ + unit_y = src_image->common.transform->matrix[1][1]; \ + \ + /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */ \ + v.vector[0] -= pixman_fixed_e; \ + v.vector[1] -= pixman_fixed_e; \ + \ + vx = v.vector[0]; \ + vy = v.vector[1]; \ + \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ + { \ + /* Clamp repeating positions inside the actual samples */ \ + max_vx = src_image->bits.width << 16; \ + max_vy = src_image->bits.height << 16; \ + \ + repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx); \ + repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \ + } \ + \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD || \ + PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ + { \ + pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x, \ + &width, &left_pad, &right_pad); \ + vx += left_pad * unit_x; \ + } \ + \ + while (--height >= 0) \ + { \ + dst = dst_line; \ + dst_line += dst_stride; \ + \ + y = vy >> 16; \ + vy += unit_y; \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ + repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD) \ + { \ + repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height); \ + src = src_first_line + src_stride * y; \ + if (left_pad > 0) \ + { \ + scanline_func (dst, src, left_pad, 0, 0, 0); \ + } \ + if (width > 0) \ + { \ + scanline_func (dst + left_pad, src, width, vx, unit_x, 0); \ + } \ + if (right_pad > 0) \ + { \ + scanline_func (dst + left_pad + width, src + src_image->bits.width - 1, \ + right_pad, 0, 0, 0); \ + } \ + } \ + else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ + { \ + static src_type_t zero[1] = { 0 }; \ + if (y < 0 || y >= src_image->bits.height) \ + { \ + scanline_func (dst, zero, left_pad + width + right_pad, 
0, 0, 0); \ + continue; \ + } \ + src = src_first_line + src_stride * y; \ + if (left_pad > 0) \ + { \ + scanline_func (dst, zero, left_pad, 0, 0, 0); \ + } \ + if (width > 0) \ + { \ + scanline_func (dst + left_pad, src, width, vx, unit_x, 0); \ + } \ + if (right_pad > 0) \ + { \ + scanline_func (dst + left_pad + width, zero, right_pad, 0, 0, 0); \ + } \ + } \ + else \ + { \ + src = src_first_line + src_stride * y; \ + scanline_func (dst, src, width, vx, unit_x, max_vx); \ + } \ + } \ +} + +/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */ +#define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func, src_type_t, dst_type_t, \ + repeat_mode) \ + FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, dst_type_t, \ + repeat_mode) \ + +#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT, \ + src_type_t, dst_type_t, OP, repeat_mode) \ + FAST_NEAREST_SCANLINE(scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \ + SRC_FORMAT, DST_FORMAT, src_type_t, dst_type_t, \ + OP, repeat_mode) \ + FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name ## _ ## OP, \ + scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \ + src_type_t, dst_type_t, repeat_mode) + + +#define SCALED_NEAREST_FLAGS \ + (FAST_PATH_SCALE_TRANSFORM | \ + FAST_PATH_NO_ALPHA_MAP | \ + FAST_PATH_NEAREST_FILTER | \ + FAST_PATH_NO_ACCESSORS | \ + FAST_PATH_NARROW_FORMAT) + +#define SIMPLE_NEAREST_FAST_PATH_NORMAL(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_NORMAL_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op, \ + } + +#define SIMPLE_NEAREST_FAST_PATH_PAD(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_PAD_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + 
fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op, \ + } + +#define SIMPLE_NEAREST_FAST_PATH_NONE(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_NONE_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _none ## _ ## op, \ + } + +#define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op, \ + } + +/* Prefer the use of 'cover' variant, because it is faster */ +#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func) \ + SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func), \ + SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func), \ + SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func), \ + SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func) + +#endif diff --git a/pixman/pixman/pixman-matrix.c b/pixman/pixman/pixman-matrix.c index abdfa0525..f2f67ab41 100644 --- a/pixman/pixman/pixman-matrix.c +++ b/pixman/pixman/pixman-matrix.c @@ -425,7 +425,8 @@ pixman_transform_is_inverse (const struct pixman_transform *a, { struct pixman_transform t; - pixman_transform_multiply (&t, a, b); + if (!pixman_transform_multiply (&t, a, b)) + return FALSE; return pixman_transform_is_identity (&t); } diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c index b19043bbd..94ba54cbf 100644 --- a/pixman/pixman/pixman-sse2.c +++ b/pixman/pixman/pixman-sse2.c @@ -1,6031 +1,6031 @@ -/*
- * Copyright © 2008 Rodrigo Kumpera
- * Copyright © 2008 André Tupinambá
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Red Hat not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission. Red Hat makes no representations about the
- * suitability of this software for any purpose. It is provided "as is"
- * without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- *
- * Author: Rodrigo Kumpera (kumpera@gmail.com)
- * André Tupinambá (andrelrt@gmail.com)
- *
- * Based on work by Owen Taylor and Søren Sandmann
- */
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#include <mmintrin.h>
-#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
-#include <emmintrin.h> /* for SSE2 intrinsics */
-#include "pixman-private.h"
-#include "pixman-combine32.h"
-#include "pixman-fast-path.h"
-
-#if defined(_MSC_VER) && defined(_M_AMD64)
-/* Windows 64 doesn't allow MMX to be used, so
- * the pixman-x64-mmx-emulation.h file contains
- * implementations of those MMX intrinsics that
- * are used in the SSE2 implementation.
- */
-# include "pixman-x64-mmx-emulation.h"
-#endif
-
-#ifdef USE_SSE2
-
-/* --------------------------------------------------------------------
- * Locals
- */
-
-static __m64 mask_x0080;
-static __m64 mask_x00ff;
-static __m64 mask_x0101;
-static __m64 mask_x_alpha;
-
-static __m64 mask_x565_rgb;
-static __m64 mask_x565_unpack;
-
-static __m128i mask_0080;
-static __m128i mask_00ff;
-static __m128i mask_0101;
-static __m128i mask_ffff;
-static __m128i mask_ff000000;
-static __m128i mask_alpha;
-
-static __m128i mask_565_r;
-static __m128i mask_565_g1, mask_565_g2;
-static __m128i mask_565_b;
-static __m128i mask_red;
-static __m128i mask_green;
-static __m128i mask_blue;
-
-static __m128i mask_565_fix_rb;
-static __m128i mask_565_fix_g;
-
-/* ----------------------------------------------------------------------
- * SSE2 Inlines
- */
-static force_inline __m128i
-unpack_32_1x128 (uint32_t data)
-{
- return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
-}
-
-static force_inline void
-unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
-{
- *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
- *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
-}
-
-static force_inline __m128i
-unpack_565_to_8888 (__m128i lo)
-{
- __m128i r, g, b, rb, t;
-
- r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
- g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
- b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
-
- rb = _mm_or_si128 (r, b);
- t = _mm_and_si128 (rb, mask_565_fix_rb);
- t = _mm_srli_epi32 (t, 5);
- rb = _mm_or_si128 (rb, t);
-
- t = _mm_and_si128 (g, mask_565_fix_g);
- t = _mm_srli_epi32 (t, 6);
- g = _mm_or_si128 (g, t);
-
- return _mm_or_si128 (rb, g);
-}
-
-static force_inline void
-unpack_565_128_4x128 (__m128i data,
- __m128i* data0,
- __m128i* data1,
- __m128i* data2,
- __m128i* data3)
-{
- __m128i lo, hi;
-
- lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
- hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
-
- lo = unpack_565_to_8888 (lo);
- hi = unpack_565_to_8888 (hi);
-
- unpack_128_2x128 (lo, data0, data1);
- unpack_128_2x128 (hi, data2, data3);
-}
-
-static force_inline uint16_t
-pack_565_32_16 (uint32_t pixel)
-{
- return (uint16_t) (((pixel >> 8) & 0xf800) |
- ((pixel >> 5) & 0x07e0) |
- ((pixel >> 3) & 0x001f));
-}
-
-static force_inline __m128i
-pack_2x128_128 (__m128i lo, __m128i hi)
-{
- return _mm_packus_epi16 (lo, hi);
-}
-
-static force_inline __m128i
-pack_565_2x128_128 (__m128i lo, __m128i hi)
-{
- __m128i data;
- __m128i r, g1, g2, b;
-
- data = pack_2x128_128 (lo, hi);
-
- r = _mm_and_si128 (data, mask_565_r);
- g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
- g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
- b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
-
- return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
-}
-
-static force_inline __m128i
-pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
-{
- return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
- pack_565_2x128_128 (*xmm2, *xmm3));
-}
-
-static force_inline int
-is_opaque (__m128i x)
-{
- __m128i ffs = _mm_cmpeq_epi8 (x, x);
-
- return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
-}
-
-static force_inline int
-is_zero (__m128i x)
-{
- return _mm_movemask_epi8 (
- _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
-}
-
-static force_inline int
-is_transparent (__m128i x)
-{
- return (_mm_movemask_epi8 (
- _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
-}
-
-static force_inline __m128i
-expand_pixel_32_1x128 (uint32_t data)
-{
- return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
-}
-
-static force_inline __m128i
-expand_alpha_1x128 (__m128i data)
-{
- return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
- _MM_SHUFFLE (3, 3, 3, 3)),
- _MM_SHUFFLE (3, 3, 3, 3));
-}
-
-static force_inline void
-expand_alpha_2x128 (__m128i data_lo,
- __m128i data_hi,
- __m128i* alpha_lo,
- __m128i* alpha_hi)
-{
- __m128i lo, hi;
-
- lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
- hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
-
- *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
- *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
-}
-
-static force_inline void
-expand_alpha_rev_2x128 (__m128i data_lo,
- __m128i data_hi,
- __m128i* alpha_lo,
- __m128i* alpha_hi)
-{
- __m128i lo, hi;
-
- lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
- hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
- *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
- *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
-}
-
-static force_inline void
-pix_multiply_2x128 (__m128i* data_lo,
- __m128i* data_hi,
- __m128i* alpha_lo,
- __m128i* alpha_hi,
- __m128i* ret_lo,
- __m128i* ret_hi)
-{
- __m128i lo, hi;
-
- lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
- hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
- lo = _mm_adds_epu16 (lo, mask_0080);
- hi = _mm_adds_epu16 (hi, mask_0080);
- *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
- *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
-}
-
-static force_inline void
-pix_add_multiply_2x128 (__m128i* src_lo,
- __m128i* src_hi,
- __m128i* alpha_dst_lo,
- __m128i* alpha_dst_hi,
- __m128i* dst_lo,
- __m128i* dst_hi,
- __m128i* alpha_src_lo,
- __m128i* alpha_src_hi,
- __m128i* ret_lo,
- __m128i* ret_hi)
-{
- __m128i t1_lo, t1_hi;
- __m128i t2_lo, t2_hi;
-
- pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
- pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
-
- *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
- *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
-}
-
-static force_inline void
-negate_2x128 (__m128i data_lo,
- __m128i data_hi,
- __m128i* neg_lo,
- __m128i* neg_hi)
-{
- *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
- *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
-}
-
-static force_inline void
-invert_colors_2x128 (__m128i data_lo,
- __m128i data_hi,
- __m128i* inv_lo,
- __m128i* inv_hi)
-{
- __m128i lo, hi;
-
- lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
- hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
- *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
- *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
-}
-
-static force_inline void
-over_2x128 (__m128i* src_lo,
- __m128i* src_hi,
- __m128i* alpha_lo,
- __m128i* alpha_hi,
- __m128i* dst_lo,
- __m128i* dst_hi)
-{
- __m128i t1, t2;
-
- negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
-
- pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
-
- *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
- *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
-}
-
-static force_inline void
-over_rev_non_pre_2x128 (__m128i src_lo,
- __m128i src_hi,
- __m128i* dst_lo,
- __m128i* dst_hi)
-{
- __m128i lo, hi;
- __m128i alpha_lo, alpha_hi;
-
- expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
-
- lo = _mm_or_si128 (alpha_lo, mask_alpha);
- hi = _mm_or_si128 (alpha_hi, mask_alpha);
-
- invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
-
- pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
-
- over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
-}
-
-static force_inline void
-in_over_2x128 (__m128i* src_lo,
- __m128i* src_hi,
- __m128i* alpha_lo,
- __m128i* alpha_hi,
- __m128i* mask_lo,
- __m128i* mask_hi,
- __m128i* dst_lo,
- __m128i* dst_hi)
-{
- __m128i s_lo, s_hi;
- __m128i a_lo, a_hi;
-
- pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
- pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
-
- over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
-}
-
-/* load 4 pixels from a 16-byte boundary aligned address */
-static force_inline __m128i
-load_128_aligned (__m128i* src)
-{
- return _mm_load_si128 (src);
-}
-
-/* load 4 pixels from a unaligned address */
-static force_inline __m128i
-load_128_unaligned (const __m128i* src)
-{
- return _mm_loadu_si128 (src);
-}
-
-/* save 4 pixels using Write Combining memory on a 16-byte
- * boundary aligned address
- */
-static force_inline void
-save_128_write_combining (__m128i* dst,
- __m128i data)
-{
- _mm_stream_si128 (dst, data);
-}
-
-/* save 4 pixels on a 16-byte boundary aligned address */
-static force_inline void
-save_128_aligned (__m128i* dst,
- __m128i data)
-{
- _mm_store_si128 (dst, data);
-}
-
-/* save 4 pixels on a unaligned address */
-static force_inline void
-save_128_unaligned (__m128i* dst,
- __m128i data)
-{
- _mm_storeu_si128 (dst, data);
-}
-
-/* ------------------------------------------------------------------
- * MMX inlines
- */
-
-static force_inline __m64
-load_32_1x64 (uint32_t data)
-{
- return _mm_cvtsi32_si64 (data);
-}
-
-static force_inline __m64
-unpack_32_1x64 (uint32_t data)
-{
- return _mm_unpacklo_pi8 (load_32_1x64 (data), _mm_setzero_si64 ());
-}
-
-static force_inline __m64
-expand_alpha_1x64 (__m64 data)
-{
- return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
-}
-
-static force_inline __m64
-expand_alpha_rev_1x64 (__m64 data)
-{
- return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
-}
-
-static force_inline __m64
-expand_pixel_8_1x64 (uint8_t data)
-{
- return _mm_shuffle_pi16 (
- unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
-}
-
-static force_inline __m64
-pix_multiply_1x64 (__m64 data,
- __m64 alpha)
-{
- return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
- mask_x0080),
- mask_x0101);
-}
-
-static force_inline __m64
-pix_add_multiply_1x64 (__m64* src,
- __m64* alpha_dst,
- __m64* dst,
- __m64* alpha_src)
-{
- __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
- __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);
-
- return _mm_adds_pu8 (t1, t2);
-}
-
-static force_inline __m64
-negate_1x64 (__m64 data)
-{
- return _mm_xor_si64 (data, mask_x00ff);
-}
-
-static force_inline __m64
-invert_colors_1x64 (__m64 data)
-{
- return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
-}
-
-static force_inline __m64
-over_1x64 (__m64 src, __m64 alpha, __m64 dst)
-{
- return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
-}
-
-static force_inline __m64
-in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
-{
- return over_1x64 (pix_multiply_1x64 (*src, *mask),
- pix_multiply_1x64 (*alpha, *mask),
- *dst);
-}
-
-static force_inline __m64
-over_rev_non_pre_1x64 (__m64 src, __m64 dst)
-{
- __m64 alpha = expand_alpha_1x64 (src);
-
- return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
- _mm_or_si64 (alpha, mask_x_alpha)),
- alpha,
- dst);
-}
-
-static force_inline uint32_t
-pack_1x64_32 (__m64 data)
-{
- return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
-}
-
-/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
- *
- * 00RR00GG00BB
- *
- * --- Expanding 565 in the low word ---
- *
- * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
- * m = m & (01f0003f001f);
- * m = m * (008404100840);
- * m = m >> 8;
- *
- * Note the trick here - the top word is shifted by another nibble to
- * avoid it bumping into the middle word
- */
-static force_inline __m64
-expand565_16_1x64 (uint16_t pixel)
-{
- __m64 p;
- __m64 t1, t2;
-
- p = _mm_cvtsi32_si64 ((uint32_t) pixel);
-
- t1 = _mm_slli_si64 (p, 36 - 11);
- t2 = _mm_slli_si64 (p, 16 - 5);
-
- p = _mm_or_si64 (t1, p);
- p = _mm_or_si64 (t2, p);
- p = _mm_and_si64 (p, mask_x565_rgb);
- p = _mm_mullo_pi16 (p, mask_x565_unpack);
-
- return _mm_srli_pi16 (p, 8);
-}
-
-/* ----------------------------------------------------------------------------
- * Compose Core transformations
- */
-static force_inline uint32_t
-core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
-{
- uint8_t a;
- __m64 ms;
-
- a = src >> 24;
-
- if (a == 0xff)
- {
- return src;
- }
- else if (src)
- {
- ms = unpack_32_1x64 (src);
- return pack_1x64_32 (
- over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
- }
-
- return dst;
-}
-
-static force_inline uint32_t
-combine1 (const uint32_t *ps, const uint32_t *pm)
-{
- uint32_t s = *ps;
-
- if (pm)
- {
- __m64 ms, mm;
-
- mm = unpack_32_1x64 (*pm);
- mm = expand_alpha_1x64 (mm);
-
- ms = unpack_32_1x64 (s);
- ms = pix_multiply_1x64 (ms, mm);
-
- s = pack_1x64_32 (ms);
- }
-
- return s;
-}
-
-static force_inline __m128i
-combine4 (const __m128i *ps, const __m128i *pm)
-{
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_msk_lo, xmm_msk_hi;
- __m128i s;
-
- if (pm)
- {
- xmm_msk_lo = load_128_unaligned (pm);
-
- if (is_transparent (xmm_msk_lo))
- return _mm_setzero_si128 ();
- }
-
- s = load_128_unaligned (ps);
-
- if (pm)
- {
- unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
-
- expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_msk_lo, &xmm_msk_hi,
- &xmm_src_lo, &xmm_src_hi);
-
- s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
- }
-
- return s;
-}
-
-static force_inline void
-core_combine_over_u_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t* pm,
- int w)
-{
- uint32_t s, d;
-
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_alpha_lo, xmm_alpha_hi;
-
- /* Align dst on a 16-byte boundary */
- while (w && ((unsigned long)pd & 15))
- {
- d = *pd;
- s = combine1 (ps, pm);
-
- *pd++ = core_combine_over_u_pixel_sse2 (s, d);
- ps++;
- if (pm)
- pm++;
- w--;
- }
-
- while (w >= 4)
- {
- /* I'm loading unaligned because I'm not sure about
- * the address alignment.
- */
- xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
-
- if (is_opaque (xmm_src_hi))
- {
- save_128_aligned ((__m128i*)pd, xmm_src_hi);
- }
- else if (!is_zero (xmm_src_hi))
- {
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (
- xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
-
- over_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- /* rebuid the 4 pixel data and save*/
- save_128_aligned ((__m128i*)pd,
- pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
-
- w -= 4;
- ps += 4;
- pd += 4;
- if (pm)
- pm += 4;
- }
-
- while (w)
- {
- d = *pd;
- s = combine1 (ps, pm);
-
- *pd++ = core_combine_over_u_pixel_sse2 (s, d);
- ps++;
- if (pm)
- pm++;
-
- w--;
- }
-}
-
-static force_inline void
-core_combine_over_reverse_u_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t* pm,
- int w)
-{
- uint32_t s, d;
-
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_alpha_lo, xmm_alpha_hi;
-
- /* Align dst on a 16-byte boundary */
- while (w &&
- ((unsigned long)pd & 15))
- {
- d = *pd;
- s = combine1 (ps, pm);
-
- *pd++ = core_combine_over_u_pixel_sse2 (d, s);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-
- while (w >= 4)
- {
- /* I'm loading unaligned because I'm not sure
- * about the address alignment.
- */
- xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_src_lo, &xmm_src_hi);
-
- /* rebuid the 4 pixel data and save*/
- save_128_aligned ((__m128i*)pd,
- pack_2x128_128 (xmm_src_lo, xmm_src_hi));
-
- w -= 4;
- ps += 4;
- pd += 4;
-
- if (pm)
- pm += 4;
- }
-
- while (w)
- {
- d = *pd;
- s = combine1 (ps, pm);
-
- *pd++ = core_combine_over_u_pixel_sse2 (d, s);
- ps++;
- w--;
- if (pm)
- pm++;
- }
-}
-
-static force_inline uint32_t
-core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
-{
- uint32_t maska = src >> 24;
-
- if (maska == 0)
- {
- return 0;
- }
- else if (maska != 0xff)
- {
- return pack_1x64_32 (
- pix_multiply_1x64 (unpack_32_1x64 (dst),
- expand_alpha_1x64 (unpack_32_1x64 (src))));
- }
-
- return dst;
-}
-
-static force_inline void
-core_combine_in_u_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t* pm,
- int w)
-{
- uint32_t s, d;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
-
- while (w && ((unsigned long) pd & 15))
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_in_u_pixelsse2 (d, s);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
- xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_dst_lo, &xmm_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned ((__m128i*)pd,
- pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- w -= 4;
- if (pm)
- pm += 4;
- }
-
- while (w)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_in_u_pixelsse2 (d, s);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-}
-
-static force_inline void
-core_combine_reverse_in_u_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t *pm,
- int w)
-{
- uint32_t s, d;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
-
- while (w && ((unsigned long) pd & 15))
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_in_u_pixelsse2 (s, d);
- ps++;
- w--;
- if (pm)
- pm++;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
- xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_src_lo, &xmm_src_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- w -= 4;
- if (pm)
- pm += 4;
- }
-
- while (w)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_in_u_pixelsse2 (s, d);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-}
-
-static force_inline void
-core_combine_reverse_out_u_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t* pm,
- int w)
-{
- while (w && ((unsigned long) pd & 15))
- {
- uint32_t s = combine1 (ps, pm);
- uint32_t d = *pd;
-
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (d), negate_1x64 (
- expand_alpha_1x64 (unpack_32_1x64 (s)))));
-
- if (pm)
- pm++;
- ps++;
- w--;
- }
-
- while (w >= 4)
- {
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
-
- xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-
- pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_src_lo, &xmm_src_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- if (pm)
- pm += 4;
-
- w -= 4;
- }
-
- while (w)
- {
- uint32_t s = combine1 (ps, pm);
- uint32_t d = *pd;
-
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (d), negate_1x64 (
- expand_alpha_1x64 (unpack_32_1x64 (s)))));
- ps++;
- if (pm)
- pm++;
- w--;
- }
-}
-
-static force_inline void
-core_combine_out_u_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t* pm,
- int w)
-{
- while (w && ((unsigned long) pd & 15))
- {
- uint32_t s = combine1 (ps, pm);
- uint32_t d = *pd;
-
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (s), negate_1x64 (
- expand_alpha_1x64 (unpack_32_1x64 (d)))));
- w--;
- ps++;
- if (pm)
- pm++;
- }
-
- while (w >= 4)
- {
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
-
- xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_dst_lo, &xmm_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- w -= 4;
- if (pm)
- pm += 4;
- }
-
- while (w)
- {
- uint32_t s = combine1 (ps, pm);
- uint32_t d = *pd;
-
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (s), negate_1x64 (
- expand_alpha_1x64 (unpack_32_1x64 (d)))));
- w--;
- ps++;
- if (pm)
- pm++;
- }
-}
-
-static force_inline uint32_t
-core_combine_atop_u_pixel_sse2 (uint32_t src,
- uint32_t dst)
-{
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
-
- __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
- __m64 da = expand_alpha_1x64 (d);
-
- return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
-}
-
-static force_inline void
-core_combine_atop_u_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t* pm,
- int w)
-{
- uint32_t s, d;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
- __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
-
- while (w && ((unsigned long) pd & 15))
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-
- while (w >= 4)
- {
- xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi);
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi);
-
- pix_add_multiply_2x128 (
- &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- w -= 4;
- if (pm)
- pm += 4;
- }
-
- while (w)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-}
-
-static force_inline uint32_t
-core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
- uint32_t dst)
-{
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
-
- __m64 sa = expand_alpha_1x64 (s);
- __m64 da = negate_1x64 (expand_alpha_1x64 (d));
-
- return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
-}
-
-static force_inline void
-core_combine_reverse_atop_u_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t* pm,
- int w)
-{
- uint32_t s, d;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
- __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
-
- while (w && ((unsigned long) pd & 15))
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
- ps++;
- w--;
- if (pm)
- pm++;
- }
-
- while (w >= 4)
- {
- xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi);
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- pix_add_multiply_2x128 (
- &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- w -= 4;
- if (pm)
- pm += 4;
- }
-
- while (w)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
- ps++;
- w--;
- if (pm)
- pm++;
- }
-}
-
-static force_inline uint32_t
-core_combine_xor_u_pixel_sse2 (uint32_t src,
- uint32_t dst)
-{
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
-
- __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
- __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
-
- return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
-}
-
-static force_inline void
-core_combine_xor_u_sse2 (uint32_t* dst,
- const uint32_t* src,
- const uint32_t *mask,
- int width)
-{
- int w = width;
- uint32_t s, d;
- uint32_t* pd = dst;
- const uint32_t* ps = src;
- const uint32_t* pm = mask;
-
- __m128i xmm_src, xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
- __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
-
- while (w && ((unsigned long) pd & 15))
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-
- while (w >= 4)
- {
- xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
- xmm_dst = load_128_aligned ((__m128i*) pd);
-
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi);
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi);
- negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- pix_add_multiply_2x128 (
- &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- w -= 4;
- if (pm)
- pm += 4;
- }
-
- while (w)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-}
-
-static force_inline void
-core_combine_add_u_sse2 (uint32_t* dst,
- const uint32_t* src,
- const uint32_t* mask,
- int width)
-{
- int w = width;
- uint32_t s, d;
- uint32_t* pd = dst;
- const uint32_t* ps = src;
- const uint32_t* pm = mask;
-
- while (w && (unsigned long)pd & 15)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- ps++;
- if (pm)
- pm++;
- *pd++ = _mm_cvtsi64_si32 (
- _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
- w--;
- }
-
- while (w >= 4)
- {
- __m128i s;
-
- s = combine4 ((__m128i*)ps, (__m128i*)pm);
-
- save_128_aligned (
- (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));
-
- pd += 4;
- ps += 4;
- if (pm)
- pm += 4;
- w -= 4;
- }
-
- while (w--)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- ps++;
- *pd++ = _mm_cvtsi64_si32 (
- _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
- if (pm)
- pm++;
- }
-}
-
-static force_inline uint32_t
-core_combine_saturate_u_pixel_sse2 (uint32_t src,
- uint32_t dst)
-{
- __m64 ms = unpack_32_1x64 (src);
- __m64 md = unpack_32_1x64 (dst);
- uint32_t sa = src >> 24;
- uint32_t da = ~dst >> 24;
-
- if (sa > da)
- {
- ms = pix_multiply_1x64 (
- ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
- }
-
- return pack_1x64_32 (_mm_adds_pu16 (md, ms));
-}
-
-static force_inline void
-core_combine_saturate_u_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
-{
- uint32_t s, d;
-
- uint32_t pack_cmp;
- __m128i xmm_src, xmm_dst;
-
- while (w && (unsigned long)pd & 15)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-
- while (w >= 4)
- {
- xmm_dst = load_128_aligned ((__m128i*)pd);
- xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
-
- pack_cmp = _mm_movemask_epi8 (
- _mm_cmpgt_epi32 (
- _mm_srli_epi32 (xmm_src, 24),
- _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
-
- /* if some alpha src is grater than respective ~alpha dst */
- if (pack_cmp)
- {
- s = combine1 (ps++, pm);
- d = *pd;
- *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
- if (pm)
- pm++;
-
- s = combine1 (ps++, pm);
- d = *pd;
- *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
- if (pm)
- pm++;
-
- s = combine1 (ps++, pm);
- d = *pd;
- *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
- if (pm)
- pm++;
-
- s = combine1 (ps++, pm);
- d = *pd;
- *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
- if (pm)
- pm++;
- }
- else
- {
- save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
-
- pd += 4;
- ps += 4;
- if (pm)
- pm += 4;
- }
-
- w -= 4;
- }
-
- while (w--)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
- ps++;
- if (pm)
- pm++;
- }
-}
-
-static force_inline void
-core_combine_src_ca_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t *pm,
- int w)
-{
- uint32_t s, m;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
- w--;
- }
-
- while (w >= 4)
- {
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
- w--;
- }
-}
-
-static force_inline uint32_t
-core_combine_over_ca_pixel_sse2 (uint32_t src,
- uint32_t mask,
- uint32_t dst)
-{
- __m64 s = unpack_32_1x64 (src);
- __m64 expAlpha = expand_alpha_1x64 (s);
- __m64 unpk_mask = unpack_32_1x64 (mask);
- __m64 unpk_dst = unpack_32_1x64 (dst);
-
- return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
-}
-
-static force_inline void
-core_combine_over_ca_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t *pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_alpha_lo, xmm_alpha_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
- w--;
- }
-}
-
-static force_inline uint32_t
-core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
- uint32_t mask,
- uint32_t dst)
-{
- __m64 d = unpack_32_1x64 (dst);
-
- return pack_1x64_32 (
- over_1x64 (d, expand_alpha_1x64 (d),
- pix_multiply_1x64 (unpack_32_1x64 (src),
- unpack_32_1x64 (mask))));
-}
-
-static force_inline void
-core_combine_over_reverse_ca_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t *pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_alpha_lo, xmm_alpha_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
- w--;
- }
-}
-
-static force_inline void
-core_combine_in_ca_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_alpha_lo, xmm_alpha_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
- expand_alpha_1x64 (unpack_32_1x64 (d))));
-
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- pix_multiply_1x64 (
- unpack_32_1x64 (s), unpack_32_1x64 (m)),
- expand_alpha_1x64 (unpack_32_1x64 (d))));
-
- w--;
- }
-}
-
-static force_inline void
-core_combine_in_reverse_ca_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_alpha_lo, xmm_alpha_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (d),
- pix_multiply_1x64 (unpack_32_1x64 (m),
- expand_alpha_1x64 (unpack_32_1x64 (s)))));
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
- pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (d),
- pix_multiply_1x64 (unpack_32_1x64 (m),
- expand_alpha_1x64 (unpack_32_1x64 (s)))));
- w--;
- }
-}
-
-static force_inline void
-core_combine_out_ca_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_alpha_lo, xmm_alpha_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- pix_multiply_1x64 (
- unpack_32_1x64 (s), unpack_32_1x64 (m)),
- negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
- negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst_lo, &xmm_dst_hi);
- pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- pix_multiply_1x64 (
- unpack_32_1x64 (s), unpack_32_1x64 (m)),
- negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
-
- w--;
- }
-}
-
-static force_inline void
-core_combine_out_reverse_ca_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_alpha_lo, xmm_alpha_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (d),
- negate_1x64 (pix_multiply_1x64 (
- unpack_32_1x64 (m),
- expand_alpha_1x64 (unpack_32_1x64 (s))))));
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- negate_2x128 (xmm_mask_lo, xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (d),
- negate_1x64 (pix_multiply_1x64 (
- unpack_32_1x64 (m),
- expand_alpha_1x64 (unpack_32_1x64 (s))))));
- w--;
- }
-}
-
-static force_inline uint32_t
-core_combine_atop_ca_pixel_sse2 (uint32_t src,
- uint32_t mask,
- uint32_t dst)
-{
- __m64 m = unpack_32_1x64 (mask);
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
- __m64 sa = expand_alpha_1x64 (s);
- __m64 da = expand_alpha_1x64 (d);
-
- s = pix_multiply_1x64 (s, m);
- m = negate_1x64 (pix_multiply_1x64 (m, sa));
-
- return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
-}
-
-static force_inline void
-core_combine_atop_ca_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
- __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi);
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_src_lo, &xmm_src_hi);
- pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- pix_add_multiply_2x128 (
- &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
- &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
- w--;
- }
-}
-
-static force_inline uint32_t
-core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
- uint32_t mask,
- uint32_t dst)
-{
- __m64 m = unpack_32_1x64 (mask);
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
-
- __m64 da = negate_1x64 (expand_alpha_1x64 (d));
- __m64 sa = expand_alpha_1x64 (s);
-
- s = pix_multiply_1x64 (s, m);
- m = pix_multiply_1x64 (m, sa);
-
- return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
-}
-
-static force_inline void
-core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
- __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi);
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_src_lo, &xmm_src_hi);
- pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- pix_add_multiply_2x128 (
- &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
- &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
- w--;
- }
-}
-
-static force_inline uint32_t
-core_combine_xor_ca_pixel_sse2 (uint32_t src,
- uint32_t mask,
- uint32_t dst)
-{
- __m64 a = unpack_32_1x64 (mask);
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
-
- __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
- a, expand_alpha_1x64 (s)));
- __m64 dest = pix_multiply_1x64 (s, a);
- __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
-
- return pack_1x64_32 (pix_add_multiply_1x64 (&d,
- &alpha_dst,
- &dest,
- &alpha_src));
-}
-
-static force_inline void
-core_combine_xor_ca_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
- __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi);
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_src_lo, &xmm_src_hi);
- pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
- negate_2x128 (xmm_mask_lo, xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- pix_add_multiply_2x128 (
- &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
- &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
- w--;
- }
-}
-
-static force_inline void
-core_combine_add_ca_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (
- _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
- unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
- w--;
- }
-
- while (w >= 4)
- {
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_src_lo, &xmm_src_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (
- _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
- _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (
- _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
- unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
- w--;
- }
-}
-
-/* ---------------------------------------------------
- * fb_compose_setup_sSE2
- */
-static force_inline __m64
-create_mask_16_64 (uint16_t mask)
-{
- return _mm_set1_pi16 (mask);
-}
-
-static force_inline __m128i
-create_mask_16_128 (uint16_t mask)
-{
- return _mm_set1_epi16 (mask);
-}
-
-static force_inline __m64
-create_mask_2x32_64 (uint32_t mask0,
- uint32_t mask1)
-{
- return _mm_set_pi32 (mask0, mask1);
-}
-
-/* Work around a code generation bug in Sun Studio 12. */
-#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
-# define create_mask_2x32_128(mask0, mask1) \
- (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
-#else
-static force_inline __m128i
-create_mask_2x32_128 (uint32_t mask0,
- uint32_t mask1)
-{
- return _mm_set_epi32 (mask0, mask1, mask0, mask1);
-}
-#endif
-
-/* SSE2 code patch for fbcompose.c */
-
-static void
-sse2_combine_over_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_over_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_over_reverse_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_over_reverse_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_in_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_in_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_in_reverse_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_reverse_in_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_out_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_out_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_out_reverse_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_reverse_out_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_atop_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_atop_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_xor_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_xor_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_add_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_add_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_saturate_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_saturate_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_src_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_src_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_over_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_over_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_in_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_in_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_out_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_out_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_atop_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_atop_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_xor_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_xor_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_add_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_add_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-/* -------------------------------------------------------------------
- * composite_over_n_8888
- */
-
-static void
-sse2_composite_over_n_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src;
- uint32_t *dst_line, *dst, d;
- int32_t w;
- int dst_stride;
- __m128i xmm_src, xmm_alpha;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-
- xmm_src = expand_pixel_32_1x128 (src);
- xmm_alpha = expand_alpha_1x128 (xmm_src);
-
- while (height--)
- {
- dst = dst_line;
-
- dst_line += dst_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- d = *dst;
- *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
- _mm_movepi64_pi64 (xmm_alpha),
- unpack_32_1x64 (d)));
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- over_2x128 (&xmm_src, &xmm_src,
- &xmm_alpha, &xmm_alpha,
- &xmm_dst_lo, &xmm_dst_hi);
-
- /* rebuid the 4 pixel data and save*/
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- w -= 4;
- dst += 4;
- }
-
- while (w)
- {
- d = *dst;
- *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
- _mm_movepi64_pi64 (xmm_alpha),
- unpack_32_1x64 (d)));
- w--;
- }
-
- }
- _mm_empty ();
-}
-
-/* ---------------------------------------------------------------------
- * composite_over_n_0565
- */
-static void
-sse2_composite_over_n_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src;
- uint16_t *dst_line, *dst, d;
- int32_t w;
- int dst_stride;
- __m128i xmm_src, xmm_alpha;
- __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-
- xmm_src = expand_pixel_32_1x128 (src);
- xmm_alpha = expand_alpha_1x128 (xmm_src);
-
- while (height--)
- {
- dst = dst_line;
-
- dst_line += dst_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- d = *dst;
-
- *dst++ = pack_565_32_16 (
- pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
- _mm_movepi64_pi64 (xmm_alpha),
- expand565_16_1x64 (d))));
- w--;
- }
-
- while (w >= 8)
- {
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_565_128_4x128 (xmm_dst,
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
-
- over_2x128 (&xmm_src, &xmm_src,
- &xmm_alpha, &xmm_alpha,
- &xmm_dst0, &xmm_dst1);
- over_2x128 (&xmm_src, &xmm_src,
- &xmm_alpha, &xmm_alpha,
- &xmm_dst2, &xmm_dst3);
-
- xmm_dst = pack_565_4x128_128 (
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
-
- save_128_aligned ((__m128i*)dst, xmm_dst);
-
- dst += 8;
- w -= 8;
- }
-
- while (w--)
- {
- d = *dst;
- *dst++ = pack_565_32_16 (
- pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
- _mm_movepi64_pi64 (xmm_alpha),
- expand565_16_1x64 (d))));
- }
- }
-
- _mm_empty ();
-}
-
-/* ------------------------------
- * composite_add_n_8888_8888_ca
- */
-static void
-sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint32_t *dst_line, d;
- uint32_t *mask_line, m;
- uint32_t pack_cmp;
- int dst_stride, mask_stride;
-
- __m128i xmm_src, xmm_alpha;
- __m128i xmm_dst;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
- __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
- srca = src >> 24;
-
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
- xmm_src = _mm_unpacklo_epi8 (
- create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
- xmm_alpha = expand_alpha_1x128 (xmm_src);
- mmx_src = _mm_movepi64_pi64 (xmm_src);
- mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
-
- while (height--)
- {
- int w = width;
- const uint32_t *pm = (uint32_t *)mask_line;
- uint32_t *pd = (uint32_t *)dst_line;
-
- dst_line += dst_stride;
- mask_line += mask_stride;
-
- while (w && (unsigned long)pd & 15)
- {
- m = *pm++;
-
- if (m)
- {
- d = *pd;
-
- mmx_mask = unpack_32_1x64 (m);
- mmx_dest = unpack_32_1x64 (d);
-
- *pd = pack_1x64_32 (
- _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
- }
-
- pd++;
- w--;
- }
-
- while (w >= 4)
- {
- xmm_mask = load_128_unaligned ((__m128i*)pm);
-
- pack_cmp =
- _mm_movemask_epi8 (
- _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
-
- /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
- if (pack_cmp != 0xffff)
- {
- xmm_dst = load_128_aligned ((__m128i*)pd);
-
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
- pix_multiply_2x128 (&xmm_src, &xmm_src,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
- xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
-
- save_128_aligned (
- (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
- }
-
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- m = *pm++;
-
- if (m)
- {
- d = *pd;
-
- mmx_mask = unpack_32_1x64 (m);
- mmx_dest = unpack_32_1x64 (d);
-
- *pd = pack_1x64_32 (
- _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
- }
-
- pd++;
- w--;
- }
- }
-
- _mm_empty ();
-}
-
-/* ---------------------------------------------------------------------------
- * composite_over_n_8888_8888_ca
- */
-
-static void
-sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src;
- uint32_t *dst_line, d;
- uint32_t *mask_line, m;
- uint32_t pack_cmp;
- int dst_stride, mask_stride;
-
- __m128i xmm_src, xmm_alpha;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
- __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
- xmm_src = _mm_unpacklo_epi8 (
- create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
- xmm_alpha = expand_alpha_1x128 (xmm_src);
- mmx_src = _mm_movepi64_pi64 (xmm_src);
- mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
-
- while (height--)
- {
- int w = width;
- const uint32_t *pm = (uint32_t *)mask_line;
- uint32_t *pd = (uint32_t *)dst_line;
-
- dst_line += dst_stride;
- mask_line += mask_stride;
-
- while (w && (unsigned long)pd & 15)
- {
- m = *pm++;
-
- if (m)
- {
- d = *pd;
- mmx_mask = unpack_32_1x64 (m);
- mmx_dest = unpack_32_1x64 (d);
-
- *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
- &mmx_alpha,
- &mmx_mask,
- &mmx_dest));
- }
-
- pd++;
- w--;
- }
-
- while (w >= 4)
- {
- xmm_mask = load_128_unaligned ((__m128i*)pm);
-
- pack_cmp =
- _mm_movemask_epi8 (
- _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
-
- /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
- if (pack_cmp != 0xffff)
- {
- xmm_dst = load_128_aligned ((__m128i*)pd);
-
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- in_over_2x128 (&xmm_src, &xmm_src,
- &xmm_alpha, &xmm_alpha,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
-
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- m = *pm++;
-
- if (m)
- {
- d = *pd;
- mmx_mask = unpack_32_1x64 (m);
- mmx_dest = unpack_32_1x64 (d);
-
- *pd = pack_1x64_32 (
- in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
- }
-
- pd++;
- w--;
- }
- }
-
- _mm_empty ();
-}
-
-/*---------------------------------------------------------------------
- * composite_over_8888_n_8888
- */
-
-static void
-sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src;
- uint32_t mask;
- int32_t w;
- int dst_stride, src_stride;
-
- __m128i xmm_mask;
- __m128i xmm_src, xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_alpha_lo, xmm_alpha_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
-
- xmm_mask = create_mask_16_128 (mask >> 24);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- uint32_t s = *src++;
-
- if (s)
- {
- uint32_t d = *dst;
-
- __m64 ms = unpack_32_1x64 (s);
- __m64 alpha = expand_alpha_1x64 (ms);
- __m64 dest = _mm_movepi64_pi64 (xmm_mask);
- __m64 alpha_dst = unpack_32_1x64 (d);
-
- *dst = pack_1x64_32 (
- in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
- }
- dst++;
- w--;
- }
-
- while (w >= 4)
- {
- xmm_src = load_128_unaligned ((__m128i*)src);
-
- if (!is_zero (xmm_src))
- {
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_mask, &xmm_mask,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
-
- dst += 4;
- src += 4;
- w -= 4;
- }
-
- while (w)
- {
- uint32_t s = *src++;
-
- if (s)
- {
- uint32_t d = *dst;
-
- __m64 ms = unpack_32_1x64 (s);
- __m64 alpha = expand_alpha_1x64 (ms);
- __m64 mask = _mm_movepi64_pi64 (xmm_mask);
- __m64 dest = unpack_32_1x64 (d);
-
- *dst = pack_1x64_32 (
- in_over_1x64 (&ms, &alpha, &mask, &dest));
- }
-
- dst++;
- w--;
- }
- }
-
- _mm_empty ();
-}
-
-/*---------------------------------------------------------------------
- * composite_over_8888_n_8888
- */
-
-static void
-sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src;
- int32_t w;
- int dst_stride, src_stride;
-
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- *dst++ = *src++ | 0xff000000;
- w--;
- }
-
- while (w >= 16)
- {
- __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
-
- xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
- xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
- xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
- xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
-
- save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
- save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
- save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
- save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
-
- dst += 16;
- src += 16;
- w -= 16;
- }
-
- while (w)
- {
- *dst++ = *src++ | 0xff000000;
- w--;
- }
- }
-
- _mm_empty ();
-}
-
-/* ---------------------------------------------------------------------
- * composite_over_x888_n_8888
- */
-static void
-sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src;
- uint32_t mask;
- int dst_stride, src_stride;
- int32_t w;
-
- __m128i xmm_mask, xmm_alpha;
- __m128i xmm_src, xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
-
- xmm_mask = create_mask_16_128 (mask >> 24);
- xmm_alpha = mask_00ff;
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- uint32_t s = (*src++) | 0xff000000;
- uint32_t d = *dst;
-
- __m64 src = unpack_32_1x64 (s);
- __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
- __m64 mask = _mm_movepi64_pi64 (xmm_mask);
- __m64 dest = unpack_32_1x64 (d);
-
- *dst++ = pack_1x64_32 (
- in_over_1x64 (&src, &alpha, &mask, &dest));
-
- w--;
- }
-
- while (w >= 4)
- {
- xmm_src = _mm_or_si128 (
- load_128_unaligned ((__m128i*)src), mask_ff000000);
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_alpha, &xmm_alpha,
- &xmm_mask, &xmm_mask,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- dst += 4;
- src += 4;
- w -= 4;
-
- }
-
- while (w)
- {
- uint32_t s = (*src++) | 0xff000000;
- uint32_t d = *dst;
-
- __m64 src = unpack_32_1x64 (s);
- __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
- __m64 mask = _mm_movepi64_pi64 (xmm_mask);
- __m64 dest = unpack_32_1x64 (d);
-
- *dst++ = pack_1x64_32 (
- in_over_1x64 (&src, &alpha, &mask, &dest));
-
- w--;
- }
- }
-
- _mm_empty ();
-}
-
-/* --------------------------------------------------------------------
- * composite_over_8888_8888
- */
-static void
-sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- int dst_stride, src_stride;
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- dst = dst_line;
- src = src_line;
-
- while (height--)
- {
- core_combine_over_u_sse2 (dst, src, NULL, width);
-
- dst += dst_stride;
- src += src_stride;
- }
- _mm_empty ();
-}
-
-/* ------------------------------------------------------------------
- * composite_over_8888_0565
- */
-static force_inline uint16_t
-composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
-{
- __m64 ms;
-
- ms = unpack_32_1x64 (src);
- return pack_565_32_16 (
- pack_1x64_32 (
- over_1x64 (
- ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
-}
-
-static void
-sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{ /* Composites a 32-bit-per-pixel source onto a 16-bit-per-pixel
- * destination (OVER onto r5g6b5, going by the function name), one row
- * at a time.  Each row is handled in three phases: single pixels until
- * dst is 16-byte aligned, then 8 pixels per iteration with SSE2, then
- * the leftover pixels one at a time.
- *
- * imp, op, mask_x and mask_y are not used; mask_image is only
- * referenced by the disabled assert below.
- */
- uint16_t *dst_line, *dst, d;
- uint32_t *src_line, *src, s;
- int dst_stride, src_stride;
- int32_t w;
-
- __m128i xmm_alpha_lo, xmm_alpha_hi;
- __m128i xmm_src, xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-#if 0
- /* FIXME
- *
- * This check was copied from the MMX implementation together with
- * its FIXME.  If it is a problem there, it is probably a problem
- * here as well.
- */
- assert (src_image->drawable == mask_image->drawable);
-#endif
-
- while (height--)
- {
- dst = dst_line;
- src = src_line;
-
- dst_line += dst_stride; /* step the row pointers to the next row */
- src_line += src_stride;
- w = width;
-
- /* Align dst on a 16-byte boundary */
- while (w &&
- ((unsigned long)dst & 15))
- {
- s = *src++;
- d = *dst;
-
- *dst++ = composite_over_8888_0565pixel (s, d);
- w--;
- }
-
- /* Main loop: 8 pixels per iteration (one 16-byte destination store) */
- while (w >= 8)
- {
- /* The source is loaded unaligned because only dst has been
- * aligned above; nothing is known about src's alignment.
- */
- xmm_src = load_128_unaligned ((__m128i*) src);
- xmm_dst = load_128_aligned ((__m128i*) dst);
-
- /* Unpacking */
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- unpack_565_128_4x128 (xmm_dst,
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- /* Load the next 4 source pixels early, before blending the
- * first 4, to optimize the memory read.
- */
- xmm_src = load_128_unaligned ((__m128i*) (src + 4));
-
- over_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_dst0, &xmm_dst1);
-
- /* Unpacking */
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- over_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_dst2, &xmm_dst3);
-
- save_128_aligned (
- (__m128i*)dst, pack_565_4x128_128 (
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
-
- w -= 8;
- dst += 8;
- src += 8;
- }
-
- while (w--) /* tail: fewer than 8 pixels remain */
- {
- s = *src++;
- d = *dst;
-
- *dst++ = composite_over_8888_0565pixel (s, d);
- }
- }
-
- _mm_empty (); /* leave MMX state clean after the __m64 helpers */
-}
-
-/* -----------------------------------------------------------------
- * composite_over_n_8_8888
- *
- * Blends a solid source colour through an 8-bit mask onto a 32-bit
- * destination.  The solid colour comes from _pixman_image_get_solid ();
- * when it is 0 the function returns without touching the destination.
- *
- * Each row is processed as: single pixels until dst is 16-byte aligned,
- * then 4 pixels per iteration with SSE2, then the remaining pixels one
- * at a time.  Pixels whose mask value is 0 are left unchanged.
- *
- * imp, op, src_x and src_y are not used.
- */
-
-static void
-sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint32_t *dst_line, *dst;
- uint8_t *mask_line, *mask;
- int dst_stride, mask_stride;
- int32_t w;
- uint32_t m, d;
-
- __m128i xmm_src, xmm_alpha, xmm_def;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
- __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- srca = src >> 24; /* top byte of the solid colour (its alpha) */
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- xmm_def = create_mask_2x32_128 (src, src); /* stored as-is on the fast path below */
- xmm_src = expand_pixel_32_1x128 (src);
- xmm_alpha = expand_alpha_1x128 (xmm_src);
- mmx_src = _mm_movepi64_pi64 (xmm_src); /* 64-bit copies for the single-pixel loops */
- mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15) /* head: until dst is 16-byte aligned */
- {
- uint8_t m = *mask++; /* note: shadows the outer uint32_t m */
-
- if (m)
- {
- d = *dst;
- mmx_mask = expand_pixel_8_1x64 (m);
- mmx_dest = unpack_32_1x64 (d);
-
- *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
- &mmx_alpha,
- &mmx_mask,
- &mmx_dest));
- }
-
- w--;
- dst++;
- }
-
- while (w >= 4) /* main loop: 4 pixels per iteration */
- {
- m = *((uint32_t*)mask); /* four mask bytes read as one 32-bit word */
-
- if (srca == 0xff && m == 0xffffffff)
- {
- /* source alpha and all four mask bytes are 0xff:
- * store the precomputed solid value without blending
- */
- save_128_aligned ((__m128i*)dst, xmm_def);
- }
- else if (m)
- {
- xmm_dst = load_128_aligned ((__m128i*) dst);
- xmm_mask = unpack_32_1x128 (m);
- xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
-
- /* Unpacking */
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- in_over_2x128 (&xmm_src, &xmm_src,
- &xmm_alpha, &xmm_alpha,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
-
- w -= 4;
- dst += 4;
- mask += 4;
- }
-
- while (w) /* tail: fewer than 4 pixels remain */
- {
- uint8_t m = *mask++; /* note: shadows the outer uint32_t m */
-
- if (m)
- {
- d = *dst;
- mmx_mask = expand_pixel_8_1x64 (m);
- mmx_dest = unpack_32_1x64 (d);
-
- *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
- &mmx_alpha,
- &mmx_mask,
- &mmx_dest));
- }
-
- w--;
- dst++;
- }
- }
-
- _mm_empty (); /* leave MMX state clean after the __m64 helpers */
-}
-
-/* ----------------------------------------------------------------
- * pixman_fill_sse2
- *
- * Fills a width x height rectangle at (x, y) with a constant value.
- *
- *   bits    start of the pixel buffer
- *   stride  row pitch of the buffer, counted in uint32_t units
- *   bpp     bits per pixel; only 8, 16 and 32 are handled
- *   data    fill value; for 8 and 16 bpp only the low 8 / 16 bits
- *           are used and replicated across the 32-bit word
- *
- * Returns FALSE without writing anything when bpp is not 8, 16 or 32,
- * TRUE otherwise.
- */
-
-pixman_bool_t
-pixman_fill_sse2 (uint32_t *bits,
-                  int       stride,
-                  int       bpp,
-                  int       x,
-                  int       y,
-                  int       width,
-                  int       height,
-                  uint32_t  data)
-{
-    uint32_t byte_width;
-    uint8_t *byte_line;
-
-    __m128i xmm_def;
-
-    /* Convert stride to bytes, locate the first byte of the rectangle
-     * and replicate the pixel value so that every byte of `data` (and
-     * of the 128-bit pattern built from it) holds valid pixel content.
-     */
-    if (bpp == 8)
-    {
-        stride = stride * (int) sizeof (uint32_t) / 1;
-        byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
-        byte_width = width;
-        stride *= 1;
-
-        /* Unsigned multiply replicates the byte into all four lanes.
-         * The previous (w << 16) | w form shifted a promoted int and
-         * overflowed it for byte values >= 0x80.
-         */
-        data = (data & 0xff) * 0x01010101;
-    }
-    else if (bpp == 16)
-    {
-        stride = stride * (int) sizeof (uint32_t) / 2;
-        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
-        byte_width = 2 * width;
-        stride *= 2;
-
-        data = (data & 0xffff) * 0x00010001;
-    }
-    else if (bpp == 32)
-    {
-        stride = stride * (int) sizeof (uint32_t) / 4;
-        byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
-        byte_width = 4 * width;
-        stride *= 4;
-    }
-    else
-    {
-        return FALSE;
-    }
-
-    xmm_def = create_mask_2x32_128 (data, data);
-
-    while (height--)
-    {
-        int w;
-        uint8_t *d = byte_line;
-        byte_line += stride;
-        w = byte_width;
-
-        /* Head: 1-, 2- and 4-byte stores until d is 16-byte aligned
-         * (or the row runs out).
-         */
-        while (w >= 1 && ((unsigned long)d & 1))
-        {
-            *(uint8_t *)d = data;
-            w -= 1;
-            d += 1;
-        }
-
-        while (w >= 2 && ((unsigned long)d & 3))
-        {
-            *(uint16_t *)d = data;
-            w -= 2;
-            d += 2;
-        }
-
-        while (w >= 4 && ((unsigned long)d & 15))
-        {
-            *(uint32_t *)d = data;
-
-            w -= 4;
-            d += 4;
-        }
-
-        /* Body: aligned 16-byte stores, 128 bytes per iteration ... */
-        while (w >= 128)
-        {
-            save_128_aligned ((__m128i*)(d), xmm_def);
-            save_128_aligned ((__m128i*)(d + 16), xmm_def);
-            save_128_aligned ((__m128i*)(d + 32), xmm_def);
-            save_128_aligned ((__m128i*)(d + 48), xmm_def);
-            save_128_aligned ((__m128i*)(d + 64), xmm_def);
-            save_128_aligned ((__m128i*)(d + 80), xmm_def);
-            save_128_aligned ((__m128i*)(d + 96), xmm_def);
-            save_128_aligned ((__m128i*)(d + 112), xmm_def);
-
-            d += 128;
-            w -= 128;
-        }
-
-        /* ... then at most one 64-, one 32- and one 16-byte chunk. */
-        if (w >= 64)
-        {
-            save_128_aligned ((__m128i*)(d), xmm_def);
-            save_128_aligned ((__m128i*)(d + 16), xmm_def);
-            save_128_aligned ((__m128i*)(d + 32), xmm_def);
-            save_128_aligned ((__m128i*)(d + 48), xmm_def);
-
-            d += 64;
-            w -= 64;
-        }
-
-        if (w >= 32)
-        {
-            save_128_aligned ((__m128i*)(d), xmm_def);
-            save_128_aligned ((__m128i*)(d + 16), xmm_def);
-
-            d += 32;
-            w -= 32;
-        }
-
-        if (w >= 16)
-        {
-            save_128_aligned ((__m128i*)(d), xmm_def);
-
-            d += 16;
-            w -= 16;
-        }
-
-        /* Tail: fewer than 16 bytes remain. */
-        while (w >= 4)
-        {
-            *(uint32_t *)d = data;
-
-            w -= 4;
-            d += 4;
-        }
-
-        if (w >= 2)
-        {
-            *(uint16_t *)d = data;
-            w -= 2;
-            d += 2;
-        }
-
-        if (w >= 1)
-        {
-            *(uint8_t *)d = data;
-            w -= 1;
-            d += 1;
-        }
-    }
-
-    _mm_empty ();
-    return TRUE;
-}
-
-static void
-sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{ /* Writes a solid source colour multiplied by an 8-bit mask into a
- * 32-bit destination.  The previous destination contents are never
- * read: a mask value of 0 stores 0.
- *
- * When the solid colour is 0 the whole rectangle is cleared through
- * pixman_fill_sse2 ().  Otherwise each row is processed as: single
- * pixels until dst is 16-byte aligned, 4 pixels per iteration with
- * SSE2, then the remaining pixels one at a time.
- *
- * imp, op, src_x and src_y are not used.
- */
- uint32_t src, srca;
- uint32_t *dst_line, *dst;
- uint8_t *mask_line, *mask;
- int dst_stride, mask_stride;
- int32_t w;
- uint32_t m;
-
- __m128i xmm_src, xmm_def;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- srca = src >> 24; /* top byte of the solid colour (its alpha) */
- if (src == 0)
- {
- pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
- PIXMAN_FORMAT_BPP (dst_image->bits.format),
- dest_x, dest_y, width, height, 0);
- return;
- }
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- xmm_def = create_mask_2x32_128 (src, src); /* stored as-is on the fast path below */
- xmm_src = expand_pixel_32_1x128 (src);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15) /* head: until dst is 16-byte aligned */
- {
- uint8_t m = *mask++; /* note: shadows the outer uint32_t m */
-
- if (m)
- {
- *dst = pack_1x64_32 (
- pix_multiply_1x64 (
- _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
- }
- else
- {
- *dst = 0;
- }
-
- w--;
- dst++;
- }
-
- while (w >= 4) /* main loop: 4 pixels per iteration */
- {
- m = *((uint32_t*)mask); /* four mask bytes read as one 32-bit word */
-
- if (srca == 0xff && m == 0xffffffff)
- {
- /* source alpha and all four mask bytes are 0xff:
- * store the precomputed solid value directly
- */
- save_128_aligned ((__m128i*)dst, xmm_def);
- }
- else if (m)
- {
- xmm_mask = unpack_32_1x128 (m);
- xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
-
- /* Unpacking */
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- pix_multiply_2x128 (&xmm_src, &xmm_src,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
- }
- else
- {
- /* all four mask bytes are 0: the result is 0 */
- save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
- }
-
- w -= 4;
- dst += 4;
- mask += 4;
- }
-
- while (w) /* tail: fewer than 4 pixels remain */
- {
- uint8_t m = *mask++; /* note: shadows the outer uint32_t m */
-
- if (m)
- {
- *dst = pack_1x64_32 (
- pix_multiply_1x64 (
- _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
- }
- else
- {
- *dst = 0;
- }
-
- w--;
- dst++;
- }
- }
-
- _mm_empty (); /* leave MMX state clean after the __m64 helpers */
-}
-
-/*-----------------------------------------------------------------------
- * composite_over_n_8_0565
- *
- * Blends a solid source colour through an 8-bit mask onto a 16-bit
- * destination.  The solid colour comes from _pixman_image_get_solid ();
- * when it is 0 the function returns without touching the destination.
- *
- * Each row is processed as: single pixels until dst is 16-byte aligned,
- * then 8 pixels per iteration with SSE2 (two groups of four mask bytes),
- * then the remaining pixels one at a time.  Pixels whose mask value is
- * 0 keep their destination value.
- *
- * imp, op, src_x and src_y are not used.
- */
-
-static void
-sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
-                              pixman_op_t              op,
-                              pixman_image_t *         src_image,
-                              pixman_image_t *         mask_image,
-                              pixman_image_t *         dst_image,
-                              int32_t                  src_x,
-                              int32_t                  src_y,
-                              int32_t                  mask_x,
-                              int32_t                  mask_y,
-                              int32_t                  dest_x,
-                              int32_t                  dest_y,
-                              int32_t                  width,
-                              int32_t                  height)
-{
-    uint32_t src;
-    uint16_t *dst_line, *dst, d;
-    uint8_t *mask_line, *mask;
-    int dst_stride, mask_stride;
-    int32_t w;
-    uint32_t m;
-    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
-
-    __m128i xmm_src, xmm_alpha;
-    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
-
-    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
-    if (src == 0)
-        return;
-
-    PIXMAN_IMAGE_GET_LINE (
-        dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
-    xmm_src = expand_pixel_32_1x128 (src);
-    xmm_alpha = expand_alpha_1x128 (xmm_src);
-    /* 64-bit copies for the single-pixel head and tail loops */
-    mmx_src = _mm_movepi64_pi64 (xmm_src);
-    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
-
-    while (height--)
-    {
-        dst = dst_line;
-        dst_line += dst_stride;
-        mask = mask_line;
-        mask_line += mask_stride;
-        w = width;
-
-        /* Head: one pixel at a time until dst is 16-byte aligned */
-        while (w && (unsigned long)dst & 15)
-        {
-            m = *mask++;
-
-            if (m)
-            {
-                d = *dst;
-                mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
-                mmx_dest = expand565_16_1x64 (d);
-
-                *dst = pack_565_32_16 (
-                    pack_1x64_32 (
-                        in_over_1x64 (
-                            &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
-            }
-
-            w--;
-            dst++;
-        }
-
-        /* Main loop: 8 destination pixels per iteration */
-        while (w >= 8)
-        {
-            xmm_dst = load_128_aligned ((__m128i*) dst);
-            unpack_565_128_4x128 (xmm_dst,
-                                  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
-
-            /* First four mask bytes -> pixels 0..3 (xmm_dst0/1) */
-            m = *((uint32_t*)mask);
-            mask += 4;
-
-            if (m)
-            {
-                xmm_mask = unpack_32_1x128 (m);
-                xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
-
-                /* Unpacking */
-                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
-                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
-                                        &xmm_mask_lo, &xmm_mask_hi);
-
-                in_over_2x128 (&xmm_src, &xmm_src,
-                               &xmm_alpha, &xmm_alpha,
-                               &xmm_mask_lo, &xmm_mask_hi,
-                               &xmm_dst0, &xmm_dst1);
-            }
-
-            /* Next four mask bytes -> pixels 4..7 (xmm_dst2/3) */
-            m = *((uint32_t*)mask);
-            mask += 4;
-
-            if (m)
-            {
-                xmm_mask = unpack_32_1x128 (m);
-                xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
-
-                /* Unpacking */
-                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
-                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
-                                        &xmm_mask_lo, &xmm_mask_hi);
-                in_over_2x128 (&xmm_src, &xmm_src,
-                               &xmm_alpha, &xmm_alpha,
-                               &xmm_mask_lo, &xmm_mask_hi,
-                               &xmm_dst2, &xmm_dst3);
-            }
-
-            /* The repacked pixels are stored even when both mask words
-             * were zero (the groups are then written back unblended).
-             */
-            save_128_aligned (
-                (__m128i*)dst, pack_565_4x128_128 (
-                    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
-
-            w -= 8;
-            dst += 8;
-        }
-
-        /* Tail: fewer than 8 pixels remain */
-        while (w)
-        {
-            m = *mask++;
-
-            if (m)
-            {
-                d = *dst;
-                mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
-                mmx_dest = expand565_16_1x64 (d);
-
-                *dst = pack_565_32_16 (
-                    pack_1x64_32 (
-                        in_over_1x64 (
-                            &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
-            }
-
-            w--;
-            dst++;
-        }
-    }
-
-    _mm_empty ();
-}
-
-/* -----------------------------------------------------------------------
- * composite_over_pixbuf_0565
- *
- * Composites a 32-bit source onto a 16-bit destination using the
- * over_rev_non_pre_* helpers (the names suggest a non-premultiplied
- * source with swapped colour channels -- confirm against the helper
- * definitions).
- *
- * Each row is processed as: single pixels until dst is 16-byte aligned,
- * then 8 pixels per iteration in two rounds of four, then the remaining
- * pixels one at a time.
- *
- * imp, op, mask_x and mask_y are not used; mask_image is only
- * referenced by the disabled assert.
- */
-
-static void
-sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint16_t *dst_line, *dst, d;
- uint32_t *src_line, *src, s;
- int dst_stride, src_stride;
- int32_t w;
- uint32_t opaque, zero;
-
- __m64 ms;
- __m128i xmm_src, xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-#if 0
- /* FIXME
- *
- * This check was copied from the MMX implementation together with
- * its FIXME.  If it is a problem there, it is probably a problem
- * here as well.
- */
- assert (src_image->drawable == mask_image->drawable);
-#endif
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15) /* head: until dst is 16-byte aligned */
- {
- s = *src++;
- d = *dst;
-
- ms = unpack_32_1x64 (s);
-
- *dst++ = pack_565_32_16 (
- pack_1x64_32 (
- over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
- w--;
- }
-
- while (w >= 8) /* main loop: 8 pixels per iteration */
- {
- /* First round */
- xmm_src = load_128_unaligned ((__m128i*)src);
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- opaque = is_opaque (xmm_src);
- zero = is_zero (xmm_src);
-
- unpack_565_128_4x128 (xmm_dst,
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-
- /* preload next round*/
- xmm_src = load_128_unaligned ((__m128i*)(src + 4));
-
- if (opaque)
- {
- /* is_opaque () reported true: no blend, only invert_colors */
- invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_dst0, &xmm_dst1);
- }
- else if (!zero)
- {
- over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_dst0, &xmm_dst1);
- }
- /* when is_zero () reported true, xmm_dst0/1 are left as loaded */
-
- /* Second round */
- opaque = is_opaque (xmm_src);
- zero = is_zero (xmm_src);
-
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-
- if (opaque)
- {
- invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_dst2, &xmm_dst3);
- }
- else if (!zero)
- {
- over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_dst2, &xmm_dst3);
- }
-
- save_128_aligned (
- (__m128i*)dst, pack_565_4x128_128 (
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
-
- w -= 8;
- src += 8;
- dst += 8;
- }
-
- while (w) /* tail: fewer than 8 pixels remain */
- {
- s = *src++;
- d = *dst;
-
- ms = unpack_32_1x64 (s);
-
- *dst++ = pack_565_32_16 (
- pack_1x64_32 (
- over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
- w--;
- }
- }
-
- _mm_empty (); /* leave MMX state clean after the __m64 helpers */
-}
-
-/* -------------------------------------------------------------------------
- * composite_over_pixbuf_8888
- *
- * Composites a 32-bit source onto a 32-bit destination using the
- * over_rev_non_pre_* helpers (the names suggest a non-premultiplied
- * source with swapped colour channels -- confirm against the helper
- * definitions).
- *
- * Each row is processed as: single pixels until dst is 16-byte aligned,
- * then 4 pixels per iteration with SSE2, then the remaining pixels one
- * at a time.
- *
- * imp, op, mask_x and mask_y are not used; mask_image is only
- * referenced by the disabled assert.
- */
-
-static void
-sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst, d;
- uint32_t *src_line, *src, s;
- int dst_stride, src_stride;
- int32_t w;
- uint32_t opaque, zero;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-#if 0
- /* FIXME
- *
- * This check was copied from the MMX implementation together with
- * its FIXME.  If it is a problem there, it is probably a problem
- * here as well.
- */
- assert (src_image->drawable == mask_image->drawable);
-#endif
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15) /* head: until dst is 16-byte aligned */
- {
- s = *src++;
- d = *dst;
-
- *dst++ = pack_1x64_32 (
- over_rev_non_pre_1x64 (
- unpack_32_1x64 (s), unpack_32_1x64 (d)));
-
- w--;
- }
-
- while (w >= 4) /* main loop: 4 pixels per iteration */
- {
- xmm_src_hi = load_128_unaligned ((__m128i*)src); /* xmm_src_hi doubles as the raw load */
-
- opaque = is_opaque (xmm_src_hi);
- zero = is_zero (xmm_src_hi);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-
- if (opaque)
- {
- /* is_opaque () reported true: the destination is not
- * read, only the invert_colors result is stored
- */
- invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
- else if (!zero)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
- /* when is_zero () reported true, nothing is written */
-
- w -= 4;
- dst += 4;
- src += 4;
- }
-
- while (w) /* tail: fewer than 4 pixels remain */
- {
- s = *src++;
- d = *dst;
-
- *dst++ = pack_1x64_32 (
- over_rev_non_pre_1x64 (
- unpack_32_1x64 (s), unpack_32_1x64 (d)));
-
- w--;
- }
- }
-
- _mm_empty (); /* leave MMX state clean after the __m64 helpers */
-}
-
-/* -------------------------------------------------------------------------------------------------
- * composite_over_n_8888_0565_ca
- *
- * Blends a solid source colour through a 32-bit-per-pixel mask onto a
- * 16-bit destination ("ca" presumably stands for component alpha, i.e.
- * one mask value per colour channel -- confirm against the caller).
- * When the solid colour is 0 the function returns immediately.
- *
- * Each row is processed as: single pixels until dst is 16-byte aligned,
- * then 8 pixels per iteration in two rounds of four, then the remaining
- * pixels one at a time.  Pixels whose mask word is 0 are not blended.
- *
- * imp, op, src_x and src_y are not used.
- */
-
-static void
-sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src;
- uint16_t *dst_line, *dst, d;
- uint32_t *mask_line, *mask, m;
- int dst_stride, mask_stride;
- int w;
- uint32_t pack_cmp;
-
- __m128i xmm_src, xmm_alpha;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
-
- __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
- xmm_src = expand_pixel_32_1x128 (src);
- xmm_alpha = expand_alpha_1x128 (xmm_src);
- mmx_src = _mm_movepi64_pi64 (xmm_src); /* 64-bit copies for the single-pixel loops */
- mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
-
- while (height--)
- {
- w = width;
- mask = mask_line;
- dst = dst_line;
- mask_line += mask_stride;
- dst_line += dst_stride;
-
- while (w && ((unsigned long)dst & 15)) /* head: until dst is 16-byte aligned */
- {
- m = *(uint32_t *) mask;
-
- if (m)
- {
- d = *dst;
- mmx_mask = unpack_32_1x64 (m);
- mmx_dest = expand565_16_1x64 (d);
-
- *dst = pack_565_32_16 (
- pack_1x64_32 (
- in_over_1x64 (
- &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
- }
-
- w--;
- dst++;
- mask++;
- }
-
- while (w >= 8) /* main loop: 8 pixels per iteration */
- {
- /* First round */
- xmm_mask = load_128_unaligned ((__m128i*)mask);
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- /* pack_cmp is 0xffff exactly when all four 32-bit mask
- * words compare equal to zero
- */
- pack_cmp = _mm_movemask_epi8 (
- _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
-
- unpack_565_128_4x128 (xmm_dst,
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
- /* preload next round */
- xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
-
- /* skip the blend when all four mask words are zero */
- if (pack_cmp != 0xffff)
- {
- in_over_2x128 (&xmm_src, &xmm_src,
- &xmm_alpha, &xmm_alpha,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst0, &xmm_dst1);
- }
-
- /* Second round */
- pack_cmp = _mm_movemask_epi8 (
- _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
-
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
- if (pack_cmp != 0xffff)
- {
- in_over_2x128 (&xmm_src, &xmm_src,
- &xmm_alpha, &xmm_alpha,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst2, &xmm_dst3);
- }
-
- save_128_aligned (
- (__m128i*)dst, pack_565_4x128_128 (
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
-
- w -= 8;
- dst += 8;
- mask += 8;
- }
-
- while (w) /* tail: fewer than 8 pixels remain */
- {
- m = *(uint32_t *) mask;
-
- if (m)
- {
- d = *dst;
- mmx_mask = unpack_32_1x64 (m);
- mmx_dest = expand565_16_1x64 (d);
-
- *dst = pack_565_32_16 (
- pack_1x64_32 (
- in_over_1x64 (
- &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
- }
-
- w--;
- dst++;
- mask++;
- }
- }
-
- _mm_empty (); /* leave MMX state clean after the __m64 helpers */
-}
-
-/* -----------------------------------------------------------------------
- * composite_in_n_8_8
- *
- * For every pixel of an 8-bit destination computes
- *     dst = (alpha * mask) * dst
- * with pix_multiply_* for both products, where alpha is taken from the
- * solid source (expand_alpha of the value returned by
- * _pixman_image_get_solid ()) and mask is an 8-bit mask image.
- *
- * Each row is processed as: single pixels until dst is 16-byte aligned,
- * then 16 pixels per iteration with SSE2, then the remaining pixels one
- * at a time.
- *
- * imp, op, src_x and src_y are not used.
- */
-
-static void
-sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         pixman_image_t *         src_image,
-                         pixman_image_t *         mask_image,
-                         pixman_image_t *         dst_image,
-                         int32_t                  src_x,
-                         int32_t                  src_y,
-                         int32_t                  mask_x,
-                         int32_t                  mask_y,
-                         int32_t                  dest_x,
-                         int32_t                  dest_y,
-                         int32_t                  width,
-                         int32_t                  height)
-{
-    uint8_t *dst_line, *dst;
-    uint8_t *mask_line, *mask;
-    int dst_stride, mask_stride;
-    uint32_t d, m;
-    uint32_t src;
-    int32_t w;
-
-    __m128i xmm_alpha;
-    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-
-    PIXMAN_IMAGE_GET_LINE (
-        dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
-    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
-    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
-
-    while (height--)
-    {
-        dst = dst_line;
-        dst_line += dst_stride;
-        mask = mask_line;
-        mask_line += mask_stride;
-        w = width;
-
-        /* Head: one pixel at a time until dst is 16-byte aligned */
-        while (w && ((unsigned long)dst & 15))
-        {
-            m = (uint32_t) *mask++;
-            d = (uint32_t) *dst;
-
-            *dst++ = (uint8_t) pack_1x64_32 (
-                pix_multiply_1x64 (
-                    pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
-                                       unpack_32_1x64 (m)),
-                    unpack_32_1x64 (d)));
-            w--;
-        }
-
-        /* Main loop: 16 pixels per iteration */
-        while (w >= 16)
-        {
-            xmm_mask = load_128_unaligned ((__m128i*)mask);
-            xmm_dst = load_128_aligned ((__m128i*)dst);
-
-            unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
-            /* mask = alpha * mask */
-            pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
-                                &xmm_mask_lo, &xmm_mask_hi,
-                                &xmm_mask_lo, &xmm_mask_hi);
-
-            /* dst = mask * dst */
-            pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
-                                &xmm_dst_lo, &xmm_dst_hi,
-                                &xmm_dst_lo, &xmm_dst_hi);
-
-            save_128_aligned (
-                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-            mask += 16;
-            dst += 16;
-            w -= 16;
-        }
-
-        /* Tail: fewer than 16 pixels remain */
-        while (w)
-        {
-            m = (uint32_t) *mask++;
-            d = (uint32_t) *dst;
-
-            *dst++ = (uint8_t) pack_1x64_32 (
-                pix_multiply_1x64 (
-                    pix_multiply_1x64 (
-                        _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
-                    unpack_32_1x64 (d)));
-            w--;
-        }
-    }
-
-    _mm_empty ();
-}
-
-/* -----------------------------------------------------------------------
- * composite_in_n_8
- *
- * Multiplies every pixel of an 8-bit destination by the alpha of a
- * solid source (the top byte of the value returned by
- * _pixman_image_get_solid ()).
- *
- * Shortcuts: alpha 0xff leaves the destination unchanged and returns;
- * alpha 0x00 fills the rectangle with 0 through pixman_fill ().
- * Otherwise each row is processed as: single pixels until dst is
- * 16-byte aligned, 16 pixels per iteration with SSE2, then the
- * remaining pixels one at a time.
- *
- * imp, op, mask_image, src_x, src_y, mask_x and mask_y are not used.
- */
-
-static void
-sse2_composite_in_n_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- int dst_stride;
- uint32_t d;
- uint32_t src;
- int32_t w;
-
- __m128i xmm_alpha;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
-
- src = src >> 24; /* from here on src holds only the alpha byte */
-
- if (src == 0xff)
- return;
-
- if (src == 0x00)
- {
- pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
- 8, dest_x, dest_y, width, height, src);
-
- return;
- }
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- w = width;
-
- while (w && ((unsigned long)dst & 15)) /* head: until dst is 16-byte aligned */
- {
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x64_32 (
- pix_multiply_1x64 (
- _mm_movepi64_pi64 (xmm_alpha),
- unpack_32_1x64 (d)));
- w--;
- }
-
- while (w >= 16) /* main loop: 16 pixels per iteration */
- {
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
- &xmm_dst_lo, &xmm_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- dst += 16;
- w -= 16;
- }
-
- while (w) /* tail: fewer than 16 pixels remain */
- {
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x64_32 (
- pix_multiply_1x64 (
- _mm_movepi64_pi64 (xmm_alpha),
- unpack_32_1x64 (d)));
- w--;
- }
- }
-
- _mm_empty (); /* leave MMX state clean after the __m64 helpers */
-}
-
-/* ---------------------------------------------------------------------------
- * composite_in_8_8
- *
- * Multiplies every pixel of an 8-bit destination by the corresponding
- * pixel of an 8-bit source (dst = src * dst via pix_multiply_*).
- *
- * Each row is processed as: single pixels until dst is 16-byte aligned,
- * then 16 pixels per iteration with SSE2, then the remaining pixels one
- * at a time.
- *
- * imp, op, mask_image, mask_x and mask_y are not used.
- */
-
-static void
-sse2_composite_in_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- uint8_t *src_line, *src;
- int src_stride, dst_stride;
- int32_t w;
- uint32_t s, d;
-
- __m128i xmm_src, xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w && ((unsigned long)dst & 15)) /* head: until dst is 16-byte aligned */
- {
- s = (uint32_t) *src++;
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (s), unpack_32_1x64 (d)));
- w--;
- }
-
- while (w >= 16) /* main loop: 16 pixels per iteration */
- {
- xmm_src = load_128_unaligned ((__m128i*)src); /* only dst was aligned above */
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_dst_lo, &xmm_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- src += 16;
- dst += 16;
- w -= 16;
- }
-
- while (w) /* tail: fewer than 16 pixels remain */
- {
- s = (uint32_t) *src++;
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x64_32 (
- pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
- w--;
- }
- }
-
- _mm_empty (); /* leave MMX state clean after the __m64 helpers */
-}
-
-/* -------------------------------------------------------------------------
- * composite_add_n_8_8
- *
- * For every pixel of an 8-bit destination computes
- *     dst = saturate ((alpha * mask) + dst)
- * where alpha is taken from the solid source (expand_alpha of the value
- * returned by _pixman_image_get_solid ()), mask is an 8-bit mask image,
- * the product uses pix_multiply_* and the sum uses the saturating
- * unsigned 16-bit adds _mm_adds_pu16 / _mm_adds_epu16 on the unpacked
- * values.
- *
- * Each row is processed as: single pixels until dst is 16-byte aligned,
- * then 16 pixels per iteration with SSE2, then the remaining pixels one
- * at a time.
- *
- * imp, op, src_x and src_y are not used.
- */
-
-static void
-sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
-                          pixman_op_t              op,
-                          pixman_image_t *         src_image,
-                          pixman_image_t *         mask_image,
-                          pixman_image_t *         dst_image,
-                          int32_t                  src_x,
-                          int32_t                  src_y,
-                          int32_t                  mask_x,
-                          int32_t                  mask_y,
-                          int32_t                  dest_x,
-                          int32_t                  dest_y,
-                          int32_t                  width,
-                          int32_t                  height)
-{
-    uint8_t *dst_line, *dst;
-    uint8_t *mask_line, *mask;
-    int dst_stride, mask_stride;
-    int32_t w;
-    uint32_t src;
-    uint32_t m, d;
-
-    __m128i xmm_alpha;
-    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-
-    PIXMAN_IMAGE_GET_LINE (
-        dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
-    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
-    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
-
-    while (height--)
-    {
-        dst = dst_line;
-        dst_line += dst_stride;
-        mask = mask_line;
-        mask_line += mask_stride;
-        w = width;
-
-        /* Head: one pixel at a time until dst is 16-byte aligned */
-        while (w && ((unsigned long)dst & 15))
-        {
-            m = (uint32_t) *mask++;
-            d = (uint32_t) *dst;
-
-            *dst++ = (uint8_t) pack_1x64_32 (
-                _mm_adds_pu16 (
-                    pix_multiply_1x64 (
-                        _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
-                    unpack_32_1x64 (d)));
-            w--;
-        }
-
-        /* Main loop: 16 pixels per iteration */
-        while (w >= 16)
-        {
-            xmm_mask = load_128_unaligned ((__m128i*)mask);
-            xmm_dst = load_128_aligned ((__m128i*)dst);
-
-            unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
-            /* mask = alpha * mask */
-            pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
-                                &xmm_mask_lo, &xmm_mask_hi,
-                                &xmm_mask_lo, &xmm_mask_hi);
-
-            /* dst = mask + dst, saturating */
-            xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
-            xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
-
-            save_128_aligned (
-                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-            mask += 16;
-            dst += 16;
-            w -= 16;
-        }
-
-        /* Tail: fewer than 16 pixels remain */
-        while (w)
-        {
-            m = (uint32_t) *mask++;
-            d = (uint32_t) *dst;
-
-            *dst++ = (uint8_t) pack_1x64_32 (
-                _mm_adds_pu16 (
-                    pix_multiply_1x64 (
-                        _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
-                    unpack_32_1x64 (d)));
-
-            w--;
-        }
-    }
-
-    _mm_empty ();
-}
-
-/* -------------------------------------------------------------------------
- * composite_add_n_8
- *
- * Adds the alpha of a solid source (the top byte of the value returned
- * by _pixman_image_get_solid ()) to every pixel of an 8-bit
- * destination, using saturating unsigned byte adds.
- *
- * Shortcuts: alpha 0x00 leaves the destination unchanged and returns;
- * alpha 0xff fills the rectangle with 0xff through pixman_fill ().
- * Otherwise each row is processed as: single pixels until dst is
- * 16-byte aligned, 16 pixels per iteration with SSE2, then the
- * remaining pixels one at a time.
- *
- * imp, op, mask_image, src_x, src_y, mask_x and mask_y are not used.
- */
-
-static void
-sse2_composite_add_n_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- int dst_stride;
- int32_t w;
- uint32_t src;
-
- __m128i xmm_src;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- src >>= 24; /* keep only the alpha byte */
-
- if (src == 0x00)
- return;
-
- if (src == 0xff)
- {
- pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
- 8, dest_x, dest_y, width, height, 0xff);
-
- return;
- }
-
- src = (src << 24) | (src << 16) | (src << 8) | src; /* replicate the byte into all four lanes */
- xmm_src = _mm_set_epi32 (src, src, src, src);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- w = width;
-
- while (w && ((unsigned long)dst & 15)) /* head: until dst is 16-byte aligned */
- {
- *dst = (uint8_t)_mm_cvtsi64_si32 (
- _mm_adds_pu8 (
- _mm_movepi64_pi64 (xmm_src),
- _mm_cvtsi32_si64 (*dst)));
-
- w--;
- dst++;
- }
-
- while (w >= 16) /* main loop: 16 pixels per iteration */
- {
- save_128_aligned (
- (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
-
- dst += 16;
- w -= 16;
- }
-
- while (w) /* tail: fewer than 16 pixels remain */
- {
- *dst = (uint8_t)_mm_cvtsi64_si32 (
- _mm_adds_pu8 (
- _mm_movepi64_pi64 (xmm_src),
- _mm_cvtsi32_si64 (*dst)));
-
- w--;
- dst++;
- }
- }
-
- _mm_empty (); /* leave MMX state clean after the __m64 intrinsics */
-}
-
-/* ----------------------------------------------------------------------
- * composite_add_8_8
- *
- * Saturating add of an 8-bit source onto an 8-bit destination.
- *
- * Each row is processed as: single bytes until dst is 4-byte aligned,
- * then whole 32-bit words (w >> 2 of them) through
- * core_combine_add_u_sse2 (), then the last w & 3 bytes one at a time.
- *
- * imp, op, mask_image, mask_x and mask_y are not used.
- */
-
-static void
-sse2_composite_add_8_8 (pixman_implementation_t *imp,
-                        pixman_op_t              op,
-                        pixman_image_t *         src_image,
-                        pixman_image_t *         mask_image,
-                        pixman_image_t *         dst_image,
-                        int32_t                  src_x,
-                        int32_t                  src_y,
-                        int32_t                  mask_x,
-                        int32_t                  mask_y,
-                        int32_t                  dest_x,
-                        int32_t                  dest_y,
-                        int32_t                  width,
-                        int32_t                  height)
-{
-    uint8_t *dst_line, *dst;
-    uint8_t *src_line, *src;
-    int dst_stride, src_stride;
-    int32_t w;
-    uint16_t t;
-
-    PIXMAN_IMAGE_GET_LINE (
-        src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-        dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-
-    while (height--)
-    {
-        dst = dst_line;
-        src = src_line;
-
-        dst_line += dst_stride;
-        src_line += src_stride;
-        w = width;
-
-        /* Small head */
-        while (w && (unsigned long)dst & 3)
-        {
-            /* t is at most 510; when the sum overflows a byte,
-             * t >> 8 is 1, 0 - 1 sets every bit and the stored
-             * byte becomes 0xff.
-             */
-            t = (*dst) + (*src++);
-            *dst++ = t | (0 - (t >> 8));
-            w--;
-        }
-
-        core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
-
-        /* Small tail: skip the bytes handled above.  The mask must
-         * keep every bit above the low two; the former 0xfffc dropped
-         * bits 16 and up, so rows wider than 65535 pixels resumed at
-         * the wrong address.
-         */
-        dst += w & ~3;
-        src += w & ~3;
-
-        w &= 3;
-
-        while (w)
-        {
-            t = (*dst) + (*src++);
-            *dst++ = t | (0 - (t >> 8));
-            w--;
-        }
-    }
-
-    _mm_empty ();
-}
-
-/* ---------------------------------------------------------------------
- * composite_add_8888_8888
- *
- * Row driver for adding a 32-bit source onto a 32-bit destination:
- * every row of `width` pixels is handed to core_combine_add_u_sse2 ()
- * with no mask.
- *
- * imp, op, mask_image, mask_x and mask_y are not used.
- */
-static void
-sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
-                              pixman_op_t              op,
-                              pixman_image_t *         src_image,
-                              pixman_image_t *         mask_image,
-                              pixman_image_t *         dst_image,
-                              int32_t                  src_x,
-                              int32_t                  src_y,
-                              int32_t                  mask_x,
-                              int32_t                  mask_y,
-                              int32_t                  dest_x,
-                              int32_t                  dest_y,
-                              int32_t                  width,
-                              int32_t                  height)
-{
-    uint32_t *src_row, *dst_row;
-    int src_pitch, dst_pitch;
-
-    PIXMAN_IMAGE_GET_LINE (
-        src_image, src_x, src_y, uint32_t, src_pitch, src_row, 1);
-    PIXMAN_IMAGE_GET_LINE (
-        dst_image, dest_x, dest_y, uint32_t, dst_pitch, dst_row, 1);
-
-    /* One call per row; the row pointers advance after each call. */
-    for (; height--; dst_row += dst_pitch, src_row += src_pitch)
-        core_combine_add_u_sse2 (dst_row, src_row, NULL, width);
-
-    _mm_empty ();
-}
-
-/* -------------------------------------------------------------------------------------------------
- * sse2_composite_copy_area
- */
-
-static pixman_bool_t
-pixman_blt_sse2 (uint32_t *src_bits,
- uint32_t *dst_bits,
- int src_stride,
- int dst_stride,
- int src_bpp,
- int dst_bpp,
- int src_x,
- int src_y,
- int dst_x,
- int dst_y,
- int width,
- int height)
-{
- uint8_t * src_bytes;
- uint8_t * dst_bytes;
- int byte_width;
-
- if (src_bpp != dst_bpp)
- return FALSE;
-
- if (src_bpp == 16)
- {
- src_stride = src_stride * (int) sizeof (uint32_t) / 2;
- dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
- src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
- dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
- byte_width = 2 * width;
- src_stride *= 2;
- dst_stride *= 2;
- }
- else if (src_bpp == 32)
- {
- src_stride = src_stride * (int) sizeof (uint32_t) / 4;
- dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
- src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
- dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
- byte_width = 4 * width;
- src_stride *= 4;
- dst_stride *= 4;
- }
- else
- {
- return FALSE;
- }
-
- while (height--)
- {
- int w;
- uint8_t *s = src_bytes;
- uint8_t *d = dst_bytes;
- src_bytes += src_stride;
- dst_bytes += dst_stride;
- w = byte_width;
-
- while (w >= 2 && ((unsigned long)d & 3))
- {
- *(uint16_t *)d = *(uint16_t *)s;
- w -= 2;
- s += 2;
- d += 2;
- }
-
- while (w >= 4 && ((unsigned long)d & 15))
- {
- *(uint32_t *)d = *(uint32_t *)s;
-
- w -= 4;
- s += 4;
- d += 4;
- }
-
- while (w >= 64)
- {
- __m128i xmm0, xmm1, xmm2, xmm3;
-
- xmm0 = load_128_unaligned ((__m128i*)(s));
- xmm1 = load_128_unaligned ((__m128i*)(s + 16));
- xmm2 = load_128_unaligned ((__m128i*)(s + 32));
- xmm3 = load_128_unaligned ((__m128i*)(s + 48));
-
- save_128_aligned ((__m128i*)(d), xmm0);
- save_128_aligned ((__m128i*)(d + 16), xmm1);
- save_128_aligned ((__m128i*)(d + 32), xmm2);
- save_128_aligned ((__m128i*)(d + 48), xmm3);
-
- s += 64;
- d += 64;
- w -= 64;
- }
-
- while (w >= 16)
- {
- save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
-
- w -= 16;
- d += 16;
- s += 16;
- }
-
- while (w >= 4)
- {
- *(uint32_t *)d = *(uint32_t *)s;
-
- w -= 4;
- s += 4;
- d += 4;
- }
-
- if (w >= 2)
- {
- *(uint16_t *)d = *(uint16_t *)s;
- w -= 2;
- s += 2;
- d += 2;
- }
- }
-
- _mm_empty ();
-
- return TRUE;
-}
-
-static void
-sse2_composite_copy_area (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- pixman_blt_sse2 (src_image->bits.bits,
- dst_image->bits.bits,
- src_image->bits.rowstride,
- dst_image->bits.rowstride,
- PIXMAN_FORMAT_BPP (src_image->bits.format),
- PIXMAN_FORMAT_BPP (dst_image->bits.format),
- src_x, src_y, dest_x, dest_y, width, height);
-}
-
-static void
-sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *src, *src_line, s;
- uint32_t *dst, *dst_line, d;
- uint8_t *mask, *mask_line;
- uint32_t m;
- int src_stride, mask_stride, dst_stride;
- int32_t w;
- __m64 ms;
-
- __m128i xmm_src, xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- src = src_line;
- src_line += src_stride;
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
-
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- s = 0xff000000 | *src++;
- m = (uint32_t) *mask++;
- d = *dst;
- ms = unpack_32_1x64 (s);
-
- if (m != 0xff)
- {
- __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
- __m64 md = unpack_32_1x64 (d);
-
- ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
- }
-
- *dst++ = pack_1x64_32 (ms);
- w--;
- }
-
- while (w >= 4)
- {
- m = *(uint32_t*) mask;
- xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
-
- if (m == 0xffffffff)
- {
- save_128_aligned ((__m128i*)dst, xmm_src);
- }
- else
- {
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
-
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
-
- src += 4;
- dst += 4;
- mask += 4;
- w -= 4;
- }
-
- while (w)
- {
- m = (uint32_t) *mask++;
-
- if (m)
- {
- s = 0xff000000 | *src;
-
- if (m == 0xff)
- {
- *dst = s;
- }
- else
- {
- __m64 ma, md, ms;
-
- d = *dst;
-
- ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
- md = unpack_32_1x64 (d);
- ms = unpack_32_1x64 (s);
-
- *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
- }
-
- }
-
- src++;
- dst++;
- w--;
- }
- }
-
- _mm_empty ();
-}
-
-static void
-sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *src, *src_line, s;
- uint32_t *dst, *dst_line, d;
- uint8_t *mask, *mask_line;
- uint32_t m;
- int src_stride, mask_stride, dst_stride;
- int32_t w;
-
- __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- src = src_line;
- src_line += src_stride;
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
-
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- uint32_t sa;
-
- s = *src++;
- m = (uint32_t) *mask++;
- d = *dst;
-
- sa = s >> 24;
-
- if (m)
- {
- if (sa == 0xff && m == 0xff)
- {
- *dst = s;
- }
- else
- {
- __m64 ms, md, ma, msa;
-
- ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
- ms = unpack_32_1x64 (s);
- md = unpack_32_1x64 (d);
-
- msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
-
- *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
- }
- }
-
- dst++;
- w--;
- }
-
- while (w >= 4)
- {
- m = *(uint32_t *) mask;
-
- if (m)
- {
- xmm_src = load_128_unaligned ((__m128i*)src);
-
- if (m == 0xffffffff && is_opaque (xmm_src))
- {
- save_128_aligned ((__m128i *)dst, xmm_src);
- }
- else
- {
- xmm_dst = load_128_aligned ((__m128i *)dst);
-
- xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
-
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
- expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
- &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
- }
-
- src += 4;
- dst += 4;
- mask += 4;
- w -= 4;
- }
-
- while (w)
- {
- uint32_t sa;
-
- s = *src++;
- m = (uint32_t) *mask++;
- d = *dst;
-
- sa = s >> 24;
-
- if (m)
- {
- if (sa == 0xff && m == 0xff)
- {
- *dst = s;
- }
- else
- {
- __m64 ms, md, ma, msa;
-
- ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
- ms = unpack_32_1x64 (s);
- md = unpack_32_1x64 (d);
-
- msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
-
- *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
- }
- }
-
- dst++;
- w--;
- }
- }
-
- _mm_empty ();
-}
-
-static void
-sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src;
- uint32_t *dst_line, *dst;
- __m128i xmm_src;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_dsta_hi, xmm_dsta_lo;
- int dst_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-
- xmm_src = expand_pixel_32_1x128 (src);
-
- while (height--)
- {
- dst = dst_line;
-
- dst_line += dst_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- __m64 vd;
-
- vd = unpack_32_1x64 (*dst);
-
- *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
- _mm_movepi64_pi64 (xmm_src)));
- w--;
- dst++;
- }
-
- while (w >= 4)
- {
- __m128i tmp_lo, tmp_hi;
-
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
-
- tmp_lo = xmm_src;
- tmp_hi = xmm_src;
-
- over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_dsta_lo, &xmm_dsta_hi,
- &tmp_lo, &tmp_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
-
- w -= 4;
- dst += 4;
- }
-
- while (w)
- {
- __m64 vd;
-
- vd = unpack_32_1x64 (*dst);
-
- *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
- _mm_movepi64_pi64 (xmm_src)));
- w--;
- dst++;
- }
-
- }
-
- _mm_empty ();
-}
-
-static void
-sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *src, *src_line, s;
- uint32_t *dst, *dst_line, d;
- uint32_t *mask, *mask_line;
- uint32_t m;
- int src_stride, mask_stride, dst_stride;
- int32_t w;
-
- __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- src = src_line;
- src_line += src_stride;
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
-
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- uint32_t sa;
-
- s = *src++;
- m = (*mask++) >> 24;
- d = *dst;
-
- sa = s >> 24;
-
- if (m)
- {
- if (sa == 0xff && m == 0xff)
- {
- *dst = s;
- }
- else
- {
- __m64 ms, md, ma, msa;
-
- ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
- ms = unpack_32_1x64 (s);
- md = unpack_32_1x64 (d);
-
- msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
-
- *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
- }
- }
-
- dst++;
- w--;
- }
-
- while (w >= 4)
- {
- xmm_mask = load_128_unaligned ((__m128i*)mask);
-
- if (!is_transparent (xmm_mask))
- {
- xmm_src = load_128_unaligned ((__m128i*)src);
-
- if (is_opaque (xmm_mask) && is_opaque (xmm_src))
- {
- save_128_aligned ((__m128i *)dst, xmm_src);
- }
- else
- {
- xmm_dst = load_128_aligned ((__m128i *)dst);
-
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
- expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
- &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
- }
-
- src += 4;
- dst += 4;
- mask += 4;
- w -= 4;
- }
-
- while (w)
- {
- uint32_t sa;
-
- s = *src++;
- m = (*mask++) >> 24;
- d = *dst;
-
- sa = s >> 24;
-
- if (m)
- {
- if (sa == 0xff && m == 0xff)
- {
- *dst = s;
- }
- else
- {
- __m64 ms, md, ma, msa;
-
- ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
- ms = unpack_32_1x64 (s);
- md = unpack_32_1x64 (d);
-
- msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
-
- *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
- }
- }
-
- dst++;
- w--;
- }
- }
-
- _mm_empty ();
-}
-
-/* A variant of 'core_combine_over_u_sse2' with minor tweaks */
-static force_inline void
-scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
- const uint32_t* ps,
- int32_t w,
- pixman_fixed_t vx,
- pixman_fixed_t unit_x,
- pixman_fixed_t max_vx)
-{
- uint32_t s, d;
- const uint32_t* pm = NULL;
-
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_alpha_lo, xmm_alpha_hi;
-
- /* Align dst on a 16-byte boundary */
- while (w && ((unsigned long)pd & 15))
- {
- d = *pd;
- s = combine1 (ps + (vx >> 16), pm);
- vx += unit_x;
-
- *pd++ = core_combine_over_u_pixel_sse2 (s, d);
- if (pm)
- pm++;
- w--;
- }
-
- while (w >= 4)
- {
- __m128i tmp;
- uint32_t tmp1, tmp2, tmp3, tmp4;
-
- tmp1 = ps[vx >> 16];
- vx += unit_x;
- tmp2 = ps[vx >> 16];
- vx += unit_x;
- tmp3 = ps[vx >> 16];
- vx += unit_x;
- tmp4 = ps[vx >> 16];
- vx += unit_x;
-
- tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
-
- xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
-
- if (is_opaque (xmm_src_hi))
- {
- save_128_aligned ((__m128i*)pd, xmm_src_hi);
- }
- else if (!is_zero (xmm_src_hi))
- {
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (
- xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
-
- over_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- /* rebuid the 4 pixel data and save*/
- save_128_aligned ((__m128i*)pd,
- pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
-
- w -= 4;
- pd += 4;
- if (pm)
- pm += 4;
- }
-
- while (w)
- {
- d = *pd;
- s = combine1 (ps + (vx >> 16), pm);
- vx += unit_x;
-
- *pd++ = core_combine_over_u_pixel_sse2 (s, d);
- if (pm)
- pm++;
-
- w--;
- }
- _mm_empty ();
-}
-
-FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
- scaled_nearest_scanline_sse2_8888_8888_OVER,
- uint32_t, uint32_t, COVER);
-FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
- scaled_nearest_scanline_sse2_8888_8888_OVER,
- uint32_t, uint32_t, NONE);
-FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
- scaled_nearest_scanline_sse2_8888_8888_OVER,
- uint32_t, uint32_t, PAD);
-
-static const pixman_fast_path_t sse2_fast_paths[] =
-{
- /* PIXMAN_OP_OVER */
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
- PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
- PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
- PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
- PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
- PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
- PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
- PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
- PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
- PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
-
- /* PIXMAN_OP_OVER_REVERSE */
- PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
- PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
-
- /* PIXMAN_OP_ADD */
- PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
- PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
- PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
- PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
- PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
-
- /* PIXMAN_OP_SRC */
- PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
- PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
- PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
- PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
- PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
- PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
- PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
- PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
- PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
- PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
- PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
- PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
- PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
- PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
-
- /* PIXMAN_OP_IN */
- PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
- PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
- PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
-
- SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
-
- { PIXMAN_OP_NONE },
-};
-
-static pixman_bool_t
-sse2_blt (pixman_implementation_t *imp,
- uint32_t * src_bits,
- uint32_t * dst_bits,
- int src_stride,
- int dst_stride,
- int src_bpp,
- int dst_bpp,
- int src_x,
- int src_y,
- int dst_x,
- int dst_y,
- int width,
- int height)
-{
- if (!pixman_blt_sse2 (
- src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
- src_x, src_y, dst_x, dst_y, width, height))
-
- {
- return _pixman_implementation_blt (
- imp->delegate,
- src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
- src_x, src_y, dst_x, dst_y, width, height);
- }
-
- return TRUE;
-}
-
-#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
-__attribute__((__force_align_arg_pointer__))
-#endif
-static pixman_bool_t
-sse2_fill (pixman_implementation_t *imp,
- uint32_t * bits,
- int stride,
- int bpp,
- int x,
- int y,
- int width,
- int height,
- uint32_t xor)
-{
- if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
- {
- return _pixman_implementation_fill (
- imp->delegate, bits, stride, bpp, x, y, width, height, xor);
- }
-
- return TRUE;
-}
-
-#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
-__attribute__((__force_align_arg_pointer__))
-#endif
-pixman_implementation_t *
-_pixman_implementation_create_sse2 (void)
-{
-#ifdef USE_MMX
- pixman_implementation_t *fallback = _pixman_implementation_create_mmx ();
-#else
- pixman_implementation_t *fallback = _pixman_implementation_create_fast_path ();
-#endif
- pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
-
- /* SSE2 constants */
- mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
- mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
- mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
- mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
- mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
- mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
- mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
- mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
- mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
- mask_0080 = create_mask_16_128 (0x0080);
- mask_00ff = create_mask_16_128 (0x00ff);
- mask_0101 = create_mask_16_128 (0x0101);
- mask_ffff = create_mask_16_128 (0xffff);
- mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
- mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
-
- /* MMX constants */
- mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
- mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
-
- mask_x0080 = create_mask_16_64 (0x0080);
- mask_x00ff = create_mask_16_64 (0x00ff);
- mask_x0101 = create_mask_16_64 (0x0101);
- mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
-
- _mm_empty ();
-
- /* Set up function pointers */
-
- /* SSE code patch for fbcompose.c */
- imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
- imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
- imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
- imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
- imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
- imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
- imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
- imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
- imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
- imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
-
- imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
-
- imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
- imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
- imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
- imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
- imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
- imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
- imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
- imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
- imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
- imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
- imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
-
- imp->blt = sse2_blt;
- imp->fill = sse2_fill;
-
- return imp;
-}
-
-#endif /* USE_SSE2 */
+/* + * Copyright © 2008 Rodrigo Kumpera + * Copyright © 2008 André Tupinambá + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Red Hat not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. Red Hat makes no representations about the + * suitability of this software for any purpose. It is provided "as is" + * without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + * Author: Rodrigo Kumpera (kumpera@gmail.com) + * André Tupinambá (andrelrt@gmail.com) + * + * Based on work by Owen Taylor and Søren Sandmann + */ +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <mmintrin.h> +#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */ +#include <emmintrin.h> /* for SSE2 intrinsics */ +#include "pixman-private.h" +#include "pixman-combine32.h" +#include "pixman-fast-path.h" + +#if defined(_MSC_VER) && defined(_M_AMD64) +/* Windows 64 doesn't allow MMX to be used, so + * the pixman-x64-mmx-emulation.h file contains + * implementations of those MMX intrinsics that + * are used in the SSE2 implementation. 
+ */ +# include "pixman-x64-mmx-emulation.h" +#endif + +#ifdef USE_SSE2 + +/* -------------------------------------------------------------------- + * Locals + */ + +static __m64 mask_x0080; +static __m64 mask_x00ff; +static __m64 mask_x0101; +static __m64 mask_x_alpha; + +static __m64 mask_x565_rgb; +static __m64 mask_x565_unpack; + +static __m128i mask_0080; +static __m128i mask_00ff; +static __m128i mask_0101; +static __m128i mask_ffff; +static __m128i mask_ff000000; +static __m128i mask_alpha; + +static __m128i mask_565_r; +static __m128i mask_565_g1, mask_565_g2; +static __m128i mask_565_b; +static __m128i mask_red; +static __m128i mask_green; +static __m128i mask_blue; + +static __m128i mask_565_fix_rb; +static __m128i mask_565_fix_g; + +/* ---------------------------------------------------------------------- + * SSE2 Inlines + */ +static force_inline __m128i +unpack_32_1x128 (uint32_t data) +{ + return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ()); +} + +static force_inline void +unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi) +{ + *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ()); + *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ()); +} + +static force_inline __m128i +unpack_565_to_8888 (__m128i lo) +{ + __m128i r, g, b, rb, t; + + r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red); + g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green); + b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue); + + rb = _mm_or_si128 (r, b); + t = _mm_and_si128 (rb, mask_565_fix_rb); + t = _mm_srli_epi32 (t, 5); + rb = _mm_or_si128 (rb, t); + + t = _mm_and_si128 (g, mask_565_fix_g); + t = _mm_srli_epi32 (t, 6); + g = _mm_or_si128 (g, t); + + return _mm_or_si128 (rb, g); +} + +static force_inline void +unpack_565_128_4x128 (__m128i data, + __m128i* data0, + __m128i* data1, + __m128i* data2, + __m128i* data3) +{ + __m128i lo, hi; + + lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ()); + hi = _mm_unpackhi_epi16 
(data, _mm_setzero_si128 ()); + + lo = unpack_565_to_8888 (lo); + hi = unpack_565_to_8888 (hi); + + unpack_128_2x128 (lo, data0, data1); + unpack_128_2x128 (hi, data2, data3); +} + +static force_inline uint16_t +pack_565_32_16 (uint32_t pixel) +{ + return (uint16_t) (((pixel >> 8) & 0xf800) | + ((pixel >> 5) & 0x07e0) | + ((pixel >> 3) & 0x001f)); +} + +static force_inline __m128i +pack_2x128_128 (__m128i lo, __m128i hi) +{ + return _mm_packus_epi16 (lo, hi); +} + +static force_inline __m128i +pack_565_2x128_128 (__m128i lo, __m128i hi) +{ + __m128i data; + __m128i r, g1, g2, b; + + data = pack_2x128_128 (lo, hi); + + r = _mm_and_si128 (data, mask_565_r); + g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1); + g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2); + b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b); + + return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b); +} + +static force_inline __m128i +pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3) +{ + return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1), + pack_565_2x128_128 (*xmm2, *xmm3)); +} + +static force_inline int +is_opaque (__m128i x) +{ + __m128i ffs = _mm_cmpeq_epi8 (x, x); + + return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888; +} + +static force_inline int +is_zero (__m128i x) +{ + return _mm_movemask_epi8 ( + _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff; +} + +static force_inline int +is_transparent (__m128i x) +{ + return (_mm_movemask_epi8 ( + _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888; +} + +static force_inline __m128i +expand_pixel_32_1x128 (uint32_t data) +{ + return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0)); +} + +static force_inline __m128i +expand_alpha_1x128 (__m128i data) +{ + return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, + _MM_SHUFFLE (3, 3, 3, 3)), + _MM_SHUFFLE (3, 3, 3, 3)); +} + +static force_inline void +expand_alpha_2x128 
(__m128i data_lo, + __m128i data_hi, + __m128i* alpha_lo, + __m128i* alpha_hi) +{ + __m128i lo, hi; + + lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3)); + hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3)); + + *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3)); + *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3)); +} + +static force_inline void +expand_alpha_rev_2x128 (__m128i data_lo, + __m128i data_hi, + __m128i* alpha_lo, + __m128i* alpha_hi) +{ + __m128i lo, hi; + + lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0)); + hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0)); + *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0)); + *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0)); +} + +static force_inline void +pix_multiply_2x128 (__m128i* data_lo, + __m128i* data_hi, + __m128i* alpha_lo, + __m128i* alpha_hi, + __m128i* ret_lo, + __m128i* ret_hi) +{ + __m128i lo, hi; + + lo = _mm_mullo_epi16 (*data_lo, *alpha_lo); + hi = _mm_mullo_epi16 (*data_hi, *alpha_hi); + lo = _mm_adds_epu16 (lo, mask_0080); + hi = _mm_adds_epu16 (hi, mask_0080); + *ret_lo = _mm_mulhi_epu16 (lo, mask_0101); + *ret_hi = _mm_mulhi_epu16 (hi, mask_0101); +} + +static force_inline void +pix_add_multiply_2x128 (__m128i* src_lo, + __m128i* src_hi, + __m128i* alpha_dst_lo, + __m128i* alpha_dst_hi, + __m128i* dst_lo, + __m128i* dst_hi, + __m128i* alpha_src_lo, + __m128i* alpha_src_hi, + __m128i* ret_lo, + __m128i* ret_hi) +{ + __m128i t1_lo, t1_hi; + __m128i t2_lo, t2_hi; + + pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi); + pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi); + + *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo); + *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi); +} + +static force_inline void +negate_2x128 (__m128i data_lo, + __m128i data_hi, + __m128i* neg_lo, + __m128i* neg_hi) +{ + *neg_lo = _mm_xor_si128 (data_lo, mask_00ff); + *neg_hi = 
_mm_xor_si128 (data_hi, mask_00ff); +} + +static force_inline void +invert_colors_2x128 (__m128i data_lo, + __m128i data_hi, + __m128i* inv_lo, + __m128i* inv_hi) +{ + __m128i lo, hi; + + lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2)); + hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2)); + *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2)); + *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2)); +} + +static force_inline void +over_2x128 (__m128i* src_lo, + __m128i* src_hi, + __m128i* alpha_lo, + __m128i* alpha_hi, + __m128i* dst_lo, + __m128i* dst_hi) +{ + __m128i t1, t2; + + negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2); + + pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi); + + *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo); + *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi); +} + +static force_inline void +over_rev_non_pre_2x128 (__m128i src_lo, + __m128i src_hi, + __m128i* dst_lo, + __m128i* dst_hi) +{ + __m128i lo, hi; + __m128i alpha_lo, alpha_hi; + + expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi); + + lo = _mm_or_si128 (alpha_lo, mask_alpha); + hi = _mm_or_si128 (alpha_hi, mask_alpha); + + invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi); + + pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi); + + over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi); +} + +static force_inline void +in_over_2x128 (__m128i* src_lo, + __m128i* src_hi, + __m128i* alpha_lo, + __m128i* alpha_hi, + __m128i* mask_lo, + __m128i* mask_hi, + __m128i* dst_lo, + __m128i* dst_hi) +{ + __m128i s_lo, s_hi; + __m128i a_lo, a_hi; + + pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi); + pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi); + + over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi); +} + +/* load 4 pixels from a 16-byte boundary aligned address */ +static force_inline __m128i +load_128_aligned (__m128i* src) +{ + return _mm_load_si128 (src); +} + +/* load 4 pixels 
from a unaligned address */ +static force_inline __m128i +load_128_unaligned (const __m128i* src) +{ + return _mm_loadu_si128 (src); +} + +/* save 4 pixels using Write Combining memory on a 16-byte + * boundary aligned address + */ +static force_inline void +save_128_write_combining (__m128i* dst, + __m128i data) +{ + _mm_stream_si128 (dst, data); +} + +/* save 4 pixels on a 16-byte boundary aligned address */ +static force_inline void +save_128_aligned (__m128i* dst, + __m128i data) +{ + _mm_store_si128 (dst, data); +} + +/* save 4 pixels on an unaligned address */ +static force_inline void +save_128_unaligned (__m128i* dst, + __m128i data) +{ + _mm_storeu_si128 (dst, data); +} + +/* ------------------------------------------------------------------ + * MMX inlines + */ +/* place a 32-bit value in the low half of a 64-bit MMX register */ +static force_inline __m64 +load_32_1x64 (uint32_t data) +{ + return _mm_cvtsi32_si64 (data); +} +/* interleave the 4 bytes of data with zero bytes: one byte per 16-bit lane */ +static force_inline __m64 +unpack_32_1x64 (uint32_t data) +{ + return _mm_unpacklo_pi8 (load_32_1x64 (data), _mm_setzero_si64 ()); +} +/* broadcast 16-bit lane 3 (the top byte of a pixel unpacked by + * unpack_32_1x64) to all four lanes + */ +static force_inline __m64 +expand_alpha_1x64 (__m64 data) +{ + return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3)); +} +/* broadcast 16-bit lane 0 to all four lanes */ +static force_inline __m64 +expand_alpha_rev_1x64 (__m64 data) +{ + return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0)); +} +/* zero-extend an 8-bit value and broadcast it to all four 16-bit lanes */ +static force_inline __m64 +expand_pixel_8_1x64 (uint8_t data) +{ + return _mm_shuffle_pi16 ( + unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0)); +} +/* per 16-bit lane: mulhi ((data * alpha) + mask_x0080, mask_x0101). + * NOTE(review): both masks are defined outside this view; presumably + * 0x0080 and 0x0101 per lane, which would make this a rounded + * data * alpha / 255 - confirm. + */ +static force_inline __m64 +pix_multiply_1x64 (__m64 data, + __m64 alpha) +{ + return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha), + mask_x0080), + mask_x0101); +} +/* src * alpha_dst + dst * alpha_src, summed with a saturating unsigned + * byte add; all four arguments are only read + */ +static force_inline __m64 +pix_add_multiply_1x64 (__m64* src, + __m64* alpha_dst, + __m64* dst, + __m64* alpha_src) +{ + __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst); + __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src); + + return _mm_adds_pu8 (t1, t2); +} +/* XOR every lane with mask_x00ff (defined outside this view; presumably + * 0x00ff per lane, i.e. 255 - x for 8-bit values - confirm) + */ +static force_inline __m64 +negate_1x64 (__m64 data) +{ + return _mm_xor_si64 (data, mask_x00ff); +} + +static 
force_inline __m64 +invert_colors_1x64 (__m64 data) +{ + return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2)); +} +/* src + dst * negate_1x64 (alpha), using a saturating unsigned byte add */ +static force_inline __m64 +over_1x64 (__m64 src, __m64 alpha, __m64 dst) +{ + return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha))); +} +/* multiply both src and alpha by mask, then apply over_1x64 against dst */ +static force_inline __m64 +in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst) +{ + return over_1x64 (pix_multiply_1x64 (*src, *mask), + pix_multiply_1x64 (*alpha, *mask), + *dst); +} +/* swap 16-bit lanes 0 and 2 of src (invert_colors_1x64), multiply by + * (alpha | mask_x_alpha), then apply over_1x64 against dst using the + * broadcast lane-3 alpha of src + */ +static force_inline __m64 +over_rev_non_pre_1x64 (__m64 src, __m64 dst) +{ + __m64 alpha = expand_alpha_1x64 (src); + + return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src), + _mm_or_si64 (alpha, mask_x_alpha)), + alpha, + dst); +} +/* pack the four 16-bit lanes into bytes with unsigned saturation and + * return them as one 32-bit value + */ +static force_inline uint32_t +pack_1x64_32 (__m64 data) +{ + return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ())); +} + +/* Expand the 16-bit 565 pixel passed in @pixel into + * + * 00RR00GG00BB + * + * --- Expanding 565 in the low word --- + * + * m = (m << (36 - 11)) | (m << (16 - 5)) | m; + * m = m & (01f0003f001f); + * m = m * (008404100840); + * m = m >> 8; + * + * Note the trick here - the top word is shifted by another nibble to + * avoid it bumping into the middle word + */ +static force_inline __m64 +expand565_16_1x64 (uint16_t pixel) +{ + __m64 p; + __m64 t1, t2; + + p = _mm_cvtsi32_si64 ((uint32_t) pixel); + + t1 = _mm_slli_si64 (p, 36 - 11); + t2 = _mm_slli_si64 (p, 16 - 5); + + p = _mm_or_si64 (t1, p); + p = _mm_or_si64 (t2, p); + p = _mm_and_si64 (p, mask_x565_rgb); + p = _mm_mullo_pi16 (p, mask_x565_unpack); + + return _mm_srli_pi16 (p, 8); +} + +/* ---------------------------------------------------------------------------- + * Compose Core transformations + */ +static force_inline uint32_t +core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst) +{ + uint8_t a; + __m64 ms; + + a = src >> 24; + + if (a == 0xff) + { + return src; + } + else if (src) + { + ms = unpack_32_1x64 (src); + return pack_1x64_32 ( + 
over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst))); + } + + return dst; +} + +static force_inline uint32_t +combine1 (const uint32_t *ps, const uint32_t *pm) +{ + uint32_t s = *ps; + + if (pm) + { + __m64 ms, mm; + + mm = unpack_32_1x64 (*pm); + mm = expand_alpha_1x64 (mm); + + ms = unpack_32_1x64 (s); + ms = pix_multiply_1x64 (ms, mm); + + s = pack_1x64_32 (ms); + } + + return s; +} + +static force_inline __m128i +combine4 (const __m128i *ps, const __m128i *pm) +{ + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_msk_lo, xmm_msk_hi; + __m128i s; + + if (pm) + { + xmm_msk_lo = load_128_unaligned (pm); + + if (is_transparent (xmm_msk_lo)) + return _mm_setzero_si128 (); + } + + s = load_128_unaligned (ps); + + if (pm) + { + unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi); + + expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_msk_lo, &xmm_msk_hi, + &xmm_src_lo, &xmm_src_hi); + + s = pack_2x128_128 (xmm_src_lo, xmm_src_hi); + } + + return s; +} + +static force_inline void +core_combine_over_u_sse2 (uint32_t* pd, + const uint32_t* ps, + const uint32_t* pm, + int w) +{ + uint32_t s, d; + + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_alpha_lo, xmm_alpha_hi; + + /* Align dst on a 16-byte boundary */ + while (w && ((unsigned long)pd & 15)) + { + d = *pd; + s = combine1 (ps, pm); + + *pd++ = core_combine_over_u_pixel_sse2 (s, d); + ps++; + if (pm) + pm++; + w--; + } + + while (w >= 4) + { + /* I'm loading unaligned because I'm not sure about + * the address alignment. 
+ */ + xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); + + if (is_opaque (xmm_src_hi)) + { + save_128_aligned ((__m128i*)pd, xmm_src_hi); + } + else if (!is_zero (xmm_src_hi)) + { + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 ( + xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); + + over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst_lo, &xmm_dst_hi); + + /* rebuild the 4 pixel data and save */ + save_128_aligned ((__m128i*)pd, + pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + w -= 4; + ps += 4; + pd += 4; + if (pm) + pm += 4; + } + /* remaining pixels (fewer than 4), one at a time */ + while (w) + { + d = *pd; + s = combine1 (ps, pm); + + *pd++ = core_combine_over_u_pixel_sse2 (s, d); + ps++; + if (pm) + pm++; + + w--; + } +} +/* OVER_REVERSE, unified alpha: every pixel becomes dst OVER src, where + * src is first multiplied by the mask's alpha when pm is non-NULL + * (combine1 / combine4). Runs pixel by pixel until pd is 16-byte + * aligned, then 4 pixels per iteration, then the remaining tail. + */ +static force_inline void +core_combine_over_reverse_u_sse2 (uint32_t* pd, + const uint32_t* ps, + const uint32_t* pm, + int w) +{ + uint32_t s, d; + + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_alpha_lo, xmm_alpha_hi; + + /* Align dst on a 16-byte boundary */ + while (w && + ((unsigned long)pd & 15)) + { + d = *pd; + s = combine1 (ps, pm); + + *pd++ = core_combine_over_u_pixel_sse2 (d, s); + w--; + ps++; + if (pm) + pm++; + } + + while (w >= 4) + { + /* I'm loading unaligned because I'm not sure + * about the address alignment. 
+ */ + xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + over_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_src_lo, &xmm_src_hi); + + /* rebuild the 4 pixel data and save */ + save_128_aligned ((__m128i*)pd, + pack_2x128_128 (xmm_src_lo, xmm_src_hi)); + + w -= 4; + ps += 4; + pd += 4; + + if (pm) + pm += 4; + } + /* remaining pixels (fewer than 4), one at a time */ + while (w) + { + d = *pd; + s = combine1 (ps, pm); + + *pd++ = core_combine_over_u_pixel_sse2 (d, s); + ps++; + w--; + if (pm) + pm++; + } +} +/* Return dst scaled by the alpha (top byte) of src: 0 when that alpha + * is 0, dst unchanged when it is 0xff. + */ +static force_inline uint32_t +core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst) +{ + uint32_t maska = src >> 24; + + if (maska == 0) + { + return 0; + } + else if (maska != 0xff) + { + return pack_1x64_32 ( + pix_multiply_1x64 (unpack_32_1x64 (dst), + expand_alpha_1x64 (unpack_32_1x64 (src)))); + } + + return dst; +} +/* IN, unified alpha: every destination pixel is replaced by the + * (optionally masked) source multiplied by the destination's alpha. + */ +static force_inline void +core_combine_in_u_sse2 (uint32_t* pd, + const uint32_t* ps, + const uint32_t* pm, + int w) +{ + uint32_t s, d; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + + while (w && ((unsigned long) pd & 15)) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_in_u_pixelsse2 (d, s); + w--; + ps++; + if (pm) + pm++; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ((__m128i*)pd, + pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + w -= 4; + if (pm) + 
pm += 4; + } + + while (w) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_in_u_pixelsse2 (d, s); + w--; + ps++; + if (pm) + pm++; + } +} + +static force_inline void +core_combine_reverse_in_u_sse2 (uint32_t* pd, + const uint32_t* ps, + const uint32_t *pm, + int w) +{ + uint32_t s, d; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + + while (w && ((unsigned long) pd & 15)) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_in_u_pixelsse2 (s, d); + ps++; + w--; + if (pm) + pm++; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_src_lo, &xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + w -= 4; + if (pm) + pm += 4; + } + + while (w) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_in_u_pixelsse2 (s, d); + w--; + ps++; + if (pm) + pm++; + } +} + +static force_inline void +core_combine_reverse_out_u_sse2 (uint32_t* pd, + const uint32_t* ps, + const uint32_t* pm, + int w) +{ + while (w && ((unsigned long) pd & 15)) + { + uint32_t s = combine1 (ps, pm); + uint32_t d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + unpack_32_1x64 (d), negate_1x64 ( + expand_alpha_1x64 (unpack_32_1x64 (s))))); + + if (pm) + pm++; + ps++; + w--; + } + + while (w >= 4) + { + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + + xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 
(xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_src_lo, &xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + if (pm) + pm += 4; + + w -= 4; + } + + while (w) + { + uint32_t s = combine1 (ps, pm); + uint32_t d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + unpack_32_1x64 (d), negate_1x64 ( + expand_alpha_1x64 (unpack_32_1x64 (s))))); + ps++; + if (pm) + pm++; + w--; + } +} + +static force_inline void +core_combine_out_u_sse2 (uint32_t* pd, + const uint32_t* ps, + const uint32_t* pm, + int w) +{ + while (w && ((unsigned long) pd & 15)) + { + uint32_t s = combine1 (ps, pm); + uint32_t d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + unpack_32_1x64 (s), negate_1x64 ( + expand_alpha_1x64 (unpack_32_1x64 (d))))); + w--; + ps++; + if (pm) + pm++; + } + + while (w >= 4) + { + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + + xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + w -= 4; + if (pm) + pm += 4; + } + + while (w) + { + uint32_t s = combine1 (ps, pm); + uint32_t d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + unpack_32_1x64 (s), negate_1x64 ( + expand_alpha_1x64 (unpack_32_1x64 (d))))); + w--; + ps++; + if (pm) + pm++; + } +} + +static force_inline uint32_t 
+core_combine_atop_u_pixel_sse2 (uint32_t src, + uint32_t dst) +{ + __m64 s = unpack_32_1x64 (src); + __m64 d = unpack_32_1x64 (dst); + + __m64 sa = negate_1x64 (expand_alpha_1x64 (s)); + __m64 da = expand_alpha_1x64 (d); + + return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa)); +} + +static force_inline void +core_combine_atop_u_sse2 (uint32_t* pd, + const uint32_t* ps, + const uint32_t* pm, + int w) +{ + uint32_t s, d; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + + while (w && ((unsigned long) pd & 15)) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_atop_u_pixel_sse2 (s, d); + w--; + ps++; + if (pm) + pm++; + } + + while (w >= 4) + { + xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + + pix_add_multiply_2x128 ( + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + w -= 4; + if (pm) + pm += 4; + } + + while (w) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_atop_u_pixel_sse2 (s, d); + w--; + ps++; + if (pm) + pm++; + } +} + +static force_inline uint32_t +core_combine_reverse_atop_u_pixel_sse2 (uint32_t src, + uint32_t dst) +{ + __m64 s = unpack_32_1x64 (src); + __m64 d = unpack_32_1x64 (dst); + + __m64 sa = expand_alpha_1x64 (s); + __m64 da = negate_1x64 
(expand_alpha_1x64 (d)); + + return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa)); +} + +static force_inline void +core_combine_reverse_atop_u_sse2 (uint32_t* pd, + const uint32_t* ps, + const uint32_t* pm, + int w) +{ + uint32_t s, d; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + + while (w && ((unsigned long) pd & 15)) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d); + ps++; + w--; + if (pm) + pm++; + } + + while (w >= 4) + { + xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_add_multiply_2x128 ( + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + w -= 4; + if (pm) + pm += 4; + } + + while (w) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d); + ps++; + w--; + if (pm) + pm++; + } +} + +static force_inline uint32_t +core_combine_xor_u_pixel_sse2 (uint32_t src, + uint32_t dst) +{ + __m64 s = unpack_32_1x64 (src); + __m64 d = unpack_32_1x64 (dst); + + __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d)); + __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s)); + + return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s)); +} + +static force_inline void +core_combine_xor_u_sse2 
(uint32_t* dst, + const uint32_t* src, + const uint32_t *mask, + int width) +{ + int w = width; + uint32_t s, d; + uint32_t* pd = dst; + const uint32_t* ps = src; + const uint32_t* pm = mask; + + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + + while (w && ((unsigned long) pd & 15)) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_xor_u_pixel_sse2 (s, d); + w--; + ps++; + if (pm) + pm++; + } + + while (w >= 4) + { + xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm); + xmm_dst = load_128_aligned ((__m128i*) pd); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_add_multiply_2x128 ( + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + w -= 4; + if (pm) + pm += 4; + } + + while (w) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_xor_u_pixel_sse2 (s, d); + w--; + ps++; + if (pm) + pm++; + } +} + +static force_inline void +core_combine_add_u_sse2 (uint32_t* dst, + const uint32_t* src, + const uint32_t* mask, + int width) +{ + int w = width; + uint32_t s, d; + uint32_t* pd = dst; + const uint32_t* ps = src; + const uint32_t* pm = mask; + + while (w && (unsigned long)pd & 15) + { + s = combine1 (ps, pm); + d = *pd; + + ps++; + if (pm) + pm++; + *pd++ = _mm_cvtsi64_si32 ( + 
_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d))); + w--; + } + + while (w >= 4) + { + __m128i s; + + s = combine4 ((__m128i*)ps, (__m128i*)pm); + + save_128_aligned ( + (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd))); + + pd += 4; + ps += 4; + if (pm) + pm += 4; + w -= 4; + } + /* remaining pixels (fewer than 4), one at a time */ + while (w--) + { + s = combine1 (ps, pm); + d = *pd; + + ps++; + *pd++ = _mm_cvtsi64_si32 ( + _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d))); + if (pm) + pm++; + } +} +/* SATURATE for one pixel. sa is the source alpha and da the complement + * of the destination alpha (~dst >> 24). When sa exceeds da the source + * is first scaled by DIV_UN8 (da, sa); the (possibly scaled) source is + * then added to dst and packed with unsigned saturation. + */ +static force_inline uint32_t +core_combine_saturate_u_pixel_sse2 (uint32_t src, + uint32_t dst) +{ + __m64 ms = unpack_32_1x64 (src); + __m64 md = unpack_32_1x64 (dst); + uint32_t sa = src >> 24; + uint32_t da = ~dst >> 24; + + if (sa > da) + { + ms = pix_multiply_1x64 ( + ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24))); + } + + return pack_1x64_32 (_mm_adds_pu16 (md, ms)); +} +/* SATURATE over w pixels, unified alpha. In the 4-pixel loop a plain + * saturating byte add is used unless some source alpha exceeds the + * complemented destination alpha; in that case the four pixels are + * handled one by one with core_combine_saturate_u_pixel_sse2. + */ +static force_inline void +core_combine_saturate_u_sse2 (uint32_t * pd, + const uint32_t *ps, + const uint32_t *pm, + int w) +{ + uint32_t s, d; + + uint32_t pack_cmp; + __m128i xmm_src, xmm_dst; + + while (w && (unsigned long)pd & 15) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); + w--; + ps++; + if (pm) + pm++; + } + + while (w >= 4) + { + xmm_dst = load_128_aligned ((__m128i*)pd); + xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm); + + pack_cmp = _mm_movemask_epi8 ( + _mm_cmpgt_epi32 ( + _mm_srli_epi32 (xmm_src, 24), + _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24))); + + /* if some alpha src is greater than the respective ~alpha dst */ + if (pack_cmp) + { + s = combine1 (ps++, pm); + d = *pd; + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); + if (pm) + pm++; + + s = combine1 (ps++, pm); + d = *pd; + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); + if (pm) + pm++; + + s = combine1 (ps++, pm); + d = *pd; + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); + if (pm) + pm++; + + s = combine1 (ps++, pm); + d = *pd; + *pd++ = 
core_combine_saturate_u_pixel_sse2 (s, d); + if (pm) + pm++; + } + else + { + save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src)); + + pd += 4; + ps += 4; + if (pm) + pm += 4; + } + + w -= 4; + } + + while (w--) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); + ps++; + if (pm) + pm++; + } +} + +static force_inline void +core_combine_src_ca_sse2 (uint32_t* pd, + const uint32_t* ps, + const uint32_t *pm, + int w) +{ + uint32_t s, m; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + + while (w && (unsigned long)pd & 15) + { + s = *ps++; + m = *pm++; + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m))); + w--; + } + + while (w >= 4) + { + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m))); + w--; + } +} + +static force_inline uint32_t +core_combine_over_ca_pixel_sse2 (uint32_t src, + uint32_t mask, + uint32_t dst) +{ + __m64 s = unpack_32_1x64 (src); + __m64 expAlpha = expand_alpha_1x64 (s); + __m64 unpk_mask = unpack_32_1x64 (mask); + __m64 unpk_dst = unpack_32_1x64 (dst); + + return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst)); +} + +static force_inline void +core_combine_over_ca_sse2 (uint32_t* pd, + const uint32_t* ps, + const uint32_t *pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i 
xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (unsigned long)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d); + w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d); + w--; + } +} + +static force_inline uint32_t +core_combine_over_reverse_ca_pixel_sse2 (uint32_t src, + uint32_t mask, + uint32_t dst) +{ + __m64 d = unpack_32_1x64 (dst); + + return pack_1x64_32 ( + over_1x64 (d, expand_alpha_1x64 (d), + pix_multiply_1x64 (unpack_32_1x64 (src), + unpack_32_1x64 (mask)))); +} + +static force_inline void +core_combine_over_reverse_ca_sse2 (uint32_t* pd, + const uint32_t* ps, + const uint32_t *pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (unsigned long)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d); + w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, 
&xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + over_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask_lo, &xmm_mask_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d); + w--; + } +} + +static force_inline void +core_combine_in_ca_sse2 (uint32_t * pd, + const uint32_t *ps, + const uint32_t *pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (unsigned long)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)), + expand_alpha_1x64 (unpack_32_1x64 (d)))); + + w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd 
+= 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + pix_multiply_1x64 ( + unpack_32_1x64 (s), unpack_32_1x64 (m)), + expand_alpha_1x64 (unpack_32_1x64 (d)))); + + w--; + } +} + +static force_inline void +core_combine_in_reverse_ca_sse2 (uint32_t * pd, + const uint32_t *ps, + const uint32_t *pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (unsigned long)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + unpack_32_1x64 (d), + pix_multiply_1x64 (unpack_32_1x64 (m), + expand_alpha_1x64 (unpack_32_1x64 (s))))); + w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + unpack_32_1x64 (d), + pix_multiply_1x64 (unpack_32_1x64 (m), + expand_alpha_1x64 (unpack_32_1x64 (s))))); + w--; + } +} + +static force_inline void +core_combine_out_ca_sse2 (uint32_t * pd, + const uint32_t *ps, + const uint32_t *pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_alpha_lo, 
xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (unsigned long)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + pix_multiply_1x64 ( + unpack_32_1x64 (s), unpack_32_1x64 (m)), + negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d))))); + w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + negate_2x128 (xmm_alpha_lo, xmm_alpha_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + pix_multiply_1x64 ( + unpack_32_1x64 (s), unpack_32_1x64 (m)), + negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d))))); + + w--; + } +} + +static force_inline void +core_combine_out_reverse_ca_sse2 (uint32_t * pd, + const uint32_t *ps, + const uint32_t *pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (unsigned long)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + unpack_32_1x64 (d), + negate_1x64 (pix_multiply_1x64 ( + unpack_32_1x64 (m), + 
expand_alpha_1x64 (unpack_32_1x64 (s)))))); + w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask_lo, &xmm_mask_hi); + + negate_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + unpack_32_1x64 (d), + negate_1x64 (pix_multiply_1x64 ( + unpack_32_1x64 (m), + expand_alpha_1x64 (unpack_32_1x64 (s)))))); + w--; + } +} + +static force_inline uint32_t +core_combine_atop_ca_pixel_sse2 (uint32_t src, + uint32_t mask, + uint32_t dst) +{ + __m64 m = unpack_32_1x64 (mask); + __m64 s = unpack_32_1x64 (src); + __m64 d = unpack_32_1x64 (dst); + __m64 sa = expand_alpha_1x64 (s); + __m64 da = expand_alpha_1x64 (d); + + s = pix_multiply_1x64 (s, m); + m = negate_1x64 (pix_multiply_1x64 (m, sa)); + + return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da)); +} + +static force_inline void +core_combine_atop_ca_sse2 (uint32_t * pd, + const uint32_t *ps, + const uint32_t *pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (unsigned long)pd & 15) + { + s = 
*ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); + w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi); + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_mask_lo, &xmm_mask_hi); + + negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + pix_add_multiply_2x128 ( + &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); + w--; + } +} + +static force_inline uint32_t +core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src, + uint32_t mask, + uint32_t dst) +{ + __m64 m = unpack_32_1x64 (mask); + __m64 s = unpack_32_1x64 (src); + __m64 d = unpack_32_1x64 (dst); + + __m64 da = negate_1x64 (expand_alpha_1x64 (d)); + __m64 sa = expand_alpha_1x64 (s); + + s = pix_multiply_1x64 (s, m); + m = pix_multiply_1x64 (m, sa); + + return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da)); +} + +static force_inline void +core_combine_reverse_atop_ca_sse2 (uint32_t * pd, + const uint32_t *ps, + const uint32_t *pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_src_lo, 
xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (unsigned long)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); + w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi); + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_mask_lo, &xmm_mask_hi); + + negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_add_multiply_2x128 ( + &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); + w--; + } +} + +static force_inline uint32_t +core_combine_xor_ca_pixel_sse2 (uint32_t src, + uint32_t mask, + uint32_t dst) +{ + __m64 a = unpack_32_1x64 (mask); + __m64 s = unpack_32_1x64 (src); + __m64 d = unpack_32_1x64 (dst); + + __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 ( + a, expand_alpha_1x64 (s))); + __m64 dest = pix_multiply_1x64 (s, a); + __m64 alpha_src = 
negate_1x64 (expand_alpha_1x64 (d)); + + return pack_1x64_32 (pix_add_multiply_1x64 (&d, + &alpha_dst, + &dest, + &alpha_src)); +} + +static force_inline void +core_combine_xor_ca_sse2 (uint32_t * pd, + const uint32_t *ps, + const uint32_t *pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (unsigned long)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); + w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi); + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_mask_lo, &xmm_mask_hi); + + negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + negate_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + pix_add_multiply_2x128 ( + &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); + w--; + } +} + +static 
force_inline void +core_combine_add_ca_sse2 (uint32_t * pd, + const uint32_t *ps, + const uint32_t *pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (unsigned long)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s), + unpack_32_1x64 (m)), + unpack_32_1x64 (d))); + w--; + } + + while (w >= 4) + { + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 ( + _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo), + _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi))); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s), + unpack_32_1x64 (m)), + unpack_32_1x64 (d))); + w--; + } +} + +/* --------------------------------------------------- + * fb_compose_setup_sSE2 + */ +static force_inline __m64 +create_mask_16_64 (uint16_t mask) +{ + return _mm_set1_pi16 (mask); +} + +static force_inline __m128i +create_mask_16_128 (uint16_t mask) +{ + return _mm_set1_epi16 (mask); +} + +static force_inline __m64 +create_mask_2x32_64 (uint32_t mask0, + uint32_t mask1) +{ + return _mm_set_pi32 (mask0, mask1); +} + +/* Work around a code generation bug in Sun Studio 12. 
*/ +#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590) +# define create_mask_2x32_128(mask0, mask1) \ + (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1))) +#else +static force_inline __m128i +create_mask_2x32_128 (uint32_t mask0, + uint32_t mask1) +{ + return _mm_set_epi32 (mask0, mask1, mask0, mask1); +} +#endif + +/* SSE2 code patch for fbcompose.c */ + +static void +sse2_combine_over_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_over_u_sse2 (dst, src, mask, width); + _mm_empty (); +} + +static void +sse2_combine_over_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_over_reverse_u_sse2 (dst, src, mask, width); + _mm_empty (); +} + +static void +sse2_combine_in_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_in_u_sse2 (dst, src, mask, width); + _mm_empty (); +} + +static void +sse2_combine_in_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_reverse_in_u_sse2 (dst, src, mask, width); + _mm_empty (); +} + +static void +sse2_combine_out_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_out_u_sse2 (dst, src, mask, width); + _mm_empty (); +} + +static void +sse2_combine_out_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_reverse_out_u_sse2 (dst, src, mask, width); + _mm_empty (); +} + +static void +sse2_combine_atop_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + 
core_combine_atop_u_sse2 (dst, src, mask, width); + _mm_empty (); +} + +static void +sse2_combine_atop_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_reverse_atop_u_sse2 (dst, src, mask, width); + _mm_empty (); +} + +static void +sse2_combine_xor_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_xor_u_sse2 (dst, src, mask, width); + _mm_empty (); +} + +static void +sse2_combine_add_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_add_u_sse2 (dst, src, mask, width); + _mm_empty (); +} + +static void +sse2_combine_saturate_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_saturate_u_sse2 (dst, src, mask, width); + _mm_empty (); +} + +static void +sse2_combine_src_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_src_ca_sse2 (dst, src, mask, width); + _mm_empty (); +} + +static void +sse2_combine_over_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_over_ca_sse2 (dst, src, mask, width); + _mm_empty (); +} + +static void +sse2_combine_over_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_over_reverse_ca_sse2 (dst, src, mask, width); + _mm_empty (); +} + +static void +sse2_combine_in_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_in_ca_sse2 (dst, src, mask, width); 
+ _mm_empty (); +} + +static void +sse2_combine_in_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_in_reverse_ca_sse2 (dst, src, mask, width); + _mm_empty (); +} + +static void +sse2_combine_out_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_out_ca_sse2 (dst, src, mask, width); + _mm_empty (); +} + +static void +sse2_combine_out_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_out_reverse_ca_sse2 (dst, src, mask, width); + _mm_empty (); +} + +static void +sse2_combine_atop_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_atop_ca_sse2 (dst, src, mask, width); + _mm_empty (); +} + +static void +sse2_combine_atop_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_reverse_atop_ca_sse2 (dst, src, mask, width); + _mm_empty (); +} + +static void +sse2_combine_xor_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_xor_ca_sse2 (dst, src, mask, width); + _mm_empty (); +} + +static void +sse2_combine_add_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_add_ca_sse2 (dst, src, mask, width); + _mm_empty (); +} + +/* ------------------------------------------------------------------- + * composite_over_n_8888 + */ + +static void +sse2_composite_over_n_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, 
+ pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + uint32_t *dst_line, *dst, d; + int32_t w; + int dst_stride; + __m128i xmm_src, xmm_alpha; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + + xmm_src = expand_pixel_32_1x128 (src); + xmm_alpha = expand_alpha_1x128 (xmm_src); + + while (height--) + { + dst = dst_line; + + dst_line += dst_stride; + w = width; + + while (w && (unsigned long)dst & 15) + { + d = *dst; + *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src), + _mm_movepi64_pi64 (xmm_alpha), + unpack_32_1x64 (d))); + w--; + } + + while (w >= 4) + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_dst_lo, &xmm_dst_hi); + + /* rebuid the 4 pixel data and save*/ + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + w -= 4; + dst += 4; + } + + while (w) + { + d = *dst; + *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src), + _mm_movepi64_pi64 (xmm_alpha), + unpack_32_1x64 (d))); + w--; + } + + } + _mm_empty (); +} + +/* --------------------------------------------------------------------- + * composite_over_n_0565 + */ +static void +sse2_composite_over_n_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + uint16_t *dst_line, *dst, d; + int32_t w; + int dst_stride; + __m128i xmm_src, xmm_alpha; + __m128i xmm_dst, 
xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + + xmm_src = expand_pixel_32_1x128 (src); + xmm_alpha = expand_alpha_1x128 (xmm_src); + + while (height--) + { + dst = dst_line; + + dst_line += dst_stride; + w = width; + + while (w && (unsigned long)dst & 15) + { + d = *dst; + + *dst++ = pack_565_32_16 ( + pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src), + _mm_movepi64_pi64 (xmm_alpha), + expand565_16_1x64 (d)))); + w--; + } + + while (w >= 8) + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_565_128_4x128 (xmm_dst, + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + + over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_dst0, &xmm_dst1); + over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_dst2, &xmm_dst3); + + xmm_dst = pack_565_4x128_128 ( + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + + save_128_aligned ((__m128i*)dst, xmm_dst); + + dst += 8; + w -= 8; + } + + while (w--) + { + d = *dst; + *dst++ = pack_565_32_16 ( + pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src), + _mm_movepi64_pi64 (xmm_alpha), + expand565_16_1x64 (d)))); + } + } + + _mm_empty (); +} + +/* ------------------------------ + * composite_add_n_8888_8888_ca + */ +static void +sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint32_t *dst_line, d; + uint32_t *mask_line, m; + uint32_t pack_cmp; + int dst_stride, mask_stride; + + __m128i xmm_src, xmm_alpha; + __m128i xmm_dst; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; + + src = 
_pixman_image_get_solid (src_image, dst_image->bits.format); + srca = src >> 24; + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + + xmm_src = _mm_unpacklo_epi8 ( + create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); + xmm_alpha = expand_alpha_1x128 (xmm_src); + mmx_src = _mm_movepi64_pi64 (xmm_src); + mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); + + while (height--) + { + int w = width; + const uint32_t *pm = (uint32_t *)mask_line; + uint32_t *pd = (uint32_t *)dst_line; + + dst_line += dst_stride; + mask_line += mask_stride; + + while (w && (unsigned long)pd & 15) + { + m = *pm++; + + if (m) + { + d = *pd; + + mmx_mask = unpack_32_1x64 (m); + mmx_dest = unpack_32_1x64 (d); + + *pd = pack_1x64_32 ( + _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest)); + } + + pd++; + w--; + } + + while (w >= 4) + { + xmm_mask = load_128_unaligned ((__m128i*)pm); + + pack_cmp = + _mm_movemask_epi8 ( + _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); + + /* if all bits in mask are zero, pack_cmp are equal to 0xffff */ + if (pack_cmp != 0xffff) + { + xmm_dst = load_128_aligned ((__m128i*)pd); + + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + pix_multiply_2x128 (&xmm_src, &xmm_src, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi); + + save_128_aligned ( + (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst)); + } + + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + m = *pm++; + + if (m) + { + d = *pd; + + mmx_mask = unpack_32_1x64 (m); + mmx_dest = unpack_32_1x64 (d); + + *pd = pack_1x64_32 ( + _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest)); + } + + pd++; + w--; + } + } + + _mm_empty (); +} + +/* --------------------------------------------------------------------------- + * 
composite_over_n_8888_8888_ca + */ + +static void +sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + uint32_t *dst_line, d; + uint32_t *mask_line, m; + uint32_t pack_cmp; + int dst_stride, mask_stride; + + __m128i xmm_src, xmm_alpha; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + + xmm_src = _mm_unpacklo_epi8 ( + create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); + xmm_alpha = expand_alpha_1x128 (xmm_src); + mmx_src = _mm_movepi64_pi64 (xmm_src); + mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); + + while (height--) + { + int w = width; + const uint32_t *pm = (uint32_t *)mask_line; + uint32_t *pd = (uint32_t *)dst_line; + + dst_line += dst_stride; + mask_line += mask_stride; + + while (w && (unsigned long)pd & 15) + { + m = *pm++; + + if (m) + { + d = *pd; + mmx_mask = unpack_32_1x64 (m); + mmx_dest = unpack_32_1x64 (d); + + *pd = pack_1x64_32 (in_over_1x64 (&mmx_src, + &mmx_alpha, + &mmx_mask, + &mmx_dest)); + } + + pd++; + w--; + } + + while (w >= 4) + { + xmm_mask = load_128_unaligned ((__m128i*)pm); + + pack_cmp = + _mm_movemask_epi8 ( + _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); + + /* if all bits in mask are zero, pack_cmp are equal to 0xffff */ + if (pack_cmp != 0xffff) + { + xmm_dst = load_128_aligned ((__m128i*)pd); + + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 
(xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + m = *pm++; + + if (m) + { + d = *pd; + mmx_mask = unpack_32_1x64 (m); + mmx_dest = unpack_32_1x64 (d); + + *pd = pack_1x64_32 ( + in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); + } + + pd++; + w--; + } + } + + _mm_empty (); +} + +/*--------------------------------------------------------------------- + * composite_over_8888_n_8888 + */ + +static void +sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + uint32_t mask; + int32_t w; + int dst_stride, src_stride; + + __m128i xmm_mask; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_lo, xmm_alpha_hi; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8); + + xmm_mask = create_mask_16_128 (mask >> 24); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w && (unsigned long)dst & 15) + { + uint32_t s = *src++; + + if (s) + { + uint32_t d = *dst; + + __m64 ms = unpack_32_1x64 (s); + __m64 alpha = expand_alpha_1x64 (ms); + __m64 dest = _mm_movepi64_pi64 (xmm_mask); + __m64 alpha_dst = unpack_32_1x64 (d); + + *dst = pack_1x64_32 ( + in_over_1x64 (&ms, &alpha, &dest, 
&alpha_dst)); + } + dst++; + w--; + } + + while (w >= 4) + { + xmm_src = load_128_unaligned ((__m128i*)src); + + if (!is_zero (xmm_src)) + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask, &xmm_mask, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + dst += 4; + src += 4; + w -= 4; + } + + while (w) + { + uint32_t s = *src++; + + if (s) + { + uint32_t d = *dst; + + __m64 ms = unpack_32_1x64 (s); + __m64 alpha = expand_alpha_1x64 (ms); + __m64 mask = _mm_movepi64_pi64 (xmm_mask); + __m64 dest = unpack_32_1x64 (d); + + *dst = pack_1x64_32 ( + in_over_1x64 (&ms, &alpha, &mask, &dest)); + } + + dst++; + w--; + } + } + + _mm_empty (); +} + +/*--------------------------------------------------------------------- + * composite_over_8888_n_8888 + */ + +static void +sse2_composite_src_x888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int32_t w; + int dst_stride, src_stride; + + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w && (unsigned long)dst & 15) + { + *dst++ = *src++ | 0xff000000; + w--; + } + + while (w >= 16) + { + __m128i xmm_src1, xmm_src2, xmm_src3, 
xmm_src4; + + xmm_src1 = load_128_unaligned ((__m128i*)src + 0); + xmm_src2 = load_128_unaligned ((__m128i*)src + 1); + xmm_src3 = load_128_unaligned ((__m128i*)src + 2); + xmm_src4 = load_128_unaligned ((__m128i*)src + 3); + + save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000)); + save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000)); + save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000)); + save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000)); + + dst += 16; + src += 16; + w -= 16; + } + + while (w) + { + *dst++ = *src++ | 0xff000000; + w--; + } + } + + _mm_empty (); +} + +/* --------------------------------------------------------------------- + * composite_over_x888_n_8888 + */ +static void +sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + uint32_t mask; + int dst_stride, src_stride; + int32_t w; + + __m128i xmm_mask, xmm_alpha; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8); + + xmm_mask = create_mask_16_128 (mask >> 24); + xmm_alpha = mask_00ff; + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w && (unsigned long)dst & 15) + { + uint32_t s = (*src++) | 0xff000000; + uint32_t d = *dst; + + __m64 src = unpack_32_1x64 (s); + __m64 alpha = _mm_movepi64_pi64 (xmm_alpha); + __m64 mask 
= _mm_movepi64_pi64 (xmm_mask); + __m64 dest = unpack_32_1x64 (d); + + *dst++ = pack_1x64_32 ( + in_over_1x64 (&src, &alpha, &mask, &dest)); + + w--; + } + + while (w >= 4) + { + xmm_src = _mm_or_si128 ( + load_128_unaligned ((__m128i*)src), mask_ff000000); + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha, &xmm_alpha, + &xmm_mask, &xmm_mask, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + dst += 4; + src += 4; + w -= 4; + + } + + while (w) + { + uint32_t s = (*src++) | 0xff000000; + uint32_t d = *dst; + + __m64 src = unpack_32_1x64 (s); + __m64 alpha = _mm_movepi64_pi64 (xmm_alpha); + __m64 mask = _mm_movepi64_pi64 (xmm_mask); + __m64 dest = unpack_32_1x64 (d); + + *dst++ = pack_1x64_32 ( + in_over_1x64 (&src, &alpha, &mask, &dest)); + + w--; + } + } + + _mm_empty (); +} + +/* -------------------------------------------------------------------- + * composite_over_8888_8888 + */ +static void +sse2_composite_over_8888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + int dst_stride, src_stride; + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + dst = dst_line; + src = src_line; + + while (height--) + { + core_combine_over_u_sse2 (dst, src, NULL, width); + + dst += dst_stride; + src += src_stride; + } + _mm_empty (); +} + +/* ------------------------------------------------------------------ + * 
composite_over_8888_0565 + */ +static force_inline uint16_t +composite_over_8888_0565pixel (uint32_t src, uint16_t dst) +{ + __m64 ms; + + ms = unpack_32_1x64 (src); + return pack_565_32_16 ( + pack_1x64_32 ( + over_1x64 ( + ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst)))); +} + +static void +sse2_composite_over_8888_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint16_t *dst_line, *dst, d; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + int32_t w; + + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +#if 0 + /* FIXME + * + * I copy the code from MMX one and keep the fixme. + * If it's a problem there, probably is a problem here. + */ + assert (src_image->drawable == mask_image->drawable); +#endif + + while (height--) + { + dst = dst_line; + src = src_line; + + dst_line += dst_stride; + src_line += src_stride; + w = width; + + /* Align dst on a 16-byte boundary */ + while (w && + ((unsigned long)dst & 15)) + { + s = *src++; + d = *dst; + + *dst++ = composite_over_8888_0565pixel (s, d); + w--; + } + + /* It's a 8 pixel loop */ + while (w >= 8) + { + /* I'm loading unaligned because I'm not sure + * about the address alignment. 
+ */ + xmm_src = load_128_unaligned ((__m128i*) src); + xmm_dst = load_128_aligned ((__m128i*) dst); + + /* Unpacking */ + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_565_128_4x128 (xmm_dst, + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + /* I'm loading next 4 pixels from memory + * before to optimze the memory read. + */ + xmm_src = load_128_unaligned ((__m128i*) (src + 4)); + + over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst0, &xmm_dst1); + + /* Unpacking */ + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst2, &xmm_dst3); + + save_128_aligned ( + (__m128i*)dst, pack_565_4x128_128 ( + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); + + w -= 8; + dst += 8; + src += 8; + } + + while (w--) + { + s = *src++; + d = *dst; + + *dst++ = composite_over_8888_0565pixel (s, d); + } + } + + _mm_empty (); +} + +/* ----------------------------------------------------------------- + * composite_over_n_8_8888 + */ + +static void +sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint32_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + uint32_t m, d; + + __m128i xmm_src, xmm_alpha, xmm_def; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + 
PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + xmm_def = create_mask_2x32_128 (src, src); + xmm_src = expand_pixel_32_1x128 (src); + xmm_alpha = expand_alpha_1x128 (xmm_src); + mmx_src = _mm_movepi64_pi64 (xmm_src); + mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w && (unsigned long)dst & 15) + { + uint8_t m = *mask++; + + if (m) + { + d = *dst; + mmx_mask = expand_pixel_8_1x64 (m); + mmx_dest = unpack_32_1x64 (d); + + *dst = pack_1x64_32 (in_over_1x64 (&mmx_src, + &mmx_alpha, + &mmx_mask, + &mmx_dest)); + } + + w--; + dst++; + } + + while (w >= 4) + { + m = *((uint32_t*)mask); + + if (srca == 0xff && m == 0xffffffff) + { + save_128_aligned ((__m128i*)dst, xmm_def); + } + else if (m) + { + xmm_dst = load_128_aligned ((__m128i*) dst); + xmm_mask = unpack_32_1x128 (m); + xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); + + /* Unpacking */ + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + w -= 4; + dst += 4; + mask += 4; + } + + while (w) + { + uint8_t m = *mask++; + + if (m) + { + d = *dst; + mmx_mask = expand_pixel_8_1x64 (m); + mmx_dest = unpack_32_1x64 (d); + + *dst = pack_1x64_32 (in_over_1x64 (&mmx_src, + &mmx_alpha, + &mmx_mask, + &mmx_dest)); + } + + w--; + dst++; + } + } + + _mm_empty (); +} +
/* ----------------------------------------------------------------
 * pixman_fill_sse2
 *
 * Fill a width x height rectangle whose top-left pixel is (x, y) with a
 * constant value.
 *
 *   bits    start of the pixel buffer
 *   stride  distance between rows, in uint32_t units (it is multiplied
 *           by sizeof (uint32_t) to obtain bytes)
 *   bpp     bits per pixel; only 8, 16 and 32 are handled
 *   x, y    rectangle origin, in pixels
 *   width   rectangle width, in pixels
 *   height  rectangle height, in rows
 *   data    fill value; for 8 and 16 bpp only the low 8 / 16 bits are
 *           used and replicated across a 32-bit word
 *
 * Returns FALSE without touching memory when bpp is not 8, 16 or 32,
 * TRUE otherwise.
 */
pixman_bool_t
pixman_fill_sse2 (uint32_t *bits,
                  int       stride,
                  int       bpp,
                  int       x,
                  int       y,
                  int       width,
                  int       height,
                  uint32_t  data)
{
    uint32_t byte_width;
    uint8_t *byte_line;

    __m128i xmm_def;

    /* Work out the first byte of the rectangle, the row width in bytes,
     * and the row stride in bytes; widen the fill value to 32 bits.
     */
    if (bpp == 8)
    {
        stride = stride * (int) sizeof (uint32_t) / 1;
        byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
        byte_width = width;
        stride *= 1;

        /* Replicate the low byte into all four bytes.  This is done in
         * unsigned arithmetic: building it as (w << 16) | w with a
         * uint16_t w promotes to int and shifts into the sign bit for
         * fill bytes >= 0x80, which is undefined behaviour.
         */
        data = (data & 0xff) * 0x01010101;
    }
    else if (bpp == 16)
    {
        stride = stride * (int) sizeof (uint32_t) / 2;
        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
        byte_width = 2 * width;
        stride *= 2;

        /* Replicate the low 16 bits into both halves. */
        data = (data & 0xffff) * 0x00010001;
    }
    else if (bpp == 32)
    {
        stride = stride * (int) sizeof (uint32_t) / 4;
        byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
        byte_width = 4 * width;
        stride *= 4;
    }
    else
    {
        return FALSE;
    }

    xmm_def = create_mask_2x32_128 (data, data);

    while (height--)
    {
        int w;
        uint8_t *d = byte_line;
        byte_line += stride;
        w = byte_width;

        /* Head: 1-, 2- then 4-byte stores until d is 16-byte aligned
         * (or the row runs out).
         */
        while (w >= 1 && ((unsigned long)d & 1))
        {
            *(uint8_t *)d = data;
            w -= 1;
            d += 1;
        }

        while (w >= 2 && ((unsigned long)d & 3))
        {
            *(uint16_t *)d = data;
            w -= 2;
            d += 2;
        }

        while (w >= 4 && ((unsigned long)d & 15))
        {
            *(uint32_t *)d = data;

            w -= 4;
            d += 4;
        }

        /* Body: aligned 128-bit stores, 128 bytes per iteration, then
         * at most one 64-, one 32- and one 16-byte step.
         */
        while (w >= 128)
        {
            save_128_aligned ((__m128i*)(d),       xmm_def);
            save_128_aligned ((__m128i*)(d + 16),  xmm_def);
            save_128_aligned ((__m128i*)(d + 32),  xmm_def);
            save_128_aligned ((__m128i*)(d + 48),  xmm_def);
            save_128_aligned ((__m128i*)(d + 64),  xmm_def);
            save_128_aligned ((__m128i*)(d + 80),  xmm_def);
            save_128_aligned ((__m128i*)(d + 96),  xmm_def);
            save_128_aligned ((__m128i*)(d + 112), xmm_def);

            d += 128;
            w -= 128;
        }

        if (w >= 64)
        {
            save_128_aligned ((__m128i*)(d),      xmm_def);
            save_128_aligned ((__m128i*)(d + 16), xmm_def);
            save_128_aligned ((__m128i*)(d + 32), xmm_def);
            save_128_aligned ((__m128i*)(d + 48), xmm_def);

            d += 64;
            w -= 64;
        }

        if (w >= 32)
        {
            save_128_aligned ((__m128i*)(d),      xmm_def);
            save_128_aligned ((__m128i*)(d + 16), xmm_def);

            d += 32;
            w -= 32;
        }

        if (w >= 16)
        {
            save_128_aligned ((__m128i*)(d), xmm_def);

            d += 16;
            w -= 16;
        }

        /* Tail: 4-, 2- then 1-byte stores for the remaining bytes. */
        while (w >= 4)
        {
            *(uint32_t *)d = data;

            w -= 4;
            d += 4;
        }

        if (w >= 2)
        {
            *(uint16_t *)d = data;
            w -= 2;
            d += 2;
        }

        if (w >= 1)
        {
            *(uint8_t *)d = data;
            w -= 1;
            d += 1;
        }
    }

    _mm_empty ();
    return TRUE;
}

+static void +sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint32_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + uint32_t m; + + __m128i xmm_src, xmm_def; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + { + pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride, + PIXMAN_FORMAT_BPP (dst_image->bits.format), + dest_x, dest_y, width, height, 0); + return; + } + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + xmm_def = create_mask_2x32_128 (src, src); + xmm_src = expand_pixel_32_1x128 (src); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w && (unsigned long)dst & 15) + { + uint8_t m = *mask++; + + if (m) + { + *dst = pack_1x64_32 ( + pix_multiply_1x64 ( + _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m))); + } + else + { + *dst = 0; + } + + w--; + dst++; + } + +
while (w >= 4) + { + m = *((uint32_t*)mask); + + if (srca == 0xff && m == 0xffffffff) + { + save_128_aligned ((__m128i*)dst, xmm_def); + } + else if (m) + { + xmm_mask = unpack_32_1x128 (m); + xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); + + /* Unpacking */ + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + pix_multiply_2x128 (&xmm_src, &xmm_src, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); + } + else + { + save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ()); + } + + w -= 4; + dst += 4; + mask += 4; + } + + while (w) + { + uint8_t m = *mask++; + + if (m) + { + *dst = pack_1x64_32 ( + pix_multiply_1x64 ( + _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m))); + } + else + { + *dst = 0; + } + + w--; + dst++; + } + } + + _mm_empty (); +} + +/*----------------------------------------------------------------------- + * composite_over_n_8_0565 + */ + +static void +sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint16_t *dst_line, *dst, d; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + uint32_t m; + __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; + + __m128i xmm_src, xmm_alpha; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, 
uint8_t, mask_stride, mask_line, 1); + + xmm_src = expand_pixel_32_1x128 (src); + xmm_alpha = expand_alpha_1x128 (xmm_src); + mmx_src = _mm_movepi64_pi64 (xmm_src); + mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w && (unsigned long)dst & 15) + { + m = *mask++; + + if (m) + { + d = *dst; + mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m)); + mmx_dest = expand565_16_1x64 (d); + + *dst = pack_565_32_16 ( + pack_1x64_32 ( + in_over_1x64 ( + &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); + } + + w--; + dst++; + } + + while (w >= 8) + { + xmm_dst = load_128_aligned ((__m128i*) dst); + unpack_565_128_4x128 (xmm_dst, + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + + m = *((uint32_t*)mask); + mask += 4; + + if (m) + { + xmm_mask = unpack_32_1x128 (m); + xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); + + /* Unpacking */ + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst0, &xmm_dst1); + } + + m = *((uint32_t*)mask); + mask += 4; + + if (m) + { + xmm_mask = unpack_32_1x128 (m); + xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); + + /* Unpacking */ + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst2, &xmm_dst3); + } + + save_128_aligned ( + (__m128i*)dst, pack_565_4x128_128 ( + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); + + w -= 8; + dst += 8; + } + + while (w) + { + m = *mask++; + + if (m) + { + d = *dst; + mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m)); + mmx_dest = expand565_16_1x64 (d); + + *dst = 
pack_565_32_16 ( + pack_1x64_32 ( + in_over_1x64 ( + &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); + } + + w--; + dst++; + } + } + + _mm_empty (); +} + +/* ----------------------------------------------------------------------- + * composite_over_pixbuf_0565 + */ + +static void +sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint16_t *dst_line, *dst, d; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + int32_t w; + uint32_t opaque, zero; + + __m64 ms; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +#if 0 + /* FIXME + * + * I copy the code from MMX one and keep the fixme. + * If it's a problem there, probably is a problem here. 
+ */ + assert (src_image->drawable == mask_image->drawable); +#endif + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w && (unsigned long)dst & 15) + { + s = *src++; + d = *dst; + + ms = unpack_32_1x64 (s); + + *dst++ = pack_565_32_16 ( + pack_1x64_32 ( + over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d)))); + w--; + } + + while (w >= 8) + { + /* First round */ + xmm_src = load_128_unaligned ((__m128i*)src); + xmm_dst = load_128_aligned ((__m128i*)dst); + + opaque = is_opaque (xmm_src); + zero = is_zero (xmm_src); + + unpack_565_128_4x128 (xmm_dst, + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + + /* preload next round*/ + xmm_src = load_128_unaligned ((__m128i*)(src + 4)); + + if (opaque) + { + invert_colors_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst0, &xmm_dst1); + } + else if (!zero) + { + over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst0, &xmm_dst1); + } + + /* Second round */ + opaque = is_opaque (xmm_src); + zero = is_zero (xmm_src); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + + if (opaque) + { + invert_colors_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst2, &xmm_dst3); + } + else if (!zero) + { + over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst2, &xmm_dst3); + } + + save_128_aligned ( + (__m128i*)dst, pack_565_4x128_128 ( + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); + + w -= 8; + src += 8; + dst += 8; + } + + while (w) + { + s = *src++; + d = *dst; + + ms = unpack_32_1x64 (s); + + *dst++ = pack_565_32_16 ( + pack_1x64_32 ( + over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d)))); + w--; + } + } + + _mm_empty (); +} + +/* ------------------------------------------------------------------------- + * composite_over_pixbuf_8888 + */ + +static void +sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + 
pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst, d; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + int32_t w; + uint32_t opaque, zero; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +#if 0 + /* FIXME + * + * I copy the code from MMX one and keep the fixme. + * If it's a problem there, probably is a problem here. + */ + assert (src_image->drawable == mask_image->drawable); +#endif + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w && (unsigned long)dst & 15) + { + s = *src++; + d = *dst; + + *dst++ = pack_1x64_32 ( + over_rev_non_pre_1x64 ( + unpack_32_1x64 (s), unpack_32_1x64 (d))); + + w--; + } + + while (w >= 4) + { + xmm_src_hi = load_128_unaligned ((__m128i*)src); + + opaque = is_opaque (xmm_src_hi); + zero = is_zero (xmm_src_hi); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + + if (opaque) + { + invert_colors_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + else if (!zero) + { + xmm_dst_hi = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + w -= 4; + dst += 4; + src += 4; + } + + while (w) + { + s = *src++; + d = *dst; + + *dst++ = pack_1x64_32 ( + over_rev_non_pre_1x64 ( + unpack_32_1x64 (s), unpack_32_1x64 (d))); + + w--; + } + } + + _mm_empty (); +} + +/* 
------------------------------------------------------------------------------------------------- + * composite_over_n_8888_0565_ca + */ + +static void +sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + uint16_t *dst_line, *dst, d; + uint32_t *mask_line, *mask, m; + int dst_stride, mask_stride; + int w; + uint32_t pack_cmp; + + __m128i xmm_src, xmm_alpha; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; + + __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + + xmm_src = expand_pixel_32_1x128 (src); + xmm_alpha = expand_alpha_1x128 (xmm_src); + mmx_src = _mm_movepi64_pi64 (xmm_src); + mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); + + while (height--) + { + w = width; + mask = mask_line; + dst = dst_line; + mask_line += mask_stride; + dst_line += dst_stride; + + while (w && ((unsigned long)dst & 15)) + { + m = *(uint32_t *) mask; + + if (m) + { + d = *dst; + mmx_mask = unpack_32_1x64 (m); + mmx_dest = expand565_16_1x64 (d); + + *dst = pack_565_32_16 ( + pack_1x64_32 ( + in_over_1x64 ( + &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); + } + + w--; + dst++; + mask++; + } + + while (w >= 8) + { + /* First round */ + xmm_mask = load_128_unaligned ((__m128i*)mask); + xmm_dst = load_128_aligned ((__m128i*)dst); + + pack_cmp = _mm_movemask_epi8 ( + _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); + + unpack_565_128_4x128 (xmm_dst, + &xmm_dst0, &xmm_dst1, 
&xmm_dst2, &xmm_dst3); + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + /* preload next round */ + xmm_mask = load_128_unaligned ((__m128i*)(mask + 4)); + + /* preload next round */ + if (pack_cmp != 0xffff) + { + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst0, &xmm_dst1); + } + + /* Second round */ + pack_cmp = _mm_movemask_epi8 ( + _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); + + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + if (pack_cmp != 0xffff) + { + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst2, &xmm_dst3); + } + + save_128_aligned ( + (__m128i*)dst, pack_565_4x128_128 ( + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); + + w -= 8; + dst += 8; + mask += 8; + } + + while (w) + { + m = *(uint32_t *) mask; + + if (m) + { + d = *dst; + mmx_mask = unpack_32_1x64 (m); + mmx_dest = expand565_16_1x64 (d); + + *dst = pack_565_32_16 ( + pack_1x64_32 ( + in_over_1x64 ( + &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); + } + + w--; + dst++; + mask++; + } + } + + _mm_empty (); +} + +/* ----------------------------------------------------------------------- + * composite_in_n_8_8 + */ + +static void +sse2_composite_in_n_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + uint32_t d, m; + uint32_t src; + uint8_t sa; + int32_t w; + + __m128i xmm_alpha; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, 
mask_line, 1); + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + sa = src >> 24; + + xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w && ((unsigned long)dst & 15)) + { + m = (uint32_t) *mask++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x64_32 ( + pix_multiply_1x64 ( + pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha), + unpack_32_1x64 (m)), + unpack_32_1x64 (d))); + w--; + } + + while (w >= 16) + { + xmm_mask = load_128_unaligned ((__m128i*)mask); + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + mask += 16; + dst += 16; + w -= 16; + } + + while (w) + { + m = (uint32_t) *mask++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x64_32 ( + pix_multiply_1x64 ( + pix_multiply_1x64 ( + _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)), + unpack_32_1x64 (d))); + w--; + } + } + + _mm_empty (); +} + +/* ----------------------------------------------------------------------- + * composite_in_n_8 + */ + +static void +sse2_composite_in_n_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + int dst_stride; + uint32_t d; + uint32_t src; + int32_t w; + + __m128i xmm_alpha; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + + 
PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); + + src = src >> 24; + + if (src == 0xff) + return; + + if (src == 0x00) + { + pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride, + 8, dest_x, dest_y, width, height, src); + + return; + } + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + w = width; + + while (w && ((unsigned long)dst & 15)) + { + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x64_32 ( + pix_multiply_1x64 ( + _mm_movepi64_pi64 (xmm_alpha), + unpack_32_1x64 (d))); + w--; + } + + while (w >= 16) + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, + &xmm_dst_lo, &xmm_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + dst += 16; + w -= 16; + } + + while (w) + { + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x64_32 ( + pix_multiply_1x64 ( + _mm_movepi64_pi64 (xmm_alpha), + unpack_32_1x64 (d))); + w--; + } + } + + _mm_empty (); +} + +/* --------------------------------------------------------------------------- + * composite_in_8_8 + */ + +static void +sse2_composite_in_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int src_stride, dst_stride; + int32_t w; + uint32_t s, d; + + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + 
src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w && ((unsigned long)dst & 15)) + { + s = (uint32_t) *src++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x64_32 ( + pix_multiply_1x64 ( + unpack_32_1x64 (s), unpack_32_1x64 (d))); + w--; + } + + while (w >= 16) + { + xmm_src = load_128_unaligned ((__m128i*)src); + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + src += 16; + dst += 16; + w -= 16; + } + + while (w) + { + s = (uint32_t) *src++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x64_32 ( + pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d))); + w--; + } + } + + _mm_empty (); +} + +/* ------------------------------------------------------------------------- + * composite_add_n_8_8 + */ + +static void +sse2_composite_add_n_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + uint32_t src; + uint8_t sa; + uint32_t m, d; + + __m128i xmm_alpha; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + src = _pixman_image_get_solid (src_image, 
dst_image->bits.format); + + sa = src >> 24; + + xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w && ((unsigned long)dst & 15)) + { + m = (uint32_t) *mask++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x64_32 ( + _mm_adds_pu16 ( + pix_multiply_1x64 ( + _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)), + unpack_32_1x64 (d))); + w--; + } + + while (w >= 16) + { + xmm_mask = load_128_unaligned ((__m128i*)mask); + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo); + xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + mask += 16; + dst += 16; + w -= 16; + } + + while (w) + { + m = (uint32_t) *mask++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x64_32 ( + _mm_adds_pu16 ( + pix_multiply_1x64 ( + _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)), + unpack_32_1x64 (d))); + + w--; + } + } + + _mm_empty (); +} + +/* ------------------------------------------------------------------------- + * composite_add_n_8_8 + */ + +static void +sse2_composite_add_n_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + int dst_stride; + int32_t w; + uint32_t src; + + __m128i xmm_src; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + + src = 
_pixman_image_get_solid (src_image, dst_image->bits.format); + + src >>= 24; + + if (src == 0x00) + return; + + if (src == 0xff) + { + pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride, + 8, dest_x, dest_y, width, height, 0xff); + + return; + } + + src = (src << 24) | (src << 16) | (src << 8) | src; + xmm_src = _mm_set_epi32 (src, src, src, src); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + w = width; + + while (w && ((unsigned long)dst & 15)) + { + *dst = (uint8_t)_mm_cvtsi64_si32 ( + _mm_adds_pu8 ( + _mm_movepi64_pi64 (xmm_src), + _mm_cvtsi32_si64 (*dst))); + + w--; + dst++; + } + + while (w >= 16) + { + save_128_aligned ( + (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst))); + + dst += 16; + w -= 16; + } + + while (w) + { + *dst = (uint8_t)_mm_cvtsi64_si32 ( + _mm_adds_pu8 ( + _mm_movepi64_pi64 (xmm_src), + _mm_cvtsi32_si64 (*dst))); + + w--; + dst++; + } + } + + _mm_empty (); +} +
/* ----------------------------------------------------------------------
 * composite_add_8_8
 *
 * Saturating per-byte ADD of an 8-bit-per-pixel source onto an
 * 8-bit-per-pixel destination.  Each row is split into:
 *
 *   - a scalar head, until dst is 4-byte aligned,
 *   - a bulk pass through core_combine_add_u_sse2 (), which is handed
 *     the row as uint32_t units (w >> 2 of them),
 *   - a scalar tail of at most 3 bytes.
 *
 * imp, op, mask_image, mask_x and mask_y are not read by this function.
 */

static void
sse2_composite_add_8_8 (pixman_implementation_t *imp,
                        pixman_op_t              op,
                        pixman_image_t *         src_image,
                        pixman_image_t *         mask_image,
                        pixman_image_t *         dst_image,
                        int32_t                  src_x,
                        int32_t                  src_y,
                        int32_t                  mask_x,
                        int32_t                  mask_y,
                        int32_t                  dest_x,
                        int32_t                  dest_y,
                        int32_t                  width,
                        int32_t                  height)
{
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;
    uint16_t t;

    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        src = src_line;

        dst_line += dst_stride;
        src_line += src_stride;
        w = width;

        /* Small head */
        while (w && (unsigned long)dst & 3)
        {
            /* t is at most 0x1fe; when bit 8 is set, (0 - (t >> 8)) is
             * all ones, so the byte stored saturates at 0xff.
             */
            t = (*dst) + (*src++);
            *dst++ = t | (0 - (t >> 8));
            w--;
        }

        core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);

        /* Small tail */

        /* Skip exactly the (w >> 2) * 4 bytes handed to the bulk pass.
         * Masking with 0xfffc here would drop bits 16 and up of w and
         * leave the pointers short on rows with more than 65535 bytes
         * remaining.
         */
        dst += w & ~3;
        src += w & ~3;

        w &= 3;

        while (w)
        {
            t = (*dst) + (*src++);
            *dst++ = t | (0 - (t >> 8));
            w--;
        }
    }

    _mm_empty ();
}

/* ---------------------------------------------------------------------
 * composite_add_8888_8888
 *
 * ADD a 32-bit-per-pixel source onto a 32-bit-per-pixel destination by
 * passing each row, width pixels long, to core_combine_add_u_sse2 ().
 *
 * imp, op, mask_image, mask_x and mask_y are not read by this function.
 */
static void
sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              pixman_image_t *         src_image,
                              pixman_image_t *         mask_image,
                              pixman_image_t *         dst_image,
                              int32_t                  src_x,
                              int32_t                  src_y,
                              int32_t                  mask_x,
                              int32_t                  mask_y,
                              int32_t                  dest_x,
                              int32_t                  dest_y,
                              int32_t                  width,
                              int32_t                  height)
{
    uint32_t *dst_line;
    uint32_t *src_line;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    while (height--)
    {
        core_combine_add_u_sse2 (dst_line, src_line, NULL, width);

        dst_line += dst_stride;
        src_line += src_stride;
    }

    _mm_empty ();
}

+/* ------------------------------------------------------------------------------------------------- + * sse2_composite_copy_area + */ + +static pixman_bool_t +pixman_blt_sse2 (uint32_t *src_bits, + uint32_t *dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int height) +{ + uint8_t * src_bytes; + uint8_t * dst_bytes; + int byte_width; + + if (src_bpp != dst_bpp) + return FALSE; + + if (src_bpp == 16) + { + src_stride = src_stride * (int) sizeof (uint32_t) / 2; + dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; + src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); + dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); + byte_width = 2 * width; + src_stride *= 2; + dst_stride *= 2; + } + else if (src_bpp == 32) + { + src_stride = src_stride
* (int) sizeof (uint32_t) / 4; + dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; + src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); + dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); + byte_width = 4 * width; + src_stride *= 4; + dst_stride *= 4; + } + else + { + return FALSE; + } + + while (height--) + { + int w; + uint8_t *s = src_bytes; + uint8_t *d = dst_bytes; + src_bytes += src_stride; + dst_bytes += dst_stride; + w = byte_width; + + while (w >= 2 && ((unsigned long)d & 3)) + { + *(uint16_t *)d = *(uint16_t *)s; + w -= 2; + s += 2; + d += 2; + } + + while (w >= 4 && ((unsigned long)d & 15)) + { + *(uint32_t *)d = *(uint32_t *)s; + + w -= 4; + s += 4; + d += 4; + } + + while (w >= 64) + { + __m128i xmm0, xmm1, xmm2, xmm3; + + xmm0 = load_128_unaligned ((__m128i*)(s)); + xmm1 = load_128_unaligned ((__m128i*)(s + 16)); + xmm2 = load_128_unaligned ((__m128i*)(s + 32)); + xmm3 = load_128_unaligned ((__m128i*)(s + 48)); + + save_128_aligned ((__m128i*)(d), xmm0); + save_128_aligned ((__m128i*)(d + 16), xmm1); + save_128_aligned ((__m128i*)(d + 32), xmm2); + save_128_aligned ((__m128i*)(d + 48), xmm3); + + s += 64; + d += 64; + w -= 64; + } + + while (w >= 16) + { + save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) ); + + w -= 16; + d += 16; + s += 16; + } + + while (w >= 4) + { + *(uint32_t *)d = *(uint32_t *)s; + + w -= 4; + s += 4; + d += 4; + } + + if (w >= 2) + { + *(uint16_t *)d = *(uint16_t *)s; + w -= 2; + s += 2; + d += 2; + } + } + + _mm_empty (); + + return TRUE; +} + +static void +sse2_composite_copy_area (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + pixman_blt_sse2 (src_image->bits.bits, + dst_image->bits.bits, + 
src_image->bits.rowstride, + dst_image->bits.rowstride, + PIXMAN_FORMAT_BPP (src_image->bits.format), + PIXMAN_FORMAT_BPP (dst_image->bits.format), + src_x, src_y, dest_x, dest_y, width, height); +} + +static void +sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *src, *src_line, s; + uint32_t *dst, *dst_line, d; + uint8_t *mask, *mask_line; + uint32_t m; + int src_stride, mask_stride, dst_stride; + int32_t w; + __m64 ms; + + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + src = src_line; + src_line += src_stride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + w = width; + + while (w && (unsigned long)dst & 15) + { + s = 0xff000000 | *src++; + m = (uint32_t) *mask++; + d = *dst; + ms = unpack_32_1x64 (s); + + if (m != 0xff) + { + __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m)); + __m64 md = unpack_32_1x64 (d); + + ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md); + } + + *dst++ = pack_1x64_32 (ms); + w--; + } + + while (w >= 4) + { + m = *(uint32_t*) mask; + xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000); + + if (m == 0xffffffff) + { + save_128_aligned ((__m128i*)dst, xmm_src); + } + else + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); + + unpack_128_2x128 
(xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + src += 4; + dst += 4; + mask += 4; + w -= 4; + } + + while (w) + { + m = (uint32_t) *mask++; + + if (m) + { + s = 0xff000000 | *src; + + if (m == 0xff) + { + *dst = s; + } + else + { + __m64 ma, md, ms; + + d = *dst; + + ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m)); + md = unpack_32_1x64 (d); + ms = unpack_32_1x64 (s); + + *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md)); + } + + } + + src++; + dst++; + w--; + } + } + + _mm_empty (); +} + +static void +sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *src, *src_line, s; + uint32_t *dst, *dst_line, d; + uint8_t *mask, *mask_line; + uint32_t m; + int src_stride, mask_stride, dst_stride; + int32_t w; + + __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + src = src_line; + src_line += src_stride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + w 
= width; + + while (w && (unsigned long)dst & 15) + { + uint32_t sa; + + s = *src++; + m = (uint32_t) *mask++; + d = *dst; + + sa = s >> 24; + + if (m) + { + if (sa == 0xff && m == 0xff) + { + *dst = s; + } + else + { + __m64 ms, md, ma, msa; + + ma = expand_alpha_rev_1x64 (load_32_1x64 (m)); + ms = unpack_32_1x64 (s); + md = unpack_32_1x64 (d); + + msa = expand_alpha_rev_1x64 (load_32_1x64 (sa)); + + *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md)); + } + } + + dst++; + w--; + } + + while (w >= 4) + { + m = *(uint32_t *) mask; + + if (m) + { + xmm_src = load_128_unaligned ((__m128i*)src); + + if (m == 0xffffffff && is_opaque (xmm_src)) + { + save_128_aligned ((__m128i *)dst, xmm_src); + } + else + { + xmm_dst = load_128_aligned ((__m128i *)dst); + + xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, + &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + } + + src += 4; + dst += 4; + mask += 4; + w -= 4; + } + + while (w) + { + uint32_t sa; + + s = *src++; + m = (uint32_t) *mask++; + d = *dst; + + sa = s >> 24; + + if (m) + { + if (sa == 0xff && m == 0xff) + { + *dst = s; + } + else + { + __m64 ms, md, ma, msa; + + ma = expand_alpha_rev_1x64 (load_32_1x64 (m)); + ms = unpack_32_1x64 (s); + md = unpack_32_1x64 (d); + + msa = expand_alpha_rev_1x64 (load_32_1x64 (sa)); + + *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md)); + } + } + + dst++; + w--; + } + } + + _mm_empty (); +} + +static void +sse2_composite_over_reverse_n_8888 
(pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + uint32_t *dst_line, *dst; + __m128i xmm_src; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_dsta_hi, xmm_dsta_lo; + int dst_stride; + int32_t w; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + + xmm_src = expand_pixel_32_1x128 (src); + + while (height--) + { + dst = dst_line; + + dst_line += dst_stride; + w = width; + + while (w && (unsigned long)dst & 15) + { + __m64 vd; + + vd = unpack_32_1x64 (*dst); + + *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd), + _mm_movepi64_pi64 (xmm_src))); + w--; + dst++; + } + + while (w >= 4) + { + __m128i tmp_lo, tmp_hi; + + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi); + + tmp_lo = xmm_src; + tmp_hi = xmm_src; + + over_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_dsta_lo, &xmm_dsta_hi, + &tmp_lo, &tmp_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi)); + + w -= 4; + dst += 4; + } + + while (w) + { + __m64 vd; + + vd = unpack_32_1x64 (*dst); + + *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd), + _mm_movepi64_pi64 (xmm_src))); + w--; + dst++; + } + + } + + _mm_empty (); +} + +static void +sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + 
uint32_t *src, *src_line, s; + uint32_t *dst, *dst_line, d; + uint32_t *mask, *mask_line; + uint32_t m; + int src_stride, mask_stride, dst_stride; + int32_t w; + + __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + src = src_line; + src_line += src_stride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + w = width; + + while (w && (unsigned long)dst & 15) + { + uint32_t sa; + + s = *src++; + m = (*mask++) >> 24; + d = *dst; + + sa = s >> 24; + + if (m) + { + if (sa == 0xff && m == 0xff) + { + *dst = s; + } + else + { + __m64 ms, md, ma, msa; + + ma = expand_alpha_rev_1x64 (load_32_1x64 (m)); + ms = unpack_32_1x64 (s); + md = unpack_32_1x64 (d); + + msa = expand_alpha_rev_1x64 (load_32_1x64 (sa)); + + *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md)); + } + } + + dst++; + w--; + } + + while (w >= 4) + { + xmm_mask = load_128_unaligned ((__m128i*)mask); + + if (!is_transparent (xmm_mask)) + { + xmm_src = load_128_unaligned ((__m128i*)src); + + if (is_opaque (xmm_mask) && is_opaque (xmm_src)) + { + save_128_aligned ((__m128i *)dst, xmm_src); + } + else + { + xmm_dst = load_128_aligned ((__m128i *)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); + expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, + &xmm_mask_lo, 
&xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + } + + src += 4; + dst += 4; + mask += 4; + w -= 4; + } + + while (w) + { + uint32_t sa; + + s = *src++; + m = (*mask++) >> 24; + d = *dst; + + sa = s >> 24; + + if (m) + { + if (sa == 0xff && m == 0xff) + { + *dst = s; + } + else + { + __m64 ms, md, ma, msa; + + ma = expand_alpha_rev_1x64 (load_32_1x64 (m)); + ms = unpack_32_1x64 (s); + md = unpack_32_1x64 (d); + + msa = expand_alpha_rev_1x64 (load_32_1x64 (sa)); + + *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md)); + } + } + + dst++; + w--; + } + } + + _mm_empty (); +} + +/* A variant of 'core_combine_over_u_sse2' with minor tweaks */ +static force_inline void +scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, + const uint32_t* ps, + int32_t w, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx) +{ + uint32_t s, d; + const uint32_t* pm = NULL; + + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_alpha_lo, xmm_alpha_hi; + + /* Align dst on a 16-byte boundary */ + while (w && ((unsigned long)pd & 15)) + { + d = *pd; + s = combine1 (ps + (vx >> 16), pm); + vx += unit_x; + + *pd++ = core_combine_over_u_pixel_sse2 (s, d); + if (pm) + pm++; + w--; + } + + while (w >= 4) + { + __m128i tmp; + uint32_t tmp1, tmp2, tmp3, tmp4; + + tmp1 = ps[vx >> 16]; + vx += unit_x; + tmp2 = ps[vx >> 16]; + vx += unit_x; + tmp3 = ps[vx >> 16]; + vx += unit_x; + tmp4 = ps[vx >> 16]; + vx += unit_x; + + tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); + + xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm); + + if (is_opaque (xmm_src_hi)) + { + save_128_aligned ((__m128i*)pd, xmm_src_hi); + } + else if (!is_zero (xmm_src_hi)) + { + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 ( + xmm_src_lo, xmm_src_hi, 
&xmm_alpha_lo, &xmm_alpha_hi); + + over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst_lo, &xmm_dst_hi); + + /* rebuid the 4 pixel data and save*/ + save_128_aligned ((__m128i*)pd, + pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + w -= 4; + pd += 4; + if (pm) + pm += 4; + } + + while (w) + { + d = *pd; + s = combine1 (ps + (vx >> 16), pm); + vx += unit_x; + + *pd++ = core_combine_over_u_pixel_sse2 (s, d); + if (pm) + pm++; + + w--; + } + _mm_empty (); +} + +FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER, + scaled_nearest_scanline_sse2_8888_8888_OVER, + uint32_t, uint32_t, COVER) +FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER, + scaled_nearest_scanline_sse2_8888_8888_OVER, + uint32_t, uint32_t, NONE) +FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER, + scaled_nearest_scanline_sse2_8888_8888_OVER, + uint32_t, uint32_t, PAD) + +static const pixman_fast_path_t sse2_fast_paths[] = +{ + /* PIXMAN_OP_OVER */ + PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, 
x8r8g8b8, sse2_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, 
sse2_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca), + PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888), + PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888), + PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888), + PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888), + PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565), + PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), + + /* PIXMAN_OP_OVER_REVERSE */ + PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888), + PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888), + + /* PIXMAN_OP_ADD */ + PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8), + PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888), + PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888), + PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8), + PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8), + + /* PIXMAN_OP_SRC */ + PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, 
sse2_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area), + + /* PIXMAN_OP_IN */ + PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8), + PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8), + PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8), + + SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), + 
SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), + + { PIXMAN_OP_NONE }, +}; + +static pixman_bool_t +sse2_blt (pixman_implementation_t *imp, + uint32_t * src_bits, + uint32_t * dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int height) +{ + if (!pixman_blt_sse2 ( + src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, + src_x, src_y, dst_x, dst_y, width, height)) + + { + return _pixman_implementation_blt ( + imp->delegate, + src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, + src_x, src_y, dst_x, dst_y, width, height); + } + + return TRUE; +} + +#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) +__attribute__((__force_align_arg_pointer__)) +#endif +static pixman_bool_t +sse2_fill (pixman_implementation_t *imp, + uint32_t * bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor)) + { + return _pixman_implementation_fill ( + imp->delegate, bits, stride, bpp, x, y, width, height, xor); + } + + return TRUE; +} + +#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) +__attribute__((__force_align_arg_pointer__)) +#endif +pixman_implementation_t * +_pixman_implementation_create_sse2 (void) +{ +#ifdef USE_MMX + pixman_implementation_t *fallback = _pixman_implementation_create_mmx (); +#else + pixman_implementation_t *fallback = _pixman_implementation_create_fast_path (); +#endif + pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths); + + /* SSE2 constants */ + mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000); + mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000); + mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0); + mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f); + mask_red = create_mask_2x32_128 (0x00f80000, 
0x00f80000); + mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00); + mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8); + mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0); + mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000); + mask_0080 = create_mask_16_128 (0x0080); + mask_00ff = create_mask_16_128 (0x00ff); + mask_0101 = create_mask_16_128 (0x0101); + mask_ffff = create_mask_16_128 (0xffff); + mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000); + mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000); + + /* MMX constants */ + mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f); + mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840); + + mask_x0080 = create_mask_16_64 (0x0080); + mask_x00ff = create_mask_16_64 (0x00ff); + mask_x0101 = create_mask_16_64 (0x0101); + mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000); + + _mm_empty (); + + /* Set up function pointers */ + + /* SSE code patch for fbcompose.c */ + imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u; + imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u; + imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u; + imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u; + imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u; + imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u; + imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u; + imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u; + imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u; + imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u; + + imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u; + + imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca; + imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca; + imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca; + imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = 
sse2_combine_in_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca; + imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca; + imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca; + imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca; + + imp->blt = sse2_blt; + imp->fill = sse2_fill; + + return imp; +} + +#endif /* USE_SSE2 */ diff --git a/pixman/test/Makefile.am b/pixman/test/Makefile.am index 19c4f8006..71e535374 100644 --- a/pixman/test/Makefile.am +++ b/pixman/test/Makefile.am @@ -23,40 +23,61 @@ TESTPROGRAMS = \ composite a1_trap_test_LDADD = $(TEST_LDADD) +a1_trap_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ + fetch_test_LDADD = $(TEST_LDADD) +fetch_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ + trap_crasher_LDADD = $(TEST_LDADD) +trap_crasher_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ + oob_test_LDADD = $(TEST_LDADD) +oob_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ + scaling_crash_test_LDADD = $(TEST_LDADD) +scaling_crash_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ + region_translate_test_LDADD = $(TEST_LDADD) +region_translate_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ pdf_op_test_LDADD = $(TEST_LDADD) +pdf_op_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ pdf_op_test_SOURCES = pdf-op-test.c utils.c utils.h region_test_LDADD = $(TEST_LDADD) +region_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ region_test_SOURCES = region-test.c utils.c utils.h blitters_test_LDADD = $(TEST_LDADD) +blitters_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ blitters_test_SOURCES = blitters-test.c utils.c utils.h scaling_test_LDADD = $(TEST_LDADD) +scaling_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ scaling_test_SOURCES = scaling-test.c utils.c utils.h affine_test_LDADD = $(TEST_LDADD) +affine_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ affine_test_SOURCES = affine-test.c utils.c utils.h alphamap_LDADD = $(TEST_LDADD) +alphamap_LDFLAGS = 
@TESTPROGS_EXTRA_LDFLAGS@ alphamap_SOURCES = alphamap.c utils.c utils.h alpha_loop_LDADD = $(TEST_LDADD) +alpha_loop_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ alpha_loop_SOURCES = alpha-loop.c utils.c utils.h composite_LDADD = $(TEST_LDADD) +composite_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ composite_SOURCES = composite.c utils.c utils.h gradient_crash_test_LDADD = $(TEST_LDADD) +gradient_crash_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ gradient_crash_test_SOURCES = gradient-crash-test.c utils.c utils.h stress_test_LDADD = $(TEST_LDADD) +stress_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ stress_test_SOURCES = stress-test.c utils.c utils.h # GTK using test programs |