aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--pixman/configure.ac1580
-rw-r--r--pixman/pixman/pixman-bits-image.c2884
-rw-r--r--pixman/pixman/pixman-compiler.h407
-rw-r--r--pixman/pixman/pixman-cpu.c1201
-rw-r--r--pixman/pixman/pixman-fast-path.c3874
-rw-r--r--pixman/pixman/pixman-fast-path.h894
-rw-r--r--pixman/pixman/pixman-matrix.c3
-rw-r--r--pixman/pixman/pixman-sse2.c12062
-rw-r--r--pixman/test/Makefile.am21
9 files changed, 11484 insertions, 11442 deletions
diff --git a/pixman/configure.ac b/pixman/configure.ac
index ac0d16158..6552f1270 100644
--- a/pixman/configure.ac
+++ b/pixman/configure.ac
@@ -1,783 +1,797 @@
-dnl Copyright 2005 Red Hat, Inc.
-dnl
-dnl Permission to use, copy, modify, distribute, and sell this software and its
-dnl documentation for any purpose is hereby granted without fee, provided that
-dnl the above copyright notice appear in all copies and that both that
-dnl copyright notice and this permission notice appear in supporting
-dnl documentation, and that the name of Red Hat not be used in
-dnl advertising or publicity pertaining to distribution of the software without
-dnl specific, written prior permission. Red Hat makes no
-dnl representations about the suitability of this software for any purpose. It
-dnl is provided "as is" without express or implied warranty.
-dnl
-dnl RED HAT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
-dnl INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
-dnl EVENT SHALL RED HAT BE LIABLE FOR ANY SPECIAL, INDIRECT OR
-dnl CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
-dnl DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-dnl TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-dnl PERFORMANCE OF THIS SOFTWARE.
-dnl
-dnl Process this file with autoconf to create configure.
-
-AC_PREREQ([2.57])
-
-# Pixman versioning scheme
-#
-# - The version in git has an odd MICRO version number
-#
-# - Released versions both development and stable have an even MICRO
-# version number
-#
-# - Released development versions have an odd MINOR number
-#
-# - Released stable versions have an even MINOR number
-#
-# - Versions that break ABI must have a new MAJOR number
-#
-# - If you break the ABI, then at least this must be done:
-#
-# - increment MAJOR
-#
-# - In the first development release where you break ABI, find
-# all instances of "pixman-n" and change them to pixman-(n+1)
-#
-# This needs to be done at least in
-# configure.ac
-# all Makefile.am's
-# pixman-n.pc.in
-#
-# This ensures that binary incompatible versions can be installed
-# in parallel. See http://www106.pair.com/rhp/parallel.html for
-# more information
-#
-
-m4_define([pixman_major], 0)
-m4_define([pixman_minor], 21)
-m4_define([pixman_micro], 3)
-
-m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
-
-AC_INIT(pixman, pixman_version, [pixman@lists.freedesktop.org], pixman)
-AM_INIT_AUTOMAKE([foreign dist-bzip2])
-
-# Suppress verbose compile lines
-m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
-
-AM_CONFIG_HEADER(config.h)
-
-AC_CANONICAL_HOST
-
-test_CFLAGS=${CFLAGS+set} # We may override autoconf default CFLAGS.
-
-AC_PROG_CC
-AM_PROG_AS
-AC_PROG_LIBTOOL
-AC_CHECK_FUNCS([getisax])
-AC_C_BIGENDIAN
-AC_C_INLINE
-
-dnl PIXMAN_LINK_WITH_ENV(env-setup, program, true-action, false-action)
-dnl
-dnl Compiles and links the given program in the environment setup by env-setup
-dnl and executes true-action on success and false-action on failure.
-AC_DEFUN([PIXMAN_LINK_WITH_ENV],[dnl
- save_CFLAGS="$CFLAGS"
- save_LDFLAGS="$LDFLAGS"
- save_LIBS="$LIBS"
- CFLAGS=""
- LDFLAGS=""
- LIBS=""
- $1
- AC_LINK_IFELSE(
- [$2],
- [pixman_cc_stderr=`test -f conftest.err && cat conftest.err`
- pixman_cc_flag=yes],
- [pixman_cc_stderr=`test -f conftest.err && cat conftest.err`
- pixman_cc_flag=no])
-
- if test "x$pixman_cc_stderr" != "x"; then
- pixman_cc_flag=no
- fi
-
- if test "x$pixman_cc_flag" = "xyes"; then
- ifelse([$3], , :, [$3])
- else
- ifelse([$4], , :, [$4])
- fi
- CFLAGS="$save_CFLAGS"
- LDFLAGS="$save_LDFLAGS"
- LIBS="$save_LIBS"
-])
-
-dnl Find a -Werror for catching warnings.
-WERROR=
-for w in -Werror -errwarn; do
- if test "z$WERROR" = "z"; then
- AC_MSG_CHECKING([whether the compiler supports $w])
- PIXMAN_LINK_WITH_ENV(
- [CFLAGS=$w],
- [int main(int c, char **v) { (void)c; (void)v; return 0; }],
- [WERROR=$w; yesno=yes], [yesno=no])
- AC_MSG_RESULT($_yesno)
- fi
-done
-
-dnl PIXMAN_CHECK_CFLAG(flag, [program])
-dnl Adds flag to CFLAGS if the given program links without warnings or errors.
-AC_DEFUN([PIXMAN_CHECK_CFLAG], [dnl
- AC_MSG_CHECKING([whether the compiler supports $1])
- PIXMAN_LINK_WITH_ENV(
- [CFLAGS="$WERROR $1"],
- [$2
- int main(int c, char **v) { (void)c; (void)v; return 0; }
- ],
- [_yesno=yes],
- [_yesno=no])
- if test "x$_yesno" = xyes; then
- CFLAGS="$CFLAGS $1"
- fi
- AC_MSG_RESULT($_yesno)
-])
-
-AC_CHECK_SIZEOF(long)
-
-# Checks for Sun Studio compilers
-AC_CHECK_DECL([__SUNPRO_C], [SUNCC="yes"], [SUNCC="no"])
-AC_CHECK_DECL([__amd64], [AMD64_ABI="yes"], [AMD64_ABI="no"])
-
-# Default CFLAGS to -O -g rather than just the -g from AC_PROG_CC
-# if we're using Sun Studio and neither the user nor a config.site
-# has set CFLAGS.
-if test $SUNCC = yes && \
- test "$test_CFLAGS" == "" && \
- test "$CFLAGS" = "-g"
-then
- CFLAGS="-O -g"
-fi
-
-#
-# We ignore pixman_major in the version here because the major version should
-# always be encoded in the actual library name. Ie., the soname is:
-#
-# pixman-$(pixman_major).0.minor.micro
-#
-m4_define([lt_current], [pixman_minor])
-m4_define([lt_revision], [pixman_micro])
-m4_define([lt_age], [pixman_minor])
-
-LT_VERSION_INFO="lt_current:lt_revision:lt_age"
-
-PIXMAN_VERSION_MAJOR=pixman_major()
-AC_SUBST(PIXMAN_VERSION_MAJOR)
-PIXMAN_VERSION_MINOR=pixman_minor()
-AC_SUBST(PIXMAN_VERSION_MINOR)
-PIXMAN_VERSION_MICRO=pixman_micro()
-AC_SUBST(PIXMAN_VERSION_MICRO)
-
-AC_SUBST(LT_VERSION_INFO)
-
-# Check for dependencies
-
-PIXMAN_CHECK_CFLAG([-Wall])
-PIXMAN_CHECK_CFLAG([-fno-strict-aliasing])
-
-AC_PATH_PROG(PERL, perl, no)
-if test "x$PERL" = xno; then
- AC_MSG_ERROR([Perl is required to build pixman.])
-fi
-AC_SUBST(PERL)
-
-dnl =========================================================================
-dnl OpenMP for the test suite?
-dnl
-
-# Check for OpenMP support (only supported by autoconf >=2.62)
-OPENMP_CFLAGS=
-m4_ifdef([AC_OPENMP], [AC_OPENMP])
-
-m4_define([openmp_test_program],[dnl
-#include <stdio.h>
-
-extern unsigned int lcg_seed;
-#pragma omp threadprivate(lcg_seed)
-unsigned int lcg_seed;
-
-unsigned function(unsigned a, unsigned b)
-{
- lcg_seed ^= b;
- return ((a + b) ^ a ) + lcg_seed;
-}
-
-int main(int argc, char **argv)
-{
- int i;
- int n1 = 0, n2 = argc;
- unsigned checksum = 0;
- int verbose = argv != NULL;
- unsigned (*test_function)(unsigned, unsigned);
- test_function = function;
- #pragma omp parallel for reduction(+:checksum) default(none) \
- shared(n1, n2, test_function, verbose)
- for (i = n1; i < n2; i++)
- {
- unsigned crc = test_function (i, 0);
- if (verbose)
- printf ("%d: %08X\n", i, crc);
- checksum += crc;
- }
- printf("%u\n", checksum);
- return 0;
-}
-])
-
-PIXMAN_LINK_WITH_ENV(
- [CFLAGS="$OPENMP_CFLAGS" LDFLAGS="$OPENMP_CFLAGS"],
- [openmp_test_program],
- [have_openmp=yes],
- [have_openmp=no])
-if test "x$have_openmp" = "xyes"; then
- AC_DEFINE(USE_OPENMP, 1, [use OpenMP in the test suite])
-else
- OPENMP_CFLAGS=""
-fi
-AC_SUBST(OPENMP_CFLAGS)
-
-dnl =========================================================================
-dnl -fvisibility stuff
-
-PIXMAN_CHECK_CFLAG([-fvisibility=hidden], [dnl
-#if defined(__GNUC__) && (__GNUC__ >= 4)
-#ifdef _WIN32
-#error Have -fvisibility but it is ignored and generates a warning
-#endif
-#else
-error Need GCC 4.0 for visibility
-#endif
-])
-
-PIXMAN_CHECK_CFLAG([-xldscope=hidden], [dnl
-#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x550)
-#else
-error Need Sun Studio 8 for visibility
-#endif
-])
-
-dnl ===========================================================================
-dnl Check for MMX
-
-if test "x$MMX_CFLAGS" = "x" ; then
- if test "x$SUNCC" = "xyes"; then
- # Sun Studio doesn't have an -xarch=mmx flag, so we have to use sse
- # but if we're building 64-bit, mmx & sse support is on by default and
- # -xarch=sse throws an error instead
- if test "$AMD64_ABI" = "no" ; then
- MMX_CFLAGS="-xarch=sse"
- fi
- else
- MMX_CFLAGS="-mmmx -Winline"
- fi
-fi
-
-have_mmx_intrinsics=no
-AC_MSG_CHECKING(whether to use MMX intrinsics)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$MMX_CFLAGS $CFLAGS"
-AC_COMPILE_IFELSE([
-#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
-error "Need GCC >= 3.4 for MMX intrinsics"
-#endif
-#include <mmintrin.h>
-int main () {
- __m64 v = _mm_cvtsi32_si64 (1);
- return _mm_cvtsi64_si32 (v);
-}], have_mmx_intrinsics=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(mmx,
- [AC_HELP_STRING([--disable-mmx],
- [disable MMX fast paths])],
- [enable_mmx=$enableval], [enable_mmx=auto])
-
-if test $enable_mmx = no ; then
- have_mmx_intrinsics=disabled
-fi
-
-if test $have_mmx_intrinsics = yes ; then
- AC_DEFINE(USE_MMX, 1, [use MMX compiler intrinsics])
-else
- MMX_CFLAGS=
-fi
-
-AC_MSG_RESULT($have_mmx_intrinsics)
-if test $enable_mmx = yes && test $have_mmx_intrinsics = no ; then
- AC_MSG_ERROR([MMX intrinsics not detected])
-fi
-
-AM_CONDITIONAL(USE_MMX, test $have_mmx_intrinsics = yes)
-
-dnl ===========================================================================
-dnl Check for SSE2
-
-if test "x$SSE2_CFLAGS" = "x" ; then
- if test "x$SUNCC" = "xyes"; then
- # SSE2 is enabled by default in the Sun Studio 64-bit environment
- if test "$AMD64_ABI" = "no" ; then
- SSE2_CFLAGS="-xarch=sse2"
- fi
- else
- SSE2_CFLAGS="-mmmx -msse2 -Winline"
- fi
-fi
-
-have_sse2_intrinsics=no
-AC_MSG_CHECKING(whether to use SSE2 intrinsics)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$SSE2_CFLAGS $CFLAGS"
-
-AC_COMPILE_IFELSE([
-#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
-# if !defined(__amd64__) && !defined(__x86_64__)
-# error "Need GCC >= 4.2 for SSE2 intrinsics on x86"
-# endif
-#endif
-#include <mmintrin.h>
-#include <xmmintrin.h>
-#include <emmintrin.h>
-int main () {
- __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c;
- c = _mm_xor_si128 (a, b);
- return 0;
-}], have_sse2_intrinsics=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(sse2,
- [AC_HELP_STRING([--disable-sse2],
- [disable SSE2 fast paths])],
- [enable_sse2=$enableval], [enable_sse2=auto])
-
-if test $enable_sse2 = no ; then
- have_sse2_intrinsics=disabled
-fi
-
-if test $have_sse2_intrinsics = yes ; then
- AC_DEFINE(USE_SSE2, 1, [use SSE2 compiler intrinsics])
-fi
-
-AC_MSG_RESULT($have_sse2_intrinsics)
-if test $enable_sse2 = yes && test $have_sse2_intrinsics = no ; then
- AC_MSG_ERROR([SSE2 intrinsics not detected])
-fi
-
-AM_CONDITIONAL(USE_SSE2, test $have_sse2_intrinsics = yes)
-
-dnl ===========================================================================
-dnl Other special flags needed when building code using MMX or SSE instructions
-case $host_os in
- solaris*)
- # When building 32-bit binaries, apply a mapfile to ensure that the
- # binaries aren't flagged as only able to run on MMX+SSE capable CPUs
- # since they check at runtime before using those instructions.
- # Not all linkers grok the mapfile format so we check for that first.
- if test "$AMD64_ABI" = "no" ; then
- use_hwcap_mapfile=no
- AC_MSG_CHECKING(whether to use a hardware capability map file)
- hwcap_save_LDFLAGS="$LDFLAGS"
- HWCAP_LDFLAGS='-Wl,-M,$(srcdir)/solaris-hwcap.mapfile'
- LDFLAGS="$LDFLAGS -Wl,-M,pixman/solaris-hwcap.mapfile"
- AC_LINK_IFELSE([int main() { return 0; }],
- use_hwcap_mapfile=yes,
- HWCAP_LDFLAGS="")
- LDFLAGS="$hwcap_save_LDFLAGS"
- AC_MSG_RESULT($use_hwcap_mapfile)
- fi
- if test "x$MMX_LDFLAGS" = "x" ; then
- MMX_LDFLAGS="$HWCAP_LDFLAGS"
- fi
- if test "x$SSE2_LDFLAGS" = "x" ; then
- SSE2_LDFLAGS="$HWCAP_LDFLAGS"
- fi
- ;;
-esac
-
-AC_SUBST(MMX_CFLAGS)
-AC_SUBST(MMX_LDFLAGS)
-AC_SUBST(SSE2_CFLAGS)
-AC_SUBST(SSE2_LDFLAGS)
-
-dnl ===========================================================================
-dnl Check for VMX/Altivec
-if test -n "`$CC -v 2>&1 | grep version | grep Apple`"; then
- VMX_CFLAGS="-faltivec"
-else
- VMX_CFLAGS="-maltivec -mabi=altivec"
-fi
-
-have_vmx_intrinsics=no
-AC_MSG_CHECKING(whether to use VMX/Altivec intrinsics)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$VMX_CFLAGS $CFLAGS"
-AC_COMPILE_IFELSE([
-#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
-error "Need GCC >= 3.4 for sane altivec support"
-#endif
-#include <altivec.h>
-int main () {
- vector unsigned int v = vec_splat_u32 (1);
- v = vec_sub (v, v);
- return 0;
-}], have_vmx_intrinsics=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(vmx,
- [AC_HELP_STRING([--disable-vmx],
- [disable VMX fast paths])],
- [enable_vmx=$enableval], [enable_vmx=auto])
-
-if test $enable_vmx = no ; then
- have_vmx_intrinsics=disabled
-fi
-
-if test $have_vmx_intrinsics = yes ; then
- AC_DEFINE(USE_VMX, 1, [use VMX compiler intrinsics])
-else
- VMX_CFLAGS=
-fi
-
-AC_MSG_RESULT($have_vmx_intrinsics)
-if test $enable_vmx = yes && test $have_vmx_intrinsics = no ; then
- AC_MSG_ERROR([VMX intrinsics not detected])
-fi
-
-AC_SUBST(VMX_CFLAGS)
-
-AM_CONDITIONAL(USE_VMX, test $have_vmx_intrinsics = yes)
-
-dnl ==========================================================================
-dnl Check if assembler is gas compatible and supports ARM SIMD instructions
-have_arm_simd=no
-AC_MSG_CHECKING(whether to use ARM SIMD assembler)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="-x assembler-with-cpp $CFLAGS"
-AC_COMPILE_IFELSE([[
-.text
-.arch armv6
-.object_arch armv4
-.arm
-.altmacro
-#ifndef __ARM_EABI__
-#error EABI is required (to be sure that calling conventions are compatible)
-#endif
-pld [r0]
-uqadd8 r0, r0, r0]], have_arm_simd=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(arm-simd,
- [AC_HELP_STRING([--disable-arm-simd],
- [disable ARM SIMD fast paths])],
- [enable_arm_simd=$enableval], [enable_arm_simd=auto])
-
-if test $enable_arm_simd = no ; then
- have_arm_simd=disabled
-fi
-
-if test $have_arm_simd = yes ; then
- AC_DEFINE(USE_ARM_SIMD, 1, [use ARM SIMD assembly optimizations])
-fi
-
-AM_CONDITIONAL(USE_ARM_SIMD, test $have_arm_simd = yes)
-
-AC_MSG_RESULT($have_arm_simd)
-if test $enable_arm_simd = yes && test $have_arm_simd = no ; then
- AC_MSG_ERROR([ARM SIMD intrinsics not detected])
-fi
-
-dnl ==========================================================================
-dnl Check if assembler is gas compatible and supports NEON instructions
-have_arm_neon=no
-AC_MSG_CHECKING(whether to use ARM NEON assembler)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="-x assembler-with-cpp $CFLAGS"
-AC_COMPILE_IFELSE([[
-.text
-.fpu neon
-.arch armv7a
-.object_arch armv4
-.eabi_attribute 10, 0
-.arm
-.altmacro
-#ifndef __ARM_EABI__
-#error EABI is required (to be sure that calling conventions are compatible)
-#endif
-pld [r0]
-vmovn.u16 d0, q0]], have_arm_neon=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(arm-neon,
- [AC_HELP_STRING([--disable-arm-neon],
- [disable ARM NEON fast paths])],
- [enable_arm_neon=$enableval], [enable_arm_neon=auto])
-
-if test $enable_arm_neon = no ; then
- have_arm_neon=disabled
-fi
-
-if test $have_arm_neon = yes ; then
- AC_DEFINE(USE_ARM_NEON, 1, [use ARM NEON assembly optimizations])
-fi
-
-AM_CONDITIONAL(USE_ARM_NEON, test $have_arm_neon = yes)
-
-AC_MSG_RESULT($have_arm_neon)
-if test $enable_arm_neon = yes && test $have_arm_neon = no ; then
- AC_MSG_ERROR([ARM NEON intrinsics not detected])
-fi
-
-dnl =========================================================================================
-dnl Check for GNU-style inline assembly support
-
-have_gcc_inline_asm=no
-AC_MSG_CHECKING(whether to use GNU-style inline assembler)
-AC_COMPILE_IFELSE([
-int main () {
- /* Most modern architectures have a NOP instruction, so this is a fairly generic test. */
- asm volatile ( "\tnop\n" : : : "cc", "memory" );
- return 0;
-}], have_gcc_inline_asm=yes)
-
-AC_ARG_ENABLE(gcc-inline-asm,
- [AC_HELP_STRING([--disable-gcc-inline-asm],
- [disable GNU-style inline assembler])],
- [enable_gcc_inline_asm=$enableval], [enable_gcc_inline_asm=auto])
-
-if test $enable_gcc_inline_asm = no ; then
- have_gcc_inline_asm=disabled
-fi
-
-if test $have_gcc_inline_asm = yes ; then
- AC_DEFINE(USE_GCC_INLINE_ASM, 1, [use GNU-style inline assembler])
-fi
-
-AC_MSG_RESULT($have_gcc_inline_asm)
-if test $enable_gcc_inline_asm = yes && test $have_gcc_inline_asm = no ; then
- AC_MSG_ERROR([GNU-style inline assembler not detected])
-fi
-
-AM_CONDITIONAL(USE_GCC_INLINE_ASM, test $have_gcc_inline_asm = yes)
-
-dnl ==============================================
-dnl Timers
-
-AC_ARG_ENABLE(timers,
- [AC_HELP_STRING([--enable-timers],
- [enable TIMER_BEGIN and TIMER_END macros [default=no]])],
- [enable_timers=$enableval], [enable_timers=no])
-
-if test $enable_timers = yes ; then
- AC_DEFINE(PIXMAN_TIMERS, 1, [enable TIMER_BEGIN/TIMER_END macros])
-fi
-AC_SUBST(PIXMAN_TIMERS)
-
-dnl ===================================
-dnl GTK+
-
-AC_ARG_ENABLE(gtk,
- [AC_HELP_STRING([--enable-gtk],
- [enable tests using GTK+ [default=auto]])],
- [enable_gtk=$enableval], [enable_gtk=auto])
-
-PKG_PROG_PKG_CONFIG
-
-if test $enable_gtk = yes ; then
- AC_CHECK_LIB([pixman-1], [pixman_version_string])
- PKG_CHECK_MODULES(GTK, [gtk+-2.0 pixman-1])
-fi
-
-if test $enable_gtk = auto ; then
- AC_CHECK_LIB([pixman-1], [pixman_version_string], [enable_gtk=auto], [enable_gtk=no])
-fi
-
-if test $enable_gtk = auto ; then
- PKG_CHECK_MODULES(GTK, [gtk+-2.0 pixman-1], [enable_gtk=yes], [enable_gtk=no])
-fi
-
-AM_CONDITIONAL(HAVE_GTK, [test "x$enable_gtk" = xyes])
-
-AC_SUBST(GTK_CFLAGS)
-AC_SUBST(GTK_LIBS)
-AC_SUBST(DEP_CFLAGS)
-AC_SUBST(DEP_LIBS)
-
-dnl =====================================
-dnl posix_memalign, sigaction, alarm, gettimeofday
-
-AC_CHECK_FUNC(posix_memalign, have_posix_memalign=yes, have_posix_memalign=no)
-if test x$have_posix_memalign = xyes; then
- AC_DEFINE(HAVE_POSIX_MEMALIGN, 1, [Whether we have posix_memalign()])
-fi
-
-AC_CHECK_FUNC(sigaction, have_sigaction=yes, have_sigaction=no)
-if test x$have_sigaction = xyes; then
- AC_DEFINE(HAVE_SIGACTION, 1, [Whether we have sigaction()])
-fi
-
-AC_CHECK_FUNC(alarm, have_alarm=yes, have_alarm=no)
-if test x$have_alarm = xyes; then
- AC_DEFINE(HAVE_ALARM, 1, [Whether we have alarm()])
-fi
-
-AC_CHECK_HEADER([sys/mman.h],
- [AC_DEFINE(HAVE_SYS_MMAN_H, [1], [Define to 1 if we have <sys/mman.h>])])
-
-AC_CHECK_FUNC(mprotect, have_mprotect=yes, have_mprotect=no)
-if test x$have_mprotect = xyes; then
- AC_DEFINE(HAVE_MPROTECT, 1, [Whether we have mprotect()])
-fi
-
-AC_CHECK_FUNC(getpagesize, have_getpagesize=yes, have_getpagesize=no)
-if test x$have_getpagesize = xyes; then
- AC_DEFINE(HAVE_GETPAGESIZE, 1, [Whether we have getpagesize()])
-fi
-
-AC_CHECK_HEADER([fenv.h],
- [AC_DEFINE(HAVE_FENV_H, [1], [Define to 1 if we have <fenv.h>])])
-
-AC_CHECK_LIB(m, feenableexcept, have_feenableexcept=yes, have_feenableexcept=no)
-if test x$have_feenableexcept = xyes; then
- AC_DEFINE(HAVE_FEENABLEEXCEPT, 1, [Whether we have feenableexcept()])
-fi
-
-AC_CHECK_FUNC(gettimeofday, have_gettimeofday=yes, have_gettimeofday=no)
-AC_CHECK_HEADER(sys/time.h, have_sys_time_h=yes, have_sys_time_h=no)
-if test x$have_gettimeofday = xyes && test x$have_sys_time_h = xyes; then
- AC_DEFINE(HAVE_GETTIMEOFDAY, 1, [Whether we have gettimeofday()])
-fi
-
-dnl =====================================
-dnl Thread local storage
-
-support_for__thread=no
-
-AC_MSG_CHECKING(for __thread)
-AC_LINK_IFELSE([
-#if defined(__MINGW32__) && !(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))
-#error This MinGW version has broken __thread support
-#endif
-#ifdef __OpenBSD__
-#error OpenBSD has broken __thread support
-#endif
-static __thread int x ;
-int main () { x = 123; return x; }
-], support_for__thread=yes)
-
-if test $support_for__thread = yes; then
- AC_DEFINE([TOOLCHAIN_SUPPORTS__THREAD],[],[Whether the tool chain supports __thread])
-fi
-
-AC_MSG_RESULT($support_for__thread)
-
-dnl
-dnl posix tls
-dnl
-
-m4_define([pthread_test_program],[dnl
-#include <stdlib.h>
-#include <pthread.h>
-
-static pthread_once_t once_control = PTHREAD_ONCE_INIT;
-static pthread_key_t key;
-
-static void
-make_key (void)
-{
- pthread_key_create (&key, NULL);
-}
-
-int
-main ()
-{
- void *value = NULL;
-
- if (pthread_once (&once_control, make_key) != 0)
- {
- value = NULL;
- }
- else
- {
- value = pthread_getspecific (key);
- if (!value)
- {
- value = malloc (100);
- pthread_setspecific (key, value);
- }
- }
- return 0;
-}
-])
-
-AC_DEFUN([PIXMAN_CHECK_PTHREAD],[dnl
- if test "z$support_for_pthread_setspecific" != "zyes"; then
- PIXMAN_LINK_WITH_ENV(
- [$1], [pthread_test_program],
- [PTHREAD_CFLAGS="$CFLAGS"
- PTHREAD_LIBS="$LIBS"
- PTHREAD_LDFLAGS="$LDFLAGS"
- support_for_pthread_setspecific=yes])
- fi
-])
-
-if test $support_for__thread = no; then
- support_for_pthread_setspecific=no
-
- AC_MSG_CHECKING(for pthread_setspecific)
-
- PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LIBS="-lpthread"])
- PIXMAN_CHECK_PTHREAD([CFLAGS="-pthread"; LDFLAGS="-pthread"])
- PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LDFLAGS="-lroot"])
-
- if test $support_for_pthread_setspecific = yes; then
- CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
- AC_DEFINE([HAVE_PTHREAD_SETSPECIFIC], [], [Whether pthread_setspecific() is supported])
- fi
-
- AC_MSG_RESULT($support_for_pthread_setspecific);
-fi
-
-AC_SUBST(TOOLCHAIN_SUPPORTS__THREAD)
-AC_SUBST(HAVE_PTHREAD_SETSPECIFIC)
-AC_SUBST(PTHREAD_LDFLAGS)
-AC_SUBST(PTHREAD_LIBS)
-
-dnl =====================================
-dnl __attribute__((constructor))
-
-support_for_attribute_constructor=no
-
-AC_MSG_CHECKING(for __attribute__((constructor)))
-AC_LINK_IFELSE([
-#if defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7))
-/* attribute 'constructor' is supported since gcc 2.7, but some compilers
- * may only pretend to be gcc, so let's try to actually use it
- */
-static int x = 1;
-static void __attribute__((constructor)) constructor_function () { x = 0; }
-int main (void) { return x; }
-#else
-#error not gcc or gcc version is older than 2.7
-#endif
-], support_for_attribute_constructor=yes)
-
-if test x$support_for_attribute_constructor = xyes; then
- AC_DEFINE([TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR],
- [],[Whether the tool chain supports __attribute__((constructor))])
-fi
-
-AC_MSG_RESULT($support_for_attribute_constructor)
-AC_SUBST(TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR)
-
-AC_OUTPUT([pixman-1.pc
- pixman-1-uninstalled.pc
- Makefile
- pixman/Makefile
- pixman/pixman-version.h
- test/Makefile])
+dnl Copyright 2005 Red Hat, Inc.
+dnl
+dnl Permission to use, copy, modify, distribute, and sell this software and its
+dnl documentation for any purpose is hereby granted without fee, provided that
+dnl the above copyright notice appear in all copies and that both that
+dnl copyright notice and this permission notice appear in supporting
+dnl documentation, and that the name of Red Hat not be used in
+dnl advertising or publicity pertaining to distribution of the software without
+dnl specific, written prior permission. Red Hat makes no
+dnl representations about the suitability of this software for any purpose. It
+dnl is provided "as is" without express or implied warranty.
+dnl
+dnl RED HAT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+dnl INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+dnl EVENT SHALL RED HAT BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+dnl CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+dnl DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+dnl TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+dnl PERFORMANCE OF THIS SOFTWARE.
+dnl
+dnl Process this file with autoconf to create configure.
+
+AC_PREREQ([2.57])
+
+# Pixman versioning scheme
+#
+# - The version in git has an odd MICRO version number
+#
+# - Released versions both development and stable have an even MICRO
+# version number
+#
+# - Released development versions have an odd MINOR number
+#
+# - Released stable versions have an even MINOR number
+#
+# - Versions that break ABI must have a new MAJOR number
+#
+# - If you break the ABI, then at least this must be done:
+#
+# - increment MAJOR
+#
+# - In the first development release where you break ABI, find
+# all instances of "pixman-n" and change them to pixman-(n+1)
+#
+# This needs to be done at least in
+# configure.ac
+# all Makefile.am's
+# pixman-n.pc.in
+#
+# This ensures that binary incompatible versions can be installed
+# in parallel. See http://www106.pair.com/rhp/parallel.html for
+# more information
+#
+
+m4_define([pixman_major], 0)
+m4_define([pixman_minor], 21)
+m4_define([pixman_micro], 3)
+
+m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
+
+AC_INIT(pixman, pixman_version, [pixman@lists.freedesktop.org], pixman)
+AM_INIT_AUTOMAKE([foreign dist-bzip2])
+
+# Suppress verbose compile lines
+m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
+
+AM_CONFIG_HEADER(config.h)
+
+AC_CANONICAL_HOST
+
+test_CFLAGS=${CFLAGS+set} # We may override autoconf default CFLAGS.
+
+AC_PROG_CC
+AM_PROG_AS
+AC_PROG_LIBTOOL
+AC_CHECK_FUNCS([getisax])
+AC_C_BIGENDIAN
+AC_C_INLINE
+
+dnl PIXMAN_LINK_WITH_ENV(env-setup, program, true-action, false-action)
+dnl
+dnl Compiles and links the given program in the environment setup by env-setup
+dnl and executes true-action on success and false-action on failure.
+AC_DEFUN([PIXMAN_LINK_WITH_ENV],[dnl
+ save_CFLAGS="$CFLAGS"
+ save_LDFLAGS="$LDFLAGS"
+ save_LIBS="$LIBS"
+ CFLAGS=""
+ LDFLAGS=""
+ LIBS=""
+ $1
+ AC_LINK_IFELSE(
+ [$2],
+ [pixman_cc_stderr=`test -f conftest.err && cat conftest.err`
+ pixman_cc_flag=yes],
+ [pixman_cc_stderr=`test -f conftest.err && cat conftest.err`
+ pixman_cc_flag=no])
+
+ if test "x$pixman_cc_stderr" != "x"; then
+ pixman_cc_flag=no
+ fi
+
+ if test "x$pixman_cc_flag" = "xyes"; then
+ ifelse([$3], , :, [$3])
+ else
+ ifelse([$4], , :, [$4])
+ fi
+ CFLAGS="$save_CFLAGS"
+ LDFLAGS="$save_LDFLAGS"
+ LIBS="$save_LIBS"
+])
+
+dnl Find a -Werror for catching warnings.
+WERROR=
+for w in -Werror -errwarn; do
+ if test "z$WERROR" = "z"; then
+ AC_MSG_CHECKING([whether the compiler supports $w])
+ PIXMAN_LINK_WITH_ENV(
+ [CFLAGS=$w],
+ [int main(int c, char **v) { (void)c; (void)v; return 0; }],
+ [WERROR=$w; yesno=yes], [yesno=no])
+ AC_MSG_RESULT($_yesno)
+ fi
+done
+
+dnl PIXMAN_CHECK_CFLAG(flag, [program])
+dnl Adds flag to CFLAGS if the given program links without warnings or errors.
+AC_DEFUN([PIXMAN_CHECK_CFLAG], [dnl
+ AC_MSG_CHECKING([whether the compiler supports $1])
+ PIXMAN_LINK_WITH_ENV(
+ [CFLAGS="$WERROR $1"],
+ [$2
+ int main(int c, char **v) { (void)c; (void)v; return 0; }
+ ],
+ [_yesno=yes],
+ [_yesno=no])
+ if test "x$_yesno" = xyes; then
+ CFLAGS="$CFLAGS $1"
+ fi
+ AC_MSG_RESULT($_yesno)
+])
+
+AC_CHECK_SIZEOF(long)
+
+# Checks for Sun Studio compilers
+AC_CHECK_DECL([__SUNPRO_C], [SUNCC="yes"], [SUNCC="no"])
+AC_CHECK_DECL([__amd64], [AMD64_ABI="yes"], [AMD64_ABI="no"])
+
+# Default CFLAGS to -O -g rather than just the -g from AC_PROG_CC
+# if we're using Sun Studio and neither the user nor a config.site
+# has set CFLAGS.
+if test $SUNCC = yes && \
+ test "$test_CFLAGS" == "" && \
+ test "$CFLAGS" = "-g"
+then
+ CFLAGS="-O -g"
+fi
+
+#
+# We ignore pixman_major in the version here because the major version should
+# always be encoded in the actual library name. Ie., the soname is:
+#
+# pixman-$(pixman_major).0.minor.micro
+#
+m4_define([lt_current], [pixman_minor])
+m4_define([lt_revision], [pixman_micro])
+m4_define([lt_age], [pixman_minor])
+
+LT_VERSION_INFO="lt_current:lt_revision:lt_age"
+
+PIXMAN_VERSION_MAJOR=pixman_major()
+AC_SUBST(PIXMAN_VERSION_MAJOR)
+PIXMAN_VERSION_MINOR=pixman_minor()
+AC_SUBST(PIXMAN_VERSION_MINOR)
+PIXMAN_VERSION_MICRO=pixman_micro()
+AC_SUBST(PIXMAN_VERSION_MICRO)
+
+AC_SUBST(LT_VERSION_INFO)
+
+# Check for dependencies
+
+PIXMAN_CHECK_CFLAG([-Wall])
+PIXMAN_CHECK_CFLAG([-fno-strict-aliasing])
+
+AC_PATH_PROG(PERL, perl, no)
+if test "x$PERL" = xno; then
+ AC_MSG_ERROR([Perl is required to build pixman.])
+fi
+AC_SUBST(PERL)
+
+dnl =========================================================================
+dnl OpenMP for the test suite?
+dnl
+
+# Check for OpenMP support (only supported by autoconf >=2.62)
+OPENMP_CFLAGS=
+m4_ifdef([AC_OPENMP], [AC_OPENMP])
+
+m4_define([openmp_test_program],[dnl
+#include <stdio.h>
+
+extern unsigned int lcg_seed;
+#pragma omp threadprivate(lcg_seed)
+unsigned int lcg_seed;
+
+unsigned function(unsigned a, unsigned b)
+{
+ lcg_seed ^= b;
+ return ((a + b) ^ a ) + lcg_seed;
+}
+
+int main(int argc, char **argv)
+{
+ int i;
+ int n1 = 0, n2 = argc;
+ unsigned checksum = 0;
+ int verbose = argv != NULL;
+ unsigned (*test_function)(unsigned, unsigned);
+ test_function = function;
+ #pragma omp parallel for reduction(+:checksum) default(none) \
+ shared(n1, n2, test_function, verbose)
+ for (i = n1; i < n2; i++)
+ {
+ unsigned crc = test_function (i, 0);
+ if (verbose)
+ printf ("%d: %08X\n", i, crc);
+ checksum += crc;
+ }
+ printf("%u\n", checksum);
+ return 0;
+}
+])
+
+PIXMAN_LINK_WITH_ENV(
+ [CFLAGS="$OPENMP_CFLAGS" LDFLAGS="$OPENMP_CFLAGS"],
+ [openmp_test_program],
+ [have_openmp=yes],
+ [have_openmp=no])
+if test "x$have_openmp" = "xyes"; then
+ AC_DEFINE(USE_OPENMP, 1, [use OpenMP in the test suite])
+else
+ OPENMP_CFLAGS=""
+fi
+AC_SUBST(OPENMP_CFLAGS)
+
+dnl =========================================================================
+dnl -fvisibility stuff
+
+PIXMAN_CHECK_CFLAG([-fvisibility=hidden], [dnl
+#if defined(__GNUC__) && (__GNUC__ >= 4)
+#ifdef _WIN32
+#error Have -fvisibility but it is ignored and generates a warning
+#endif
+#else
+error Need GCC 4.0 for visibility
+#endif
+])
+
+PIXMAN_CHECK_CFLAG([-xldscope=hidden], [dnl
+#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x550)
+#else
+error Need Sun Studio 8 for visibility
+#endif
+])
+
+dnl ===========================================================================
+dnl Check for MMX
+
+if test "x$MMX_CFLAGS" = "x" ; then
+ if test "x$SUNCC" = "xyes"; then
+ # Sun Studio doesn't have an -xarch=mmx flag, so we have to use sse
+ # but if we're building 64-bit, mmx & sse support is on by default and
+ # -xarch=sse throws an error instead
+ if test "$AMD64_ABI" = "no" ; then
+ MMX_CFLAGS="-xarch=sse"
+ fi
+ else
+ MMX_CFLAGS="-mmmx -Winline"
+ fi
+fi
+
+have_mmx_intrinsics=no
+AC_MSG_CHECKING(whether to use MMX intrinsics)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$MMX_CFLAGS $CFLAGS"
+AC_COMPILE_IFELSE([
+#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
+error "Need GCC >= 3.4 for MMX intrinsics"
+#endif
+#include <mmintrin.h>
+int main () {
+ __m64 v = _mm_cvtsi32_si64 (1);
+ return _mm_cvtsi64_si32 (v);
+}], have_mmx_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(mmx,
+ [AC_HELP_STRING([--disable-mmx],
+ [disable MMX fast paths])],
+ [enable_mmx=$enableval], [enable_mmx=auto])
+
+if test $enable_mmx = no ; then
+ have_mmx_intrinsics=disabled
+fi
+
+if test $have_mmx_intrinsics = yes ; then
+ AC_DEFINE(USE_MMX, 1, [use MMX compiler intrinsics])
+else
+ MMX_CFLAGS=
+fi
+
+AC_MSG_RESULT($have_mmx_intrinsics)
+if test $enable_mmx = yes && test $have_mmx_intrinsics = no ; then
+ AC_MSG_ERROR([MMX intrinsics not detected])
+fi
+
+AM_CONDITIONAL(USE_MMX, test $have_mmx_intrinsics = yes)
+
+dnl ===========================================================================
+dnl Check for SSE2
+
+if test "x$SSE2_CFLAGS" = "x" ; then
+ if test "x$SUNCC" = "xyes"; then
+ # SSE2 is enabled by default in the Sun Studio 64-bit environment
+ if test "$AMD64_ABI" = "no" ; then
+ SSE2_CFLAGS="-xarch=sse2"
+ fi
+ else
+ SSE2_CFLAGS="-mmmx -msse2 -Winline"
+ fi
+fi
+
+have_sse2_intrinsics=no
+AC_MSG_CHECKING(whether to use SSE2 intrinsics)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$SSE2_CFLAGS $CFLAGS"
+
+AC_COMPILE_IFELSE([
+#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
+# if !defined(__amd64__) && !defined(__x86_64__)
+# error "Need GCC >= 4.2 for SSE2 intrinsics on x86"
+# endif
+#endif
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+int main () {
+ __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c;
+ c = _mm_xor_si128 (a, b);
+ return 0;
+}], have_sse2_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(sse2,
+ [AC_HELP_STRING([--disable-sse2],
+ [disable SSE2 fast paths])],
+ [enable_sse2=$enableval], [enable_sse2=auto])
+
+if test $enable_sse2 = no ; then
+ have_sse2_intrinsics=disabled
+fi
+
+if test $have_sse2_intrinsics = yes ; then
+ AC_DEFINE(USE_SSE2, 1, [use SSE2 compiler intrinsics])
+fi
+
+AC_MSG_RESULT($have_sse2_intrinsics)
+if test $enable_sse2 = yes && test $have_sse2_intrinsics = no ; then
+ AC_MSG_ERROR([SSE2 intrinsics not detected])
+fi
+
+AM_CONDITIONAL(USE_SSE2, test $have_sse2_intrinsics = yes)
+
+dnl ===========================================================================
+dnl Other special flags needed when building code using MMX or SSE instructions
+case $host_os in
+ solaris*)
+ # When building 32-bit binaries, apply a mapfile to ensure that the
+ # binaries aren't flagged as only able to run on MMX+SSE capable CPUs
+ # since they check at runtime before using those instructions.
+ # Not all linkers grok the mapfile format so we check for that first.
+ if test "$AMD64_ABI" = "no" ; then
+ use_hwcap_mapfile=no
+ AC_MSG_CHECKING(whether to use a hardware capability map file)
+ hwcap_save_LDFLAGS="$LDFLAGS"
+ HWCAP_LDFLAGS='-Wl,-M,$(srcdir)/solaris-hwcap.mapfile'
+ LDFLAGS="$LDFLAGS -Wl,-M,pixman/solaris-hwcap.mapfile"
+ AC_LINK_IFELSE([int main() { return 0; }],
+ use_hwcap_mapfile=yes,
+ HWCAP_LDFLAGS="")
+ LDFLAGS="$hwcap_save_LDFLAGS"
+ AC_MSG_RESULT($use_hwcap_mapfile)
+ fi
+ if test "x$MMX_LDFLAGS" = "x" ; then
+ MMX_LDFLAGS="$HWCAP_LDFLAGS"
+ fi
+ if test "x$SSE2_LDFLAGS" = "x" ; then
+ SSE2_LDFLAGS="$HWCAP_LDFLAGS"
+ fi
+ ;;
+esac
+
+AC_SUBST(MMX_CFLAGS)
+AC_SUBST(MMX_LDFLAGS)
+AC_SUBST(SSE2_CFLAGS)
+AC_SUBST(SSE2_LDFLAGS)
+
+dnl ===========================================================================
+dnl Check for VMX/Altivec
+if test -n "`$CC -v 2>&1 | grep version | grep Apple`"; then
+ VMX_CFLAGS="-faltivec"
+else
+ VMX_CFLAGS="-maltivec -mabi=altivec"
+fi
+
+have_vmx_intrinsics=no
+AC_MSG_CHECKING(whether to use VMX/Altivec intrinsics)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$VMX_CFLAGS $CFLAGS"
+AC_COMPILE_IFELSE([
+#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
+error "Need GCC >= 3.4 for sane altivec support"
+#endif
+#include <altivec.h>
+int main () {
+ vector unsigned int v = vec_splat_u32 (1);
+ v = vec_sub (v, v);
+ return 0;
+}], have_vmx_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(vmx,
+ [AC_HELP_STRING([--disable-vmx],
+ [disable VMX fast paths])],
+ [enable_vmx=$enableval], [enable_vmx=auto])
+
+if test $enable_vmx = no ; then
+ have_vmx_intrinsics=disabled
+fi
+
+if test $have_vmx_intrinsics = yes ; then
+ AC_DEFINE(USE_VMX, 1, [use VMX compiler intrinsics])
+else
+ VMX_CFLAGS=
+fi
+
+AC_MSG_RESULT($have_vmx_intrinsics)
+if test $enable_vmx = yes && test $have_vmx_intrinsics = no ; then
+ AC_MSG_ERROR([VMX intrinsics not detected])
+fi
+
+AC_SUBST(VMX_CFLAGS)
+
+AM_CONDITIONAL(USE_VMX, test $have_vmx_intrinsics = yes)
+
+dnl ==========================================================================
+dnl Check if assembler is gas compatible and supports ARM SIMD instructions
+have_arm_simd=no
+AC_MSG_CHECKING(whether to use ARM SIMD assembler)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="-x assembler-with-cpp $CFLAGS"
+AC_COMPILE_IFELSE([[
+.text
+.arch armv6
+.object_arch armv4
+.arm
+.altmacro
+#ifndef __ARM_EABI__
+#error EABI is required (to be sure that calling conventions are compatible)
+#endif
+pld [r0]
+uqadd8 r0, r0, r0]], have_arm_simd=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(arm-simd,
+ [AC_HELP_STRING([--disable-arm-simd],
+ [disable ARM SIMD fast paths])],
+ [enable_arm_simd=$enableval], [enable_arm_simd=auto])
+
+if test $enable_arm_simd = no ; then
+ have_arm_simd=disabled
+fi
+
+if test $have_arm_simd = yes ; then
+ AC_DEFINE(USE_ARM_SIMD, 1, [use ARM SIMD assembly optimizations])
+fi
+
+AM_CONDITIONAL(USE_ARM_SIMD, test $have_arm_simd = yes)
+
+AC_MSG_RESULT($have_arm_simd)
+if test $enable_arm_simd = yes && test $have_arm_simd = no ; then
+ AC_MSG_ERROR([ARM SIMD intrinsics not detected])
+fi
+
+dnl ==========================================================================
+dnl Check if assembler is gas compatible and supports NEON instructions
+have_arm_neon=no
+AC_MSG_CHECKING(whether to use ARM NEON assembler)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="-x assembler-with-cpp $CFLAGS"
+AC_COMPILE_IFELSE([[
+.text
+.fpu neon
+.arch armv7a
+.object_arch armv4
+.eabi_attribute 10, 0
+.arm
+.altmacro
+#ifndef __ARM_EABI__
+#error EABI is required (to be sure that calling conventions are compatible)
+#endif
+pld [r0]
+vmovn.u16 d0, q0]], have_arm_neon=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(arm-neon,
+ [AC_HELP_STRING([--disable-arm-neon],
+ [disable ARM NEON fast paths])],
+ [enable_arm_neon=$enableval], [enable_arm_neon=auto])
+
+if test $enable_arm_neon = no ; then
+ have_arm_neon=disabled
+fi
+
+if test $have_arm_neon = yes ; then
+ AC_DEFINE(USE_ARM_NEON, 1, [use ARM NEON assembly optimizations])
+fi
+
+AM_CONDITIONAL(USE_ARM_NEON, test $have_arm_neon = yes)
+
+AC_MSG_RESULT($have_arm_neon)
+if test $enable_arm_neon = yes && test $have_arm_neon = no ; then
+ AC_MSG_ERROR([ARM NEON intrinsics not detected])
+fi
+
+dnl =========================================================================================
+dnl Check for GNU-style inline assembly support
+
+have_gcc_inline_asm=no
+AC_MSG_CHECKING(whether to use GNU-style inline assembler)
+AC_COMPILE_IFELSE([
+int main () {
+ /* Most modern architectures have a NOP instruction, so this is a fairly generic test. */
+ asm volatile ( "\tnop\n" : : : "cc", "memory" );
+ return 0;
+}], have_gcc_inline_asm=yes)
+
+AC_ARG_ENABLE(gcc-inline-asm,
+ [AC_HELP_STRING([--disable-gcc-inline-asm],
+ [disable GNU-style inline assembler])],
+ [enable_gcc_inline_asm=$enableval], [enable_gcc_inline_asm=auto])
+
+if test $enable_gcc_inline_asm = no ; then
+ have_gcc_inline_asm=disabled
+fi
+
+if test $have_gcc_inline_asm = yes ; then
+ AC_DEFINE(USE_GCC_INLINE_ASM, 1, [use GNU-style inline assembler])
+fi
+
+AC_MSG_RESULT($have_gcc_inline_asm)
+if test $enable_gcc_inline_asm = yes && test $have_gcc_inline_asm = no ; then
+ AC_MSG_ERROR([GNU-style inline assembler not detected])
+fi
+
+AM_CONDITIONAL(USE_GCC_INLINE_ASM, test $have_gcc_inline_asm = yes)
+
+dnl ==============================================
+dnl Static test programs
+
+AC_ARG_ENABLE(static-testprogs,
+ [AC_HELP_STRING([--enable-static-testprogs],
+ [build test programs as static binaries [default=no]])],
+ [enable_static_testprogs=$enableval], [enable_static_testprogs=no])
+
+TESTPROGS_EXTRA_LDFLAGS=
+if test "x$enable_static_testprogs" = "xyes" ; then
+ TESTPROGS_EXTRA_LDFLAGS="-all-static"
+fi
+AC_SUBST(TESTPROGS_EXTRA_LDFLAGS)
+
+dnl ==============================================
+dnl Timers
+
+AC_ARG_ENABLE(timers,
+ [AC_HELP_STRING([--enable-timers],
+ [enable TIMER_BEGIN and TIMER_END macros [default=no]])],
+ [enable_timers=$enableval], [enable_timers=no])
+
+if test $enable_timers = yes ; then
+ AC_DEFINE(PIXMAN_TIMERS, 1, [enable TIMER_BEGIN/TIMER_END macros])
+fi
+AC_SUBST(PIXMAN_TIMERS)
+
+dnl ===================================
+dnl GTK+
+
+AC_ARG_ENABLE(gtk,
+ [AC_HELP_STRING([--enable-gtk],
+ [enable tests using GTK+ [default=auto]])],
+ [enable_gtk=$enableval], [enable_gtk=auto])
+
+PKG_PROG_PKG_CONFIG
+
+if test $enable_gtk = yes ; then
+ AC_CHECK_LIB([pixman-1], [pixman_version_string])
+ PKG_CHECK_MODULES(GTK, [gtk+-2.0 pixman-1])
+fi
+
+if test $enable_gtk = auto ; then
+ AC_CHECK_LIB([pixman-1], [pixman_version_string], [enable_gtk=auto], [enable_gtk=no])
+fi
+
+if test $enable_gtk = auto ; then
+ PKG_CHECK_MODULES(GTK, [gtk+-2.0 pixman-1], [enable_gtk=yes], [enable_gtk=no])
+fi
+
+AM_CONDITIONAL(HAVE_GTK, [test "x$enable_gtk" = xyes])
+
+AC_SUBST(GTK_CFLAGS)
+AC_SUBST(GTK_LIBS)
+AC_SUBST(DEP_CFLAGS)
+AC_SUBST(DEP_LIBS)
+
+dnl =====================================
+dnl posix_memalign, sigaction, alarm, gettimeofday
+
+AC_CHECK_FUNC(posix_memalign, have_posix_memalign=yes, have_posix_memalign=no)
+if test x$have_posix_memalign = xyes; then
+ AC_DEFINE(HAVE_POSIX_MEMALIGN, 1, [Whether we have posix_memalign()])
+fi
+
+AC_CHECK_FUNC(sigaction, have_sigaction=yes, have_sigaction=no)
+if test x$have_sigaction = xyes; then
+ AC_DEFINE(HAVE_SIGACTION, 1, [Whether we have sigaction()])
+fi
+
+AC_CHECK_FUNC(alarm, have_alarm=yes, have_alarm=no)
+if test x$have_alarm = xyes; then
+ AC_DEFINE(HAVE_ALARM, 1, [Whether we have alarm()])
+fi
+
+AC_CHECK_HEADER([sys/mman.h],
+ [AC_DEFINE(HAVE_SYS_MMAN_H, [1], [Define to 1 if we have <sys/mman.h>])])
+
+AC_CHECK_FUNC(mprotect, have_mprotect=yes, have_mprotect=no)
+if test x$have_mprotect = xyes; then
+ AC_DEFINE(HAVE_MPROTECT, 1, [Whether we have mprotect()])
+fi
+
+AC_CHECK_FUNC(getpagesize, have_getpagesize=yes, have_getpagesize=no)
+if test x$have_getpagesize = xyes; then
+ AC_DEFINE(HAVE_GETPAGESIZE, 1, [Whether we have getpagesize()])
+fi
+
+AC_CHECK_HEADER([fenv.h],
+ [AC_DEFINE(HAVE_FENV_H, [1], [Define to 1 if we have <fenv.h>])])
+
+AC_CHECK_LIB(m, feenableexcept, have_feenableexcept=yes, have_feenableexcept=no)
+if test x$have_feenableexcept = xyes; then
+ AC_DEFINE(HAVE_FEENABLEEXCEPT, 1, [Whether we have feenableexcept()])
+fi
+
+AC_CHECK_FUNC(gettimeofday, have_gettimeofday=yes, have_gettimeofday=no)
+AC_CHECK_HEADER(sys/time.h, have_sys_time_h=yes, have_sys_time_h=no)
+if test x$have_gettimeofday = xyes && test x$have_sys_time_h = xyes; then
+ AC_DEFINE(HAVE_GETTIMEOFDAY, 1, [Whether we have gettimeofday()])
+fi
+
+dnl =====================================
+dnl Thread local storage
+
+support_for__thread=no
+
+AC_MSG_CHECKING(for __thread)
+AC_LINK_IFELSE([
+#if defined(__MINGW32__) && !(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))
+#error This MinGW version has broken __thread support
+#endif
+#ifdef __OpenBSD__
+#error OpenBSD has broken __thread support
+#endif
+static __thread int x ;
+int main () { x = 123; return x; }
+], support_for__thread=yes)
+
+if test $support_for__thread = yes; then
+ AC_DEFINE([TOOLCHAIN_SUPPORTS__THREAD],[],[Whether the tool chain supports __thread])
+fi
+
+AC_MSG_RESULT($support_for__thread)
+
+dnl
+dnl posix tls
+dnl
+
+m4_define([pthread_test_program],[dnl
+#include <stdlib.h>
+#include <pthread.h>
+
+static pthread_once_t once_control = PTHREAD_ONCE_INIT;
+static pthread_key_t key;
+
+static void
+make_key (void)
+{
+ pthread_key_create (&key, NULL);
+}
+
+int
+main ()
+{
+ void *value = NULL;
+
+ if (pthread_once (&once_control, make_key) != 0)
+ {
+ value = NULL;
+ }
+ else
+ {
+ value = pthread_getspecific (key);
+ if (!value)
+ {
+ value = malloc (100);
+ pthread_setspecific (key, value);
+ }
+ }
+ return 0;
+}
+])
+
+AC_DEFUN([PIXMAN_CHECK_PTHREAD],[dnl
+ if test "z$support_for_pthread_setspecific" != "zyes"; then
+ PIXMAN_LINK_WITH_ENV(
+ [$1], [pthread_test_program],
+ [PTHREAD_CFLAGS="$CFLAGS"
+ PTHREAD_LIBS="$LIBS"
+ PTHREAD_LDFLAGS="$LDFLAGS"
+ support_for_pthread_setspecific=yes])
+ fi
+])
+
+if test $support_for__thread = no; then
+ support_for_pthread_setspecific=no
+
+ AC_MSG_CHECKING(for pthread_setspecific)
+
+ PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LIBS="-lpthread"])
+ PIXMAN_CHECK_PTHREAD([CFLAGS="-pthread"; LDFLAGS="-pthread"])
+ PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LDFLAGS="-lroot"])
+
+ if test $support_for_pthread_setspecific = yes; then
+ CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+ AC_DEFINE([HAVE_PTHREAD_SETSPECIFIC], [], [Whether pthread_setspecific() is supported])
+ fi
+
+ AC_MSG_RESULT($support_for_pthread_setspecific);
+fi
+
+AC_SUBST(TOOLCHAIN_SUPPORTS__THREAD)
+AC_SUBST(HAVE_PTHREAD_SETSPECIFIC)
+AC_SUBST(PTHREAD_LDFLAGS)
+AC_SUBST(PTHREAD_LIBS)
+
+dnl =====================================
+dnl __attribute__((constructor))
+
+support_for_attribute_constructor=no
+
+AC_MSG_CHECKING(for __attribute__((constructor)))
+AC_LINK_IFELSE([
+#if defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7))
+/* attribute 'constructor' is supported since gcc 2.7, but some compilers
+ * may only pretend to be gcc, so let's try to actually use it
+ */
+static int x = 1;
+static void __attribute__((constructor)) constructor_function () { x = 0; }
+int main (void) { return x; }
+#else
+#error not gcc or gcc version is older than 2.7
+#endif
+], support_for_attribute_constructor=yes)
+
+if test x$support_for_attribute_constructor = xyes; then
+ AC_DEFINE([TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR],
+ [],[Whether the tool chain supports __attribute__((constructor))])
+fi
+
+AC_MSG_RESULT($support_for_attribute_constructor)
+AC_SUBST(TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR)
+
+AC_OUTPUT([pixman-1.pc
+ pixman-1-uninstalled.pc
+ Makefile
+ pixman/Makefile
+ pixman/pixman-version.h
+ test/Makefile])
diff --git a/pixman/pixman/pixman-bits-image.c b/pixman/pixman/pixman-bits-image.c
index 85ff2a339..c453e0ee6 100644
--- a/pixman/pixman/pixman-bits-image.c
+++ b/pixman/pixman/pixman-bits-image.c
@@ -1,1443 +1,1441 @@
-/*
- * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
- * 2005 Lars Knoll & Zack Rusin, Trolltech
- * 2008 Aaron Plattner, NVIDIA Corporation
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007, 2009 Red Hat, Inc.
- * Copyright © 2008 André Tupinambá <andrelrt@gmail.com>
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Keith Packard not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission. Keith Packard makes no
- * representations about the suitability of this software for any purpose. It
- * is provided "as is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "pixman-private.h"
-#include "pixman-combine32.h"
-
-/* Store functions */
-void
-_pixman_image_store_scanline_32 (bits_image_t * image,
- int x,
- int y,
- int width,
- const uint32_t *buffer)
-{
- image->store_scanline_32 (image, x, y, width, buffer);
-
- if (image->common.alpha_map)
- {
- x -= image->common.alpha_origin_x;
- y -= image->common.alpha_origin_y;
-
- image->common.alpha_map->store_scanline_32 (
- image->common.alpha_map, x, y, width, buffer);
- }
-}
-
-void
-_pixman_image_store_scanline_64 (bits_image_t * image,
- int x,
- int y,
- int width,
- const uint32_t *buffer)
-{
- image->store_scanline_64 (image, x, y, width, buffer);
-
- if (image->common.alpha_map)
- {
- x -= image->common.alpha_origin_x;
- y -= image->common.alpha_origin_y;
-
- image->common.alpha_map->store_scanline_64 (
- image->common.alpha_map, x, y, width, buffer);
- }
-}
-
-/* Fetch functions */
-
-static force_inline uint32_t
-fetch_pixel_no_alpha (bits_image_t *image,
- int x, int y, pixman_bool_t check_bounds)
-{
- if (check_bounds &&
- (x < 0 || x >= image->width || y < 0 || y >= image->height))
- {
- return 0;
- }
-
- return image->fetch_pixel_32 (image, x, y);
-}
-
-typedef uint32_t (* get_pixel_t) (bits_image_t *image,
- int x, int y, pixman_bool_t check_bounds);
-
-static force_inline void
-repeat (pixman_repeat_t repeat, int size, int *coord)
-{
- switch (repeat)
- {
- case PIXMAN_REPEAT_NORMAL:
- *coord = MOD (*coord, size);
- break;
-
- case PIXMAN_REPEAT_PAD:
- *coord = CLIP (*coord, 0, size - 1);
- break;
-
- case PIXMAN_REPEAT_REFLECT:
- *coord = MOD (*coord, size * 2);
-
- if (*coord >= size)
- *coord = size * 2 - *coord - 1;
- break;
-
- case PIXMAN_REPEAT_NONE:
- break;
-
- default:
- break;
- }
-}
-
-static force_inline uint32_t
-bits_image_fetch_pixel_nearest (bits_image_t *image,
- pixman_fixed_t x,
- pixman_fixed_t y,
- get_pixel_t get_pixel)
-{
- int x0 = pixman_fixed_to_int (x - pixman_fixed_e);
- int y0 = pixman_fixed_to_int (y - pixman_fixed_e);
-
- if (image->common.repeat != PIXMAN_REPEAT_NONE)
- {
- repeat (image->common.repeat, image->width, &x0);
- repeat (image->common.repeat, image->height, &y0);
-
- return get_pixel (image, x0, y0, FALSE);
- }
- else
- {
- return get_pixel (image, x0, y0, TRUE);
- }
-}
-
-#if SIZEOF_LONG > 4
-
-static force_inline uint32_t
-bilinear_interpolation (uint32_t tl, uint32_t tr,
- uint32_t bl, uint32_t br,
- int distx, int disty)
-{
- uint64_t distxy, distxiy, distixy, distixiy;
- uint64_t tl64, tr64, bl64, br64;
- uint64_t f, r;
-
- distxy = distx * disty;
- distxiy = distx * (256 - disty);
- distixy = (256 - distx) * disty;
- distixiy = (256 - distx) * (256 - disty);
-
- /* Alpha and Blue */
- tl64 = tl & 0xff0000ff;
- tr64 = tr & 0xff0000ff;
- bl64 = bl & 0xff0000ff;
- br64 = br & 0xff0000ff;
-
- f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
- r = f & 0x0000ff0000ff0000ull;
-
- /* Red and Green */
- tl64 = tl;
- tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull);
-
- tr64 = tr;
- tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull);
-
- bl64 = bl;
- bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull);
-
- br64 = br;
- br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull);
-
- f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
- r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull);
-
- return (uint32_t)(r >> 16);
-}
-
-#else
-
-static force_inline uint32_t
-bilinear_interpolation (uint32_t tl, uint32_t tr,
- uint32_t bl, uint32_t br,
- int distx, int disty)
-{
- int distxy, distxiy, distixy, distixiy;
- uint32_t f, r;
-
- distxy = distx * disty;
- distxiy = (distx << 8) - distxy; /* distx * (256 - disty) */
- distixy = (disty << 8) - distxy; /* disty * (256 - distx) */
- distixiy =
- 256 * 256 - (disty << 8) -
- (distx << 8) + distxy; /* (256 - distx) * (256 - disty) */
-
- /* Blue */
- r = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
- + (bl & 0x000000ff) * distixy + (br & 0x000000ff) * distxy;
-
- /* Green */
- f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
- + (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy;
- r |= f & 0xff000000;
-
- tl >>= 16;
- tr >>= 16;
- bl >>= 16;
- br >>= 16;
- r >>= 16;
-
- /* Red */
- f = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
- + (bl & 0x000000ff) * distixy + (br & 0x000000ff) * distxy;
- r |= f & 0x00ff0000;
-
- /* Alpha */
- f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
- + (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy;
- r |= f & 0xff000000;
-
- return r;
-}
-
-#endif
-
-static force_inline uint32_t
-bits_image_fetch_pixel_bilinear (bits_image_t *image,
- pixman_fixed_t x,
- pixman_fixed_t y,
- get_pixel_t get_pixel)
-{
- pixman_repeat_t repeat_mode = image->common.repeat;
- int width = image->width;
- int height = image->height;
- int x1, y1, x2, y2;
- uint32_t tl, tr, bl, br;
- int32_t distx, disty;
-
- x1 = x - pixman_fixed_1 / 2;
- y1 = y - pixman_fixed_1 / 2;
-
- distx = (x1 >> 8) & 0xff;
- disty = (y1 >> 8) & 0xff;
-
- x1 = pixman_fixed_to_int (x1);
- y1 = pixman_fixed_to_int (y1);
- x2 = x1 + 1;
- y2 = y1 + 1;
-
- if (repeat_mode != PIXMAN_REPEAT_NONE)
- {
- repeat (repeat_mode, width, &x1);
- repeat (repeat_mode, height, &y1);
- repeat (repeat_mode, width, &x2);
- repeat (repeat_mode, height, &y2);
-
- tl = get_pixel (image, x1, y1, FALSE);
- bl = get_pixel (image, x1, y2, FALSE);
- tr = get_pixel (image, x2, y1, FALSE);
- br = get_pixel (image, x2, y2, FALSE);
- }
- else
- {
- tl = get_pixel (image, x1, y1, TRUE);
- tr = get_pixel (image, x2, y1, TRUE);
- bl = get_pixel (image, x1, y2, TRUE);
- br = get_pixel (image, x2, y2, TRUE);
- }
-
- return bilinear_interpolation (tl, tr, bl, br, distx, disty);
-}
-
-static void
-bits_image_fetch_bilinear_no_repeat_8888 (pixman_image_t * ima,
- int offset,
- int line,
- int width,
- uint32_t * buffer,
- const uint32_t * mask)
-{
- bits_image_t *bits = &ima->bits;
- pixman_fixed_t x_top, x_bottom, x;
- pixman_fixed_t ux_top, ux_bottom, ux;
- pixman_vector_t v;
- uint32_t top_mask, bottom_mask;
- uint32_t *top_row;
- uint32_t *bottom_row;
- uint32_t *end;
- uint32_t zero[2] = { 0, 0 };
- int y, y1, y2;
- int disty;
- int mask_inc;
- int w;
-
- /* reference point is the center of the pixel */
- v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
- v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
- v.vector[2] = pixman_fixed_1;
-
- if (!pixman_transform_point_3d (bits->common.transform, &v))
- return;
-
- ux = ux_top = ux_bottom = bits->common.transform->matrix[0][0];
- x = x_top = x_bottom = v.vector[0] - pixman_fixed_1/2;
-
- y = v.vector[1] - pixman_fixed_1/2;
- disty = (y >> 8) & 0xff;
-
- /* Load the pointers to the first and second lines from the source
- * image that bilinear code must read.
- *
- * The main trick in this code is about the check if any line are
- * outside of the image;
- *
- * When I realize that a line (any one) is outside, I change
- * the pointer to a dummy area with zeros. Once I change this, I
- * must be sure the pointer will not change, so I set the
- * variables to each pointer increments inside the loop.
- */
- y1 = pixman_fixed_to_int (y);
- y2 = y1 + 1;
-
- if (y1 < 0 || y1 >= bits->height)
- {
- top_row = zero;
- x_top = 0;
- ux_top = 0;
- }
- else
- {
- top_row = bits->bits + y1 * bits->rowstride;
- x_top = x;
- ux_top = ux;
- }
-
- if (y2 < 0 || y2 >= bits->height)
- {
- bottom_row = zero;
- x_bottom = 0;
- ux_bottom = 0;
- }
- else
- {
- bottom_row = bits->bits + y2 * bits->rowstride;
- x_bottom = x;
- ux_bottom = ux;
- }
-
- /* Instead of checking whether the operation uses the mast in
- * each loop iteration, verify this only once and prepare the
- * variables to make the code smaller inside the loop.
- */
- if (!mask)
- {
- uint32_t mask_bits = 1;
-
- mask_inc = 0;
- mask = &mask_bits;
- }
- else
- {
- /* If have a mask, prepare the variables to check it */
- mask_inc = 1;
- }
-
- /* If both are zero, then the whole thing is zero */
- if (top_row == zero && bottom_row == zero)
- {
- memset (buffer, 0, width * sizeof (uint32_t));
- return;
- }
- else if (bits->format == PIXMAN_x8r8g8b8)
- {
- if (top_row == zero)
- {
- top_mask = 0;
- bottom_mask = 0xff000000;
- }
- else if (bottom_row == zero)
- {
- top_mask = 0xff000000;
- bottom_mask = 0;
- }
- else
- {
- top_mask = 0xff000000;
- bottom_mask = 0xff000000;
- }
- }
- else
- {
- top_mask = 0;
- bottom_mask = 0;
- }
-
- end = buffer + width;
-
- /* Zero fill to the left of the image */
- while (buffer < end && x < pixman_fixed_minus_1)
- {
- *buffer++ = 0;
- x += ux;
- x_top += ux_top;
- x_bottom += ux_bottom;
- mask += mask_inc;
- }
-
- /* Left edge
- */
- while (buffer < end && x < 0)
- {
- uint32_t tr, br;
- int32_t distx;
-
- tr = top_row[pixman_fixed_to_int (x_top) + 1] | top_mask;
- br = bottom_row[pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
-
- distx = (x >> 8) & 0xff;
-
- *buffer++ = bilinear_interpolation (0, tr, 0, br, distx, disty);
-
- x += ux;
- x_top += ux_top;
- x_bottom += ux_bottom;
- mask += mask_inc;
- }
-
- /* Main part */
- w = pixman_int_to_fixed (bits->width - 1);
-
- while (buffer < end && x < w)
- {
- if (*mask)
- {
- uint32_t tl, tr, bl, br;
- int32_t distx;
-
- tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
- tr = top_row [pixman_fixed_to_int (x_top) + 1] | top_mask;
- bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
- br = bottom_row [pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
-
- distx = (x >> 8) & 0xff;
-
- *buffer = bilinear_interpolation (tl, tr, bl, br, distx, disty);
- }
-
- buffer++;
- x += ux;
- x_top += ux_top;
- x_bottom += ux_bottom;
- mask += mask_inc;
- }
-
- /* Right Edge */
- w = pixman_int_to_fixed (bits->width);
- while (buffer < end && x < w)
- {
- if (*mask)
- {
- uint32_t tl, bl;
- int32_t distx;
-
- tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
- bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
-
- distx = (x >> 8) & 0xff;
-
- *buffer = bilinear_interpolation (tl, 0, bl, 0, distx, disty);
- }
-
- buffer++;
- x += ux;
- x_top += ux_top;
- x_bottom += ux_bottom;
- mask += mask_inc;
- }
-
- /* Zero fill to the left of the image */
- while (buffer < end)
- *buffer++ = 0;
-}
-
-static force_inline uint32_t
-bits_image_fetch_pixel_convolution (bits_image_t *image,
- pixman_fixed_t x,
- pixman_fixed_t y,
- get_pixel_t get_pixel)
-{
- pixman_fixed_t *params = image->common.filter_params;
- int x_off = (params[0] - pixman_fixed_1) >> 1;
- int y_off = (params[1] - pixman_fixed_1) >> 1;
- int32_t cwidth = pixman_fixed_to_int (params[0]);
- int32_t cheight = pixman_fixed_to_int (params[1]);
- int32_t srtot, sgtot, sbtot, satot;
- int32_t i, j, x1, x2, y1, y2;
- pixman_repeat_t repeat_mode = image->common.repeat;
- int width = image->width;
- int height = image->height;
-
- params += 2;
-
- x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off);
- y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off);
- x2 = x1 + cwidth;
- y2 = y1 + cheight;
-
- srtot = sgtot = sbtot = satot = 0;
-
- for (i = y1; i < y2; ++i)
- {
- for (j = x1; j < x2; ++j)
- {
- int rx = j;
- int ry = i;
-
- pixman_fixed_t f = *params;
-
- if (f)
- {
- uint32_t pixel;
-
- if (repeat_mode != PIXMAN_REPEAT_NONE)
- {
- repeat (repeat_mode, width, &rx);
- repeat (repeat_mode, height, &ry);
-
- pixel = get_pixel (image, rx, ry, FALSE);
- }
- else
- {
- pixel = get_pixel (image, rx, ry, TRUE);
- }
-
- srtot += RED_8 (pixel) * f;
- sgtot += GREEN_8 (pixel) * f;
- sbtot += BLUE_8 (pixel) * f;
- satot += ALPHA_8 (pixel) * f;
- }
-
- params++;
- }
- }
-
- satot >>= 16;
- srtot >>= 16;
- sgtot >>= 16;
- sbtot >>= 16;
-
- satot = CLIP (satot, 0, 0xff);
- srtot = CLIP (srtot, 0, 0xff);
- sgtot = CLIP (sgtot, 0, 0xff);
- sbtot = CLIP (sbtot, 0, 0xff);
-
- return ((satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot));
-}
-
-static force_inline uint32_t
-bits_image_fetch_pixel_filtered (bits_image_t *image,
- pixman_fixed_t x,
- pixman_fixed_t y,
- get_pixel_t get_pixel)
-{
- switch (image->common.filter)
- {
- case PIXMAN_FILTER_NEAREST:
- case PIXMAN_FILTER_FAST:
- return bits_image_fetch_pixel_nearest (image, x, y, get_pixel);
- break;
-
- case PIXMAN_FILTER_BILINEAR:
- case PIXMAN_FILTER_GOOD:
- case PIXMAN_FILTER_BEST:
- return bits_image_fetch_pixel_bilinear (image, x, y, get_pixel);
- break;
-
- case PIXMAN_FILTER_CONVOLUTION:
- return bits_image_fetch_pixel_convolution (image, x, y, get_pixel);
- break;
-
- default:
- break;
- }
-
- return 0;
-}
-
-static void
-bits_image_fetch_affine_no_alpha (pixman_image_t * image,
- int offset,
- int line,
- int width,
- uint32_t * buffer,
- const uint32_t * mask)
-{
- pixman_fixed_t x, y;
- pixman_fixed_t ux, uy;
- pixman_vector_t v;
- int i;
-
- /* reference point is the center of the pixel */
- v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
- v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
- v.vector[2] = pixman_fixed_1;
-
- if (image->common.transform)
- {
- if (!pixman_transform_point_3d (image->common.transform, &v))
- return;
-
- ux = image->common.transform->matrix[0][0];
- uy = image->common.transform->matrix[1][0];
- }
- else
- {
- ux = pixman_fixed_1;
- uy = 0;
- }
-
- x = v.vector[0];
- y = v.vector[1];
-
- for (i = 0; i < width; ++i)
- {
- if (!mask || mask[i])
- {
- buffer[i] = bits_image_fetch_pixel_filtered (
- &image->bits, x, y, fetch_pixel_no_alpha);
- }
-
- x += ux;
- y += uy;
- }
-}
-
-/* General fetcher */
-static force_inline uint32_t
-fetch_pixel_general (bits_image_t *image, int x, int y, pixman_bool_t check_bounds)
-{
- uint32_t pixel;
-
- if (check_bounds &&
- (x < 0 || x >= image->width || y < 0 || y >= image->height))
- {
- return 0;
- }
-
- pixel = image->fetch_pixel_32 (image, x, y);
-
- if (image->common.alpha_map)
- {
- uint32_t pixel_a;
-
- x -= image->common.alpha_origin_x;
- y -= image->common.alpha_origin_y;
-
- if (x < 0 || x >= image->common.alpha_map->width ||
- y < 0 || y >= image->common.alpha_map->height)
- {
- pixel_a = 0;
- }
- else
- {
- pixel_a = image->common.alpha_map->fetch_pixel_32 (
- image->common.alpha_map, x, y);
-
- pixel_a = ALPHA_8 (pixel_a);
- }
-
- pixel &= 0x00ffffff;
- pixel |= (pixel_a << 24);
- }
-
- return pixel;
-}
-
-static void
-bits_image_fetch_general (pixman_image_t * image,
- int offset,
- int line,
- int width,
- uint32_t * buffer,
- const uint32_t * mask)
-{
- pixman_fixed_t x, y, w;
- pixman_fixed_t ux, uy, uw;
- pixman_vector_t v;
- int i;
-
- /* reference point is the center of the pixel */
- v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
- v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
- v.vector[2] = pixman_fixed_1;
-
- if (image->common.transform)
- {
- if (!pixman_transform_point_3d (image->common.transform, &v))
- return;
-
- ux = image->common.transform->matrix[0][0];
- uy = image->common.transform->matrix[1][0];
- uw = image->common.transform->matrix[2][0];
- }
- else
- {
- ux = pixman_fixed_1;
- uy = 0;
- uw = 0;
- }
-
- x = v.vector[0];
- y = v.vector[1];
- w = v.vector[2];
-
- for (i = 0; i < width; ++i)
- {
- pixman_fixed_t x0, y0;
-
- if (!mask || mask[i])
- {
- if (w != 0)
- {
- x0 = ((pixman_fixed_48_16_t)x << 16) / w;
- y0 = ((pixman_fixed_48_16_t)y << 16) / w;
- }
- else
- {
- x0 = 0;
- y0 = 0;
- }
-
- buffer[i] = bits_image_fetch_pixel_filtered (
- &image->bits, x0, y0, fetch_pixel_general);
- }
-
- x += ux;
- y += uy;
- w += uw;
- }
-}
-
-static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
-
-typedef uint32_t (* convert_pixel_t) (const uint8_t *row, int x);
-
-static force_inline void
-bits_image_fetch_bilinear_affine (pixman_image_t * image,
- int offset,
- int line,
- int width,
- uint32_t * buffer,
- const uint32_t * mask,
-
- convert_pixel_t convert_pixel,
- pixman_format_code_t format,
- pixman_repeat_t repeat_mode)
-{
- pixman_fixed_t x, y;
- pixman_fixed_t ux, uy;
- pixman_vector_t v;
- bits_image_t *bits = &image->bits;
- int i;
-
- /* reference point is the center of the pixel */
- v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
- v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
- v.vector[2] = pixman_fixed_1;
-
- if (!pixman_transform_point_3d (image->common.transform, &v))
- return;
-
- ux = image->common.transform->matrix[0][0];
- uy = image->common.transform->matrix[1][0];
-
- x = v.vector[0];
- y = v.vector[1];
-
- for (i = 0; i < width; ++i)
- {
- int x1, y1, x2, y2;
- uint32_t tl, tr, bl, br;
- int32_t distx, disty;
- int width = image->bits.width;
- int height = image->bits.height;
- const uint8_t *row1;
- const uint8_t *row2;
-
- if (mask && !mask[i])
- goto next;
-
- x1 = x - pixman_fixed_1 / 2;
- y1 = y - pixman_fixed_1 / 2;
-
- distx = (x1 >> 8) & 0xff;
- disty = (y1 >> 8) & 0xff;
-
- y1 = pixman_fixed_to_int (y1);
- y2 = y1 + 1;
- x1 = pixman_fixed_to_int (x1);
- x2 = x1 + 1;
-
- if (repeat_mode != PIXMAN_REPEAT_NONE)
- {
- uint32_t mask;
-
- mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
-
- repeat (repeat_mode, width, &x1);
- repeat (repeat_mode, height, &y1);
- repeat (repeat_mode, width, &x2);
- repeat (repeat_mode, height, &y2);
-
- row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
- row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
-
- tl = convert_pixel (row1, x1) | mask;
- tr = convert_pixel (row1, x2) | mask;
- bl = convert_pixel (row2, x1) | mask;
- br = convert_pixel (row2, x2) | mask;
- }
- else
- {
- uint32_t mask1, mask2;
- int bpp;
-
- /* Note: PIXMAN_FORMAT_BPP() returns an unsigned value,
- * which means if you use it in expressions, those
- * expressions become unsigned themselves. Since
- * the variables below can be negative in some cases,
- * that will lead to crashes on 64 bit architectures.
- *
- * So this line makes sure bpp is signed
- */
- bpp = PIXMAN_FORMAT_BPP (format);
-
- if (x1 >= width || x2 < 0 || y1 >= height || y2 < 0)
- {
- buffer[i] = 0;
- goto next;
- }
-
- if (y2 == 0)
- {
- row1 = zero;
- mask1 = 0;
- }
- else
- {
- row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
- row1 += bpp / 8 * x1;
-
- mask1 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
- }
-
- if (y1 == height - 1)
- {
- row2 = zero;
- mask2 = 0;
- }
- else
- {
- row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
- row2 += bpp / 8 * x1;
-
- mask2 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
- }
-
- if (x2 == 0)
- {
- tl = 0;
- bl = 0;
- }
- else
- {
- tl = convert_pixel (row1, 0) | mask1;
- bl = convert_pixel (row2, 0) | mask2;
- }
-
- if (x1 == width - 1)
- {
- tr = 0;
- br = 0;
- }
- else
- {
- tr = convert_pixel (row1, 1) | mask1;
- br = convert_pixel (row2, 1) | mask2;
- }
- }
-
- buffer[i] = bilinear_interpolation (
- tl, tr, bl, br, distx, disty);
-
- next:
- x += ux;
- y += uy;
- }
-}
-
-static force_inline void
-bits_image_fetch_nearest_affine (pixman_image_t * image,
- int offset,
- int line,
- int width,
- uint32_t * buffer,
- const uint32_t * mask,
-
- convert_pixel_t convert_pixel,
- pixman_format_code_t format,
- pixman_repeat_t repeat_mode)
-{
- pixman_fixed_t x, y;
- pixman_fixed_t ux, uy;
- pixman_vector_t v;
- bits_image_t *bits = &image->bits;
- int i;
-
- /* reference point is the center of the pixel */
- v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
- v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
- v.vector[2] = pixman_fixed_1;
-
- if (!pixman_transform_point_3d (image->common.transform, &v))
- return;
-
- ux = image->common.transform->matrix[0][0];
- uy = image->common.transform->matrix[1][0];
-
- x = v.vector[0];
- y = v.vector[1];
-
- for (i = 0; i < width; ++i)
- {
- int width, height, x0, y0;
- const uint8_t *row;
-
- if (mask && !mask[i])
- goto next;
-
- width = image->bits.width;
- height = image->bits.height;
- x0 = pixman_fixed_to_int (x - pixman_fixed_e);
- y0 = pixman_fixed_to_int (y - pixman_fixed_e);
-
- if (repeat_mode == PIXMAN_REPEAT_NONE &&
- (y0 < 0 || y0 >= height || x0 < 0 || x0 >= width))
- {
- buffer[i] = 0;
- }
- else
- {
- uint32_t mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
-
- if (repeat_mode != PIXMAN_REPEAT_NONE)
- {
- repeat (repeat_mode, width, &x0);
- repeat (repeat_mode, height, &y0);
- }
-
- row = (uint8_t *)bits->bits + bits->rowstride * 4 * y0;
-
- buffer[i] = convert_pixel (row, x0) | mask;
- }
-
- next:
- x += ux;
- y += uy;
- }
-}
-
-static force_inline uint32_t
-convert_a8r8g8b8 (const uint8_t *row, int x)
-{
- return *(((uint32_t *)row) + x);
-}
-
-static force_inline uint32_t
-convert_x8r8g8b8 (const uint8_t *row, int x)
-{
- return *(((uint32_t *)row) + x);
-}
-
-static force_inline uint32_t
-convert_a8 (const uint8_t *row, int x)
-{
- return *(row + x) << 24;
-}
-
-static force_inline uint32_t
-convert_r5g6b5 (const uint8_t *row, int x)
-{
- return CONVERT_0565_TO_0888 (*((uint16_t *)row + x));
-}
-
-#define MAKE_BILINEAR_FETCHER(name, format, repeat_mode) \
- static void \
- bits_image_fetch_bilinear_affine_ ## name (pixman_image_t *image, \
- int offset, \
- int line, \
- int width, \
- uint32_t * buffer, \
- const uint32_t * mask) \
- { \
- bits_image_fetch_bilinear_affine (image, offset, line, \
- width, buffer, mask, \
- convert_ ## format, \
- PIXMAN_ ## format, \
- repeat_mode); \
- } \
- extern int no_such_variable
-
-#define MAKE_NEAREST_FETCHER(name, format, repeat_mode) \
- static void \
- bits_image_fetch_nearest_affine_ ## name (pixman_image_t *image, \
- int offset, \
- int line, \
- int width, \
- uint32_t * buffer, \
- const uint32_t * mask) \
- { \
- bits_image_fetch_nearest_affine (image, offset, line, \
- width, buffer, mask, \
- convert_ ## format, \
- PIXMAN_ ## format, \
- repeat_mode); \
- } \
- extern int no_such_variable
-
-#define MAKE_FETCHERS(name, format, repeat_mode) \
- MAKE_NEAREST_FETCHER (name, format, repeat_mode); \
- MAKE_BILINEAR_FETCHER (name, format, repeat_mode);
-
-MAKE_FETCHERS (pad_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_PAD);
-MAKE_FETCHERS (none_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_NONE);
-MAKE_FETCHERS (reflect_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_REFLECT);
-MAKE_FETCHERS (normal_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_NORMAL);
-MAKE_FETCHERS (pad_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_PAD);
-MAKE_FETCHERS (none_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_NONE);
-MAKE_FETCHERS (reflect_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_REFLECT);
-MAKE_FETCHERS (normal_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_NORMAL);
-MAKE_FETCHERS (pad_a8, a8, PIXMAN_REPEAT_PAD);
-MAKE_FETCHERS (none_a8, a8, PIXMAN_REPEAT_NONE);
-MAKE_FETCHERS (reflect_a8, a8, PIXMAN_REPEAT_REFLECT);
-MAKE_FETCHERS (normal_a8, a8, PIXMAN_REPEAT_NORMAL);
-MAKE_FETCHERS (pad_r5g6b5, r5g6b5, PIXMAN_REPEAT_PAD);
-MAKE_FETCHERS (none_r5g6b5, r5g6b5, PIXMAN_REPEAT_NONE);
-MAKE_FETCHERS (reflect_r5g6b5, r5g6b5, PIXMAN_REPEAT_REFLECT);
-MAKE_FETCHERS (normal_r5g6b5, r5g6b5, PIXMAN_REPEAT_NORMAL);
-
-static void
-bits_image_fetch_solid_32 (pixman_image_t * image,
- int x,
- int y,
- int width,
- uint32_t * buffer,
- const uint32_t * mask)
-{
- uint32_t color;
- uint32_t *end;
-
- color = image->bits.fetch_pixel_32 (&image->bits, 0, 0);
-
- end = buffer + width;
- while (buffer < end)
- *(buffer++) = color;
-}
-
-static void
-bits_image_fetch_solid_64 (pixman_image_t * image,
- int x,
- int y,
- int width,
- uint32_t * b,
- const uint32_t * unused)
-{
- uint64_t color;
- uint64_t *buffer = (uint64_t *)b;
- uint64_t *end;
-
- color = image->bits.fetch_pixel_64 (&image->bits, 0, 0);
-
- end = buffer + width;
- while (buffer < end)
- *(buffer++) = color;
-}
-
-static void
-bits_image_fetch_untransformed_repeat_none (bits_image_t *image,
- pixman_bool_t wide,
- int x,
- int y,
- int width,
- uint32_t * buffer)
-{
- uint32_t w;
-
- if (y < 0 || y >= image->height)
- {
- memset (buffer, 0, width * (wide? 8 : 4));
- return;
- }
-
- if (x < 0)
- {
- w = MIN (width, -x);
-
- memset (buffer, 0, w * (wide ? 8 : 4));
-
- width -= w;
- buffer += w * (wide? 2 : 1);
- x += w;
- }
-
- if (x < image->width)
- {
- w = MIN (width, image->width - x);
-
- if (wide)
- image->fetch_scanline_64 ((pixman_image_t *)image, x, y, w, buffer, NULL);
- else
- image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL);
-
- width -= w;
- buffer += w * (wide? 2 : 1);
- x += w;
- }
-
- memset (buffer, 0, width * (wide ? 8 : 4));
-}
-
-static void
-bits_image_fetch_untransformed_repeat_normal (bits_image_t *image,
- pixman_bool_t wide,
- int x,
- int y,
- int width,
- uint32_t * buffer)
-{
- uint32_t w;
-
- while (y < 0)
- y += image->height;
-
- while (y >= image->height)
- y -= image->height;
-
- while (width)
- {
- while (x < 0)
- x += image->width;
- while (x >= image->width)
- x -= image->width;
-
- w = MIN (width, image->width - x);
-
- if (wide)
- image->fetch_scanline_64 ((pixman_image_t *)image, x, y, w, buffer, NULL);
- else
- image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL);
-
- buffer += w * (wide? 2 : 1);
- x += w;
- width -= w;
- }
-}
-
-static void
-bits_image_fetch_untransformed_32 (pixman_image_t * image,
- int x,
- int y,
- int width,
- uint32_t * buffer,
- const uint32_t * mask)
-{
- if (image->common.repeat == PIXMAN_REPEAT_NONE)
- {
- bits_image_fetch_untransformed_repeat_none (
- &image->bits, FALSE, x, y, width, buffer);
- }
- else
- {
- bits_image_fetch_untransformed_repeat_normal (
- &image->bits, FALSE, x, y, width, buffer);
- }
-}
-
-static void
-bits_image_fetch_untransformed_64 (pixman_image_t * image,
- int x,
- int y,
- int width,
- uint32_t * buffer,
- const uint32_t * unused)
-{
- if (image->common.repeat == PIXMAN_REPEAT_NONE)
- {
- bits_image_fetch_untransformed_repeat_none (
- &image->bits, TRUE, x, y, width, buffer);
- }
- else
- {
- bits_image_fetch_untransformed_repeat_normal (
- &image->bits, TRUE, x, y, width, buffer);
- }
-}
-
-typedef struct
-{
- pixman_format_code_t format;
- uint32_t flags;
- fetch_scanline_t fetch_32;
- fetch_scanline_t fetch_64;
-} fetcher_info_t;
-
-static const fetcher_info_t fetcher_info[] =
-{
- { PIXMAN_solid,
- FAST_PATH_NO_ALPHA_MAP,
- bits_image_fetch_solid_32,
- bits_image_fetch_solid_64
- },
-
- { PIXMAN_any,
- (FAST_PATH_NO_ALPHA_MAP |
- FAST_PATH_ID_TRANSFORM |
- FAST_PATH_NO_CONVOLUTION_FILTER |
- FAST_PATH_NO_PAD_REPEAT |
- FAST_PATH_NO_REFLECT_REPEAT),
- bits_image_fetch_untransformed_32,
- bits_image_fetch_untransformed_64
- },
-
-#define FAST_BILINEAR_FLAGS \
- (FAST_PATH_NO_ALPHA_MAP | \
- FAST_PATH_NO_ACCESSORS | \
- FAST_PATH_HAS_TRANSFORM | \
- FAST_PATH_AFFINE_TRANSFORM | \
- FAST_PATH_X_UNIT_POSITIVE | \
- FAST_PATH_Y_UNIT_ZERO | \
- FAST_PATH_NONE_REPEAT | \
- FAST_PATH_BILINEAR_FILTER)
-
- { PIXMAN_a8r8g8b8,
- FAST_BILINEAR_FLAGS,
- bits_image_fetch_bilinear_no_repeat_8888,
- _pixman_image_get_scanline_generic_64
- },
-
- { PIXMAN_x8r8g8b8,
- FAST_BILINEAR_FLAGS,
- bits_image_fetch_bilinear_no_repeat_8888,
- _pixman_image_get_scanline_generic_64
- },
-
-#define GENERAL_BILINEAR_FLAGS \
- (FAST_PATH_NO_ALPHA_MAP | \
- FAST_PATH_NO_ACCESSORS | \
- FAST_PATH_HAS_TRANSFORM | \
- FAST_PATH_AFFINE_TRANSFORM | \
- FAST_PATH_BILINEAR_FILTER)
-
-#define GENERAL_NEAREST_FLAGS \
- (FAST_PATH_NO_ALPHA_MAP | \
- FAST_PATH_NO_ACCESSORS | \
- FAST_PATH_HAS_TRANSFORM | \
- FAST_PATH_AFFINE_TRANSFORM | \
- FAST_PATH_NEAREST_FILTER)
-
-#define BILINEAR_AFFINE_FAST_PATH(name, format, repeat) \
- { PIXMAN_ ## format, \
- GENERAL_BILINEAR_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \
- bits_image_fetch_bilinear_affine_ ## name, \
- _pixman_image_get_scanline_generic_64 \
- },
-
-#define NEAREST_AFFINE_FAST_PATH(name, format, repeat) \
- { PIXMAN_ ## format, \
- GENERAL_NEAREST_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \
- bits_image_fetch_nearest_affine_ ## name, \
- _pixman_image_get_scanline_generic_64 \
- },
-
-#define AFFINE_FAST_PATHS(name, format, repeat) \
- BILINEAR_AFFINE_FAST_PATH(name, format, repeat) \
- NEAREST_AFFINE_FAST_PATH(name, format, repeat)
-
- AFFINE_FAST_PATHS (pad_a8r8g8b8, a8r8g8b8, PAD)
- AFFINE_FAST_PATHS (none_a8r8g8b8, a8r8g8b8, NONE)
- AFFINE_FAST_PATHS (reflect_a8r8g8b8, a8r8g8b8, REFLECT)
- AFFINE_FAST_PATHS (normal_a8r8g8b8, a8r8g8b8, NORMAL)
- AFFINE_FAST_PATHS (pad_x8r8g8b8, x8r8g8b8, PAD)
- AFFINE_FAST_PATHS (none_x8r8g8b8, x8r8g8b8, NONE)
- AFFINE_FAST_PATHS (reflect_x8r8g8b8, x8r8g8b8, REFLECT)
- AFFINE_FAST_PATHS (normal_x8r8g8b8, x8r8g8b8, NORMAL)
- AFFINE_FAST_PATHS (pad_a8, a8, PAD)
- AFFINE_FAST_PATHS (none_a8, a8, NONE)
- AFFINE_FAST_PATHS (reflect_a8, a8, REFLECT)
- AFFINE_FAST_PATHS (normal_a8, a8, NORMAL)
- AFFINE_FAST_PATHS (pad_r5g6b5, r5g6b5, PAD)
- AFFINE_FAST_PATHS (none_r5g6b5, r5g6b5, NONE)
- AFFINE_FAST_PATHS (reflect_r5g6b5, r5g6b5, REFLECT)
- AFFINE_FAST_PATHS (normal_r5g6b5, r5g6b5, NORMAL)
-
- /* Affine, no alpha */
- { PIXMAN_any,
- (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_HAS_TRANSFORM | FAST_PATH_AFFINE_TRANSFORM),
- bits_image_fetch_affine_no_alpha,
- _pixman_image_get_scanline_generic_64
- },
-
- /* General */
- { PIXMAN_any, 0, bits_image_fetch_general, _pixman_image_get_scanline_generic_64 },
-
- { PIXMAN_null },
-};
-
-static void
-bits_image_property_changed (pixman_image_t *image)
-{
- uint32_t flags = image->common.flags;
- pixman_format_code_t format = image->common.extended_format_code;
- const fetcher_info_t *info;
-
- _pixman_bits_image_setup_accessors (&image->bits);
-
- info = fetcher_info;
- while (info->format != PIXMAN_null)
- {
- if ((info->format == format || info->format == PIXMAN_any) &&
- (info->flags & flags) == info->flags)
- {
- image->common.get_scanline_32 = info->fetch_32;
- image->common.get_scanline_64 = info->fetch_64;
- break;
- }
-
- info++;
- }
-}
-
-static uint32_t *
-create_bits (pixman_format_code_t format,
- int width,
- int height,
- int * rowstride_bytes)
-{
- int stride;
- int buf_size;
- int bpp;
-
- /* what follows is a long-winded way, avoiding any possibility of integer
- * overflows, of saying:
- * stride = ((width * bpp + 0x1f) >> 5) * sizeof (uint32_t);
- */
-
- bpp = PIXMAN_FORMAT_BPP (format);
- if (pixman_multiply_overflows_int (width, bpp))
- return NULL;
-
- stride = width * bpp;
- if (pixman_addition_overflows_int (stride, 0x1f))
- return NULL;
-
- stride += 0x1f;
- stride >>= 5;
-
- stride *= sizeof (uint32_t);
-
- if (pixman_multiply_overflows_int (height, stride))
- return NULL;
-
- buf_size = height * stride;
-
- if (rowstride_bytes)
- *rowstride_bytes = stride;
-
- return calloc (buf_size, 1);
-}
-
-PIXMAN_EXPORT pixman_image_t *
-pixman_image_create_bits (pixman_format_code_t format,
- int width,
- int height,
- uint32_t * bits,
- int rowstride_bytes)
-{
- pixman_image_t *image;
- uint32_t *free_me = NULL;
-
- /* must be a whole number of uint32_t's
- */
- return_val_if_fail (
- bits == NULL || (rowstride_bytes % sizeof (uint32_t)) == 0, NULL);
-
- return_val_if_fail (PIXMAN_FORMAT_BPP (format) >= PIXMAN_FORMAT_DEPTH (format), NULL);
-
- if (!bits && width && height)
- {
- free_me = bits = create_bits (format, width, height, &rowstride_bytes);
- if (!bits)
- return NULL;
- }
-
- image = _pixman_image_allocate ();
-
- if (!image)
- {
- if (free_me)
- free (free_me);
-
- return NULL;
- }
-
- image->type = BITS;
- image->bits.format = format;
- image->bits.width = width;
- image->bits.height = height;
- image->bits.bits = bits;
- image->bits.free_me = free_me;
- image->bits.read_func = NULL;
- image->bits.write_func = NULL;
-
- /* The rowstride is stored in number of uint32_t */
- image->bits.rowstride = rowstride_bytes / (int) sizeof (uint32_t);
-
- image->bits.indexed = NULL;
-
- image->common.property_changed = bits_image_property_changed;
-
- _pixman_image_reset_clip_region (image);
-
- return image;
-}
+/*
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ * 2005 Lars Knoll & Zack Rusin, Trolltech
+ * 2008 Aaron Plattner, NVIDIA Corporation
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007, 2009 Red Hat, Inc.
+ * Copyright © 2008 André Tupinambá <andrelrt@gmail.com>
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. Keith Packard makes no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+
+/* Store functions */
+void
+_pixman_image_store_scanline_32 (bits_image_t * image,
+ int x,
+ int y,
+ int width,
+ const uint32_t *buffer)
+{
+ image->store_scanline_32 (image, x, y, width, buffer);
+
+ if (image->common.alpha_map)
+ {
+ x -= image->common.alpha_origin_x;
+ y -= image->common.alpha_origin_y;
+
+ image->common.alpha_map->store_scanline_32 (
+ image->common.alpha_map, x, y, width, buffer);
+ }
+}
+
+void
+_pixman_image_store_scanline_64 (bits_image_t * image,
+ int x,
+ int y,
+ int width,
+ const uint32_t *buffer)
+{
+ image->store_scanline_64 (image, x, y, width, buffer);
+
+ if (image->common.alpha_map)
+ {
+ x -= image->common.alpha_origin_x;
+ y -= image->common.alpha_origin_y;
+
+ image->common.alpha_map->store_scanline_64 (
+ image->common.alpha_map, x, y, width, buffer);
+ }
+}
+
+/* Fetch functions */
+
+static force_inline uint32_t
+fetch_pixel_no_alpha (bits_image_t *image,
+ int x, int y, pixman_bool_t check_bounds)
+{
+ if (check_bounds &&
+ (x < 0 || x >= image->width || y < 0 || y >= image->height))
+ {
+ return 0;
+ }
+
+ return image->fetch_pixel_32 (image, x, y);
+}
+
+typedef uint32_t (* get_pixel_t) (bits_image_t *image,
+ int x, int y, pixman_bool_t check_bounds);
+
+static force_inline void
+repeat (pixman_repeat_t repeat, int size, int *coord)
+{
+ switch (repeat)
+ {
+ case PIXMAN_REPEAT_NORMAL:
+ *coord = MOD (*coord, size);
+ break;
+
+ case PIXMAN_REPEAT_PAD:
+ *coord = CLIP (*coord, 0, size - 1);
+ break;
+
+ case PIXMAN_REPEAT_REFLECT:
+ *coord = MOD (*coord, size * 2);
+
+ if (*coord >= size)
+ *coord = size * 2 - *coord - 1;
+ break;
+
+ case PIXMAN_REPEAT_NONE:
+ break;
+
+ default:
+ break;
+ }
+}
+
+static force_inline uint32_t
+bits_image_fetch_pixel_nearest (bits_image_t *image,
+ pixman_fixed_t x,
+ pixman_fixed_t y,
+ get_pixel_t get_pixel)
+{
+ int x0 = pixman_fixed_to_int (x - pixman_fixed_e);
+ int y0 = pixman_fixed_to_int (y - pixman_fixed_e);
+
+ if (image->common.repeat != PIXMAN_REPEAT_NONE)
+ {
+ repeat (image->common.repeat, image->width, &x0);
+ repeat (image->common.repeat, image->height, &y0);
+
+ return get_pixel (image, x0, y0, FALSE);
+ }
+ else
+ {
+ return get_pixel (image, x0, y0, TRUE);
+ }
+}
+
+#if SIZEOF_LONG > 4
+
+static force_inline uint32_t
+bilinear_interpolation (uint32_t tl, uint32_t tr,
+ uint32_t bl, uint32_t br,
+ int distx, int disty)
+{
+ uint64_t distxy, distxiy, distixy, distixiy;
+ uint64_t tl64, tr64, bl64, br64;
+ uint64_t f, r;
+
+ distxy = distx * disty;
+ distxiy = distx * (256 - disty);
+ distixy = (256 - distx) * disty;
+ distixiy = (256 - distx) * (256 - disty);
+
+ /* Alpha and Blue */
+ tl64 = tl & 0xff0000ff;
+ tr64 = tr & 0xff0000ff;
+ bl64 = bl & 0xff0000ff;
+ br64 = br & 0xff0000ff;
+
+ f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
+ r = f & 0x0000ff0000ff0000ull;
+
+ /* Red and Green */
+ tl64 = tl;
+ tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull);
+
+ tr64 = tr;
+ tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull);
+
+ bl64 = bl;
+ bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull);
+
+ br64 = br;
+ br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull);
+
+ f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
+ r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull);
+
+ return (uint32_t)(r >> 16);
+}
+
+#else
+
+static force_inline uint32_t
+bilinear_interpolation (uint32_t tl, uint32_t tr,
+ uint32_t bl, uint32_t br,
+ int distx, int disty)
+{
+ int distxy, distxiy, distixy, distixiy;
+ uint32_t f, r;
+
+ distxy = distx * disty;
+ distxiy = (distx << 8) - distxy; /* distx * (256 - disty) */
+ distixy = (disty << 8) - distxy; /* disty * (256 - distx) */
+ distixiy =
+ 256 * 256 - (disty << 8) -
+ (distx << 8) + distxy; /* (256 - distx) * (256 - disty) */
+
+ /* Blue */
+ r = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
+ + (bl & 0x000000ff) * distixy + (br & 0x000000ff) * distxy;
+
+ /* Green */
+ f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
+ + (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy;
+ r |= f & 0xff000000;
+
+ tl >>= 16;
+ tr >>= 16;
+ bl >>= 16;
+ br >>= 16;
+ r >>= 16;
+
+ /* Red */
+ f = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
+ + (bl & 0x000000ff) * distixy + (br & 0x000000ff) * distxy;
+ r |= f & 0x00ff0000;
+
+ /* Alpha */
+ f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
+ + (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy;
+ r |= f & 0xff000000;
+
+ return r;
+}
+
+#endif
+
+static force_inline uint32_t
+bits_image_fetch_pixel_bilinear (bits_image_t *image,
+ pixman_fixed_t x,
+ pixman_fixed_t y,
+ get_pixel_t get_pixel)
+{
+ pixman_repeat_t repeat_mode = image->common.repeat;
+ int width = image->width;
+ int height = image->height;
+ int x1, y1, x2, y2;
+ uint32_t tl, tr, bl, br;
+ int32_t distx, disty;
+
+ x1 = x - pixman_fixed_1 / 2;
+ y1 = y - pixman_fixed_1 / 2;
+
+ distx = (x1 >> 8) & 0xff;
+ disty = (y1 >> 8) & 0xff;
+
+ x1 = pixman_fixed_to_int (x1);
+ y1 = pixman_fixed_to_int (y1);
+ x2 = x1 + 1;
+ y2 = y1 + 1;
+
+ if (repeat_mode != PIXMAN_REPEAT_NONE)
+ {
+ repeat (repeat_mode, width, &x1);
+ repeat (repeat_mode, height, &y1);
+ repeat (repeat_mode, width, &x2);
+ repeat (repeat_mode, height, &y2);
+
+ tl = get_pixel (image, x1, y1, FALSE);
+ bl = get_pixel (image, x1, y2, FALSE);
+ tr = get_pixel (image, x2, y1, FALSE);
+ br = get_pixel (image, x2, y2, FALSE);
+ }
+ else
+ {
+ tl = get_pixel (image, x1, y1, TRUE);
+ tr = get_pixel (image, x2, y1, TRUE);
+ bl = get_pixel (image, x1, y2, TRUE);
+ br = get_pixel (image, x2, y2, TRUE);
+ }
+
+ return bilinear_interpolation (tl, tr, bl, br, distx, disty);
+}
+
+static void
+bits_image_fetch_bilinear_no_repeat_8888 (pixman_image_t * ima,
+ int offset,
+ int line,
+ int width,
+ uint32_t * buffer,
+ const uint32_t * mask)
+{
+ bits_image_t *bits = &ima->bits;
+ pixman_fixed_t x_top, x_bottom, x;
+ pixman_fixed_t ux_top, ux_bottom, ux;
+ pixman_vector_t v;
+ uint32_t top_mask, bottom_mask;
+ uint32_t *top_row;
+ uint32_t *bottom_row;
+ uint32_t *end;
+ uint32_t zero[2] = { 0, 0 };
+ int y, y1, y2;
+ int disty;
+ int mask_inc;
+ int w;
+
+ /* reference point is the center of the pixel */
+ v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+ v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+ v.vector[2] = pixman_fixed_1;
+
+ if (!pixman_transform_point_3d (bits->common.transform, &v))
+ return;
+
+ ux = ux_top = ux_bottom = bits->common.transform->matrix[0][0];
+ x = x_top = x_bottom = v.vector[0] - pixman_fixed_1/2;
+
+ y = v.vector[1] - pixman_fixed_1/2;
+ disty = (y >> 8) & 0xff;
+
+ /* Load the pointers to the first and second lines from the source
+ * image that bilinear code must read.
+ *
+ * The main trick in this code is about the check if any line are
+ * outside of the image;
+ *
+ * When I realize that a line (any one) is outside, I change
+ * the pointer to a dummy area with zeros. Once I change this, I
+ * must be sure the pointer will not change, so I set the
+ * variables to each pointer increments inside the loop.
+ */
+ y1 = pixman_fixed_to_int (y);
+ y2 = y1 + 1;
+
+ if (y1 < 0 || y1 >= bits->height)
+ {
+ top_row = zero;
+ x_top = 0;
+ ux_top = 0;
+ }
+ else
+ {
+ top_row = bits->bits + y1 * bits->rowstride;
+ x_top = x;
+ ux_top = ux;
+ }
+
+ if (y2 < 0 || y2 >= bits->height)
+ {
+ bottom_row = zero;
+ x_bottom = 0;
+ ux_bottom = 0;
+ }
+ else
+ {
+ bottom_row = bits->bits + y2 * bits->rowstride;
+ x_bottom = x;
+ ux_bottom = ux;
+ }
+
+ /* Instead of checking whether the operation uses the mast in
+ * each loop iteration, verify this only once and prepare the
+ * variables to make the code smaller inside the loop.
+ */
+ if (!mask)
+ {
+ uint32_t mask_bits = 1;
+
+ mask_inc = 0;
+ mask = &mask_bits;
+ }
+ else
+ {
+ /* If have a mask, prepare the variables to check it */
+ mask_inc = 1;
+ }
+
+ /* If both are zero, then the whole thing is zero */
+ if (top_row == zero && bottom_row == zero)
+ {
+ memset (buffer, 0, width * sizeof (uint32_t));
+ return;
+ }
+ else if (bits->format == PIXMAN_x8r8g8b8)
+ {
+ if (top_row == zero)
+ {
+ top_mask = 0;
+ bottom_mask = 0xff000000;
+ }
+ else if (bottom_row == zero)
+ {
+ top_mask = 0xff000000;
+ bottom_mask = 0;
+ }
+ else
+ {
+ top_mask = 0xff000000;
+ bottom_mask = 0xff000000;
+ }
+ }
+ else
+ {
+ top_mask = 0;
+ bottom_mask = 0;
+ }
+
+ end = buffer + width;
+
+ /* Zero fill to the left of the image */
+ while (buffer < end && x < pixman_fixed_minus_1)
+ {
+ *buffer++ = 0;
+ x += ux;
+ x_top += ux_top;
+ x_bottom += ux_bottom;
+ mask += mask_inc;
+ }
+
+ /* Left edge
+ */
+ while (buffer < end && x < 0)
+ {
+ uint32_t tr, br;
+ int32_t distx;
+
+ tr = top_row[pixman_fixed_to_int (x_top) + 1] | top_mask;
+ br = bottom_row[pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
+
+ distx = (x >> 8) & 0xff;
+
+ *buffer++ = bilinear_interpolation (0, tr, 0, br, distx, disty);
+
+ x += ux;
+ x_top += ux_top;
+ x_bottom += ux_bottom;
+ mask += mask_inc;
+ }
+
+ /* Main part */
+ w = pixman_int_to_fixed (bits->width - 1);
+
+ while (buffer < end && x < w)
+ {
+ if (*mask)
+ {
+ uint32_t tl, tr, bl, br;
+ int32_t distx;
+
+ tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
+ tr = top_row [pixman_fixed_to_int (x_top) + 1] | top_mask;
+ bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
+ br = bottom_row [pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
+
+ distx = (x >> 8) & 0xff;
+
+ *buffer = bilinear_interpolation (tl, tr, bl, br, distx, disty);
+ }
+
+ buffer++;
+ x += ux;
+ x_top += ux_top;
+ x_bottom += ux_bottom;
+ mask += mask_inc;
+ }
+
+ /* Right Edge */
+ w = pixman_int_to_fixed (bits->width);
+ while (buffer < end && x < w)
+ {
+ if (*mask)
+ {
+ uint32_t tl, bl;
+ int32_t distx;
+
+ tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
+ bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
+
+ distx = (x >> 8) & 0xff;
+
+ *buffer = bilinear_interpolation (tl, 0, bl, 0, distx, disty);
+ }
+
+ buffer++;
+ x += ux;
+ x_top += ux_top;
+ x_bottom += ux_bottom;
+ mask += mask_inc;
+ }
+
+ /* Zero fill to the left of the image */
+ while (buffer < end)
+ *buffer++ = 0;
+}
+
+static force_inline uint32_t
+bits_image_fetch_pixel_convolution (bits_image_t *image,
+ pixman_fixed_t x,
+ pixman_fixed_t y,
+ get_pixel_t get_pixel)
+{
+ pixman_fixed_t *params = image->common.filter_params;
+ int x_off = (params[0] - pixman_fixed_1) >> 1;
+ int y_off = (params[1] - pixman_fixed_1) >> 1;
+ int32_t cwidth = pixman_fixed_to_int (params[0]);
+ int32_t cheight = pixman_fixed_to_int (params[1]);
+ int32_t srtot, sgtot, sbtot, satot;
+ int32_t i, j, x1, x2, y1, y2;
+ pixman_repeat_t repeat_mode = image->common.repeat;
+ int width = image->width;
+ int height = image->height;
+
+ params += 2;
+
+ x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off);
+ y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off);
+ x2 = x1 + cwidth;
+ y2 = y1 + cheight;
+
+ srtot = sgtot = sbtot = satot = 0;
+
+ for (i = y1; i < y2; ++i)
+ {
+ for (j = x1; j < x2; ++j)
+ {
+ int rx = j;
+ int ry = i;
+
+ pixman_fixed_t f = *params;
+
+ if (f)
+ {
+ uint32_t pixel;
+
+ if (repeat_mode != PIXMAN_REPEAT_NONE)
+ {
+ repeat (repeat_mode, width, &rx);
+ repeat (repeat_mode, height, &ry);
+
+ pixel = get_pixel (image, rx, ry, FALSE);
+ }
+ else
+ {
+ pixel = get_pixel (image, rx, ry, TRUE);
+ }
+
+ srtot += RED_8 (pixel) * f;
+ sgtot += GREEN_8 (pixel) * f;
+ sbtot += BLUE_8 (pixel) * f;
+ satot += ALPHA_8 (pixel) * f;
+ }
+
+ params++;
+ }
+ }
+
+ satot >>= 16;
+ srtot >>= 16;
+ sgtot >>= 16;
+ sbtot >>= 16;
+
+ satot = CLIP (satot, 0, 0xff);
+ srtot = CLIP (srtot, 0, 0xff);
+ sgtot = CLIP (sgtot, 0, 0xff);
+ sbtot = CLIP (sbtot, 0, 0xff);
+
+ return ((satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot));
+}
+
+static force_inline uint32_t
+bits_image_fetch_pixel_filtered (bits_image_t *image,
+ pixman_fixed_t x,
+ pixman_fixed_t y,
+ get_pixel_t get_pixel)
+{
+ switch (image->common.filter)
+ {
+ case PIXMAN_FILTER_NEAREST:
+ case PIXMAN_FILTER_FAST:
+ return bits_image_fetch_pixel_nearest (image, x, y, get_pixel);
+ break;
+
+ case PIXMAN_FILTER_BILINEAR:
+ case PIXMAN_FILTER_GOOD:
+ case PIXMAN_FILTER_BEST:
+ return bits_image_fetch_pixel_bilinear (image, x, y, get_pixel);
+ break;
+
+ case PIXMAN_FILTER_CONVOLUTION:
+ return bits_image_fetch_pixel_convolution (image, x, y, get_pixel);
+ break;
+
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static void
+bits_image_fetch_affine_no_alpha (pixman_image_t * image,
+ int offset,
+ int line,
+ int width,
+ uint32_t * buffer,
+ const uint32_t * mask)
+{
+ pixman_fixed_t x, y;
+ pixman_fixed_t ux, uy;
+ pixman_vector_t v;
+ int i;
+
+ /* reference point is the center of the pixel */
+ v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+ v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+ v.vector[2] = pixman_fixed_1;
+
+ if (image->common.transform)
+ {
+ if (!pixman_transform_point_3d (image->common.transform, &v))
+ return;
+
+ ux = image->common.transform->matrix[0][0];
+ uy = image->common.transform->matrix[1][0];
+ }
+ else
+ {
+ ux = pixman_fixed_1;
+ uy = 0;
+ }
+
+ x = v.vector[0];
+ y = v.vector[1];
+
+ for (i = 0; i < width; ++i)
+ {
+ if (!mask || mask[i])
+ {
+ buffer[i] = bits_image_fetch_pixel_filtered (
+ &image->bits, x, y, fetch_pixel_no_alpha);
+ }
+
+ x += ux;
+ y += uy;
+ }
+}
+
+/* General fetcher */
+static force_inline uint32_t
+fetch_pixel_general (bits_image_t *image, int x, int y, pixman_bool_t check_bounds)
+{
+ uint32_t pixel;
+
+ if (check_bounds &&
+ (x < 0 || x >= image->width || y < 0 || y >= image->height))
+ {
+ return 0;
+ }
+
+ pixel = image->fetch_pixel_32 (image, x, y);
+
+ if (image->common.alpha_map)
+ {
+ uint32_t pixel_a;
+
+ x -= image->common.alpha_origin_x;
+ y -= image->common.alpha_origin_y;
+
+ if (x < 0 || x >= image->common.alpha_map->width ||
+ y < 0 || y >= image->common.alpha_map->height)
+ {
+ pixel_a = 0;
+ }
+ else
+ {
+ pixel_a = image->common.alpha_map->fetch_pixel_32 (
+ image->common.alpha_map, x, y);
+
+ pixel_a = ALPHA_8 (pixel_a);
+ }
+
+ pixel &= 0x00ffffff;
+ pixel |= (pixel_a << 24);
+ }
+
+ return pixel;
+}
+
+static void
+bits_image_fetch_general (pixman_image_t * image,
+ int offset,
+ int line,
+ int width,
+ uint32_t * buffer,
+ const uint32_t * mask)
+{
+ pixman_fixed_t x, y, w;
+ pixman_fixed_t ux, uy, uw;
+ pixman_vector_t v;
+ int i;
+
+ /* reference point is the center of the pixel */
+ v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+ v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+ v.vector[2] = pixman_fixed_1;
+
+ if (image->common.transform)
+ {
+ if (!pixman_transform_point_3d (image->common.transform, &v))
+ return;
+
+ ux = image->common.transform->matrix[0][0];
+ uy = image->common.transform->matrix[1][0];
+ uw = image->common.transform->matrix[2][0];
+ }
+ else
+ {
+ ux = pixman_fixed_1;
+ uy = 0;
+ uw = 0;
+ }
+
+ x = v.vector[0];
+ y = v.vector[1];
+ w = v.vector[2];
+
+ for (i = 0; i < width; ++i)
+ {
+ pixman_fixed_t x0, y0;
+
+ if (!mask || mask[i])
+ {
+ if (w != 0)
+ {
+ x0 = ((pixman_fixed_48_16_t)x << 16) / w;
+ y0 = ((pixman_fixed_48_16_t)y << 16) / w;
+ }
+ else
+ {
+ x0 = 0;
+ y0 = 0;
+ }
+
+ buffer[i] = bits_image_fetch_pixel_filtered (
+ &image->bits, x0, y0, fetch_pixel_general);
+ }
+
+ x += ux;
+ y += uy;
+ w += uw;
+ }
+}
+
+static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+typedef uint32_t (* convert_pixel_t) (const uint8_t *row, int x);
+
+static force_inline void
+bits_image_fetch_bilinear_affine (pixman_image_t * image,
+ int offset,
+ int line,
+ int width,
+ uint32_t * buffer,
+ const uint32_t * mask,
+
+ convert_pixel_t convert_pixel,
+ pixman_format_code_t format,
+ pixman_repeat_t repeat_mode)
+{
+ pixman_fixed_t x, y;
+ pixman_fixed_t ux, uy;
+ pixman_vector_t v;
+ bits_image_t *bits = &image->bits;
+ int i;
+
+ /* reference point is the center of the pixel */
+ v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+ v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+ v.vector[2] = pixman_fixed_1;
+
+ if (!pixman_transform_point_3d (image->common.transform, &v))
+ return;
+
+ ux = image->common.transform->matrix[0][0];
+ uy = image->common.transform->matrix[1][0];
+
+ x = v.vector[0];
+ y = v.vector[1];
+
+ for (i = 0; i < width; ++i)
+ {
+ int x1, y1, x2, y2;
+ uint32_t tl, tr, bl, br;
+ int32_t distx, disty;
+ int width = image->bits.width;
+ int height = image->bits.height;
+ const uint8_t *row1;
+ const uint8_t *row2;
+
+ if (mask && !mask[i])
+ goto next;
+
+ x1 = x - pixman_fixed_1 / 2;
+ y1 = y - pixman_fixed_1 / 2;
+
+ distx = (x1 >> 8) & 0xff;
+ disty = (y1 >> 8) & 0xff;
+
+ y1 = pixman_fixed_to_int (y1);
+ y2 = y1 + 1;
+ x1 = pixman_fixed_to_int (x1);
+ x2 = x1 + 1;
+
+ if (repeat_mode != PIXMAN_REPEAT_NONE)
+ {
+ uint32_t mask;
+
+ mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+
+ repeat (repeat_mode, width, &x1);
+ repeat (repeat_mode, height, &y1);
+ repeat (repeat_mode, width, &x2);
+ repeat (repeat_mode, height, &y2);
+
+ row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
+ row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
+
+ tl = convert_pixel (row1, x1) | mask;
+ tr = convert_pixel (row1, x2) | mask;
+ bl = convert_pixel (row2, x1) | mask;
+ br = convert_pixel (row2, x2) | mask;
+ }
+ else
+ {
+ uint32_t mask1, mask2;
+ int bpp;
+
+ /* Note: PIXMAN_FORMAT_BPP() returns an unsigned value,
+ * which means if you use it in expressions, those
+ * expressions become unsigned themselves. Since
+ * the variables below can be negative in some cases,
+ * that will lead to crashes on 64 bit architectures.
+ *
+ * So this line makes sure bpp is signed
+ */
+ bpp = PIXMAN_FORMAT_BPP (format);
+
+ if (x1 >= width || x2 < 0 || y1 >= height || y2 < 0)
+ {
+ buffer[i] = 0;
+ goto next;
+ }
+
+ if (y2 == 0)
+ {
+ row1 = zero;
+ mask1 = 0;
+ }
+ else
+ {
+ row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
+ row1 += bpp / 8 * x1;
+
+ mask1 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+ }
+
+ if (y1 == height - 1)
+ {
+ row2 = zero;
+ mask2 = 0;
+ }
+ else
+ {
+ row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
+ row2 += bpp / 8 * x1;
+
+ mask2 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+ }
+
+ if (x2 == 0)
+ {
+ tl = 0;
+ bl = 0;
+ }
+ else
+ {
+ tl = convert_pixel (row1, 0) | mask1;
+ bl = convert_pixel (row2, 0) | mask2;
+ }
+
+ if (x1 == width - 1)
+ {
+ tr = 0;
+ br = 0;
+ }
+ else
+ {
+ tr = convert_pixel (row1, 1) | mask1;
+ br = convert_pixel (row2, 1) | mask2;
+ }
+ }
+
+ buffer[i] = bilinear_interpolation (
+ tl, tr, bl, br, distx, disty);
+
+ next:
+ x += ux;
+ y += uy;
+ }
+}
+
+static force_inline void
+bits_image_fetch_nearest_affine (pixman_image_t * image,
+ int offset,
+ int line,
+ int width,
+ uint32_t * buffer,
+ const uint32_t * mask,
+
+ convert_pixel_t convert_pixel,
+ pixman_format_code_t format,
+ pixman_repeat_t repeat_mode)
+{
+ pixman_fixed_t x, y;
+ pixman_fixed_t ux, uy;
+ pixman_vector_t v;
+ bits_image_t *bits = &image->bits;
+ int i;
+
+ /* reference point is the center of the pixel */
+ v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+ v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+ v.vector[2] = pixman_fixed_1;
+
+ if (!pixman_transform_point_3d (image->common.transform, &v))
+ return;
+
+ ux = image->common.transform->matrix[0][0];
+ uy = image->common.transform->matrix[1][0];
+
+ x = v.vector[0];
+ y = v.vector[1];
+
+ for (i = 0; i < width; ++i)
+ {
+ int width, height, x0, y0;
+ const uint8_t *row;
+
+ if (mask && !mask[i])
+ goto next;
+
+ width = image->bits.width;
+ height = image->bits.height;
+ x0 = pixman_fixed_to_int (x - pixman_fixed_e);
+ y0 = pixman_fixed_to_int (y - pixman_fixed_e);
+
+ if (repeat_mode == PIXMAN_REPEAT_NONE &&
+ (y0 < 0 || y0 >= height || x0 < 0 || x0 >= width))
+ {
+ buffer[i] = 0;
+ }
+ else
+ {
+ uint32_t mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+
+ if (repeat_mode != PIXMAN_REPEAT_NONE)
+ {
+ repeat (repeat_mode, width, &x0);
+ repeat (repeat_mode, height, &y0);
+ }
+
+ row = (uint8_t *)bits->bits + bits->rowstride * 4 * y0;
+
+ buffer[i] = convert_pixel (row, x0) | mask;
+ }
+
+ next:
+ x += ux;
+ y += uy;
+ }
+}
+
+static force_inline uint32_t
+convert_a8r8g8b8 (const uint8_t *row, int x)
+{
+ return *(((uint32_t *)row) + x);
+}
+
+static force_inline uint32_t
+convert_x8r8g8b8 (const uint8_t *row, int x)
+{
+ return *(((uint32_t *)row) + x);
+}
+
+static force_inline uint32_t
+convert_a8 (const uint8_t *row, int x)
+{
+ return *(row + x) << 24;
+}
+
+static force_inline uint32_t
+convert_r5g6b5 (const uint8_t *row, int x)
+{
+ return CONVERT_0565_TO_0888 (*((uint16_t *)row + x));
+}
+
+#define MAKE_BILINEAR_FETCHER(name, format, repeat_mode) \
+ static void \
+ bits_image_fetch_bilinear_affine_ ## name (pixman_image_t *image, \
+ int offset, \
+ int line, \
+ int width, \
+ uint32_t * buffer, \
+ const uint32_t * mask) \
+ { \
+ bits_image_fetch_bilinear_affine (image, offset, line, \
+ width, buffer, mask, \
+ convert_ ## format, \
+ PIXMAN_ ## format, \
+ repeat_mode); \
+ }
+
+#define MAKE_NEAREST_FETCHER(name, format, repeat_mode) \
+ static void \
+ bits_image_fetch_nearest_affine_ ## name (pixman_image_t *image, \
+ int offset, \
+ int line, \
+ int width, \
+ uint32_t * buffer, \
+ const uint32_t * mask) \
+ { \
+ bits_image_fetch_nearest_affine (image, offset, line, \
+ width, buffer, mask, \
+ convert_ ## format, \
+ PIXMAN_ ## format, \
+ repeat_mode); \
+ }
+
+#define MAKE_FETCHERS(name, format, repeat_mode) \
+ MAKE_NEAREST_FETCHER (name, format, repeat_mode) \
+ MAKE_BILINEAR_FETCHER (name, format, repeat_mode)
+
+MAKE_FETCHERS (pad_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_NORMAL)
+MAKE_FETCHERS (pad_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_NORMAL)
+MAKE_FETCHERS (pad_a8, a8, PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_a8, a8, PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_a8, a8, PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_a8, a8, PIXMAN_REPEAT_NORMAL)
+MAKE_FETCHERS (pad_r5g6b5, r5g6b5, PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_r5g6b5, r5g6b5, PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_r5g6b5, r5g6b5, PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_r5g6b5, r5g6b5, PIXMAN_REPEAT_NORMAL)
+
+static void
+bits_image_fetch_solid_32 (pixman_image_t * image,
+ int x,
+ int y,
+ int width,
+ uint32_t * buffer,
+ const uint32_t * mask)
+{
+ uint32_t color;
+ uint32_t *end;
+
+ color = image->bits.fetch_pixel_32 (&image->bits, 0, 0);
+
+ end = buffer + width;
+ while (buffer < end)
+ *(buffer++) = color;
+}
+
+static void
+bits_image_fetch_solid_64 (pixman_image_t * image,
+ int x,
+ int y,
+ int width,
+ uint32_t * b,
+ const uint32_t * unused)
+{
+ uint64_t color;
+ uint64_t *buffer = (uint64_t *)b;
+ uint64_t *end;
+
+ color = image->bits.fetch_pixel_64 (&image->bits, 0, 0);
+
+ end = buffer + width;
+ while (buffer < end)
+ *(buffer++) = color;
+}
+
+static void
+bits_image_fetch_untransformed_repeat_none (bits_image_t *image,
+ pixman_bool_t wide,
+ int x,
+ int y,
+ int width,
+ uint32_t * buffer)
+{
+ uint32_t w;
+
+ if (y < 0 || y >= image->height)
+ {
+ memset (buffer, 0, width * (wide? 8 : 4));
+ return;
+ }
+
+ if (x < 0)
+ {
+ w = MIN (width, -x);
+
+ memset (buffer, 0, w * (wide ? 8 : 4));
+
+ width -= w;
+ buffer += w * (wide? 2 : 1);
+ x += w;
+ }
+
+ if (x < image->width)
+ {
+ w = MIN (width, image->width - x);
+
+ if (wide)
+ image->fetch_scanline_64 ((pixman_image_t *)image, x, y, w, buffer, NULL);
+ else
+ image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL);
+
+ width -= w;
+ buffer += w * (wide? 2 : 1);
+ x += w;
+ }
+
+ memset (buffer, 0, width * (wide ? 8 : 4));
+}
+
+static void
+bits_image_fetch_untransformed_repeat_normal (bits_image_t *image,
+ pixman_bool_t wide,
+ int x,
+ int y,
+ int width,
+ uint32_t * buffer)
+{
+ uint32_t w;
+
+ while (y < 0)
+ y += image->height;
+
+ while (y >= image->height)
+ y -= image->height;
+
+ while (width)
+ {
+ while (x < 0)
+ x += image->width;
+ while (x >= image->width)
+ x -= image->width;
+
+ w = MIN (width, image->width - x);
+
+ if (wide)
+ image->fetch_scanline_64 ((pixman_image_t *)image, x, y, w, buffer, NULL);
+ else
+ image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL);
+
+ buffer += w * (wide? 2 : 1);
+ x += w;
+ width -= w;
+ }
+}
+
+static void
+bits_image_fetch_untransformed_32 (pixman_image_t * image,
+ int x,
+ int y,
+ int width,
+ uint32_t * buffer,
+ const uint32_t * mask)
+{
+ if (image->common.repeat == PIXMAN_REPEAT_NONE)
+ {
+ bits_image_fetch_untransformed_repeat_none (
+ &image->bits, FALSE, x, y, width, buffer);
+ }
+ else
+ {
+ bits_image_fetch_untransformed_repeat_normal (
+ &image->bits, FALSE, x, y, width, buffer);
+ }
+}
+
+static void
+bits_image_fetch_untransformed_64 (pixman_image_t * image,
+ int x,
+ int y,
+ int width,
+ uint32_t * buffer,
+ const uint32_t * unused)
+{
+ if (image->common.repeat == PIXMAN_REPEAT_NONE)
+ {
+ bits_image_fetch_untransformed_repeat_none (
+ &image->bits, TRUE, x, y, width, buffer);
+ }
+ else
+ {
+ bits_image_fetch_untransformed_repeat_normal (
+ &image->bits, TRUE, x, y, width, buffer);
+ }
+}
+
+typedef struct
+{
+ pixman_format_code_t format;
+ uint32_t flags;
+ fetch_scanline_t fetch_32;
+ fetch_scanline_t fetch_64;
+} fetcher_info_t;
+
+static const fetcher_info_t fetcher_info[] =
+{
+ { PIXMAN_solid,
+ FAST_PATH_NO_ALPHA_MAP,
+ bits_image_fetch_solid_32,
+ bits_image_fetch_solid_64
+ },
+
+ { PIXMAN_any,
+ (FAST_PATH_NO_ALPHA_MAP |
+ FAST_PATH_ID_TRANSFORM |
+ FAST_PATH_NO_CONVOLUTION_FILTER |
+ FAST_PATH_NO_PAD_REPEAT |
+ FAST_PATH_NO_REFLECT_REPEAT),
+ bits_image_fetch_untransformed_32,
+ bits_image_fetch_untransformed_64
+ },
+
+#define FAST_BILINEAR_FLAGS \
+ (FAST_PATH_NO_ALPHA_MAP | \
+ FAST_PATH_NO_ACCESSORS | \
+ FAST_PATH_HAS_TRANSFORM | \
+ FAST_PATH_AFFINE_TRANSFORM | \
+ FAST_PATH_X_UNIT_POSITIVE | \
+ FAST_PATH_Y_UNIT_ZERO | \
+ FAST_PATH_NONE_REPEAT | \
+ FAST_PATH_BILINEAR_FILTER)
+
+ { PIXMAN_a8r8g8b8,
+ FAST_BILINEAR_FLAGS,
+ bits_image_fetch_bilinear_no_repeat_8888,
+ _pixman_image_get_scanline_generic_64
+ },
+
+ { PIXMAN_x8r8g8b8,
+ FAST_BILINEAR_FLAGS,
+ bits_image_fetch_bilinear_no_repeat_8888,
+ _pixman_image_get_scanline_generic_64
+ },
+
+#define GENERAL_BILINEAR_FLAGS \
+ (FAST_PATH_NO_ALPHA_MAP | \
+ FAST_PATH_NO_ACCESSORS | \
+ FAST_PATH_HAS_TRANSFORM | \
+ FAST_PATH_AFFINE_TRANSFORM | \
+ FAST_PATH_BILINEAR_FILTER)
+
+#define GENERAL_NEAREST_FLAGS \
+ (FAST_PATH_NO_ALPHA_MAP | \
+ FAST_PATH_NO_ACCESSORS | \
+ FAST_PATH_HAS_TRANSFORM | \
+ FAST_PATH_AFFINE_TRANSFORM | \
+ FAST_PATH_NEAREST_FILTER)
+
+#define BILINEAR_AFFINE_FAST_PATH(name, format, repeat) \
+ { PIXMAN_ ## format, \
+ GENERAL_BILINEAR_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \
+ bits_image_fetch_bilinear_affine_ ## name, \
+ _pixman_image_get_scanline_generic_64 \
+ },
+
+#define NEAREST_AFFINE_FAST_PATH(name, format, repeat) \
+ { PIXMAN_ ## format, \
+ GENERAL_NEAREST_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \
+ bits_image_fetch_nearest_affine_ ## name, \
+ _pixman_image_get_scanline_generic_64 \
+ },
+
+#define AFFINE_FAST_PATHS(name, format, repeat) \
+ BILINEAR_AFFINE_FAST_PATH(name, format, repeat) \
+ NEAREST_AFFINE_FAST_PATH(name, format, repeat)
+
+ AFFINE_FAST_PATHS (pad_a8r8g8b8, a8r8g8b8, PAD)
+ AFFINE_FAST_PATHS (none_a8r8g8b8, a8r8g8b8, NONE)
+ AFFINE_FAST_PATHS (reflect_a8r8g8b8, a8r8g8b8, REFLECT)
+ AFFINE_FAST_PATHS (normal_a8r8g8b8, a8r8g8b8, NORMAL)
+ AFFINE_FAST_PATHS (pad_x8r8g8b8, x8r8g8b8, PAD)
+ AFFINE_FAST_PATHS (none_x8r8g8b8, x8r8g8b8, NONE)
+ AFFINE_FAST_PATHS (reflect_x8r8g8b8, x8r8g8b8, REFLECT)
+ AFFINE_FAST_PATHS (normal_x8r8g8b8, x8r8g8b8, NORMAL)
+ AFFINE_FAST_PATHS (pad_a8, a8, PAD)
+ AFFINE_FAST_PATHS (none_a8, a8, NONE)
+ AFFINE_FAST_PATHS (reflect_a8, a8, REFLECT)
+ AFFINE_FAST_PATHS (normal_a8, a8, NORMAL)
+ AFFINE_FAST_PATHS (pad_r5g6b5, r5g6b5, PAD)
+ AFFINE_FAST_PATHS (none_r5g6b5, r5g6b5, NONE)
+ AFFINE_FAST_PATHS (reflect_r5g6b5, r5g6b5, REFLECT)
+ AFFINE_FAST_PATHS (normal_r5g6b5, r5g6b5, NORMAL)
+
+ /* Affine, no alpha */
+ { PIXMAN_any,
+ (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_HAS_TRANSFORM | FAST_PATH_AFFINE_TRANSFORM),
+ bits_image_fetch_affine_no_alpha,
+ _pixman_image_get_scanline_generic_64
+ },
+
+ /* General */
+ { PIXMAN_any, 0, bits_image_fetch_general, _pixman_image_get_scanline_generic_64 },
+
+ { PIXMAN_null },
+};
+
+static void
+bits_image_property_changed (pixman_image_t *image)
+{
+ uint32_t flags = image->common.flags;
+ pixman_format_code_t format = image->common.extended_format_code;
+ const fetcher_info_t *info;
+
+ _pixman_bits_image_setup_accessors (&image->bits);
+
+ info = fetcher_info;
+ while (info->format != PIXMAN_null)
+ {
+ if ((info->format == format || info->format == PIXMAN_any) &&
+ (info->flags & flags) == info->flags)
+ {
+ image->common.get_scanline_32 = info->fetch_32;
+ image->common.get_scanline_64 = info->fetch_64;
+ break;
+ }
+
+ info++;
+ }
+}
+
+static uint32_t *
+create_bits (pixman_format_code_t format,
+ int width,
+ int height,
+ int * rowstride_bytes)
+{
+ int stride;
+ int buf_size;
+ int bpp;
+
+ /* what follows is a long-winded way, avoiding any possibility of integer
+ * overflows, of saying:
+ * stride = ((width * bpp + 0x1f) >> 5) * sizeof (uint32_t);
+ */
+
+ bpp = PIXMAN_FORMAT_BPP (format);
+ if (pixman_multiply_overflows_int (width, bpp))
+ return NULL;
+
+ stride = width * bpp;
+ if (pixman_addition_overflows_int (stride, 0x1f))
+ return NULL;
+
+ stride += 0x1f;
+ stride >>= 5;
+
+ stride *= sizeof (uint32_t);
+
+ if (pixman_multiply_overflows_int (height, stride))
+ return NULL;
+
+ buf_size = height * stride;
+
+ if (rowstride_bytes)
+ *rowstride_bytes = stride;
+
+ return calloc (buf_size, 1);
+}
+
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_bits (pixman_format_code_t format,
+ int width,
+ int height,
+ uint32_t * bits,
+ int rowstride_bytes)
+{
+ pixman_image_t *image;
+ uint32_t *free_me = NULL;
+
+ /* must be a whole number of uint32_t's
+ */
+ return_val_if_fail (
+ bits == NULL || (rowstride_bytes % sizeof (uint32_t)) == 0, NULL);
+
+ return_val_if_fail (PIXMAN_FORMAT_BPP (format) >= PIXMAN_FORMAT_DEPTH (format), NULL);
+
+ if (!bits && width && height)
+ {
+ free_me = bits = create_bits (format, width, height, &rowstride_bytes);
+ if (!bits)
+ return NULL;
+ }
+
+ image = _pixman_image_allocate ();
+
+ if (!image)
+ {
+ if (free_me)
+ free (free_me);
+
+ return NULL;
+ }
+
+ image->type = BITS;
+ image->bits.format = format;
+ image->bits.width = width;
+ image->bits.height = height;
+ image->bits.bits = bits;
+ image->bits.free_me = free_me;
+ image->bits.read_func = NULL;
+ image->bits.write_func = NULL;
+
+ /* The rowstride is stored in number of uint32_t */
+ image->bits.rowstride = rowstride_bytes / (int) sizeof (uint32_t);
+
+ image->bits.indexed = NULL;
+
+ image->common.property_changed = bits_image_property_changed;
+
+ _pixman_image_reset_clip_region (image);
+
+ return image;
+}
diff --git a/pixman/pixman/pixman-compiler.h b/pixman/pixman/pixman-compiler.h
index 32698e007..8f6c787f6 100644
--- a/pixman/pixman/pixman-compiler.h
+++ b/pixman/pixman/pixman-compiler.h
@@ -1,204 +1,203 @@
-/* Pixman uses some non-standard compiler features. This file ensures
- * they exist
- *
- * The features are:
- *
- * FUNC must be defined to expand to the current function
- * PIXMAN_EXPORT should be defined to whatever is required to
- * export functions from a shared library
- * limits limits for various types must be defined
- * inline must be defined
- * force_inline must be defined
- */
-#if defined (__GNUC__)
-# define FUNC ((const char*) (__PRETTY_FUNCTION__))
-#elif defined (__sun) || (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
-# define FUNC ((const char*) (__func__))
-#else
-# define FUNC ((const char*) ("???"))
-#endif
-
-#ifndef INT16_MIN
-# define INT16_MIN (-32767-1)
-#endif
-
-#ifndef INT16_MAX
-# define INT16_MAX (32767)
-#endif
-
-#ifndef INT32_MIN
-# define INT32_MIN (-2147483647-1)
-#endif
-
-#ifndef INT32_MAX
-# define INT32_MAX (2147483647)
-#endif
-
-#ifndef UINT32_MIN
-# define UINT32_MIN (0)
-#endif
-
-#ifndef UINT32_MAX
-# define UINT32_MAX (4294967295U)
-#endif
-
-#ifndef M_PI
-# define M_PI 3.14159265358979323846
-#endif
-
-#ifdef _MSC_VER
-/* 'inline' is available only in C++ in MSVC */
-# define inline __inline
-# define force_inline __forceinline
-# define noinline __declspec(noinline)
-#elif defined __GNUC__ || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
-# define inline __inline__
-# define force_inline __inline__ __attribute__ ((__always_inline__))
-# define noinline __attribute__((noinline))
-#else
-# ifndef force_inline
-# define force_inline inline
-# endif
-# ifndef noinline
-# define noinline
-# endif
-#endif
-
-/* GCC visibility */
-#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(_WIN32)
-# define PIXMAN_EXPORT __attribute__ ((visibility("default")))
-/* Sun Studio 8 visibility */
-#elif defined(__SUNPRO_C) && (__SUNPRO_C >= 0x550)
-# define PIXMAN_EXPORT __global
-#else
-# define PIXMAN_EXPORT
-#endif
-
-/* TLS */
-#if defined(PIXMAN_NO_TLS)
-
-# define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \
- static type name
-# define PIXMAN_GET_THREAD_LOCAL(name) \
- (&name)
-
-#elif defined(TOOLCHAIN_SUPPORTS__THREAD)
-
-# define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \
- static __thread type name
-# define PIXMAN_GET_THREAD_LOCAL(name) \
- (&name)
-
-#elif defined(__MINGW32__)
-
-# define _NO_W32_PSEUDO_MODIFIERS
-# include <windows.h>
-
-# define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \
- static volatile int tls_ ## name ## _initialized = 0; \
- static void *tls_ ## name ## _mutex = NULL; \
- static unsigned tls_ ## name ## _index; \
- \
- static type * \
- tls_ ## name ## _alloc (void) \
- { \
- type *value = calloc (1, sizeof (type)); \
- if (value) \
- TlsSetValue (tls_ ## name ## _index, value); \
- return value; \
- } \
- \
- static force_inline type * \
- tls_ ## name ## _get (void) \
- { \
- type *value; \
- if (!tls_ ## name ## _initialized) \
- { \
- if (!tls_ ## name ## _mutex) \
- { \
- void *mutex = CreateMutexA (NULL, 0, NULL); \
- if (InterlockedCompareExchangePointer ( \
- &tls_ ## name ## _mutex, mutex, NULL) != NULL) \
- { \
- CloseHandle (mutex); \
- } \
- } \
- WaitForSingleObject (tls_ ## name ## _mutex, 0xFFFFFFFF); \
- if (!tls_ ## name ## _initialized) \
- { \
- tls_ ## name ## _index = TlsAlloc (); \
- tls_ ## name ## _initialized = 1; \
- } \
- ReleaseMutex (tls_ ## name ## _mutex); \
- } \
- if (tls_ ## name ## _index == 0xFFFFFFFF) \
- return NULL; \
- value = TlsGetValue (tls_ ## name ## _index); \
- if (!value) \
- value = tls_ ## name ## _alloc (); \
- return value; \
- }
-
-# define PIXMAN_GET_THREAD_LOCAL(name) \
- tls_ ## name ## _get ()
-
-#elif defined(_MSC_VER)
-
-# define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \
- static __declspec(thread) type name
-# define PIXMAN_GET_THREAD_LOCAL(name) \
- (&name)
-
-#elif defined(HAVE_PTHREAD_SETSPECIFIC)
-
-#include <pthread.h>
-
-# define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \
- static pthread_once_t tls_ ## name ## _once_control = PTHREAD_ONCE_INIT; \
- static pthread_key_t tls_ ## name ## _key; \
- \
- static void \
- tls_ ## name ## _destroy_value (void *value) \
- { \
- free (value); \
- } \
- \
- static void \
- tls_ ## name ## _make_key (void) \
- { \
- pthread_key_create (&tls_ ## name ## _key, \
- tls_ ## name ## _destroy_value); \
- } \
- \
- static type * \
- tls_ ## name ## _alloc (void) \
- { \
- type *value = calloc (1, sizeof (type)); \
- if (value) \
- pthread_setspecific (tls_ ## name ## _key, value); \
- return value; \
- } \
- \
- static force_inline type * \
- tls_ ## name ## _get (void) \
- { \
- type *value = NULL; \
- if (pthread_once (&tls_ ## name ## _once_control, \
- tls_ ## name ## _make_key) == 0) \
- { \
- value = pthread_getspecific (tls_ ## name ## _key); \
- if (!value) \
- value = tls_ ## name ## _alloc (); \
- } \
- return value; \
- } \
- extern int no_such_variable
-
-# define PIXMAN_GET_THREAD_LOCAL(name) \
- tls_ ## name ## _get ()
-
-#else
-
-# error "Unknown thread local support for this system. Pixman will not work with multiple threads. Define PIXMAN_NO_TLS to acknowledge and accept this limitation and compile pixman without thread-safety support."
-
-#endif
+/* Pixman uses some non-standard compiler features. This file ensures
+ * they exist
+ *
+ * The features are:
+ *
+ * FUNC must be defined to expand to the current function
+ * PIXMAN_EXPORT should be defined to whatever is required to
+ * export functions from a shared library
+ * limits limits for various types must be defined
+ * inline must be defined
+ * force_inline must be defined
+ */
+#if defined (__GNUC__)
+# define FUNC ((const char*) (__PRETTY_FUNCTION__))
+#elif defined (__sun) || (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
+# define FUNC ((const char*) (__func__))
+#else
+# define FUNC ((const char*) ("???"))
+#endif
+
+#ifndef INT16_MIN
+# define INT16_MIN (-32767-1)
+#endif
+
+#ifndef INT16_MAX
+# define INT16_MAX (32767)
+#endif
+
+#ifndef INT32_MIN
+# define INT32_MIN (-2147483647-1)
+#endif
+
+#ifndef INT32_MAX
+# define INT32_MAX (2147483647)
+#endif
+
+#ifndef UINT32_MIN
+# define UINT32_MIN (0)
+#endif
+
+#ifndef UINT32_MAX
+# define UINT32_MAX (4294967295U)
+#endif
+
+#ifndef M_PI
+# define M_PI 3.14159265358979323846
+#endif
+
+#ifdef _MSC_VER
+/* 'inline' is available only in C++ in MSVC */
+# define inline __inline
+# define force_inline __forceinline
+# define noinline __declspec(noinline)
+#elif defined __GNUC__ || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
+# define inline __inline__
+# define force_inline __inline__ __attribute__ ((__always_inline__))
+# define noinline __attribute__((noinline))
+#else
+# ifndef force_inline
+# define force_inline inline
+# endif
+# ifndef noinline
+# define noinline
+# endif
+#endif
+
+/* GCC visibility */
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(_WIN32)
+# define PIXMAN_EXPORT __attribute__ ((visibility("default")))
+/* Sun Studio 8 visibility */
+#elif defined(__SUNPRO_C) && (__SUNPRO_C >= 0x550)
+# define PIXMAN_EXPORT __global
+#else
+# define PIXMAN_EXPORT
+#endif
+
+/* TLS */
+#if defined(PIXMAN_NO_TLS)
+
+# define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \
+ static type name
+# define PIXMAN_GET_THREAD_LOCAL(name) \
+ (&name)
+
+#elif defined(TOOLCHAIN_SUPPORTS__THREAD)
+
+# define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \
+ static __thread type name
+# define PIXMAN_GET_THREAD_LOCAL(name) \
+ (&name)
+
+#elif defined(__MINGW32__)
+
+# define _NO_W32_PSEUDO_MODIFIERS
+# include <windows.h>
+
+# define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \
+ static volatile int tls_ ## name ## _initialized = 0; \
+ static void *tls_ ## name ## _mutex = NULL; \
+ static unsigned tls_ ## name ## _index; \
+ \
+ static type * \
+ tls_ ## name ## _alloc (void) \
+ { \
+ type *value = calloc (1, sizeof (type)); \
+ if (value) \
+ TlsSetValue (tls_ ## name ## _index, value); \
+ return value; \
+ } \
+ \
+ static force_inline type * \
+ tls_ ## name ## _get (void) \
+ { \
+ type *value; \
+ if (!tls_ ## name ## _initialized) \
+ { \
+ if (!tls_ ## name ## _mutex) \
+ { \
+ void *mutex = CreateMutexA (NULL, 0, NULL); \
+ if (InterlockedCompareExchangePointer ( \
+ &tls_ ## name ## _mutex, mutex, NULL) != NULL) \
+ { \
+ CloseHandle (mutex); \
+ } \
+ } \
+ WaitForSingleObject (tls_ ## name ## _mutex, 0xFFFFFFFF); \
+ if (!tls_ ## name ## _initialized) \
+ { \
+ tls_ ## name ## _index = TlsAlloc (); \
+ tls_ ## name ## _initialized = 1; \
+ } \
+ ReleaseMutex (tls_ ## name ## _mutex); \
+ } \
+ if (tls_ ## name ## _index == 0xFFFFFFFF) \
+ return NULL; \
+ value = TlsGetValue (tls_ ## name ## _index); \
+ if (!value) \
+ value = tls_ ## name ## _alloc (); \
+ return value; \
+ }
+
+# define PIXMAN_GET_THREAD_LOCAL(name) \
+ tls_ ## name ## _get ()
+
+#elif defined(_MSC_VER)
+
+# define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \
+ static __declspec(thread) type name
+# define PIXMAN_GET_THREAD_LOCAL(name) \
+ (&name)
+
+#elif defined(HAVE_PTHREAD_SETSPECIFIC)
+
+#include <pthread.h>
+
+# define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \
+ static pthread_once_t tls_ ## name ## _once_control = PTHREAD_ONCE_INIT; \
+ static pthread_key_t tls_ ## name ## _key; \
+ \
+ static void \
+ tls_ ## name ## _destroy_value (void *value) \
+ { \
+ free (value); \
+ } \
+ \
+ static void \
+ tls_ ## name ## _make_key (void) \
+ { \
+ pthread_key_create (&tls_ ## name ## _key, \
+ tls_ ## name ## _destroy_value); \
+ } \
+ \
+ static type * \
+ tls_ ## name ## _alloc (void) \
+ { \
+ type *value = calloc (1, sizeof (type)); \
+ if (value) \
+ pthread_setspecific (tls_ ## name ## _key, value); \
+ return value; \
+ } \
+ \
+ static force_inline type * \
+ tls_ ## name ## _get (void) \
+ { \
+ type *value = NULL; \
+ if (pthread_once (&tls_ ## name ## _once_control, \
+ tls_ ## name ## _make_key) == 0) \
+ { \
+ value = pthread_getspecific (tls_ ## name ## _key); \
+ if (!value) \
+ value = tls_ ## name ## _alloc (); \
+ } \
+ return value; \
+ }
+
+# define PIXMAN_GET_THREAD_LOCAL(name) \
+ tls_ ## name ## _get ()
+
+#else
+
+# error "Unknown thread local support for this system. Pixman will not work with multiple threads. Define PIXMAN_NO_TLS to acknowledge and accept this limitation and compile pixman without thread-safety support."
+
+#endif
diff --git a/pixman/pixman/pixman-cpu.c b/pixman/pixman/pixman-cpu.c
index 8032fb42c..70253d1ea 100644
--- a/pixman/pixman/pixman-cpu.c
+++ b/pixman/pixman/pixman-cpu.c
@@ -1,598 +1,603 @@
-/*
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007 Red Hat, Inc.
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of SuSE not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission. SuSE makes no representations about the
- * suitability of this software for any purpose. It is provided "as is"
- * without express or implied warranty.
- *
- * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
- * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#include <string.h>
-
-#if defined(USE_ARM_SIMD) && defined(_MSC_VER)
-/* Needed for EXCEPTION_ILLEGAL_INSTRUCTION */
-#include <windows.h>
-#endif
-
-#include "pixman-private.h"
-
-#ifdef USE_VMX
-
-/* The CPU detection code needs to be in a file not compiled with
- * "-maltivec -mabi=altivec", as gcc would try to save vector register
- * across function calls causing SIGILL on cpus without Altivec/vmx.
- */
-static pixman_bool_t initialized = FALSE;
-static volatile pixman_bool_t have_vmx = TRUE;
-
-#ifdef __APPLE__
-#include <sys/sysctl.h>
-
-static pixman_bool_t
-pixman_have_vmx (void)
-{
- if (!initialized)
- {
- size_t length = sizeof(have_vmx);
- int error =
- sysctlbyname ("hw.optional.altivec", &have_vmx, &length, NULL, 0);
-
- if (error)
- have_vmx = FALSE;
-
- initialized = TRUE;
- }
- return have_vmx;
-}
-
-#elif defined (__OpenBSD__)
-#include <sys/param.h>
-#include <sys/sysctl.h>
-#include <machine/cpu.h>
-
-static pixman_bool_t
-pixman_have_vmx (void)
-{
- if (!initialized)
- {
- int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC };
- size_t length = sizeof(have_vmx);
- int error =
- sysctl (mib, 2, &have_vmx, &length, NULL, 0);
-
- if (error != 0)
- have_vmx = FALSE;
-
- initialized = TRUE;
- }
- return have_vmx;
-}
-
-#elif defined (__linux__)
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <stdio.h>
-#include <linux/auxvec.h>
-#include <asm/cputable.h>
-
-static pixman_bool_t
-pixman_have_vmx (void)
-{
- if (!initialized)
- {
- char fname[64];
- unsigned long buf[64];
- ssize_t count = 0;
- pid_t pid;
- int fd, i;
-
- pid = getpid ();
- snprintf (fname, sizeof(fname) - 1, "/proc/%d/auxv", pid);
-
- fd = open (fname, O_RDONLY);
- if (fd >= 0)
- {
- for (i = 0; i <= (count / sizeof(unsigned long)); i += 2)
- {
- /* Read more if buf is empty... */
- if (i == (count / sizeof(unsigned long)))
- {
- count = read (fd, buf, sizeof(buf));
- if (count <= 0)
- break;
- i = 0;
- }
-
- if (buf[i] == AT_HWCAP)
- {
- have_vmx = !!(buf[i + 1] & PPC_FEATURE_HAS_ALTIVEC);
- initialized = TRUE;
- break;
- }
- else if (buf[i] == AT_NULL)
- {
- break;
- }
- }
- close (fd);
- }
- }
- if (!initialized)
- {
- /* Something went wrong. Assume 'no' rather than playing
- fragile tricks with catching SIGILL. */
- have_vmx = FALSE;
- initialized = TRUE;
- }
-
- return have_vmx;
-}
-
-#else /* !__APPLE__ && !__OpenBSD__ && !__linux__ */
-#include <signal.h>
-#include <setjmp.h>
-
-static jmp_buf jump_env;
-
-static void
-vmx_test (int sig,
- siginfo_t *si,
- void * unused)
-{
- longjmp (jump_env, 1);
-}
-
-static pixman_bool_t
-pixman_have_vmx (void)
-{
- struct sigaction sa, osa;
- int jmp_result;
-
- if (!initialized)
- {
- sa.sa_flags = SA_SIGINFO;
- sigemptyset (&sa.sa_mask);
- sa.sa_sigaction = vmx_test;
- sigaction (SIGILL, &sa, &osa);
- jmp_result = setjmp (jump_env);
- if (jmp_result == 0)
- {
- asm volatile ( "vor 0, 0, 0" );
- }
- sigaction (SIGILL, &osa, NULL);
- have_vmx = (jmp_result == 0);
- initialized = TRUE;
- }
- return have_vmx;
-}
-
-#endif /* __APPLE__ */
-#endif /* USE_VMX */
-
-#if defined(USE_ARM_SIMD) || defined(USE_ARM_NEON)
-
-#if defined(_MSC_VER)
-
-#if defined(USE_ARM_SIMD)
-extern int pixman_msvc_try_arm_simd_op ();
-
-pixman_bool_t
-pixman_have_arm_simd (void)
-{
- static pixman_bool_t initialized = FALSE;
- static pixman_bool_t have_arm_simd = FALSE;
-
- if (!initialized)
- {
- __try {
- pixman_msvc_try_arm_simd_op ();
- have_arm_simd = TRUE;
- } __except (GetExceptionCode () == EXCEPTION_ILLEGAL_INSTRUCTION) {
- have_arm_simd = FALSE;
- }
- initialized = TRUE;
- }
-
- return have_arm_simd;
-}
-
-#endif /* USE_ARM_SIMD */
-
-#if defined(USE_ARM_NEON)
-extern int pixman_msvc_try_arm_neon_op ();
-
-pixman_bool_t
-pixman_have_arm_neon (void)
-{
- static pixman_bool_t initialized = FALSE;
- static pixman_bool_t have_arm_neon = FALSE;
-
- if (!initialized)
- {
- __try
- {
- pixman_msvc_try_arm_neon_op ();
- have_arm_neon = TRUE;
- }
- __except (GetExceptionCode () == EXCEPTION_ILLEGAL_INSTRUCTION)
- {
- have_arm_neon = FALSE;
- }
- initialized = TRUE;
- }
-
- return have_arm_neon;
-}
-
-#endif /* USE_ARM_NEON */
-
-#else /* linux ELF */
-
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-#include <fcntl.h>
-#include <string.h>
-#include <elf.h>
-
-static pixman_bool_t arm_has_v7 = FALSE;
-static pixman_bool_t arm_has_v6 = FALSE;
-static pixman_bool_t arm_has_vfp = FALSE;
-static pixman_bool_t arm_has_neon = FALSE;
-static pixman_bool_t arm_has_iwmmxt = FALSE;
-static pixman_bool_t arm_tests_initialized = FALSE;
-
-static void
-pixman_arm_read_auxv ()
-{
- int fd;
- Elf32_auxv_t aux;
-
- fd = open ("/proc/self/auxv", O_RDONLY);
- if (fd >= 0)
- {
- while (read (fd, &aux, sizeof(Elf32_auxv_t)) == sizeof(Elf32_auxv_t))
- {
- if (aux.a_type == AT_HWCAP)
- {
- uint32_t hwcap = aux.a_un.a_val;
- /* hardcode these values to avoid depending on specific
- * versions of the hwcap header, e.g. HWCAP_NEON
- */
- arm_has_vfp = (hwcap & 64) != 0;
- arm_has_iwmmxt = (hwcap & 512) != 0;
- /* this flag is only present on kernel 2.6.29 */
- arm_has_neon = (hwcap & 4096) != 0;
- }
- else if (aux.a_type == AT_PLATFORM)
- {
- const char *plat = (const char*) aux.a_un.a_val;
- if (strncmp (plat, "v7l", 3) == 0)
- {
- arm_has_v7 = TRUE;
- arm_has_v6 = TRUE;
- }
- else if (strncmp (plat, "v6l", 3) == 0)
- {
- arm_has_v6 = TRUE;
- }
- }
- }
- close (fd);
- }
-
- arm_tests_initialized = TRUE;
-}
-
-#if defined(USE_ARM_SIMD)
-pixman_bool_t
-pixman_have_arm_simd (void)
-{
- if (!arm_tests_initialized)
- pixman_arm_read_auxv ();
-
- return arm_has_v6;
-}
-
-#endif /* USE_ARM_SIMD */
-
-#if defined(USE_ARM_NEON)
-pixman_bool_t
-pixman_have_arm_neon (void)
-{
- if (!arm_tests_initialized)
- pixman_arm_read_auxv ();
-
- return arm_has_neon;
-}
-
-#endif /* USE_ARM_NEON */
-
-#endif /* linux */
-
-#endif /* USE_ARM_SIMD || USE_ARM_NEON */
-
-#if defined(USE_MMX) || defined(USE_SSE2)
-/* The CPU detection code needs to be in a file not compiled with
- * "-mmmx -msse", as gcc would generate CMOV instructions otherwise
- * that would lead to SIGILL instructions on old CPUs that don't have
- * it.
- */
-#if !defined(__amd64__) && !defined(__x86_64__) && !defined(_M_AMD64)
-
-#ifdef HAVE_GETISAX
-#include <sys/auxv.h>
-#endif
-
-typedef enum
-{
- NO_FEATURES = 0,
- MMX = 0x1,
- MMX_EXTENSIONS = 0x2,
- SSE = 0x6,
- SSE2 = 0x8,
- CMOV = 0x10
-} cpu_features_t;
-
-
-static unsigned int
-detect_cpu_features (void)
-{
- unsigned int features = 0;
- unsigned int result = 0;
-
-#ifdef HAVE_GETISAX
- if (getisax (&result, 1))
- {
- if (result & AV_386_CMOV)
- features |= CMOV;
- if (result & AV_386_MMX)
- features |= MMX;
- if (result & AV_386_AMD_MMX)
- features |= MMX_EXTENSIONS;
- if (result & AV_386_SSE)
- features |= SSE;
- if (result & AV_386_SSE2)
- features |= SSE2;
- }
-#else
- char vendor[13];
-#ifdef _MSC_VER
- int vendor0 = 0, vendor1, vendor2;
-#endif
- vendor[0] = 0;
- vendor[12] = 0;
-
-#ifdef __GNUC__
- /* see p. 118 of amd64 instruction set manual Vol3 */
- /* We need to be careful about the handling of %ebx and
- * %esp here. We can't declare either one as clobbered
- * since they are special registers (%ebx is the "PIC
- * register" holding an offset to global data, %esp the
- * stack pointer), so we need to make sure they have their
- * original values when we access the output operands.
- */
- __asm__ (
- "pushf\n"
- "pop %%eax\n"
- "mov %%eax, %%ecx\n"
- "xor $0x00200000, %%eax\n"
- "push %%eax\n"
- "popf\n"
- "pushf\n"
- "pop %%eax\n"
- "mov $0x0, %%edx\n"
- "xor %%ecx, %%eax\n"
- "jz 1f\n"
-
- "mov $0x00000000, %%eax\n"
- "push %%ebx\n"
- "cpuid\n"
- "mov %%ebx, %%eax\n"
- "pop %%ebx\n"
- "mov %%eax, %1\n"
- "mov %%edx, %2\n"
- "mov %%ecx, %3\n"
- "mov $0x00000001, %%eax\n"
- "push %%ebx\n"
- "cpuid\n"
- "pop %%ebx\n"
- "1:\n"
- "mov %%edx, %0\n"
- : "=r" (result),
- "=m" (vendor[0]),
- "=m" (vendor[4]),
- "=m" (vendor[8])
- :
- : "%eax", "%ecx", "%edx"
- );
-
-#elif defined (_MSC_VER)
-
- _asm {
- pushfd
- pop eax
- mov ecx, eax
- xor eax, 00200000h
- push eax
- popfd
- pushfd
- pop eax
- mov edx, 0
- xor eax, ecx
- jz nocpuid
-
- mov eax, 0
- push ebx
- cpuid
- mov eax, ebx
- pop ebx
- mov vendor0, eax
- mov vendor1, edx
- mov vendor2, ecx
- mov eax, 1
- push ebx
- cpuid
- pop ebx
- nocpuid:
- mov result, edx
- }
- memmove (vendor + 0, &vendor0, 4);
- memmove (vendor + 4, &vendor1, 4);
- memmove (vendor + 8, &vendor2, 4);
-
-#else
-# error unsupported compiler
-#endif
-
- features = 0;
- if (result)
- {
- /* result now contains the standard feature bits */
- if (result & (1 << 15))
- features |= CMOV;
- if (result & (1 << 23))
- features |= MMX;
- if (result & (1 << 25))
- features |= SSE;
- if (result & (1 << 26))
- features |= SSE2;
- if ((features & MMX) && !(features & SSE) &&
- (strcmp (vendor, "AuthenticAMD") == 0 ||
- strcmp (vendor, "Geode by NSC") == 0))
- {
- /* check for AMD MMX extensions */
-#ifdef __GNUC__
- __asm__ (
- " push %%ebx\n"
- " mov $0x80000000, %%eax\n"
- " cpuid\n"
- " xor %%edx, %%edx\n"
- " cmp $0x1, %%eax\n"
- " jge 2f\n"
- " mov $0x80000001, %%eax\n"
- " cpuid\n"
- "2:\n"
- " pop %%ebx\n"
- " mov %%edx, %0\n"
- : "=r" (result)
- :
- : "%eax", "%ecx", "%edx"
- );
-#elif defined _MSC_VER
- _asm {
- push ebx
- mov eax, 80000000h
- cpuid
- xor edx, edx
- cmp eax, 1
- jge notamd
- mov eax, 80000001h
- cpuid
- notamd:
- pop ebx
- mov result, edx
- }
-#endif
- if (result & (1 << 22))
- features |= MMX_EXTENSIONS;
- }
- }
-#endif /* HAVE_GETISAX */
-
- return features;
-}
-
-static pixman_bool_t
-pixman_have_mmx (void)
-{
- static pixman_bool_t initialized = FALSE;
- static pixman_bool_t mmx_present;
-
- if (!initialized)
- {
- unsigned int features = detect_cpu_features ();
- mmx_present = (features & (MMX | MMX_EXTENSIONS)) == (MMX | MMX_EXTENSIONS);
- initialized = TRUE;
- }
-
- return mmx_present;
-}
-
-#ifdef USE_SSE2
-static pixman_bool_t
-pixman_have_sse2 (void)
-{
- static pixman_bool_t initialized = FALSE;
- static pixman_bool_t sse2_present;
-
- if (!initialized)
- {
- unsigned int features = detect_cpu_features ();
- sse2_present = (features & (MMX | MMX_EXTENSIONS | SSE | SSE2)) == (MMX | MMX_EXTENSIONS | SSE | SSE2);
- initialized = TRUE;
- }
-
- return sse2_present;
-}
-
-#endif
-
-#else /* __amd64__ */
-#ifdef USE_MMX
-#define pixman_have_mmx() TRUE
-#endif
-#ifdef USE_SSE2
-#define pixman_have_sse2() TRUE
-#endif
-#endif /* __amd64__ */
-#endif
-
-pixman_implementation_t *
-_pixman_choose_implementation (void)
-{
-#ifdef USE_SSE2
- if (pixman_have_sse2 ())
- return _pixman_implementation_create_sse2 ();
-#endif
-#ifdef USE_MMX
- if (pixman_have_mmx ())
- return _pixman_implementation_create_mmx ();
-#endif
-
-#ifdef USE_ARM_NEON
- if (pixman_have_arm_neon ())
- return _pixman_implementation_create_arm_neon ();
-#endif
-#ifdef USE_ARM_SIMD
- if (pixman_have_arm_simd ())
- return _pixman_implementation_create_arm_simd ();
-#endif
-#ifdef USE_VMX
- if (pixman_have_vmx ())
- return _pixman_implementation_create_vmx ();
-#endif
-
- return _pixman_implementation_create_fast_path ();
-}
-
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission. SuSE makes no representations about the
+ * suitability of this software for any purpose. It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <string.h>
+
+#if defined(USE_ARM_SIMD) && defined(_MSC_VER)
+/* Needed for EXCEPTION_ILLEGAL_INSTRUCTION */
+#include <windows.h>
+#endif
+
+#include "pixman-private.h"
+
+#ifdef USE_VMX
+
+/* The CPU detection code needs to be in a file not compiled with
+ * "-maltivec -mabi=altivec", as gcc would try to save vector register
+ * across function calls causing SIGILL on cpus without Altivec/vmx.
+ */
+static pixman_bool_t initialized = FALSE;
+static volatile pixman_bool_t have_vmx = TRUE;
+
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+
+static pixman_bool_t
+pixman_have_vmx (void)
+{
+ if (!initialized)
+ {
+ size_t length = sizeof(have_vmx);
+ int error =
+ sysctlbyname ("hw.optional.altivec", &have_vmx, &length, NULL, 0);
+
+ if (error)
+ have_vmx = FALSE;
+
+ initialized = TRUE;
+ }
+ return have_vmx;
+}
+
+#elif defined (__OpenBSD__)
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <machine/cpu.h>
+
+static pixman_bool_t
+pixman_have_vmx (void)
+{
+ if (!initialized)
+ {
+ int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC };
+ size_t length = sizeof(have_vmx);
+ int error =
+ sysctl (mib, 2, &have_vmx, &length, NULL, 0);
+
+ if (error != 0)
+ have_vmx = FALSE;
+
+ initialized = TRUE;
+ }
+ return have_vmx;
+}
+
+#elif defined (__linux__)
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <linux/auxvec.h>
+#include <asm/cputable.h>
+
+static pixman_bool_t
+pixman_have_vmx (void)
+{
+ if (!initialized)
+ {
+ char fname[64];
+ unsigned long buf[64];
+ ssize_t count = 0;
+ pid_t pid;
+ int fd, i;
+
+ pid = getpid ();
+ snprintf (fname, sizeof(fname) - 1, "/proc/%d/auxv", pid);
+
+ fd = open (fname, O_RDONLY);
+ if (fd >= 0)
+ {
+ for (i = 0; i <= (count / sizeof(unsigned long)); i += 2)
+ {
+ /* Read more if buf is empty... */
+ if (i == (count / sizeof(unsigned long)))
+ {
+ count = read (fd, buf, sizeof(buf));
+ if (count <= 0)
+ break;
+ i = 0;
+ }
+
+ if (buf[i] == AT_HWCAP)
+ {
+ have_vmx = !!(buf[i + 1] & PPC_FEATURE_HAS_ALTIVEC);
+ initialized = TRUE;
+ break;
+ }
+ else if (buf[i] == AT_NULL)
+ {
+ break;
+ }
+ }
+ close (fd);
+ }
+ }
+ if (!initialized)
+ {
+ /* Something went wrong. Assume 'no' rather than playing
+ fragile tricks with catching SIGILL. */
+ have_vmx = FALSE;
+ initialized = TRUE;
+ }
+
+ return have_vmx;
+}
+
+#else /* !__APPLE__ && !__OpenBSD__ && !__linux__ */
+#include <signal.h>
+#include <setjmp.h>
+
+static jmp_buf jump_env;
+
+static void
+vmx_test (int sig,
+ siginfo_t *si,
+ void * unused)
+{
+ longjmp (jump_env, 1);
+}
+
+static pixman_bool_t
+pixman_have_vmx (void)
+{
+ struct sigaction sa, osa;
+ int jmp_result;
+
+ if (!initialized)
+ {
+ sa.sa_flags = SA_SIGINFO;
+ sigemptyset (&sa.sa_mask);
+ sa.sa_sigaction = vmx_test;
+ sigaction (SIGILL, &sa, &osa);
+ jmp_result = setjmp (jump_env);
+ if (jmp_result == 0)
+ {
+ asm volatile ( "vor 0, 0, 0" );
+ }
+ sigaction (SIGILL, &osa, NULL);
+ have_vmx = (jmp_result == 0);
+ initialized = TRUE;
+ }
+ return have_vmx;
+}
+
+#endif /* __APPLE__ */
+#endif /* USE_VMX */
+
+#if defined(USE_ARM_SIMD) || defined(USE_ARM_NEON)
+
+#if defined(_MSC_VER)
+
+#if defined(USE_ARM_SIMD)
+extern int pixman_msvc_try_arm_simd_op ();
+
+pixman_bool_t
+pixman_have_arm_simd (void)
+{
+ static pixman_bool_t initialized = FALSE;
+ static pixman_bool_t have_arm_simd = FALSE;
+
+ if (!initialized)
+ {
+ __try {
+ pixman_msvc_try_arm_simd_op ();
+ have_arm_simd = TRUE;
+ } __except (GetExceptionCode () == EXCEPTION_ILLEGAL_INSTRUCTION) {
+ have_arm_simd = FALSE;
+ }
+ initialized = TRUE;
+ }
+
+ return have_arm_simd;
+}
+
+#endif /* USE_ARM_SIMD */
+
+#if defined(USE_ARM_NEON)
+extern int pixman_msvc_try_arm_neon_op ();
+
+pixman_bool_t
+pixman_have_arm_neon (void)
+{
+ static pixman_bool_t initialized = FALSE;
+ static pixman_bool_t have_arm_neon = FALSE;
+
+ if (!initialized)
+ {
+ __try
+ {
+ pixman_msvc_try_arm_neon_op ();
+ have_arm_neon = TRUE;
+ }
+ __except (GetExceptionCode () == EXCEPTION_ILLEGAL_INSTRUCTION)
+ {
+ have_arm_neon = FALSE;
+ }
+ initialized = TRUE;
+ }
+
+ return have_arm_neon;
+}
+
+#endif /* USE_ARM_NEON */
+
+#elif defined (__linux__) /* linux ELF */
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <string.h>
+#include <elf.h>
+
+static pixman_bool_t arm_has_v7 = FALSE;
+static pixman_bool_t arm_has_v6 = FALSE;
+static pixman_bool_t arm_has_vfp = FALSE;
+static pixman_bool_t arm_has_neon = FALSE;
+static pixman_bool_t arm_has_iwmmxt = FALSE;
+static pixman_bool_t arm_tests_initialized = FALSE;
+
+static void
+pixman_arm_read_auxv ()
+{
+ int fd;
+ Elf32_auxv_t aux;
+
+ fd = open ("/proc/self/auxv", O_RDONLY);
+ if (fd >= 0)
+ {
+ while (read (fd, &aux, sizeof(Elf32_auxv_t)) == sizeof(Elf32_auxv_t))
+ {
+ if (aux.a_type == AT_HWCAP)
+ {
+ uint32_t hwcap = aux.a_un.a_val;
+ /* hardcode these values to avoid depending on specific
+ * versions of the hwcap header, e.g. HWCAP_NEON
+ */
+ arm_has_vfp = (hwcap & 64) != 0;
+ arm_has_iwmmxt = (hwcap & 512) != 0;
+ /* this flag is only present on kernel 2.6.29 */
+ arm_has_neon = (hwcap & 4096) != 0;
+ }
+ else if (aux.a_type == AT_PLATFORM)
+ {
+ const char *plat = (const char*) aux.a_un.a_val;
+ if (strncmp (plat, "v7l", 3) == 0)
+ {
+ arm_has_v7 = TRUE;
+ arm_has_v6 = TRUE;
+ }
+ else if (strncmp (plat, "v6l", 3) == 0)
+ {
+ arm_has_v6 = TRUE;
+ }
+ }
+ }
+ close (fd);
+ }
+
+ arm_tests_initialized = TRUE;
+}
+
+#if defined(USE_ARM_SIMD)
+pixman_bool_t
+pixman_have_arm_simd (void)
+{
+ if (!arm_tests_initialized)
+ pixman_arm_read_auxv ();
+
+ return arm_has_v6;
+}
+
+#endif /* USE_ARM_SIMD */
+
+#if defined(USE_ARM_NEON)
+pixman_bool_t
+pixman_have_arm_neon (void)
+{
+ if (!arm_tests_initialized)
+ pixman_arm_read_auxv ();
+
+ return arm_has_neon;
+}
+
+#endif /* USE_ARM_NEON */
+
+#else /* linux ELF */
+
+#define pixman_have_arm_simd() FALSE
+#define pixman_have_arm_neon() FALSE
+
+#endif
+
+#endif /* USE_ARM_SIMD || USE_ARM_NEON */
+
+#if defined(USE_MMX) || defined(USE_SSE2)
+/* The CPU detection code needs to be in a file not compiled with
+ * "-mmmx -msse", as gcc would generate CMOV instructions otherwise
+ * that would lead to SIGILL instructions on old CPUs that don't have
+ * it.
+ */
+#if !defined(__amd64__) && !defined(__x86_64__) && !defined(_M_AMD64)
+
+#ifdef HAVE_GETISAX
+#include <sys/auxv.h>
+#endif
+
+typedef enum
+{
+ NO_FEATURES = 0,
+ MMX = 0x1,
+ MMX_EXTENSIONS = 0x2,
+ SSE = 0x6,
+ SSE2 = 0x8,
+ CMOV = 0x10
+} cpu_features_t;
+
+
+static unsigned int
+detect_cpu_features (void)
+{
+ unsigned int features = 0;
+ unsigned int result = 0;
+
+#ifdef HAVE_GETISAX
+ if (getisax (&result, 1))
+ {
+ if (result & AV_386_CMOV)
+ features |= CMOV;
+ if (result & AV_386_MMX)
+ features |= MMX;
+ if (result & AV_386_AMD_MMX)
+ features |= MMX_EXTENSIONS;
+ if (result & AV_386_SSE)
+ features |= SSE;
+ if (result & AV_386_SSE2)
+ features |= SSE2;
+ }
+#else
+ char vendor[13];
+#ifdef _MSC_VER
+ int vendor0 = 0, vendor1, vendor2;
+#endif
+ vendor[0] = 0;
+ vendor[12] = 0;
+
+#ifdef __GNUC__
+ /* see p. 118 of amd64 instruction set manual Vol3 */
+ /* We need to be careful about the handling of %ebx and
+ * %esp here. We can't declare either one as clobbered
+ * since they are special registers (%ebx is the "PIC
+ * register" holding an offset to global data, %esp the
+ * stack pointer), so we need to make sure they have their
+ * original values when we access the output operands.
+ */
+ __asm__ (
+ "pushf\n"
+ "pop %%eax\n"
+ "mov %%eax, %%ecx\n"
+ "xor $0x00200000, %%eax\n"
+ "push %%eax\n"
+ "popf\n"
+ "pushf\n"
+ "pop %%eax\n"
+ "mov $0x0, %%edx\n"
+ "xor %%ecx, %%eax\n"
+ "jz 1f\n"
+
+ "mov $0x00000000, %%eax\n"
+ "push %%ebx\n"
+ "cpuid\n"
+ "mov %%ebx, %%eax\n"
+ "pop %%ebx\n"
+ "mov %%eax, %1\n"
+ "mov %%edx, %2\n"
+ "mov %%ecx, %3\n"
+ "mov $0x00000001, %%eax\n"
+ "push %%ebx\n"
+ "cpuid\n"
+ "pop %%ebx\n"
+ "1:\n"
+ "mov %%edx, %0\n"
+ : "=r" (result),
+ "=m" (vendor[0]),
+ "=m" (vendor[4]),
+ "=m" (vendor[8])
+ :
+ : "%eax", "%ecx", "%edx"
+ );
+
+#elif defined (_MSC_VER)
+
+ _asm {
+ pushfd
+ pop eax
+ mov ecx, eax
+ xor eax, 00200000h
+ push eax
+ popfd
+ pushfd
+ pop eax
+ mov edx, 0
+ xor eax, ecx
+ jz nocpuid
+
+ mov eax, 0
+ push ebx
+ cpuid
+ mov eax, ebx
+ pop ebx
+ mov vendor0, eax
+ mov vendor1, edx
+ mov vendor2, ecx
+ mov eax, 1
+ push ebx
+ cpuid
+ pop ebx
+ nocpuid:
+ mov result, edx
+ }
+ memmove (vendor + 0, &vendor0, 4);
+ memmove (vendor + 4, &vendor1, 4);
+ memmove (vendor + 8, &vendor2, 4);
+
+#else
+# error unsupported compiler
+#endif
+
+ features = 0;
+ if (result)
+ {
+ /* result now contains the standard feature bits */
+ if (result & (1 << 15))
+ features |= CMOV;
+ if (result & (1 << 23))
+ features |= MMX;
+ if (result & (1 << 25))
+ features |= SSE;
+ if (result & (1 << 26))
+ features |= SSE2;
+ if ((features & MMX) && !(features & SSE) &&
+ (strcmp (vendor, "AuthenticAMD") == 0 ||
+ strcmp (vendor, "Geode by NSC") == 0))
+ {
+ /* check for AMD MMX extensions */
+#ifdef __GNUC__
+ __asm__ (
+ " push %%ebx\n"
+ " mov $0x80000000, %%eax\n"
+ " cpuid\n"
+ " xor %%edx, %%edx\n"
+ " cmp $0x1, %%eax\n"
+ " jge 2f\n"
+ " mov $0x80000001, %%eax\n"
+ " cpuid\n"
+ "2:\n"
+ " pop %%ebx\n"
+ " mov %%edx, %0\n"
+ : "=r" (result)
+ :
+ : "%eax", "%ecx", "%edx"
+ );
+#elif defined _MSC_VER
+ _asm {
+ push ebx
+ mov eax, 80000000h
+ cpuid
+ xor edx, edx
+ cmp eax, 1
+ jge notamd
+ mov eax, 80000001h
+ cpuid
+ notamd:
+ pop ebx
+ mov result, edx
+ }
+#endif
+ if (result & (1 << 22))
+ features |= MMX_EXTENSIONS;
+ }
+ }
+#endif /* HAVE_GETISAX */
+
+ return features;
+}
+
+static pixman_bool_t
+pixman_have_mmx (void)
+{
+ static pixman_bool_t initialized = FALSE;
+ static pixman_bool_t mmx_present;
+
+ if (!initialized)
+ {
+ unsigned int features = detect_cpu_features ();
+ mmx_present = (features & (MMX | MMX_EXTENSIONS)) == (MMX | MMX_EXTENSIONS);
+ initialized = TRUE;
+ }
+
+ return mmx_present;
+}
+
+#ifdef USE_SSE2
+static pixman_bool_t
+pixman_have_sse2 (void)
+{
+ static pixman_bool_t initialized = FALSE;
+ static pixman_bool_t sse2_present;
+
+ if (!initialized)
+ {
+ unsigned int features = detect_cpu_features ();
+ sse2_present = (features & (MMX | MMX_EXTENSIONS | SSE | SSE2)) == (MMX | MMX_EXTENSIONS | SSE | SSE2);
+ initialized = TRUE;
+ }
+
+ return sse2_present;
+}
+
+#endif
+
+#else /* __amd64__ */
+#ifdef USE_MMX
+#define pixman_have_mmx() TRUE
+#endif
+#ifdef USE_SSE2
+#define pixman_have_sse2() TRUE
+#endif
+#endif /* __amd64__ */
+#endif
+
+pixman_implementation_t *
+_pixman_choose_implementation (void)
+{
+#ifdef USE_SSE2
+ if (pixman_have_sse2 ())
+ return _pixman_implementation_create_sse2 ();
+#endif
+#ifdef USE_MMX
+ if (pixman_have_mmx ())
+ return _pixman_implementation_create_mmx ();
+#endif
+
+#ifdef USE_ARM_NEON
+ if (pixman_have_arm_neon ())
+ return _pixman_implementation_create_arm_neon ();
+#endif
+#ifdef USE_ARM_SIMD
+ if (pixman_have_arm_simd ())
+ return _pixman_implementation_create_arm_simd ();
+#endif
+#ifdef USE_VMX
+ if (pixman_have_vmx ())
+ return _pixman_implementation_create_vmx ();
+#endif
+
+ return _pixman_implementation_create_fast_path ();
+}
+
diff --git a/pixman/pixman/pixman-fast-path.c b/pixman/pixman/pixman-fast-path.c
index 01a3017c7..f103b4cf1 100644
--- a/pixman/pixman/pixman-fast-path.c
+++ b/pixman/pixman/pixman-fast-path.c
@@ -1,1937 +1,1937 @@
-/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
-/*
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007 Red Hat, Inc.
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of SuSE not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission. SuSE makes no representations about the
- * suitability of this software for any purpose. It is provided "as is"
- * without express or implied warranty.
- *
- * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
- * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
- * Author: Keith Packard, SuSE, Inc.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include <string.h>
-#include <stdlib.h>
-#include "pixman-private.h"
-#include "pixman-combine32.h"
-#include "pixman-fast-path.h"
-
-static force_inline uint32_t
-fetch_24 (uint8_t *a)
-{
- if (((unsigned long)a) & 1)
- {
-#ifdef WORDS_BIGENDIAN
- return (*a << 16) | (*(uint16_t *)(a + 1));
-#else
- return *a | (*(uint16_t *)(a + 1) << 8);
-#endif
- }
- else
- {
-#ifdef WORDS_BIGENDIAN
- return (*(uint16_t *)a << 8) | *(a + 2);
-#else
- return *(uint16_t *)a | (*(a + 2) << 16);
-#endif
- }
-}
-
-static force_inline void
-store_24 (uint8_t *a,
- uint32_t v)
-{
- if (((unsigned long)a) & 1)
- {
-#ifdef WORDS_BIGENDIAN
- *a = (uint8_t) (v >> 16);
- *(uint16_t *)(a + 1) = (uint16_t) (v);
-#else
- *a = (uint8_t) (v);
- *(uint16_t *)(a + 1) = (uint16_t) (v >> 8);
-#endif
- }
- else
- {
-#ifdef WORDS_BIGENDIAN
- *(uint16_t *)a = (uint16_t)(v >> 8);
- *(a + 2) = (uint8_t)v;
-#else
- *(uint16_t *)a = (uint16_t)v;
- *(a + 2) = (uint8_t)(v >> 16);
-#endif
- }
-}
-
-static force_inline uint32_t
-over (uint32_t src,
- uint32_t dest)
-{
- uint32_t a = ~src >> 24;
-
- UN8x4_MUL_UN8_ADD_UN8x4 (dest, a, src);
-
- return dest;
-}
-
-static uint32_t
-in (uint32_t x,
- uint8_t y)
-{
- uint16_t a = y;
-
- UN8x4_MUL_UN8 (x, a);
-
- return x;
-}
-
-/*
- * Naming convention:
- *
- * op_src_mask_dest
- */
-static void
-fast_composite_over_x888_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *src, *src_line;
- uint32_t *dst, *dst_line;
- uint8_t *mask, *mask_line;
- int src_stride, mask_stride, dst_stride;
- uint8_t m;
- uint32_t s, d;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- src = src_line;
- src_line += src_stride;
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
-
- w = width;
- while (w--)
- {
- m = *mask++;
- if (m)
- {
- s = *src | 0xff000000;
-
- if (m == 0xff)
- {
- *dst = s;
- }
- else
- {
- d = in (s, m);
- *dst = over (d, *dst);
- }
- }
- src++;
- dst++;
- }
- }
-}
-
-static void
-fast_composite_in_n_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dest_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint8_t *dst_line, *dst;
- uint8_t *mask_line, *mask, m;
- int dst_stride, mask_stride;
- int32_t w;
- uint16_t t;
-
- src = _pixman_image_get_solid (src_image, dest_image->bits.format);
-
- srca = src >> 24;
-
- PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- if (srca == 0xff)
- {
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- m = *mask++;
-
- if (m == 0)
- *dst = 0;
- else if (m != 0xff)
- *dst = MUL_UN8 (m, *dst, t);
-
- dst++;
- }
- }
- }
- else
- {
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- m = *mask++;
- m = MUL_UN8 (m, srca, t);
-
- if (m == 0)
- *dst = 0;
- else if (m != 0xff)
- *dst = MUL_UN8 (m, *dst, t);
-
- dst++;
- }
- }
- }
-}
-
-static void
-fast_composite_in_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dest_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- uint8_t *src_line, *src;
- int dst_stride, src_stride;
- int32_t w;
- uint8_t s;
- uint16_t t;
-
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
-
- if (s == 0)
- *dst = 0;
- else if (s != 0xff)
- *dst = MUL_UN8 (s, *dst, t);
-
- dst++;
- }
- }
-}
-
-static void
-fast_composite_over_n_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint32_t *dst_line, *dst, d;
- uint8_t *mask_line, *mask, m;
- int dst_stride, mask_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- m = *mask++;
- if (m == 0xff)
- {
- if (srca == 0xff)
- *dst = src;
- else
- *dst = over (src, *dst);
- }
- else if (m)
- {
- d = in (src, m);
- *dst = over (d, *dst);
- }
- dst++;
- }
- }
-}
-
-static void
-fast_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca, s;
- uint32_t *dst_line, *dst, d;
- uint32_t *mask_line, *mask, ma;
- int dst_stride, mask_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- ma = *mask++;
-
- if (ma)
- {
- d = *dst;
- s = src;
-
- UN8x4_MUL_UN8x4_ADD_UN8x4 (s, ma, d);
-
- *dst = s;
- }
-
- dst++;
- }
- }
-}
-
-static void
-fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca, s;
- uint32_t *dst_line, *dst, d;
- uint32_t *mask_line, *mask, ma;
- int dst_stride, mask_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- ma = *mask++;
- if (ma == 0xffffffff)
- {
- if (srca == 0xff)
- *dst = src;
- else
- *dst = over (src, *dst);
- }
- else if (ma)
- {
- d = *dst;
- s = src;
-
- UN8x4_MUL_UN8x4 (s, ma);
- UN8x4_MUL_UN8 (ma, srca);
- ma = ~ma;
- UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
-
- *dst = d;
- }
-
- dst++;
- }
- }
-}
-
-static void
-fast_composite_over_n_8_0888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint8_t *dst_line, *dst;
- uint32_t d;
- uint8_t *mask_line, *mask, m;
- int dst_stride, mask_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- m = *mask++;
- if (m == 0xff)
- {
- if (srca == 0xff)
- {
- d = src;
- }
- else
- {
- d = fetch_24 (dst);
- d = over (src, d);
- }
- store_24 (dst, d);
- }
- else if (m)
- {
- d = over (in (src, m), fetch_24 (dst));
- store_24 (dst, d);
- }
- dst += 3;
- }
- }
-}
-
-static void
-fast_composite_over_n_8_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint16_t *dst_line, *dst;
- uint32_t d;
- uint8_t *mask_line, *mask, m;
- int dst_stride, mask_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- m = *mask++;
- if (m == 0xff)
- {
- if (srca == 0xff)
- {
- d = src;
- }
- else
- {
- d = *dst;
- d = over (src, CONVERT_0565_TO_0888 (d));
- }
- *dst = CONVERT_8888_TO_0565 (d);
- }
- else if (m)
- {
- d = *dst;
- d = over (in (src, m), CONVERT_0565_TO_0888 (d));
- *dst = CONVERT_8888_TO_0565 (d);
- }
- dst++;
- }
- }
-}
-
-static void
-fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca, s;
- uint16_t src16;
- uint16_t *dst_line, *dst;
- uint32_t d;
- uint32_t *mask_line, *mask, ma;
- int dst_stride, mask_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- src16 = CONVERT_8888_TO_0565 (src);
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- ma = *mask++;
- if (ma == 0xffffffff)
- {
- if (srca == 0xff)
- {
- *dst = src16;
- }
- else
- {
- d = *dst;
- d = over (src, CONVERT_0565_TO_0888 (d));
- *dst = CONVERT_8888_TO_0565 (d);
- }
- }
- else if (ma)
- {
- d = *dst;
- d = CONVERT_0565_TO_0888 (d);
-
- s = src;
-
- UN8x4_MUL_UN8x4 (s, ma);
- UN8x4_MUL_UN8 (ma, srca);
- ma = ~ma;
- UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
-
- *dst = CONVERT_8888_TO_0565 (d);
- }
- dst++;
- }
- }
-}
-
-static void
-fast_composite_over_8888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src, s;
- int dst_stride, src_stride;
- uint8_t a;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
- a = s >> 24;
- if (a == 0xff)
- *dst = s;
- else if (s)
- *dst = over (s, *dst);
- dst++;
- }
- }
-}
-
-static void
-fast_composite_src_x888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src;
- int dst_stride, src_stride;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- *dst++ = (*src++) | 0xff000000;
- }
-}
-
-#if 0
-static void
-fast_composite_over_8888_0888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- uint32_t d;
- uint32_t *src_line, *src, s;
- uint8_t a;
- int dst_stride, src_stride;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
- a = s >> 24;
- if (a)
- {
- if (a == 0xff)
- d = s;
- else
- d = over (s, fetch_24 (dst));
-
- store_24 (dst, d);
- }
- dst += 3;
- }
- }
-}
-#endif
-
-static void
-fast_composite_over_8888_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint16_t *dst_line, *dst;
- uint32_t d;
- uint32_t *src_line, *src, s;
- uint8_t a;
- int dst_stride, src_stride;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
- a = s >> 24;
- if (s)
- {
- if (a == 0xff)
- {
- d = s;
- }
- else
- {
- d = *dst;
- d = over (s, CONVERT_0565_TO_0888 (d));
- }
- *dst = CONVERT_8888_TO_0565 (d);
- }
- dst++;
- }
- }
-}
-
-static void
-fast_composite_src_x888_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint16_t *dst_line, *dst;
- uint32_t *src_line, *src, s;
- int dst_stride, src_stride;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
- *dst = CONVERT_8888_TO_0565 (s);
- dst++;
- }
- }
-}
-
-static void
-fast_composite_add_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- uint8_t *src_line, *src;
- int dst_stride, src_stride;
- int32_t w;
- uint8_t s, d;
- uint16_t t;
-
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
- if (s)
- {
- if (s != 0xff)
- {
- d = *dst;
- t = d + s;
- s = t | (0 - (t >> 8));
- }
- *dst = s;
- }
- dst++;
- }
- }
-}
-
-static void
-fast_composite_add_8888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src;
- int dst_stride, src_stride;
- int32_t w;
- uint32_t s, d;
-
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
- if (s)
- {
- if (s != 0xffffffff)
- {
- d = *dst;
- if (d)
- UN8x4_ADD_UN8x4 (s, d);
- }
- *dst = s;
- }
- dst++;
- }
- }
-}
-
-static void
-fast_composite_add_n_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- uint8_t *mask_line, *mask;
- int dst_stride, mask_stride;
- int32_t w;
- uint32_t src;
- uint8_t sa;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
- sa = (src >> 24);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- uint16_t tmp;
- uint16_t a;
- uint32_t m, d;
- uint32_t r;
-
- a = *mask++;
- d = *dst;
-
- m = MUL_UN8 (sa, a, tmp);
- r = ADD_UN8 (m, d, tmp);
-
- *dst++ = r;
- }
- }
-}
-
-#ifdef WORDS_BIGENDIAN
-#define CREATE_BITMASK(n) (0x80000000 >> (n))
-#define UPDATE_BITMASK(n) ((n) >> 1)
-#else
-#define CREATE_BITMASK(n) (1 << (n))
-#define UPDATE_BITMASK(n) ((n) << 1)
-#endif
-
-#define TEST_BIT(p, n) \
- (*((p) + ((n) >> 5)) & CREATE_BITMASK ((n) & 31))
-#define SET_BIT(p, n) \
- do { *((p) + ((n) >> 5)) |= CREATE_BITMASK ((n) & 31); } while (0);
-
-static void
-fast_composite_add_1000_1000 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src;
- int dst_stride, src_stride;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (src_image, 0, src_y, uint32_t,
- src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (dst_image, 0, dest_y, uint32_t,
- dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- /*
- * TODO: improve performance by processing uint32_t data instead
- * of individual bits
- */
- if (TEST_BIT (src, src_x + w))
- SET_BIT (dst, dest_x + w);
- }
- }
-}
-
-static void
-fast_composite_over_n_1_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint32_t *dst, *dst_line;
- uint32_t *mask, *mask_line;
- int mask_stride, dst_stride;
- uint32_t bitcache, bitmask;
- int32_t w;
-
- if (width <= 0)
- return;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t,
- dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
- mask_stride, mask_line, 1);
- mask_line += mask_x >> 5;
-
- if (srca == 0xff)
- {
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (mask_x & 31);
-
- while (w--)
- {
- if (bitmask == 0)
- {
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (0);
- }
- if (bitcache & bitmask)
- *dst = src;
- bitmask = UPDATE_BITMASK (bitmask);
- dst++;
- }
- }
- }
- else
- {
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (mask_x & 31);
-
- while (w--)
- {
- if (bitmask == 0)
- {
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (0);
- }
- if (bitcache & bitmask)
- *dst = over (src, *dst);
- bitmask = UPDATE_BITMASK (bitmask);
- dst++;
- }
- }
- }
-}
-
-static void
-fast_composite_over_n_1_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint16_t *dst, *dst_line;
- uint32_t *mask, *mask_line;
- int mask_stride, dst_stride;
- uint32_t bitcache, bitmask;
- int32_t w;
- uint32_t d;
- uint16_t src565;
-
- if (width <= 0)
- return;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t,
- dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
- mask_stride, mask_line, 1);
- mask_line += mask_x >> 5;
-
- if (srca == 0xff)
- {
- src565 = CONVERT_8888_TO_0565 (src);
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (mask_x & 31);
-
- while (w--)
- {
- if (bitmask == 0)
- {
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (0);
- }
- if (bitcache & bitmask)
- *dst = src565;
- bitmask = UPDATE_BITMASK (bitmask);
- dst++;
- }
- }
- }
- else
- {
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (mask_x & 31);
-
- while (w--)
- {
- if (bitmask == 0)
- {
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (0);
- }
- if (bitcache & bitmask)
- {
- d = over (src, CONVERT_0565_TO_0888 (*dst));
- *dst = CONVERT_8888_TO_0565 (d);
- }
- bitmask = UPDATE_BITMASK (bitmask);
- dst++;
- }
- }
- }
-}
-
-/*
- * Simple bitblt
- */
-
-static void
-fast_composite_solid_fill (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- if (dst_image->bits.format == PIXMAN_a1)
- {
- src = src >> 31;
- }
- else if (dst_image->bits.format == PIXMAN_a8)
- {
- src = src >> 24;
- }
- else if (dst_image->bits.format == PIXMAN_r5g6b5 ||
- dst_image->bits.format == PIXMAN_b5g6r5)
- {
- src = CONVERT_8888_TO_0565 (src);
- }
-
- pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
- PIXMAN_FORMAT_BPP (dst_image->bits.format),
- dest_x, dest_y,
- width, height,
- src);
-}
-
-static void
-fast_composite_src_memcpy (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- int bpp = PIXMAN_FORMAT_BPP (dst_image->bits.format) / 8;
- uint32_t n_bytes = width * bpp;
- int dst_stride, src_stride;
- uint8_t *dst;
- uint8_t *src;
-
- src_stride = src_image->bits.rowstride * 4;
- dst_stride = dst_image->bits.rowstride * 4;
-
- src = (uint8_t *)src_image->bits.bits + src_y * src_stride + src_x * bpp;
- dst = (uint8_t *)dst_image->bits.bits + dest_y * dst_stride + dest_x * bpp;
-
- while (height--)
- {
- memcpy (dst, src, n_bytes);
-
- dst += dst_stride;
- src += src_stride;
- }
-}
-
-FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER);
-FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE);
-FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD);
-FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL);
-FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER);
-FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE);
-FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD);
-FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL);
-FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER);
-FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE);
-FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD);
-FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL);
-FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL);
-FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER);
-FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE);
-FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD);
-FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL);
-
-/* Use more unrolling for src_0565_0565 because it is typically CPU bound */
-static force_inline void
-scaled_nearest_scanline_565_565_SRC (uint16_t * dst,
- uint16_t * src,
- int32_t w,
- pixman_fixed_t vx,
- pixman_fixed_t unit_x,
- pixman_fixed_t max_vx)
-{
- uint16_t tmp1, tmp2, tmp3, tmp4;
- while ((w -= 4) >= 0)
- {
- tmp1 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- tmp2 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- tmp3 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- tmp4 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- *dst++ = tmp1;
- *dst++ = tmp2;
- *dst++ = tmp3;
- *dst++ = tmp4;
- }
- if (w & 2)
- {
- tmp1 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- tmp2 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- *dst++ = tmp1;
- *dst++ = tmp2;
- }
- if (w & 1)
- *dst++ = src[pixman_fixed_to_int (vx)];
-}
-
-FAST_NEAREST_MAINLOOP (565_565_cover_SRC,
- scaled_nearest_scanline_565_565_SRC,
- uint16_t, uint16_t, COVER);
-FAST_NEAREST_MAINLOOP (565_565_none_SRC,
- scaled_nearest_scanline_565_565_SRC,
- uint16_t, uint16_t, NONE);
-FAST_NEAREST_MAINLOOP (565_565_pad_SRC,
- scaled_nearest_scanline_565_565_SRC,
- uint16_t, uint16_t, PAD);
-
-static force_inline uint32_t
-fetch_nearest (pixman_repeat_t src_repeat,
- pixman_format_code_t format,
- uint32_t *src, int x, int src_width)
-{
- if (repeat (src_repeat, &x, src_width))
- {
- if (format == PIXMAN_x8r8g8b8)
- return *(src + x) | 0xff000000;
- else
- return *(src + x);
- }
- else
- {
- return 0;
- }
-}
-
-static force_inline void
-combine_over (uint32_t s, uint32_t *dst)
-{
- if (s)
- {
- uint8_t ia = 0xff - (s >> 24);
-
- if (ia)
- UN8x4_MUL_UN8_ADD_UN8x4 (*dst, ia, s);
- else
- *dst = s;
- }
-}
-
-static force_inline void
-combine_src (uint32_t s, uint32_t *dst)
-{
- *dst = s;
-}
-
-static void
-fast_composite_scaled_nearest (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line;
- uint32_t *src_line;
- int dst_stride, src_stride;
- int src_width, src_height;
- pixman_repeat_t src_repeat;
- pixman_fixed_t unit_x, unit_y;
- pixman_format_code_t src_format;
- pixman_vector_t v;
- pixman_fixed_t vy;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- /* pass in 0 instead of src_x and src_y because src_x and src_y need to be
- * transformed from destination space to source space
- */
- PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, uint32_t, src_stride, src_line, 1);
-
- /* reference point is the center of the pixel */
- v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;
- v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;
- v.vector[2] = pixman_fixed_1;
-
- if (!pixman_transform_point_3d (src_image->common.transform, &v))
- return;
-
- unit_x = src_image->common.transform->matrix[0][0];
- unit_y = src_image->common.transform->matrix[1][1];
-
- /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */
- v.vector[0] -= pixman_fixed_e;
- v.vector[1] -= pixman_fixed_e;
-
- src_height = src_image->bits.height;
- src_width = src_image->bits.width;
- src_repeat = src_image->common.repeat;
- src_format = src_image->bits.format;
-
- vy = v.vector[1];
- while (height--)
- {
- pixman_fixed_t vx = v.vector[0];
- int y = pixman_fixed_to_int (vy);
- uint32_t *dst = dst_line;
-
- dst_line += dst_stride;
-
- /* adjust the y location by a unit vector in the y direction
- * this is equivalent to transforming y+1 of the destination point to source space */
- vy += unit_y;
-
- if (!repeat (src_repeat, &y, src_height))
- {
- if (op == PIXMAN_OP_SRC)
- memset (dst, 0, sizeof (*dst) * width);
- }
- else
- {
- int w = width;
-
- uint32_t *src = src_line + y * src_stride;
-
- while (w >= 2)
- {
- uint32_t s1, s2;
- int x1, x2;
-
- x1 = pixman_fixed_to_int (vx);
- vx += unit_x;
-
- x2 = pixman_fixed_to_int (vx);
- vx += unit_x;
-
- w -= 2;
-
- s1 = fetch_nearest (src_repeat, src_format, src, x1, src_width);
- s2 = fetch_nearest (src_repeat, src_format, src, x2, src_width);
-
- if (op == PIXMAN_OP_OVER)
- {
- combine_over (s1, dst++);
- combine_over (s2, dst++);
- }
- else
- {
- combine_src (s1, dst++);
- combine_src (s2, dst++);
- }
- }
-
- while (w--)
- {
- uint32_t s;
- int x;
-
- x = pixman_fixed_to_int (vx);
- vx += unit_x;
-
- s = fetch_nearest (src_repeat, src_format, src, x, src_width);
-
- if (op == PIXMAN_OP_OVER)
- combine_over (s, dst++);
- else
- combine_src (s, dst++);
- }
- }
- }
-}
-
-static const pixman_fast_path_t c_fast_paths[] =
-{
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, fast_composite_over_n_8_0565),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, fast_composite_over_n_8_0565),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, fast_composite_over_n_8_0888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, fast_composite_over_n_8_0888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, fast_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, fast_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, fast_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, fast_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, fast_composite_over_n_1_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, fast_composite_over_n_1_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, fast_composite_over_n_1_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, fast_composite_over_n_1_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5, fast_composite_over_n_1_0565),
- PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5, fast_composite_over_n_1_0565),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, fast_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, fast_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, fast_composite_over_n_8888_0565_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, fast_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, fast_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, fast_composite_over_n_8888_0565_ca),
- PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, fast_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, fast_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, fast_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, fast_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, fast_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, fast_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, fast_composite_over_8888_0565),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, fast_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, fast_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565),
- PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888),
- PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888),
- PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8),
- PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1000_1000),
- PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, a8r8g8b8, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, fast_composite_src_x888_8888),
- PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, fast_composite_src_x888_8888),
- PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8x8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8a8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
- PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
- PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
- PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
- PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8),
- PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8),
-
- SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, 8888_8888),
-
- SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, 8888_8888),
-
- SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, 8888_565),
- SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, 8888_565),
-
- SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565),
-
- SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, 8888_8888),
-
- SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565),
-
-#define NEAREST_FAST_PATH(op,s,d) \
- { PIXMAN_OP_ ## op, \
- PIXMAN_ ## s, SCALED_NEAREST_FLAGS, \
- PIXMAN_null, 0, \
- PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
- fast_composite_scaled_nearest, \
- }
-
- NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8),
- NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8),
- NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8),
- NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8),
-
- NEAREST_FAST_PATH (SRC, x8r8g8b8, a8r8g8b8),
- NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8),
- NEAREST_FAST_PATH (SRC, x8b8g8r8, a8b8g8r8),
- NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8),
-
- NEAREST_FAST_PATH (OVER, x8r8g8b8, x8r8g8b8),
- NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8),
- NEAREST_FAST_PATH (OVER, x8b8g8r8, x8b8g8r8),
- NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8),
-
- NEAREST_FAST_PATH (OVER, x8r8g8b8, a8r8g8b8),
- NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8),
- NEAREST_FAST_PATH (OVER, x8b8g8r8, a8b8g8r8),
- NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8),
-
- { PIXMAN_OP_NONE },
-};
-
-#ifdef WORDS_BIGENDIAN
-#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (32 - (offs) - (n)))
-#else
-#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (offs))
-#endif
-
-static force_inline void
-pixman_fill1_line (uint32_t *dst, int offs, int width, int v)
-{
- if (offs)
- {
- int leading_pixels = 32 - offs;
- if (leading_pixels >= width)
- {
- if (v)
- *dst |= A1_FILL_MASK (width, offs);
- else
- *dst &= ~A1_FILL_MASK (width, offs);
- return;
- }
- else
- {
- if (v)
- *dst++ |= A1_FILL_MASK (leading_pixels, offs);
- else
- *dst++ &= ~A1_FILL_MASK (leading_pixels, offs);
- width -= leading_pixels;
- }
- }
- while (width >= 32)
- {
- if (v)
- *dst++ = 0xFFFFFFFF;
- else
- *dst++ = 0;
- width -= 32;
- }
- if (width > 0)
- {
- if (v)
- *dst |= A1_FILL_MASK (width, 0);
- else
- *dst &= ~A1_FILL_MASK (width, 0);
- }
-}
-
-static void
-pixman_fill1 (uint32_t *bits,
- int stride,
- int x,
- int y,
- int width,
- int height,
- uint32_t xor)
-{
- uint32_t *dst = bits + y * stride + (x >> 5);
- int offs = x & 31;
-
- if (xor & 1)
- {
- while (height--)
- {
- pixman_fill1_line (dst, offs, width, 1);
- dst += stride;
- }
- }
- else
- {
- while (height--)
- {
- pixman_fill1_line (dst, offs, width, 0);
- dst += stride;
- }
- }
-}
-
-static void
-pixman_fill8 (uint32_t *bits,
- int stride,
- int x,
- int y,
- int width,
- int height,
- uint32_t xor)
-{
- int byte_stride = stride * (int) sizeof (uint32_t);
- uint8_t *dst = (uint8_t *) bits;
- uint8_t v = xor & 0xff;
- int i;
-
- dst = dst + y * byte_stride + x;
-
- while (height--)
- {
- for (i = 0; i < width; ++i)
- dst[i] = v;
-
- dst += byte_stride;
- }
-}
-
-static void
-pixman_fill16 (uint32_t *bits,
- int stride,
- int x,
- int y,
- int width,
- int height,
- uint32_t xor)
-{
- int short_stride =
- (stride * (int)sizeof (uint32_t)) / (int)sizeof (uint16_t);
- uint16_t *dst = (uint16_t *)bits;
- uint16_t v = xor & 0xffff;
- int i;
-
- dst = dst + y * short_stride + x;
-
- while (height--)
- {
- for (i = 0; i < width; ++i)
- dst[i] = v;
-
- dst += short_stride;
- }
-}
-
-static void
-pixman_fill32 (uint32_t *bits,
- int stride,
- int x,
- int y,
- int width,
- int height,
- uint32_t xor)
-{
- int i;
-
- bits = bits + y * stride + x;
-
- while (height--)
- {
- for (i = 0; i < width; ++i)
- bits[i] = xor;
-
- bits += stride;
- }
-}
-
-static pixman_bool_t
-fast_path_fill (pixman_implementation_t *imp,
- uint32_t * bits,
- int stride,
- int bpp,
- int x,
- int y,
- int width,
- int height,
- uint32_t xor)
-{
- switch (bpp)
- {
- case 1:
- pixman_fill1 (bits, stride, x, y, width, height, xor);
- break;
-
- case 8:
- pixman_fill8 (bits, stride, x, y, width, height, xor);
- break;
-
- case 16:
- pixman_fill16 (bits, stride, x, y, width, height, xor);
- break;
-
- case 32:
- pixman_fill32 (bits, stride, x, y, width, height, xor);
- break;
-
- default:
- return _pixman_implementation_fill (
- imp->delegate, bits, stride, bpp, x, y, width, height, xor);
- break;
- }
-
- return TRUE;
-}
-
-pixman_implementation_t *
-_pixman_implementation_create_fast_path (void)
-{
- pixman_implementation_t *general = _pixman_implementation_create_general ();
- pixman_implementation_t *imp = _pixman_implementation_create (general, c_fast_paths);
-
- imp->fill = fast_path_fill;
-
- return imp;
-}
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission. SuSE makes no representations about the
+ * suitability of this software for any purpose. It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author: Keith Packard, SuSE, Inc.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <string.h>
+#include <stdlib.h>
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include "pixman-fast-path.h"
+
+static force_inline uint32_t
+fetch_24 (uint8_t *a)
+{
+ if (((unsigned long)a) & 1)
+ {
+#ifdef WORDS_BIGENDIAN
+ return (*a << 16) | (*(uint16_t *)(a + 1));
+#else
+ return *a | (*(uint16_t *)(a + 1) << 8);
+#endif
+ }
+ else
+ {
+#ifdef WORDS_BIGENDIAN
+ return (*(uint16_t *)a << 8) | *(a + 2);
+#else
+ return *(uint16_t *)a | (*(a + 2) << 16);
+#endif
+ }
+}
+
+static force_inline void
+store_24 (uint8_t *a,
+ uint32_t v)
+{
+ if (((unsigned long)a) & 1)
+ {
+#ifdef WORDS_BIGENDIAN
+ *a = (uint8_t) (v >> 16);
+ *(uint16_t *)(a + 1) = (uint16_t) (v);
+#else
+ *a = (uint8_t) (v);
+ *(uint16_t *)(a + 1) = (uint16_t) (v >> 8);
+#endif
+ }
+ else
+ {
+#ifdef WORDS_BIGENDIAN
+ *(uint16_t *)a = (uint16_t)(v >> 8);
+ *(a + 2) = (uint8_t)v;
+#else
+ *(uint16_t *)a = (uint16_t)v;
+ *(a + 2) = (uint8_t)(v >> 16);
+#endif
+ }
+}
+
+static force_inline uint32_t
+over (uint32_t src,
+ uint32_t dest)
+{
+ uint32_t a = ~src >> 24;
+
+ UN8x4_MUL_UN8_ADD_UN8x4 (dest, a, src);
+
+ return dest;
+}
+
+static uint32_t
+in (uint32_t x,
+ uint8_t y)
+{
+ uint16_t a = y;
+
+ UN8x4_MUL_UN8 (x, a);
+
+ return x;
+}
+
+/*
+ * Naming convention:
+ *
+ * op_src_mask_dest
+ */
+static void
+fast_composite_over_x888_8_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *src, *src_line;
+ uint32_t *dst, *dst_line;
+ uint8_t *mask, *mask_line;
+ int src_stride, mask_stride, dst_stride;
+ uint8_t m;
+ uint32_t s, d;
+ int32_t w;
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ while (height--)
+ {
+ src = src_line;
+ src_line += src_stride;
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+
+ w = width;
+ while (w--)
+ {
+ m = *mask++;
+ if (m)
+ {
+ s = *src | 0xff000000;
+
+ if (m == 0xff)
+ {
+ *dst = s;
+ }
+ else
+ {
+ d = in (s, m);
+ *dst = over (d, *dst);
+ }
+ }
+ src++;
+ dst++;
+ }
+ }
+}
+
+static void
+fast_composite_in_n_8_8 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dest_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca;
+ uint8_t *dst_line, *dst;
+ uint8_t *mask_line, *mask, m;
+ int dst_stride, mask_stride;
+ int32_t w;
+ uint16_t t;
+
+ src = _pixman_image_get_solid (src_image, dest_image->bits.format);
+
+ srca = src >> 24;
+
+ PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+ if (srca == 0xff)
+ {
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w--)
+ {
+ m = *mask++;
+
+ if (m == 0)
+ *dst = 0;
+ else if (m != 0xff)
+ *dst = MUL_UN8 (m, *dst, t);
+
+ dst++;
+ }
+ }
+ }
+ else
+ {
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w--)
+ {
+ m = *mask++;
+ m = MUL_UN8 (m, srca, t);
+
+ if (m == 0)
+ *dst = 0;
+ else if (m != 0xff)
+ *dst = MUL_UN8 (m, *dst, t);
+
+ dst++;
+ }
+ }
+ }
+}
+
+static void
+fast_composite_in_8_8 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dest_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_line, *dst;
+ uint8_t *src_line, *src;
+ int dst_stride, src_stride;
+ int32_t w;
+ uint8_t s;
+ uint16_t t;
+
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w--)
+ {
+ s = *src++;
+
+ if (s == 0)
+ *dst = 0;
+ else if (s != 0xff)
+ *dst = MUL_UN8 (s, *dst, t);
+
+ dst++;
+ }
+ }
+}
+
+static void
+fast_composite_over_n_8_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca;
+ uint32_t *dst_line, *dst, d;
+ uint8_t *mask_line, *mask, m;
+ int dst_stride, mask_stride;
+ int32_t w;
+
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+
+ srca = src >> 24;
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w--)
+ {
+ m = *mask++;
+ if (m == 0xff)
+ {
+ if (srca == 0xff)
+ *dst = src;
+ else
+ *dst = over (src, *dst);
+ }
+ else if (m)
+ {
+ d = in (src, m);
+ *dst = over (d, *dst);
+ }
+ dst++;
+ }
+ }
+}
+
+static void
+fast_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca, s;
+ uint32_t *dst_line, *dst, d;
+ uint32_t *mask_line, *mask, ma;
+ int dst_stride, mask_stride;
+ int32_t w;
+
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+
+ srca = src >> 24;
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w--)
+ {
+ ma = *mask++;
+
+ if (ma)
+ {
+ d = *dst;
+ s = src;
+
+ UN8x4_MUL_UN8x4_ADD_UN8x4 (s, ma, d);
+
+ *dst = s;
+ }
+
+ dst++;
+ }
+ }
+}
+
+static void
+fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca, s;
+ uint32_t *dst_line, *dst, d;
+ uint32_t *mask_line, *mask, ma;
+ int dst_stride, mask_stride;
+ int32_t w;
+
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+
+ srca = src >> 24;
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w--)
+ {
+ ma = *mask++;
+ if (ma == 0xffffffff)
+ {
+ if (srca == 0xff)
+ *dst = src;
+ else
+ *dst = over (src, *dst);
+ }
+ else if (ma)
+ {
+ d = *dst;
+ s = src;
+
+ UN8x4_MUL_UN8x4 (s, ma);
+ UN8x4_MUL_UN8 (ma, srca);
+ ma = ~ma;
+ UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
+
+ *dst = d;
+ }
+
+ dst++;
+ }
+ }
+}
+
+static void
+fast_composite_over_n_8_0888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca;
+ uint8_t *dst_line, *dst;
+ uint32_t d;
+ uint8_t *mask_line, *mask, m;
+ int dst_stride, mask_stride;
+ int32_t w;
+
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+
+ srca = src >> 24;
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
+ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w--)
+ {
+ m = *mask++;
+ if (m == 0xff)
+ {
+ if (srca == 0xff)
+ {
+ d = src;
+ }
+ else
+ {
+ d = fetch_24 (dst);
+ d = over (src, d);
+ }
+ store_24 (dst, d);
+ }
+ else if (m)
+ {
+ d = over (in (src, m), fetch_24 (dst));
+ store_24 (dst, d);
+ }
+ dst += 3;
+ }
+ }
+}
+
+static void
+fast_composite_over_n_8_0565 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca;
+ uint16_t *dst_line, *dst;
+ uint32_t d;
+ uint8_t *mask_line, *mask, m;
+ int dst_stride, mask_stride;
+ int32_t w;
+
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+
+ srca = src >> 24;
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w--)
+ {
+ m = *mask++;
+ if (m == 0xff)
+ {
+ if (srca == 0xff)
+ {
+ d = src;
+ }
+ else
+ {
+ d = *dst;
+ d = over (src, CONVERT_0565_TO_0888 (d));
+ }
+ *dst = CONVERT_8888_TO_0565 (d);
+ }
+ else if (m)
+ {
+ d = *dst;
+ d = over (in (src, m), CONVERT_0565_TO_0888 (d));
+ *dst = CONVERT_8888_TO_0565 (d);
+ }
+ dst++;
+ }
+ }
+}
+
+static void
+fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca, s;
+ uint16_t src16;
+ uint16_t *dst_line, *dst;
+ uint32_t d;
+ uint32_t *mask_line, *mask, ma;
+ int dst_stride, mask_stride;
+ int32_t w;
+
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+
+ srca = src >> 24;
+ if (src == 0)
+ return;
+
+ src16 = CONVERT_8888_TO_0565 (src);
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w--)
+ {
+ ma = *mask++;
+ if (ma == 0xffffffff)
+ {
+ if (srca == 0xff)
+ {
+ *dst = src16;
+ }
+ else
+ {
+ d = *dst;
+ d = over (src, CONVERT_0565_TO_0888 (d));
+ *dst = CONVERT_8888_TO_0565 (d);
+ }
+ }
+ else if (ma)
+ {
+ d = *dst;
+ d = CONVERT_0565_TO_0888 (d);
+
+ s = src;
+
+ UN8x4_MUL_UN8x4 (s, ma);
+ UN8x4_MUL_UN8 (ma, srca);
+ ma = ~ma;
+ UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
+
+ *dst = CONVERT_8888_TO_0565 (d);
+ }
+ dst++;
+ }
+ }
+}
+
+static void
+fast_composite_over_8888_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src, s;
+ int dst_stride, src_stride;
+ uint8_t a;
+ int32_t w;
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w--)
+ {
+ s = *src++;
+ a = s >> 24;
+ if (a == 0xff)
+ *dst = s;
+ else if (s)
+ *dst = over (s, *dst);
+ dst++;
+ }
+ }
+}
+
+static void
+fast_composite_src_x888_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ int dst_stride, src_stride;
+ int32_t w;
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w--)
+ *dst++ = (*src++) | 0xff000000;
+ }
+}
+
+#if 0
+static void
+fast_composite_over_8888_0888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_line, *dst;
+ uint32_t d;
+ uint32_t *src_line, *src, s;
+ uint8_t a;
+ int dst_stride, src_stride;
+ int32_t w;
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w--)
+ {
+ s = *src++;
+ a = s >> 24;
+ if (a)
+ {
+ if (a == 0xff)
+ d = s;
+ else
+ d = over (s, fetch_24 (dst));
+
+ store_24 (dst, d);
+ }
+ dst += 3;
+ }
+ }
+}
+#endif
+
+static void
+fast_composite_over_8888_0565 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint16_t *dst_line, *dst;
+ uint32_t d;
+ uint32_t *src_line, *src, s;
+ uint8_t a;
+ int dst_stride, src_stride;
+ int32_t w;
+
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w--)
+ {
+ s = *src++;
+ a = s >> 24;
+ if (s)
+ {
+ if (a == 0xff)
+ {
+ d = s;
+ }
+ else
+ {
+ d = *dst;
+ d = over (s, CONVERT_0565_TO_0888 (d));
+ }
+ *dst = CONVERT_8888_TO_0565 (d);
+ }
+ dst++;
+ }
+ }
+}
+
+static void
+fast_composite_src_x888_0565 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint16_t *dst_line, *dst;
+ uint32_t *src_line, *src, s;
+ int dst_stride, src_stride;
+ int32_t w;
+
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w--)
+ {
+ s = *src++;
+ *dst = CONVERT_8888_TO_0565 (s);
+ dst++;
+ }
+ }
+}
+
+static void
+fast_composite_add_8_8 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_line, *dst;
+ uint8_t *src_line, *src;
+ int dst_stride, src_stride;
+ int32_t w;
+ uint8_t s, d;
+ uint16_t t;
+
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w--)
+ {
+ s = *src++;
+ if (s)
+ {
+ if (s != 0xff)
+ {
+ d = *dst;
+ t = d + s;
+ s = t | (0 - (t >> 8));
+ }
+ *dst = s;
+ }
+ dst++;
+ }
+ }
+}
+
+static void
+fast_composite_add_8888_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ int dst_stride, src_stride;
+ int32_t w;
+ uint32_t s, d;
+
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w--)
+ {
+ s = *src++;
+ if (s)
+ {
+ if (s != 0xffffffff)
+ {
+ d = *dst;
+ if (d)
+ UN8x4_ADD_UN8x4 (s, d);
+ }
+ *dst = s;
+ }
+ dst++;
+ }
+ }
+}
+
+static void
+fast_composite_add_n_8_8 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_line, *dst;
+ uint8_t *mask_line, *mask;
+ int dst_stride, mask_stride;
+ int32_t w;
+ uint32_t src;
+ uint8_t sa;
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+ sa = (src >> 24);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w--)
+ {
+ uint16_t tmp;
+ uint16_t a;
+ uint32_t m, d;
+ uint32_t r;
+
+ a = *mask++;
+ d = *dst;
+
+ m = MUL_UN8 (sa, a, tmp);
+ r = ADD_UN8 (m, d, tmp);
+
+ *dst++ = r;
+ }
+ }
+}
+
+#ifdef WORDS_BIGENDIAN
+#define CREATE_BITMASK(n) (0x80000000 >> (n))
+#define UPDATE_BITMASK(n) ((n) >> 1)
+#else
+#define CREATE_BITMASK(n) (1 << (n))
+#define UPDATE_BITMASK(n) ((n) << 1)
+#endif
+
+#define TEST_BIT(p, n) \
+ (*((p) + ((n) >> 5)) & CREATE_BITMASK ((n) & 31))
+#define SET_BIT(p, n) \
+ do { *((p) + ((n) >> 5)) |= CREATE_BITMASK ((n) & 31); } while (0);
+
+static void
+fast_composite_add_1000_1000 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ int dst_stride, src_stride;
+ int32_t w;
+
+ PIXMAN_IMAGE_GET_LINE (src_image, 0, src_y, uint32_t,
+ src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (dst_image, 0, dest_y, uint32_t,
+ dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w--)
+ {
+ /*
+ * TODO: improve performance by processing uint32_t data instead
+ * of individual bits
+ */
+ if (TEST_BIT (src, src_x + w))
+ SET_BIT (dst, dest_x + w);
+ }
+ }
+}
+
+static void
+fast_composite_over_n_1_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca;
+ uint32_t *dst, *dst_line;
+ uint32_t *mask, *mask_line;
+ int mask_stride, dst_stride;
+ uint32_t bitcache, bitmask;
+ int32_t w;
+
+ if (width <= 0)
+ return;
+
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+ srca = src >> 24;
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t,
+ dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
+ mask_stride, mask_line, 1);
+ mask_line += mask_x >> 5;
+
+ if (srca == 0xff)
+ {
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ bitcache = *mask++;
+ bitmask = CREATE_BITMASK (mask_x & 31);
+
+ while (w--)
+ {
+ if (bitmask == 0)
+ {
+ bitcache = *mask++;
+ bitmask = CREATE_BITMASK (0);
+ }
+ if (bitcache & bitmask)
+ *dst = src;
+ bitmask = UPDATE_BITMASK (bitmask);
+ dst++;
+ }
+ }
+ }
+ else
+ {
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ bitcache = *mask++;
+ bitmask = CREATE_BITMASK (mask_x & 31);
+
+ while (w--)
+ {
+ if (bitmask == 0)
+ {
+ bitcache = *mask++;
+ bitmask = CREATE_BITMASK (0);
+ }
+ if (bitcache & bitmask)
+ *dst = over (src, *dst);
+ bitmask = UPDATE_BITMASK (bitmask);
+ dst++;
+ }
+ }
+ }
+}
+
+static void
+fast_composite_over_n_1_0565 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca;
+ uint16_t *dst, *dst_line;
+ uint32_t *mask, *mask_line;
+ int mask_stride, dst_stride;
+ uint32_t bitcache, bitmask;
+ int32_t w;
+ uint32_t d;
+ uint16_t src565;
+
+ if (width <= 0)
+ return;
+
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+ srca = src >> 24;
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t,
+ dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
+ mask_stride, mask_line, 1);
+ mask_line += mask_x >> 5;
+
+ if (srca == 0xff)
+ {
+ src565 = CONVERT_8888_TO_0565 (src);
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ bitcache = *mask++;
+ bitmask = CREATE_BITMASK (mask_x & 31);
+
+ while (w--)
+ {
+ if (bitmask == 0)
+ {
+ bitcache = *mask++;
+ bitmask = CREATE_BITMASK (0);
+ }
+ if (bitcache & bitmask)
+ *dst = src565;
+ bitmask = UPDATE_BITMASK (bitmask);
+ dst++;
+ }
+ }
+ }
+ else
+ {
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ bitcache = *mask++;
+ bitmask = CREATE_BITMASK (mask_x & 31);
+
+ while (w--)
+ {
+ if (bitmask == 0)
+ {
+ bitcache = *mask++;
+ bitmask = CREATE_BITMASK (0);
+ }
+ if (bitcache & bitmask)
+ {
+ d = over (src, CONVERT_0565_TO_0888 (*dst));
+ *dst = CONVERT_8888_TO_0565 (d);
+ }
+ bitmask = UPDATE_BITMASK (bitmask);
+ dst++;
+ }
+ }
+ }
+}
+
+/*
+ * Simple bitblt
+ */
+
+static void
+fast_composite_solid_fill (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src;
+
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+
+ if (dst_image->bits.format == PIXMAN_a1)
+ {
+ src = src >> 31;
+ }
+ else if (dst_image->bits.format == PIXMAN_a8)
+ {
+ src = src >> 24;
+ }
+ else if (dst_image->bits.format == PIXMAN_r5g6b5 ||
+ dst_image->bits.format == PIXMAN_b5g6r5)
+ {
+ src = CONVERT_8888_TO_0565 (src);
+ }
+
+ pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
+ PIXMAN_FORMAT_BPP (dst_image->bits.format),
+ dest_x, dest_y,
+ width, height,
+ src);
+}
+
+static void
+fast_composite_src_memcpy (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ int bpp = PIXMAN_FORMAT_BPP (dst_image->bits.format) / 8;
+ uint32_t n_bytes = width * bpp;
+ int dst_stride, src_stride;
+ uint8_t *dst;
+ uint8_t *src;
+
+ src_stride = src_image->bits.rowstride * 4;
+ dst_stride = dst_image->bits.rowstride * 4;
+
+ src = (uint8_t *)src_image->bits.bits + src_y * src_stride + src_x * bpp;
+ dst = (uint8_t *)dst_image->bits.bits + dest_y * dst_stride + dest_x * bpp;
+
+ while (height--)
+ {
+ memcpy (dst, src, n_bytes);
+
+ dst += dst_stride;
+ src += src_stride;
+ }
+}
+
+FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER)
+FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE)
+FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD)
+FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL)
+FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER)
+FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE)
+FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD)
+FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL)
+FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER)
+FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE)
+FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD)
+FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL)
+FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL)
+FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER)
+FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE)
+FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD)
+FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL)
+
+/* Use more unrolling for src_0565_0565 because it is typically CPU bound */
+static force_inline void
+scaled_nearest_scanline_565_565_SRC (uint16_t * dst,
+ uint16_t * src,
+ int32_t w,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx)
+{
+ uint16_t tmp1, tmp2, tmp3, tmp4;
+ while ((w -= 4) >= 0)
+ {
+ tmp1 = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+ tmp2 = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+ tmp3 = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+ tmp4 = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+ *dst++ = tmp1;
+ *dst++ = tmp2;
+ *dst++ = tmp3;
+ *dst++ = tmp4;
+ }
+ if (w & 2)
+ {
+ tmp1 = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+ tmp2 = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+ *dst++ = tmp1;
+ *dst++ = tmp2;
+ }
+ if (w & 1)
+ *dst++ = src[pixman_fixed_to_int (vx)];
+}
+
+FAST_NEAREST_MAINLOOP (565_565_cover_SRC,
+ scaled_nearest_scanline_565_565_SRC,
+ uint16_t, uint16_t, COVER)
+FAST_NEAREST_MAINLOOP (565_565_none_SRC,
+ scaled_nearest_scanline_565_565_SRC,
+ uint16_t, uint16_t, NONE)
+FAST_NEAREST_MAINLOOP (565_565_pad_SRC,
+ scaled_nearest_scanline_565_565_SRC,
+ uint16_t, uint16_t, PAD)
+
+static force_inline uint32_t
+fetch_nearest (pixman_repeat_t src_repeat,
+ pixman_format_code_t format,
+ uint32_t *src, int x, int src_width)
+{
+ if (repeat (src_repeat, &x, src_width))
+ {
+ if (format == PIXMAN_x8r8g8b8)
+ return *(src + x) | 0xff000000;
+ else
+ return *(src + x);
+ }
+ else
+ {
+ return 0;
+ }
+}
+
+static force_inline void
+combine_over (uint32_t s, uint32_t *dst)
+{
+ if (s)
+ {
+ uint8_t ia = 0xff - (s >> 24);
+
+ if (ia)
+ UN8x4_MUL_UN8_ADD_UN8x4 (*dst, ia, s);
+ else
+ *dst = s;
+ }
+}
+
+static force_inline void
+combine_src (uint32_t s, uint32_t *dst)
+{
+ *dst = s;
+}
+
+static void
+fast_composite_scaled_nearest (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *dst_line;
+ uint32_t *src_line;
+ int dst_stride, src_stride;
+ int src_width, src_height;
+ pixman_repeat_t src_repeat;
+ pixman_fixed_t unit_x, unit_y;
+ pixman_format_code_t src_format;
+ pixman_vector_t v;
+ pixman_fixed_t vy;
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ /* pass in 0 instead of src_x and src_y because src_x and src_y need to be
+ * transformed from destination space to source space
+ */
+ PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, uint32_t, src_stride, src_line, 1);
+
+ /* reference point is the center of the pixel */
+ v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;
+ v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;
+ v.vector[2] = pixman_fixed_1;
+
+ if (!pixman_transform_point_3d (src_image->common.transform, &v))
+ return;
+
+ unit_x = src_image->common.transform->matrix[0][0];
+ unit_y = src_image->common.transform->matrix[1][1];
+
+ /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */
+ v.vector[0] -= pixman_fixed_e;
+ v.vector[1] -= pixman_fixed_e;
+
+ src_height = src_image->bits.height;
+ src_width = src_image->bits.width;
+ src_repeat = src_image->common.repeat;
+ src_format = src_image->bits.format;
+
+ vy = v.vector[1];
+ while (height--)
+ {
+ pixman_fixed_t vx = v.vector[0];
+ int y = pixman_fixed_to_int (vy);
+ uint32_t *dst = dst_line;
+
+ dst_line += dst_stride;
+
+ /* adjust the y location by a unit vector in the y direction
+ * this is equivalent to transforming y+1 of the destination point to source space */
+ vy += unit_y;
+
+ if (!repeat (src_repeat, &y, src_height))
+ {
+ if (op == PIXMAN_OP_SRC)
+ memset (dst, 0, sizeof (*dst) * width);
+ }
+ else
+ {
+ int w = width;
+
+ uint32_t *src = src_line + y * src_stride;
+
+ while (w >= 2)
+ {
+ uint32_t s1, s2;
+ int x1, x2;
+
+ x1 = pixman_fixed_to_int (vx);
+ vx += unit_x;
+
+ x2 = pixman_fixed_to_int (vx);
+ vx += unit_x;
+
+ w -= 2;
+
+ s1 = fetch_nearest (src_repeat, src_format, src, x1, src_width);
+ s2 = fetch_nearest (src_repeat, src_format, src, x2, src_width);
+
+ if (op == PIXMAN_OP_OVER)
+ {
+ combine_over (s1, dst++);
+ combine_over (s2, dst++);
+ }
+ else
+ {
+ combine_src (s1, dst++);
+ combine_src (s2, dst++);
+ }
+ }
+
+ while (w--)
+ {
+ uint32_t s;
+ int x;
+
+ x = pixman_fixed_to_int (vx);
+ vx += unit_x;
+
+ s = fetch_nearest (src_repeat, src_format, src, x, src_width);
+
+ if (op == PIXMAN_OP_OVER)
+ combine_over (s, dst++);
+ else
+ combine_src (s, dst++);
+ }
+ }
+ }
+}
+
+static const pixman_fast_path_t c_fast_paths[] =
+{
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, fast_composite_over_n_8_0565),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, fast_composite_over_n_8_0565),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, fast_composite_over_n_8_0888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, fast_composite_over_n_8_0888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, fast_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, fast_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, fast_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, fast_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, fast_composite_over_n_1_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, fast_composite_over_n_1_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, fast_composite_over_n_1_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, fast_composite_over_n_1_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5, fast_composite_over_n_1_0565),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5, fast_composite_over_n_1_0565),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, fast_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, fast_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, fast_composite_over_n_8888_0565_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, fast_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, fast_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, fast_composite_over_n_8888_0565_ca),
+ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, fast_composite_over_x888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, fast_composite_over_x888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, fast_composite_over_x888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, fast_composite_over_x888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, fast_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, fast_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, fast_composite_over_8888_0565),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, fast_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, fast_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8),
+ PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1000_1000),
+ PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8),
+ PIXMAN_STD_FAST_PATH (SRC, solid, null, a8r8g8b8, fast_composite_solid_fill),
+ PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, fast_composite_solid_fill),
+ PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, fast_composite_solid_fill),
+ PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, fast_composite_solid_fill),
+ PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, fast_composite_solid_fill),
+ PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, fast_composite_solid_fill),
+ PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, fast_composite_solid_fill),
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, fast_composite_src_x888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, fast_composite_src_x888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8x8, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8a8, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
+ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
+ PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8),
+ PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8),
+
+ SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, 8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, 8888_8888),
+
+ SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, 8888_8888),
+
+ SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, 8888_565),
+ SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, 8888_565),
+
+ SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565),
+
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, 8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, 8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, 8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, 8888_8888),
+
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565),
+
+#define NEAREST_FAST_PATH(op,s,d) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, SCALED_NEAREST_FLAGS, \
+ PIXMAN_null, 0, \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_nearest, \
+ }
+
+ NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8),
+ NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8),
+ NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8),
+ NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8),
+
+ NEAREST_FAST_PATH (SRC, x8r8g8b8, a8r8g8b8),
+ NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8),
+ NEAREST_FAST_PATH (SRC, x8b8g8r8, a8b8g8r8),
+ NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8),
+
+ NEAREST_FAST_PATH (OVER, x8r8g8b8, x8r8g8b8),
+ NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8),
+ NEAREST_FAST_PATH (OVER, x8b8g8r8, x8b8g8r8),
+ NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8),
+
+ NEAREST_FAST_PATH (OVER, x8r8g8b8, a8r8g8b8),
+ NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8),
+ NEAREST_FAST_PATH (OVER, x8b8g8r8, a8b8g8r8),
+ NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8),
+
+ { PIXMAN_OP_NONE },
+};
+
+#ifdef WORDS_BIGENDIAN
+#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (32 - (offs) - (n)))
+#else
+#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (offs))
+#endif
+
+static force_inline void
+pixman_fill1_line (uint32_t *dst, int offs, int width, int v)
+{
+ if (offs)
+ {
+ int leading_pixels = 32 - offs;
+ if (leading_pixels >= width)
+ {
+ if (v)
+ *dst |= A1_FILL_MASK (width, offs);
+ else
+ *dst &= ~A1_FILL_MASK (width, offs);
+ return;
+ }
+ else
+ {
+ if (v)
+ *dst++ |= A1_FILL_MASK (leading_pixels, offs);
+ else
+ *dst++ &= ~A1_FILL_MASK (leading_pixels, offs);
+ width -= leading_pixels;
+ }
+ }
+ while (width >= 32)
+ {
+ if (v)
+ *dst++ = 0xFFFFFFFF;
+ else
+ *dst++ = 0;
+ width -= 32;
+ }
+ if (width > 0)
+ {
+ if (v)
+ *dst |= A1_FILL_MASK (width, 0);
+ else
+ *dst &= ~A1_FILL_MASK (width, 0);
+ }
+}
+
+static void
+pixman_fill1 (uint32_t *bits,
+ int stride,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t xor)
+{
+ uint32_t *dst = bits + y * stride + (x >> 5);
+ int offs = x & 31;
+
+ if (xor & 1)
+ {
+ while (height--)
+ {
+ pixman_fill1_line (dst, offs, width, 1);
+ dst += stride;
+ }
+ }
+ else
+ {
+ while (height--)
+ {
+ pixman_fill1_line (dst, offs, width, 0);
+ dst += stride;
+ }
+ }
+}
+
+static void
+pixman_fill8 (uint32_t *bits,
+ int stride,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t xor)
+{
+ int byte_stride = stride * (int) sizeof (uint32_t);
+ uint8_t *dst = (uint8_t *) bits;
+ uint8_t v = xor & 0xff;
+ int i;
+
+ dst = dst + y * byte_stride + x;
+
+ while (height--)
+ {
+ for (i = 0; i < width; ++i)
+ dst[i] = v;
+
+ dst += byte_stride;
+ }
+}
+
+static void
+pixman_fill16 (uint32_t *bits,
+ int stride,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t xor)
+{
+ int short_stride =
+ (stride * (int)sizeof (uint32_t)) / (int)sizeof (uint16_t);
+ uint16_t *dst = (uint16_t *)bits;
+ uint16_t v = xor & 0xffff;
+ int i;
+
+ dst = dst + y * short_stride + x;
+
+ while (height--)
+ {
+ for (i = 0; i < width; ++i)
+ dst[i] = v;
+
+ dst += short_stride;
+ }
+}
+
+static void
+pixman_fill32 (uint32_t *bits,
+ int stride,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t xor)
+{
+ int i;
+
+ bits = bits + y * stride + x;
+
+ while (height--)
+ {
+ for (i = 0; i < width; ++i)
+ bits[i] = xor;
+
+ bits += stride;
+ }
+}
+
+static pixman_bool_t
+fast_path_fill (pixman_implementation_t *imp,
+ uint32_t * bits,
+ int stride,
+ int bpp,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t xor)
+{
+ switch (bpp)
+ {
+ case 1:
+ pixman_fill1 (bits, stride, x, y, width, height, xor);
+ break;
+
+ case 8:
+ pixman_fill8 (bits, stride, x, y, width, height, xor);
+ break;
+
+ case 16:
+ pixman_fill16 (bits, stride, x, y, width, height, xor);
+ break;
+
+ case 32:
+ pixman_fill32 (bits, stride, x, y, width, height, xor);
+ break;
+
+ default:
+ return _pixman_implementation_fill (
+ imp->delegate, bits, stride, bpp, x, y, width, height, xor);
+ break;
+ }
+
+ return TRUE;
+}
+
+pixman_implementation_t *
+_pixman_implementation_create_fast_path (void)
+{
+ pixman_implementation_t *general = _pixman_implementation_create_general ();
+ pixman_implementation_t *imp = _pixman_implementation_create (general, c_fast_paths);
+
+ imp->fill = fast_path_fill;
+
+ return imp;
+}
diff --git a/pixman/pixman/pixman-fast-path.h b/pixman/pixman/pixman-fast-path.h
index f7c986cab..b82903051 100644
--- a/pixman/pixman/pixman-fast-path.h
+++ b/pixman/pixman/pixman-fast-path.h
@@ -1,445 +1,449 @@
-/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
-/*
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007 Red Hat, Inc.
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of SuSE not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission. SuSE makes no representations about the
- * suitability of this software for any purpose. It is provided "as is"
- * without express or implied warranty.
- *
- * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
- * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
- * Author: Keith Packard, SuSE, Inc.
- */
-
-#ifndef PIXMAN_FAST_PATH_H__
-#define PIXMAN_FAST_PATH_H__
-
-#include "pixman-private.h"
-
-#define PIXMAN_REPEAT_COVER -1
-
-static force_inline pixman_bool_t
-repeat (pixman_repeat_t repeat, int *c, int size)
-{
- if (repeat == PIXMAN_REPEAT_NONE)
- {
- if (*c < 0 || *c >= size)
- return FALSE;
- }
- else if (repeat == PIXMAN_REPEAT_NORMAL)
- {
- while (*c >= size)
- *c -= size;
- while (*c < 0)
- *c += size;
- }
- else if (repeat == PIXMAN_REPEAT_PAD)
- {
- *c = CLIP (*c, 0, size - 1);
- }
- else /* REFLECT */
- {
- *c = MOD (*c, size * 2);
- if (*c >= size)
- *c = size * 2 - *c - 1;
- }
- return TRUE;
-}
-
-/*
- * For each scanline fetched from source image with PAD repeat:
- * - calculate how many pixels need to be padded on the left side
- * - calculate how many pixels need to be padded on the right side
- * - update width to only count pixels which are fetched from the image
- * All this information is returned via 'width', 'left_pad', 'right_pad'
- * arguments. The code is assuming that 'unit_x' is positive.
- *
- * Note: 64-bit math is used in order to avoid potential overflows, which
- * is probably excessive in many cases. This particular function
- * may need its own correctness test and performance tuning.
- */
-static force_inline void
-pad_repeat_get_scanline_bounds (int32_t source_image_width,
- pixman_fixed_t vx,
- pixman_fixed_t unit_x,
- int32_t * width,
- int32_t * left_pad,
- int32_t * right_pad)
-{
- int64_t max_vx = (int64_t) source_image_width << 16;
- int64_t tmp;
- if (vx < 0)
- {
- tmp = ((int64_t) unit_x - 1 - vx) / unit_x;
- if (tmp > *width)
- {
- *left_pad = *width;
- *width = 0;
- }
- else
- {
- *left_pad = (int32_t) tmp;
- *width -= (int32_t) tmp;
- }
- }
- else
- {
- *left_pad = 0;
- }
- tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad;
- if (tmp < 0)
- {
- *right_pad = *width;
- *width = 0;
- }
- else if (tmp >= *width)
- {
- *right_pad = 0;
- }
- else
- {
- *right_pad = *width - (int32_t) tmp;
- *width = (int32_t) tmp;
- }
-}
-
-/* A macroified version of specialized nearest scalers for some
- * common 8888 and 565 formats. It supports SRC and OVER ops.
- *
- * There are two repeat versions, one that handles repeat normal,
- * and one without repeat handling that only works if the src region
- * used is completely covered by the pre-repeated source samples.
- *
- * The loops are unrolled to process two pixels per iteration for better
- * performance on most CPU architectures (superscalar processors
- * can issue several operations simultaneously, other processors can hide
- * instructions latencies by pipelining operations). Unrolling more
- * does not make much sense because the compiler will start running out
- * of spare registers soon.
- */
-
-#define GET_8888_ALPHA(s) ((s) >> 24)
- /* This is not actually used since we don't have an OVER with
- 565 source, but it is needed to build. */
-#define GET_0565_ALPHA(s) 0xff
-
-#define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT, \
- src_type_t, dst_type_t, OP, repeat_mode) \
-static force_inline void \
-scanline_func_name (dst_type_t *dst, \
- src_type_t *src, \
- int32_t w, \
- pixman_fixed_t vx, \
- pixman_fixed_t unit_x, \
- pixman_fixed_t max_vx) \
-{ \
- uint32_t d; \
- src_type_t s1, s2; \
- uint8_t a1, a2; \
- int x1, x2; \
- \
- if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER) \
- abort(); \
- \
- while ((w -= 2) >= 0) \
- { \
- x1 = vx >> 16; \
- vx += unit_x; \
- if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
- { \
- /* This works because we know that unit_x is positive */ \
- while (vx >= max_vx) \
- vx -= max_vx; \
- } \
- s1 = src[x1]; \
- \
- x2 = vx >> 16; \
- vx += unit_x; \
- if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
- { \
- /* This works because we know that unit_x is positive */ \
- while (vx >= max_vx) \
- vx -= max_vx; \
- } \
- s2 = src[x2]; \
- \
- if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \
- { \
- a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \
- a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2); \
- \
- if (a1 == 0xff) \
- { \
- *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \
- } \
- else if (s1) \
- { \
- d = CONVERT_ ## DST_FORMAT ## _TO_8888 (*dst); \
- s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \
- a1 ^= 0xff; \
- UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \
- *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \
- } \
- dst++; \
- \
- if (a2 == 0xff) \
- { \
- *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \
- } \
- else if (s2) \
- { \
- d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \
- s2 = CONVERT_## SRC_FORMAT ## _TO_8888 (s2); \
- a2 ^= 0xff; \
- UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2); \
- *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \
- } \
- dst++; \
- } \
- else /* PIXMAN_OP_SRC */ \
- { \
- *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \
- *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \
- } \
- } \
- \
- if (w & 1) \
- { \
- x1 = vx >> 16; \
- s1 = src[x1]; \
- \
- if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \
- { \
- a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \
- \
- if (a1 == 0xff) \
- { \
- *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \
- } \
- else if (s1) \
- { \
- d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \
- s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \
- a1 ^= 0xff; \
- UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \
- *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \
- } \
- dst++; \
- } \
- else /* PIXMAN_OP_SRC */ \
- { \
- *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \
- } \
- } \
-}
-
-#define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func, src_type_t, dst_type_t, \
- repeat_mode) \
-static void \
-fast_composite_scaled_nearest_ ## scale_func_name (pixman_implementation_t *imp, \
- pixman_op_t op, \
- pixman_image_t * src_image, \
- pixman_image_t * mask_image, \
- pixman_image_t * dst_image, \
- int32_t src_x, \
- int32_t src_y, \
- int32_t mask_x, \
- int32_t mask_y, \
- int32_t dst_x, \
- int32_t dst_y, \
- int32_t width, \
- int32_t height) \
-{ \
- dst_type_t *dst_line; \
- src_type_t *src_first_line; \
- int y; \
- pixman_fixed_t max_vx = max_vx; /* suppress uninitialized variable warning */ \
- pixman_fixed_t max_vy; \
- pixman_vector_t v; \
- pixman_fixed_t vx, vy; \
- pixman_fixed_t unit_x, unit_y; \
- int32_t left_pad, right_pad; \
- \
- src_type_t *src; \
- dst_type_t *dst; \
- int src_stride, dst_stride; \
- \
- PIXMAN_IMAGE_GET_LINE (dst_image, dst_x, dst_y, dst_type_t, dst_stride, dst_line, 1); \
- /* pass in 0 instead of src_x and src_y because src_x and src_y need to be \
- * transformed from destination space to source space */ \
- PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1); \
- \
- /* reference point is the center of the pixel */ \
- v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2; \
- v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2; \
- v.vector[2] = pixman_fixed_1; \
- \
- if (!pixman_transform_point_3d (src_image->common.transform, &v)) \
- return; \
- \
- unit_x = src_image->common.transform->matrix[0][0]; \
- unit_y = src_image->common.transform->matrix[1][1]; \
- \
- /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */ \
- v.vector[0] -= pixman_fixed_e; \
- v.vector[1] -= pixman_fixed_e; \
- \
- vx = v.vector[0]; \
- vy = v.vector[1]; \
- \
- if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
- { \
- /* Clamp repeating positions inside the actual samples */ \
- max_vx = src_image->bits.width << 16; \
- max_vy = src_image->bits.height << 16; \
- \
- repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx); \
- repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \
- } \
- \
- if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD || \
- PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \
- { \
- pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x, \
- &width, &left_pad, &right_pad); \
- vx += left_pad * unit_x; \
- } \
- \
- while (--height >= 0) \
- { \
- dst = dst_line; \
- dst_line += dst_stride; \
- \
- y = vy >> 16; \
- vy += unit_y; \
- if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
- repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \
- if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD) \
- { \
- repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height); \
- src = src_first_line + src_stride * y; \
- if (left_pad > 0) \
- { \
- scanline_func (dst, src, left_pad, 0, 0, 0); \
- } \
- if (width > 0) \
- { \
- scanline_func (dst + left_pad, src, width, vx, unit_x, 0); \
- } \
- if (right_pad > 0) \
- { \
- scanline_func (dst + left_pad + width, src + src_image->bits.width - 1, \
- right_pad, 0, 0, 0); \
- } \
- } \
- else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \
- { \
- static src_type_t zero = 0; \
- if (y < 0 || y >= src_image->bits.height) \
- { \
- scanline_func (dst, &zero, left_pad + width + right_pad, 0, 0, 0); \
- continue; \
- } \
- src = src_first_line + src_stride * y; \
- if (left_pad > 0) \
- { \
- scanline_func (dst, &zero, left_pad, 0, 0, 0); \
- } \
- if (width > 0) \
- { \
- scanline_func (dst + left_pad, src, width, vx, unit_x, 0); \
- } \
- if (right_pad > 0) \
- { \
- scanline_func (dst + left_pad + width, &zero, right_pad, 0, 0, 0); \
- } \
- } \
- else \
- { \
- src = src_first_line + src_stride * y; \
- scanline_func (dst, src, width, vx, unit_x, max_vx); \
- } \
- } \
-}
-
-#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT, \
- src_type_t, dst_type_t, OP, repeat_mode) \
- FAST_NEAREST_SCANLINE(scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \
- SRC_FORMAT, DST_FORMAT, src_type_t, dst_type_t, \
- OP, repeat_mode) \
- FAST_NEAREST_MAINLOOP(scale_func_name##_##OP, \
- scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \
- src_type_t, dst_type_t, repeat_mode) \
- \
- extern int no_such_variable
-
-
-#define SCALED_NEAREST_FLAGS \
- (FAST_PATH_SCALE_TRANSFORM | \
- FAST_PATH_NO_ALPHA_MAP | \
- FAST_PATH_NEAREST_FILTER | \
- FAST_PATH_NO_ACCESSORS | \
- FAST_PATH_NARROW_FORMAT)
-
-#define SIMPLE_NEAREST_FAST_PATH_NORMAL(op,s,d,func) \
- { PIXMAN_OP_ ## op, \
- PIXMAN_ ## s, \
- (SCALED_NEAREST_FLAGS | \
- FAST_PATH_NORMAL_REPEAT | \
- FAST_PATH_X_UNIT_POSITIVE), \
- PIXMAN_null, 0, \
- PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
- fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op, \
- }
-
-#define SIMPLE_NEAREST_FAST_PATH_PAD(op,s,d,func) \
- { PIXMAN_OP_ ## op, \
- PIXMAN_ ## s, \
- (SCALED_NEAREST_FLAGS | \
- FAST_PATH_PAD_REPEAT | \
- FAST_PATH_X_UNIT_POSITIVE), \
- PIXMAN_null, 0, \
- PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
- fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op, \
- }
-
-#define SIMPLE_NEAREST_FAST_PATH_NONE(op,s,d,func) \
- { PIXMAN_OP_ ## op, \
- PIXMAN_ ## s, \
- (SCALED_NEAREST_FLAGS | \
- FAST_PATH_NONE_REPEAT | \
- FAST_PATH_X_UNIT_POSITIVE), \
- PIXMAN_null, 0, \
- PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
- fast_composite_scaled_nearest_ ## func ## _none ## _ ## op, \
- }
-
-#define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func) \
- { PIXMAN_OP_ ## op, \
- PIXMAN_ ## s, \
- SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \
- PIXMAN_null, 0, \
- PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
- fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op, \
- }
-
-/* Prefer the use of 'cover' variant, because it is faster */
-#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func) \
- SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func), \
- SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func), \
- SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func), \
- SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
-
-#endif
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission. SuSE makes no representations about the
+ * suitability of this software for any purpose. It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author: Keith Packard, SuSE, Inc.
+ */
+
+#ifndef PIXMAN_FAST_PATH_H__
+#define PIXMAN_FAST_PATH_H__
+
+#include "pixman-private.h"
+
+#define PIXMAN_REPEAT_COVER -1
+
+static force_inline pixman_bool_t
+repeat (pixman_repeat_t repeat, int *c, int size)
+{
+ if (repeat == PIXMAN_REPEAT_NONE)
+ {
+ if (*c < 0 || *c >= size)
+ return FALSE;
+ }
+ else if (repeat == PIXMAN_REPEAT_NORMAL)
+ {
+ while (*c >= size)
+ *c -= size;
+ while (*c < 0)
+ *c += size;
+ }
+ else if (repeat == PIXMAN_REPEAT_PAD)
+ {
+ *c = CLIP (*c, 0, size - 1);
+ }
+ else /* REFLECT */
+ {
+ *c = MOD (*c, size * 2);
+ if (*c >= size)
+ *c = size * 2 - *c - 1;
+ }
+ return TRUE;
+}
+
+/*
+ * For each scanline fetched from source image with PAD repeat:
+ * - calculate how many pixels need to be padded on the left side
+ * - calculate how many pixels need to be padded on the right side
+ * - update width to only count pixels which are fetched from the image
+ * All this information is returned via 'width', 'left_pad', 'right_pad'
+ * arguments. The code is assuming that 'unit_x' is positive.
+ *
+ * Note: 64-bit math is used in order to avoid potential overflows, which
+ * is probably excessive in many cases. This particular function
+ * may need its own correctness test and performance tuning.
+ */
+static force_inline void
+pad_repeat_get_scanline_bounds (int32_t source_image_width,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ int32_t * width,
+ int32_t * left_pad,
+ int32_t * right_pad)
+{
+ int64_t max_vx = (int64_t) source_image_width << 16;
+ int64_t tmp;
+ if (vx < 0)
+ {
+ tmp = ((int64_t) unit_x - 1 - vx) / unit_x;
+ if (tmp > *width)
+ {
+ *left_pad = *width;
+ *width = 0;
+ }
+ else
+ {
+ *left_pad = (int32_t) tmp;
+ *width -= (int32_t) tmp;
+ }
+ }
+ else
+ {
+ *left_pad = 0;
+ }
+ tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad;
+ if (tmp < 0)
+ {
+ *right_pad = *width;
+ *width = 0;
+ }
+ else if (tmp >= *width)
+ {
+ *right_pad = 0;
+ }
+ else
+ {
+ *right_pad = *width - (int32_t) tmp;
+ *width = (int32_t) tmp;
+ }
+}
+
+/* A macroified version of specialized nearest scalers for some
+ * common 8888 and 565 formats. It supports SRC and OVER ops.
+ *
+ * There are two repeat versions, one that handles repeat normal,
+ * and one without repeat handling that only works if the src region
+ * used is completely covered by the pre-repeated source samples.
+ *
+ * The loops are unrolled to process two pixels per iteration for better
+ * performance on most CPU architectures (superscalar processors
+ * can issue several operations simultaneously, other processors can hide
+ * instructions latencies by pipelining operations). Unrolling more
+ * does not make much sense because the compiler will start running out
+ * of spare registers soon.
+ */
+
+#define GET_8888_ALPHA(s) ((s) >> 24)
+ /* This is not actually used since we don't have an OVER with
+ 565 source, but it is needed to build. */
+#define GET_0565_ALPHA(s) 0xff
+
+#define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT, \
+ src_type_t, dst_type_t, OP, repeat_mode) \
+static force_inline void \
+scanline_func_name (dst_type_t *dst, \
+ src_type_t *src, \
+ int32_t w, \
+ pixman_fixed_t vx, \
+ pixman_fixed_t unit_x, \
+ pixman_fixed_t max_vx) \
+{ \
+ uint32_t d; \
+ src_type_t s1, s2; \
+ uint8_t a1, a2; \
+ int x1, x2; \
+ \
+ if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER) \
+ abort(); \
+ \
+ while ((w -= 2) >= 0) \
+ { \
+ x1 = vx >> 16; \
+ vx += unit_x; \
+ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
+ { \
+ /* This works because we know that unit_x is positive */ \
+ while (vx >= max_vx) \
+ vx -= max_vx; \
+ } \
+ s1 = src[x1]; \
+ \
+ x2 = vx >> 16; \
+ vx += unit_x; \
+ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
+ { \
+ /* This works because we know that unit_x is positive */ \
+ while (vx >= max_vx) \
+ vx -= max_vx; \
+ } \
+ s2 = src[x2]; \
+ \
+ if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \
+ { \
+ a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \
+ a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2); \
+ \
+ if (a1 == 0xff) \
+ { \
+ *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \
+ } \
+ else if (s1) \
+ { \
+ d = CONVERT_ ## DST_FORMAT ## _TO_8888 (*dst); \
+ s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \
+ a1 ^= 0xff; \
+ UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \
+ *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \
+ } \
+ dst++; \
+ \
+ if (a2 == 0xff) \
+ { \
+ *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \
+ } \
+ else if (s2) \
+ { \
+ d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \
+ s2 = CONVERT_## SRC_FORMAT ## _TO_8888 (s2); \
+ a2 ^= 0xff; \
+ UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2); \
+ *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \
+ } \
+ dst++; \
+ } \
+ else /* PIXMAN_OP_SRC */ \
+ { \
+ *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \
+ *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \
+ } \
+ } \
+ \
+ if (w & 1) \
+ { \
+ x1 = vx >> 16; \
+ s1 = src[x1]; \
+ \
+ if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \
+ { \
+ a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \
+ \
+ if (a1 == 0xff) \
+ { \
+ *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \
+ } \
+ else if (s1) \
+ { \
+ d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \
+ s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \
+ a1 ^= 0xff; \
+ UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \
+ *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \
+ } \
+ dst++; \
+ } \
+ else /* PIXMAN_OP_SRC */ \
+ { \
+ *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \
+ } \
+ } \
+}
+
+#define FAST_NEAREST_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, dst_type_t, \
+ repeat_mode) \
+static void \
+fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp, \
+ pixman_op_t op, \
+ pixman_image_t * src_image, \
+ pixman_image_t * mask_image, \
+ pixman_image_t * dst_image, \
+ int32_t src_x, \
+ int32_t src_y, \
+ int32_t mask_x, \
+ int32_t mask_y, \
+ int32_t dst_x, \
+ int32_t dst_y, \
+ int32_t width, \
+ int32_t height) \
+{ \
+ dst_type_t *dst_line; \
+ src_type_t *src_first_line; \
+ int y; \
+ pixman_fixed_t max_vx = max_vx; /* suppress uninitialized variable warning */ \
+ pixman_fixed_t max_vy; \
+ pixman_vector_t v; \
+ pixman_fixed_t vx, vy; \
+ pixman_fixed_t unit_x, unit_y; \
+ int32_t left_pad, right_pad; \
+ \
+ src_type_t *src; \
+ dst_type_t *dst; \
+ int src_stride, dst_stride; \
+ \
+ PIXMAN_IMAGE_GET_LINE (dst_image, dst_x, dst_y, dst_type_t, dst_stride, dst_line, 1); \
+ /* pass in 0 instead of src_x and src_y because src_x and src_y need to be \
+ * transformed from destination space to source space */ \
+ PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1); \
+ \
+ /* reference point is the center of the pixel */ \
+ v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2; \
+ v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2; \
+ v.vector[2] = pixman_fixed_1; \
+ \
+ if (!pixman_transform_point_3d (src_image->common.transform, &v)) \
+ return; \
+ \
+ unit_x = src_image->common.transform->matrix[0][0]; \
+ unit_y = src_image->common.transform->matrix[1][1]; \
+ \
+ /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */ \
+ v.vector[0] -= pixman_fixed_e; \
+ v.vector[1] -= pixman_fixed_e; \
+ \
+ vx = v.vector[0]; \
+ vy = v.vector[1]; \
+ \
+ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
+ { \
+ /* Clamp repeating positions inside the actual samples */ \
+ max_vx = src_image->bits.width << 16; \
+ max_vy = src_image->bits.height << 16; \
+ \
+ repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx); \
+ repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \
+ } \
+ \
+ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD || \
+ PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \
+ { \
+ pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x, \
+ &width, &left_pad, &right_pad); \
+ vx += left_pad * unit_x; \
+ } \
+ \
+ while (--height >= 0) \
+ { \
+ dst = dst_line; \
+ dst_line += dst_stride; \
+ \
+ y = vy >> 16; \
+ vy += unit_y; \
+ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
+ repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \
+ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD) \
+ { \
+ repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height); \
+ src = src_first_line + src_stride * y; \
+ if (left_pad > 0) \
+ { \
+ scanline_func (dst, src, left_pad, 0, 0, 0); \
+ } \
+ if (width > 0) \
+ { \
+ scanline_func (dst + left_pad, src, width, vx, unit_x, 0); \
+ } \
+ if (right_pad > 0) \
+ { \
+ scanline_func (dst + left_pad + width, src + src_image->bits.width - 1, \
+ right_pad, 0, 0, 0); \
+ } \
+ } \
+ else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \
+ { \
+ static src_type_t zero[1] = { 0 }; \
+ if (y < 0 || y >= src_image->bits.height) \
+ { \
+ scanline_func (dst, zero, left_pad + width + right_pad, 0, 0, 0); \
+ continue; \
+ } \
+ src = src_first_line + src_stride * y; \
+ if (left_pad > 0) \
+ { \
+ scanline_func (dst, zero, left_pad, 0, 0, 0); \
+ } \
+ if (width > 0) \
+ { \
+ scanline_func (dst + left_pad, src, width, vx, unit_x, 0); \
+ } \
+ if (right_pad > 0) \
+ { \
+ scanline_func (dst + left_pad + width, zero, right_pad, 0, 0, 0); \
+ } \
+ } \
+ else \
+ { \
+ src = src_first_line + src_stride * y; \
+ scanline_func (dst, src, width, vx, unit_x, max_vx); \
+ } \
+ } \
+}
+
+/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */
+#define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func, src_type_t, dst_type_t, \
+ repeat_mode) \
+ FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, dst_type_t, \
+ repeat_mode) \
+
+#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT, \
+ src_type_t, dst_type_t, OP, repeat_mode) \
+ FAST_NEAREST_SCANLINE(scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \
+ SRC_FORMAT, DST_FORMAT, src_type_t, dst_type_t, \
+ OP, repeat_mode) \
+ FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name ## _ ## OP, \
+ scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \
+ src_type_t, dst_type_t, repeat_mode)
+
+
+#define SCALED_NEAREST_FLAGS \
+ (FAST_PATH_SCALE_TRANSFORM | \
+ FAST_PATH_NO_ALPHA_MAP | \
+ FAST_PATH_NEAREST_FILTER | \
+ FAST_PATH_NO_ACCESSORS | \
+ FAST_PATH_NARROW_FORMAT)
+
+#define SIMPLE_NEAREST_FAST_PATH_NORMAL(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ (SCALED_NEAREST_FLAGS | \
+ FAST_PATH_NORMAL_REPEAT | \
+ FAST_PATH_X_UNIT_POSITIVE), \
+ PIXMAN_null, 0, \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op, \
+ }
+
+#define SIMPLE_NEAREST_FAST_PATH_PAD(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ (SCALED_NEAREST_FLAGS | \
+ FAST_PATH_PAD_REPEAT | \
+ FAST_PATH_X_UNIT_POSITIVE), \
+ PIXMAN_null, 0, \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op, \
+ }
+
+#define SIMPLE_NEAREST_FAST_PATH_NONE(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ (SCALED_NEAREST_FLAGS | \
+ FAST_PATH_NONE_REPEAT | \
+ FAST_PATH_X_UNIT_POSITIVE), \
+ PIXMAN_null, 0, \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_nearest_ ## func ## _none ## _ ## op, \
+ }
+
+#define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \
+ PIXMAN_null, 0, \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op, \
+ }
+
+/* Prefer the use of 'cover' variant, because it is faster */
+#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func) \
+ SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func), \
+ SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func), \
+ SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func), \
+ SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
+
+#endif
diff --git a/pixman/pixman/pixman-matrix.c b/pixman/pixman/pixman-matrix.c
index abdfa0525..f2f67ab41 100644
--- a/pixman/pixman/pixman-matrix.c
+++ b/pixman/pixman/pixman-matrix.c
@@ -425,7 +425,8 @@ pixman_transform_is_inverse (const struct pixman_transform *a,
{
struct pixman_transform t;
- pixman_transform_multiply (&t, a, b);
+ if (!pixman_transform_multiply (&t, a, b))
+ return FALSE;
return pixman_transform_is_identity (&t);
}
diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c
index b19043bbd..94ba54cbf 100644
--- a/pixman/pixman/pixman-sse2.c
+++ b/pixman/pixman/pixman-sse2.c
@@ -1,6031 +1,6031 @@
-/*
- * Copyright © 2008 Rodrigo Kumpera
- * Copyright © 2008 André Tupinambá
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Red Hat not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission. Red Hat makes no representations about the
- * suitability of this software for any purpose. It is provided "as is"
- * without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- *
- * Author: Rodrigo Kumpera (kumpera@gmail.com)
- * André Tupinambá (andrelrt@gmail.com)
- *
- * Based on work by Owen Taylor and Søren Sandmann
- */
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#include <mmintrin.h>
-#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
-#include <emmintrin.h> /* for SSE2 intrinsics */
-#include "pixman-private.h"
-#include "pixman-combine32.h"
-#include "pixman-fast-path.h"
-
-#if defined(_MSC_VER) && defined(_M_AMD64)
-/* Windows 64 doesn't allow MMX to be used, so
- * the pixman-x64-mmx-emulation.h file contains
- * implementations of those MMX intrinsics that
- * are used in the SSE2 implementation.
- */
-# include "pixman-x64-mmx-emulation.h"
-#endif
-
-#ifdef USE_SSE2
-
-/* --------------------------------------------------------------------
- * Locals
- */
-
-static __m64 mask_x0080;
-static __m64 mask_x00ff;
-static __m64 mask_x0101;
-static __m64 mask_x_alpha;
-
-static __m64 mask_x565_rgb;
-static __m64 mask_x565_unpack;
-
-static __m128i mask_0080;
-static __m128i mask_00ff;
-static __m128i mask_0101;
-static __m128i mask_ffff;
-static __m128i mask_ff000000;
-static __m128i mask_alpha;
-
-static __m128i mask_565_r;
-static __m128i mask_565_g1, mask_565_g2;
-static __m128i mask_565_b;
-static __m128i mask_red;
-static __m128i mask_green;
-static __m128i mask_blue;
-
-static __m128i mask_565_fix_rb;
-static __m128i mask_565_fix_g;
-
-/* ----------------------------------------------------------------------
- * SSE2 Inlines
- */
-static force_inline __m128i
-unpack_32_1x128 (uint32_t data)
-{
- return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
-}
-
-static force_inline void
-unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
-{
- *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
- *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
-}
-
-static force_inline __m128i
-unpack_565_to_8888 (__m128i lo)
-{
- __m128i r, g, b, rb, t;
-
- r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
- g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
- b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
-
- rb = _mm_or_si128 (r, b);
- t = _mm_and_si128 (rb, mask_565_fix_rb);
- t = _mm_srli_epi32 (t, 5);
- rb = _mm_or_si128 (rb, t);
-
- t = _mm_and_si128 (g, mask_565_fix_g);
- t = _mm_srli_epi32 (t, 6);
- g = _mm_or_si128 (g, t);
-
- return _mm_or_si128 (rb, g);
-}
-
-static force_inline void
-unpack_565_128_4x128 (__m128i data,
- __m128i* data0,
- __m128i* data1,
- __m128i* data2,
- __m128i* data3)
-{
- __m128i lo, hi;
-
- lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
- hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
-
- lo = unpack_565_to_8888 (lo);
- hi = unpack_565_to_8888 (hi);
-
- unpack_128_2x128 (lo, data0, data1);
- unpack_128_2x128 (hi, data2, data3);
-}
-
-static force_inline uint16_t
-pack_565_32_16 (uint32_t pixel)
-{
- return (uint16_t) (((pixel >> 8) & 0xf800) |
- ((pixel >> 5) & 0x07e0) |
- ((pixel >> 3) & 0x001f));
-}
-
-static force_inline __m128i
-pack_2x128_128 (__m128i lo, __m128i hi)
-{
- return _mm_packus_epi16 (lo, hi);
-}
-
-static force_inline __m128i
-pack_565_2x128_128 (__m128i lo, __m128i hi)
-{
- __m128i data;
- __m128i r, g1, g2, b;
-
- data = pack_2x128_128 (lo, hi);
-
- r = _mm_and_si128 (data, mask_565_r);
- g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
- g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
- b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
-
- return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
-}
-
-static force_inline __m128i
-pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
-{
- return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
- pack_565_2x128_128 (*xmm2, *xmm3));
-}
-
-static force_inline int
-is_opaque (__m128i x)
-{
- __m128i ffs = _mm_cmpeq_epi8 (x, x);
-
- return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
-}
-
-static force_inline int
-is_zero (__m128i x)
-{
- return _mm_movemask_epi8 (
- _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
-}
-
-static force_inline int
-is_transparent (__m128i x)
-{
- return (_mm_movemask_epi8 (
- _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
-}
-
-static force_inline __m128i
-expand_pixel_32_1x128 (uint32_t data)
-{
- return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
-}
-
-static force_inline __m128i
-expand_alpha_1x128 (__m128i data)
-{
- return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
- _MM_SHUFFLE (3, 3, 3, 3)),
- _MM_SHUFFLE (3, 3, 3, 3));
-}
-
-static force_inline void
-expand_alpha_2x128 (__m128i data_lo,
- __m128i data_hi,
- __m128i* alpha_lo,
- __m128i* alpha_hi)
-{
- __m128i lo, hi;
-
- lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
- hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
-
- *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
- *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
-}
-
-static force_inline void
-expand_alpha_rev_2x128 (__m128i data_lo,
- __m128i data_hi,
- __m128i* alpha_lo,
- __m128i* alpha_hi)
-{
- __m128i lo, hi;
-
- lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
- hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
- *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
- *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
-}
-
-static force_inline void
-pix_multiply_2x128 (__m128i* data_lo,
- __m128i* data_hi,
- __m128i* alpha_lo,
- __m128i* alpha_hi,
- __m128i* ret_lo,
- __m128i* ret_hi)
-{
- __m128i lo, hi;
-
- lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
- hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
- lo = _mm_adds_epu16 (lo, mask_0080);
- hi = _mm_adds_epu16 (hi, mask_0080);
- *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
- *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
-}
-
-static force_inline void
-pix_add_multiply_2x128 (__m128i* src_lo,
- __m128i* src_hi,
- __m128i* alpha_dst_lo,
- __m128i* alpha_dst_hi,
- __m128i* dst_lo,
- __m128i* dst_hi,
- __m128i* alpha_src_lo,
- __m128i* alpha_src_hi,
- __m128i* ret_lo,
- __m128i* ret_hi)
-{
- __m128i t1_lo, t1_hi;
- __m128i t2_lo, t2_hi;
-
- pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
- pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
-
- *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
- *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
-}
-
-static force_inline void
-negate_2x128 (__m128i data_lo,
- __m128i data_hi,
- __m128i* neg_lo,
- __m128i* neg_hi)
-{
- *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
- *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
-}
-
-static force_inline void
-invert_colors_2x128 (__m128i data_lo,
- __m128i data_hi,
- __m128i* inv_lo,
- __m128i* inv_hi)
-{
- __m128i lo, hi;
-
- lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
- hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
- *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
- *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
-}
-
-static force_inline void
-over_2x128 (__m128i* src_lo,
- __m128i* src_hi,
- __m128i* alpha_lo,
- __m128i* alpha_hi,
- __m128i* dst_lo,
- __m128i* dst_hi)
-{
- __m128i t1, t2;
-
- negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
-
- pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
-
- *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
- *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
-}
-
-static force_inline void
-over_rev_non_pre_2x128 (__m128i src_lo,
- __m128i src_hi,
- __m128i* dst_lo,
- __m128i* dst_hi)
-{
- __m128i lo, hi;
- __m128i alpha_lo, alpha_hi;
-
- expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
-
- lo = _mm_or_si128 (alpha_lo, mask_alpha);
- hi = _mm_or_si128 (alpha_hi, mask_alpha);
-
- invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
-
- pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
-
- over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
-}
-
-static force_inline void
-in_over_2x128 (__m128i* src_lo,
- __m128i* src_hi,
- __m128i* alpha_lo,
- __m128i* alpha_hi,
- __m128i* mask_lo,
- __m128i* mask_hi,
- __m128i* dst_lo,
- __m128i* dst_hi)
-{
- __m128i s_lo, s_hi;
- __m128i a_lo, a_hi;
-
- pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
- pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
-
- over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
-}
-
-/* load 4 pixels from a 16-byte boundary aligned address */
-static force_inline __m128i
-load_128_aligned (__m128i* src)
-{
- return _mm_load_si128 (src);
-}
-
-/* load 4 pixels from a unaligned address */
-static force_inline __m128i
-load_128_unaligned (const __m128i* src)
-{
- return _mm_loadu_si128 (src);
-}
-
-/* save 4 pixels using Write Combining memory on a 16-byte
- * boundary aligned address
- */
-static force_inline void
-save_128_write_combining (__m128i* dst,
- __m128i data)
-{
- _mm_stream_si128 (dst, data);
-}
-
-/* save 4 pixels on a 16-byte boundary aligned address */
-static force_inline void
-save_128_aligned (__m128i* dst,
- __m128i data)
-{
- _mm_store_si128 (dst, data);
-}
-
-/* save 4 pixels on a unaligned address */
-static force_inline void
-save_128_unaligned (__m128i* dst,
- __m128i data)
-{
- _mm_storeu_si128 (dst, data);
-}
-
-/* ------------------------------------------------------------------
- * MMX inlines
- */
-
-static force_inline __m64
-load_32_1x64 (uint32_t data)
-{
- return _mm_cvtsi32_si64 (data);
-}
-
-static force_inline __m64
-unpack_32_1x64 (uint32_t data)
-{
- return _mm_unpacklo_pi8 (load_32_1x64 (data), _mm_setzero_si64 ());
-}
-
-static force_inline __m64
-expand_alpha_1x64 (__m64 data)
-{
- return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
-}
-
-static force_inline __m64
-expand_alpha_rev_1x64 (__m64 data)
-{
- return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
-}
-
-static force_inline __m64
-expand_pixel_8_1x64 (uint8_t data)
-{
- return _mm_shuffle_pi16 (
- unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
-}
-
-static force_inline __m64
-pix_multiply_1x64 (__m64 data,
- __m64 alpha)
-{
- return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
- mask_x0080),
- mask_x0101);
-}
-
-static force_inline __m64
-pix_add_multiply_1x64 (__m64* src,
- __m64* alpha_dst,
- __m64* dst,
- __m64* alpha_src)
-{
- __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
- __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);
-
- return _mm_adds_pu8 (t1, t2);
-}
-
-static force_inline __m64
-negate_1x64 (__m64 data)
-{
- return _mm_xor_si64 (data, mask_x00ff);
-}
-
-static force_inline __m64
-invert_colors_1x64 (__m64 data)
-{
- return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
-}
-
-static force_inline __m64
-over_1x64 (__m64 src, __m64 alpha, __m64 dst)
-{
- return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
-}
-
-static force_inline __m64
-in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
-{
- return over_1x64 (pix_multiply_1x64 (*src, *mask),
- pix_multiply_1x64 (*alpha, *mask),
- *dst);
-}
-
-static force_inline __m64
-over_rev_non_pre_1x64 (__m64 src, __m64 dst)
-{
- __m64 alpha = expand_alpha_1x64 (src);
-
- return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
- _mm_or_si64 (alpha, mask_x_alpha)),
- alpha,
- dst);
-}
-
-static force_inline uint32_t
-pack_1x64_32 (__m64 data)
-{
- return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
-}
-
-/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
- *
- * 00RR00GG00BB
- *
- * --- Expanding 565 in the low word ---
- *
- * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
- * m = m & (01f0003f001f);
- * m = m * (008404100840);
- * m = m >> 8;
- *
- * Note the trick here - the top word is shifted by another nibble to
- * avoid it bumping into the middle word
- */
-static force_inline __m64
-expand565_16_1x64 (uint16_t pixel)
-{
- __m64 p;
- __m64 t1, t2;
-
- p = _mm_cvtsi32_si64 ((uint32_t) pixel);
-
- t1 = _mm_slli_si64 (p, 36 - 11);
- t2 = _mm_slli_si64 (p, 16 - 5);
-
- p = _mm_or_si64 (t1, p);
- p = _mm_or_si64 (t2, p);
- p = _mm_and_si64 (p, mask_x565_rgb);
- p = _mm_mullo_pi16 (p, mask_x565_unpack);
-
- return _mm_srli_pi16 (p, 8);
-}
-
-/* ----------------------------------------------------------------------------
- * Compose Core transformations
- */
-static force_inline uint32_t
-core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
-{
- uint8_t a;
- __m64 ms;
-
- a = src >> 24;
-
- if (a == 0xff)
- {
- return src;
- }
- else if (src)
- {
- ms = unpack_32_1x64 (src);
- return pack_1x64_32 (
- over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
- }
-
- return dst;
-}
-
-static force_inline uint32_t
-combine1 (const uint32_t *ps, const uint32_t *pm)
-{
- uint32_t s = *ps;
-
- if (pm)
- {
- __m64 ms, mm;
-
- mm = unpack_32_1x64 (*pm);
- mm = expand_alpha_1x64 (mm);
-
- ms = unpack_32_1x64 (s);
- ms = pix_multiply_1x64 (ms, mm);
-
- s = pack_1x64_32 (ms);
- }
-
- return s;
-}
-
-static force_inline __m128i
-combine4 (const __m128i *ps, const __m128i *pm)
-{
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_msk_lo, xmm_msk_hi;
- __m128i s;
-
- if (pm)
- {
- xmm_msk_lo = load_128_unaligned (pm);
-
- if (is_transparent (xmm_msk_lo))
- return _mm_setzero_si128 ();
- }
-
- s = load_128_unaligned (ps);
-
- if (pm)
- {
- unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
-
- expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_msk_lo, &xmm_msk_hi,
- &xmm_src_lo, &xmm_src_hi);
-
- s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
- }
-
- return s;
-}
-
-static force_inline void
-core_combine_over_u_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t* pm,
- int w)
-{
- uint32_t s, d;
-
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_alpha_lo, xmm_alpha_hi;
-
- /* Align dst on a 16-byte boundary */
- while (w && ((unsigned long)pd & 15))
- {
- d = *pd;
- s = combine1 (ps, pm);
-
- *pd++ = core_combine_over_u_pixel_sse2 (s, d);
- ps++;
- if (pm)
- pm++;
- w--;
- }
-
- while (w >= 4)
- {
- /* I'm loading unaligned because I'm not sure about
- * the address alignment.
- */
- xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
-
- if (is_opaque (xmm_src_hi))
- {
- save_128_aligned ((__m128i*)pd, xmm_src_hi);
- }
- else if (!is_zero (xmm_src_hi))
- {
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (
- xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
-
- over_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- /* rebuid the 4 pixel data and save*/
- save_128_aligned ((__m128i*)pd,
- pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
-
- w -= 4;
- ps += 4;
- pd += 4;
- if (pm)
- pm += 4;
- }
-
- while (w)
- {
- d = *pd;
- s = combine1 (ps, pm);
-
- *pd++ = core_combine_over_u_pixel_sse2 (s, d);
- ps++;
- if (pm)
- pm++;
-
- w--;
- }
-}
-
-static force_inline void
-core_combine_over_reverse_u_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t* pm,
- int w)
-{
- uint32_t s, d;
-
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_alpha_lo, xmm_alpha_hi;
-
- /* Align dst on a 16-byte boundary */
- while (w &&
- ((unsigned long)pd & 15))
- {
- d = *pd;
- s = combine1 (ps, pm);
-
- *pd++ = core_combine_over_u_pixel_sse2 (d, s);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-
- while (w >= 4)
- {
- /* I'm loading unaligned because I'm not sure
- * about the address alignment.
- */
- xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_src_lo, &xmm_src_hi);
-
- /* rebuid the 4 pixel data and save*/
- save_128_aligned ((__m128i*)pd,
- pack_2x128_128 (xmm_src_lo, xmm_src_hi));
-
- w -= 4;
- ps += 4;
- pd += 4;
-
- if (pm)
- pm += 4;
- }
-
- while (w)
- {
- d = *pd;
- s = combine1 (ps, pm);
-
- *pd++ = core_combine_over_u_pixel_sse2 (d, s);
- ps++;
- w--;
- if (pm)
- pm++;
- }
-}
-
-static force_inline uint32_t
-core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
-{
- uint32_t maska = src >> 24;
-
- if (maska == 0)
- {
- return 0;
- }
- else if (maska != 0xff)
- {
- return pack_1x64_32 (
- pix_multiply_1x64 (unpack_32_1x64 (dst),
- expand_alpha_1x64 (unpack_32_1x64 (src))));
- }
-
- return dst;
-}
-
-static force_inline void
-core_combine_in_u_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t* pm,
- int w)
-{
- uint32_t s, d;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
-
- while (w && ((unsigned long) pd & 15))
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_in_u_pixelsse2 (d, s);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
- xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_dst_lo, &xmm_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned ((__m128i*)pd,
- pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- w -= 4;
- if (pm)
- pm += 4;
- }
-
- while (w)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_in_u_pixelsse2 (d, s);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-}
-
-static force_inline void
-core_combine_reverse_in_u_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t *pm,
- int w)
-{
- uint32_t s, d;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
-
- while (w && ((unsigned long) pd & 15))
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_in_u_pixelsse2 (s, d);
- ps++;
- w--;
- if (pm)
- pm++;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
- xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_src_lo, &xmm_src_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- w -= 4;
- if (pm)
- pm += 4;
- }
-
- while (w)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_in_u_pixelsse2 (s, d);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-}
-
-static force_inline void
-core_combine_reverse_out_u_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t* pm,
- int w)
-{
- while (w && ((unsigned long) pd & 15))
- {
- uint32_t s = combine1 (ps, pm);
- uint32_t d = *pd;
-
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (d), negate_1x64 (
- expand_alpha_1x64 (unpack_32_1x64 (s)))));
-
- if (pm)
- pm++;
- ps++;
- w--;
- }
-
- while (w >= 4)
- {
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
-
- xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-
- pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_src_lo, &xmm_src_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- if (pm)
- pm += 4;
-
- w -= 4;
- }
-
- while (w)
- {
- uint32_t s = combine1 (ps, pm);
- uint32_t d = *pd;
-
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (d), negate_1x64 (
- expand_alpha_1x64 (unpack_32_1x64 (s)))));
- ps++;
- if (pm)
- pm++;
- w--;
- }
-}
-
-static force_inline void
-core_combine_out_u_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t* pm,
- int w)
-{
- while (w && ((unsigned long) pd & 15))
- {
- uint32_t s = combine1 (ps, pm);
- uint32_t d = *pd;
-
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (s), negate_1x64 (
- expand_alpha_1x64 (unpack_32_1x64 (d)))));
- w--;
- ps++;
- if (pm)
- pm++;
- }
-
- while (w >= 4)
- {
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
-
- xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_dst_lo, &xmm_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- w -= 4;
- if (pm)
- pm += 4;
- }
-
- while (w)
- {
- uint32_t s = combine1 (ps, pm);
- uint32_t d = *pd;
-
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (s), negate_1x64 (
- expand_alpha_1x64 (unpack_32_1x64 (d)))));
- w--;
- ps++;
- if (pm)
- pm++;
- }
-}
-
-static force_inline uint32_t
-core_combine_atop_u_pixel_sse2 (uint32_t src,
- uint32_t dst)
-{
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
-
- __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
- __m64 da = expand_alpha_1x64 (d);
-
- return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
-}
-
-static force_inline void
-core_combine_atop_u_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t* pm,
- int w)
-{
- uint32_t s, d;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
- __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
-
- while (w && ((unsigned long) pd & 15))
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-
- while (w >= 4)
- {
- xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi);
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi);
-
- pix_add_multiply_2x128 (
- &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- w -= 4;
- if (pm)
- pm += 4;
- }
-
- while (w)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-}
-
-static force_inline uint32_t
-core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
- uint32_t dst)
-{
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
-
- __m64 sa = expand_alpha_1x64 (s);
- __m64 da = negate_1x64 (expand_alpha_1x64 (d));
-
- return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
-}
-
-static force_inline void
-core_combine_reverse_atop_u_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t* pm,
- int w)
-{
- uint32_t s, d;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
- __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
-
- while (w && ((unsigned long) pd & 15))
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
- ps++;
- w--;
- if (pm)
- pm++;
- }
-
- while (w >= 4)
- {
- xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi);
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- pix_add_multiply_2x128 (
- &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- w -= 4;
- if (pm)
- pm += 4;
- }
-
- while (w)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
- ps++;
- w--;
- if (pm)
- pm++;
- }
-}
-
-static force_inline uint32_t
-core_combine_xor_u_pixel_sse2 (uint32_t src,
- uint32_t dst)
-{
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
-
- __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
- __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
-
- return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
-}
-
-static force_inline void
-core_combine_xor_u_sse2 (uint32_t* dst,
- const uint32_t* src,
- const uint32_t *mask,
- int width)
-{
- int w = width;
- uint32_t s, d;
- uint32_t* pd = dst;
- const uint32_t* ps = src;
- const uint32_t* pm = mask;
-
- __m128i xmm_src, xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
- __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
-
- while (w && ((unsigned long) pd & 15))
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-
- while (w >= 4)
- {
- xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
- xmm_dst = load_128_aligned ((__m128i*) pd);
-
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi);
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi);
- negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- pix_add_multiply_2x128 (
- &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- w -= 4;
- if (pm)
- pm += 4;
- }
-
- while (w)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-}
-
-static force_inline void
-core_combine_add_u_sse2 (uint32_t* dst,
- const uint32_t* src,
- const uint32_t* mask,
- int width)
-{
- int w = width;
- uint32_t s, d;
- uint32_t* pd = dst;
- const uint32_t* ps = src;
- const uint32_t* pm = mask;
-
- while (w && (unsigned long)pd & 15)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- ps++;
- if (pm)
- pm++;
- *pd++ = _mm_cvtsi64_si32 (
- _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
- w--;
- }
-
- while (w >= 4)
- {
- __m128i s;
-
- s = combine4 ((__m128i*)ps, (__m128i*)pm);
-
- save_128_aligned (
- (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));
-
- pd += 4;
- ps += 4;
- if (pm)
- pm += 4;
- w -= 4;
- }
-
- while (w--)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- ps++;
- *pd++ = _mm_cvtsi64_si32 (
- _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
- if (pm)
- pm++;
- }
-}
-
-static force_inline uint32_t
-core_combine_saturate_u_pixel_sse2 (uint32_t src,
- uint32_t dst)
-{
- __m64 ms = unpack_32_1x64 (src);
- __m64 md = unpack_32_1x64 (dst);
- uint32_t sa = src >> 24;
- uint32_t da = ~dst >> 24;
-
- if (sa > da)
- {
- ms = pix_multiply_1x64 (
- ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
- }
-
- return pack_1x64_32 (_mm_adds_pu16 (md, ms));
-}
-
-static force_inline void
-core_combine_saturate_u_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
-{
- uint32_t s, d;
-
- uint32_t pack_cmp;
- __m128i xmm_src, xmm_dst;
-
- while (w && (unsigned long)pd & 15)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
- w--;
- ps++;
- if (pm)
- pm++;
- }
-
- while (w >= 4)
- {
- xmm_dst = load_128_aligned ((__m128i*)pd);
- xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
-
- pack_cmp = _mm_movemask_epi8 (
- _mm_cmpgt_epi32 (
- _mm_srli_epi32 (xmm_src, 24),
- _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
-
- /* if some alpha src is grater than respective ~alpha dst */
- if (pack_cmp)
- {
- s = combine1 (ps++, pm);
- d = *pd;
- *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
- if (pm)
- pm++;
-
- s = combine1 (ps++, pm);
- d = *pd;
- *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
- if (pm)
- pm++;
-
- s = combine1 (ps++, pm);
- d = *pd;
- *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
- if (pm)
- pm++;
-
- s = combine1 (ps++, pm);
- d = *pd;
- *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
- if (pm)
- pm++;
- }
- else
- {
- save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
-
- pd += 4;
- ps += 4;
- if (pm)
- pm += 4;
- }
-
- w -= 4;
- }
-
- while (w--)
- {
- s = combine1 (ps, pm);
- d = *pd;
-
- *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
- ps++;
- if (pm)
- pm++;
- }
-}
-
-static force_inline void
-core_combine_src_ca_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t *pm,
- int w)
-{
- uint32_t s, m;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
- w--;
- }
-
- while (w >= 4)
- {
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
- w--;
- }
-}
-
-static force_inline uint32_t
-core_combine_over_ca_pixel_sse2 (uint32_t src,
- uint32_t mask,
- uint32_t dst)
-{
- __m64 s = unpack_32_1x64 (src);
- __m64 expAlpha = expand_alpha_1x64 (s);
- __m64 unpk_mask = unpack_32_1x64 (mask);
- __m64 unpk_dst = unpack_32_1x64 (dst);
-
- return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
-}
-
-static force_inline void
-core_combine_over_ca_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t *pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_alpha_lo, xmm_alpha_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
- w--;
- }
-}
-
-static force_inline uint32_t
-core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
- uint32_t mask,
- uint32_t dst)
-{
- __m64 d = unpack_32_1x64 (dst);
-
- return pack_1x64_32 (
- over_1x64 (d, expand_alpha_1x64 (d),
- pix_multiply_1x64 (unpack_32_1x64 (src),
- unpack_32_1x64 (mask))));
-}
-
-static force_inline void
-core_combine_over_reverse_ca_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t *pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_alpha_lo, xmm_alpha_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
- w--;
- }
-}
-
-static force_inline void
-core_combine_in_ca_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_alpha_lo, xmm_alpha_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
- expand_alpha_1x64 (unpack_32_1x64 (d))));
-
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- pix_multiply_1x64 (
- unpack_32_1x64 (s), unpack_32_1x64 (m)),
- expand_alpha_1x64 (unpack_32_1x64 (d))));
-
- w--;
- }
-}
-
-static force_inline void
-core_combine_in_reverse_ca_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_alpha_lo, xmm_alpha_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (d),
- pix_multiply_1x64 (unpack_32_1x64 (m),
- expand_alpha_1x64 (unpack_32_1x64 (s)))));
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
- pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (d),
- pix_multiply_1x64 (unpack_32_1x64 (m),
- expand_alpha_1x64 (unpack_32_1x64 (s)))));
- w--;
- }
-}
-
-static force_inline void
-core_combine_out_ca_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_alpha_lo, xmm_alpha_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- pix_multiply_1x64 (
- unpack_32_1x64 (s), unpack_32_1x64 (m)),
- negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
- negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst_lo, &xmm_dst_hi);
- pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- pix_multiply_1x64 (
- unpack_32_1x64 (s), unpack_32_1x64 (m)),
- negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
-
- w--;
- }
-}
-
-static force_inline void
-core_combine_out_reverse_ca_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_alpha_lo, xmm_alpha_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (d),
- negate_1x64 (pix_multiply_1x64 (
- unpack_32_1x64 (m),
- expand_alpha_1x64 (unpack_32_1x64 (s))))));
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- negate_2x128 (xmm_mask_lo, xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (d),
- negate_1x64 (pix_multiply_1x64 (
- unpack_32_1x64 (m),
- expand_alpha_1x64 (unpack_32_1x64 (s))))));
- w--;
- }
-}
-
-static force_inline uint32_t
-core_combine_atop_ca_pixel_sse2 (uint32_t src,
- uint32_t mask,
- uint32_t dst)
-{
- __m64 m = unpack_32_1x64 (mask);
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
- __m64 sa = expand_alpha_1x64 (s);
- __m64 da = expand_alpha_1x64 (d);
-
- s = pix_multiply_1x64 (s, m);
- m = negate_1x64 (pix_multiply_1x64 (m, sa));
-
- return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
-}
-
-static force_inline void
-core_combine_atop_ca_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
- __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi);
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_src_lo, &xmm_src_hi);
- pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- pix_add_multiply_2x128 (
- &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
- &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
- w--;
- }
-}
-
-static force_inline uint32_t
-core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
- uint32_t mask,
- uint32_t dst)
-{
- __m64 m = unpack_32_1x64 (mask);
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
-
- __m64 da = negate_1x64 (expand_alpha_1x64 (d));
- __m64 sa = expand_alpha_1x64 (s);
-
- s = pix_multiply_1x64 (s, m);
- m = pix_multiply_1x64 (m, sa);
-
- return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
-}
-
-static force_inline void
-core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
- __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi);
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_src_lo, &xmm_src_hi);
- pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- pix_add_multiply_2x128 (
- &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
- &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
- w--;
- }
-}
-
-static force_inline uint32_t
-core_combine_xor_ca_pixel_sse2 (uint32_t src,
- uint32_t mask,
- uint32_t dst)
-{
- __m64 a = unpack_32_1x64 (mask);
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
-
- __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
- a, expand_alpha_1x64 (s)));
- __m64 dest = pix_multiply_1x64 (s, a);
- __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
-
- return pack_1x64_32 (pix_add_multiply_1x64 (&d,
- &alpha_dst,
- &dest,
- &alpha_src));
-}
-
-static force_inline void
-core_combine_xor_ca_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
- __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi);
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_src_lo, &xmm_src_hi);
- pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
- &xmm_alpha_src_lo, &xmm_alpha_src_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
- &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
- negate_2x128 (xmm_mask_lo, xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- pix_add_multiply_2x128 (
- &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
- &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
- w--;
- }
-}
-
-static force_inline void
-core_combine_add_ca_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
-{
- uint32_t s, m, d;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask_lo, xmm_mask_hi;
-
- while (w && (unsigned long)pd & 15)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (
- _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
- unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
- w--;
- }
-
- while (w >= 4)
- {
- xmm_src_hi = load_128_unaligned ((__m128i*)ps);
- xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
- xmm_dst_hi = load_128_aligned ((__m128i*)pd);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_src_lo, &xmm_src_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (
- _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
- _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (
- _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
- unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
- w--;
- }
-}
-
-/* ---------------------------------------------------
- * fb_compose_setup_sSE2
- */
-static force_inline __m64
-create_mask_16_64 (uint16_t mask)
-{
- return _mm_set1_pi16 (mask);
-}
-
-static force_inline __m128i
-create_mask_16_128 (uint16_t mask)
-{
- return _mm_set1_epi16 (mask);
-}
-
-static force_inline __m64
-create_mask_2x32_64 (uint32_t mask0,
- uint32_t mask1)
-{
- return _mm_set_pi32 (mask0, mask1);
-}
-
-/* Work around a code generation bug in Sun Studio 12. */
-#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
-# define create_mask_2x32_128(mask0, mask1) \
- (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
-#else
-static force_inline __m128i
-create_mask_2x32_128 (uint32_t mask0,
- uint32_t mask1)
-{
- return _mm_set_epi32 (mask0, mask1, mask0, mask1);
-}
-#endif
-
-/* SSE2 code patch for fbcompose.c */
-
-static void
-sse2_combine_over_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_over_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_over_reverse_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_over_reverse_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_in_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_in_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_in_reverse_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_reverse_in_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_out_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_out_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_out_reverse_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_reverse_out_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_atop_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_atop_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_xor_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_xor_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_add_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_add_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_saturate_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_saturate_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_src_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_src_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_over_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_over_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_in_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_in_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_out_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_out_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_atop_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_atop_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_xor_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_xor_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_add_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_add_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-/* -------------------------------------------------------------------
- * composite_over_n_8888
- */
-
-static void
-sse2_composite_over_n_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src;
- uint32_t *dst_line, *dst, d;
- int32_t w;
- int dst_stride;
- __m128i xmm_src, xmm_alpha;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-
- xmm_src = expand_pixel_32_1x128 (src);
- xmm_alpha = expand_alpha_1x128 (xmm_src);
-
- while (height--)
- {
- dst = dst_line;
-
- dst_line += dst_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- d = *dst;
- *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
- _mm_movepi64_pi64 (xmm_alpha),
- unpack_32_1x64 (d)));
- w--;
- }
-
- while (w >= 4)
- {
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- over_2x128 (&xmm_src, &xmm_src,
- &xmm_alpha, &xmm_alpha,
- &xmm_dst_lo, &xmm_dst_hi);
-
- /* rebuid the 4 pixel data and save*/
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- w -= 4;
- dst += 4;
- }
-
- while (w)
- {
- d = *dst;
- *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
- _mm_movepi64_pi64 (xmm_alpha),
- unpack_32_1x64 (d)));
- w--;
- }
-
- }
- _mm_empty ();
-}
-
-/* ---------------------------------------------------------------------
- * composite_over_n_0565
- */
-static void
-sse2_composite_over_n_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src;
- uint16_t *dst_line, *dst, d;
- int32_t w;
- int dst_stride;
- __m128i xmm_src, xmm_alpha;
- __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-
- xmm_src = expand_pixel_32_1x128 (src);
- xmm_alpha = expand_alpha_1x128 (xmm_src);
-
- while (height--)
- {
- dst = dst_line;
-
- dst_line += dst_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- d = *dst;
-
- *dst++ = pack_565_32_16 (
- pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
- _mm_movepi64_pi64 (xmm_alpha),
- expand565_16_1x64 (d))));
- w--;
- }
-
- while (w >= 8)
- {
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_565_128_4x128 (xmm_dst,
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
-
- over_2x128 (&xmm_src, &xmm_src,
- &xmm_alpha, &xmm_alpha,
- &xmm_dst0, &xmm_dst1);
- over_2x128 (&xmm_src, &xmm_src,
- &xmm_alpha, &xmm_alpha,
- &xmm_dst2, &xmm_dst3);
-
- xmm_dst = pack_565_4x128_128 (
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
-
- save_128_aligned ((__m128i*)dst, xmm_dst);
-
- dst += 8;
- w -= 8;
- }
-
- while (w--)
- {
- d = *dst;
- *dst++ = pack_565_32_16 (
- pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
- _mm_movepi64_pi64 (xmm_alpha),
- expand565_16_1x64 (d))));
- }
- }
-
- _mm_empty ();
-}
-
-/* ------------------------------
- * composite_add_n_8888_8888_ca
- */
-static void
-sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint32_t *dst_line, d;
- uint32_t *mask_line, m;
- uint32_t pack_cmp;
- int dst_stride, mask_stride;
-
- __m128i xmm_src, xmm_alpha;
- __m128i xmm_dst;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
- __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
- srca = src >> 24;
-
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
- xmm_src = _mm_unpacklo_epi8 (
- create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
- xmm_alpha = expand_alpha_1x128 (xmm_src);
- mmx_src = _mm_movepi64_pi64 (xmm_src);
- mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
-
- while (height--)
- {
- int w = width;
- const uint32_t *pm = (uint32_t *)mask_line;
- uint32_t *pd = (uint32_t *)dst_line;
-
- dst_line += dst_stride;
- mask_line += mask_stride;
-
- while (w && (unsigned long)pd & 15)
- {
- m = *pm++;
-
- if (m)
- {
- d = *pd;
-
- mmx_mask = unpack_32_1x64 (m);
- mmx_dest = unpack_32_1x64 (d);
-
- *pd = pack_1x64_32 (
- _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
- }
-
- pd++;
- w--;
- }
-
- while (w >= 4)
- {
- xmm_mask = load_128_unaligned ((__m128i*)pm);
-
- pack_cmp =
- _mm_movemask_epi8 (
- _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
-
- /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
- if (pack_cmp != 0xffff)
- {
- xmm_dst = load_128_aligned ((__m128i*)pd);
-
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
- pix_multiply_2x128 (&xmm_src, &xmm_src,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
- xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
-
- save_128_aligned (
- (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
- }
-
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- m = *pm++;
-
- if (m)
- {
- d = *pd;
-
- mmx_mask = unpack_32_1x64 (m);
- mmx_dest = unpack_32_1x64 (d);
-
- *pd = pack_1x64_32 (
- _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
- }
-
- pd++;
- w--;
- }
- }
-
- _mm_empty ();
-}
-
-/* ---------------------------------------------------------------------------
- * composite_over_n_8888_8888_ca
- */
-
-static void
-sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src;
- uint32_t *dst_line, d;
- uint32_t *mask_line, m;
- uint32_t pack_cmp;
- int dst_stride, mask_stride;
-
- __m128i xmm_src, xmm_alpha;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
- __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
- xmm_src = _mm_unpacklo_epi8 (
- create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
- xmm_alpha = expand_alpha_1x128 (xmm_src);
- mmx_src = _mm_movepi64_pi64 (xmm_src);
- mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
-
- while (height--)
- {
- int w = width;
- const uint32_t *pm = (uint32_t *)mask_line;
- uint32_t *pd = (uint32_t *)dst_line;
-
- dst_line += dst_stride;
- mask_line += mask_stride;
-
- while (w && (unsigned long)pd & 15)
- {
- m = *pm++;
-
- if (m)
- {
- d = *pd;
- mmx_mask = unpack_32_1x64 (m);
- mmx_dest = unpack_32_1x64 (d);
-
- *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
- &mmx_alpha,
- &mmx_mask,
- &mmx_dest));
- }
-
- pd++;
- w--;
- }
-
- while (w >= 4)
- {
- xmm_mask = load_128_unaligned ((__m128i*)pm);
-
- pack_cmp =
- _mm_movemask_epi8 (
- _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
-
- /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
- if (pack_cmp != 0xffff)
- {
- xmm_dst = load_128_aligned ((__m128i*)pd);
-
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- in_over_2x128 (&xmm_src, &xmm_src,
- &xmm_alpha, &xmm_alpha,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
-
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- m = *pm++;
-
- if (m)
- {
- d = *pd;
- mmx_mask = unpack_32_1x64 (m);
- mmx_dest = unpack_32_1x64 (d);
-
- *pd = pack_1x64_32 (
- in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
- }
-
- pd++;
- w--;
- }
- }
-
- _mm_empty ();
-}
-
-/*---------------------------------------------------------------------
- * composite_over_8888_n_8888
- */
-
-static void
-sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src;
- uint32_t mask;
- int32_t w;
- int dst_stride, src_stride;
-
- __m128i xmm_mask;
- __m128i xmm_src, xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_alpha_lo, xmm_alpha_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
-
- xmm_mask = create_mask_16_128 (mask >> 24);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- uint32_t s = *src++;
-
- if (s)
- {
- uint32_t d = *dst;
-
- __m64 ms = unpack_32_1x64 (s);
- __m64 alpha = expand_alpha_1x64 (ms);
- __m64 dest = _mm_movepi64_pi64 (xmm_mask);
- __m64 alpha_dst = unpack_32_1x64 (d);
-
- *dst = pack_1x64_32 (
- in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
- }
- dst++;
- w--;
- }
-
- while (w >= 4)
- {
- xmm_src = load_128_unaligned ((__m128i*)src);
-
- if (!is_zero (xmm_src))
- {
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_mask, &xmm_mask,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
-
- dst += 4;
- src += 4;
- w -= 4;
- }
-
- while (w)
- {
- uint32_t s = *src++;
-
- if (s)
- {
- uint32_t d = *dst;
-
- __m64 ms = unpack_32_1x64 (s);
- __m64 alpha = expand_alpha_1x64 (ms);
- __m64 mask = _mm_movepi64_pi64 (xmm_mask);
- __m64 dest = unpack_32_1x64 (d);
-
- *dst = pack_1x64_32 (
- in_over_1x64 (&ms, &alpha, &mask, &dest));
- }
-
- dst++;
- w--;
- }
- }
-
- _mm_empty ();
-}
-
-/*---------------------------------------------------------------------
- * composite_over_8888_n_8888
- */
-
-static void
-sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src;
- int32_t w;
- int dst_stride, src_stride;
-
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- *dst++ = *src++ | 0xff000000;
- w--;
- }
-
- while (w >= 16)
- {
- __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
-
- xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
- xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
- xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
- xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
-
- save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
- save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
- save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
- save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
-
- dst += 16;
- src += 16;
- w -= 16;
- }
-
- while (w)
- {
- *dst++ = *src++ | 0xff000000;
- w--;
- }
- }
-
- _mm_empty ();
-}
-
-/* ---------------------------------------------------------------------
- * composite_over_x888_n_8888
- */
-static void
-sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src;
- uint32_t mask;
- int dst_stride, src_stride;
- int32_t w;
-
- __m128i xmm_mask, xmm_alpha;
- __m128i xmm_src, xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
-
- xmm_mask = create_mask_16_128 (mask >> 24);
- xmm_alpha = mask_00ff;
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- uint32_t s = (*src++) | 0xff000000;
- uint32_t d = *dst;
-
- __m64 src = unpack_32_1x64 (s);
- __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
- __m64 mask = _mm_movepi64_pi64 (xmm_mask);
- __m64 dest = unpack_32_1x64 (d);
-
- *dst++ = pack_1x64_32 (
- in_over_1x64 (&src, &alpha, &mask, &dest));
-
- w--;
- }
-
- while (w >= 4)
- {
- xmm_src = _mm_or_si128 (
- load_128_unaligned ((__m128i*)src), mask_ff000000);
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_alpha, &xmm_alpha,
- &xmm_mask, &xmm_mask,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- dst += 4;
- src += 4;
- w -= 4;
-
- }
-
- while (w)
- {
- uint32_t s = (*src++) | 0xff000000;
- uint32_t d = *dst;
-
- __m64 src = unpack_32_1x64 (s);
- __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
- __m64 mask = _mm_movepi64_pi64 (xmm_mask);
- __m64 dest = unpack_32_1x64 (d);
-
- *dst++ = pack_1x64_32 (
- in_over_1x64 (&src, &alpha, &mask, &dest));
-
- w--;
- }
- }
-
- _mm_empty ();
-}
-
-/* --------------------------------------------------------------------
- * composite_over_8888_8888
- */
-static void
-sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- int dst_stride, src_stride;
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- dst = dst_line;
- src = src_line;
-
- while (height--)
- {
- core_combine_over_u_sse2 (dst, src, NULL, width);
-
- dst += dst_stride;
- src += src_stride;
- }
- _mm_empty ();
-}
-
-/* ------------------------------------------------------------------
- * composite_over_8888_0565
- */
-static force_inline uint16_t
-composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
-{
- __m64 ms;
-
- ms = unpack_32_1x64 (src);
- return pack_565_32_16 (
- pack_1x64_32 (
- over_1x64 (
- ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
-}
-
-static void
-sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint16_t *dst_line, *dst, d;
- uint32_t *src_line, *src, s;
- int dst_stride, src_stride;
- int32_t w;
-
- __m128i xmm_alpha_lo, xmm_alpha_hi;
- __m128i xmm_src, xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-#if 0
- /* FIXME
- *
- * I copy the code from MMX one and keep the fixme.
- * If it's a problem there, probably is a problem here.
- */
- assert (src_image->drawable == mask_image->drawable);
-#endif
-
- while (height--)
- {
- dst = dst_line;
- src = src_line;
-
- dst_line += dst_stride;
- src_line += src_stride;
- w = width;
-
- /* Align dst on a 16-byte boundary */
- while (w &&
- ((unsigned long)dst & 15))
- {
- s = *src++;
- d = *dst;
-
- *dst++ = composite_over_8888_0565pixel (s, d);
- w--;
- }
-
- /* It's a 8 pixel loop */
- while (w >= 8)
- {
- /* I'm loading unaligned because I'm not sure
- * about the address alignment.
- */
- xmm_src = load_128_unaligned ((__m128i*) src);
- xmm_dst = load_128_aligned ((__m128i*) dst);
-
- /* Unpacking */
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- unpack_565_128_4x128 (xmm_dst,
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- /* I'm loading next 4 pixels from memory
- * before to optimze the memory read.
- */
- xmm_src = load_128_unaligned ((__m128i*) (src + 4));
-
- over_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_dst0, &xmm_dst1);
-
- /* Unpacking */
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- over_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_dst2, &xmm_dst3);
-
- save_128_aligned (
- (__m128i*)dst, pack_565_4x128_128 (
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
-
- w -= 8;
- dst += 8;
- src += 8;
- }
-
- while (w--)
- {
- s = *src++;
- d = *dst;
-
- *dst++ = composite_over_8888_0565pixel (s, d);
- }
- }
-
- _mm_empty ();
-}
-
-/* -----------------------------------------------------------------
- * composite_over_n_8_8888
- */
-
-static void
-sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint32_t *dst_line, *dst;
- uint8_t *mask_line, *mask;
- int dst_stride, mask_stride;
- int32_t w;
- uint32_t m, d;
-
- __m128i xmm_src, xmm_alpha, xmm_def;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
- __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- xmm_def = create_mask_2x32_128 (src, src);
- xmm_src = expand_pixel_32_1x128 (src);
- xmm_alpha = expand_alpha_1x128 (xmm_src);
- mmx_src = _mm_movepi64_pi64 (xmm_src);
- mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- uint8_t m = *mask++;
-
- if (m)
- {
- d = *dst;
- mmx_mask = expand_pixel_8_1x64 (m);
- mmx_dest = unpack_32_1x64 (d);
-
- *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
- &mmx_alpha,
- &mmx_mask,
- &mmx_dest));
- }
-
- w--;
- dst++;
- }
-
- while (w >= 4)
- {
- m = *((uint32_t*)mask);
-
- if (srca == 0xff && m == 0xffffffff)
- {
- save_128_aligned ((__m128i*)dst, xmm_def);
- }
- else if (m)
- {
- xmm_dst = load_128_aligned ((__m128i*) dst);
- xmm_mask = unpack_32_1x128 (m);
- xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
-
- /* Unpacking */
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- in_over_2x128 (&xmm_src, &xmm_src,
- &xmm_alpha, &xmm_alpha,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
-
- w -= 4;
- dst += 4;
- mask += 4;
- }
-
- while (w)
- {
- uint8_t m = *mask++;
-
- if (m)
- {
- d = *dst;
- mmx_mask = expand_pixel_8_1x64 (m);
- mmx_dest = unpack_32_1x64 (d);
-
- *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
- &mmx_alpha,
- &mmx_mask,
- &mmx_dest));
- }
-
- w--;
- dst++;
- }
- }
-
- _mm_empty ();
-}
-
-/* ----------------------------------------------------------------
- * composite_over_n_8_8888
- */
-
-pixman_bool_t
-pixman_fill_sse2 (uint32_t *bits,
- int stride,
- int bpp,
- int x,
- int y,
- int width,
- int height,
- uint32_t data)
-{
- uint32_t byte_width;
- uint8_t *byte_line;
-
- __m128i xmm_def;
-
- if (bpp == 8)
- {
- uint8_t b;
- uint16_t w;
-
- stride = stride * (int) sizeof (uint32_t) / 1;
- byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
- byte_width = width;
- stride *= 1;
-
- b = data & 0xff;
- w = (b << 8) | b;
- data = (w << 16) | w;
- }
- else if (bpp == 16)
- {
- stride = stride * (int) sizeof (uint32_t) / 2;
- byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
- byte_width = 2 * width;
- stride *= 2;
-
- data = (data & 0xffff) * 0x00010001;
- }
- else if (bpp == 32)
- {
- stride = stride * (int) sizeof (uint32_t) / 4;
- byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
- byte_width = 4 * width;
- stride *= 4;
- }
- else
- {
- return FALSE;
- }
-
- xmm_def = create_mask_2x32_128 (data, data);
-
- while (height--)
- {
- int w;
- uint8_t *d = byte_line;
- byte_line += stride;
- w = byte_width;
-
- while (w >= 1 && ((unsigned long)d & 1))
- {
- *(uint8_t *)d = data;
- w -= 1;
- d += 1;
- }
-
- while (w >= 2 && ((unsigned long)d & 3))
- {
- *(uint16_t *)d = data;
- w -= 2;
- d += 2;
- }
-
- while (w >= 4 && ((unsigned long)d & 15))
- {
- *(uint32_t *)d = data;
-
- w -= 4;
- d += 4;
- }
-
- while (w >= 128)
- {
- save_128_aligned ((__m128i*)(d), xmm_def);
- save_128_aligned ((__m128i*)(d + 16), xmm_def);
- save_128_aligned ((__m128i*)(d + 32), xmm_def);
- save_128_aligned ((__m128i*)(d + 48), xmm_def);
- save_128_aligned ((__m128i*)(d + 64), xmm_def);
- save_128_aligned ((__m128i*)(d + 80), xmm_def);
- save_128_aligned ((__m128i*)(d + 96), xmm_def);
- save_128_aligned ((__m128i*)(d + 112), xmm_def);
-
- d += 128;
- w -= 128;
- }
-
- if (w >= 64)
- {
- save_128_aligned ((__m128i*)(d), xmm_def);
- save_128_aligned ((__m128i*)(d + 16), xmm_def);
- save_128_aligned ((__m128i*)(d + 32), xmm_def);
- save_128_aligned ((__m128i*)(d + 48), xmm_def);
-
- d += 64;
- w -= 64;
- }
-
- if (w >= 32)
- {
- save_128_aligned ((__m128i*)(d), xmm_def);
- save_128_aligned ((__m128i*)(d + 16), xmm_def);
-
- d += 32;
- w -= 32;
- }
-
- if (w >= 16)
- {
- save_128_aligned ((__m128i*)(d), xmm_def);
-
- d += 16;
- w -= 16;
- }
-
- while (w >= 4)
- {
- *(uint32_t *)d = data;
-
- w -= 4;
- d += 4;
- }
-
- if (w >= 2)
- {
- *(uint16_t *)d = data;
- w -= 2;
- d += 2;
- }
-
- if (w >= 1)
- {
- *(uint8_t *)d = data;
- w -= 1;
- d += 1;
- }
- }
-
- _mm_empty ();
- return TRUE;
-}
-
-static void
-sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint32_t *dst_line, *dst;
- uint8_t *mask_line, *mask;
- int dst_stride, mask_stride;
- int32_t w;
- uint32_t m;
-
- __m128i xmm_src, xmm_def;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- {
- pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
- PIXMAN_FORMAT_BPP (dst_image->bits.format),
- dest_x, dest_y, width, height, 0);
- return;
- }
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- xmm_def = create_mask_2x32_128 (src, src);
- xmm_src = expand_pixel_32_1x128 (src);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- uint8_t m = *mask++;
-
- if (m)
- {
- *dst = pack_1x64_32 (
- pix_multiply_1x64 (
- _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
- }
- else
- {
- *dst = 0;
- }
-
- w--;
- dst++;
- }
-
- while (w >= 4)
- {
- m = *((uint32_t*)mask);
-
- if (srca == 0xff && m == 0xffffffff)
- {
- save_128_aligned ((__m128i*)dst, xmm_def);
- }
- else if (m)
- {
- xmm_mask = unpack_32_1x128 (m);
- xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
-
- /* Unpacking */
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- pix_multiply_2x128 (&xmm_src, &xmm_src,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
- }
- else
- {
- save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
- }
-
- w -= 4;
- dst += 4;
- mask += 4;
- }
-
- while (w)
- {
- uint8_t m = *mask++;
-
- if (m)
- {
- *dst = pack_1x64_32 (
- pix_multiply_1x64 (
- _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
- }
- else
- {
- *dst = 0;
- }
-
- w--;
- dst++;
- }
- }
-
- _mm_empty ();
-}
-
-/*-----------------------------------------------------------------------
- * composite_over_n_8_0565
- */
-
-static void
-sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint16_t *dst_line, *dst, d;
- uint8_t *mask_line, *mask;
- int dst_stride, mask_stride;
- int32_t w;
- uint32_t m;
- __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
-
- __m128i xmm_src, xmm_alpha;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- xmm_src = expand_pixel_32_1x128 (src);
- xmm_alpha = expand_alpha_1x128 (xmm_src);
- mmx_src = _mm_movepi64_pi64 (xmm_src);
- mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- m = *mask++;
-
- if (m)
- {
- d = *dst;
- mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
- mmx_dest = expand565_16_1x64 (d);
-
- *dst = pack_565_32_16 (
- pack_1x64_32 (
- in_over_1x64 (
- &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
- }
-
- w--;
- dst++;
- }
-
- while (w >= 8)
- {
- xmm_dst = load_128_aligned ((__m128i*) dst);
- unpack_565_128_4x128 (xmm_dst,
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
-
- m = *((uint32_t*)mask);
- mask += 4;
-
- if (m)
- {
- xmm_mask = unpack_32_1x128 (m);
- xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
-
- /* Unpacking */
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- in_over_2x128 (&xmm_src, &xmm_src,
- &xmm_alpha, &xmm_alpha,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst0, &xmm_dst1);
- }
-
- m = *((uint32_t*)mask);
- mask += 4;
-
- if (m)
- {
- xmm_mask = unpack_32_1x128 (m);
- xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
-
- /* Unpacking */
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
- expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
- in_over_2x128 (&xmm_src, &xmm_src,
- &xmm_alpha, &xmm_alpha,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst2, &xmm_dst3);
- }
-
- save_128_aligned (
- (__m128i*)dst, pack_565_4x128_128 (
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
-
- w -= 8;
- dst += 8;
- }
-
- while (w)
- {
- m = *mask++;
-
- if (m)
- {
- d = *dst;
- mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
- mmx_dest = expand565_16_1x64 (d);
-
- *dst = pack_565_32_16 (
- pack_1x64_32 (
- in_over_1x64 (
- &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
- }
-
- w--;
- dst++;
- }
- }
-
- _mm_empty ();
-}
-
-/* -----------------------------------------------------------------------
- * composite_over_pixbuf_0565
- */
-
-static void
-sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint16_t *dst_line, *dst, d;
- uint32_t *src_line, *src, s;
- int dst_stride, src_stride;
- int32_t w;
- uint32_t opaque, zero;
-
- __m64 ms;
- __m128i xmm_src, xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-#if 0
- /* FIXME
- *
- * I copy the code from MMX one and keep the fixme.
- * If it's a problem there, probably is a problem here.
- */
- assert (src_image->drawable == mask_image->drawable);
-#endif
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- s = *src++;
- d = *dst;
-
- ms = unpack_32_1x64 (s);
-
- *dst++ = pack_565_32_16 (
- pack_1x64_32 (
- over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
- w--;
- }
-
- while (w >= 8)
- {
- /* First round */
- xmm_src = load_128_unaligned ((__m128i*)src);
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- opaque = is_opaque (xmm_src);
- zero = is_zero (xmm_src);
-
- unpack_565_128_4x128 (xmm_dst,
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-
- /* preload next round*/
- xmm_src = load_128_unaligned ((__m128i*)(src + 4));
-
- if (opaque)
- {
- invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_dst0, &xmm_dst1);
- }
- else if (!zero)
- {
- over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_dst0, &xmm_dst1);
- }
-
- /* Second round */
- opaque = is_opaque (xmm_src);
- zero = is_zero (xmm_src);
-
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-
- if (opaque)
- {
- invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_dst2, &xmm_dst3);
- }
- else if (!zero)
- {
- over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_dst2, &xmm_dst3);
- }
-
- save_128_aligned (
- (__m128i*)dst, pack_565_4x128_128 (
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
-
- w -= 8;
- src += 8;
- dst += 8;
- }
-
- while (w)
- {
- s = *src++;
- d = *dst;
-
- ms = unpack_32_1x64 (s);
-
- *dst++ = pack_565_32_16 (
- pack_1x64_32 (
- over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
- w--;
- }
- }
-
- _mm_empty ();
-}
-
-/* -------------------------------------------------------------------------
- * composite_over_pixbuf_8888
- */
-
-static void
-sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst, d;
- uint32_t *src_line, *src, s;
- int dst_stride, src_stride;
- int32_t w;
- uint32_t opaque, zero;
-
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst_lo, xmm_dst_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-#if 0
- /* FIXME
- *
- * I copy the code from MMX one and keep the fixme.
- * If it's a problem there, probably is a problem here.
- */
- assert (src_image->drawable == mask_image->drawable);
-#endif
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- s = *src++;
- d = *dst;
-
- *dst++ = pack_1x64_32 (
- over_rev_non_pre_1x64 (
- unpack_32_1x64 (s), unpack_32_1x64 (d)));
-
- w--;
- }
-
- while (w >= 4)
- {
- xmm_src_hi = load_128_unaligned ((__m128i*)src);
-
- opaque = is_opaque (xmm_src_hi);
- zero = is_zero (xmm_src_hi);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-
- if (opaque)
- {
- invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
- else if (!zero)
- {
- xmm_dst_hi = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
-
- w -= 4;
- dst += 4;
- src += 4;
- }
-
- while (w)
- {
- s = *src++;
- d = *dst;
-
- *dst++ = pack_1x64_32 (
- over_rev_non_pre_1x64 (
- unpack_32_1x64 (s), unpack_32_1x64 (d)));
-
- w--;
- }
- }
-
- _mm_empty ();
-}
-
-/* -------------------------------------------------------------------------------------------------
- * composite_over_n_8888_0565_ca
- */
-
-static void
-sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src;
- uint16_t *dst_line, *dst, d;
- uint32_t *mask_line, *mask, m;
- int dst_stride, mask_stride;
- int w;
- uint32_t pack_cmp;
-
- __m128i xmm_src, xmm_alpha;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
-
- __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
- xmm_src = expand_pixel_32_1x128 (src);
- xmm_alpha = expand_alpha_1x128 (xmm_src);
- mmx_src = _mm_movepi64_pi64 (xmm_src);
- mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
-
- while (height--)
- {
- w = width;
- mask = mask_line;
- dst = dst_line;
- mask_line += mask_stride;
- dst_line += dst_stride;
-
- while (w && ((unsigned long)dst & 15))
- {
- m = *(uint32_t *) mask;
-
- if (m)
- {
- d = *dst;
- mmx_mask = unpack_32_1x64 (m);
- mmx_dest = expand565_16_1x64 (d);
-
- *dst = pack_565_32_16 (
- pack_1x64_32 (
- in_over_1x64 (
- &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
- }
-
- w--;
- dst++;
- mask++;
- }
-
- while (w >= 8)
- {
- /* First round */
- xmm_mask = load_128_unaligned ((__m128i*)mask);
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- pack_cmp = _mm_movemask_epi8 (
- _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
-
- unpack_565_128_4x128 (xmm_dst,
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
- /* preload next round */
- xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
-
- /* preload next round */
- if (pack_cmp != 0xffff)
- {
- in_over_2x128 (&xmm_src, &xmm_src,
- &xmm_alpha, &xmm_alpha,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst0, &xmm_dst1);
- }
-
- /* Second round */
- pack_cmp = _mm_movemask_epi8 (
- _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
-
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
- if (pack_cmp != 0xffff)
- {
- in_over_2x128 (&xmm_src, &xmm_src,
- &xmm_alpha, &xmm_alpha,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst2, &xmm_dst3);
- }
-
- save_128_aligned (
- (__m128i*)dst, pack_565_4x128_128 (
- &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
-
- w -= 8;
- dst += 8;
- mask += 8;
- }
-
- while (w)
- {
- m = *(uint32_t *) mask;
-
- if (m)
- {
- d = *dst;
- mmx_mask = unpack_32_1x64 (m);
- mmx_dest = expand565_16_1x64 (d);
-
- *dst = pack_565_32_16 (
- pack_1x64_32 (
- in_over_1x64 (
- &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
- }
-
- w--;
- dst++;
- mask++;
- }
- }
-
- _mm_empty ();
-}
-
-/* -----------------------------------------------------------------------
- * composite_in_n_8_8
- */
-
-static void
-sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- uint8_t *mask_line, *mask;
- int dst_stride, mask_stride;
- uint32_t d, m;
- uint32_t src;
- uint8_t sa;
- int32_t w;
-
- __m128i xmm_alpha;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- sa = src >> 24;
-
- xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w && ((unsigned long)dst & 15))
- {
- m = (uint32_t) *mask++;
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x64_32 (
- pix_multiply_1x64 (
- pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
- unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
- w--;
- }
-
- while (w >= 16)
- {
- xmm_mask = load_128_unaligned ((__m128i*)mask);
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
- &xmm_dst_lo, &xmm_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- mask += 16;
- dst += 16;
- w -= 16;
- }
-
- while (w)
- {
- m = (uint32_t) *mask++;
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x64_32 (
- pix_multiply_1x64 (
- pix_multiply_1x64 (
- _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
- w--;
- }
- }
-
- _mm_empty ();
-}
-
-/* -----------------------------------------------------------------------
- * composite_in_n_8
- */
-
-static void
-sse2_composite_in_n_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- int dst_stride;
- uint32_t d;
- uint32_t src;
- int32_t w;
-
- __m128i xmm_alpha;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
-
- src = src >> 24;
-
- if (src == 0xff)
- return;
-
- if (src == 0x00)
- {
- pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
- 8, dest_x, dest_y, width, height, src);
-
- return;
- }
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- w = width;
-
- while (w && ((unsigned long)dst & 15))
- {
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x64_32 (
- pix_multiply_1x64 (
- _mm_movepi64_pi64 (xmm_alpha),
- unpack_32_1x64 (d)));
- w--;
- }
-
- while (w >= 16)
- {
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
- &xmm_dst_lo, &xmm_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- dst += 16;
- w -= 16;
- }
-
- while (w)
- {
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x64_32 (
- pix_multiply_1x64 (
- _mm_movepi64_pi64 (xmm_alpha),
- unpack_32_1x64 (d)));
- w--;
- }
- }
-
- _mm_empty ();
-}
-
-/* ---------------------------------------------------------------------------
- * composite_in_8_8
- */
-
-static void
-sse2_composite_in_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- uint8_t *src_line, *src;
- int src_stride, dst_stride;
- int32_t w;
- uint32_t s, d;
-
- __m128i xmm_src, xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w && ((unsigned long)dst & 15))
- {
- s = (uint32_t) *src++;
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (s), unpack_32_1x64 (d)));
- w--;
- }
-
- while (w >= 16)
- {
- xmm_src = load_128_unaligned ((__m128i*)src);
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_dst_lo, &xmm_dst_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- src += 16;
- dst += 16;
- w -= 16;
- }
-
- while (w)
- {
- s = (uint32_t) *src++;
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x64_32 (
- pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
- w--;
- }
- }
-
- _mm_empty ();
-}
-
-/* -------------------------------------------------------------------------
- * composite_add_n_8_8
- */
-
-static void
-sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- uint8_t *mask_line, *mask;
- int dst_stride, mask_stride;
- int32_t w;
- uint32_t src;
- uint8_t sa;
- uint32_t m, d;
-
- __m128i xmm_alpha;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- sa = src >> 24;
-
- xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w && ((unsigned long)dst & 15))
- {
- m = (uint32_t) *mask++;
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x64_32 (
- _mm_adds_pu16 (
- pix_multiply_1x64 (
- _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
- w--;
- }
-
- while (w >= 16)
- {
- xmm_mask = load_128_unaligned ((__m128i*)mask);
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
- &xmm_mask_lo, &xmm_mask_hi,
- &xmm_mask_lo, &xmm_mask_hi);
-
- xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
- xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
- mask += 16;
- dst += 16;
- w -= 16;
- }
-
- while (w)
- {
- m = (uint32_t) *mask++;
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x64_32 (
- _mm_adds_pu16 (
- pix_multiply_1x64 (
- _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
-
- w--;
- }
- }
-
- _mm_empty ();
-}
-
-/* -------------------------------------------------------------------------
- * composite_add_n_8_8
- */
-
-static void
-sse2_composite_add_n_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- int dst_stride;
- int32_t w;
- uint32_t src;
-
- __m128i xmm_src;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- src >>= 24;
-
- if (src == 0x00)
- return;
-
- if (src == 0xff)
- {
- pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
- 8, dest_x, dest_y, width, height, 0xff);
-
- return;
- }
-
- src = (src << 24) | (src << 16) | (src << 8) | src;
- xmm_src = _mm_set_epi32 (src, src, src, src);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- w = width;
-
- while (w && ((unsigned long)dst & 15))
- {
- *dst = (uint8_t)_mm_cvtsi64_si32 (
- _mm_adds_pu8 (
- _mm_movepi64_pi64 (xmm_src),
- _mm_cvtsi32_si64 (*dst)));
-
- w--;
- dst++;
- }
-
- while (w >= 16)
- {
- save_128_aligned (
- (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
-
- dst += 16;
- w -= 16;
- }
-
- while (w)
- {
- *dst = (uint8_t)_mm_cvtsi64_si32 (
- _mm_adds_pu8 (
- _mm_movepi64_pi64 (xmm_src),
- _mm_cvtsi32_si64 (*dst)));
-
- w--;
- dst++;
- }
- }
-
- _mm_empty ();
-}
-
-/* ----------------------------------------------------------------------
- * composite_add_8_8
- */
-
-static void
-sse2_composite_add_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- uint8_t *src_line, *src;
- int dst_stride, src_stride;
- int32_t w;
- uint16_t t;
-
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- src = src_line;
-
- dst_line += dst_stride;
- src_line += src_stride;
- w = width;
-
- /* Small head */
- while (w && (unsigned long)dst & 3)
- {
- t = (*dst) + (*src++);
- *dst++ = t | (0 - (t >> 8));
- w--;
- }
-
- core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
-
- /* Small tail */
- dst += w & 0xfffc;
- src += w & 0xfffc;
-
- w &= 3;
-
- while (w)
- {
- t = (*dst) + (*src++);
- *dst++ = t | (0 - (t >> 8));
- w--;
- }
- }
-
- _mm_empty ();
-}
-
-/* ---------------------------------------------------------------------
- * composite_add_8888_8888
- */
-static void
-sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src;
- int dst_stride, src_stride;
-
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
-
- core_combine_add_u_sse2 (dst, src, NULL, width);
- }
-
- _mm_empty ();
-}
-
-/* -------------------------------------------------------------------------------------------------
- * sse2_composite_copy_area
- */
-
-static pixman_bool_t
-pixman_blt_sse2 (uint32_t *src_bits,
- uint32_t *dst_bits,
- int src_stride,
- int dst_stride,
- int src_bpp,
- int dst_bpp,
- int src_x,
- int src_y,
- int dst_x,
- int dst_y,
- int width,
- int height)
-{
- uint8_t * src_bytes;
- uint8_t * dst_bytes;
- int byte_width;
-
- if (src_bpp != dst_bpp)
- return FALSE;
-
- if (src_bpp == 16)
- {
- src_stride = src_stride * (int) sizeof (uint32_t) / 2;
- dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
- src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
- dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
- byte_width = 2 * width;
- src_stride *= 2;
- dst_stride *= 2;
- }
- else if (src_bpp == 32)
- {
- src_stride = src_stride * (int) sizeof (uint32_t) / 4;
- dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
- src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
- dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
- byte_width = 4 * width;
- src_stride *= 4;
- dst_stride *= 4;
- }
- else
- {
- return FALSE;
- }
-
- while (height--)
- {
- int w;
- uint8_t *s = src_bytes;
- uint8_t *d = dst_bytes;
- src_bytes += src_stride;
- dst_bytes += dst_stride;
- w = byte_width;
-
- while (w >= 2 && ((unsigned long)d & 3))
- {
- *(uint16_t *)d = *(uint16_t *)s;
- w -= 2;
- s += 2;
- d += 2;
- }
-
- while (w >= 4 && ((unsigned long)d & 15))
- {
- *(uint32_t *)d = *(uint32_t *)s;
-
- w -= 4;
- s += 4;
- d += 4;
- }
-
- while (w >= 64)
- {
- __m128i xmm0, xmm1, xmm2, xmm3;
-
- xmm0 = load_128_unaligned ((__m128i*)(s));
- xmm1 = load_128_unaligned ((__m128i*)(s + 16));
- xmm2 = load_128_unaligned ((__m128i*)(s + 32));
- xmm3 = load_128_unaligned ((__m128i*)(s + 48));
-
- save_128_aligned ((__m128i*)(d), xmm0);
- save_128_aligned ((__m128i*)(d + 16), xmm1);
- save_128_aligned ((__m128i*)(d + 32), xmm2);
- save_128_aligned ((__m128i*)(d + 48), xmm3);
-
- s += 64;
- d += 64;
- w -= 64;
- }
-
- while (w >= 16)
- {
- save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
-
- w -= 16;
- d += 16;
- s += 16;
- }
-
- while (w >= 4)
- {
- *(uint32_t *)d = *(uint32_t *)s;
-
- w -= 4;
- s += 4;
- d += 4;
- }
-
- if (w >= 2)
- {
- *(uint16_t *)d = *(uint16_t *)s;
- w -= 2;
- s += 2;
- d += 2;
- }
- }
-
- _mm_empty ();
-
- return TRUE;
-}
-
-static void
-sse2_composite_copy_area (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- pixman_blt_sse2 (src_image->bits.bits,
- dst_image->bits.bits,
- src_image->bits.rowstride,
- dst_image->bits.rowstride,
- PIXMAN_FORMAT_BPP (src_image->bits.format),
- PIXMAN_FORMAT_BPP (dst_image->bits.format),
- src_x, src_y, dest_x, dest_y, width, height);
-}
-
-static void
-sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *src, *src_line, s;
- uint32_t *dst, *dst_line, d;
- uint8_t *mask, *mask_line;
- uint32_t m;
- int src_stride, mask_stride, dst_stride;
- int32_t w;
- __m64 ms;
-
- __m128i xmm_src, xmm_src_lo, xmm_src_hi;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- src = src_line;
- src_line += src_stride;
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
-
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- s = 0xff000000 | *src++;
- m = (uint32_t) *mask++;
- d = *dst;
- ms = unpack_32_1x64 (s);
-
- if (m != 0xff)
- {
- __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
- __m64 md = unpack_32_1x64 (d);
-
- ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
- }
-
- *dst++ = pack_1x64_32 (ms);
- w--;
- }
-
- while (w >= 4)
- {
- m = *(uint32_t*) mask;
- xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
-
- if (m == 0xffffffff)
- {
- save_128_aligned ((__m128i*)dst, xmm_src);
- }
- else
- {
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
-
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
-
- src += 4;
- dst += 4;
- mask += 4;
- w -= 4;
- }
-
- while (w)
- {
- m = (uint32_t) *mask++;
-
- if (m)
- {
- s = 0xff000000 | *src;
-
- if (m == 0xff)
- {
- *dst = s;
- }
- else
- {
- __m64 ma, md, ms;
-
- d = *dst;
-
- ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
- md = unpack_32_1x64 (d);
- ms = unpack_32_1x64 (s);
-
- *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
- }
-
- }
-
- src++;
- dst++;
- w--;
- }
- }
-
- _mm_empty ();
-}
-
-static void
-sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *src, *src_line, s;
- uint32_t *dst, *dst_line, d;
- uint8_t *mask, *mask_line;
- uint32_t m;
- int src_stride, mask_stride, dst_stride;
- int32_t w;
-
- __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- src = src_line;
- src_line += src_stride;
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
-
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- uint32_t sa;
-
- s = *src++;
- m = (uint32_t) *mask++;
- d = *dst;
-
- sa = s >> 24;
-
- if (m)
- {
- if (sa == 0xff && m == 0xff)
- {
- *dst = s;
- }
- else
- {
- __m64 ms, md, ma, msa;
-
- ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
- ms = unpack_32_1x64 (s);
- md = unpack_32_1x64 (d);
-
- msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
-
- *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
- }
- }
-
- dst++;
- w--;
- }
-
- while (w >= 4)
- {
- m = *(uint32_t *) mask;
-
- if (m)
- {
- xmm_src = load_128_unaligned ((__m128i*)src);
-
- if (m == 0xffffffff && is_opaque (xmm_src))
- {
- save_128_aligned ((__m128i *)dst, xmm_src);
- }
- else
- {
- xmm_dst = load_128_aligned ((__m128i *)dst);
-
- xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
-
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
- expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
- &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
- }
-
- src += 4;
- dst += 4;
- mask += 4;
- w -= 4;
- }
-
- while (w)
- {
- uint32_t sa;
-
- s = *src++;
- m = (uint32_t) *mask++;
- d = *dst;
-
- sa = s >> 24;
-
- if (m)
- {
- if (sa == 0xff && m == 0xff)
- {
- *dst = s;
- }
- else
- {
- __m64 ms, md, ma, msa;
-
- ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
- ms = unpack_32_1x64 (s);
- md = unpack_32_1x64 (d);
-
- msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
-
- *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
- }
- }
-
- dst++;
- w--;
- }
- }
-
- _mm_empty ();
-}
-
-static void
-sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src;
- uint32_t *dst_line, *dst;
- __m128i xmm_src;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_dsta_hi, xmm_dsta_lo;
- int dst_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-
- xmm_src = expand_pixel_32_1x128 (src);
-
- while (height--)
- {
- dst = dst_line;
-
- dst_line += dst_stride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- __m64 vd;
-
- vd = unpack_32_1x64 (*dst);
-
- *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
- _mm_movepi64_pi64 (xmm_src)));
- w--;
- dst++;
- }
-
- while (w >= 4)
- {
- __m128i tmp_lo, tmp_hi;
-
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
- expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
-
- tmp_lo = xmm_src;
- tmp_hi = xmm_src;
-
- over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
- &xmm_dsta_lo, &xmm_dsta_hi,
- &tmp_lo, &tmp_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
-
- w -= 4;
- dst += 4;
- }
-
- while (w)
- {
- __m64 vd;
-
- vd = unpack_32_1x64 (*dst);
-
- *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
- _mm_movepi64_pi64 (xmm_src)));
- w--;
- dst++;
- }
-
- }
-
- _mm_empty ();
-}
-
-static void
-sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *src, *src_line, s;
- uint32_t *dst, *dst_line, d;
- uint32_t *mask, *mask_line;
- uint32_t m;
- int src_stride, mask_stride, dst_stride;
- int32_t w;
-
- __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
- __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- src = src_line;
- src_line += src_stride;
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
-
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- uint32_t sa;
-
- s = *src++;
- m = (*mask++) >> 24;
- d = *dst;
-
- sa = s >> 24;
-
- if (m)
- {
- if (sa == 0xff && m == 0xff)
- {
- *dst = s;
- }
- else
- {
- __m64 ms, md, ma, msa;
-
- ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
- ms = unpack_32_1x64 (s);
- md = unpack_32_1x64 (d);
-
- msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
-
- *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
- }
- }
-
- dst++;
- w--;
- }
-
- while (w >= 4)
- {
- xmm_mask = load_128_unaligned ((__m128i*)mask);
-
- if (!is_transparent (xmm_mask))
- {
- xmm_src = load_128_unaligned ((__m128i*)src);
-
- if (is_opaque (xmm_mask) && is_opaque (xmm_src))
- {
- save_128_aligned ((__m128i *)dst, xmm_src);
- }
- else
- {
- xmm_dst = load_128_aligned ((__m128i *)dst);
-
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
- expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
- in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
- &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
- }
-
- src += 4;
- dst += 4;
- mask += 4;
- w -= 4;
- }
-
- while (w)
- {
- uint32_t sa;
-
- s = *src++;
- m = (*mask++) >> 24;
- d = *dst;
-
- sa = s >> 24;
-
- if (m)
- {
- if (sa == 0xff && m == 0xff)
- {
- *dst = s;
- }
- else
- {
- __m64 ms, md, ma, msa;
-
- ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
- ms = unpack_32_1x64 (s);
- md = unpack_32_1x64 (d);
-
- msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
-
- *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
- }
- }
-
- dst++;
- w--;
- }
- }
-
- _mm_empty ();
-}
-
-/* A variant of 'core_combine_over_u_sse2' with minor tweaks */
-static force_inline void
-scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
- const uint32_t* ps,
- int32_t w,
- pixman_fixed_t vx,
- pixman_fixed_t unit_x,
- pixman_fixed_t max_vx)
-{
- uint32_t s, d;
- const uint32_t* pm = NULL;
-
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_alpha_lo, xmm_alpha_hi;
-
- /* Align dst on a 16-byte boundary */
- while (w && ((unsigned long)pd & 15))
- {
- d = *pd;
- s = combine1 (ps + (vx >> 16), pm);
- vx += unit_x;
-
- *pd++ = core_combine_over_u_pixel_sse2 (s, d);
- if (pm)
- pm++;
- w--;
- }
-
- while (w >= 4)
- {
- __m128i tmp;
- uint32_t tmp1, tmp2, tmp3, tmp4;
-
- tmp1 = ps[vx >> 16];
- vx += unit_x;
- tmp2 = ps[vx >> 16];
- vx += unit_x;
- tmp3 = ps[vx >> 16];
- vx += unit_x;
- tmp4 = ps[vx >> 16];
- vx += unit_x;
-
- tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
-
- xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
-
- if (is_opaque (xmm_src_hi))
- {
- save_128_aligned ((__m128i*)pd, xmm_src_hi);
- }
- else if (!is_zero (xmm_src_hi))
- {
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
- expand_alpha_2x128 (
- xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
-
- over_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_dst_lo, &xmm_dst_hi);
-
- /* rebuid the 4 pixel data and save*/
- save_128_aligned ((__m128i*)pd,
- pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- }
-
- w -= 4;
- pd += 4;
- if (pm)
- pm += 4;
- }
-
- while (w)
- {
- d = *pd;
- s = combine1 (ps + (vx >> 16), pm);
- vx += unit_x;
-
- *pd++ = core_combine_over_u_pixel_sse2 (s, d);
- if (pm)
- pm++;
-
- w--;
- }
- _mm_empty ();
-}
-
-FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
- scaled_nearest_scanline_sse2_8888_8888_OVER,
- uint32_t, uint32_t, COVER);
-FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
- scaled_nearest_scanline_sse2_8888_8888_OVER,
- uint32_t, uint32_t, NONE);
-FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
- scaled_nearest_scanline_sse2_8888_8888_OVER,
- uint32_t, uint32_t, PAD);
-
-static const pixman_fast_path_t sse2_fast_paths[] =
-{
- /* PIXMAN_OP_OVER */
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
- PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
- PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
- PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
- PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
- PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
- PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
- PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
- PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
- PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
-
- /* PIXMAN_OP_OVER_REVERSE */
- PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
- PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
-
- /* PIXMAN_OP_ADD */
- PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
- PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
- PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
- PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
- PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
-
- /* PIXMAN_OP_SRC */
- PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
- PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
- PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
- PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
- PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
- PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
- PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
- PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
- PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
- PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
- PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
- PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
- PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
- PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
-
- /* PIXMAN_OP_IN */
- PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
- PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
- PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
-
- SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
-
- { PIXMAN_OP_NONE },
-};
-
-static pixman_bool_t
-sse2_blt (pixman_implementation_t *imp,
- uint32_t * src_bits,
- uint32_t * dst_bits,
- int src_stride,
- int dst_stride,
- int src_bpp,
- int dst_bpp,
- int src_x,
- int src_y,
- int dst_x,
- int dst_y,
- int width,
- int height)
-{
- if (!pixman_blt_sse2 (
- src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
- src_x, src_y, dst_x, dst_y, width, height))
-
- {
- return _pixman_implementation_blt (
- imp->delegate,
- src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
- src_x, src_y, dst_x, dst_y, width, height);
- }
-
- return TRUE;
-}
-
-#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
-__attribute__((__force_align_arg_pointer__))
-#endif
-static pixman_bool_t
-sse2_fill (pixman_implementation_t *imp,
- uint32_t * bits,
- int stride,
- int bpp,
- int x,
- int y,
- int width,
- int height,
- uint32_t xor)
-{
- if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
- {
- return _pixman_implementation_fill (
- imp->delegate, bits, stride, bpp, x, y, width, height, xor);
- }
-
- return TRUE;
-}
-
-#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
-__attribute__((__force_align_arg_pointer__))
-#endif
-pixman_implementation_t *
-_pixman_implementation_create_sse2 (void)
-{
-#ifdef USE_MMX
- pixman_implementation_t *fallback = _pixman_implementation_create_mmx ();
-#else
- pixman_implementation_t *fallback = _pixman_implementation_create_fast_path ();
-#endif
- pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
-
- /* SSE2 constants */
- mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
- mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
- mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
- mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
- mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
- mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
- mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
- mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
- mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
- mask_0080 = create_mask_16_128 (0x0080);
- mask_00ff = create_mask_16_128 (0x00ff);
- mask_0101 = create_mask_16_128 (0x0101);
- mask_ffff = create_mask_16_128 (0xffff);
- mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
- mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
-
- /* MMX constants */
- mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
- mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
-
- mask_x0080 = create_mask_16_64 (0x0080);
- mask_x00ff = create_mask_16_64 (0x00ff);
- mask_x0101 = create_mask_16_64 (0x0101);
- mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
-
- _mm_empty ();
-
- /* Set up function pointers */
-
- /* SSE code patch for fbcompose.c */
- imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
- imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
- imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
- imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
- imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
- imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
- imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
- imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
- imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
- imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
-
- imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
-
- imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
- imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
- imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
- imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
- imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
- imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
- imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
- imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
- imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
- imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
- imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
-
- imp->blt = sse2_blt;
- imp->fill = sse2_fill;
-
- return imp;
-}
-
-#endif /* USE_SSE2 */
+/*
+ * Copyright © 2008 Rodrigo Kumpera
+ * Copyright © 2008 André Tupinambá
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission. Red Hat makes no representations about the
+ * suitability of this software for any purpose. It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Rodrigo Kumpera (kumpera@gmail.com)
+ * André Tupinambá (andrelrt@gmail.com)
+ *
+ * Based on work by Owen Taylor and Søren Sandmann
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <mmintrin.h>
+#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
+#include <emmintrin.h> /* for SSE2 intrinsics */
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include "pixman-fast-path.h"
+
+#if defined(_MSC_VER) && defined(_M_AMD64)
+/* Windows 64 doesn't allow MMX to be used, so
+ * the pixman-x64-mmx-emulation.h file contains
+ * implementations of those MMX intrinsics that
+ * are used in the SSE2 implementation.
+ */
+# include "pixman-x64-mmx-emulation.h"
+#endif
+
+#ifdef USE_SSE2
+
+/* --------------------------------------------------------------------
+ * Locals
+ */
+
+static __m64 mask_x0080;
+static __m64 mask_x00ff;
+static __m64 mask_x0101;
+static __m64 mask_x_alpha;
+
+static __m64 mask_x565_rgb;
+static __m64 mask_x565_unpack;
+
+static __m128i mask_0080;
+static __m128i mask_00ff;
+static __m128i mask_0101;
+static __m128i mask_ffff;
+static __m128i mask_ff000000;
+static __m128i mask_alpha;
+
+static __m128i mask_565_r;
+static __m128i mask_565_g1, mask_565_g2;
+static __m128i mask_565_b;
+static __m128i mask_red;
+static __m128i mask_green;
+static __m128i mask_blue;
+
+static __m128i mask_565_fix_rb;
+static __m128i mask_565_fix_g;
+
+/* ----------------------------------------------------------------------
+ * SSE2 Inlines
+ */
+static force_inline __m128i
+unpack_32_1x128 (uint32_t data)
+{
+ return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
+}
+
+static force_inline void
+unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
+{
+ *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
+ *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
+}
+
+static force_inline __m128i
+unpack_565_to_8888 (__m128i lo)
+{
+ __m128i r, g, b, rb, t;
+
+ r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
+ g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
+ b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
+
+ rb = _mm_or_si128 (r, b);
+ t = _mm_and_si128 (rb, mask_565_fix_rb);
+ t = _mm_srli_epi32 (t, 5);
+ rb = _mm_or_si128 (rb, t);
+
+ t = _mm_and_si128 (g, mask_565_fix_g);
+ t = _mm_srli_epi32 (t, 6);
+ g = _mm_or_si128 (g, t);
+
+ return _mm_or_si128 (rb, g);
+}
+
+static force_inline void
+unpack_565_128_4x128 (__m128i data,
+ __m128i* data0,
+ __m128i* data1,
+ __m128i* data2,
+ __m128i* data3)
+{
+ __m128i lo, hi;
+
+ lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
+ hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
+
+ lo = unpack_565_to_8888 (lo);
+ hi = unpack_565_to_8888 (hi);
+
+ unpack_128_2x128 (lo, data0, data1);
+ unpack_128_2x128 (hi, data2, data3);
+}
+
+static force_inline uint16_t
+pack_565_32_16 (uint32_t pixel)
+{
+ return (uint16_t) (((pixel >> 8) & 0xf800) |
+ ((pixel >> 5) & 0x07e0) |
+ ((pixel >> 3) & 0x001f));
+}
+
+static force_inline __m128i
+pack_2x128_128 (__m128i lo, __m128i hi)
+{
+ return _mm_packus_epi16 (lo, hi);
+}
+
+static force_inline __m128i
+pack_565_2x128_128 (__m128i lo, __m128i hi)
+{
+ __m128i data;
+ __m128i r, g1, g2, b;
+
+ data = pack_2x128_128 (lo, hi);
+
+ r = _mm_and_si128 (data, mask_565_r);
+ g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
+ g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
+ b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
+
+ return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
+}
+
+static force_inline __m128i
+pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
+{
+ return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
+ pack_565_2x128_128 (*xmm2, *xmm3));
+}
+
+static force_inline int
+is_opaque (__m128i x)
+{
+ __m128i ffs = _mm_cmpeq_epi8 (x, x);
+
+ return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
+}
+
+static force_inline int
+is_zero (__m128i x)
+{
+ return _mm_movemask_epi8 (
+ _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
+}
+
+static force_inline int
+is_transparent (__m128i x)
+{
+ return (_mm_movemask_epi8 (
+ _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
+}
+
+static force_inline __m128i
+expand_pixel_32_1x128 (uint32_t data)
+{
+ return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
+}
+
+static force_inline __m128i
+expand_alpha_1x128 (__m128i data)
+{
+ return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
+ _MM_SHUFFLE (3, 3, 3, 3)),
+ _MM_SHUFFLE (3, 3, 3, 3));
+}
+
+static force_inline void
+expand_alpha_2x128 (__m128i data_lo,
+ __m128i data_hi,
+ __m128i* alpha_lo,
+ __m128i* alpha_hi)
+{
+ __m128i lo, hi;
+
+ lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
+ hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
+
+ *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
+ *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
+}
+
+static force_inline void
+expand_alpha_rev_2x128 (__m128i data_lo,
+ __m128i data_hi,
+ __m128i* alpha_lo,
+ __m128i* alpha_hi)
+{
+ __m128i lo, hi;
+
+ lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
+ hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
+ *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
+ *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
+}
+
+static force_inline void
+pix_multiply_2x128 (__m128i* data_lo,
+ __m128i* data_hi,
+ __m128i* alpha_lo,
+ __m128i* alpha_hi,
+ __m128i* ret_lo,
+ __m128i* ret_hi)
+{
+ __m128i lo, hi;
+
+ lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
+ hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
+ lo = _mm_adds_epu16 (lo, mask_0080);
+ hi = _mm_adds_epu16 (hi, mask_0080);
+ *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
+ *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
+}
+
+static force_inline void
+pix_add_multiply_2x128 (__m128i* src_lo,
+ __m128i* src_hi,
+ __m128i* alpha_dst_lo,
+ __m128i* alpha_dst_hi,
+ __m128i* dst_lo,
+ __m128i* dst_hi,
+ __m128i* alpha_src_lo,
+ __m128i* alpha_src_hi,
+ __m128i* ret_lo,
+ __m128i* ret_hi)
+{
+ __m128i t1_lo, t1_hi;
+ __m128i t2_lo, t2_hi;
+
+ pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
+ pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
+
+ *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
+ *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
+}
+
+static force_inline void
+negate_2x128 (__m128i data_lo,
+ __m128i data_hi,
+ __m128i* neg_lo,
+ __m128i* neg_hi)
+{
+ *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
+ *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
+}
+
+static force_inline void
+invert_colors_2x128 (__m128i data_lo,
+ __m128i data_hi,
+ __m128i* inv_lo,
+ __m128i* inv_hi)
+{
+ __m128i lo, hi;
+
+ lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
+ hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
+ *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
+ *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
+}
+
+static force_inline void
+over_2x128 (__m128i* src_lo,
+ __m128i* src_hi,
+ __m128i* alpha_lo,
+ __m128i* alpha_hi,
+ __m128i* dst_lo,
+ __m128i* dst_hi)
+{
+ __m128i t1, t2;
+
+ negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
+
+ pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
+
+ *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
+ *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
+}
+
+static force_inline void
+over_rev_non_pre_2x128 (__m128i src_lo,
+ __m128i src_hi,
+ __m128i* dst_lo,
+ __m128i* dst_hi)
+{
+ __m128i lo, hi;
+ __m128i alpha_lo, alpha_hi;
+
+ expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
+
+ lo = _mm_or_si128 (alpha_lo, mask_alpha);
+ hi = _mm_or_si128 (alpha_hi, mask_alpha);
+
+ invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
+
+ pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
+
+ over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
+}
+
+static force_inline void
+in_over_2x128 (__m128i* src_lo,
+ __m128i* src_hi,
+ __m128i* alpha_lo,
+ __m128i* alpha_hi,
+ __m128i* mask_lo,
+ __m128i* mask_hi,
+ __m128i* dst_lo,
+ __m128i* dst_hi)
+{
+ __m128i s_lo, s_hi;
+ __m128i a_lo, a_hi;
+
+ pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
+ pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
+
+ over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
+}
+
+/* load 4 pixels from a 16-byte boundary aligned address */
+static force_inline __m128i
+load_128_aligned (__m128i* src)
+{
+ return _mm_load_si128 (src);
+}
+
+/* load 4 pixels from a unaligned address */
+static force_inline __m128i
+load_128_unaligned (const __m128i* src)
+{
+ return _mm_loadu_si128 (src);
+}
+
+/* save 4 pixels using Write Combining memory on a 16-byte
+ * boundary aligned address
+ */
+static force_inline void
+save_128_write_combining (__m128i* dst,
+ __m128i data)
+{
+ _mm_stream_si128 (dst, data);
+}
+
+/* save 4 pixels on a 16-byte boundary aligned address */
+static force_inline void
+save_128_aligned (__m128i* dst,
+ __m128i data)
+{
+ _mm_store_si128 (dst, data);
+}
+
+/* save 4 pixels on a unaligned address */
+static force_inline void
+save_128_unaligned (__m128i* dst,
+ __m128i data)
+{
+ _mm_storeu_si128 (dst, data);
+}
+
+/* ------------------------------------------------------------------
+ * MMX inlines
+ */
+
+static force_inline __m64
+load_32_1x64 (uint32_t data)
+{
+ return _mm_cvtsi32_si64 (data);
+}
+
+static force_inline __m64
+unpack_32_1x64 (uint32_t data)
+{
+ return _mm_unpacklo_pi8 (load_32_1x64 (data), _mm_setzero_si64 ());
+}
+
+static force_inline __m64
+expand_alpha_1x64 (__m64 data)
+{
+ return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
+}
+
+static force_inline __m64
+expand_alpha_rev_1x64 (__m64 data)
+{
+ return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
+}
+
+static force_inline __m64
+expand_pixel_8_1x64 (uint8_t data)
+{
+ return _mm_shuffle_pi16 (
+ unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
+}
+
+static force_inline __m64
+pix_multiply_1x64 (__m64 data,
+ __m64 alpha)
+{
+ return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
+ mask_x0080),
+ mask_x0101);
+}
+
+static force_inline __m64
+pix_add_multiply_1x64 (__m64* src,
+ __m64* alpha_dst,
+ __m64* dst,
+ __m64* alpha_src)
+{
+ __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
+ __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);
+
+ return _mm_adds_pu8 (t1, t2);
+}
+
+static force_inline __m64
+negate_1x64 (__m64 data)
+{
+ return _mm_xor_si64 (data, mask_x00ff);
+}
+
+static force_inline __m64
+invert_colors_1x64 (__m64 data)
+{
+ return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
+}
+
+static force_inline __m64
+over_1x64 (__m64 src, __m64 alpha, __m64 dst)
+{
+ return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
+}
+
+static force_inline __m64
+in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
+{
+ return over_1x64 (pix_multiply_1x64 (*src, *mask),
+ pix_multiply_1x64 (*alpha, *mask),
+ *dst);
+}
+
+static force_inline __m64
+over_rev_non_pre_1x64 (__m64 src, __m64 dst)
+{
+ __m64 alpha = expand_alpha_1x64 (src);
+
+ return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
+ _mm_or_si64 (alpha, mask_x_alpha)),
+ alpha,
+ dst);
+}
+
+static force_inline uint32_t
+pack_1x64_32 (__m64 data)
+{
+ return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
+}
+
+/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
+ *
+ * 00RR00GG00BB
+ *
+ * --- Expanding 565 in the low word ---
+ *
+ * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
+ * m = m & (01f0003f001f);
+ * m = m * (008404100840);
+ * m = m >> 8;
+ *
+ * Note the trick here - the top word is shifted by another nibble to
+ * avoid it bumping into the middle word
+ */
+static force_inline __m64
+expand565_16_1x64 (uint16_t pixel)
+{
+ __m64 p;
+ __m64 t1, t2;
+
+ p = _mm_cvtsi32_si64 ((uint32_t) pixel);
+
+ t1 = _mm_slli_si64 (p, 36 - 11);
+ t2 = _mm_slli_si64 (p, 16 - 5);
+
+ p = _mm_or_si64 (t1, p);
+ p = _mm_or_si64 (t2, p);
+ p = _mm_and_si64 (p, mask_x565_rgb);
+ p = _mm_mullo_pi16 (p, mask_x565_unpack);
+
+ return _mm_srli_pi16 (p, 8);
+}
+
+/* ----------------------------------------------------------------------------
+ * Compose Core transformations
+ */
+static force_inline uint32_t
+core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
+{
+ uint8_t a;
+ __m64 ms;
+
+ a = src >> 24;
+
+ if (a == 0xff)
+ {
+ return src;
+ }
+ else if (src)
+ {
+ ms = unpack_32_1x64 (src);
+ return pack_1x64_32 (
+ over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
+ }
+
+ return dst;
+}
+
+static force_inline uint32_t
+combine1 (const uint32_t *ps, const uint32_t *pm)
+{
+ uint32_t s = *ps;
+
+ if (pm)
+ {
+ __m64 ms, mm;
+
+ mm = unpack_32_1x64 (*pm);
+ mm = expand_alpha_1x64 (mm);
+
+ ms = unpack_32_1x64 (s);
+ ms = pix_multiply_1x64 (ms, mm);
+
+ s = pack_1x64_32 (ms);
+ }
+
+ return s;
+}
+
+static force_inline __m128i
+combine4 (const __m128i *ps, const __m128i *pm)
+{
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_msk_lo, xmm_msk_hi;
+ __m128i s;
+
+ if (pm)
+ {
+ xmm_msk_lo = load_128_unaligned (pm);
+
+ if (is_transparent (xmm_msk_lo))
+ return _mm_setzero_si128 ();
+ }
+
+ s = load_128_unaligned (ps);
+
+ if (pm)
+ {
+ unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
+
+ expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_msk_lo, &xmm_msk_hi,
+ &xmm_src_lo, &xmm_src_hi);
+
+ s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
+ }
+
+ return s;
+}
+
+static force_inline void
+core_combine_over_u_sse2 (uint32_t* pd,
+ const uint32_t* ps,
+ const uint32_t* pm,
+ int w)
+{
+ uint32_t s, d;
+
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+ /* Align dst on a 16-byte boundary */
+ while (w && ((unsigned long)pd & 15))
+ {
+ d = *pd;
+ s = combine1 (ps, pm);
+
+ *pd++ = core_combine_over_u_pixel_sse2 (s, d);
+ ps++;
+ if (pm)
+ pm++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ /* I'm loading unaligned because I'm not sure about
+ * the address alignment.
+ */
+ xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+
+ if (is_opaque (xmm_src_hi))
+ {
+ save_128_aligned ((__m128i*)pd, xmm_src_hi);
+ }
+ else if (!is_zero (xmm_src_hi))
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (
+ xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+
+ over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ /* rebuid the 4 pixel data and save*/
+ save_128_aligned ((__m128i*)pd,
+ pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ w -= 4;
+ ps += 4;
+ pd += 4;
+ if (pm)
+ pm += 4;
+ }
+
+ while (w)
+ {
+ d = *pd;
+ s = combine1 (ps, pm);
+
+ *pd++ = core_combine_over_u_pixel_sse2 (s, d);
+ ps++;
+ if (pm)
+ pm++;
+
+ w--;
+ }
+}
+
+static force_inline void
+core_combine_over_reverse_u_sse2 (uint32_t* pd,
+ const uint32_t* ps,
+ const uint32_t* pm,
+ int w)
+{
+ uint32_t s, d;
+
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+ /* Align dst on a 16-byte boundary */
+ while (w &&
+ ((unsigned long)pd & 15))
+ {
+ d = *pd;
+ s = combine1 (ps, pm);
+
+ *pd++ = core_combine_over_u_pixel_sse2 (d, s);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
+ }
+
+ while (w >= 4)
+ {
+ /* I'm loading unaligned because I'm not sure
+ * about the address alignment.
+ */
+ xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_src_lo, &xmm_src_hi);
+
+ /* rebuid the 4 pixel data and save*/
+ save_128_aligned ((__m128i*)pd,
+ pack_2x128_128 (xmm_src_lo, xmm_src_hi));
+
+ w -= 4;
+ ps += 4;
+ pd += 4;
+
+ if (pm)
+ pm += 4;
+ }
+
+ while (w)
+ {
+ d = *pd;
+ s = combine1 (ps, pm);
+
+ *pd++ = core_combine_over_u_pixel_sse2 (d, s);
+ ps++;
+ w--;
+ if (pm)
+ pm++;
+ }
+}
+
+static force_inline uint32_t
+core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
+{
+ uint32_t maska = src >> 24;
+
+ if (maska == 0)
+ {
+ return 0;
+ }
+ else if (maska != 0xff)
+ {
+ return pack_1x64_32 (
+ pix_multiply_1x64 (unpack_32_1x64 (dst),
+ expand_alpha_1x64 (unpack_32_1x64 (src))));
+ }
+
+ return dst;
+}
+
+static force_inline void
+core_combine_in_u_sse2 (uint32_t* pd,
+ const uint32_t* ps,
+ const uint32_t* pm,
+ int w)
+{
+ uint32_t s, d;
+
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+
+ while (w && ((unsigned long) pd & 15))
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_in_u_pixelsse2 (d, s);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+ xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned ((__m128i*)pd,
+ pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ w -= 4;
+ if (pm)
+ pm += 4;
+ }
+
+ while (w)
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_in_u_pixelsse2 (d, s);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
+ }
+}
+
+static force_inline void
+core_combine_reverse_in_u_sse2 (uint32_t* pd,
+ const uint32_t* ps,
+ const uint32_t *pm,
+ int w)
+{
+ uint32_t s, d;
+
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+
+ while (w && ((unsigned long) pd & 15))
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_in_u_pixelsse2 (s, d);
+ ps++;
+ w--;
+ if (pm)
+ pm++;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+ xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_src_lo, &xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ w -= 4;
+ if (pm)
+ pm += 4;
+ }
+
+ while (w)
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_in_u_pixelsse2 (s, d);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
+ }
+}
+
+static force_inline void
+core_combine_reverse_out_u_sse2 (uint32_t* pd,
+ const uint32_t* ps,
+ const uint32_t* pm,
+ int w)
+{
+ while (w && ((unsigned long) pd & 15))
+ {
+ uint32_t s = combine1 (ps, pm);
+ uint32_t d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (
+ unpack_32_1x64 (d), negate_1x64 (
+ expand_alpha_1x64 (unpack_32_1x64 (s)))));
+
+ if (pm)
+ pm++;
+ ps++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+
+ xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+
+ pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_src_lo, &xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ if (pm)
+ pm += 4;
+
+ w -= 4;
+ }
+
+ while (w)
+ {
+ uint32_t s = combine1 (ps, pm);
+ uint32_t d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (
+ unpack_32_1x64 (d), negate_1x64 (
+ expand_alpha_1x64 (unpack_32_1x64 (s)))));
+ ps++;
+ if (pm)
+ pm++;
+ w--;
+ }
+}
+
+static force_inline void
+core_combine_out_u_sse2 (uint32_t* pd,
+ const uint32_t* ps,
+ const uint32_t* pm,
+ int w)
+{
+ while (w && ((unsigned long) pd & 15))
+ {
+ uint32_t s = combine1 (ps, pm);
+ uint32_t d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (
+ unpack_32_1x64 (s), negate_1x64 (
+ expand_alpha_1x64 (unpack_32_1x64 (d)))));
+ w--;
+ ps++;
+ if (pm)
+ pm++;
+ }
+
+ while (w >= 4)
+ {
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+
+ xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ w -= 4;
+ if (pm)
+ pm += 4;
+ }
+
+ while (w)
+ {
+ uint32_t s = combine1 (ps, pm);
+ uint32_t d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (
+ unpack_32_1x64 (s), negate_1x64 (
+ expand_alpha_1x64 (unpack_32_1x64 (d)))));
+ w--;
+ ps++;
+ if (pm)
+ pm++;
+ }
+}
+
+static force_inline uint32_t
+core_combine_atop_u_pixel_sse2 (uint32_t src,
+ uint32_t dst)
+{
+ __m64 s = unpack_32_1x64 (src);
+ __m64 d = unpack_32_1x64 (dst);
+
+ __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
+ __m64 da = expand_alpha_1x64 (d);
+
+ return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
+}
+
+static force_inline void
+core_combine_atop_u_sse2 (uint32_t* pd,
+ const uint32_t* ps,
+ const uint32_t* pm,
+ int w)
+{
+ uint32_t s, d;
+
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+ __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+
+ while (w && ((unsigned long) pd & 15))
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
+ }
+
+ while (w >= 4)
+ {
+ xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+
+ pix_add_multiply_2x128 (
+ &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ w -= 4;
+ if (pm)
+ pm += 4;
+ }
+
+ while (w)
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
+ }
+}
+
+static force_inline uint32_t
+core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
+ uint32_t dst)
+{
+ __m64 s = unpack_32_1x64 (src);
+ __m64 d = unpack_32_1x64 (dst);
+
+ __m64 sa = expand_alpha_1x64 (s);
+ __m64 da = negate_1x64 (expand_alpha_1x64 (d));
+
+ return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
+}
+
+static force_inline void
+core_combine_reverse_atop_u_sse2 (uint32_t* pd,
+ const uint32_t* ps,
+ const uint32_t* pm,
+ int w)
+{
+ uint32_t s, d;
+
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+ __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+
+ while (w && ((unsigned long) pd & 15))
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
+ ps++;
+ w--;
+ if (pm)
+ pm++;
+ }
+
+ while (w >= 4)
+ {
+ xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ pix_add_multiply_2x128 (
+ &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ w -= 4;
+ if (pm)
+ pm += 4;
+ }
+
+ while (w)
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
+ ps++;
+ w--;
+ if (pm)
+ pm++;
+ }
+}
+
+static force_inline uint32_t
+core_combine_xor_u_pixel_sse2 (uint32_t src,
+ uint32_t dst)
+{
+ __m64 s = unpack_32_1x64 (src);
+ __m64 d = unpack_32_1x64 (dst);
+
+ __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
+ __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
+
+ return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
+}
+
+static force_inline void
+core_combine_xor_u_sse2 (uint32_t* dst,
+ const uint32_t* src,
+ const uint32_t *mask,
+ int width)
+{
+ int w = width;
+ uint32_t s, d;
+ uint32_t* pd = dst;
+ const uint32_t* ps = src;
+ const uint32_t* pm = mask;
+
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+ __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+
+ while (w && ((unsigned long) pd & 15))
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
+ }
+
+ while (w >= 4)
+ {
+ xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
+ xmm_dst = load_128_aligned ((__m128i*) pd);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ pix_add_multiply_2x128 (
+ &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ w -= 4;
+ if (pm)
+ pm += 4;
+ }
+
+ while (w)
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
+ }
+}
+
+static force_inline void
+core_combine_add_u_sse2 (uint32_t* dst,
+ const uint32_t* src,
+ const uint32_t* mask,
+ int width)
+{
+ int w = width;
+ uint32_t s, d;
+ uint32_t* pd = dst;
+ const uint32_t* ps = src;
+ const uint32_t* pm = mask;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ ps++;
+ if (pm)
+ pm++;
+ *pd++ = _mm_cvtsi64_si32 (
+ _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ __m128i s;
+
+ s = combine4 ((__m128i*)ps, (__m128i*)pm);
+
+ save_128_aligned (
+ (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));
+
+ pd += 4;
+ ps += 4;
+ if (pm)
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w--)
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ ps++;
+ *pd++ = _mm_cvtsi64_si32 (
+ _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
+ if (pm)
+ pm++;
+ }
+}
+
+static force_inline uint32_t
+core_combine_saturate_u_pixel_sse2 (uint32_t src,
+ uint32_t dst)
+{
+ __m64 ms = unpack_32_1x64 (src);
+ __m64 md = unpack_32_1x64 (dst);
+ uint32_t sa = src >> 24;
+ uint32_t da = ~dst >> 24;
+
+ if (sa > da)
+ {
+ ms = pix_multiply_1x64 (
+ ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
+ }
+
+ return pack_1x64_32 (_mm_adds_pu16 (md, ms));
+}
+
+static force_inline void
+core_combine_saturate_u_sse2 (uint32_t * pd,
+ const uint32_t *ps,
+ const uint32_t *pm,
+ int w)
+{
+ uint32_t s, d;
+
+ uint32_t pack_cmp;
+ __m128i xmm_src, xmm_dst;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+ w--;
+ ps++;
+ if (pm)
+ pm++;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst = load_128_aligned ((__m128i*)pd);
+ xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
+
+ pack_cmp = _mm_movemask_epi8 (
+ _mm_cmpgt_epi32 (
+ _mm_srli_epi32 (xmm_src, 24),
+ _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
+
+ /* if some alpha src is grater than respective ~alpha dst */
+ if (pack_cmp)
+ {
+ s = combine1 (ps++, pm);
+ d = *pd;
+ *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+ if (pm)
+ pm++;
+
+ s = combine1 (ps++, pm);
+ d = *pd;
+ *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+ if (pm)
+ pm++;
+
+ s = combine1 (ps++, pm);
+ d = *pd;
+ *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+ if (pm)
+ pm++;
+
+ s = combine1 (ps++, pm);
+ d = *pd;
+ *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+ if (pm)
+ pm++;
+ }
+ else
+ {
+ save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
+
+ pd += 4;
+ ps += 4;
+ if (pm)
+ pm += 4;
+ }
+
+ w -= 4;
+ }
+
+ while (w--)
+ {
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+ ps++;
+ if (pm)
+ pm++;
+ }
+}
+
+static force_inline void
+core_combine_src_ca_sse2 (uint32_t* pd,
+ const uint32_t* ps,
+ const uint32_t *pm,
+ int w)
+{
+ uint32_t s, m;
+
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ s = *ps++;
+ m = *pm++;
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ s = *ps++;
+ m = *pm++;
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
+ w--;
+ }
+}
+
+static force_inline uint32_t
+core_combine_over_ca_pixel_sse2 (uint32_t src,
+ uint32_t mask,
+ uint32_t dst)
+{
+ __m64 s = unpack_32_1x64 (src);
+ __m64 expAlpha = expand_alpha_1x64 (s);
+ __m64 unpk_mask = unpack_32_1x64 (mask);
+ __m64 unpk_dst = unpack_32_1x64 (dst);
+
+ return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
+}
+
+static force_inline void
+core_combine_over_ca_sse2 (uint32_t* pd,
+ const uint32_t* ps,
+ const uint32_t *pm,
+ int w)
+{
+ uint32_t s, m, d;
+
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
+ w--;
+ }
+}
+
+static force_inline uint32_t
+core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
+ uint32_t mask,
+ uint32_t dst)
+{
+ __m64 d = unpack_32_1x64 (dst);
+
+ return pack_1x64_32 (
+ over_1x64 (d, expand_alpha_1x64 (d),
+ pix_multiply_1x64 (unpack_32_1x64 (src),
+ unpack_32_1x64 (mask))));
+}
+
+static force_inline void
+core_combine_over_reverse_ca_sse2 (uint32_t* pd,
+ const uint32_t* ps,
+ const uint32_t *pm,
+ int w)
+{
+ uint32_t s, m, d;
+
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
+ w--;
+ }
+}
+
+static force_inline void
+core_combine_in_ca_sse2 (uint32_t * pd,
+ const uint32_t *ps,
+ const uint32_t *pm,
+ int w)
+{
+ uint32_t s, m, d;
+
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (
+ pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
+ expand_alpha_1x64 (unpack_32_1x64 (d))));
+
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (
+ pix_multiply_1x64 (
+ unpack_32_1x64 (s), unpack_32_1x64 (m)),
+ expand_alpha_1x64 (unpack_32_1x64 (d))));
+
+ w--;
+ }
+}
+
+static force_inline void
+core_combine_in_reverse_ca_sse2 (uint32_t * pd,
+ const uint32_t *ps,
+ const uint32_t *pm,
+ int w)
+{
+ uint32_t s, m, d;
+
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (
+ unpack_32_1x64 (d),
+ pix_multiply_1x64 (unpack_32_1x64 (m),
+ expand_alpha_1x64 (unpack_32_1x64 (s)))));
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+ pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (
+ unpack_32_1x64 (d),
+ pix_multiply_1x64 (unpack_32_1x64 (m),
+ expand_alpha_1x64 (unpack_32_1x64 (s)))));
+ w--;
+ }
+}
+
+static force_inline void
+core_combine_out_ca_sse2 (uint32_t * pd,
+ const uint32_t *ps,
+ const uint32_t *pm,
+ int w)
+{
+ uint32_t s, m, d;
+
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (
+ pix_multiply_1x64 (
+ unpack_32_1x64 (s), unpack_32_1x64 (m)),
+ negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+ negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+ pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (
+ pix_multiply_1x64 (
+ unpack_32_1x64 (s), unpack_32_1x64 (m)),
+ negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
+
+ w--;
+ }
+}
+
+static force_inline void
+core_combine_out_reverse_ca_sse2 (uint32_t * pd,
+ const uint32_t *ps,
+ const uint32_t *pm,
+ int w)
+{
+ uint32_t s, m, d;
+
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (
+ unpack_32_1x64 (d),
+ negate_1x64 (pix_multiply_1x64 (
+ unpack_32_1x64 (m),
+ expand_alpha_1x64 (unpack_32_1x64 (s))))));
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ negate_2x128 (xmm_mask_lo, xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ pix_multiply_1x64 (
+ unpack_32_1x64 (d),
+ negate_1x64 (pix_multiply_1x64 (
+ unpack_32_1x64 (m),
+ expand_alpha_1x64 (unpack_32_1x64 (s))))));
+ w--;
+ }
+}
+
+static force_inline uint32_t
+core_combine_atop_ca_pixel_sse2 (uint32_t src,
+ uint32_t mask,
+ uint32_t dst)
+{
+ __m64 m = unpack_32_1x64 (mask);
+ __m64 s = unpack_32_1x64 (src);
+ __m64 d = unpack_32_1x64 (dst);
+ __m64 sa = expand_alpha_1x64 (s);
+ __m64 da = expand_alpha_1x64 (d);
+
+ s = pix_multiply_1x64 (s, m);
+ m = negate_1x64 (pix_multiply_1x64 (m, sa));
+
+ return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
+}
+
+static force_inline void
+core_combine_atop_ca_sse2 (uint32_t * pd,
+ const uint32_t *ps,
+ const uint32_t *pm,
+ int w)
+{
+ uint32_t s, m, d;
+
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+ __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi);
+ pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ pix_add_multiply_2x128 (
+ &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
+ w--;
+ }
+}
+
+static force_inline uint32_t
+core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
+ uint32_t mask,
+ uint32_t dst)
+{
+ __m64 m = unpack_32_1x64 (mask);
+ __m64 s = unpack_32_1x64 (src);
+ __m64 d = unpack_32_1x64 (dst);
+
+ __m64 da = negate_1x64 (expand_alpha_1x64 (d));
+ __m64 sa = expand_alpha_1x64 (s);
+
+ s = pix_multiply_1x64 (s, m);
+ m = pix_multiply_1x64 (m, sa);
+
+ return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
+}
+
+static force_inline void
+core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
+ const uint32_t *ps,
+ const uint32_t *pm,
+ int w)
+{
+ uint32_t s, m, d;
+
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+ __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi);
+ pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ pix_add_multiply_2x128 (
+ &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
+ w--;
+ }
+}
+
+static force_inline uint32_t
+core_combine_xor_ca_pixel_sse2 (uint32_t src,
+ uint32_t mask,
+ uint32_t dst)
+{
+ __m64 a = unpack_32_1x64 (mask);
+ __m64 s = unpack_32_1x64 (src);
+ __m64 d = unpack_32_1x64 (dst);
+
+ __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
+ a, expand_alpha_1x64 (s)));
+ __m64 dest = pix_multiply_1x64 (s, a);
+ __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
+
+ return pack_1x64_32 (pix_add_multiply_1x64 (&d,
+ &alpha_dst,
+ &dest,
+ &alpha_src));
+}
+
+static force_inline void
+core_combine_xor_ca_sse2 (uint32_t * pd,
+ const uint32_t *ps,
+ const uint32_t *pm,
+ int w)
+{
+ uint32_t s, m, d;
+
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+ __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi);
+ pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+ negate_2x128 (xmm_mask_lo, xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ pix_add_multiply_2x128 (
+ &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
+ w--;
+ }
+}
+
+static force_inline void
+core_combine_add_ca_sse2 (uint32_t * pd,
+ const uint32_t *ps,
+ const uint32_t *pm,
+ int w)
+{
+ uint32_t s, m, d;
+
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
+ unpack_32_1x64 (m)),
+ unpack_32_1x64 (d)));
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (
+ _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
+ _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x64_32 (
+ _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
+ unpack_32_1x64 (m)),
+ unpack_32_1x64 (d)));
+ w--;
+ }
+}
+
+/* ---------------------------------------------------
+ * fb_compose_setup_sSE2
+ */
+static force_inline __m64
+create_mask_16_64 (uint16_t mask)
+{
+ return _mm_set1_pi16 (mask);
+}
+
+static force_inline __m128i
+create_mask_16_128 (uint16_t mask)
+{
+ return _mm_set1_epi16 (mask);
+}
+
+static force_inline __m64
+create_mask_2x32_64 (uint32_t mask0,
+ uint32_t mask1)
+{
+ return _mm_set_pi32 (mask0, mask1);
+}
+
+/* Work around a code generation bug in Sun Studio 12. */
+#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
+# define create_mask_2x32_128(mask0, mask1) \
+ (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
+#else
+static force_inline __m128i
+create_mask_2x32_128 (uint32_t mask0,
+ uint32_t mask1)
+{
+ return _mm_set_epi32 (mask0, mask1, mask0, mask1);
+}
+#endif
+
+/* SSE2 code patch for fbcompose.c */
+
+static void
+sse2_combine_over_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ core_combine_over_u_sse2 (dst, src, mask, width);
+ _mm_empty ();
+}
+
+static void
+sse2_combine_over_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ core_combine_over_reverse_u_sse2 (dst, src, mask, width);
+ _mm_empty ();
+}
+
+static void
+sse2_combine_in_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ core_combine_in_u_sse2 (dst, src, mask, width);
+ _mm_empty ();
+}
+
+static void
+sse2_combine_in_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ core_combine_reverse_in_u_sse2 (dst, src, mask, width);
+ _mm_empty ();
+}
+
+static void
+sse2_combine_out_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ core_combine_out_u_sse2 (dst, src, mask, width);
+ _mm_empty ();
+}
+
+static void
+sse2_combine_out_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ core_combine_reverse_out_u_sse2 (dst, src, mask, width);
+ _mm_empty ();
+}
+
+static void
+sse2_combine_atop_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ core_combine_atop_u_sse2 (dst, src, mask, width);
+ _mm_empty ();
+}
+
+static void
+sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
+ _mm_empty ();
+}
+
+static void
+sse2_combine_xor_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ core_combine_xor_u_sse2 (dst, src, mask, width);
+ _mm_empty ();
+}
+
+static void
+sse2_combine_add_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ core_combine_add_u_sse2 (dst, src, mask, width);
+ _mm_empty ();
+}
+
+static void
+sse2_combine_saturate_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ core_combine_saturate_u_sse2 (dst, src, mask, width);
+ _mm_empty ();
+}
+
+static void
+sse2_combine_src_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ core_combine_src_ca_sse2 (dst, src, mask, width);
+ _mm_empty ();
+}
+
+static void
+sse2_combine_over_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ core_combine_over_ca_sse2 (dst, src, mask, width);
+ _mm_empty ();
+}
+
+static void
+sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
+ _mm_empty ();
+}
+
+static void
+sse2_combine_in_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ core_combine_in_ca_sse2 (dst, src, mask, width);
+ _mm_empty ();
+}
+
+static void
+sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
+ _mm_empty ();
+}
+
+static void
+sse2_combine_out_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ core_combine_out_ca_sse2 (dst, src, mask, width);
+ _mm_empty ();
+}
+
+static void
+sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
+ _mm_empty ();
+}
+
+static void
+sse2_combine_atop_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ core_combine_atop_ca_sse2 (dst, src, mask, width);
+ _mm_empty ();
+}
+
+static void
+sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
+ _mm_empty ();
+}
+
+static void
+sse2_combine_xor_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ core_combine_xor_ca_sse2 (dst, src, mask, width);
+ _mm_empty ();
+}
+
+static void
+sse2_combine_add_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ core_combine_add_ca_sse2 (dst, src, mask, width);
+ _mm_empty ();
+}
+
+/* -------------------------------------------------------------------
+ * composite_over_n_8888
+ */
+
+static void
+sse2_composite_over_n_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src;
+ uint32_t *dst_line, *dst, d;
+ int32_t w;
+ int dst_stride;
+ __m128i xmm_src, xmm_alpha;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+ xmm_src = expand_pixel_32_1x128 (src);
+ xmm_alpha = expand_alpha_1x128 (xmm_src);
+
+ while (height--)
+ {
+ dst = dst_line;
+
+ dst_line += dst_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ d = *dst;
+ *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
+ _mm_movepi64_pi64 (xmm_alpha),
+ unpack_32_1x64 (d)));
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ /* rebuid the 4 pixel data and save*/
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ w -= 4;
+ dst += 4;
+ }
+
+ while (w)
+ {
+ d = *dst;
+ *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
+ _mm_movepi64_pi64 (xmm_alpha),
+ unpack_32_1x64 (d)));
+ w--;
+ }
+
+ }
+ _mm_empty ();
+}
+
+/* ---------------------------------------------------------------------
+ * composite_over_n_0565
+ */
+static void
+sse2_composite_over_n_0565 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src;
+ uint16_t *dst_line, *dst, d;
+ int32_t w;
+ int dst_stride;
+ __m128i xmm_src, xmm_alpha;
+ __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+ xmm_src = expand_pixel_32_1x128 (src);
+ xmm_alpha = expand_alpha_1x128 (xmm_src);
+
+ while (height--)
+ {
+ dst = dst_line;
+
+ dst_line += dst_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ d = *dst;
+
+ *dst++ = pack_565_32_16 (
+ pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
+ _mm_movepi64_pi64 (xmm_alpha),
+ expand565_16_1x64 (d))));
+ w--;
+ }
+
+ while (w >= 8)
+ {
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_565_128_4x128 (xmm_dst,
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+
+ over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_dst0, &xmm_dst1);
+ over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_dst2, &xmm_dst3);
+
+ xmm_dst = pack_565_4x128_128 (
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+
+ save_128_aligned ((__m128i*)dst, xmm_dst);
+
+ dst += 8;
+ w -= 8;
+ }
+
+ while (w--)
+ {
+ d = *dst;
+ *dst++ = pack_565_32_16 (
+ pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
+ _mm_movepi64_pi64 (xmm_alpha),
+ expand565_16_1x64 (d))));
+ }
+ }
+
+ _mm_empty ();
+}
+
+/* ------------------------------
+ * composite_add_n_8888_8888_ca
+ */
+static void
+sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca;
+ uint32_t *dst_line, d;
+ uint32_t *mask_line, m;
+ uint32_t pack_cmp;
+ int dst_stride, mask_stride;
+
+ __m128i xmm_src, xmm_alpha;
+ __m128i xmm_dst;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+ __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+ srca = src >> 24;
+
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+ xmm_src = _mm_unpacklo_epi8 (
+ create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
+ xmm_alpha = expand_alpha_1x128 (xmm_src);
+ mmx_src = _mm_movepi64_pi64 (xmm_src);
+ mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+
+ while (height--)
+ {
+ int w = width;
+ const uint32_t *pm = (uint32_t *)mask_line;
+ uint32_t *pd = (uint32_t *)dst_line;
+
+ dst_line += dst_stride;
+ mask_line += mask_stride;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ m = *pm++;
+
+ if (m)
+ {
+ d = *pd;
+
+ mmx_mask = unpack_32_1x64 (m);
+ mmx_dest = unpack_32_1x64 (d);
+
+ *pd = pack_1x64_32 (
+ _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
+ }
+
+ pd++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_mask = load_128_unaligned ((__m128i*)pm);
+
+ pack_cmp =
+ _mm_movemask_epi8 (
+ _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+ /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
+ if (pack_cmp != 0xffff)
+ {
+ xmm_dst = load_128_aligned ((__m128i*)pd);
+
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+ pix_multiply_2x128 (&xmm_src, &xmm_src,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+ xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
+ }
+
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ m = *pm++;
+
+ if (m)
+ {
+ d = *pd;
+
+ mmx_mask = unpack_32_1x64 (m);
+ mmx_dest = unpack_32_1x64 (d);
+
+ *pd = pack_1x64_32 (
+ _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
+ }
+
+ pd++;
+ w--;
+ }
+ }
+
+ _mm_empty ();
+}
+
+/* ---------------------------------------------------------------------------
+ * composite_over_n_8888_8888_ca
+ */
+
+static void
+sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src;
+ uint32_t *dst_line, d;
+ uint32_t *mask_line, m;
+ uint32_t pack_cmp;
+ int dst_stride, mask_stride;
+
+ __m128i xmm_src, xmm_alpha;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+ __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+ xmm_src = _mm_unpacklo_epi8 (
+ create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
+ xmm_alpha = expand_alpha_1x128 (xmm_src);
+ mmx_src = _mm_movepi64_pi64 (xmm_src);
+ mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+
+ while (height--)
+ {
+ int w = width;
+ const uint32_t *pm = (uint32_t *)mask_line;
+ uint32_t *pd = (uint32_t *)dst_line;
+
+ dst_line += dst_stride;
+ mask_line += mask_stride;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ m = *pm++;
+
+ if (m)
+ {
+ d = *pd;
+ mmx_mask = unpack_32_1x64 (m);
+ mmx_dest = unpack_32_1x64 (d);
+
+ *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
+ &mmx_alpha,
+ &mmx_mask,
+ &mmx_dest));
+ }
+
+ pd++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_mask = load_128_unaligned ((__m128i*)pm);
+
+ pack_cmp =
+ _mm_movemask_epi8 (
+ _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+ /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
+ if (pack_cmp != 0xffff)
+ {
+ xmm_dst = load_128_aligned ((__m128i*)pd);
+
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ in_over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ m = *pm++;
+
+ if (m)
+ {
+ d = *pd;
+ mmx_mask = unpack_32_1x64 (m);
+ mmx_dest = unpack_32_1x64 (d);
+
+ *pd = pack_1x64_32 (
+ in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
+ }
+
+ pd++;
+ w--;
+ }
+ }
+
+ _mm_empty ();
+}
+
+/*---------------------------------------------------------------------
+ * composite_over_8888_n_8888
+ */
+
+static void
+sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ uint32_t mask;
+ int32_t w;
+ int dst_stride, src_stride;
+
+ __m128i xmm_mask;
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
+
+ xmm_mask = create_mask_16_128 (mask >> 24);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ uint32_t s = *src++;
+
+ if (s)
+ {
+ uint32_t d = *dst;
+
+ __m64 ms = unpack_32_1x64 (s);
+ __m64 alpha = expand_alpha_1x64 (ms);
+ __m64 dest = _mm_movepi64_pi64 (xmm_mask);
+ __m64 alpha_dst = unpack_32_1x64 (d);
+
+ *dst = pack_1x64_32 (
+ in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
+ }
+ dst++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_src = load_128_unaligned ((__m128i*)src);
+
+ if (!is_zero (xmm_src))
+ {
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_mask, &xmm_mask,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ dst += 4;
+ src += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ uint32_t s = *src++;
+
+ if (s)
+ {
+ uint32_t d = *dst;
+
+ __m64 ms = unpack_32_1x64 (s);
+ __m64 alpha = expand_alpha_1x64 (ms);
+ __m64 mask = _mm_movepi64_pi64 (xmm_mask);
+ __m64 dest = unpack_32_1x64 (d);
+
+ *dst = pack_1x64_32 (
+ in_over_1x64 (&ms, &alpha, &mask, &dest));
+ }
+
+ dst++;
+ w--;
+ }
+ }
+
+ _mm_empty ();
+}
+
+/*---------------------------------------------------------------------
+ * composite_over_8888_n_8888
+ */
+
+static void
+sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ int32_t w;
+ int dst_stride, src_stride;
+
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ *dst++ = *src++ | 0xff000000;
+ w--;
+ }
+
+ while (w >= 16)
+ {
+ __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
+
+ xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
+ xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
+ xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
+ xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
+
+ save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
+ save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
+ save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
+ save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
+
+ dst += 16;
+ src += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ *dst++ = *src++ | 0xff000000;
+ w--;
+ }
+ }
+
+ _mm_empty ();
+}
+
+/* ---------------------------------------------------------------------
+ * composite_over_x888_n_8888
+ */
+static void
+sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ uint32_t mask;
+ int dst_stride, src_stride;
+ int32_t w;
+
+ __m128i xmm_mask, xmm_alpha;
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
+
+ xmm_mask = create_mask_16_128 (mask >> 24);
+ xmm_alpha = mask_00ff;
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ uint32_t s = (*src++) | 0xff000000;
+ uint32_t d = *dst;
+
+ __m64 src = unpack_32_1x64 (s);
+ __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
+ __m64 mask = _mm_movepi64_pi64 (xmm_mask);
+ __m64 dest = unpack_32_1x64 (d);
+
+ *dst++ = pack_1x64_32 (
+ in_over_1x64 (&src, &alpha, &mask, &dest));
+
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_src = _mm_or_si128 (
+ load_128_unaligned ((__m128i*)src), mask_ff000000);
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask, &xmm_mask,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ dst += 4;
+ src += 4;
+ w -= 4;
+
+ }
+
+ while (w)
+ {
+ uint32_t s = (*src++) | 0xff000000;
+ uint32_t d = *dst;
+
+ __m64 src = unpack_32_1x64 (s);
+ __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
+ __m64 mask = _mm_movepi64_pi64 (xmm_mask);
+ __m64 dest = unpack_32_1x64 (d);
+
+ *dst++ = pack_1x64_32 (
+ in_over_1x64 (&src, &alpha, &mask, &dest));
+
+ w--;
+ }
+ }
+
+ _mm_empty ();
+}
+
+/* --------------------------------------------------------------------
+ * composite_over_8888_8888
+ */
+static void
+sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ int dst_stride, src_stride;
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ dst = dst_line;
+ src = src_line;
+
+ while (height--)
+ {
+ core_combine_over_u_sse2 (dst, src, NULL, width);
+
+ dst += dst_stride;
+ src += src_stride;
+ }
+ _mm_empty ();
+}
+
+/* ------------------------------------------------------------------
+ * composite_over_8888_0565
+ */
+static force_inline uint16_t
+composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
+{
+ __m64 ms;
+
+ ms = unpack_32_1x64 (src);
+ return pack_565_32_16 (
+ pack_1x64_32 (
+ over_1x64 (
+ ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
+}
+
+static void
+sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint16_t *dst_line, *dst, d;
+ uint32_t *src_line, *src, s;
+ int dst_stride, src_stride;
+ int32_t w;
+
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+#if 0
+ /* FIXME
+ *
+ * I copy the code from MMX one and keep the fixme.
+ * If it's a problem there, probably is a problem here.
+ */
+ assert (src_image->drawable == mask_image->drawable);
+#endif
+
+ while (height--)
+ {
+ dst = dst_line;
+ src = src_line;
+
+ dst_line += dst_stride;
+ src_line += src_stride;
+ w = width;
+
+ /* Align dst on a 16-byte boundary */
+ while (w &&
+ ((unsigned long)dst & 15))
+ {
+ s = *src++;
+ d = *dst;
+
+ *dst++ = composite_over_8888_0565pixel (s, d);
+ w--;
+ }
+
+ /* It's a 8 pixel loop */
+ while (w >= 8)
+ {
+ /* I'm loading unaligned because I'm not sure
+ * about the address alignment.
+ */
+ xmm_src = load_128_unaligned ((__m128i*) src);
+ xmm_dst = load_128_aligned ((__m128i*) dst);
+
+ /* Unpacking */
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_565_128_4x128 (xmm_dst,
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ /* I'm loading next 4 pixels from memory
+ * before to optimze the memory read.
+ */
+ xmm_src = load_128_unaligned ((__m128i*) (src + 4));
+
+ over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst0, &xmm_dst1);
+
+ /* Unpacking */
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst2, &xmm_dst3);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_565_4x128_128 (
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+ w -= 8;
+ dst += 8;
+ src += 8;
+ }
+
+ while (w--)
+ {
+ s = *src++;
+ d = *dst;
+
+ *dst++ = composite_over_8888_0565pixel (s, d);
+ }
+ }
+
+ _mm_empty ();
+}
+
+/* -----------------------------------------------------------------
+ * composite_over_n_8_8888
+ */
+
+static void
+sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca;
+ uint32_t *dst_line, *dst;
+ uint8_t *mask_line, *mask;
+ int dst_stride, mask_stride;
+ int32_t w;
+ uint32_t m, d;
+
+ __m128i xmm_src, xmm_alpha, xmm_def;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+ __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+
+ srca = src >> 24;
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+ xmm_def = create_mask_2x32_128 (src, src);
+ xmm_src = expand_pixel_32_1x128 (src);
+ xmm_alpha = expand_alpha_1x128 (xmm_src);
+ mmx_src = _mm_movepi64_pi64 (xmm_src);
+ mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ uint8_t m = *mask++;
+
+ if (m)
+ {
+ d = *dst;
+ mmx_mask = expand_pixel_8_1x64 (m);
+ mmx_dest = unpack_32_1x64 (d);
+
+ *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
+ &mmx_alpha,
+ &mmx_mask,
+ &mmx_dest));
+ }
+
+ w--;
+ dst++;
+ }
+
+ while (w >= 4)
+ {
+ m = *((uint32_t*)mask);
+
+ if (srca == 0xff && m == 0xffffffff)
+ {
+ save_128_aligned ((__m128i*)dst, xmm_def);
+ }
+ else if (m)
+ {
+ xmm_dst = load_128_aligned ((__m128i*) dst);
+ xmm_mask = unpack_32_1x128 (m);
+ xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+ /* Unpacking */
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ in_over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ w -= 4;
+ dst += 4;
+ mask += 4;
+ }
+
+ while (w)
+ {
+ uint8_t m = *mask++;
+
+ if (m)
+ {
+ d = *dst;
+ mmx_mask = expand_pixel_8_1x64 (m);
+ mmx_dest = unpack_32_1x64 (d);
+
+ *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
+ &mmx_alpha,
+ &mmx_mask,
+ &mmx_dest));
+ }
+
+ w--;
+ dst++;
+ }
+ }
+
+ _mm_empty ();
+}
+
+/* ----------------------------------------------------------------
+ * composite_over_n_8_8888
+ */
+
+pixman_bool_t
+pixman_fill_sse2 (uint32_t *bits,
+ int stride,
+ int bpp,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t data)
+{
+ uint32_t byte_width;
+ uint8_t *byte_line;
+
+ __m128i xmm_def;
+
+ if (bpp == 8)
+ {
+ uint8_t b;
+ uint16_t w;
+
+ stride = stride * (int) sizeof (uint32_t) / 1;
+ byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
+ byte_width = width;
+ stride *= 1;
+
+ b = data & 0xff;
+ w = (b << 8) | b;
+ data = (w << 16) | w;
+ }
+ else if (bpp == 16)
+ {
+ stride = stride * (int) sizeof (uint32_t) / 2;
+ byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
+ byte_width = 2 * width;
+ stride *= 2;
+
+ data = (data & 0xffff) * 0x00010001;
+ }
+ else if (bpp == 32)
+ {
+ stride = stride * (int) sizeof (uint32_t) / 4;
+ byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
+ byte_width = 4 * width;
+ stride *= 4;
+ }
+ else
+ {
+ return FALSE;
+ }
+
+ xmm_def = create_mask_2x32_128 (data, data);
+
+ while (height--)
+ {
+ int w;
+ uint8_t *d = byte_line;
+ byte_line += stride;
+ w = byte_width;
+
+ while (w >= 1 && ((unsigned long)d & 1))
+ {
+ *(uint8_t *)d = data;
+ w -= 1;
+ d += 1;
+ }
+
+ while (w >= 2 && ((unsigned long)d & 3))
+ {
+ *(uint16_t *)d = data;
+ w -= 2;
+ d += 2;
+ }
+
+ while (w >= 4 && ((unsigned long)d & 15))
+ {
+ *(uint32_t *)d = data;
+
+ w -= 4;
+ d += 4;
+ }
+
+ while (w >= 128)
+ {
+ save_128_aligned ((__m128i*)(d), xmm_def);
+ save_128_aligned ((__m128i*)(d + 16), xmm_def);
+ save_128_aligned ((__m128i*)(d + 32), xmm_def);
+ save_128_aligned ((__m128i*)(d + 48), xmm_def);
+ save_128_aligned ((__m128i*)(d + 64), xmm_def);
+ save_128_aligned ((__m128i*)(d + 80), xmm_def);
+ save_128_aligned ((__m128i*)(d + 96), xmm_def);
+ save_128_aligned ((__m128i*)(d + 112), xmm_def);
+
+ d += 128;
+ w -= 128;
+ }
+
+ if (w >= 64)
+ {
+ save_128_aligned ((__m128i*)(d), xmm_def);
+ save_128_aligned ((__m128i*)(d + 16), xmm_def);
+ save_128_aligned ((__m128i*)(d + 32), xmm_def);
+ save_128_aligned ((__m128i*)(d + 48), xmm_def);
+
+ d += 64;
+ w -= 64;
+ }
+
+ if (w >= 32)
+ {
+ save_128_aligned ((__m128i*)(d), xmm_def);
+ save_128_aligned ((__m128i*)(d + 16), xmm_def);
+
+ d += 32;
+ w -= 32;
+ }
+
+ if (w >= 16)
+ {
+ save_128_aligned ((__m128i*)(d), xmm_def);
+
+ d += 16;
+ w -= 16;
+ }
+
+ while (w >= 4)
+ {
+ *(uint32_t *)d = data;
+
+ w -= 4;
+ d += 4;
+ }
+
+ if (w >= 2)
+ {
+ *(uint16_t *)d = data;
+ w -= 2;
+ d += 2;
+ }
+
+ if (w >= 1)
+ {
+ *(uint8_t *)d = data;
+ w -= 1;
+ d += 1;
+ }
+ }
+
+ _mm_empty ();
+ return TRUE;
+}
+
+static void
+sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca;
+ uint32_t *dst_line, *dst;
+ uint8_t *mask_line, *mask;
+ int dst_stride, mask_stride;
+ int32_t w;
+ uint32_t m;
+
+ __m128i xmm_src, xmm_def;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+
+ srca = src >> 24;
+ if (src == 0)
+ {
+ pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
+ PIXMAN_FORMAT_BPP (dst_image->bits.format),
+ dest_x, dest_y, width, height, 0);
+ return;
+ }
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+ xmm_def = create_mask_2x32_128 (src, src);
+ xmm_src = expand_pixel_32_1x128 (src);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ uint8_t m = *mask++;
+
+ if (m)
+ {
+ *dst = pack_1x64_32 (
+ pix_multiply_1x64 (
+ _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
+ }
+ else
+ {
+ *dst = 0;
+ }
+
+ w--;
+ dst++;
+ }
+
+ while (w >= 4)
+ {
+ m = *((uint32_t*)mask);
+
+ if (srca == 0xff && m == 0xffffffff)
+ {
+ save_128_aligned ((__m128i*)dst, xmm_def);
+ }
+ else if (m)
+ {
+ xmm_mask = unpack_32_1x128 (m);
+ xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+ /* Unpacking */
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ pix_multiply_2x128 (&xmm_src, &xmm_src,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
+ }
+ else
+ {
+ save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
+ }
+
+ w -= 4;
+ dst += 4;
+ mask += 4;
+ }
+
+ while (w)
+ {
+ uint8_t m = *mask++;
+
+ if (m)
+ {
+ *dst = pack_1x64_32 (
+ pix_multiply_1x64 (
+ _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
+ }
+ else
+ {
+ *dst = 0;
+ }
+
+ w--;
+ dst++;
+ }
+ }
+
+ _mm_empty ();
+}
+
+/*-----------------------------------------------------------------------
+ * composite_over_n_8_0565
+ */
+
+static void
+sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca;
+ uint16_t *dst_line, *dst, d;
+ uint8_t *mask_line, *mask;
+ int dst_stride, mask_stride;
+ int32_t w;
+ uint32_t m;
+ __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+ __m128i xmm_src, xmm_alpha;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+ __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+
+ srca = src >> 24;
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+ xmm_src = expand_pixel_32_1x128 (src);
+ xmm_alpha = expand_alpha_1x128 (xmm_src);
+ mmx_src = _mm_movepi64_pi64 (xmm_src);
+ mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ m = *mask++;
+
+ if (m)
+ {
+ d = *dst;
+ mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
+ mmx_dest = expand565_16_1x64 (d);
+
+ *dst = pack_565_32_16 (
+ pack_1x64_32 (
+ in_over_1x64 (
+ &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+ }
+
+ w--;
+ dst++;
+ }
+
+ while (w >= 8)
+ {
+ xmm_dst = load_128_aligned ((__m128i*) dst);
+ unpack_565_128_4x128 (xmm_dst,
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+
+ m = *((uint32_t*)mask);
+ mask += 4;
+
+ if (m)
+ {
+ xmm_mask = unpack_32_1x128 (m);
+ xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+ /* Unpacking */
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ in_over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst0, &xmm_dst1);
+ }
+
+ m = *((uint32_t*)mask);
+ mask += 4;
+
+ if (m)
+ {
+ xmm_mask = unpack_32_1x128 (m);
+ xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+ /* Unpacking */
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+ in_over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst2, &xmm_dst3);
+ }
+
+ save_128_aligned (
+ (__m128i*)dst, pack_565_4x128_128 (
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+ w -= 8;
+ dst += 8;
+ }
+
+ while (w)
+ {
+ m = *mask++;
+
+ if (m)
+ {
+ d = *dst;
+ mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
+ mmx_dest = expand565_16_1x64 (d);
+
+ *dst = pack_565_32_16 (
+ pack_1x64_32 (
+ in_over_1x64 (
+ &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+ }
+
+ w--;
+ dst++;
+ }
+ }
+
+ _mm_empty ();
+}
+
+/* -----------------------------------------------------------------------
+ * composite_over_pixbuf_0565
+ */
+
+static void
+sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint16_t *dst_line, *dst, d;
+ uint32_t *src_line, *src, s;
+ int dst_stride, src_stride;
+ int32_t w;
+ uint32_t opaque, zero;
+
+ __m64 ms;
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+#if 0
+ /* FIXME
+ *
+ * I copy the code from MMX one and keep the fixme.
+ * If it's a problem there, probably is a problem here.
+ */
+ assert (src_image->drawable == mask_image->drawable);
+#endif
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ s = *src++;
+ d = *dst;
+
+ ms = unpack_32_1x64 (s);
+
+ *dst++ = pack_565_32_16 (
+ pack_1x64_32 (
+ over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
+ w--;
+ }
+
+ while (w >= 8)
+ {
+ /* First round */
+ xmm_src = load_128_unaligned ((__m128i*)src);
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ opaque = is_opaque (xmm_src);
+ zero = is_zero (xmm_src);
+
+ unpack_565_128_4x128 (xmm_dst,
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+
+ /* preload next round*/
+ xmm_src = load_128_unaligned ((__m128i*)(src + 4));
+
+ if (opaque)
+ {
+ invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_dst0, &xmm_dst1);
+ }
+ else if (!zero)
+ {
+ over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_dst0, &xmm_dst1);
+ }
+
+ /* Second round */
+ opaque = is_opaque (xmm_src);
+ zero = is_zero (xmm_src);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+
+ if (opaque)
+ {
+ invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_dst2, &xmm_dst3);
+ }
+ else if (!zero)
+ {
+ over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_dst2, &xmm_dst3);
+ }
+
+ save_128_aligned (
+ (__m128i*)dst, pack_565_4x128_128 (
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+ w -= 8;
+ src += 8;
+ dst += 8;
+ }
+
+ while (w)
+ {
+ s = *src++;
+ d = *dst;
+
+ ms = unpack_32_1x64 (s);
+
+ *dst++ = pack_565_32_16 (
+ pack_1x64_32 (
+ over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
+ w--;
+ }
+ }
+
+ _mm_empty ();
+}
+
+/* -------------------------------------------------------------------------
+ * composite_over_pixbuf_8888
+ */
+
+static void
+sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *dst_line, *dst, d;
+ uint32_t *src_line, *src, s;
+ int dst_stride, src_stride;
+ int32_t w;
+ uint32_t opaque, zero;
+
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+#if 0
+ /* FIXME
+ *
+ * I copy the code from MMX one and keep the fixme.
+ * If it's a problem there, probably is a problem here.
+ */
+ assert (src_image->drawable == mask_image->drawable);
+#endif
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ s = *src++;
+ d = *dst;
+
+ *dst++ = pack_1x64_32 (
+ over_rev_non_pre_1x64 (
+ unpack_32_1x64 (s), unpack_32_1x64 (d)));
+
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_src_hi = load_128_unaligned ((__m128i*)src);
+
+ opaque = is_opaque (xmm_src_hi);
+ zero = is_zero (xmm_src_hi);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+
+ if (opaque)
+ {
+ invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+ else if (!zero)
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ w -= 4;
+ dst += 4;
+ src += 4;
+ }
+
+ while (w)
+ {
+ s = *src++;
+ d = *dst;
+
+ *dst++ = pack_1x64_32 (
+ over_rev_non_pre_1x64 (
+ unpack_32_1x64 (s), unpack_32_1x64 (d)));
+
+ w--;
+ }
+ }
+
+ _mm_empty ();
+}
+
+/* -------------------------------------------------------------------------------------------------
+ * composite_over_n_8888_0565_ca
+ */
+
+static void
+sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src;
+ uint16_t *dst_line, *dst, d;
+ uint32_t *mask_line, *mask, m;
+ int dst_stride, mask_stride;
+ int w;
+ uint32_t pack_cmp;
+
+ __m128i xmm_src, xmm_alpha;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+ __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+ __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+ xmm_src = expand_pixel_32_1x128 (src);
+ xmm_alpha = expand_alpha_1x128 (xmm_src);
+ mmx_src = _mm_movepi64_pi64 (xmm_src);
+ mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+
+ while (height--)
+ {
+ w = width;
+ mask = mask_line;
+ dst = dst_line;
+ mask_line += mask_stride;
+ dst_line += dst_stride;
+
+ while (w && ((unsigned long)dst & 15))
+ {
+ m = *(uint32_t *) mask;
+
+ if (m)
+ {
+ d = *dst;
+ mmx_mask = unpack_32_1x64 (m);
+ mmx_dest = expand565_16_1x64 (d);
+
+ *dst = pack_565_32_16 (
+ pack_1x64_32 (
+ in_over_1x64 (
+ &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+ }
+
+ w--;
+ dst++;
+ mask++;
+ }
+
+ while (w >= 8)
+ {
+ /* First round */
+ xmm_mask = load_128_unaligned ((__m128i*)mask);
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ pack_cmp = _mm_movemask_epi8 (
+ _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+ unpack_565_128_4x128 (xmm_dst,
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+ /* preload next round */
+ xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
+
+ /* preload next round */
+ if (pack_cmp != 0xffff)
+ {
+ in_over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst0, &xmm_dst1);
+ }
+
+ /* Second round */
+ pack_cmp = _mm_movemask_epi8 (
+ _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+ if (pack_cmp != 0xffff)
+ {
+ in_over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst2, &xmm_dst3);
+ }
+
+ save_128_aligned (
+ (__m128i*)dst, pack_565_4x128_128 (
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+ w -= 8;
+ dst += 8;
+ mask += 8;
+ }
+
+ while (w)
+ {
+ m = *(uint32_t *) mask;
+
+ if (m)
+ {
+ d = *dst;
+ mmx_mask = unpack_32_1x64 (m);
+ mmx_dest = expand565_16_1x64 (d);
+
+ *dst = pack_565_32_16 (
+ pack_1x64_32 (
+ in_over_1x64 (
+ &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+ }
+
+ w--;
+ dst++;
+ mask++;
+ }
+ }
+
+ _mm_empty ();
+}
+
+/* -----------------------------------------------------------------------
+ * composite_in_n_8_8
+ */
+
+static void
+sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_line, *dst;
+ uint8_t *mask_line, *mask;
+ int dst_stride, mask_stride;
+ uint32_t d, m;
+ uint32_t src;
+ uint8_t sa;
+ int32_t w;
+
+ __m128i xmm_alpha;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+
+ sa = src >> 24;
+
+ xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w && ((unsigned long)dst & 15))
+ {
+ m = (uint32_t) *mask++;
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x64_32 (
+ pix_multiply_1x64 (
+ pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
+ unpack_32_1x64 (m)),
+ unpack_32_1x64 (d)));
+ w--;
+ }
+
+ while (w >= 16)
+ {
+ xmm_mask = load_128_unaligned ((__m128i*)mask);
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ mask += 16;
+ dst += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ m = (uint32_t) *mask++;
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x64_32 (
+ pix_multiply_1x64 (
+ pix_multiply_1x64 (
+ _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
+ unpack_32_1x64 (d)));
+ w--;
+ }
+ }
+
+ _mm_empty ();
+}
+
+/* -----------------------------------------------------------------------
+ * composite_in_n_8
+ */
+
+static void
+sse2_composite_in_n_8 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_line, *dst;
+ int dst_stride;
+ uint32_t d;
+ uint32_t src;
+ int32_t w;
+
+ __m128i xmm_alpha;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+
+ xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
+
+ src = src >> 24;
+
+ if (src == 0xff)
+ return;
+
+ if (src == 0x00)
+ {
+ pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
+ 8, dest_x, dest_y, width, height, src);
+
+ return;
+ }
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ w = width;
+
+ while (w && ((unsigned long)dst & 15))
+ {
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x64_32 (
+ pix_multiply_1x64 (
+ _mm_movepi64_pi64 (xmm_alpha),
+ unpack_32_1x64 (d)));
+ w--;
+ }
+
+ while (w >= 16)
+ {
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
+ &xmm_dst_lo, &xmm_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ dst += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x64_32 (
+ pix_multiply_1x64 (
+ _mm_movepi64_pi64 (xmm_alpha),
+ unpack_32_1x64 (d)));
+ w--;
+ }
+ }
+
+ _mm_empty ();
+}
+
+/* ---------------------------------------------------------------------------
+ * composite_in_8_8
+ */
+
+static void
+sse2_composite_in_8_8 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_line, *dst;
+ uint8_t *src_line, *src;
+ int src_stride, dst_stride;
+ int32_t w;
+ uint32_t s, d;
+
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w && ((unsigned long)dst & 15))
+ {
+ s = (uint32_t) *src++;
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x64_32 (
+ pix_multiply_1x64 (
+ unpack_32_1x64 (s), unpack_32_1x64 (d)));
+ w--;
+ }
+
+ while (w >= 16)
+ {
+ xmm_src = load_128_unaligned ((__m128i*)src);
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ src += 16;
+ dst += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ s = (uint32_t) *src++;
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x64_32 (
+ pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
+ w--;
+ }
+ }
+
+ _mm_empty ();
+}
+
+/* -------------------------------------------------------------------------
+ * composite_add_n_8_8
+ */
+
+static void
+sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_line, *dst;
+ uint8_t *mask_line, *mask;
+ int dst_stride, mask_stride;
+ int32_t w;
+ uint32_t src;
+ uint8_t sa;
+ uint32_t m, d;
+
+ __m128i xmm_alpha;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+
+ sa = src >> 24;
+
+ xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w && ((unsigned long)dst & 15))
+ {
+ m = (uint32_t) *mask++;
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x64_32 (
+ _mm_adds_pu16 (
+ pix_multiply_1x64 (
+ _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
+ unpack_32_1x64 (d)));
+ w--;
+ }
+
+ while (w >= 16)
+ {
+ xmm_mask = load_128_unaligned ((__m128i*)mask);
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
+ xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ mask += 16;
+ dst += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ m = (uint32_t) *mask++;
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x64_32 (
+ _mm_adds_pu16 (
+ pix_multiply_1x64 (
+ _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
+ unpack_32_1x64 (d)));
+
+ w--;
+ }
+ }
+
+ _mm_empty ();
+}
+
+/* -------------------------------------------------------------------------
+ * composite_add_n_8_8
+ */
+
+static void
+sse2_composite_add_n_8 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_line, *dst;
+ int dst_stride;
+ int32_t w;
+ uint32_t src;
+
+ __m128i xmm_src;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+
+ src >>= 24;
+
+ if (src == 0x00)
+ return;
+
+ if (src == 0xff)
+ {
+ pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
+ 8, dest_x, dest_y, width, height, 0xff);
+
+ return;
+ }
+
+ src = (src << 24) | (src << 16) | (src << 8) | src;
+ xmm_src = _mm_set_epi32 (src, src, src, src);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ w = width;
+
+ while (w && ((unsigned long)dst & 15))
+ {
+ *dst = (uint8_t)_mm_cvtsi64_si32 (
+ _mm_adds_pu8 (
+ _mm_movepi64_pi64 (xmm_src),
+ _mm_cvtsi32_si64 (*dst)));
+
+ w--;
+ dst++;
+ }
+
+ while (w >= 16)
+ {
+ save_128_aligned (
+ (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
+
+ dst += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ *dst = (uint8_t)_mm_cvtsi64_si32 (
+ _mm_adds_pu8 (
+ _mm_movepi64_pi64 (xmm_src),
+ _mm_cvtsi32_si64 (*dst)));
+
+ w--;
+ dst++;
+ }
+ }
+
+ _mm_empty ();
+}
+
+/* ----------------------------------------------------------------------
+ * composite_add_8_8
+ */
+
+static void
+sse2_composite_add_8_8 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_line, *dst;
+ uint8_t *src_line, *src;
+ int dst_stride, src_stride;
+ int32_t w;
+ uint16_t t;
+
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ src = src_line;
+
+ dst_line += dst_stride;
+ src_line += src_stride;
+ w = width;
+
+ /* Small head */
+ while (w && (unsigned long)dst & 3)
+ {
+ t = (*dst) + (*src++);
+ *dst++ = t | (0 - (t >> 8));
+ w--;
+ }
+
+ core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
+
+ /* Small tail */
+ dst += w & 0xfffc;
+ src += w & 0xfffc;
+
+ w &= 3;
+
+ while (w)
+ {
+ t = (*dst) + (*src++);
+ *dst++ = t | (0 - (t >> 8));
+ w--;
+ }
+ }
+
+ _mm_empty ();
+}
+
+/* ---------------------------------------------------------------------
+ * composite_add_8888_8888
+ */
+static void
+sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ int dst_stride, src_stride;
+
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+
+ core_combine_add_u_sse2 (dst, src, NULL, width);
+ }
+
+ _mm_empty ();
+}
+
+/* -------------------------------------------------------------------------------------------------
+ * sse2_composite_copy_area
+ */
+
+static pixman_bool_t
+pixman_blt_sse2 (uint32_t *src_bits,
+ uint32_t *dst_bits,
+ int src_stride,
+ int dst_stride,
+ int src_bpp,
+ int dst_bpp,
+ int src_x,
+ int src_y,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height)
+{
+ uint8_t * src_bytes;
+ uint8_t * dst_bytes;
+ int byte_width;
+
+ if (src_bpp != dst_bpp)
+ return FALSE;
+
+ if (src_bpp == 16)
+ {
+ src_stride = src_stride * (int) sizeof (uint32_t) / 2;
+ dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
+ src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
+ dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
+ byte_width = 2 * width;
+ src_stride *= 2;
+ dst_stride *= 2;
+ }
+ else if (src_bpp == 32)
+ {
+ src_stride = src_stride * (int) sizeof (uint32_t) / 4;
+ dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
+ src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
+ dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
+ byte_width = 4 * width;
+ src_stride *= 4;
+ dst_stride *= 4;
+ }
+ else
+ {
+ return FALSE;
+ }
+
+ while (height--)
+ {
+ int w;
+ uint8_t *s = src_bytes;
+ uint8_t *d = dst_bytes;
+ src_bytes += src_stride;
+ dst_bytes += dst_stride;
+ w = byte_width;
+
+ while (w >= 2 && ((unsigned long)d & 3))
+ {
+ *(uint16_t *)d = *(uint16_t *)s;
+ w -= 2;
+ s += 2;
+ d += 2;
+ }
+
+ while (w >= 4 && ((unsigned long)d & 15))
+ {
+ *(uint32_t *)d = *(uint32_t *)s;
+
+ w -= 4;
+ s += 4;
+ d += 4;
+ }
+
+ while (w >= 64)
+ {
+ __m128i xmm0, xmm1, xmm2, xmm3;
+
+ xmm0 = load_128_unaligned ((__m128i*)(s));
+ xmm1 = load_128_unaligned ((__m128i*)(s + 16));
+ xmm2 = load_128_unaligned ((__m128i*)(s + 32));
+ xmm3 = load_128_unaligned ((__m128i*)(s + 48));
+
+ save_128_aligned ((__m128i*)(d), xmm0);
+ save_128_aligned ((__m128i*)(d + 16), xmm1);
+ save_128_aligned ((__m128i*)(d + 32), xmm2);
+ save_128_aligned ((__m128i*)(d + 48), xmm3);
+
+ s += 64;
+ d += 64;
+ w -= 64;
+ }
+
+ while (w >= 16)
+ {
+ save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
+
+ w -= 16;
+ d += 16;
+ s += 16;
+ }
+
+ while (w >= 4)
+ {
+ *(uint32_t *)d = *(uint32_t *)s;
+
+ w -= 4;
+ s += 4;
+ d += 4;
+ }
+
+ if (w >= 2)
+ {
+ *(uint16_t *)d = *(uint16_t *)s;
+ w -= 2;
+ s += 2;
+ d += 2;
+ }
+ }
+
+ _mm_empty ();
+
+ return TRUE;
+}
+
+static void
+sse2_composite_copy_area (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ pixman_blt_sse2 (src_image->bits.bits,
+ dst_image->bits.bits,
+ src_image->bits.rowstride,
+ dst_image->bits.rowstride,
+ PIXMAN_FORMAT_BPP (src_image->bits.format),
+ PIXMAN_FORMAT_BPP (dst_image->bits.format),
+ src_x, src_y, dest_x, dest_y, width, height);
+}
+
+static void
+sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *src, *src_line, s;
+ uint32_t *dst, *dst_line, d;
+ uint8_t *mask, *mask_line;
+ uint32_t m;
+ int src_stride, mask_stride, dst_stride;
+ int32_t w;
+ __m64 ms;
+
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ while (height--)
+ {
+ src = src_line;
+ src_line += src_stride;
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ s = 0xff000000 | *src++;
+ m = (uint32_t) *mask++;
+ d = *dst;
+ ms = unpack_32_1x64 (s);
+
+ if (m != 0xff)
+ {
+ __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
+ __m64 md = unpack_32_1x64 (d);
+
+ ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
+ }
+
+ *dst++ = pack_1x64_32 (ms);
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ m = *(uint32_t*) mask;
+ xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
+
+ if (m == 0xffffffff)
+ {
+ save_128_aligned ((__m128i*)dst, xmm_src);
+ }
+ else
+ {
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ src += 4;
+ dst += 4;
+ mask += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ m = (uint32_t) *mask++;
+
+ if (m)
+ {
+ s = 0xff000000 | *src;
+
+ if (m == 0xff)
+ {
+ *dst = s;
+ }
+ else
+ {
+ __m64 ma, md, ms;
+
+ d = *dst;
+
+ ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
+ md = unpack_32_1x64 (d);
+ ms = unpack_32_1x64 (s);
+
+ *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
+ }
+
+ }
+
+ src++;
+ dst++;
+ w--;
+ }
+ }
+
+ _mm_empty ();
+}
+
+static void
+sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *src, *src_line, s;
+ uint32_t *dst, *dst_line, d;
+ uint8_t *mask, *mask_line;
+ uint32_t m;
+ int src_stride, mask_stride, dst_stride;
+ int32_t w;
+
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ while (height--)
+ {
+ src = src_line;
+ src_line += src_stride;
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ uint32_t sa;
+
+ s = *src++;
+ m = (uint32_t) *mask++;
+ d = *dst;
+
+ sa = s >> 24;
+
+ if (m)
+ {
+ if (sa == 0xff && m == 0xff)
+ {
+ *dst = s;
+ }
+ else
+ {
+ __m64 ms, md, ma, msa;
+
+ ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
+ ms = unpack_32_1x64 (s);
+ md = unpack_32_1x64 (d);
+
+ msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
+
+ *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
+ }
+ }
+
+ dst++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ m = *(uint32_t *) mask;
+
+ if (m)
+ {
+ xmm_src = load_128_unaligned ((__m128i*)src);
+
+ if (m == 0xffffffff && is_opaque (xmm_src))
+ {
+ save_128_aligned ((__m128i *)dst, xmm_src);
+ }
+ else
+ {
+ xmm_dst = load_128_aligned ((__m128i *)dst);
+
+ xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
+ expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
+ &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+ }
+
+ src += 4;
+ dst += 4;
+ mask += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ uint32_t sa;
+
+ s = *src++;
+ m = (uint32_t) *mask++;
+ d = *dst;
+
+ sa = s >> 24;
+
+ if (m)
+ {
+ if (sa == 0xff && m == 0xff)
+ {
+ *dst = s;
+ }
+ else
+ {
+ __m64 ms, md, ma, msa;
+
+ ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
+ ms = unpack_32_1x64 (s);
+ md = unpack_32_1x64 (d);
+
+ msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
+
+ *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
+ }
+ }
+
+ dst++;
+ w--;
+ }
+ }
+
+ _mm_empty ();
+}
+
+static void
+sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src;
+ uint32_t *dst_line, *dst;
+ __m128i xmm_src;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_dsta_hi, xmm_dsta_lo;
+ int dst_stride;
+ int32_t w;
+
+ src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+ xmm_src = expand_pixel_32_1x128 (src);
+
+ while (height--)
+ {
+ dst = dst_line;
+
+ dst_line += dst_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ __m64 vd;
+
+ vd = unpack_32_1x64 (*dst);
+
+ *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
+ _mm_movepi64_pi64 (xmm_src)));
+ w--;
+ dst++;
+ }
+
+ while (w >= 4)
+ {
+ __m128i tmp_lo, tmp_hi;
+
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
+
+ tmp_lo = xmm_src;
+ tmp_hi = xmm_src;
+
+ over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_dsta_lo, &xmm_dsta_hi,
+ &tmp_lo, &tmp_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
+
+ w -= 4;
+ dst += 4;
+ }
+
+ while (w)
+ {
+ __m64 vd;
+
+ vd = unpack_32_1x64 (*dst);
+
+ *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
+ _mm_movepi64_pi64 (xmm_src)));
+ w--;
+ dst++;
+ }
+
+ }
+
+ _mm_empty ();
+}
+
+static void
+sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *src, *src_line, s;
+ uint32_t *dst, *dst_line, d;
+ uint32_t *mask, *mask_line;
+ uint32_t m;
+ int src_stride, mask_stride, dst_stride;
+ int32_t w;
+
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ while (height--)
+ {
+ src = src_line;
+ src_line += src_stride;
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ uint32_t sa;
+
+ s = *src++;
+ m = (*mask++) >> 24;
+ d = *dst;
+
+ sa = s >> 24;
+
+ if (m)
+ {
+ if (sa == 0xff && m == 0xff)
+ {
+ *dst = s;
+ }
+ else
+ {
+ __m64 ms, md, ma, msa;
+
+ ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
+ ms = unpack_32_1x64 (s);
+ md = unpack_32_1x64 (d);
+
+ msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
+
+ *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
+ }
+ }
+
+ dst++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_mask = load_128_unaligned ((__m128i*)mask);
+
+ if (!is_transparent (xmm_mask))
+ {
+ xmm_src = load_128_unaligned ((__m128i*)src);
+
+ if (is_opaque (xmm_mask) && is_opaque (xmm_src))
+ {
+ save_128_aligned ((__m128i *)dst, xmm_src);
+ }
+ else
+ {
+ xmm_dst = load_128_aligned ((__m128i *)dst);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
+ expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
+ &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+ }
+
+ src += 4;
+ dst += 4;
+ mask += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ uint32_t sa;
+
+ s = *src++;
+ m = (*mask++) >> 24;
+ d = *dst;
+
+ sa = s >> 24;
+
+ if (m)
+ {
+ if (sa == 0xff && m == 0xff)
+ {
+ *dst = s;
+ }
+ else
+ {
+ __m64 ms, md, ma, msa;
+
+ ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
+ ms = unpack_32_1x64 (s);
+ md = unpack_32_1x64 (d);
+
+ msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
+
+ *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
+ }
+ }
+
+ dst++;
+ w--;
+ }
+ }
+
+ _mm_empty ();
+}
+
+/* A variant of 'core_combine_over_u_sse2' with minor tweaks */
+static force_inline void
+scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
+ const uint32_t* ps,
+ int32_t w,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx)
+{
+ uint32_t s, d;
+ const uint32_t* pm = NULL;
+
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+ /* Align dst on a 16-byte boundary */
+ while (w && ((unsigned long)pd & 15))
+ {
+ d = *pd;
+ s = combine1 (ps + (vx >> 16), pm);
+ vx += unit_x;
+
+ *pd++ = core_combine_over_u_pixel_sse2 (s, d);
+ if (pm)
+ pm++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ __m128i tmp;
+ uint32_t tmp1, tmp2, tmp3, tmp4;
+
+ tmp1 = ps[vx >> 16];
+ vx += unit_x;
+ tmp2 = ps[vx >> 16];
+ vx += unit_x;
+ tmp3 = ps[vx >> 16];
+ vx += unit_x;
+ tmp4 = ps[vx >> 16];
+ vx += unit_x;
+
+ tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
+
+ xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
+
+ if (is_opaque (xmm_src_hi))
+ {
+ save_128_aligned ((__m128i*)pd, xmm_src_hi);
+ }
+ else if (!is_zero (xmm_src_hi))
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (
+ xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+
+ over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ /* rebuid the 4 pixel data and save*/
+ save_128_aligned ((__m128i*)pd,
+ pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ w -= 4;
+ pd += 4;
+ if (pm)
+ pm += 4;
+ }
+
+ while (w)
+ {
+ d = *pd;
+ s = combine1 (ps + (vx >> 16), pm);
+ vx += unit_x;
+
+ *pd++ = core_combine_over_u_pixel_sse2 (s, d);
+ if (pm)
+ pm++;
+
+ w--;
+ }
+ _mm_empty ();
+}
+
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
+ scaled_nearest_scanline_sse2_8888_8888_OVER,
+ uint32_t, uint32_t, COVER)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
+ scaled_nearest_scanline_sse2_8888_8888_OVER,
+ uint32_t, uint32_t, NONE)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
+ scaled_nearest_scanline_sse2_8888_8888_OVER,
+ uint32_t, uint32_t, PAD)
+
+static const pixman_fast_path_t sse2_fast_paths[] =
+{
+ /* PIXMAN_OP_OVER */
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
+ PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
+ PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
+ PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
+ PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
+ PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
+ PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
+ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+
+ /* PIXMAN_OP_OVER_REVERSE */
+ PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
+
+ /* PIXMAN_OP_ADD */
+ PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
+ PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
+
+ /* PIXMAN_OP_SRC */
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
+
+ /* PIXMAN_OP_IN */
+ PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
+ PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
+ PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
+
+ SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+
+ { PIXMAN_OP_NONE },
+};
+
+static pixman_bool_t
+sse2_blt (pixman_implementation_t *imp,
+ uint32_t * src_bits,
+ uint32_t * dst_bits,
+ int src_stride,
+ int dst_stride,
+ int src_bpp,
+ int dst_bpp,
+ int src_x,
+ int src_y,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height)
+{
+ if (!pixman_blt_sse2 (
+ src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+ src_x, src_y, dst_x, dst_y, width, height))
+
+ {
+ return _pixman_implementation_blt (
+ imp->delegate,
+ src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+ src_x, src_y, dst_x, dst_y, width, height);
+ }
+
+ return TRUE;
+}
+
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+static pixman_bool_t
+sse2_fill (pixman_implementation_t *imp,
+ uint32_t * bits,
+ int stride,
+ int bpp,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t xor)
+{
+ if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
+ {
+ return _pixman_implementation_fill (
+ imp->delegate, bits, stride, bpp, x, y, width, height, xor);
+ }
+
+ return TRUE;
+}
+
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+pixman_implementation_t *
+_pixman_implementation_create_sse2 (void)
+{
+#ifdef USE_MMX
+ pixman_implementation_t *fallback = _pixman_implementation_create_mmx ();
+#else
+ pixman_implementation_t *fallback = _pixman_implementation_create_fast_path ();
+#endif
+ pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
+
+ /* SSE2 constants */
+ mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
+ mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
+ mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
+ mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
+ mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
+ mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
+ mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
+ mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
+ mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
+ mask_0080 = create_mask_16_128 (0x0080);
+ mask_00ff = create_mask_16_128 (0x00ff);
+ mask_0101 = create_mask_16_128 (0x0101);
+ mask_ffff = create_mask_16_128 (0xffff);
+ mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
+ mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
+
+ /* MMX constants */
+ mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
+ mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
+
+ mask_x0080 = create_mask_16_64 (0x0080);
+ mask_x00ff = create_mask_16_64 (0x00ff);
+ mask_x0101 = create_mask_16_64 (0x0101);
+ mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
+
+ _mm_empty ();
+
+ /* Set up function pointers */
+
+ /* SSE code patch for fbcompose.c */
+ imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
+ imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
+ imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
+ imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
+ imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
+ imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
+ imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
+ imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
+ imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
+ imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
+
+ imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
+
+ imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
+ imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
+ imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
+ imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
+ imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
+ imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
+ imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
+
+ imp->blt = sse2_blt;
+ imp->fill = sse2_fill;
+
+ return imp;
+}
+
+#endif /* USE_SSE2 */
diff --git a/pixman/test/Makefile.am b/pixman/test/Makefile.am
index 19c4f8006..71e535374 100644
--- a/pixman/test/Makefile.am
+++ b/pixman/test/Makefile.am
@@ -23,40 +23,61 @@ TESTPROGRAMS = \
composite
a1_trap_test_LDADD = $(TEST_LDADD)
+a1_trap_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
+
fetch_test_LDADD = $(TEST_LDADD)
+fetch_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
+
trap_crasher_LDADD = $(TEST_LDADD)
+trap_crasher_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
+
oob_test_LDADD = $(TEST_LDADD)
+oob_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
+
scaling_crash_test_LDADD = $(TEST_LDADD)
+scaling_crash_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
+
region_translate_test_LDADD = $(TEST_LDADD)
+region_translate_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
pdf_op_test_LDADD = $(TEST_LDADD)
+pdf_op_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
pdf_op_test_SOURCES = pdf-op-test.c utils.c utils.h
region_test_LDADD = $(TEST_LDADD)
+region_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
region_test_SOURCES = region-test.c utils.c utils.h
blitters_test_LDADD = $(TEST_LDADD)
+blitters_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
blitters_test_SOURCES = blitters-test.c utils.c utils.h
scaling_test_LDADD = $(TEST_LDADD)
+scaling_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
scaling_test_SOURCES = scaling-test.c utils.c utils.h
affine_test_LDADD = $(TEST_LDADD)
+affine_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
affine_test_SOURCES = affine-test.c utils.c utils.h
alphamap_LDADD = $(TEST_LDADD)
+alphamap_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
alphamap_SOURCES = alphamap.c utils.c utils.h
alpha_loop_LDADD = $(TEST_LDADD)
+alpha_loop_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
alpha_loop_SOURCES = alpha-loop.c utils.c utils.h
composite_LDADD = $(TEST_LDADD)
+composite_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
composite_SOURCES = composite.c utils.c utils.h
gradient_crash_test_LDADD = $(TEST_LDADD)
+gradient_crash_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
gradient_crash_test_SOURCES = gradient-crash-test.c utils.c utils.h
stress_test_LDADD = $(TEST_LDADD)
+stress_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
stress_test_SOURCES = stress-test.c utils.c utils.h
# GTK using test programs