From c4f44c07c6662d1ce08603945ccc4fa5afaa742a Mon Sep 17 00:00:00 2001
From: marha
Date: Mon, 27 Feb 2012 07:24:18 +0100
Subject: fontconfig pixman mesa git update 27 Feb 2012

---
 pixman/configure.ac                    |  48 +++++
 pixman/pixman/Makefile.am              |  16 ++
 pixman/pixman/pixman-cpu.c             |  53 +++++
 pixman/pixman/pixman-mips-dspr2-asm.S  | 205 ++++++++++++++++++
 pixman/pixman/pixman-mips-dspr2-asm.h  | 206 ++++++++++++++++++
 pixman/pixman/pixman-mips-dspr2.c      |  84 ++++++++
 pixman/pixman/pixman-mips-dspr2.h      |  84 ++++++++
 pixman/pixman/pixman-mips-memcpy-asm.S | 382 +++++++++++++++++++++++++++++++++
 pixman/pixman/pixman-mmx.c             | 358 +++++++++++++++---------------
 pixman/pixman/pixman-private.h         |   5 +
 pixman/test/lowlevel-blt-bench.c       |   2 +
 11 files changed, 1274 insertions(+), 169 deletions(-)
 create mode 100644 pixman/pixman/pixman-mips-dspr2-asm.S
 create mode 100644 pixman/pixman/pixman-mips-dspr2-asm.h
 create mode 100644 pixman/pixman/pixman-mips-dspr2.c
 create mode 100644 pixman/pixman/pixman-mips-dspr2.h
 create mode 100644 pixman/pixman/pixman-mips-memcpy-asm.S

diff --git a/pixman/configure.ac b/pixman/configure.ac
index f39f43739..5eeb6a54e 100644
--- a/pixman/configure.ac
+++ b/pixman/configure.ac
@@ -294,6 +294,9 @@ AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
 #if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
 #error "Need GCC >= 3.4 for MMX intrinsics"
 #endif
+#if defined(__clang__)
+#error "clang chokes on the inline assembly in pixman-mmx.c"
+#endif
 #include <mmintrin.h>
 int main () {
     __m64 v = _mm_cvtsi32_si64 (1);
@@ -592,6 +595,51 @@ fi
 AM_CONDITIONAL(USE_ARM_IWMMXT, test $have_iwmmxt_intrinsics = yes)
 
+dnl ==========================================================================
+dnl Check if assembler is gas compatible and supports MIPS DSPr2 instructions
+
+have_mips_dspr2=no
+AC_MSG_CHECKING(whether to use MIPS DSPr2 assembler)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="-mdspr2 $CFLAGS"
+
+AC_COMPILE_IFELSE([[
+#if !(defined(__mips__) && __mips_isa_rev >= 2)
+#error MIPS DSPr2 is currently only available on MIPS32r2 platforms.
+#endif +int +main () +{ + int c = 0, a = 0, b = 0; + __asm__ __volatile__ ( + "precr.qb.ph %[c], %[a], %[b] \n\t" + : [c] "=r" (c) + : [a] "r" (a), [b] "r" (b) + ); + return c; +}]], have_mips_dspr2=yes) +CFLAGS=$xserver_save_CFLAGS + +AC_ARG_ENABLE(mips-dspr2, + [AC_HELP_STRING([--disable-mips-dspr2], + [disable MIPS DSPr2 fast paths])], + [enable_mips_dspr2=$enableval], [enable_mips_dspr2=auto]) + +if test $enable_mips_dspr2 = no ; then + have_mips_dspr2=disabled +fi + +if test $have_mips_dspr2 = yes ; then + AC_DEFINE(USE_MIPS_DSPR2, 1, [use MIPS DSPr2 assembly optimizations]) +fi + +AM_CONDITIONAL(USE_MIPS_DSPR2, test $have_mips_dspr2 = yes) + +AC_MSG_RESULT($have_mips_dspr2) +if test $enable_mips_dspr2 = yes && test $have_mips_dspr2 = no ; then + AC_MSG_ERROR([MIPS DSPr2 instructions not detected]) +fi + dnl ========================================================================================= dnl Check for GNU-style inline assembly support diff --git a/pixman/pixman/Makefile.am b/pixman/pixman/Makefile.am index 286b7cf36..fb7e04723 100644 --- a/pixman/pixman/Makefile.am +++ b/pixman/pixman/Makefile.am @@ -102,5 +102,21 @@ libpixman_1_la_LIBADD += libpixman-iwmmxt.la ASM_CFLAGS_IWMMXT=$(IWMMXT_CFLAGS) endif +# mips dspr2 code +if USE_MIPS_DSPR2 +noinst_LTLIBRARIES += libpixman-mips-dspr2.la +libpixman_mips_dspr2_la_SOURCES = \ + pixman-mips-dspr2.c \ + pixman-mips-dspr2.h \ + pixman-mips-dspr2-asm.S \ + pixman-mips-dspr2-asm.h \ + pixman-mips-memcpy-asm.S +libpixman_mips_dspr2_la_CFLAGS = $(DEP_CFLAGS) +libpixman_mips_dspr2_la_LIBADD = $(DEP_LIBS) +libpixman_1_la_LIBADD += libpixman-mips-dspr2.la + +ASM_CFLAGS_mips_dspr2= +endif + .c.s : $(libpixmaninclude_HEADERS) $(BUILT_SOURCES) $(CC) $(CFLAGS) $(ASM_CFLAGS_$(@:pixman-%.s=%)) $(ASM_CFLAGS_$(@:pixman-arm-%.s=arm_%)) -DHAVE_CONFIG_H -I$(srcdir) -I$(builddir) -I$(top_builddir) -S -o $@ $< diff --git a/pixman/pixman/pixman-cpu.c b/pixman/pixman/pixman-cpu.c index 92942b217..fcf591a99 100644 --- a/pixman/pixman/pixman-cpu.c +++ b/pixman/pixman/pixman-cpu.c @@ -427,6 +427,54 @@ pixman_have_arm_iwmmxt (void) #endif /* USE_ARM_SIMD || USE_ARM_NEON || USE_ARM_IWMMXT */ +#if defined(USE_MIPS_DSPR2) + +#if defined (__linux__) /* linux ELF */ + +pixman_bool_t +pixman_have_mips_dspr2 (void) +{ + const char *search_string = "MIPS 74K"; + const char *file_name = "/proc/cpuinfo"; + /* Simple detection of MIPS DSP ASE (revision 2) at runtime for Linux. + * It is based on /proc/cpuinfo, which reveals hardware configuration + * to user-space applications. According to MIPS (early 2010), no similar + * facility is universally available on the MIPS architectures, so it's up + * to individual OSes to provide such. + * + * Only currently available MIPS core that supports DSPr2 is 74K. + */ + + char cpuinfo_line[256]; + + FILE *f = NULL; + + if ((f = fopen (file_name, "r")) == NULL) + return FALSE; + + while (fgets (cpuinfo_line, sizeof (cpuinfo_line), f) != NULL) + { + if (strstr (cpuinfo_line, search_string) != NULL) + { + fclose (f); + return TRUE; + } + } + + fclose (f); + + /* Did not find string in the proc file. 
*/ + return FALSE; +} + +#else /* linux ELF */ + +#define pixman_have_mips_dspr2() FALSE + +#endif /* linux ELF */ + +#endif /* USE_MIPS_DSPR2 */ + #if defined(USE_X86_MMX) || defined(USE_SSE2) /* The CPU detection code needs to be in a file not compiled with * "-mmmx -msse", as gcc would generate CMOV instructions otherwise @@ -696,6 +744,11 @@ _pixman_choose_implementation (void) imp = _pixman_implementation_create_arm_neon (imp); #endif +#ifdef USE_MIPS_DSPR2 + if (pixman_have_mips_dspr2 ()) + imp = _pixman_implementation_create_mips_dspr2 (imp); +#endif + #ifdef USE_VMX if (pixman_have_vmx ()) imp = _pixman_implementation_create_vmx (imp); diff --git a/pixman/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman/pixman-mips-dspr2-asm.S new file mode 100644 index 000000000..0a4c87e37 --- /dev/null +++ b/pixman/pixman/pixman-mips-dspr2-asm.S @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * Author: Nemanja Lukic (nlukic@mips.com) + */ + +#include "pixman-mips-dspr2-asm.h" + +LEAF_MIPS_DSPR2(pixman_composite_src_8888_0565_asm_mips) +/* + * a0 - dst (r5g6b5) + * a1 - src (a8r8g8b8) + * a2 - w + */ + + beqz a2, 3f + nop + addiu t1, a2, -1 + beqz t1, 2f + nop + li t4, 0xf800f800 + li t5, 0x07e007e0 + li t6, 0x001f001f +1: + lw t0, 0(a1) + lw t1, 4(a1) + addiu a1, a1, 8 + addiu a2, a2, -2 + + CONVERT_2x8888_TO_2x0565 t0, t1, t2, t3, t4, t5, t6, t7, t8 + + sh t2, 0(a0) + sh t3, 2(a0) + + addiu t2, a2, -1 + bgtz t2, 1b + addiu a0, a0, 4 +2: + beqz a2, 3f + nop + lw t0, 0(a1) + + CONVERT_1x8888_TO_1x0565 t0, t1, t2, t3 + + sh t1, 0(a0) +3: + j ra + nop + +END(pixman_composite_src_8888_0565_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_src_0565_8888_asm_mips) +/* + * a0 - dst (a8r8g8b8) + * a1 - src (r5g6b5) + * a2 - w + */ + + beqz a2, 3f + nop + addiu t1, a2, -1 + beqz t1, 2f + nop + li t4, 0x07e007e0 + li t5, 0x001F001F +1: + lhu t0, 0(a1) + lhu t1, 2(a1) + addiu a1, a1, 4 + addiu a2, a2, -2 + + CONVERT_2x0565_TO_2x8888 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9 + + sw t2, 0(a0) + sw t3, 4(a0) + + addiu t2, a2, -1 + bgtz t2, 1b + addiu a0, a0, 8 +2: + beqz a2, 3f + nop + lhu t0, 0(a1) + + CONVERT_1x0565_TO_1x8888 t0, t1, t2, t3 + + sw t1, 0(a0) +3: + j ra + nop + +END(pixman_composite_src_0565_8888_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_src_x888_8888_asm_mips) +/* + * a0 - dst (a8r8g8b8) + * a1 - src (x8r8g8b8) + * a2 - w + */ + + beqz a2, 4f + nop + li t9, 0xff000000 + srl t8, a2, 3 /* t1 = how many multiples of 8 src pixels */ + beqz t8, 3f /* branch if less than 8 src pixels */ + nop +1: + addiu t8, t8, -1 + beqz t8, 2f + addiu a2, a2, -8 + pref 0, 32(a1) + lw t0, 0(a1) + lw t1, 4(a1) + lw t2, 8(a1) + lw t3, 12(a1) + lw t4, 16(a1) + lw t5, 20(a1) + lw t6, 24(a1) + lw t7, 28(a1) + addiu a1, a1, 32 + or t0, t0, t9 + or t1, t1, t9 + or t2, t2, t9 + or t3, t3, t9 + or t4, t4, t9 + or t5, t5, t9 + or t6, t6, t9 + or t7, t7, t9 + pref 30, 32(a0) + sw t0, 0(a0) + sw t1, 4(a0) + sw t2, 8(a0) + sw t3, 12(a0) + sw t4, 16(a0) + sw t5, 20(a0) + sw t6, 24(a0) + sw t7, 28(a0) + b 1b + addiu a0, a0, 32 +2: + lw t0, 0(a1) + lw t1, 4(a1) + lw t2, 8(a1) + lw t3, 12(a1) + lw t4, 16(a1) + lw t5, 20(a1) + lw t6, 24(a1) + lw t7, 28(a1) + addiu a1, a1, 32 + or t0, t0, t9 + or t1, t1, t9 + or t2, t2, t9 + or t3, t3, t9 + or t4, t4, t9 + or t5, t5, t9 + or t6, t6, t9 + or t7, t7, t9 + sw t0, 0(a0) + sw t1, 4(a0) + sw t2, 8(a0) + sw t3, 12(a0) + sw t4, 16(a0) + sw t5, 20(a0) + sw t6, 24(a0) + sw t7, 28(a0) + beqz a2, 4f + addiu a0, a0, 32 +3: + lw t0, 0(a1) + addiu a1, a1, 4 + addiu a2, a2, -1 + or t1, t0, t9 + sw t1, 0(a0) + bnez a2, 3b + addiu a0, a0, 4 +4: + jr ra + nop + +END(pixman_composite_src_x888_8888_asm_mips) diff --git a/pixman/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman/pixman-mips-dspr2-asm.h new file mode 100644 index 000000000..e07cda470 --- /dev/null +++ b/pixman/pixman/pixman-mips-dspr2-asm.h @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Author: Nemanja Lukic (nlukic@mips.com) + */ + +#ifndef PIXMAN_MIPS_DSPR2_ASM_H +#define PIXMAN_MIPS_DSPR2_ASM_H + +#define zero $0 +#define AT $1 +#define v0 $2 +#define v1 $3 +#define a0 $4 +#define a1 $5 +#define a2 $6 +#define a3 $7 +#define t0 $8 +#define t1 $9 +#define t2 $10 +#define t3 $11 +#define t4 $12 +#define t5 $13 +#define t6 $14 +#define t7 $15 +#define s0 $16 +#define s1 $17 +#define s2 $18 +#define s3 $19 +#define s4 $20 +#define s5 $21 +#define s6 $22 +#define s7 $23 +#define t8 $24 +#define t9 $25 +#define k0 $26 +#define k1 $27 +#define gp $28 +#define sp $29 +#define fp $30 +#define s8 $30 +#define ra $31 + +/* + * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2 + */ +#define LEAF_MIPS32R2(symbol) \ + .globl symbol; \ + .align 2; \ + .type symbol, @function; \ + .ent symbol, 0; \ +symbol: .frame sp, 0, ra; \ + .set push; \ + .set arch=mips32r2; \ + .set noreorder; \ + .set noat; + +/* + * LEAF_MIPS32R2 - declare leaf routine for MIPS DSPr2 + */ +#define LEAF_MIPS_DSPR2(symbol) \ +LEAF_MIPS32R2(symbol) \ + .set dspr2; + +/* + * END - mark end of function + */ +#define END(function) \ + .set pop; \ + .end function; \ + .size function,.-function + +/* + * Conversion of single r5g6b5 pixel (in_565) to single a8r8g8b8 pixel + * returned in (out_8888) register. Requires two temporary registers + * (scratch1 and scratch2). + */ +.macro CONVERT_1x0565_TO_1x8888 in_565, \ + out_8888, \ + scratch1, scratch2 + lui \out_8888, 0xff00 + sll \scratch1, \in_565, 0x3 + andi \scratch2, \scratch1, 0xff + ext \scratch1, \in_565, 0x2, 0x3 + or \scratch1, \scratch2, \scratch1 + or \out_8888, \out_8888, \scratch1 + + sll \scratch1, \in_565, 0x5 + andi \scratch1, \scratch1, 0xfc00 + srl \scratch2, \in_565, 0x1 + andi \scratch2, \scratch2, 0x300 + or \scratch2, \scratch1, \scratch2 + or \out_8888, \out_8888, \scratch2 + + andi \scratch1, \in_565, 0xf800 + srl \scratch2, \scratch1, 0x5 + andi \scratch2, \scratch2, 0xff00 + or \scratch1, \scratch1, \scratch2 + sll \scratch1, \scratch1, 0x8 + or \out_8888, \out_8888, \scratch1 +.endm + +/* + * Conversion of two r5g6b5 pixels (in1_565 and in2_565) to two a8r8g8b8 pixels + * returned in (out1_8888 and out2_8888) registers. Requires four scratch + * registers (scratch1 ... scratch4). It also requires maskG and maskB for + * color component extractions. 
These masks must have following values: + * li maskG, 0x07e007e0 + * li maskB, 0x001F001F + */ +.macro CONVERT_2x0565_TO_2x8888 in1_565, in2_565, \ + out1_8888, out2_8888, \ + maskG, maskB, \ + scratch1, scratch2, scratch3, scratch4 + sll \scratch1, \in1_565, 16 + or \scratch1, \scratch1, \in2_565 + lui \out2_8888, 0xff00 + ori \out2_8888, \out2_8888, 0xff00 + shrl.ph \scratch2, \scratch1, 11 + and \scratch3, \scratch1, \maskG + shra.ph \scratch4, \scratch2, 2 + shll.ph \scratch2, \scratch2, 3 + shll.ph \scratch3, \scratch3, 5 + or \scratch2, \scratch2, \scratch4 + shrl.qb \scratch4, \scratch3, 6 + or \out2_8888, \out2_8888, \scratch2 + or \scratch3, \scratch3, \scratch4 + and \scratch1, \scratch1, \maskB + shll.ph \scratch2, \scratch1, 3 + shra.ph \scratch4, \scratch1, 2 + or \scratch2, \scratch2, \scratch4 + or \scratch3, \scratch2, \scratch3 + precrq.ph.w \out1_8888, \out2_8888, \scratch3 + precr_sra.ph.w \out2_8888, \scratch3, 0 +.endm + +/* + * Conversion of single a8r8g8b8 pixel (in_8888) to single r5g6b5 pixel + * returned in (out_565) register. Requires two temporary registers + * (scratch1 and scratch2). + */ +.macro CONVERT_1x8888_TO_1x0565 in_8888, \ + out_565, \ + scratch1, scratch2 + ext \out_565, \in_8888, 0x3, 0x5 + srl \scratch1, \in_8888, 0x5 + andi \scratch1, \scratch1, 0x07e0 + srl \scratch2, \in_8888, 0x8 + andi \scratch2, \scratch2, 0xf800 + or \out_565, \out_565, \scratch1 + or \out_565, \out_565, \scratch2 +.endm + +/* + * Conversion of two a8r8g8b8 pixels (in1_8888 and in2_8888) to two r5g6b5 + * pixels returned in (out1_565 and out2_565) registers. Requires two temporary + * registers (scratch1 and scratch2). It also requires maskR, maskG and maskB + * for color component extractions. These masks must have following values: + * li maskR, 0xf800f800 + * li maskG, 0x07e007e0 + * li maskB, 0x001F001F + * Value of input register in2_8888 is lost. + */ +.macro CONVERT_2x8888_TO_2x0565 in1_8888, in2_8888, \ + out1_565, out2_565, \ + maskR, maskG, maskB, \ + scratch1, scratch2 + precrq.ph.w \scratch1, \in2_8888, \in1_8888 + precr_sra.ph.w \in2_8888, \in1_8888, 0 + shll.ph \scratch1, \scratch1, 8 + srl \in2_8888, \in2_8888, 3 + and \scratch2, \in2_8888, \maskB + and \scratch1, \scratch1, \maskR + srl \in2_8888, \in2_8888, 2 + and \out2_565, \in2_8888, \maskG + or \out2_565, \out2_565, \scratch2 + or \out1_565, \out2_565, \scratch1 + srl \out2_565, \out1_565, 16 +.endm + +#endif //PIXMAN_MIPS_DSPR2_ASM_H diff --git a/pixman/pixman/pixman-mips-dspr2.c b/pixman/pixman/pixman-mips-dspr2.c new file mode 100644 index 000000000..e331853b7 --- /dev/null +++ b/pixman/pixman/pixman-mips-dspr2.c @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. 
``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author: Nemanja Lukic (nlukic@mips.com)
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "pixman-private.h"
+#include "pixman-mips-dspr2.h"
+
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_x888_8888,
+                                    uint32_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_8888_0565,
+                                    uint32_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_0565_8888,
+                                    uint16_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_0565_0565,
+                                    uint16_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_8888_8888,
+                                    uint32_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_0888_0888,
+                                    uint8_t, 3, uint8_t, 3)
+
+static const pixman_fast_path_t mips_dspr2_fast_paths[] =
+{
+    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, mips_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, mips_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, mips_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, mips_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, mips_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, mips_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, a8r8g8b8, mips_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, x8r8g8b8, mips_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, a8b8g8r8, mips_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, x8b8g8r8, mips_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, mips_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, mips_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, mips_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, mips_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, mips_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mips_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, mips_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, mips_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, mips_composite_src_0888_0888),
+
+    { PIXMAN_OP_NONE },
+};
+
+pixman_implementation_t *
+_pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp =
+        _pixman_implementation_create (fallback, mips_dspr2_fast_paths);
+
+    return imp;
+}
diff --git a/pixman/pixman/pixman-mips-dspr2.h b/pixman/pixman/pixman-mips-dspr2.h
new file mode 100644
index 000000000..449c42a56
--- /dev/null
+++ b/pixman/pixman/pixman-mips-dspr2.h
@@ -0,0 +1,84 @@
+/*
+ *
Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Author: Nemanja Lukic (nlukic@mips.com) + */ + +#ifndef PIXMAN_MIPS_DSPR2_H +#define PIXMAN_MIPS_DSPR2_H + +#include "pixman-private.h" +#include "pixman-inlines.h" + +#define SKIP_ZERO_SRC 1 +#define SKIP_ZERO_MASK 2 +#define DO_FAST_MEMCPY 3 + +void +pixman_mips_fast_memcpy (void *dst, void *src, uint32_t n_bytes); + +/****************************************************************/ + +#define PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST(flags, name, \ + src_type, src_cnt, \ + dst_type, dst_cnt) \ +void \ +pixman_composite_##name##_asm_mips (dst_type *dst, \ + src_type *src, \ + int32_t w); \ + \ +static void \ +mips_composite_##name (pixman_implementation_t *imp, \ + pixman_composite_info_t *info) \ +{ \ + PIXMAN_COMPOSITE_ARGS (info); \ + dst_type *dst_line, *dst; \ + src_type *src_line, *src; \ + int32_t dst_stride, src_stride; \ + int bpp = PIXMAN_FORMAT_BPP (dest_image->bits.format) / 8; \ + \ + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type, \ + src_stride, src_line, src_cnt); \ + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type, \ + dst_stride, dst_line, dst_cnt); \ + \ + while (height--) \ + { \ + dst = dst_line; \ + dst_line += dst_stride; \ + src = src_line; \ + src_line += src_stride; \ + \ + if (flags == DO_FAST_MEMCPY) \ + pixman_mips_fast_memcpy (dst, src, width * bpp); \ + else \ + pixman_composite_##name##_asm_mips (dst, src, width); \ + } \ +} + +#endif //PIXMAN_MIPS_DSPR2_H diff --git a/pixman/pixman/pixman-mips-memcpy-asm.S b/pixman/pixman/pixman-mips-memcpy-asm.S new file mode 100644 index 000000000..9ad6da537 --- /dev/null +++ b/pixman/pixman/pixman-mips-memcpy-asm.S @@ -0,0 +1,382 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "pixman-mips-dspr2-asm.h" + +/* + * This routine could be optimized for MIPS64. The current code only + * uses MIPS32 instructions. + */ + +#ifdef EB +# define LWHI lwl /* high part is left in big-endian */ +# define SWHI swl /* high part is left in big-endian */ +# define LWLO lwr /* low part is right in big-endian */ +# define SWLO swr /* low part is right in big-endian */ +#else +# define LWHI lwr /* high part is right in little-endian */ +# define SWHI swr /* high part is right in little-endian */ +# define LWLO lwl /* low part is left in big-endian */ +# define SWLO swl /* low part is left in big-endian */ +#endif + +LEAF_MIPS32R2(pixman_mips_fast_memcpy) + + slti AT, a2, 8 + bne AT, zero, $last8 + move v0, a0 /* memcpy returns the dst pointer */ + +/* Test if the src and dst are word-aligned, or can be made word-aligned */ + xor t8, a1, a0 + andi t8, t8, 0x3 /* t8 is a0/a1 word-displacement */ + + bne t8, zero, $unaligned + negu a3, a0 + + andi a3, a3, 0x3 /* we need to copy a3 bytes to make a0/a1 aligned */ + beq a3, zero, $chk16w /* when a3=0 then the dst (a0) is word-aligned */ + subu a2, a2, a3 /* now a2 is the remining bytes count */ + + LWHI t8, 0(a1) + addu a1, a1, a3 + SWHI t8, 0(a0) + addu a0, a0, a3 + +/* Now the dst/src are mutually word-aligned with word-aligned addresses */ +$chk16w: andi t8, a2, 0x3f /* any whole 64-byte chunks? 
*/ + /* t8 is the byte count after 64-byte chunks */ + + beq a2, t8, $chk8w /* if a2==t8, no 64-byte chunks */ + /* There will be at most 1 32-byte chunk after it */ + subu a3, a2, t8 /* subtract from a2 the reminder */ + /* Here a3 counts bytes in 16w chunks */ + addu a3, a0, a3 /* Now a3 is the final dst after 64-byte chunks */ + + addu t0, a0, a2 /* t0 is the "past the end" address */ + +/* + * When in the loop we exercise "pref 30, x(a0)", the a0+x should not be past + * the "t0-32" address + * This means: for x=128 the last "safe" a0 address is "t0-160" + * Alternatively, for x=64 the last "safe" a0 address is "t0-96" + * In the current version we use "pref 30, 128(a0)", so "t0-160" is the limit + */ + subu t9, t0, 160 /* t9 is the "last safe pref 30, 128(a0)" address */ + + pref 0, 0(a1) /* bring the first line of src, addr 0 */ + pref 0, 32(a1) /* bring the second line of src, addr 32 */ + pref 0, 64(a1) /* bring the third line of src, addr 64 */ + pref 30, 32(a0) /* safe, as we have at least 64 bytes ahead */ +/* In case the a0 > t9 don't use "pref 30" at all */ + sgtu v1, a0, t9 + bgtz v1, $loop16w /* skip "pref 30, 64(a0)" for too short arrays */ + nop +/* otherwise, start with using pref30 */ + pref 30, 64(a0) +$loop16w: + pref 0, 96(a1) + lw t0, 0(a1) + bgtz v1, $skip_pref30_96 /* skip "pref 30, 96(a0)" */ + lw t1, 4(a1) + pref 30, 96(a0) /* continue setting up the dest, addr 96 */ +$skip_pref30_96: + lw t2, 8(a1) + lw t3, 12(a1) + lw t4, 16(a1) + lw t5, 20(a1) + lw t6, 24(a1) + lw t7, 28(a1) + pref 0, 128(a1) /* bring the next lines of src, addr 128 */ + + sw t0, 0(a0) + sw t1, 4(a0) + sw t2, 8(a0) + sw t3, 12(a0) + sw t4, 16(a0) + sw t5, 20(a0) + sw t6, 24(a0) + sw t7, 28(a0) + + lw t0, 32(a1) + bgtz v1, $skip_pref30_128 /* skip "pref 30, 128(a0)" */ + lw t1, 36(a1) + pref 30, 128(a0) /* continue setting up the dest, addr 128 */ +$skip_pref30_128: + lw t2, 40(a1) + lw t3, 44(a1) + lw t4, 48(a1) + lw t5, 52(a1) + lw t6, 56(a1) + lw t7, 60(a1) + pref 0, 160(a1) /* bring the next lines of src, addr 160 */ + + sw t0, 32(a0) + sw t1, 36(a0) + sw t2, 40(a0) + sw t3, 44(a0) + sw t4, 48(a0) + sw t5, 52(a0) + sw t6, 56(a0) + sw t7, 60(a0) + + addiu a0, a0, 64 /* adding 64 to dest */ + sgtu v1, a0, t9 + bne a0, a3, $loop16w + addiu a1, a1, 64 /* adding 64 to src */ + move a2, t8 + +/* Here we have src and dest word-aligned but less than 64-bytes to go */ + +$chk8w: + pref 0, 0x0(a1) + andi t8, a2, 0x1f /* is there a 32-byte chunk? */ + /* the t8 is the reminder count past 32-bytes */ + beq a2, t8, $chk1w /* when a2=t8, no 32-byte chunk */ + nop + + lw t0, 0(a1) + lw t1, 4(a1) + lw t2, 8(a1) + lw t3, 12(a1) + lw t4, 16(a1) + lw t5, 20(a1) + lw t6, 24(a1) + lw t7, 28(a1) + addiu a1, a1, 32 + + sw t0, 0(a0) + sw t1, 4(a0) + sw t2, 8(a0) + sw t3, 12(a0) + sw t4, 16(a0) + sw t5, 20(a0) + sw t6, 24(a0) + sw t7, 28(a0) + addiu a0, a0, 32 + +$chk1w: + andi a2, t8, 0x3 /* now a2 is the reminder past 1w chunks */ + beq a2, t8, $last8 + subu a3, t8, a2 /* a3 is count of bytes in 1w chunks */ + addu a3, a0, a3 /* now a3 is the dst address past the 1w chunks */ + +/* copying in words (4-byte chunks) */ +$wordCopy_loop: + lw t3, 0(a1) /* the first t3 may be equal t0 ... optimize? 
*/ + addiu a1, a1, 4 + addiu a0, a0, 4 + bne a0, a3, $wordCopy_loop + sw t3, -4(a0) + +/* For the last (<8) bytes */ +$last8: + blez a2, leave + addu a3, a0, a2 /* a3 is the last dst address */ +$last8loop: + lb v1, 0(a1) + addiu a1, a1, 1 + addiu a0, a0, 1 + bne a0, a3, $last8loop + sb v1, -1(a0) + +leave: j ra + nop + +/* + * UNALIGNED case + */ + +$unaligned: + /* got here with a3="negu a0" */ + andi a3, a3, 0x3 /* test if the a0 is word aligned */ + beqz a3, $ua_chk16w + subu a2, a2, a3 /* bytes left after initial a3 bytes */ + + LWHI v1, 0(a1) + LWLO v1, 3(a1) + addu a1, a1, a3 /* a3 may be here 1, 2 or 3 */ + SWHI v1, 0(a0) + addu a0, a0, a3 /* below the dst will be word aligned (NOTE1) */ + +$ua_chk16w: andi t8, a2, 0x3f /* any whole 64-byte chunks? */ + /* t8 is the byte count after 64-byte chunks */ + beq a2, t8, $ua_chk8w /* if a2==t8, no 64-byte chunks */ + /* There will be at most 1 32-byte chunk after it */ + subu a3, a2, t8 /* subtract from a2 the reminder */ + /* Here a3 counts bytes in 16w chunks */ + addu a3, a0, a3 /* Now a3 is the final dst after 64-byte chunks */ + + addu t0, a0, a2 /* t0 is the "past the end" address */ + + subu t9, t0, 160 /* t9 is the "last safe pref 30, 128(a0)" address */ + + pref 0, 0(a1) /* bring the first line of src, addr 0 */ + pref 0, 32(a1) /* bring the second line of src, addr 32 */ + pref 0, 64(a1) /* bring the third line of src, addr 64 */ + pref 30, 32(a0) /* safe, as we have at least 64 bytes ahead */ +/* In case the a0 > t9 don't use "pref 30" at all */ + sgtu v1, a0, t9 + bgtz v1, $ua_loop16w /* skip "pref 30, 64(a0)" for too short arrays */ + nop +/* otherwise, start with using pref30 */ + pref 30, 64(a0) +$ua_loop16w: + pref 0, 96(a1) + LWHI t0, 0(a1) + LWLO t0, 3(a1) + LWHI t1, 4(a1) + bgtz v1, $ua_skip_pref30_96 + LWLO t1, 7(a1) + pref 30, 96(a0) /* continue setting up the dest, addr 96 */ +$ua_skip_pref30_96: + LWHI t2, 8(a1) + LWLO t2, 11(a1) + LWHI t3, 12(a1) + LWLO t3, 15(a1) + LWHI t4, 16(a1) + LWLO t4, 19(a1) + LWHI t5, 20(a1) + LWLO t5, 23(a1) + LWHI t6, 24(a1) + LWLO t6, 27(a1) + LWHI t7, 28(a1) + LWLO t7, 31(a1) + pref 0, 128(a1) /* bring the next lines of src, addr 128 */ + + sw t0, 0(a0) + sw t1, 4(a0) + sw t2, 8(a0) + sw t3, 12(a0) + sw t4, 16(a0) + sw t5, 20(a0) + sw t6, 24(a0) + sw t7, 28(a0) + + LWHI t0, 32(a1) + LWLO t0, 35(a1) + LWHI t1, 36(a1) + bgtz v1, $ua_skip_pref30_128 + LWLO t1, 39(a1) + pref 30, 128(a0) /* continue setting up the dest, addr 128 */ +$ua_skip_pref30_128: + LWHI t2, 40(a1) + LWLO t2, 43(a1) + LWHI t3, 44(a1) + LWLO t3, 47(a1) + LWHI t4, 48(a1) + LWLO t4, 51(a1) + LWHI t5, 52(a1) + LWLO t5, 55(a1) + LWHI t6, 56(a1) + LWLO t6, 59(a1) + LWHI t7, 60(a1) + LWLO t7, 63(a1) + pref 0, 160(a1) /* bring the next lines of src, addr 160 */ + + sw t0, 32(a0) + sw t1, 36(a0) + sw t2, 40(a0) + sw t3, 44(a0) + sw t4, 48(a0) + sw t5, 52(a0) + sw t6, 56(a0) + sw t7, 60(a0) + + addiu a0, a0, 64 /* adding 64 to dest */ + sgtu v1, a0, t9 + bne a0, a3, $ua_loop16w + addiu a1, a1, 64 /* adding 64 to src */ + move a2, t8 + +/* Here we have src and dest word-aligned but less than 64-bytes to go */ + +$ua_chk8w: + pref 0, 0x0(a1) + andi t8, a2, 0x1f /* is there a 32-byte chunk? 
*/ + /* the t8 is the reminder count */ + beq a2, t8, $ua_chk1w /* when a2=t8, no 32-byte chunk */ + + LWHI t0, 0(a1) + LWLO t0, 3(a1) + LWHI t1, 4(a1) + LWLO t1, 7(a1) + LWHI t2, 8(a1) + LWLO t2, 11(a1) + LWHI t3, 12(a1) + LWLO t3, 15(a1) + LWHI t4, 16(a1) + LWLO t4, 19(a1) + LWHI t5, 20(a1) + LWLO t5, 23(a1) + LWHI t6, 24(a1) + LWLO t6, 27(a1) + LWHI t7, 28(a1) + LWLO t7, 31(a1) + addiu a1, a1, 32 + + sw t0, 0(a0) + sw t1, 4(a0) + sw t2, 8(a0) + sw t3, 12(a0) + sw t4, 16(a0) + sw t5, 20(a0) + sw t6, 24(a0) + sw t7, 28(a0) + addiu a0, a0, 32 + +$ua_chk1w: + andi a2, t8, 0x3 /* now a2 is the reminder past 1w chunks */ + beq a2, t8, $ua_smallCopy + subu a3, t8, a2 /* a3 is count of bytes in 1w chunks */ + addu a3, a0, a3 /* now a3 is the dst address past the 1w chunks */ + +/* copying in words (4-byte chunks) */ +$ua_wordCopy_loop: + LWHI v1, 0(a1) + LWLO v1, 3(a1) + addiu a1, a1, 4 + addiu a0, a0, 4 /* note: dst=a0 is word aligned here, see NOTE1 */ + bne a0, a3, $ua_wordCopy_loop + sw v1, -4(a0) + +/* Now less than 4 bytes (value in a2) left to copy */ +$ua_smallCopy: + beqz a2, leave + addu a3, a0, a2 /* a3 is the last dst address */ +$ua_smallCopy_loop: + lb v1, 0(a1) + addiu a1, a1, 1 + addiu a0, a0, 1 + bne a0, a3, $ua_smallCopy_loop + sb v1, -1(a0) + + j ra + nop + +END(pixman_mips_fast_memcpy) diff --git a/pixman/pixman/pixman-mmx.c b/pixman/pixman/pixman-mmx.c index a3500ce7d..bd44f639e 100644 --- a/pixman/pixman/pixman-mmx.c +++ b/pixman/pixman/pixman-mmx.c @@ -345,9 +345,16 @@ static __inline__ uint32_t ldl_u(uint32_t *p) } static force_inline __m64 -load8888 (uint32_t v) +load8888 (const uint32_t *v) { - return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64 ()); + return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (*v), _mm_setzero_si64 ()); +} + +static force_inline __m64 +load8888u (const uint32_t *v) +{ + uint32_t l = ldl_u(v); + return load8888(&l); } static force_inline __m64 @@ -356,10 +363,11 @@ pack8888 (__m64 lo, __m64 hi) return _mm_packs_pu16 (lo, hi); } -static force_inline uint32_t -store8888 (__m64 v) +static force_inline void +store8888 (uint32_t *dest, __m64 v) { - return _mm_cvtsi64_si32 (pack8888 (v, _mm_setzero_si64 ())); + v = pack8888 (v, _mm_setzero_si64()); + *dest = _mm_cvtsi64_si32 (v); } /* Expand 16 bits positioned at @pos (0-3) of a mmx register into @@ -470,13 +478,13 @@ combine (const uint32_t *src, const uint32_t *mask) if (mask) { - __m64 m = load8888 (*mask); - __m64 s = load8888 (ssrc); + __m64 m = load8888 (mask); + __m64 s = load8888 (&ssrc); m = expand_alpha (m); s = pix_multiply (s, m); - ssrc = store8888 (s); + store8888 (&ssrc, s); } return ssrc; @@ -504,9 +512,9 @@ mmx_combine_over_u (pixman_implementation_t *imp, else if (ssrc) { __m64 s, sa; - s = load8888 (ssrc); + s = load8888 (&ssrc); sa = expand_alpha (s); - *dest = store8888 (over (s, sa, load8888 (*dest))); + store8888 (dest, over (s, sa, load8888 (dest))); } ++dest; @@ -532,9 +540,9 @@ mmx_combine_over_reverse_u (pixman_implementation_t *imp, __m64 d, da; uint32_t s = combine (src, mask); - d = load8888 (*dest); + d = load8888 (dest); da = expand_alpha (d); - *dest = store8888 (over (d, da, load8888 (s))); + store8888 (dest, over (d, da, load8888 (&s))); ++dest; ++src; @@ -557,13 +565,14 @@ mmx_combine_in_u (pixman_implementation_t *imp, while (dest < end) { __m64 x, a; + uint32_t ssrc = combine (src, mask); - x = load8888 (combine (src, mask)); - a = load8888 (*dest); + x = load8888 (&ssrc); + a = load8888 (dest); a = expand_alpha (a); x = pix_multiply (x, a); - *dest = 
store8888 (x); + store8888 (dest, x); ++dest; ++src; @@ -586,12 +595,13 @@ mmx_combine_in_reverse_u (pixman_implementation_t *imp, while (dest < end) { __m64 x, a; + uint32_t ssrc = combine (src, mask); - x = load8888 (*dest); - a = load8888 (combine (src, mask)); + x = load8888 (dest); + a = load8888 (&ssrc); a = expand_alpha (a); x = pix_multiply (x, a); - *dest = store8888 (x); + store8888 (dest, x); ++dest; ++src; @@ -614,13 +624,14 @@ mmx_combine_out_u (pixman_implementation_t *imp, while (dest < end) { __m64 x, a; + uint32_t ssrc = combine (src, mask); - x = load8888 (combine (src, mask)); - a = load8888 (*dest); + x = load8888 (&ssrc); + a = load8888 (dest); a = expand_alpha (a); a = negate (a); x = pix_multiply (x, a); - *dest = store8888 (x); + store8888 (dest, x); ++dest; ++src; @@ -643,14 +654,15 @@ mmx_combine_out_reverse_u (pixman_implementation_t *imp, while (dest < end) { __m64 x, a; + uint32_t ssrc = combine (src, mask); - x = load8888 (*dest); - a = load8888 (combine (src, mask)); + x = load8888 (dest); + a = load8888 (&ssrc); a = expand_alpha (a); a = negate (a); x = pix_multiply (x, a); - *dest = store8888 (x); + store8888 (dest, x); ++dest; ++src; @@ -673,14 +685,15 @@ mmx_combine_atop_u (pixman_implementation_t *imp, while (dest < end) { __m64 s, da, d, sia; + uint32_t ssrc = combine (src, mask); - s = load8888 (combine (src, mask)); - d = load8888 (*dest); + s = load8888 (&ssrc); + d = load8888 (dest); sia = expand_alpha (s); sia = negate (sia); da = expand_alpha (d); s = pix_add_mul (s, da, d, sia); - *dest = store8888 (s); + store8888 (dest, s); ++dest; ++src; @@ -705,14 +718,15 @@ mmx_combine_atop_reverse_u (pixman_implementation_t *imp, while (dest < end) { __m64 s, dia, d, sa; + uint32_t ssrc = combine (src, mask); - s = load8888 (combine (src, mask)); - d = load8888 (*dest); + s = load8888 (&ssrc); + d = load8888 (dest); sa = expand_alpha (s); dia = expand_alpha (d); dia = negate (dia); s = pix_add_mul (s, dia, d, sa); - *dest = store8888 (s); + store8888 (dest, s); ++dest; ++src; @@ -735,15 +749,16 @@ mmx_combine_xor_u (pixman_implementation_t *imp, while (dest < end) { __m64 s, dia, d, sia; + uint32_t ssrc = combine (src, mask); - s = load8888 (combine (src, mask)); - d = load8888 (*dest); + s = load8888 (&ssrc); + d = load8888 (dest); sia = expand_alpha (s); dia = expand_alpha (d); sia = negate (sia); dia = negate (dia); s = pix_add_mul (s, dia, d, sia); - *dest = store8888 (s); + store8888 (dest, s); ++dest; ++src; @@ -766,11 +781,12 @@ mmx_combine_add_u (pixman_implementation_t *imp, while (dest < end) { __m64 s, d; + uint32_t ssrc = combine (src, mask); - s = load8888 (combine (src, mask)); - d = load8888 (*dest); + s = load8888 (&ssrc); + d = load8888 (dest); s = pix_add (s, d); - *dest = store8888 (s); + store8888 (dest, s); ++dest; ++src; @@ -794,20 +810,21 @@ mmx_combine_saturate_u (pixman_implementation_t *imp, { uint32_t s = combine (src, mask); uint32_t d = *dest; - __m64 ms = load8888 (s); - __m64 md = load8888 (d); + __m64 ms = load8888 (&s); + __m64 md = load8888 (&d); uint32_t sa = s >> 24; uint32_t da = ~d >> 24; if (sa > da) { - __m64 msa = load8888 (DIV_UN8 (da, sa) << 24); + uint32_t quot = DIV_UN8 (da, sa) << 24; + __m64 msa = load8888 ("); msa = expand_alpha (msa); ms = pix_multiply (ms, msa); } md = pix_add (md, ms); - *dest = store8888 (md); + store8888 (dest, md); ++src; ++dest; @@ -829,11 +846,11 @@ mmx_combine_src_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); + __m64 a 
= load8888 (mask); + __m64 s = load8888 (src); s = pix_multiply (s, a); - *dest = store8888 (s); + store8888 (dest, s); ++src; ++mask; @@ -854,12 +871,12 @@ mmx_combine_over_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 sa = expand_alpha (s); - *dest = store8888 (in_over (s, sa, a, d)); + store8888 (dest, in_over (s, sa, a, d)); ++src; ++dest; @@ -880,12 +897,12 @@ mmx_combine_over_reverse_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 da = expand_alpha (d); - *dest = store8888 (over (d, da, in (s, a))); + store8888 (dest, over (d, da, in (s, a))); ++src; ++dest; @@ -906,14 +923,14 @@ mmx_combine_in_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 da = expand_alpha (d); s = pix_multiply (s, a); s = pix_multiply (s, da); - *dest = store8888 (s); + store8888 (dest, s); ++src; ++dest; @@ -934,14 +951,14 @@ mmx_combine_in_reverse_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 sa = expand_alpha (s); a = pix_multiply (a, sa); d = pix_multiply (d, a); - *dest = store8888 (d); + store8888 (dest, d); ++src; ++dest; @@ -962,15 +979,15 @@ mmx_combine_out_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 da = expand_alpha (d); da = negate (da); s = pix_multiply (s, a); s = pix_multiply (s, da); - *dest = store8888 (s); + store8888 (dest, s); ++src; ++dest; @@ -991,15 +1008,15 @@ mmx_combine_out_reverse_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 sa = expand_alpha (s); a = pix_multiply (a, sa); a = negate (a); d = pix_multiply (d, a); - *dest = store8888 (d); + store8888 (dest, d); ++src; ++dest; @@ -1020,9 +1037,9 @@ mmx_combine_atop_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 da = expand_alpha (d); __m64 sa = expand_alpha (s); @@ -1030,7 +1047,7 @@ mmx_combine_atop_ca (pixman_implementation_t *imp, a = pix_multiply (a, sa); a = negate (a); d = pix_add_mul (d, a, s, da); - *dest = store8888 (d); + store8888 (dest, d); ++src; ++dest; @@ -1051,9 +1068,9 @@ mmx_combine_atop_reverse_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 da = expand_alpha (d); __m64 sa = expand_alpha (s); @@ -1061,7 +1078,7 @@ mmx_combine_atop_reverse_ca 
(pixman_implementation_t *imp, a = pix_multiply (a, sa); da = negate (da); d = pix_add_mul (d, a, s, da); - *dest = store8888 (d); + store8888 (dest, d); ++src; ++dest; @@ -1082,9 +1099,9 @@ mmx_combine_xor_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 da = expand_alpha (d); __m64 sa = expand_alpha (s); @@ -1093,7 +1110,7 @@ mmx_combine_xor_ca (pixman_implementation_t *imp, da = negate (da); a = negate (a); d = pix_add_mul (d, a, s, da); - *dest = store8888 (d); + store8888 (dest, d); ++src; ++dest; @@ -1114,13 +1131,13 @@ mmx_combine_add_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); s = pix_multiply (s, a); d = pix_add (s, d); - *dest = store8888 (d); + store8888 (dest, d); ++src; ++dest; @@ -1151,7 +1168,7 @@ mmx_composite_over_n_8888 (pixman_implementation_t *imp, PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); while (height--) @@ -1164,7 +1181,7 @@ mmx_composite_over_n_8888 (pixman_implementation_t *imp, while (w && (unsigned long)dst & 7) { - *dst = store8888 (over (vsrc, vsrca, load8888 (*dst))); + store8888 (dst, over (vsrc, vsrca, load8888 (dst))); w--; dst++; @@ -1190,7 +1207,7 @@ mmx_composite_over_n_8888 (pixman_implementation_t *imp, if (w) { - *dst = store8888 (over (vsrc, vsrca, load8888 (*dst))); + store8888 (dst, over (vsrc, vsrca, load8888 (dst))); } } @@ -1217,7 +1234,7 @@ mmx_composite_over_n_0565 (pixman_implementation_t *imp, PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); while (height--) @@ -1296,7 +1313,7 @@ mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); while (height--) @@ -1311,9 +1328,9 @@ mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, if (m) { - __m64 vdest = load8888 (*q); - vdest = in_over (vsrc, vsrca, load8888 (m), vdest); - *q = store8888 (vdest); + __m64 vdest = load8888 (q); + vdest = in_over (vsrc, vsrca, load8888 (&m), vdest); + store8888 (q, vdest); } twidth--; @@ -1332,9 +1349,9 @@ mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, __m64 dest0, dest1; __m64 vdest = *(__m64 *)q; - dest0 = in_over (vsrc, vsrca, load8888 (m0), + dest0 = in_over (vsrc, vsrca, load8888 (&m0), expand8888 (vdest, 0)); - dest1 = in_over (vsrc, vsrca, load8888 (m1), + dest1 = in_over (vsrc, vsrca, load8888 (&m1), expand8888 (vdest, 1)); *(__m64 *)q = pack8888 (dest0, dest1); @@ -1351,9 +1368,9 @@ mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, if (m) { - __m64 vdest = load8888 (*q); - vdest = in_over (vsrc, vsrca, load8888 (m), vdest); - *q = store8888 (vdest); + __m64 vdest = load8888 (q); + vdest = in_over (vsrc, vsrca, load8888 (&m), vdest); + store8888 (q, vdest); } twidth--; @@ -1388,7 +1405,7 @@ mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp, 
mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format); mask &= 0xff000000; mask = mask | mask >> 8 | mask >> 16 | mask >> 24; - vmask = load8888 (mask); + vmask = load8888 (&mask); while (height--) { @@ -1400,10 +1417,10 @@ mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp, while (w && (unsigned long)dst & 7) { - __m64 s = load8888 (*src); - __m64 d = load8888 (*dst); + __m64 s = load8888 (src); + __m64 d = load8888 (dst); - *dst = store8888 (in_over (s, expand_alpha (s), vmask, d)); + store8888 (dst, in_over (s, expand_alpha (s), vmask, d)); w--; dst++; @@ -1428,10 +1445,10 @@ mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp, if (w) { - __m64 s = load8888 (*src); - __m64 d = load8888 (*dst); + __m64 s = load8888 (src); + __m64 d = load8888 (dst); - *dst = store8888 (in_over (s, expand_alpha (s), vmask, d)); + store8888 (dst, in_over (s, expand_alpha (s), vmask, d)); } } @@ -1459,7 +1476,7 @@ mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp, mask &= 0xff000000; mask = mask | mask >> 8 | mask >> 16 | mask >> 24; - vmask = load8888 (mask); + vmask = load8888 (&mask); srca = MC (4x00ff); while (height--) @@ -1472,10 +1489,11 @@ mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp, while (w && (unsigned long)dst & 7) { - __m64 s = load8888 (*src | 0xff000000); - __m64 d = load8888 (*dst); + uint32_t ssrc = *src | 0xff000000; + __m64 s = load8888 (&ssrc); + __m64 d = load8888 (dst); - *dst = store8888 (in_over (s, srca, vmask, d)); + store8888 (dst, in_over (s, srca, vmask, d)); w--; dst++; @@ -1550,10 +1568,11 @@ mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp, while (w) { - __m64 s = load8888 (*src | 0xff000000); - __m64 d = load8888 (*dst); + uint32_t ssrc = *src | 0xff000000; + __m64 s = load8888 (&ssrc); + __m64 d = load8888 (dst); - *dst = store8888 (in_over (s, srca, vmask, d)); + store8888 (dst, in_over (s, srca, vmask, d)); w--; dst++; @@ -1601,9 +1620,9 @@ mmx_composite_over_8888_8888 (pixman_implementation_t *imp, else if (s) { __m64 ms, sa; - ms = load8888 (s); + ms = load8888 (&s); sa = expand_alpha (ms); - *dst = store8888 (over (ms, sa, load8888 (*dst))); + store8888 (dst, over (ms, sa, load8888 (dst))); } dst++; @@ -1644,7 +1663,7 @@ mmx_composite_over_8888_0565 (pixman_implementation_t *imp, while (w && (unsigned long)dst & 7) { - __m64 vsrc = load8888 (*src); + __m64 vsrc = load8888 (src); uint64_t d = *dst; __m64 vdest = expand565 (to_m64 (d), 0); @@ -1665,10 +1684,10 @@ mmx_composite_over_8888_0565 (pixman_implementation_t *imp, __m64 vsrc0, vsrc1, vsrc2, vsrc3; __m64 vdest; - vsrc0 = load8888 (*(src + 0)); - vsrc1 = load8888 (*(src + 1)); - vsrc2 = load8888 (*(src + 2)); - vsrc3 = load8888 (*(src + 3)); + vsrc0 = load8888 ((src + 0)); + vsrc1 = load8888 ((src + 1)); + vsrc2 = load8888 ((src + 2)); + vsrc3 = load8888 ((src + 3)); vdest = *(__m64 *)dst; @@ -1688,7 +1707,7 @@ mmx_composite_over_8888_0565 (pixman_implementation_t *imp, while (w) { - __m64 vsrc = load8888 (*src); + __m64 vsrc = load8888 (src); uint64_t d = *dst; __m64 vdest = expand565 (to_m64 (d), 0); @@ -1731,7 +1750,7 @@ mmx_composite_over_n_8_8888 (pixman_implementation_t *imp, PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); while (height--) @@ -1752,9 +1771,9 @@ mmx_composite_over_n_8_8888 (pixman_implementation_t *imp, { __m64 
vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)), - load8888 (*dst)); + load8888 (dst)); - *dst = store8888 (vdest); + store8888 (dst, vdest); } w--; @@ -1803,11 +1822,11 @@ mmx_composite_over_n_8_8888 (pixman_implementation_t *imp, if (m) { - __m64 vdest = load8888 (*dst); + __m64 vdest = load8888 (dst); vdest = in_over ( vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest); - *dst = store8888 (vdest); + store8888 (dst, vdest); } } } @@ -1996,7 +2015,7 @@ mmx_composite_src_n_8_8888 (pixman_implementation_t *imp, PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - vsrc = load8888 (src); + vsrc = load8888 (&src); while (height--) { @@ -2016,7 +2035,7 @@ mmx_composite_src_n_8_8888 (pixman_implementation_t *imp, { __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m))); - *dst = store8888 (vdest); + store8888 (dst, vdest); } else { @@ -2067,10 +2086,10 @@ mmx_composite_src_n_8_8888 (pixman_implementation_t *imp, if (m) { - __m64 vdest = load8888 (*dst); + __m64 vdest = load8888 (dst); vdest = in (vsrc, expand_alpha_rev (to_m64 (m))); - *dst = store8888 (vdest); + store8888 (dst, vdest); } else { @@ -2106,7 +2125,7 @@ mmx_composite_over_n_8_0565 (pixman_implementation_t *imp, PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0); @@ -2245,7 +2264,7 @@ mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, while (w && (unsigned long)dst & 7) { - __m64 vsrc = load8888 (*src); + __m64 vsrc = load8888 (src); uint64_t d = *dst; __m64 vdest = expand565 (to_m64 (d), 0); @@ -2278,10 +2297,10 @@ mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, if ((a0 & a1 & a2 & a3) == 0xFF) { __m64 vdest; - vdest = pack_565 (invert_colors (load8888 (s0)), _mm_setzero_si64 (), 0); - vdest = pack_565 (invert_colors (load8888 (s1)), vdest, 1); - vdest = pack_565 (invert_colors (load8888 (s2)), vdest, 2); - vdest = pack_565 (invert_colors (load8888 (s3)), vdest, 3); + vdest = pack_565 (invert_colors (load8888 (&s0)), _mm_setzero_si64 (), 0); + vdest = pack_565 (invert_colors (load8888 (&s1)), vdest, 1); + vdest = pack_565 (invert_colors (load8888 (&s2)), vdest, 2); + vdest = pack_565 (invert_colors (load8888 (&s3)), vdest, 3); *(__m64 *)dst = vdest; } @@ -2289,10 +2308,10 @@ mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, { __m64 vdest = *(__m64 *)dst; - vdest = pack_565 (over_rev_non_pre (load8888 (s0), expand565 (vdest, 0)), vdest, 0); - vdest = pack_565 (over_rev_non_pre (load8888 (s1), expand565 (vdest, 1)), vdest, 1); - vdest = pack_565 (over_rev_non_pre (load8888 (s2), expand565 (vdest, 2)), vdest, 2); - vdest = pack_565 (over_rev_non_pre (load8888 (s3), expand565 (vdest, 3)), vdest, 3); + vdest = pack_565 (over_rev_non_pre (load8888 (&s0), expand565 (vdest, 0)), vdest, 0); + vdest = pack_565 (over_rev_non_pre (load8888 (&s1), expand565 (vdest, 1)), vdest, 1); + vdest = pack_565 (over_rev_non_pre (load8888 (&s2), expand565 (vdest, 2)), vdest, 2); + vdest = pack_565 (over_rev_non_pre (load8888 (&s3), expand565 (vdest, 3)), vdest, 3); *(__m64 *)dst = vdest; } @@ -2306,7 +2325,7 @@ mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, while (w) { - __m64 vsrc = load8888 (*src); + __m64 vsrc = load8888 
(src); uint64_t d = *dst; __m64 vdest = expand565 (to_m64 (d), 0); @@ -2353,10 +2372,10 @@ mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, while (w && (unsigned long)dst & 7) { - __m64 s = load8888 (*src); - __m64 d = load8888 (*dst); + __m64 s = load8888 (src); + __m64 d = load8888 (dst); - *dst = store8888 (over_rev_non_pre (s, d)); + store8888 (dst, over_rev_non_pre (s, d)); w--; dst++; @@ -2365,7 +2384,7 @@ mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, while (w >= 2) { - uint64_t s0, s1; + uint32_t s0, s1; unsigned char a0, a1; __m64 d0, d1; @@ -2377,8 +2396,8 @@ mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, if ((a0 & a1) == 0xFF) { - d0 = invert_colors (load8888 (s0)); - d1 = invert_colors (load8888 (s1)); + d0 = invert_colors (load8888 (&s0)); + d1 = invert_colors (load8888 (&s1)); *(__m64 *)dst = pack8888 (d0, d1); } @@ -2386,8 +2405,8 @@ mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, { __m64 vdest = *(__m64 *)dst; - d0 = over_rev_non_pre (load8888 (s0), expand8888 (vdest, 0)); - d1 = over_rev_non_pre (load8888 (s1), expand8888 (vdest, 1)); + d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0)); + d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1)); *(__m64 *)dst = pack8888 (d0, d1); } @@ -2399,10 +2418,10 @@ mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, if (w) { - __m64 s = load8888 (*src); - __m64 d = load8888 (*dst); + __m64 s = load8888 (src); + __m64 d = load8888 (dst); - *dst = store8888 (over_rev_non_pre (s, d)); + store8888 (dst, over_rev_non_pre (s, d)); } } @@ -2430,7 +2449,7 @@ mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); while (height--) @@ -2447,7 +2466,7 @@ mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, { uint64_t d = *q; __m64 vdest = expand565 (to_m64 (d), 0); - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0); *q = to_uint64 (vdest); } @@ -2469,10 +2488,10 @@ mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, { __m64 vdest = *(__m64 *)q; - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m0), expand565 (vdest, 0)), vdest, 0); - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m1), expand565 (vdest, 1)), vdest, 1); - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m2), expand565 (vdest, 2)), vdest, 2); - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m3), expand565 (vdest, 3)), vdest, 3); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m0), expand565 (vdest, 0)), vdest, 0); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m1), expand565 (vdest, 1)), vdest, 1); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m2), expand565 (vdest, 2)), vdest, 2); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m3), expand565 (vdest, 3)), vdest, 3); *(__m64 *)q = vdest; } @@ -2490,7 +2509,7 @@ mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, { uint64_t d = *q; __m64 vdest = expand565 (to_m64 (d), 0); - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0); *q = to_uint64 (vdest); } @@ -2526,7 +2545,7 @@ mmx_composite_in_n_8_8 (pixman_implementation_t *imp, sa 
= src >> 24; - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); while (height--) @@ -2558,10 +2577,10 @@ mmx_composite_in_n_8_8 (pixman_implementation_t *imp, __m64 vmask; __m64 vdest; - vmask = load8888 (ldl_u((uint32_t *)mask)); - vdest = load8888 (*(uint32_t *)dst); + vmask = load8888u ((uint32_t *)mask); + vdest = load8888 ((uint32_t *)dst); - *(uint32_t *)dst = store8888 (in (in (vsrca, vmask), vdest)); + store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest)); dst += 4; mask += 4; @@ -2628,7 +2647,7 @@ mmx_composite_in_8_8 (pixman_implementation_t *imp, uint32_t *s = (uint32_t *)src; uint32_t *d = (uint32_t *)dst; - *d = store8888 (in (load8888 (ldl_u((uint32_t *)s)), load8888 (*d))); + store8888 (d, in (load8888u (s), load8888 (d))); w -= 4; dst += 4; @@ -2676,7 +2695,7 @@ mmx_composite_add_n_8_8 (pixman_implementation_t *imp, if (src == 0) return; - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); while (height--) @@ -2709,10 +2728,10 @@ mmx_composite_add_n_8_8 (pixman_implementation_t *imp, __m64 vmask; __m64 vdest; - vmask = load8888 (ldl_u((uint32_t *)mask)); - vdest = load8888 (*(uint32_t *)dst); + vmask = load8888u ((uint32_t *)mask); + vdest = load8888 ((uint32_t *)dst); - *(uint32_t *)dst = store8888 (_mm_adds_pu8 (in (vsrca, vmask), vdest)); + store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest)); dst += 4; mask += 4; @@ -3053,19 +3072,20 @@ mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp, if (m) { - __m64 s = load8888 (*src | 0xff000000); + uint32_t ssrc = *src | 0xff000000; + __m64 s = load8888 (&ssrc); if (m == 0xff) { - *dst = store8888 (s); + store8888 (dst, s); } else { __m64 sa = expand_alpha (s); __m64 vm = expand_alpha_rev (to_m64 (m)); - __m64 vdest = in_over (s, sa, vm, load8888 (*dst)); + __m64 vdest = in_over (s, sa, vm, load8888 (dst)); - *dst = store8888 (vdest); + store8888 (dst, vdest); } } diff --git a/pixman/pixman/pixman-private.h b/pixman/pixman/pixman-private.h index 856038547..9d96a9312 100644 --- a/pixman/pixman/pixman-private.h +++ b/pixman/pixman/pixman-private.h @@ -559,6 +559,11 @@ pixman_implementation_t * _pixman_implementation_create_arm_neon (pixman_implementation_t *fallback); #endif +#ifdef USE_MIPS_DSPR2 +pixman_implementation_t * +_pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback); +#endif + #ifdef USE_VMX pixman_implementation_t * _pixman_implementation_create_vmx (pixman_implementation_t *fallback); diff --git a/pixman/test/lowlevel-blt-bench.c b/pixman/test/lowlevel-blt-bench.c index ba7f30716..95513ba10 100644 --- a/pixman/test/lowlevel-blt-bench.c +++ b/pixman/test/lowlevel-blt-bench.c @@ -626,6 +626,7 @@ tests_tbl[] = { "over_n_0565", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_OVER, PIXMAN_null, 0, PIXMAN_r5g6b5 }, { "over_n_1555", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_OVER, PIXMAN_null, 0, PIXMAN_a1r5g5b5 }, { "over_8888_0565", PIXMAN_a8r8g8b8, 0, PIXMAN_OP_OVER, PIXMAN_null, 0, PIXMAN_r5g6b5 }, + { "over_8888_8888", PIXMAN_a8r8g8b8, 0, PIXMAN_OP_OVER, PIXMAN_null, 0, PIXMAN_a8r8g8b8 }, { "over_8888_x888", PIXMAN_a8r8g8b8, 0, PIXMAN_OP_OVER, PIXMAN_null, 0, PIXMAN_x8r8g8b8 }, { "over_x888_8_0565", PIXMAN_x8r8g8b8, 0, PIXMAN_OP_OVER, PIXMAN_a8, 0, PIXMAN_r5g6b5 }, { "over_x888_8_8888", PIXMAN_x8r8g8b8, 0, PIXMAN_OP_OVER, PIXMAN_a8, 0, PIXMAN_a8r8g8b8 }, @@ -649,6 +650,7 @@ tests_tbl[] = { "over_8888_n_x888", PIXMAN_a8r8g8b8, 0, PIXMAN_OP_OVER, PIXMAN_a8, 1, PIXMAN_x8r8g8b8 }, { "over_8888_n_0565", PIXMAN_a8r8g8b8, 0, PIXMAN_OP_OVER, 
PIXMAN_a8, 1, PIXMAN_r5g6b5 },
 { "over_8888_n_1555", PIXMAN_a8r8g8b8, 0, PIXMAN_OP_OVER, PIXMAN_a8, 1, PIXMAN_a1r5g5b5 },
+ { "over_x888_n_8888", PIXMAN_x8r8g8b8, 0, PIXMAN_OP_OVER, PIXMAN_a8, 1, PIXMAN_a8r8g8b8 },
 { "outrev_n_8_0565", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_OUT_REV, PIXMAN_a8, 0, PIXMAN_r5g6b5 },
 { "outrev_n_8_1555", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_OUT_REV, PIXMAN_a8, 0, PIXMAN_a1r5g5b5 },
 { "outrev_n_8_x888", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_OUT_REV, PIXMAN_a8, 0, PIXMAN_x8r8g8b8 },
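
Note: the per-pixel math implemented by the new CONVERT_1x0565_TO_1x8888 and
CONVERT_1x8888_TO_1x0565 macros in pixman-mips-dspr2-asm.h is sketched below
in portable C, as a cross-check of the bit manipulation; the helper names are
illustrative only, not part of the patch:

#include <stdint.h>

/* r5g6b5 -> a8r8g8b8: replicate the top bits of each channel into the
 * low bits, so that 0x1f (5-bit max) and 0x3f (6-bit max) both expand
 * to 0xff, and force the alpha byte to 0xff. */
static uint32_t
convert_0565_to_8888 (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5) & 0x3f;
    uint32_t b = p & 0x1f;

    r = (r << 3) | (r >> 2);
    g = (g << 2) | (g >> 4);
    b = (b << 3) | (b >> 2);

    return 0xff000000 | (r << 16) | (g << 8) | b;
}

/* a8r8g8b8 -> r5g6b5: drop alpha and truncate each channel to its
 * 5/6/5 most significant bits. */
static uint16_t
convert_8888_to_0565 (uint32_t p)
{
    return ((p >> 8) & 0xf800) | ((p >> 5) & 0x07e0) | ((p >> 3) & 0x001f);
}

For example, 0xffff (white in r5g6b5) expands to 0xffffffff, and converting
back yields 0xffff again; the DSPr2 CONVERT_2x* macros perform the same
computation on two pixels at a time using packed-halfword instructions.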