diff options
Diffstat (limited to 'pixman')
-rw-r--r-- | pixman/configure.ac | 42 | ||||
-rw-r--r-- | pixman/pixman/pixman-compiler.h | 4 | ||||
-rw-r--r-- | pixman/pixman/pixman-cpu.c | 2 | ||||
-rw-r--r-- | pixman/pixman/pixman-mips-dspr2-asm.S | 219 | ||||
-rw-r--r-- | pixman/pixman/pixman-mips-dspr2-asm.h | 296 | ||||
-rw-r--r-- | pixman/pixman/pixman-mips-dspr2.c | 12 | ||||
-rw-r--r-- | pixman/pixman/pixman-mips-dspr2.h | 42 | ||||
-rw-r--r-- | pixman/pixman/pixman-mmx.c | 4 |
8 files changed, 601 insertions, 20 deletions
diff --git a/pixman/configure.ac b/pixman/configure.ac index 17e30f5e4..1cf9eb4e5 100644 --- a/pixman/configure.ac +++ b/pixman/configure.ac @@ -294,12 +294,18 @@ AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ #if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4)) #error "Need GCC >= 3.4 for MMX intrinsics" #endif -#if defined(__clang__) -#error "clang chokes on the inline assembly in pixman-mmx.c" -#endif #include <mmintrin.h> int main () { __m64 v = _mm_cvtsi32_si64 (1); + __m64 w; + signed char x = 0; + + /* Some versions of clang will choke on K */ + asm ("pshufw %2, %1, %0\n\t" + : "=y" (w) + : "y" (v), "K" (x) + ); + return _mm_cvtsi64_si32 (v); }]])], have_mmx_intrinsics=yes) CFLAGS=$xserver_save_CFLAGS @@ -603,7 +609,7 @@ AC_MSG_CHECKING(whether to use MIPS DSPr2 assembler) xserver_save_CFLAGS=$CFLAGS CFLAGS="-mdspr2 $CFLAGS" -AC_COMPILE_IFELSE([[ +AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ #if !(defined(__mips__) && __mips_isa_rev >= 2) #error MIPS DSPr2 is currently only available on MIPS32r2 platforms. 
#endif @@ -617,7 +623,7 @@ main () : [a] "r" (a), [b] "r" (b) ); return c; -}]], have_mips_dspr2=yes) +}]])], have_mips_dspr2=yes) CFLAGS=$xserver_save_CFLAGS AC_ARG_ENABLE(mips-dspr2, @@ -782,25 +788,27 @@ fi dnl ===================================== dnl Thread local storage -support_for__thread=no - -AC_MSG_CHECKING(for __thread) -AC_LINK_IFELSE([AC_LANG_SOURCE([[ +AC_MSG_CHECKING(for thread local storage (TLS) support) +AC_CACHE_VAL(ac_cv_tls, [ + ac_cv_tls=none + keywords="__thread __declspec(thread)" + for kw in $keywords ; do + AC_TRY_COMPILE([ #if defined(__MINGW32__) && !(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)) #error This MinGW version has broken __thread support #endif #ifdef __OpenBSD__ #error OpenBSD has broken __thread support #endif -static __thread int x ; -int main () { x = 123; return x; } -]])], support_for__thread=yes) -if test $support_for__thread = yes; then - AC_DEFINE([TOOLCHAIN_SUPPORTS__THREAD],[],[Whether the tool chain supports __thread]) -fi +int $kw test;], [], ac_cv_tls=$kw) + done +]) +AC_MSG_RESULT($ac_cv_tls) -AC_MSG_RESULT($support_for__thread) +if test "$ac_cv_tls" != "none"; then + AC_DEFINE_UNQUOTED([TLS], $ac_cv_tls, [The compiler supported TLS storage class]) +fi dnl dnl posix tls @@ -852,7 +860,7 @@ AC_DEFUN([PIXMAN_CHECK_PTHREAD],[dnl fi ]) -if test $support_for__thread = no; then +if test $ac_cv_tls = none ; then support_for_pthread_setspecific=no AC_MSG_CHECKING(for pthread_setspecific) diff --git a/pixman/pixman/pixman-compiler.h b/pixman/pixman/pixman-compiler.h index 5b568e114..ffd51720e 100644 --- a/pixman/pixman/pixman-compiler.h +++ b/pixman/pixman/pixman-compiler.h @@ -97,10 +97,10 @@ # define PIXMAN_GET_THREAD_LOCAL(name) \ (&name) -#elif defined(TOOLCHAIN_SUPPORTS__THREAD) +#elif defined(TLS) # define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \ - static __thread type name + static TLS type name # define PIXMAN_GET_THREAD_LOCAL(name) \ (&name) diff --git a/pixman/pixman/pixman-cpu.c 
b/pixman/pixman/pixman-cpu.c index bb97ae3e6..1060f4776 100644 --- a/pixman/pixman/pixman-cpu.c +++ b/pixman/pixman/pixman-cpu.c @@ -666,6 +666,7 @@ detect_cpu_features (void) return features; } +#ifdef USE_X86_MMX static pixman_bool_t pixman_have_mmx (void) { @@ -681,6 +682,7 @@ pixman_have_mmx (void) return mmx_present; } +#endif #ifdef USE_SSE2 static pixman_bool_t diff --git a/pixman/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman/pixman-mips-dspr2-asm.S index f1087a776..6a0fc1803 100644 --- a/pixman/pixman/pixman-mips-dspr2-asm.S +++ b/pixman/pixman/pixman-mips-dspr2-asm.S @@ -308,3 +308,222 @@ LEAF_MIPS_DSPR2(pixman_composite_src_x888_8888_asm_mips) nop END(pixman_composite_src_x888_8888_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_8888_ca_asm_mips) +/* + * a0 - dst (a8r8g8b8) + * a1 - src (32bit constant) + * a2 - mask (a8r8g8b8) + * a3 - w + */ + + SAVE_REGS_ON_STACK 16, s0, s1, s2, s3, s4, s5, s6, s7 + beqz a3, 4f + nop + li t6, 0xff + addiu t7, zero, -1 /* t7 = 0xffffffff */ + srl t8, a1, 24 /* t8 = srca */ + li t9, 0x00ff00ff + addiu t1, a3, -1 + beqz t1, 3f /* last pixel */ + nop + beq t8, t6, 2f /* if (srca == 0xff) */ + nop +1: + /* a1 = src */ + lw t0, 0(a2) /* t0 = mask */ + lw t1, 4(a2) /* t1 = mask */ + or t2, t0, t1 + beqz t2, 12f /* if (t0 == 0) && (t1 == 0) */ + addiu a2, a2, 8 + and t3, t0, t1 + move s0, t8 /* s0 = srca */ + move s1, t8 /* s1 = srca */ + move t4, a1 /* t4 = src */ + move t5, a1 /* t5 = src */ + lw t2, 0(a0) /* t2 = dst */ + beq t3, t7, 11f /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */ + lw t3, 4(a0) /* t3 = dst */ + MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, t4, t5, t9, s0, s1, s2, s3, s4, s5 + MIPS_2xUN8x4_MUL_2xUN8 t0, t1, t8, t8, s0, s1, t9, s2, s3, s4, s5, s6, s7 +11: + not s0, s0 + not s1, s1 + MIPS_2xUN8x4_MUL_2xUN8x4 t2, t3, s0, s1, s2, s3, t9, t0, t1, s4, s5, s6, s7 + addu_s.qb t0, t4, s2 + addu_s.qb t1, t5, s3 + sw t0, 0(a0) + sw t1, 4(a0) +12: + addiu a3, a3, -2 + addiu t1, a3, -1 + bgtz t1, 1b + addiu 
a0, a0, 8 + b 3f + nop +2: + /* a1 = src */ + lw t0, 0(a2) /* t0 = mask */ + lw t1, 4(a2) /* t1 = mask */ + or t2, t0, t1 + beqz t2, 22f /* if (t0 == 0) & (t1 == 0) */ + addiu a2, a2, 8 + and t2, t0, t1 + move s0, a1 + beq t2, t7, 21f /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */ + move s1, a1 + lw t2, 0(a0) /* t2 = dst */ + lw t3, 4(a0) /* t3 = dst */ + MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, t4, t5, t9, s0, s1, s2, s3, s4, s5 + not t0, t0 + not t1, t1 + MIPS_2xUN8x4_MUL_2xUN8x4 t2, t3, t0, t1, s0, s1, t9, s2, s3, s4, s5, s6, s7 + addu_s.qb s0, t4, s0 + addu_s.qb s1, t5, s1 +21: + sw s0, 0(a0) + sw s1, 4(a0) +22: + addiu a3, a3, -2 + addiu t1, a3, -1 + bgtz t1, 2b + addiu a0, a0, 8 +3: + blez a3, 4f + nop + /* a1 = src */ + lw t1, 0(a2) /* t1 = mask */ + beqz t1, 4f + nop + move s0, t8 /* s0 = srca */ + move t2, a1 /* t2 = src */ + beq t1, t7, 31f + lw t0, 0(a0) /* t0 = dst */ + + MIPS_UN8x4_MUL_UN8x4 a1, t1, t2, t9, t3, t4, t5, t6 + MIPS_UN8x4_MUL_UN8 t1, t8, s0, t9, t3, t4, t5 +31: + not s0, s0 + MIPS_UN8x4_MUL_UN8x4 t0, s0, t3, t9, t4, t5, t6, t1 + addu_s.qb t0, t2, t3 + sw t0, 0(a0) +4: + RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3, s4, s5, s6, s7 + j ra + nop + +END(pixman_composite_over_n_8888_8888_ca_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_0565_ca_asm_mips) +/* + * a0 - dst (r5g6b5) + * a1 - src (32bit constant) + * a2 - mask (a8r8g8b8) + * a3 - w + */ + + SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8 + beqz a3, 4f + nop + li t5, 0xf800f800 + li t6, 0x07e007e0 + li t7, 0x001F001F + li t9, 0x00ff00ff + + srl t8, a1, 24 /* t8 = srca */ + addiu t1, a3, -1 + beqz t1, 3f /* last pixel */ + nop + li s0, 0xff /* s0 = 0xff */ + addiu s1, zero, -1 /* s1 = 0xffffffff */ + + beq t8, s0, 2f /* if (srca == 0xff) */ + nop +1: + /* a1 = src */ + lw t0, 0(a2) /* t0 = mask */ + lw t1, 4(a2) /* t1 = mask */ + or t2, t0, t1 + beqz t2, 12f /* if (t0 == 0) && (t1 == 0) */ + addiu a2, a2, 8 + and t3, t0, t1 + move t0, t8 + move t1, a1 + lhu t2, 
0(a0) /* t2 = dst */ + beq t3, s1, 11f /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */ + lhu t3, 2(a0) /* t3 = dst */ + MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, s2, s3, t9, t4, s4, s5, s6, s7, s8 + MIPS_2xUN8x4_MUL_2xUN8 t0, t1, t8, t8, t0, t1, t9, t4, s4, s5, s6, s7, s8 +11: + not t0, t0 + not t1, t1 + CONVERT_2x0565_TO_2x8888 t2, t3, s4, s5, t6, t7, t4, s6, s7, s8 + MIPS_2xUN8x4_MUL_2xUN8x4 s4, s5, t0, t1, s4, s5, t9, t4, s6, s7, s8, t0, t1 + addu_s.qb s2, s2, s4 + addu_s.qb s3, s3, s5 + CONVERT_2x8888_TO_2x0565 s2, s3, t2, t3, t5, t6, t7, s1, s2 + sh t2, 0(a0) + sh t3, 2(a0) +12: + addiu a3, a3, -2 + addiu t1, a3, -1 + bgtz t1, 1b + addiu a0, a0, 4 + b 3f + nop +2: + /* a1 = src */ + lw t0, 0(a2) /* t0 = mask */ + lw t1, 4(a2) /* t1 = mask */ + or t2, t0, t1 + beqz t2, 22f /* if (t0 == 0) & (t1 == 0) */ + addiu a2, a2, 8 + and t3, t0, t1 + move t2, a1 + beq t3, s1, 21f /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */ + move t3, a1 + lhu t2, 0(a0) /* t2 = dst */ + lhu t3, 2(a0) /* t3 = dst */ + MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, s2, s3, t9, t4, s4, s5, s6, s7, s8 + not t0, t0 + not t1, t1 + CONVERT_2x0565_TO_2x8888 t2, t3, s4, s5, t6, t7, t4, s6, s7, s8 + MIPS_2xUN8x4_MUL_2xUN8x4 s4, s5, t0, t1, s4, s5, t9, t4, s6, s7, s8, t2, t3 + addu_s.qb t2, s2, s4 + addu_s.qb t3, s3, s5 +21: + CONVERT_2x8888_TO_2x0565 t2, t3, t0, t1, t5, t6, t7, s2, s3 + sh t0, 0(a0) + sh t1, 2(a0) +22: + addiu a3, a3, -2 + addiu t1, a3, -1 + bgtz t1, 2b + addiu a0, a0, 4 +3: + blez a3, 4f + nop + /* a1 = src */ + lw t1, 0(a2) /* t1 = mask */ + beqz t1, 4f + nop + move s0, t8 /* s0 = srca */ + move t2, a1 /* t2 = src */ + beq t1, t7, 31f + lhu t0, 0(a0) /* t0 = dst */ + + MIPS_UN8x4_MUL_UN8x4 a1, t1, t2, t9, t3, t4, t5, t6 + MIPS_UN8x4_MUL_UN8 t1, t8, s0, t9, t3, t4, t5 +31: + not s0, s0 + CONVERT_1x0565_TO_1x8888 t0, s1, s2, s3 + MIPS_UN8x4_MUL_UN8x4 s1, s0, t3, t9, t4, t5, t6, t1 + addu_s.qb t0, t2, t3 + CONVERT_1x8888_TO_1x0565 t0, s1, s2, s3 + sh s1, 0(a0) +4: + 
RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8 + j ra + nop + +END(pixman_composite_over_n_8888_0565_ca_asm_mips) diff --git a/pixman/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman/pixman-mips-dspr2-asm.h index e07cda470..12ff42c57 100644 --- a/pixman/pixman/pixman-mips-dspr2-asm.h +++ b/pixman/pixman/pixman-mips-dspr2-asm.h @@ -96,6 +96,170 @@ LEAF_MIPS32R2(symbol) \ .size function,.-function /* + * Checks if stack offset is big enough for storing/restoring regs_num + * number of registers to/from stack. Stack offset must be greater than + * or equal to the number of bytes needed for storing registers (regs_num*4). + * Since MIPS ABI allows usage of first 16 bytes of stack frame (this is + * reserved for input arguments of the functions, already stored in a0-a3), + * stack size can be further optimized by utilizing this space. + */ +.macro CHECK_STACK_OFFSET regs_num, stack_offset +.if \stack_offset < \regs_num * 4 - 16 +.error "Stack offset too small." +.endif +.endm + +/* + * Saves set of registers on stack. Maximum number of registers that + * can be saved on stack is limited to 14 (a0-a3, v0-v1 and s0-s7). + * Stack offset is number of bytes that are subtracted from stack pointer (sp) + * before registers are pushed in order to provide enough space on stack + * (offset must be multiple of 4, and must be big enough, as described by + * CHECK_STACK_OFFSET macro). This macro is intended to be used in + * combination with RESTORE_REGS_FROM_STACK macro. Example: + * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1 + * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1 + */ +.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \ + r2 = 0, r3 = 0, r4 = 0, \ + r5 = 0, r6 = 0, r7 = 0, \ + r8 = 0, r9 = 0, r10 = 0, \ + r11 = 0, r12 = 0, r13 = 0, \ + r14 = 0 + .if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4) + .error "Stack offset must be pozitive and multiple of 4." 
+ .endif + .if \stack_offset != 0 + addiu sp, sp, -\stack_offset + .endif + sw \r1, 0(sp) + .if \r2 != 0 + sw \r2, 4(sp) + .endif + .if \r3 != 0 + sw \r3, 8(sp) + .endif + .if \r4 != 0 + sw \r4, 12(sp) + .endif + .if \r5 != 0 + CHECK_STACK_OFFSET 5, \stack_offset + sw \r5, 16(sp) + .endif + .if \r6 != 0 + CHECK_STACK_OFFSET 6, \stack_offset + sw \r6, 20(sp) + .endif + .if \r7 != 0 + CHECK_STACK_OFFSET 7, \stack_offset + sw \r7, 24(sp) + .endif + .if \r8 != 0 + CHECK_STACK_OFFSET 8, \stack_offset + sw \r8, 28(sp) + .endif + .if \r9 != 0 + CHECK_STACK_OFFSET 9, \stack_offset + sw \r9, 32(sp) + .endif + .if \r10 != 0 + CHECK_STACK_OFFSET 10, \stack_offset + sw \r10, 36(sp) + .endif + .if \r11 != 0 + CHECK_STACK_OFFSET 11, \stack_offset + sw \r11, 40(sp) + .endif + .if \r12 != 0 + CHECK_STACK_OFFSET 12, \stack_offset + sw \r12, 44(sp) + .endif + .if \r13 != 0 + CHECK_STACK_OFFSET 13, \stack_offset + sw \r13, 48(sp) + .endif + .if \r14 != 0 + CHECK_STACK_OFFSET 14, \stack_offset + sw \r14, 52(sp) + .endif +.endm + +/* + * Restores set of registers from stack. Maximum number of registers that + * can be restored from stack is limited to 14 (a0-a3, v0-v1 and s0-s7). + * Stack offset is number of bytes that are added to stack pointer (sp) + * after registers are restored (offset must be multiple of 4, and must + * be big enough, as described by CHECK_STACK_OFFSET macro). This macro is + * intended to be used in combination with SAVE_REGS_ON_STACK macro. + * Example: + * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1 + * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1 + */ +.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \ + r2 = 0, r3 = 0, r4 = 0, \ + r5 = 0, r6 = 0, r7 = 0, \ + r8 = 0, r9 = 0, r10 = 0, \ + r11 = 0, r12 = 0, r13 = 0, \ + r14 = 0 + .if (\stack_offset < 0) || (\stack_offset - (\stack_offset/4)*4) + .error "Stack offset must be pozitive and multiple of 4." 
+ .endif + lw \r1, 0(sp) + .if \r2 != 0 + lw \r2, 4(sp) + .endif + .if \r3 != 0 + lw \r3, 8(sp) + .endif + .if \r4 != 0 + lw \r4, 12(sp) + .endif + .if \r5 != 0 + CHECK_STACK_OFFSET 5, \stack_offset + lw \r5, 16(sp) + .endif + .if \r6 != 0 + CHECK_STACK_OFFSET 6, \stack_offset + lw \r6, 20(sp) + .endif + .if \r7 != 0 + CHECK_STACK_OFFSET 7, \stack_offset + lw \r7, 24(sp) + .endif + .if \r8 != 0 + CHECK_STACK_OFFSET 8, \stack_offset + lw \r8, 28(sp) + .endif + .if \r9 != 0 + CHECK_STACK_OFFSET 9, \stack_offset + lw \r9, 32(sp) + .endif + .if \r10 != 0 + CHECK_STACK_OFFSET 10, \stack_offset + lw \r10, 36(sp) + .endif + .if \r11 != 0 + CHECK_STACK_OFFSET 11, \stack_offset + lw \r11, 40(sp) + .endif + .if \r12 != 0 + CHECK_STACK_OFFSET 12, \stack_offset + lw \r12, 44(sp) + .endif + .if \r13 != 0 + CHECK_STACK_OFFSET 13, \stack_offset + lw \r13, 48(sp) + .endif + .if \r14 != 0 + CHECK_STACK_OFFSET 14, \stack_offset + lw \r14, 52(sp) + .endif + .if \stack_offset != 0 + addiu sp, sp, \stack_offset + .endif +.endm + +/* * Conversion of single r5g6b5 pixel (in_565) to single a8r8g8b8 pixel * returned in (out_8888) register. Requires two temporary registers * (scratch1 and scratch2). @@ -203,4 +367,136 @@ LEAF_MIPS32R2(symbol) \ srl \out2_565, \out1_565, 16 .endm +/* + * Multiply pixel (a8) with single pixel (a8r8g8b8). It requires maskLSR needed + * for rounding process. 
maskLSR must have following value: + * li maskLSR, 0x00ff00ff + */ +.macro MIPS_UN8x4_MUL_UN8 s_8888, \ + m_8, \ + d_8888, \ + maskLSR, \ + scratch1, scratch2, scratch3 + replv.ph \m_8, \m_8 /* 0 | M | 0 | M */ + muleu_s.ph.qbl \scratch1, \s_8888, \m_8 /* A*M | R*M */ + muleu_s.ph.qbr \scratch2, \s_8888, \m_8 /* G*M | B*M */ + shra_r.ph \scratch3, \scratch1, 8 + shra_r.ph \d_8888, \scratch2, 8 + and \scratch3, \scratch3, \maskLSR /* 0 |A*M| 0 |R*M */ + and \d_8888, \d_8888, \maskLSR /* 0 |G*M| 0 |B*M */ + addq.ph \scratch1, \scratch1, \scratch3 /* A*M+A*M | R*M+R*M */ + addq.ph \scratch2, \scratch2, \d_8888 /* G*M+G*M | B*M+B*M */ + shra_r.ph \scratch1, \scratch1, 8 + shra_r.ph \scratch2, \scratch2, 8 + precr.qb.ph \d_8888, \scratch1, \scratch2 +.endm + +/* + * Multiply two pixels (a8) with two pixels (a8r8g8b8). It requires maskLSR + * needed for rounding process. maskLSR must have following value: + * li maskLSR, 0x00ff00ff + */ +.macro MIPS_2xUN8x4_MUL_2xUN8 s1_8888, \ + s2_8888, \ + m1_8, \ + m2_8, \ + d1_8888, \ + d2_8888, \ + maskLSR, \ + scratch1, scratch2, scratch3, \ + scratch4, scratch5, scratch6 + replv.ph \m1_8, \m1_8 /* 0 | M1 | 0 | M1 */ + replv.ph \m2_8, \m2_8 /* 0 | M2 | 0 | M2 */ + muleu_s.ph.qbl \scratch1, \s1_8888, \m1_8 /* A1*M1 | R1*M1 */ + muleu_s.ph.qbr \scratch2, \s1_8888, \m1_8 /* G1*M1 | B1*M1 */ + muleu_s.ph.qbl \scratch3, \s2_8888, \m2_8 /* A2*M2 | R2*M2 */ + muleu_s.ph.qbr \scratch4, \s2_8888, \m2_8 /* G2*M2 | B2*M2 */ + shra_r.ph \scratch5, \scratch1, 8 + shra_r.ph \d1_8888, \scratch2, 8 + shra_r.ph \scratch6, \scratch3, 8 + shra_r.ph \d2_8888, \scratch4, 8 + and \scratch5, \scratch5, \maskLSR /* 0 |A1*M1| 0 |R1*M1 */ + and \d1_8888, \d1_8888, \maskLSR /* 0 |G1*M1| 0 |B1*M1 */ + and \scratch6, \scratch6, \maskLSR /* 0 |A2*M2| 0 |R2*M2 */ + and \d2_8888, \d2_8888, \maskLSR /* 0 |G2*M2| 0 |B2*M2 */ + addq.ph \scratch1, \scratch1, \scratch5 + addq.ph \scratch2, \scratch2, \d1_8888 + addq.ph \scratch3, \scratch3, \scratch6 + addq.ph 
\scratch4, \scratch4, \d2_8888 + shra_r.ph \scratch1, \scratch1, 8 + shra_r.ph \scratch2, \scratch2, 8 + shra_r.ph \scratch3, \scratch3, 8 + shra_r.ph \scratch4, \scratch4, 8 + precr.qb.ph \d1_8888, \scratch1, \scratch2 + precr.qb.ph \d2_8888, \scratch3, \scratch4 +.endm + +/* + * Multiply pixel (a8r8g8b8) with single pixel (a8r8g8b8). It requires maskLSR + * needed for rounding process. maskLSR must have following value: + * li maskLSR, 0x00ff00ff + */ +.macro MIPS_UN8x4_MUL_UN8x4 s_8888, \ + m_8888, \ + d_8888, \ + maskLSR, \ + scratch1, scratch2, scratch3, scratch4 + preceu.ph.qbl \scratch1, \m_8888 /* 0 | A | 0 | R */ + preceu.ph.qbr \scratch2, \m_8888 /* 0 | G | 0 | B */ + muleu_s.ph.qbl \scratch3, \s_8888, \scratch1 /* A*A | R*R */ + muleu_s.ph.qbr \scratch4, \s_8888, \scratch2 /* G*G | B*B */ + shra_r.ph \scratch1, \scratch3, 8 + shra_r.ph \scratch2, \scratch4, 8 + and \scratch1, \scratch1, \maskLSR /* 0 |A*A| 0 |R*R */ + and \scratch2, \scratch2, \maskLSR /* 0 |G*G| 0 |B*B */ + addq.ph \scratch1, \scratch1, \scratch3 + addq.ph \scratch2, \scratch2, \scratch4 + shra_r.ph \scratch1, \scratch1, 8 + shra_r.ph \scratch2, \scratch2, 8 + precr.qb.ph \d_8888, \scratch1, \scratch2 +.endm + +/* + * Multiply two pixels (a8r8g8b8) with two pixels (a8r8g8b8). It requires + * maskLSR needed for rounding process. 
maskLSR must have following value: + * li maskLSR, 0x00ff00ff + */ + +.macro MIPS_2xUN8x4_MUL_2xUN8x4 s1_8888, \ + s2_8888, \ + m1_8888, \ + m2_8888, \ + d1_8888, \ + d2_8888, \ + maskLSR, \ + scratch1, scratch2, scratch3, \ + scratch4, scratch5, scratch6 + preceu.ph.qbl \scratch1, \m1_8888 /* 0 | A | 0 | R */ + preceu.ph.qbr \scratch2, \m1_8888 /* 0 | G | 0 | B */ + preceu.ph.qbl \scratch3, \m2_8888 /* 0 | A | 0 | R */ + preceu.ph.qbr \scratch4, \m2_8888 /* 0 | G | 0 | B */ + muleu_s.ph.qbl \scratch5, \s1_8888, \scratch1 /* A*A | R*R */ + muleu_s.ph.qbr \scratch6, \s1_8888, \scratch2 /* G*G | B*B */ + muleu_s.ph.qbl \scratch1, \s2_8888, \scratch3 /* A*A | R*R */ + muleu_s.ph.qbr \scratch2, \s2_8888, \scratch4 /* G*G | B*B */ + shra_r.ph \scratch3, \scratch5, 8 + shra_r.ph \scratch4, \scratch6, 8 + shra_r.ph \d1_8888, \scratch1, 8 + shra_r.ph \d2_8888, \scratch2, 8 + and \scratch3, \scratch3, \maskLSR /* 0 |A*A| 0 |R*R */ + and \scratch4, \scratch4, \maskLSR /* 0 |G*G| 0 |B*B */ + and \d1_8888, \d1_8888, \maskLSR /* 0 |A*A| 0 |R*R */ + and \d2_8888, \d2_8888, \maskLSR /* 0 |G*G| 0 |B*B */ + addq.ph \scratch3, \scratch3, \scratch5 + addq.ph \scratch4, \scratch4, \scratch6 + addq.ph \d1_8888, \d1_8888, \scratch1 + addq.ph \d2_8888, \d2_8888, \scratch2 + shra_r.ph \scratch3, \scratch3, 8 + shra_r.ph \scratch4, \scratch4, 8 + shra_r.ph \scratch5, \d1_8888, 8 + shra_r.ph \scratch6, \d2_8888, 8 + precr.qb.ph \d1_8888, \scratch3, \scratch4 + precr.qb.ph \d2_8888, \scratch5, \scratch6 +.endm + #endif //PIXMAN_MIPS_DSPR2_ASM_H diff --git a/pixman/pixman/pixman-mips-dspr2.c b/pixman/pixman/pixman-mips-dspr2.c index 2beada390..018770a4a 100644 --- a/pixman/pixman/pixman-mips-dspr2.c +++ b/pixman/pixman/pixman-mips-dspr2.c @@ -49,6 +49,11 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_8888_8888, PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_0888_0888, uint8_t, 3, uint8_t, 3) +PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_8888_ca, + 
uint32_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_0565_ca, + uint32_t, 1, uint16_t, 1) + static pixman_bool_t pixman_fill_mips (uint32_t *bits, int stride, @@ -184,6 +189,13 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] = PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, mips_composite_src_x888_8888), PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, mips_composite_src_0888_0888), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, mips_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, mips_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, mips_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mips_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mips_composite_over_n_8888_0565_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mips_composite_over_n_8888_0565_ca), + { PIXMAN_OP_NONE }, }; diff --git a/pixman/pixman/pixman-mips-dspr2.h b/pixman/pixman/pixman-mips-dspr2.h index a40e7c87a..cc35d02ed 100644 --- a/pixman/pixman/pixman-mips-dspr2.h +++ b/pixman/pixman/pixman-mips-dspr2.h @@ -85,4 +85,46 @@ mips_composite_##name (pixman_implementation_t *imp, \ } \ } +/*******************************************************************/ + +#define PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST(flags, name, \ + mask_type, mask_cnt, \ + dst_type, dst_cnt) \ +void \ +pixman_composite_##name##_asm_mips (dst_type *dst, \ + uint32_t src, \ + mask_type *mask, \ + int32_t w); \ + \ +static void \ +mips_composite_##name (pixman_implementation_t *imp, \ + pixman_composite_info_t *info) \ +{ \ + PIXMAN_COMPOSITE_ARGS (info); \ + dst_type *dst_line, *dst; \ + mask_type *mask_line, *mask; \ + int32_t dst_stride, mask_stride; \ + uint32_t src; \ + \ + src = _pixman_image_get_solid ( \ + imp, src_image, dest_image->bits.format); \ + \ + if ((flags & 
SKIP_ZERO_SRC) && src == 0) \ + return; \ + \ + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type, \ + dst_stride, dst_line, dst_cnt); \ + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type, \ + mask_stride, mask_line, mask_cnt); \ + \ + while (height--) \ + { \ + dst = dst_line; \ + dst_line += dst_stride; \ + mask = mask_line; \ + mask_line += mask_stride; \ + pixman_composite_##name##_asm_mips (dst, src, mask, width); \ + } \ +} + #endif //PIXMAN_MIPS_DSPR2_H diff --git a/pixman/pixman/pixman-mmx.c b/pixman/pixman/pixman-mmx.c index f9efd73d3..9d1f6afd4 100644 --- a/pixman/pixman/pixman-mmx.c +++ b/pixman/pixman/pixman-mmx.c @@ -57,7 +57,7 @@ _mm_empty (void) #endif #ifdef USE_X86_MMX -# ifdef __SUNPRO_C +# if (defined(__SUNPRO_C) || defined(_MSC_VER)) # include <xmmintrin.h> # else /* We have to compile with -msse to use xmmintrin.h, but that causes SSE @@ -88,8 +88,10 @@ _mm_shuffle_pi16 (__m64 __A, int8_t const __N) # endif #endif +#ifndef _MSC_VER #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) +#endif /* Notes about writing mmx code * |