diff options
author | marha <marha@users.sourceforge.net> | 2012-07-02 08:51:35 +0200 |
---|---|---|
committer | marha <marha@users.sourceforge.net> | 2012-07-02 08:51:35 +0200 |
commit | fcc2cc7dabb46c39a76351fc12da4e9ad9d1d817 (patch) | |
tree | bcc60a76e7cc205710b91c67ff700c26886ea3e8 /pixman | |
parent | 393178cdbca247c6ad077f7dab9a97d6817c625c (diff) | |
parent | fdef5bff99e6079f64bc6b91c91b42195c85adeb (diff) | |
download | vcxsrv-fcc2cc7dabb46c39a76351fc12da4e9ad9d1d817.tar.gz vcxsrv-fcc2cc7dabb46c39a76351fc12da4e9ad9d1d817.tar.bz2 vcxsrv-fcc2cc7dabb46c39a76351fc12da4e9ad9d1d817.zip |
Merge remote-tracking branch 'origin/released'
Conflicts:
pixman/pixman/pixman-sse2.c
Diffstat (limited to 'pixman')
-rw-r--r-- | pixman/pixman/Makefile.am | 1 | ||||
-rw-r--r-- | pixman/pixman/loongson-mmintrin.h | 116 | ||||
-rw-r--r-- | pixman/pixman/pixman-arm-neon-asm-bilinear.S | 119 | ||||
-rw-r--r-- | pixman/pixman/pixman-arm-neon-asm.S | 159 | ||||
-rw-r--r-- | pixman/pixman/pixman-bits-image.c | 16 | ||||
-rw-r--r-- | pixman/pixman/pixman-inlines.h | 37 | ||||
-rw-r--r-- | pixman/pixman/pixman-mips-dspr2-asm.S | 9 | ||||
-rw-r--r-- | pixman/pixman/pixman-mmx.c | 280 | ||||
-rw-r--r-- | pixman/pixman/pixman-private.h | 22 | ||||
-rw-r--r-- | pixman/pixman/pixman-sse2.c | 45 | ||||
-rw-r--r-- | pixman/test/affine-test.c | 12 | ||||
-rw-r--r-- | pixman/test/scaling-test.c | 12 |
12 files changed, 645 insertions, 183 deletions
diff --git a/pixman/pixman/Makefile.am b/pixman/pixman/Makefile.am index 1b232ad0f..deacf8728 100644 --- a/pixman/pixman/Makefile.am +++ b/pixman/pixman/Makefile.am @@ -92,6 +92,7 @@ endif # iwmmxt code if USE_ARM_IWMMXT +libpixman_iwmmxt_la_SOURCES = pixman-mmx.c noinst_LTLIBRARIES += libpixman-iwmmxt.la libpixman_1_la_LIBADD += libpixman-iwmmxt.la diff --git a/pixman/pixman/loongson-mmintrin.h b/pixman/pixman/loongson-mmintrin.h index 1a114fe0f..086c6e0f1 100644 --- a/pixman/pixman/loongson-mmintrin.h +++ b/pixman/pixman/loongson-mmintrin.h @@ -45,6 +45,28 @@ _mm_setzero_si64 (void) } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_pi16 (__m64 __m1, __m64 __m2) +{ + __m64 ret; + asm("paddh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_pi32 (__m64 __m1, __m64 __m2) +{ + __m64 ret; + asm("paddw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_adds_pu16 (__m64 __m1, __m64 __m2) { __m64 ret; @@ -150,6 +172,78 @@ _mm_packs_pu16 (__m64 __m1, __m64 __m2) } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packs_pi32 (__m64 __m1, __m64 __m2) +{ + __m64 ret; + asm("packsswh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + return ret; +} + +#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \ + (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pi16 (uint16_t __w3, uint16_t __w2, uint16_t __w1, uint16_t __w0) +{ + if (__builtin_constant_p (__w3) && + __builtin_constant_p (__w2) && + __builtin_constant_p (__w1) && + __builtin_constant_p (__w0)) + { + uint64_t val = ((uint64_t)__w3 << 48) + | ((uint64_t)__w2 << 32) + | ((uint64_t)__w1 << 16) + | ((uint64_t)__w0 << 0); + return *(__m64 *)&val; + } + else if (__w3 == __w2 && __w2 == __w1 && __w1 == __w0) + { + /* TODO: handle other cases */ + uint64_t val = __w3; + uint64_t imm = _MM_SHUFFLE (0, 0, 0, 0); + __m64 ret; + asm("pshufh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (*(__m64 *)&val), "f" (*(__m64 *)&imm) + ); + return ret; + } + uint64_t val = ((uint64_t)__w3 << 48) + | ((uint64_t)__w2 << 32) + | ((uint64_t)__w1 << 16) + | ((uint64_t)__w0 << 0); + return *(__m64 *)&val; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pi32 (unsigned __i1, unsigned __i0) +{ + if (__builtin_constant_p (__i1) && + __builtin_constant_p (__i0)) + { + uint64_t val = ((uint64_t)__i1 << 32) + | ((uint64_t)__i0 << 0); + return *(__m64 *)&val; + } + else if (__i1 == __i0) + { + uint64_t imm = _MM_SHUFFLE (1, 0, 1, 0); + __m64 ret; + asm("pshufh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm) + ); + return ret; + } + uint64_t val = ((uint64_t)__i1 << 32) + | ((uint64_t)__i0 << 0); + return *(__m64 *)&val; +} +#undef _MM_SHUFFLE + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_shuffle_pi16 (__m64 __m, int64_t __n) { __m64 ret; @@ -193,6 +287,17 @@ _mm_srli_pi16 (__m64 __m, int64_t __count) } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_pi32 (__m64 __m, int64_t __count) +{ + __m64 ret; + asm("psrlw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__count) + ); + return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srli_si64 (__m64 __m, int64_t __count) { __m64 ret; @@ -204,6 +309,17 @@ _mm_srli_si64 (__m64 __m, int64_t __count) } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_pi16 (__m64 __m1, __m64 __m2) +{ + __m64 ret; + asm("psubh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2) { __m64 ret; diff --git a/pixman/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman/pixman-arm-neon-asm-bilinear.S index f7913adb7..e37b5c298 100644 --- a/pixman/pixman/pixman-arm-neon-asm-bilinear.S +++ b/pixman/pixman/pixman-arm-neon-asm-bilinear.S @@ -64,6 +64,7 @@ .altmacro .p2align 2 +#include "pixman-private.h" #include "pixman-arm-neon-asm.h" /* @@ -488,12 +489,12 @@ fname: vmull.u8 q1, d0, d28 vmlal.u8 q1, d1, d29 /* 5 cycles bubble */ - vshll.u16 q0, d2, #8 + vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q0, d2, d30 vmlal.u16 q0, d3, d30 /* 5 cycles bubble */ bilinear_duplicate_mask mask_fmt, 1, d4 - vshrn.u32 d0, q0, #16 + vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) /* 3 cycles bubble */ vmovn.u16 d0, q0 /* 1 cycle bubble */ @@ -514,16 +515,16 @@ fname: q1, q11, d0, d1, d20, d21, d22, d23 bilinear_load_mask mask_fmt, 2, d4 bilinear_load_dst dst_fmt, op, 2, d18, d19, q9 - vshll.u16 q0, d2, #8 + vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q0, d2, d30 vmlal.u16 q0, d3, d30 - vshll.u16 q10, d22, #8 + vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q10, d22, d31 vmlal.u16 q10, d23, d31 - vshrn.u32 d0, q0, #16 - vshrn.u32 d1, q10, #16 + vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) bilinear_duplicate_mask mask_fmt, 2, d4 - vshr.u16 q15, q12, #8 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vadd.u16 q12, q12, q13 vmovn.u16 d0, q0 bilinear_interleave_src_dst \ @@ -544,29 +545,29 @@ fname: q3, q9, d4, d5, d16, d17, d18, d19 pld [TMP1, PF_OFFS] sub TMP1, TMP1, STRIDE - vshll.u16 q0, d2, #8 + vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q0, d2, d30 vmlal.u16 q0, d3, d30 - vshll.u16 q10, d22, #8 + vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q10, d22, d31 vmlal.u16 q10, d23, d31 - vshr.u16 q15, q12, #8 - vshll.u16 q2, d6, #8 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) + vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q2, d6, d30 vmlal.u16 q2, d7, d30 - vshll.u16 q8, d18, #8 + vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS bilinear_load_mask mask_fmt, 4, d22 bilinear_load_dst dst_fmt, op, 4, d2, d3, q1 pld [TMP1, PF_OFFS] vmlsl.u16 q8, d18, d31 vmlal.u16 q8, d19, d31 vadd.u16 q12, q12, q13 - vshrn.u32 d0, q0, #16 - vshrn.u32 d1, q10, #16 - vshrn.u32 d4, q2, #16 - vshrn.u32 d5, q8, #16 + vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS) bilinear_duplicate_mask mask_fmt, 4, d22 - vshr.u16 q15, q12, #8 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vmovn.u16 d0, q0 vmovn.u16 d1, q2 vadd.u16 q12, q12, q13 @@ -694,13 +695,13 @@ pixman_asm_function fname blt 0f tst OUT, #(1 << dst_bpp_shift) beq 0f - vshr.u16 q15, q12, #8 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vadd.u16 q12, q12, q13 bilinear_process_last_pixel sub WIDTH, WIDTH, #1 0: vadd.u16 q13, q13, q13 - vshr.u16 q15, q12, #8 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vadd.u16 q12, q12, q13 cmp WIDTH, #2 @@ -921,7 +922,7 @@ pixman_asm_function fname vmull.u8 q10, d22, d28 vmlal.u8 q10, d23, d29 - vshll.u16 q0, d16, #8 + vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q0, d16, d30 vmlal.u16 q0, d17, d30 @@ -932,27 +933,27 @@ pixman_asm_function fname vmull.u8 q11, d16, d28 vmlal.u8 q11, d17, d29 - vshll.u16 q1, d18, #8 + vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q1, d18, d31 vmlal.u16 q1, d19, d31 - vshr.u16 q15, q12, #8 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vadd.u16 q12, q12, q13 .endm .macro bilinear_over_8888_8888_process_pixblock_tail - vshll.u16 q2, d20, #8 + vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q2, d20, d30 vmlal.u16 q2, d21, d30 - vshll.u16 q3, d22, #8 + vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q3, d22, d31 vmlal.u16 q3, d23, d31 - vshrn.u32 d0, q0, #16 - vshrn.u32 d1, q1, #16 + vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) vld1.32 {d2, d3}, [OUT, :128] pld [OUT, #(prefetch_offset * 4)] - vshrn.u32 d4, q2, #16 - vshr.u16 q15, q12, #8 - vshrn.u32 d5, q3, #16 + vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) vmovn.u16 d6, q0 vmovn.u16 d7, q2 vuzp.8 d6, d7 @@ -975,7 +976,7 @@ pixman_asm_function fname .endm .macro bilinear_over_8888_8888_process_pixblock_tail_head - vshll.u16 q2, d20, #8 + vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS mov TMP1, X, asr #16 add X, X, UX add TMP1, TOP, TMP1, asl #2 @@ -984,21 +985,21 @@ pixman_asm_function fname add X, X, UX add TMP2, TOP, TMP2, asl #2 vmlal.u16 q2, d21, d30 - vshll.u16 q3, d22, #8 + vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS vld1.32 {d20}, [TMP1], STRIDE vmlsl.u16 q3, d22, d31 vmlal.u16 q3, d23, d31 vld1.32 {d21}, [TMP1] vmull.u8 q8, d20, d28 vmlal.u8 q8, d21, d29 - vshrn.u32 d0, q0, #16 - vshrn.u32 d1, q1, #16 + vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) vld1.32 {d2, d3}, [OUT, :128] pld [OUT, PF_OFFS] - vshrn.u32 d4, q2, #16 - vshr.u16 q15, q12, #8 + vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vld1.32 {d22}, [TMP2], STRIDE - vshrn.u32 d5, q3, #16 + vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) vmovn.u16 d6, q0 vld1.32 {d23}, [TMP2] vmull.u8 q9, d22, d28 @@ -1022,7 +1023,7 @@ pixman_asm_function fname vmlal.u8 q10, d23, d29 vmull.u8 q11, d2, d4 vmull.u8 q2, d3, d4 - vshll.u16 q0, d16, #8 + vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q0, d16, d30 vrshr.u16 q1, q11, #8 vmlal.u16 q0, d17, d30 @@ -1037,12 +1038,12 @@ pixman_asm_function fname vmull.u8 q11, d16, d28 vmlal.u8 q11, d17, d29 vuzp.8 d6, d7 - vshll.u16 q1, d18, #8 + vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS vuzp.8 d6, d7 vmlsl.u16 q1, d18, d31 vadd.u16 q12, q12, q13 vmlal.u16 q1, d19, d31 - vshr.u16 q15, q12, #8 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vadd.u16 q12, q12, q13 vst1.32 {d6, d7}, [OUT, :128]! .endm @@ -1081,14 +1082,14 @@ pixman_asm_function fname vmull.u8 q3, d2, d28 vmlal.u8 q2, d1, d29 vmlal.u8 q3, d3, d29 - vshll.u16 q0, d4, #8 - vshll.u16 q1, d6, #8 + vshll.u16 q0, d4, #BILINEAR_INTERPOLATION_BITS + vshll.u16 q1, d6, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q0, d4, d30 vmlsl.u16 q1, d6, d31 vmlal.u16 q0, d5, d30 vmlal.u16 q1, d7, d31 - vshrn.u32 d0, q0, #16 - vshrn.u32 d1, q1, #16 + vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) vld1.32 {d2}, [TMP3], STRIDE vld1.32 {d3}, [TMP3] pld [TMP4, PF_OFFS] @@ -1099,7 +1100,7 @@ pixman_asm_function fname vmlal.u8 q3, d3, d29 vmull.u8 q1, d4, d28 vmlal.u8 q1, d5, d29 - vshr.u16 q15, q12, #8 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vld1.32 {d22[0]}, [MASK]! pld [MASK, #prefetch_offset] vadd.u16 q12, q12, q13 @@ -1107,17 +1108,17 @@ pixman_asm_function fname .endm .macro bilinear_over_8888_8_8888_process_pixblock_tail - vshll.u16 q9, d6, #8 - vshll.u16 q10, d2, #8 + vshll.u16 q9, d6, #BILINEAR_INTERPOLATION_BITS + vshll.u16 q10, d2, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q9, d6, d30 vmlsl.u16 q10, d2, d31 vmlal.u16 q9, d7, d30 vmlal.u16 q10, d3, d31 - vshr.u16 q15, q12, #8 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vadd.u16 q12, q12, q13 vdup.32 d22, d22[0] - vshrn.u32 d18, q9, #16 - vshrn.u32 d19, q10, #16 + vshrn.u32 d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS) vmovn.u16 d17, q9 vld1.32 {d18, d19}, [OUT, :128] pld [OUT, PF_OFFS] @@ -1146,11 +1147,11 @@ pixman_asm_function fname .endm .macro bilinear_over_8888_8_8888_process_pixblock_tail_head - vshll.u16 q9, d6, #8 + vshll.u16 q9, d6, #BILINEAR_INTERPOLATION_BITS mov TMP1, X, asr #16 add X, X, UX add TMP1, TOP, TMP1, asl #2 - vshll.u16 q10, d2, #8 + vshll.u16 q10, d2, #BILINEAR_INTERPOLATION_BITS vld1.32 {d0}, [TMP1], STRIDE mov TMP2, X, asr #16 add X, X, UX @@ -1167,12 +1168,12 @@ pixman_asm_function fname mov TMP4, X, asr #16 add X, X, UX add TMP4, TOP, TMP4, asl #2 - vshr.u16 q15, q12, #8 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vadd.u16 q12, q12, q13 vld1.32 {d3}, [TMP2] vdup.32 d22, d22[0] - vshrn.u32 d18, q9, #16 - vshrn.u32 d19, q10, #16 + vshrn.u32 d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS) vmull.u8 q2, d0, d28 vmull.u8 q3, d2, d28 vmovn.u16 d17, q9 @@ -1182,8 +1183,8 @@ pixman_asm_function fname vmlal.u8 q3, d3, d29 vuzp.8 d16, d17 vuzp.8 d18, d19 - vshll.u16 q0, d4, #8 - vshll.u16 q1, d6, #8 + vshll.u16 q0, d4, #BILINEAR_INTERPOLATION_BITS + vshll.u16 q1, d6, #BILINEAR_INTERPOLATION_BITS vuzp.8 d16, d17 vuzp.8 d18, d19 vmlsl.u16 q0, d4, d30 @@ -1194,8 +1195,8 @@ pixman_asm_function fname vmlal.u16 q1, d7, d31 vrsra.u16 q10, q10, #8 vrsra.u16 q11, q11, #8 - vshrn.u32 d0, q0, #16 - vshrn.u32 d1, q1, #16 + vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) vrshrn.u16 d16, q10, #8 vrshrn.u16 d17, q11, #8 vld1.32 {d2}, [TMP3], STRIDE @@ -1216,7 +1217,7 @@ pixman_asm_function fname vraddhn.u16 d18, q9, q10 vraddhn.u16 d19, q15, q11 vmlal.u8 q1, d5, d29 - vshr.u16 q15, q12, #8 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vqadd.u8 q9, q8, q9 vld1.32 {d22[0]}, [MASK]! vuzp.8 d18, d19 diff --git a/pixman/pixman/pixman-arm-neon-asm.S b/pixman/pixman/pixman-arm-neon-asm.S index 87aae1d55..187197dc3 100644 --- a/pixman/pixman/pixman-arm-neon-asm.S +++ b/pixman/pixman/pixman-arm-neon-asm.S @@ -49,6 +49,7 @@ .altmacro .p2align 2 +#include "pixman-private.h" #include "pixman-arm-neon-asm.h" /* Global configuration options and preferences */ @@ -2986,11 +2987,11 @@ fname: vmull.u8 q1, d0, d28 vmlal.u8 q1, d1, d29 /* 5 cycles bubble */ - vshll.u16 q0, d2, #8 + vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q0, d2, d30 vmlal.u16 q0, d3, d30 /* 5 cycles bubble */ - vshrn.u32 d0, q0, #16 + vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) /* 3 cycles bubble */ vmovn.u16 d0, q0 /* 1 cycle bubble */ @@ -3000,15 +3001,15 @@ fname: .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt bilinear_load_and_vertical_interpolate_two_&src_fmt \ q1, q11, d0, d1, d20, d21, d22, d23 - vshll.u16 q0, d2, #8 + vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q0, d2, d30 vmlal.u16 q0, d3, d30 - vshll.u16 q10, d22, #8 + vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q10, d22, d31 vmlal.u16 q10, d23, d31 - vshrn.u32 d0, q0, #16 - vshrn.u32 d1, q10, #16 - vshr.u16 q15, q12, #8 + vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vadd.u16 q12, q12, q13 vmovn.u16 d0, q0 bilinear_store_&dst_fmt 2, q2, q3 @@ -3020,26 +3021,26 @@ fname: q3, q9, d4, d5, d16, d17, d18, d19 pld [TMP1, PF_OFFS] sub TMP1, TMP1, STRIDE - vshll.u16 q0, d2, #8 + vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q0, d2, d30 vmlal.u16 q0, d3, d30 - vshll.u16 q10, d22, #8 + vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q10, d22, d31 vmlal.u16 q10, d23, d31 - vshr.u16 q15, q12, #8 - vshll.u16 q2, d6, #8 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) + vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q2, d6, d30 vmlal.u16 q2, d7, d30 - vshll.u16 q8, d18, #8 + vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS pld [TMP2, PF_OFFS] vmlsl.u16 q8, d18, d31 vmlal.u16 q8, d19, d31 vadd.u16 q12, q12, q13 - vshrn.u32 d0, q0, #16 - vshrn.u32 d1, q10, #16 - vshrn.u32 d4, q2, #16 - vshrn.u32 d5, q8, #16 - vshr.u16 q15, q12, #8 + vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS) + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vmovn.u16 d0, q0 vmovn.u16 d1, q2 vadd.u16 q12, q12, q13 @@ -3158,13 +3159,13 @@ pixman_asm_function fname blt 0f tst OUT, #(1 << dst_bpp_shift) beq 0f - vshr.u16 q15, q12, #8 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vadd.u16 q12, q12, q13 bilinear_interpolate_last_pixel src_fmt, dst_fmt sub WIDTH, WIDTH, #1 0: vadd.u16 q13, q13, q13 - vshr.u16 q15, q12, #8 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vadd.u16 q12, q12, q13 cmp WIDTH, #2 @@ -3282,7 +3283,7 @@ pixman_asm_function fname vmull.u8 q10, d22, d28 vmlal.u8 q10, d23, d29 - vshll.u16 q0, d16, #8 + vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q0, d16, d30 vmlal.u16 q0, d17, d30 @@ -3293,25 +3294,25 @@ pixman_asm_function fname vmull.u8 q11, d16, d28 vmlal.u8 q11, d17, d29 - vshll.u16 q1, d18, #8 + vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q1, d18, d31 .endm .macro bilinear_interpolate_four_pixels_8888_8888_tail vmlal.u16 q1, d19, d31 - vshr.u16 q15, q12, #8 - vshll.u16 q2, d20, #8 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) + vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q2, d20, d30 vmlal.u16 q2, d21, d30 - vshll.u16 q3, d22, #8 + vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q3, d22, d31 vmlal.u16 q3, d23, d31 vadd.u16 q12, q12, q13 - vshrn.u32 d0, q0, #16 - vshrn.u32 d1, q1, #16 - vshrn.u32 d4, q2, #16 - vshr.u16 q15, q12, #8 - vshrn.u32 d5, q3, #16 + vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) vmovn.u16 d6, q0 vmovn.u16 d7, q2 vadd.u16 q12, q12, q13 @@ -3326,22 +3327,22 @@ pixman_asm_function fname add X, X, UX add TMP2, TOP, TMP2, asl #2 vmlal.u16 q1, d19, d31 - vshr.u16 q15, q12, #8 - vshll.u16 q2, d20, #8 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) + vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q2, d20, d30 vmlal.u16 q2, d21, d30 - vshll.u16 q3, d22, #8 + vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS vld1.32 {d20}, [TMP1], STRIDE vmlsl.u16 q3, d22, d31 vmlal.u16 q3, d23, d31 vld1.32 {d21}, [TMP1] vmull.u8 q8, d20, d28 vmlal.u8 q8, d21, d29 - vshrn.u32 d0, q0, #16 - vshrn.u32 d1, q1, #16 - vshrn.u32 d4, q2, #16 + vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) vld1.32 {d22}, [TMP2], STRIDE - vshrn.u32 d5, q3, #16 + vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) vadd.u16 q12, q12, q13 vld1.32 {d23}, [TMP2] vmull.u8 q9, d22, d28 @@ -3353,12 +3354,12 @@ pixman_asm_function fname add TMP4, TOP, TMP4, asl #2 vmlal.u8 q9, d23, d29 vld1.32 {d22}, [TMP3], STRIDE - vshr.u16 q15, q12, #8 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vld1.32 {d23}, [TMP3] vmull.u8 q10, d22, d28 vmlal.u8 q10, d23, d29 vmovn.u16 d6, q0 - vshll.u16 q0, d16, #8 + vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS vmovn.u16 d7, q2 vmlsl.u16 q0, d16, d30 vmlal.u16 q0, d17, d30 @@ -3370,7 +3371,7 @@ pixman_asm_function fname vmull.u8 q11, d16, d28 vmlal.u8 q11, d17, d29 vst1.32 {d6, d7}, [OUT, :128]! - vshll.u16 q1, d18, #8 + vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q1, d18, d31 .endm @@ -3403,7 +3404,7 @@ pixman_asm_function fname vld1.32 {d23}, [TMP3] vmull.u8 q10, d22, d28 vmlal.u8 q10, d23, d29 - vshll.u16 q0, d16, #8 + vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q0, d16, d30 vmlal.u16 q0, d17, d30 pld [TMP4, PF_OFFS] @@ -3412,7 +3413,7 @@ pixman_asm_function fname pld [TMP4, PF_OFFS] vmull.u8 q11, d16, d28 vmlal.u8 q11, d17, d29 - vshll.u16 q1, d18, #8 + vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q1, d18, d31 mov TMP1, X, asr #16 @@ -3422,22 +3423,22 @@ pixman_asm_function fname add X, X, UX add TMP2, TOP, TMP2, asl #2 vmlal.u16 q1, d19, d31 - vshr.u16 q15, q12, #8 - vshll.u16 q2, d20, #8 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) + vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q2, d20, d30 vmlal.u16 q2, d21, d30 - vshll.u16 q3, d22, #8 + vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS vld1.32 {d20}, [TMP1], STRIDE vmlsl.u16 q3, d22, d31 vmlal.u16 q3, d23, d31 vld1.32 {d21}, [TMP1] vmull.u8 q8, d20, d28 vmlal.u8 q8, d21, d29 - vshrn.u32 d0, q0, #16 - vshrn.u32 d1, q1, #16 - vshrn.u32 d4, q2, #16 + vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) vld1.32 {d22}, [TMP2], STRIDE - vshrn.u32 d5, q3, #16 + vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) vadd.u16 q12, q12, q13 vld1.32 {d23}, [TMP2] vmull.u8 q9, d22, d28 @@ -3449,12 +3450,12 @@ pixman_asm_function fname add TMP4, TOP, TMP4, asl #2 vmlal.u8 q9, d23, d29 vld1.32 {d22}, [TMP3], STRIDE - vshr.u16 q15, q12, #8 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vld1.32 {d23}, [TMP3] vmull.u8 q10, d22, d28 vmlal.u8 q10, d23, d29 vmovn.u16 d8, q0 - vshll.u16 q0, d16, #8 + vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS vmovn.u16 d9, q2 vmlsl.u16 q0, d16, d30 vmlal.u16 q0, d17, d30 @@ -3465,25 +3466,25 @@ pixman_asm_function fname pld [TMP4, PF_OFFS] vmull.u8 q11, d16, d28 vmlal.u8 q11, d17, d29 - vshll.u16 q1, d18, #8 + vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q1, d18, d31 .endm .macro bilinear_interpolate_eight_pixels_8888_0565_tail vmlal.u16 q1, d19, d31 - vshr.u16 q15, q12, #8 - vshll.u16 q2, d20, #8 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) + vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q2, d20, d30 vmlal.u16 q2, d21, d30 - vshll.u16 q3, d22, #8 + vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q3, d22, d31 vmlal.u16 q3, d23, d31 vadd.u16 q12, q12, q13 - vshrn.u32 d0, q0, #16 - vshrn.u32 d1, q1, #16 - vshrn.u32 d4, q2, #16 - vshr.u16 q15, q12, #8 - vshrn.u32 d5, q3, #16 + vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) vmovn.u16 d10, q0 vmovn.u16 d11, q2 vadd.u16 q12, q12, q13 @@ -3508,23 +3509,23 @@ pixman_asm_function fname add X, X, UX add TMP2, TOP, TMP2, asl #2 vmlal.u16 q1, d19, d31 - vshr.u16 q15, q12, #8 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vuzp.u8 d8, d9 - vshll.u16 q2, d20, #8 + vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q2, d20, d30 vmlal.u16 q2, d21, d30 - vshll.u16 q3, d22, #8 + vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS vld1.32 {d20}, [TMP1], STRIDE vmlsl.u16 q3, d22, d31 vmlal.u16 q3, d23, d31 vld1.32 {d21}, [TMP1] vmull.u8 q8, d20, d28 vmlal.u8 q8, d21, d29 - vshrn.u32 d0, q0, #16 - vshrn.u32 d1, q1, #16 - vshrn.u32 d4, q2, #16 + vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) vld1.32 {d22}, [TMP2], STRIDE - vshrn.u32 d5, q3, #16 + vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) vadd.u16 q12, q12, q13 vld1.32 {d23}, [TMP2] vmull.u8 q9, d22, d28 @@ -3536,12 +3537,12 @@ pixman_asm_function fname add TMP4, TOP, TMP4, asl #2 vmlal.u8 q9, d23, d29 vld1.32 {d22}, [TMP3], STRIDE - vshr.u16 q15, q12, #8 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vld1.32 {d23}, [TMP3] vmull.u8 q10, d22, d28 vmlal.u8 q10, d23, d29 vmovn.u16 d10, q0 - vshll.u16 q0, d16, #8 + vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS vmovn.u16 d11, q2 vmlsl.u16 q0, d16, d30 vmlal.u16 q0, d17, d30 @@ -3553,7 +3554,7 @@ pixman_asm_function fname vmull.u8 q11, d16, d28 vmlal.u8 q11, d17, d29 vuzp.u8 d10, d11 - vshll.u16 q1, d18, #8 + vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q1, d18, d31 mov TMP1, X, asr #16 @@ -3564,12 +3565,12 @@ pixman_asm_function fname add TMP2, TOP, TMP2, asl #2 vmlal.u16 q1, d19, d31 vuzp.u8 d9, d11 - vshr.u16 q15, q12, #8 - vshll.u16 q2, d20, #8 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) + vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS vuzp.u8 d8, d10 vmlsl.u16 q2, d20, d30 vmlal.u16 q2, d21, d30 - vshll.u16 q3, d22, #8 + vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS vld1.32 {d20}, [TMP1], STRIDE vmlsl.u16 q3, d22, d31 vmlal.u16 q3, d23, d31 @@ -3579,13 +3580,13 @@ pixman_asm_function fname vshll.u8 q6, d9, #8 vshll.u8 q5, d10, #8 vshll.u8 q7, d8, #8 - vshrn.u32 d0, q0, #16 + vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) vsri.u16 q5, q6, #5 - vshrn.u32 d1, q1, #16 + vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) vsri.u16 q5, q7, #11 - vshrn.u32 d4, q2, #16 + vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) vld1.32 {d22}, [TMP2], STRIDE - vshrn.u32 d5, q3, #16 + vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) vadd.u16 q12, q12, q13 vld1.32 {d23}, [TMP2] vmull.u8 q9, d22, d28 @@ -3597,12 +3598,12 @@ pixman_asm_function fname add TMP4, TOP, TMP4, asl #2 vmlal.u8 q9, d23, d29 vld1.32 {d22}, [TMP3], STRIDE - vshr.u16 q15, q12, #8 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vld1.32 {d23}, [TMP3] vmull.u8 q10, d22, d28 vmlal.u8 q10, d23, d29 vmovn.u16 d8, q0 - vshll.u16 q0, d16, #8 + vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS vmovn.u16 d9, q2 vmlsl.u16 q0, d16, d30 vmlal.u16 q0, d17, d30 @@ -3613,7 +3614,7 @@ pixman_asm_function fname pld [TMP4, PF_OFFS] vmull.u8 q11, d16, d28 vmlal.u8 q11, d17, d29 - vshll.u16 q1, d18, #8 + vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS vst1.32 {d10, d11}, [OUT, :128]! vmlsl.u16 q1, d18, d31 .endm diff --git a/pixman/pixman/pixman-bits-image.c b/pixman/pixman/pixman-bits-image.c index 05eab9634..b6c8630f4 100644 --- a/pixman/pixman/pixman-bits-image.c +++ b/pixman/pixman/pixman-bits-image.c @@ -131,8 +131,8 @@ bits_image_fetch_pixel_bilinear (bits_image_t *image, x1 = x - pixman_fixed_1 / 2; y1 = y - pixman_fixed_1 / 2; - distx = (x1 >> 8) & 0xff; - disty = (y1 >> 8) & 0xff; + distx = pixman_fixed_to_bilinear_weight (x1); + disty = pixman_fixed_to_bilinear_weight (y1); x1 = pixman_fixed_to_int (x1); y1 = pixman_fixed_to_int (y1); @@ -200,7 +200,7 @@ bits_image_fetch_bilinear_no_repeat_8888 (pixman_iter_t *iter, x = x_top = x_bottom = v.vector[0] - pixman_fixed_1/2; y = v.vector[1] - pixman_fixed_1/2; - disty = (y >> 8) & 0xff; + disty = pixman_fixed_to_bilinear_weight (y); /* Load the pointers to the first and second lines from the source * image that bilinear code must read. @@ -309,7 +309,7 @@ bits_image_fetch_bilinear_no_repeat_8888 (pixman_iter_t *iter, tr = top_row[pixman_fixed_to_int (x_top) + 1] | top_mask; br = bottom_row[pixman_fixed_to_int (x_bottom) + 1] | bottom_mask; - distx = (x >> 8) & 0xff; + distx = pixman_fixed_to_bilinear_weight (x); *buffer++ = bilinear_interpolation (0, tr, 0, br, distx, disty); @@ -334,7 +334,7 @@ bits_image_fetch_bilinear_no_repeat_8888 (pixman_iter_t *iter, bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask; br = bottom_row [pixman_fixed_to_int (x_bottom) + 1] | bottom_mask; - distx = (x >> 8) & 0xff; + distx = pixman_fixed_to_bilinear_weight (x); *buffer = bilinear_interpolation (tl, tr, bl, br, distx, disty); } @@ -358,7 +358,7 @@ bits_image_fetch_bilinear_no_repeat_8888 (pixman_iter_t *iter, tl = top_row [pixman_fixed_to_int (x_top)] | top_mask; bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask; - distx = (x >> 8) & 0xff; + distx = pixman_fixed_to_bilinear_weight (x); *buffer = bilinear_interpolation (tl, 0, bl, 0, distx, disty); } @@ -695,8 +695,8 @@ bits_image_fetch_bilinear_affine (pixman_image_t * image, x1 = x - pixman_fixed_1 / 2; y1 = y - pixman_fixed_1 / 2; - distx = (x1 >> 8) & 0xff; - disty = (y1 >> 8) & 0xff; + distx = pixman_fixed_to_bilinear_weight (x1); + disty = pixman_fixed_to_bilinear_weight (y1); y1 = pixman_fixed_to_int (y1); y2 = y1 + 1; diff --git a/pixman/pixman/pixman-inlines.h b/pixman/pixman/pixman-inlines.h index 3532867a4..5517de5a5 100644 --- a/pixman/pixman/pixman-inlines.h +++ b/pixman/pixman/pixman-inlines.h @@ -81,6 +81,13 @@ repeat (pixman_repeat_t repeat, int *c, int size) return TRUE; } +static force_inline int +pixman_fixed_to_bilinear_weight (pixman_fixed_t x) +{ + return (x >> (16 - BILINEAR_INTERPOLATION_BITS)) & + ((1 << BILINEAR_INTERPOLATION_BITS) - 1); +} + #if SIZEOF_LONG > 4 static force_inline uint32_t @@ -92,6 +99,9 @@ bilinear_interpolation (uint32_t tl, uint32_t tr, uint64_t tl64, tr64, bl64, br64; uint64_t f, r; + distx <<= (8 - BILINEAR_INTERPOLATION_BITS); + disty <<= (8 - BILINEAR_INTERPOLATION_BITS); + distxy = distx * disty; distxiy = distx * (256 - disty); distixy = (256 - distx) * disty; @@ -135,6 +145,9 @@ bilinear_interpolation (uint32_t tl, uint32_t tr, int distxy, distxiy, distixy, distixiy; uint32_t f, r; + distx <<= (8 - BILINEAR_INTERPOLATION_BITS); + disty <<= (8 - BILINEAR_INTERPOLATION_BITS); + distxy = distx * disty; distxiy = (distx << 8) - distxy; /* distx * (256 - disty) */ distixy = (disty << 8) - distxy; /* disty * (256 - distx) */ @@ -758,12 +771,14 @@ bilinear_pad_repeat_get_scanline_bounds (int32_t source_image_width, * all source pixels are fetched from zero padding * zone for NONE repeat * - * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to 256, - * but sometimes it may be less than that for NONE repeat when handling - * fuzzy antialiased top or bottom image edges. Also both top and - * bottom weight variables are guaranteed to have value in 0-255 - * range and can fit into unsigned byte or be used with 8-bit SIMD - * multiplication instructions. + * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to + * BILINEAR_INTERPOLATION_RANGE, but sometimes it may be less than that + * for NONE repeat when handling fuzzy antialiased top or bottom image + * edges. Also both top and bottom weight variables are guaranteed to + * have value, which is less than BILINEAR_INTERPOLATION_RANGE. + * For example, the weights can fit into unsigned byte or be used + * with 8-bit SIMD multiplication instructions for 8-bit interpolation + * precision. */ #define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t, \ dst_type_t, repeat_mode, flags) \ @@ -877,18 +892,18 @@ fast_composite_scaled_bilinear ## scale_func_name (pixman_implementation_t *imp, } \ \ y1 = pixman_fixed_to_int (vy); \ - weight2 = (vy >> 8) & 0xff; \ + weight2 = pixman_fixed_to_bilinear_weight (vy); \ if (weight2) \ { \ - /* normal case, both row weights are in 0-255 range and fit unsigned byte */ \ + /* both weight1 and weight2 are smaller than BILINEAR_INTERPOLATION_RANGE */ \ y2 = y1 + 1; \ - weight1 = 256 - weight2; \ + weight1 = BILINEAR_INTERPOLATION_RANGE - weight2; \ } \ else \ { \ - /* set both top and bottom row to the same scanline, and weights to 128+128 */ \ + /* set both top and bottom row to the same scanline and tweak weights */ \ y2 = y1; \ - weight1 = weight2 = 128; \ + weight1 = weight2 = BILINEAR_INTERPOLATION_RANGE / 2; \ } \ vy += unit_y; \ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD) \ diff --git a/pixman/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman/pixman-mips-dspr2-asm.S index 87558f032..48f108ed9 100644 --- a/pixman/pixman/pixman-mips-dspr2-asm.S +++ b/pixman/pixman/pixman-mips-dspr2-asm.S @@ -29,6 +29,7 @@ * Author: Nemanja Lukic (nlukic@mips.com) */ +#include "pixman-private.h" #include "pixman-mips-dspr2-asm.h" LEAF_MIPS_DSPR2(pixman_fill_buff16_mips) @@ -771,11 +772,15 @@ LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_mips) lw s1, 48(sp) /* s1 = wb */ lw s2, 52(sp) /* s2 = vx */ lw s3, 56(sp) /* s3 = unit_x */ - li v0, 256 + li v0, BILINEAR_INTERPOLATION_RANGE li s8, 0x00ff00ff + + sll s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) + sll s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) + 0: andi t4, s2, 0xffff /* t4 = (short)vx */ - srl t4, t4, 8 /* t4 = vx >> 8 */ + srl t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */ subu t5, v0, t4 /* t5 = ( 256 - (vx>>8)) */ mul s4, s0, t5 /* s4 = wt*(256-(vx>>8)) */ diff --git a/pixman/pixman/pixman-mmx.c b/pixman/pixman/pixman-mmx.c index 7c1f4fe24..b3ef2495b 100644 --- a/pixman/pixman/pixman-mmx.c +++ b/pixman/pixman/pixman-mmx.c @@ -42,6 +42,7 @@ #endif #include "pixman-private.h" #include "pixman-combine32.h" +#include "pixman-inlines.h" #define no_vERBOSE @@ -718,6 +719,24 @@ combine (const uint32_t *src, const uint32_t *mask) return vsrc; } +static force_inline __m64 +core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst) +{ + vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ()); + + if (is_opaque (vsrc)) + { + return vsrc; + } + else if (!is_zero (vsrc)) + { + return over (vsrc, expand_alpha (vsrc), + _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ())); + } + + return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()); +} + static void mmx_combine_over_u (pixman_implementation_t *imp, pixman_op_t op, @@ -1623,9 +1642,7 @@ mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp, PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format); - mask &= 0xff000000; - mask = mask | mask >> 8 | mask >> 16 | mask >> 24; - vmask = load8888 (&mask); + vmask = expand_alpha (load8888 (&mask)); while (height--) { @@ -1694,9 +1711,7 @@ mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp, PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format); - mask &= 0xff000000; - mask = mask | mask >> 8 | mask >> 16 | mask >> 24; - vmask = load8888 (&mask); + vmask = expand_alpha (load8888 (&mask)); srca = MC (4x00ff); while (height--) @@ -3532,6 +3547,242 @@ mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp, _mm_empty (); } +#define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS)) +#define BMSK (BSHIFT - 1) + +#define BILINEAR_DECLARE_VARIABLES \ + const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt); \ + const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb); \ + const __m64 mm_BSHIFT = _mm_set_pi16 (BSHIFT, BSHIFT, BSHIFT, BSHIFT); \ + const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1); \ + const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK); \ + const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x); \ + const __m64 mm_zero = _mm_setzero_si64 (); \ + __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx) + +#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \ +do { \ + /* fetch 2x2 pixel block into 2 mmx registers */ \ + __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]); \ + __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]); \ + vx += unit_x; \ + /* vertical interpolation */ \ + __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt); \ + __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt); \ + __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb); \ + __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb); \ + __m64 hi = _mm_add_pi16 (t_hi, b_hi); \ + __m64 lo = _mm_add_pi16 (t_lo, b_lo); \ + if (BILINEAR_INTERPOLATION_BITS < 8) \ + { \ + /* calculate horizontal weights */ \ + __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7, \ + _mm_srli_pi16 (mm_x, \ + 16 - BILINEAR_INTERPOLATION_BITS))); \ + mm_x = _mm_add_pi16 (mm_x, mm_ux); \ + /* horizontal interpolation */ \ + __m64 p = _mm_unpacklo_pi16 (lo, hi); \ + __m64 q = _mm_unpackhi_pi16 (lo, hi); \ + lo = _mm_madd_pi16 (p, mm_wh); \ + hi = _mm_madd_pi16 (q, mm_wh); \ + } \ + else \ + { \ + /* calculate horizontal weights */ \ + __m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x, \ + 16 - BILINEAR_INTERPOLATION_BITS)); \ + __m64 mm_wh_hi = _mm_srli_pi16 (mm_x, \ + 16 - BILINEAR_INTERPOLATION_BITS); \ + mm_x = _mm_add_pi16 (mm_x, mm_ux); \ + /* horizontal interpolation */ \ + __m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo); \ + __m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi); \ + __m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo); \ + __m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi); \ + lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo), \ + _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi)); \ + hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo), \ + _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi)); \ + } \ + /* shift and pack the result */ \ + hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2); \ + lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2); \ + lo = _mm_packs_pi32 (lo, hi); \ + lo = _mm_packs_pu16 (lo, lo); \ + pix = lo; \ +} while (0) + +#define BILINEAR_SKIP_ONE_PIXEL() \ +do { \ + vx += unit_x; \ + mm_x = _mm_add_pi16 (mm_x, mm_ux); \ +} while(0) + +static force_inline void +scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t * dst, + const uint32_t * mask, + const uint32_t * src_top, + const uint32_t * src_bottom, + int32_t w, + int wt, + int wb, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx, + pixman_bool_t zero_src) +{ + BILINEAR_DECLARE_VARIABLES; + __m64 pix; + + while (w--) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix); + store (dst, pix); + dst++; + } + + _mm_empty (); +} + +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC, + scaled_bilinear_scanline_mmx_8888_8888_SRC, + uint32_t, uint32_t, uint32_t, + COVER, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC, + scaled_bilinear_scanline_mmx_8888_8888_SRC, + uint32_t, uint32_t, uint32_t, + PAD, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC, + scaled_bilinear_scanline_mmx_8888_8888_SRC, + uint32_t, uint32_t, uint32_t, + NONE, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC, + scaled_bilinear_scanline_mmx_8888_8888_SRC, + uint32_t, uint32_t, uint32_t, + NORMAL, FLAG_NONE) + +static force_inline void +scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t * dst, + const uint32_t * mask, + const uint32_t * src_top, + const uint32_t * src_bottom, + int32_t w, + int wt, + int wb, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx, + pixman_bool_t zero_src) +{ + BILINEAR_DECLARE_VARIABLES; + __m64 pix1, pix2; + + while (w) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + + if (!is_zero (pix1)) + { + pix2 = load (dst); + store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2)); + } + + w--; + dst++; + } + + _mm_empty (); +} + +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER, + scaled_bilinear_scanline_mmx_8888_8888_OVER, + uint32_t, uint32_t, uint32_t, + COVER, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER, + scaled_bilinear_scanline_mmx_8888_8888_OVER, + uint32_t, uint32_t, uint32_t, + PAD, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER, + scaled_bilinear_scanline_mmx_8888_8888_OVER, + uint32_t, uint32_t, uint32_t, + NONE, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER, + scaled_bilinear_scanline_mmx_8888_8888_OVER, + uint32_t, uint32_t, uint32_t, + NORMAL, FLAG_NONE) + +static force_inline void +scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t * dst, + const uint8_t * mask, + const uint32_t * src_top, + const uint32_t * src_bottom, + int32_t w, + int wt, + int wb, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx, + pixman_bool_t zero_src) +{ + BILINEAR_DECLARE_VARIABLES; + __m64 pix1, pix2; + uint32_t m; + + while (w) + { + m = (uint32_t) *mask++; + + if (m) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + + if (m == 0xff && is_opaque (pix1)) + { + store (dst, pix1); + } + else + { + __m64 ms, md, ma, msa; + + pix2 = load (dst); + ma = expand_alpha_rev (to_m64 (m)); + ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ()); + md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ()); + + msa = expand_alpha (ms); + + store8888 (dst, (in_over (ms, msa, ma, md))); + } + } + else + { + BILINEAR_SKIP_ONE_PIXEL (); + } + + w--; + dst++; + } + + _mm_empty (); +} + +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER, + scaled_bilinear_scanline_mmx_8888_8_8888_OVER, + uint32_t, uint8_t, uint32_t, + COVER, FLAG_HAVE_NON_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER, + scaled_bilinear_scanline_mmx_8888_8_8888_OVER, + uint32_t, uint8_t, uint32_t, + PAD, FLAG_HAVE_NON_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER, + scaled_bilinear_scanline_mmx_8888_8_8888_OVER, + uint32_t, uint8_t, uint32_t, + NONE, FLAG_HAVE_NON_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER, + scaled_bilinear_scanline_mmx_8888_8_8888_OVER, + uint32_t, uint8_t, uint32_t, + NORMAL, FLAG_HAVE_NON_SOLID_MASK) + static uint32_t * mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask) { @@ -3787,6 +4038,23 @@ static const pixman_fast_path_t mmx_fast_paths[] = PIXMAN_STD_FAST_PATH (IN, a8, null, a8, mmx_composite_in_8_8 ), PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, mmx_composite_in_n_8_8 ), + SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ), + SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ), + SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mmx_8888_8888 ), + SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ), + SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ), + SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, mmx_8888_8888 ), + + SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ), + SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ), + SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ), + SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ), + + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888 ), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888 ), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888 ), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888 ), + { PIXMAN_OP_NONE }, }; diff --git a/pixman/pixman/pixman-private.h b/pixman/pixman/pixman-private.h index 72e3b4f6d..0c27798b0 100644 --- a/pixman/pixman/pixman-private.h +++ b/pixman/pixman/pixman-private.h @@ -1,10 +1,24 @@ +#ifndef PIXMAN_PRIVATE_H +#define PIXMAN_PRIVATE_H + +/* + * The defines which are shared between C and assembly code + */ + +/* bilinear interpolation precision (must be <= 8) */ +#define BILINEAR_INTERPOLATION_BITS 7 +#define BILINEAR_INTERPOLATION_RANGE (1 << BILINEAR_INTERPOLATION_BITS) + +/* + * C specific part + */ + +#ifndef __ASSEMBLER__ + #ifndef PACKAGE # error config.h must be included before pixman-private.h #endif -#ifndef PIXMAN_PRIVATE_H -#define PIXMAN_PRIVATE_H - #define PIXMAN_DISABLE_DEPRECATED #define PIXMAN_USE_INTERNAL_API @@ -1052,4 +1066,6 @@ void pixman_timer_register (pixman_timer_t *timer); #endif /* PIXMAN_TIMERS */ +#endif /* __ASSEMBLER__ */ + #endif /* PIXMAN_PRIVATE_H */ diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c index b656d17d4..ba067bc31 100644 --- a/pixman/pixman/pixman-sse2.c +++ b/pixman/pixman/pixman-sse2.c @@ -5364,11 +5364,15 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, scaled_nearest_scanline_sse2_8888_n_8888_OVER, uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE) +#define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1) + #define BILINEAR_DECLARE_VARIABLES \ const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \ const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \ - const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);\ - const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1); \ + const __m128i xmm_xorc8 = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, BMSK);\ + const __m128i xmm_addc8 = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1); \ + const __m128i xmm_xorc7 = _mm_set_epi16 (0, BMSK, 0, BMSK, 0, BMSK, 0, BMSK);\ + const __m128i xmm_addc7 = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \ const __m128i xmm_ux = _mm_set_epi16 (unit_x&0xffff, unit_x&0xffff, unit_x&0xffff, unit_x&0xffff, \ unit_x&0xffff, unit_x&0xffff, unit_x&0xffff, unit_x&0xffff); \ const __m128i xmm_zero = _mm_setzero_si128 (); \ @@ -5388,18 +5392,30 @@ do { \ xmm_wt), \ _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), \ xmm_wb)); \ - /* calculate horizontal weights */ \ - xmm_wh = _mm_add_epi16 (xmm_addc, \ - _mm_xor_si128 (xmm_xorc, \ - _mm_srli_epi16 (xmm_x, 8))); \ - xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ - /* horizontal interpolation */ \ - xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \ - xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \ - a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \ - _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \ + if (BILINEAR_INTERPOLATION_BITS < 8) \ + { \ + /* calculate horizontal weights */ \ + xmm_wh = _mm_add_epi16 (xmm_addc7, _mm_xor_si128 (xmm_xorc7, \ + _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS))); \ + xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ + /* horizontal interpolation */ \ + a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \ + a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh); \ + } \ + else \ + { \ + /* calculate horizontal weights */ \ + xmm_wh = _mm_add_epi16 (xmm_addc8, _mm_xor_si128 (xmm_xorc8, \ + _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS))); \ + xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ + /* horizontal interpolation */ \ + xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \ + xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \ + a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \ + _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \ + } \ /* shift and pack the result */ \ - a = _mm_srli_epi32 (a, 16); \ + a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2); \ a = _mm_packs_epi32 (a, a); \ a = _mm_packus_epi16 (a, a); \ pix = _mm_cvtsi128_si32 (a); \ @@ -5845,6 +5861,9 @@ static const pixman_fast_path_t sse2_fast_paths[] = SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888), SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888), SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888), + SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888), + SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888), + SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888), SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), diff --git a/pixman/test/affine-test.c b/pixman/test/affine-test.c index a4ceed3da..6827cc3a8 100644 --- a/pixman/test/affine-test.c +++ b/pixman/test/affine-test.c @@ -301,11 +301,21 @@ test_composite (int testnum, return crc32; } +#if BILINEAR_INTERPOLATION_BITS == 8 +#define CHECKSUM 0x1EF2175A +#elif BILINEAR_INTERPOLATION_BITS == 7 +#define CHECKSUM 0x74050F50 +#elif BILINEAR_INTERPOLATION_BITS == 4 +#define CHECKSUM 0x4362EAE8 +#else +#define CHECKSUM 0x00000000 +#endif + int main (int argc, const char *argv[]) { pixman_disable_out_of_bounds_workaround (); - return fuzzer_test_main ("affine", 8000000, 0x1EF2175A, + return fuzzer_test_main ("affine", 8000000, CHECKSUM, test_composite, argc, argv); } diff --git a/pixman/test/scaling-test.c b/pixman/test/scaling-test.c index 6f2da1432..44c4f3de4 100644 --- a/pixman/test/scaling-test.c +++ b/pixman/test/scaling-test.c @@ -357,11 +357,21 @@ test_composite (int testnum, return crc32; } +#if BILINEAR_INTERPOLATION_BITS == 8 +#define CHECKSUM 0x80DF1CB2 +#elif BILINEAR_INTERPOLATION_BITS == 7 +#define CHECKSUM 0x2818D5FB +#elif BILINEAR_INTERPOLATION_BITS == 4 +#define CHECKSUM 0x387540A5 +#else +#define CHECKSUM 0x00000000 +#endif + int main (int argc, const char *argv[]) { pixman_disable_out_of_bounds_workaround (); - return fuzzer_test_main("scaling", 8000000, 0x80DF1CB2, + return fuzzer_test_main("scaling", 8000000, CHECKSUM, test_composite, argc, argv); } |