Diffstat (limited to 'pixman')
-rw-r--r--  pixman/pixman/Makefile.am                     |   1
-rw-r--r--  pixman/pixman/loongson-mmintrin.h             | 116
-rw-r--r--  pixman/pixman/pixman-arm-neon-asm-bilinear.S  | 119
-rw-r--r--  pixman/pixman/pixman-arm-neon-asm.S           | 159
-rw-r--r--  pixman/pixman/pixman-bits-image.c             |  16
-rw-r--r--  pixman/pixman/pixman-inlines.h                |  37
-rw-r--r--  pixman/pixman/pixman-mips-dspr2-asm.S         |   9
-rw-r--r--  pixman/pixman/pixman-mmx.c                    | 280
-rw-r--r--  pixman/pixman/pixman-private.h                |  22
-rw-r--r--  pixman/pixman/pixman-sse2.c                   |  45
-rw-r--r--  pixman/test/affine-test.c                     |  12
-rw-r--r--  pixman/test/scaling-test.c                    |  12
12 files changed, 645 insertions(+), 183 deletions(-)
diff --git a/pixman/pixman/Makefile.am b/pixman/pixman/Makefile.am
index 1b232ad0f..deacf8728 100644
--- a/pixman/pixman/Makefile.am
+++ b/pixman/pixman/Makefile.am
@@ -92,6 +92,7 @@ endif
# iwmmxt code
if USE_ARM_IWMMXT
+libpixman_iwmmxt_la_SOURCES = pixman-mmx.c
noinst_LTLIBRARIES += libpixman-iwmmxt.la
libpixman_1_la_LIBADD += libpixman-iwmmxt.la
diff --git a/pixman/pixman/loongson-mmintrin.h b/pixman/pixman/loongson-mmintrin.h
index 1a114fe0f..086c6e0f1 100644
--- a/pixman/pixman/loongson-mmintrin.h
+++ b/pixman/pixman/loongson-mmintrin.h
@@ -45,6 +45,28 @@ _mm_setzero_si64 (void)
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pi16 (__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+ asm("paddh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+ return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pi32 (__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+ asm("paddw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+ return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
__m64 ret;
@@ -150,6 +172,78 @@ _mm_packs_pu16 (__m64 __m1, __m64 __m2)
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_pi32 (__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+ asm("packsswh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+ return ret;
+}
+
+#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
+ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pi16 (uint16_t __w3, uint16_t __w2, uint16_t __w1, uint16_t __w0)
+{
+ if (__builtin_constant_p (__w3) &&
+ __builtin_constant_p (__w2) &&
+ __builtin_constant_p (__w1) &&
+ __builtin_constant_p (__w0))
+ {
+ uint64_t val = ((uint64_t)__w3 << 48)
+ | ((uint64_t)__w2 << 32)
+ | ((uint64_t)__w1 << 16)
+ | ((uint64_t)__w0 << 0);
+ return *(__m64 *)&val;
+ }
+ else if (__w3 == __w2 && __w2 == __w1 && __w1 == __w0)
+ {
+ /* TODO: handle other cases */
+ uint64_t val = __w3;
+ uint64_t imm = _MM_SHUFFLE (0, 0, 0, 0);
+ __m64 ret;
+ asm("pshufh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (*(__m64 *)&val), "f" (*(__m64 *)&imm)
+ );
+ return ret;
+ }
+ uint64_t val = ((uint64_t)__w3 << 48)
+ | ((uint64_t)__w2 << 32)
+ | ((uint64_t)__w1 << 16)
+ | ((uint64_t)__w0 << 0);
+ return *(__m64 *)&val;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pi32 (unsigned __i1, unsigned __i0)
+{
+ if (__builtin_constant_p (__i1) &&
+ __builtin_constant_p (__i0))
+ {
+ uint64_t val = ((uint64_t)__i1 << 32)
+ | ((uint64_t)__i0 << 0);
+ return *(__m64 *)&val;
+ }
+ else if (__i1 == __i0)
+ {
+ uint64_t imm = _MM_SHUFFLE (1, 0, 1, 0);
+ __m64 ret;
+ asm("pshufh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
+ );
+ return ret;
+ }
+ uint64_t val = ((uint64_t)__i1 << 32)
+ | ((uint64_t)__i0 << 0);
+ return *(__m64 *)&val;
+}
+#undef _MM_SHUFFLE
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __m, int64_t __n)
{
__m64 ret;
@@ -193,6 +287,17 @@ _mm_srli_pi16 (__m64 __m, int64_t __count)
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_pi32 (__m64 __m, int64_t __count)
+{
+ __m64 ret;
+ asm("psrlw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m), "f" (*(__m64 *)&__count)
+ );
+ return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, int64_t __count)
{
__m64 ret;
@@ -204,6 +309,17 @@ _mm_srli_si64 (__m64 __m, int64_t __count)
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_pi16 (__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+ asm("psubh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+ return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
__m64 ret;
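
For reference, a scalar sketch of what the new packed 16-bit Loongson helpers compute (illustrative C only, not part of the patch; the function name is hypothetical):

#include <stdint.h>

/* Illustrative scalar equivalent of _mm_add_pi16: four independent 16-bit
 * lane additions (with 16-bit wraparound) inside one 64-bit vector.  The
 * Loongson versions above simply map this to the paddh MMI opcode, and
 * _mm_sub_pi16/psubh is the same pattern with subtraction. */
static uint64_t
scalar_add_pi16 (uint64_t a, uint64_t b)
{
    uint64_t r = 0;
    int i;

    for (i = 0; i < 4; i++)
    {
        uint16_t la = (a >> (i * 16)) & 0xffff;
        uint16_t lb = (b >> (i * 16)) & 0xffff;

        r |= (uint64_t)(uint16_t)(la + lb) << (i * 16);
    }
    return r;
}
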
diff --git a/pixman/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman/pixman-arm-neon-asm-bilinear.S
index f7913adb7..e37b5c298 100644
--- a/pixman/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman/pixman-arm-neon-asm-bilinear.S
@@ -64,6 +64,7 @@
.altmacro
.p2align 2
+#include "pixman-private.h"
#include "pixman-arm-neon-asm.h"
/*
@@ -488,12 +489,12 @@ fname:
vmull.u8 q1, d0, d28
vmlal.u8 q1, d1, d29
/* 5 cycles bubble */
- vshll.u16 q0, d2, #8
+ vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q0, d2, d30
vmlal.u16 q0, d3, d30
/* 5 cycles bubble */
bilinear_duplicate_mask mask_fmt, 1, d4
- vshrn.u32 d0, q0, #16
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
/* 3 cycles bubble */
vmovn.u16 d0, q0
/* 1 cycle bubble */
@@ -514,16 +515,16 @@ fname:
q1, q11, d0, d1, d20, d21, d22, d23
bilinear_load_mask mask_fmt, 2, d4
bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
- vshll.u16 q0, d2, #8
+ vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q0, d2, d30
vmlal.u16 q0, d3, d30
- vshll.u16 q10, d22, #8
+ vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q10, d22, d31
vmlal.u16 q10, d23, d31
- vshrn.u32 d0, q0, #16
- vshrn.u32 d1, q10, #16
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
bilinear_duplicate_mask mask_fmt, 2, d4
- vshr.u16 q15, q12, #8
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
vmovn.u16 d0, q0
bilinear_interleave_src_dst \
@@ -544,29 +545,29 @@ fname:
q3, q9, d4, d5, d16, d17, d18, d19
pld [TMP1, PF_OFFS]
sub TMP1, TMP1, STRIDE
- vshll.u16 q0, d2, #8
+ vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q0, d2, d30
vmlal.u16 q0, d3, d30
- vshll.u16 q10, d22, #8
+ vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q10, d22, d31
vmlal.u16 q10, d23, d31
- vshr.u16 q15, q12, #8
- vshll.u16 q2, d6, #8
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q2, d6, d30
vmlal.u16 q2, d7, d30
- vshll.u16 q8, d18, #8
+ vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
bilinear_load_mask mask_fmt, 4, d22
bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
pld [TMP1, PF_OFFS]
vmlsl.u16 q8, d18, d31
vmlal.u16 q8, d19, d31
vadd.u16 q12, q12, q13
- vshrn.u32 d0, q0, #16
- vshrn.u32 d1, q10, #16
- vshrn.u32 d4, q2, #16
- vshrn.u32 d5, q8, #16
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
bilinear_duplicate_mask mask_fmt, 4, d22
- vshr.u16 q15, q12, #8
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vmovn.u16 d0, q0
vmovn.u16 d1, q2
vadd.u16 q12, q12, q13
@@ -694,13 +695,13 @@ pixman_asm_function fname
blt 0f
tst OUT, #(1 << dst_bpp_shift)
beq 0f
- vshr.u16 q15, q12, #8
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
bilinear_process_last_pixel
sub WIDTH, WIDTH, #1
0:
vadd.u16 q13, q13, q13
- vshr.u16 q15, q12, #8
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
cmp WIDTH, #2
@@ -921,7 +922,7 @@ pixman_asm_function fname
vmull.u8 q10, d22, d28
vmlal.u8 q10, d23, d29
- vshll.u16 q0, d16, #8
+ vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q0, d16, d30
vmlal.u16 q0, d17, d30
@@ -932,27 +933,27 @@ pixman_asm_function fname
vmull.u8 q11, d16, d28
vmlal.u8 q11, d17, d29
- vshll.u16 q1, d18, #8
+ vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q1, d18, d31
vmlal.u16 q1, d19, d31
- vshr.u16 q15, q12, #8
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
.endm
.macro bilinear_over_8888_8888_process_pixblock_tail
- vshll.u16 q2, d20, #8
+ vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q2, d20, d30
vmlal.u16 q2, d21, d30
- vshll.u16 q3, d22, #8
+ vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q3, d22, d31
vmlal.u16 q3, d23, d31
- vshrn.u32 d0, q0, #16
- vshrn.u32 d1, q1, #16
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
vld1.32 {d2, d3}, [OUT, :128]
pld [OUT, #(prefetch_offset * 4)]
- vshrn.u32 d4, q2, #16
- vshr.u16 q15, q12, #8
- vshrn.u32 d5, q3, #16
+ vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
vmovn.u16 d6, q0
vmovn.u16 d7, q2
vuzp.8 d6, d7
@@ -975,7 +976,7 @@ pixman_asm_function fname
.endm
.macro bilinear_over_8888_8888_process_pixblock_tail_head
- vshll.u16 q2, d20, #8
+ vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
mov TMP1, X, asr #16
add X, X, UX
add TMP1, TOP, TMP1, asl #2
@@ -984,21 +985,21 @@ pixman_asm_function fname
add X, X, UX
add TMP2, TOP, TMP2, asl #2
vmlal.u16 q2, d21, d30
- vshll.u16 q3, d22, #8
+ vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
vld1.32 {d20}, [TMP1], STRIDE
vmlsl.u16 q3, d22, d31
vmlal.u16 q3, d23, d31
vld1.32 {d21}, [TMP1]
vmull.u8 q8, d20, d28
vmlal.u8 q8, d21, d29
- vshrn.u32 d0, q0, #16
- vshrn.u32 d1, q1, #16
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
vld1.32 {d2, d3}, [OUT, :128]
pld [OUT, PF_OFFS]
- vshrn.u32 d4, q2, #16
- vshr.u16 q15, q12, #8
+ vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vld1.32 {d22}, [TMP2], STRIDE
- vshrn.u32 d5, q3, #16
+ vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
vmovn.u16 d6, q0
vld1.32 {d23}, [TMP2]
vmull.u8 q9, d22, d28
@@ -1022,7 +1023,7 @@ pixman_asm_function fname
vmlal.u8 q10, d23, d29
vmull.u8 q11, d2, d4
vmull.u8 q2, d3, d4
- vshll.u16 q0, d16, #8
+ vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q0, d16, d30
vrshr.u16 q1, q11, #8
vmlal.u16 q0, d17, d30
@@ -1037,12 +1038,12 @@ pixman_asm_function fname
vmull.u8 q11, d16, d28
vmlal.u8 q11, d17, d29
vuzp.8 d6, d7
- vshll.u16 q1, d18, #8
+ vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
vuzp.8 d6, d7
vmlsl.u16 q1, d18, d31
vadd.u16 q12, q12, q13
vmlal.u16 q1, d19, d31
- vshr.u16 q15, q12, #8
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
vst1.32 {d6, d7}, [OUT, :128]!
.endm
@@ -1081,14 +1082,14 @@ pixman_asm_function fname
vmull.u8 q3, d2, d28
vmlal.u8 q2, d1, d29
vmlal.u8 q3, d3, d29
- vshll.u16 q0, d4, #8
- vshll.u16 q1, d6, #8
+ vshll.u16 q0, d4, #BILINEAR_INTERPOLATION_BITS
+ vshll.u16 q1, d6, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q0, d4, d30
vmlsl.u16 q1, d6, d31
vmlal.u16 q0, d5, d30
vmlal.u16 q1, d7, d31
- vshrn.u32 d0, q0, #16
- vshrn.u32 d1, q1, #16
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
vld1.32 {d2}, [TMP3], STRIDE
vld1.32 {d3}, [TMP3]
pld [TMP4, PF_OFFS]
@@ -1099,7 +1100,7 @@ pixman_asm_function fname
vmlal.u8 q3, d3, d29
vmull.u8 q1, d4, d28
vmlal.u8 q1, d5, d29
- vshr.u16 q15, q12, #8
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vld1.32 {d22[0]}, [MASK]!
pld [MASK, #prefetch_offset]
vadd.u16 q12, q12, q13
@@ -1107,17 +1108,17 @@ pixman_asm_function fname
.endm
.macro bilinear_over_8888_8_8888_process_pixblock_tail
- vshll.u16 q9, d6, #8
- vshll.u16 q10, d2, #8
+ vshll.u16 q9, d6, #BILINEAR_INTERPOLATION_BITS
+ vshll.u16 q10, d2, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q9, d6, d30
vmlsl.u16 q10, d2, d31
vmlal.u16 q9, d7, d30
vmlal.u16 q10, d3, d31
- vshr.u16 q15, q12, #8
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
vdup.32 d22, d22[0]
- vshrn.u32 d18, q9, #16
- vshrn.u32 d19, q10, #16
+ vshrn.u32 d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
vmovn.u16 d17, q9
vld1.32 {d18, d19}, [OUT, :128]
pld [OUT, PF_OFFS]
@@ -1146,11 +1147,11 @@ pixman_asm_function fname
.endm
.macro bilinear_over_8888_8_8888_process_pixblock_tail_head
- vshll.u16 q9, d6, #8
+ vshll.u16 q9, d6, #BILINEAR_INTERPOLATION_BITS
mov TMP1, X, asr #16
add X, X, UX
add TMP1, TOP, TMP1, asl #2
- vshll.u16 q10, d2, #8
+ vshll.u16 q10, d2, #BILINEAR_INTERPOLATION_BITS
vld1.32 {d0}, [TMP1], STRIDE
mov TMP2, X, asr #16
add X, X, UX
@@ -1167,12 +1168,12 @@ pixman_asm_function fname
mov TMP4, X, asr #16
add X, X, UX
add TMP4, TOP, TMP4, asl #2
- vshr.u16 q15, q12, #8
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
vld1.32 {d3}, [TMP2]
vdup.32 d22, d22[0]
- vshrn.u32 d18, q9, #16
- vshrn.u32 d19, q10, #16
+ vshrn.u32 d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
vmull.u8 q2, d0, d28
vmull.u8 q3, d2, d28
vmovn.u16 d17, q9
@@ -1182,8 +1183,8 @@ pixman_asm_function fname
vmlal.u8 q3, d3, d29
vuzp.8 d16, d17
vuzp.8 d18, d19
- vshll.u16 q0, d4, #8
- vshll.u16 q1, d6, #8
+ vshll.u16 q0, d4, #BILINEAR_INTERPOLATION_BITS
+ vshll.u16 q1, d6, #BILINEAR_INTERPOLATION_BITS
vuzp.8 d16, d17
vuzp.8 d18, d19
vmlsl.u16 q0, d4, d30
@@ -1194,8 +1195,8 @@ pixman_asm_function fname
vmlal.u16 q1, d7, d31
vrsra.u16 q10, q10, #8
vrsra.u16 q11, q11, #8
- vshrn.u32 d0, q0, #16
- vshrn.u32 d1, q1, #16
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
vrshrn.u16 d16, q10, #8
vrshrn.u16 d17, q11, #8
vld1.32 {d2}, [TMP3], STRIDE
@@ -1216,7 +1217,7 @@ pixman_asm_function fname
vraddhn.u16 d18, q9, q10
vraddhn.u16 d19, q15, q11
vmlal.u8 q1, d5, d29
- vshr.u16 q15, q12, #8
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vqadd.u8 q9, q8, q9
vld1.32 {d22[0]}, [MASK]!
vuzp.8 d18, d19
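
Reading the modified NEON sequences: the vertical pass (vmull.u8/vmlal.u8 with the wt/wb weights) scales each channel by 2^BILINEAR_INTERPOLATION_BITS, and the horizontal pass (vshll/vmlsl/vmlal with the distx weight) scales it by the same factor again, which is why the final narrowing shift becomes #(2 * BILINEAR_INTERPOLATION_BITS) and the weight extraction becomes #(16 - BILINEAR_INTERPOLATION_BITS). In scalar terms, per channel (a sketch, not part of the patch):

/* With RANGE = 1 << BILINEAR_INTERPOLATION_BITS, vertical weights wt + wb
 * normally summing to RANGE, and d = horizontal weight in 0..RANGE-1:
 *   v0  = top0 * wt + bot0 * wb;                     scaled by RANGE
 *   v1  = top1 * wt + bot1 * wb;                     scaled by RANGE
 *   out = (v0 * (RANGE - d) + v1 * d) >> (2 * BILINEAR_INTERPOLATION_BITS);
 */
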
diff --git a/pixman/pixman/pixman-arm-neon-asm.S b/pixman/pixman/pixman-arm-neon-asm.S
index 87aae1d55..187197dc3 100644
--- a/pixman/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman/pixman-arm-neon-asm.S
@@ -49,6 +49,7 @@
.altmacro
.p2align 2
+#include "pixman-private.h"
#include "pixman-arm-neon-asm.h"
/* Global configuration options and preferences */
@@ -2986,11 +2987,11 @@ fname:
vmull.u8 q1, d0, d28
vmlal.u8 q1, d1, d29
/* 5 cycles bubble */
- vshll.u16 q0, d2, #8
+ vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q0, d2, d30
vmlal.u16 q0, d3, d30
/* 5 cycles bubble */
- vshrn.u32 d0, q0, #16
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
/* 3 cycles bubble */
vmovn.u16 d0, q0
/* 1 cycle bubble */
@@ -3000,15 +3001,15 @@ fname:
.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
bilinear_load_and_vertical_interpolate_two_&src_fmt \
q1, q11, d0, d1, d20, d21, d22, d23
- vshll.u16 q0, d2, #8
+ vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q0, d2, d30
vmlal.u16 q0, d3, d30
- vshll.u16 q10, d22, #8
+ vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q10, d22, d31
vmlal.u16 q10, d23, d31
- vshrn.u32 d0, q0, #16
- vshrn.u32 d1, q10, #16
- vshr.u16 q15, q12, #8
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
vmovn.u16 d0, q0
bilinear_store_&dst_fmt 2, q2, q3
@@ -3020,26 +3021,26 @@ fname:
q3, q9, d4, d5, d16, d17, d18, d19
pld [TMP1, PF_OFFS]
sub TMP1, TMP1, STRIDE
- vshll.u16 q0, d2, #8
+ vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q0, d2, d30
vmlal.u16 q0, d3, d30
- vshll.u16 q10, d22, #8
+ vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q10, d22, d31
vmlal.u16 q10, d23, d31
- vshr.u16 q15, q12, #8
- vshll.u16 q2, d6, #8
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q2, d6, d30
vmlal.u16 q2, d7, d30
- vshll.u16 q8, d18, #8
+ vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
pld [TMP2, PF_OFFS]
vmlsl.u16 q8, d18, d31
vmlal.u16 q8, d19, d31
vadd.u16 q12, q12, q13
- vshrn.u32 d0, q0, #16
- vshrn.u32 d1, q10, #16
- vshrn.u32 d4, q2, #16
- vshrn.u32 d5, q8, #16
- vshr.u16 q15, q12, #8
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vmovn.u16 d0, q0
vmovn.u16 d1, q2
vadd.u16 q12, q12, q13
@@ -3158,13 +3159,13 @@ pixman_asm_function fname
blt 0f
tst OUT, #(1 << dst_bpp_shift)
beq 0f
- vshr.u16 q15, q12, #8
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
bilinear_interpolate_last_pixel src_fmt, dst_fmt
sub WIDTH, WIDTH, #1
0:
vadd.u16 q13, q13, q13
- vshr.u16 q15, q12, #8
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
cmp WIDTH, #2
@@ -3282,7 +3283,7 @@ pixman_asm_function fname
vmull.u8 q10, d22, d28
vmlal.u8 q10, d23, d29
- vshll.u16 q0, d16, #8
+ vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q0, d16, d30
vmlal.u16 q0, d17, d30
@@ -3293,25 +3294,25 @@ pixman_asm_function fname
vmull.u8 q11, d16, d28
vmlal.u8 q11, d17, d29
- vshll.u16 q1, d18, #8
+ vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q1, d18, d31
.endm
.macro bilinear_interpolate_four_pixels_8888_8888_tail
vmlal.u16 q1, d19, d31
- vshr.u16 q15, q12, #8
- vshll.u16 q2, d20, #8
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q2, d20, d30
vmlal.u16 q2, d21, d30
- vshll.u16 q3, d22, #8
+ vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q3, d22, d31
vmlal.u16 q3, d23, d31
vadd.u16 q12, q12, q13
- vshrn.u32 d0, q0, #16
- vshrn.u32 d1, q1, #16
- vshrn.u32 d4, q2, #16
- vshr.u16 q15, q12, #8
- vshrn.u32 d5, q3, #16
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
vmovn.u16 d6, q0
vmovn.u16 d7, q2
vadd.u16 q12, q12, q13
@@ -3326,22 +3327,22 @@ pixman_asm_function fname
add X, X, UX
add TMP2, TOP, TMP2, asl #2
vmlal.u16 q1, d19, d31
- vshr.u16 q15, q12, #8
- vshll.u16 q2, d20, #8
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q2, d20, d30
vmlal.u16 q2, d21, d30
- vshll.u16 q3, d22, #8
+ vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
vld1.32 {d20}, [TMP1], STRIDE
vmlsl.u16 q3, d22, d31
vmlal.u16 q3, d23, d31
vld1.32 {d21}, [TMP1]
vmull.u8 q8, d20, d28
vmlal.u8 q8, d21, d29
- vshrn.u32 d0, q0, #16
- vshrn.u32 d1, q1, #16
- vshrn.u32 d4, q2, #16
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
vld1.32 {d22}, [TMP2], STRIDE
- vshrn.u32 d5, q3, #16
+ vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
vld1.32 {d23}, [TMP2]
vmull.u8 q9, d22, d28
@@ -3353,12 +3354,12 @@ pixman_asm_function fname
add TMP4, TOP, TMP4, asl #2
vmlal.u8 q9, d23, d29
vld1.32 {d22}, [TMP3], STRIDE
- vshr.u16 q15, q12, #8
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vld1.32 {d23}, [TMP3]
vmull.u8 q10, d22, d28
vmlal.u8 q10, d23, d29
vmovn.u16 d6, q0
- vshll.u16 q0, d16, #8
+ vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
vmovn.u16 d7, q2
vmlsl.u16 q0, d16, d30
vmlal.u16 q0, d17, d30
@@ -3370,7 +3371,7 @@ pixman_asm_function fname
vmull.u8 q11, d16, d28
vmlal.u8 q11, d17, d29
vst1.32 {d6, d7}, [OUT, :128]!
- vshll.u16 q1, d18, #8
+ vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q1, d18, d31
.endm
@@ -3403,7 +3404,7 @@ pixman_asm_function fname
vld1.32 {d23}, [TMP3]
vmull.u8 q10, d22, d28
vmlal.u8 q10, d23, d29
- vshll.u16 q0, d16, #8
+ vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q0, d16, d30
vmlal.u16 q0, d17, d30
pld [TMP4, PF_OFFS]
@@ -3412,7 +3413,7 @@ pixman_asm_function fname
pld [TMP4, PF_OFFS]
vmull.u8 q11, d16, d28
vmlal.u8 q11, d17, d29
- vshll.u16 q1, d18, #8
+ vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q1, d18, d31
mov TMP1, X, asr #16
@@ -3422,22 +3423,22 @@ pixman_asm_function fname
add X, X, UX
add TMP2, TOP, TMP2, asl #2
vmlal.u16 q1, d19, d31
- vshr.u16 q15, q12, #8
- vshll.u16 q2, d20, #8
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q2, d20, d30
vmlal.u16 q2, d21, d30
- vshll.u16 q3, d22, #8
+ vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
vld1.32 {d20}, [TMP1], STRIDE
vmlsl.u16 q3, d22, d31
vmlal.u16 q3, d23, d31
vld1.32 {d21}, [TMP1]
vmull.u8 q8, d20, d28
vmlal.u8 q8, d21, d29
- vshrn.u32 d0, q0, #16
- vshrn.u32 d1, q1, #16
- vshrn.u32 d4, q2, #16
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
vld1.32 {d22}, [TMP2], STRIDE
- vshrn.u32 d5, q3, #16
+ vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
vld1.32 {d23}, [TMP2]
vmull.u8 q9, d22, d28
@@ -3449,12 +3450,12 @@ pixman_asm_function fname
add TMP4, TOP, TMP4, asl #2
vmlal.u8 q9, d23, d29
vld1.32 {d22}, [TMP3], STRIDE
- vshr.u16 q15, q12, #8
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vld1.32 {d23}, [TMP3]
vmull.u8 q10, d22, d28
vmlal.u8 q10, d23, d29
vmovn.u16 d8, q0
- vshll.u16 q0, d16, #8
+ vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
vmovn.u16 d9, q2
vmlsl.u16 q0, d16, d30
vmlal.u16 q0, d17, d30
@@ -3465,25 +3466,25 @@ pixman_asm_function fname
pld [TMP4, PF_OFFS]
vmull.u8 q11, d16, d28
vmlal.u8 q11, d17, d29
- vshll.u16 q1, d18, #8
+ vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q1, d18, d31
.endm
.macro bilinear_interpolate_eight_pixels_8888_0565_tail
vmlal.u16 q1, d19, d31
- vshr.u16 q15, q12, #8
- vshll.u16 q2, d20, #8
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q2, d20, d30
vmlal.u16 q2, d21, d30
- vshll.u16 q3, d22, #8
+ vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q3, d22, d31
vmlal.u16 q3, d23, d31
vadd.u16 q12, q12, q13
- vshrn.u32 d0, q0, #16
- vshrn.u32 d1, q1, #16
- vshrn.u32 d4, q2, #16
- vshr.u16 q15, q12, #8
- vshrn.u32 d5, q3, #16
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
vmovn.u16 d10, q0
vmovn.u16 d11, q2
vadd.u16 q12, q12, q13
@@ -3508,23 +3509,23 @@ pixman_asm_function fname
add X, X, UX
add TMP2, TOP, TMP2, asl #2
vmlal.u16 q1, d19, d31
- vshr.u16 q15, q12, #8
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vuzp.u8 d8, d9
- vshll.u16 q2, d20, #8
+ vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q2, d20, d30
vmlal.u16 q2, d21, d30
- vshll.u16 q3, d22, #8
+ vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
vld1.32 {d20}, [TMP1], STRIDE
vmlsl.u16 q3, d22, d31
vmlal.u16 q3, d23, d31
vld1.32 {d21}, [TMP1]
vmull.u8 q8, d20, d28
vmlal.u8 q8, d21, d29
- vshrn.u32 d0, q0, #16
- vshrn.u32 d1, q1, #16
- vshrn.u32 d4, q2, #16
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
vld1.32 {d22}, [TMP2], STRIDE
- vshrn.u32 d5, q3, #16
+ vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
vld1.32 {d23}, [TMP2]
vmull.u8 q9, d22, d28
@@ -3536,12 +3537,12 @@ pixman_asm_function fname
add TMP4, TOP, TMP4, asl #2
vmlal.u8 q9, d23, d29
vld1.32 {d22}, [TMP3], STRIDE
- vshr.u16 q15, q12, #8
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vld1.32 {d23}, [TMP3]
vmull.u8 q10, d22, d28
vmlal.u8 q10, d23, d29
vmovn.u16 d10, q0
- vshll.u16 q0, d16, #8
+ vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
vmovn.u16 d11, q2
vmlsl.u16 q0, d16, d30
vmlal.u16 q0, d17, d30
@@ -3553,7 +3554,7 @@ pixman_asm_function fname
vmull.u8 q11, d16, d28
vmlal.u8 q11, d17, d29
vuzp.u8 d10, d11
- vshll.u16 q1, d18, #8
+ vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q1, d18, d31
mov TMP1, X, asr #16
@@ -3564,12 +3565,12 @@ pixman_asm_function fname
add TMP2, TOP, TMP2, asl #2
vmlal.u16 q1, d19, d31
vuzp.u8 d9, d11
- vshr.u16 q15, q12, #8
- vshll.u16 q2, d20, #8
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
vuzp.u8 d8, d10
vmlsl.u16 q2, d20, d30
vmlal.u16 q2, d21, d30
- vshll.u16 q3, d22, #8
+ vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
vld1.32 {d20}, [TMP1], STRIDE
vmlsl.u16 q3, d22, d31
vmlal.u16 q3, d23, d31
@@ -3579,13 +3580,13 @@ pixman_asm_function fname
vshll.u8 q6, d9, #8
vshll.u8 q5, d10, #8
vshll.u8 q7, d8, #8
- vshrn.u32 d0, q0, #16
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
vsri.u16 q5, q6, #5
- vshrn.u32 d1, q1, #16
+ vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
vsri.u16 q5, q7, #11
- vshrn.u32 d4, q2, #16
+ vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
vld1.32 {d22}, [TMP2], STRIDE
- vshrn.u32 d5, q3, #16
+ vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
vld1.32 {d23}, [TMP2]
vmull.u8 q9, d22, d28
@@ -3597,12 +3598,12 @@ pixman_asm_function fname
add TMP4, TOP, TMP4, asl #2
vmlal.u8 q9, d23, d29
vld1.32 {d22}, [TMP3], STRIDE
- vshr.u16 q15, q12, #8
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vld1.32 {d23}, [TMP3]
vmull.u8 q10, d22, d28
vmlal.u8 q10, d23, d29
vmovn.u16 d8, q0
- vshll.u16 q0, d16, #8
+ vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
vmovn.u16 d9, q2
vmlsl.u16 q0, d16, d30
vmlal.u16 q0, d17, d30
@@ -3613,7 +3614,7 @@ pixman_asm_function fname
pld [TMP4, PF_OFFS]
vmull.u8 q11, d16, d28
vmlal.u8 q11, d17, d29
- vshll.u16 q1, d18, #8
+ vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
vst1.32 {d10, d11}, [OUT, :128]!
vmlsl.u16 q1, d18, d31
.endm
diff --git a/pixman/pixman/pixman-bits-image.c b/pixman/pixman/pixman-bits-image.c
index 05eab9634..b6c8630f4 100644
--- a/pixman/pixman/pixman-bits-image.c
+++ b/pixman/pixman/pixman-bits-image.c
@@ -131,8 +131,8 @@ bits_image_fetch_pixel_bilinear (bits_image_t *image,
x1 = x - pixman_fixed_1 / 2;
y1 = y - pixman_fixed_1 / 2;
- distx = (x1 >> 8) & 0xff;
- disty = (y1 >> 8) & 0xff;
+ distx = pixman_fixed_to_bilinear_weight (x1);
+ disty = pixman_fixed_to_bilinear_weight (y1);
x1 = pixman_fixed_to_int (x1);
y1 = pixman_fixed_to_int (y1);
@@ -200,7 +200,7 @@ bits_image_fetch_bilinear_no_repeat_8888 (pixman_iter_t *iter,
x = x_top = x_bottom = v.vector[0] - pixman_fixed_1/2;
y = v.vector[1] - pixman_fixed_1/2;
- disty = (y >> 8) & 0xff;
+ disty = pixman_fixed_to_bilinear_weight (y);
/* Load the pointers to the first and second lines from the source
* image that bilinear code must read.
@@ -309,7 +309,7 @@ bits_image_fetch_bilinear_no_repeat_8888 (pixman_iter_t *iter,
tr = top_row[pixman_fixed_to_int (x_top) + 1] | top_mask;
br = bottom_row[pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
- distx = (x >> 8) & 0xff;
+ distx = pixman_fixed_to_bilinear_weight (x);
*buffer++ = bilinear_interpolation (0, tr, 0, br, distx, disty);
@@ -334,7 +334,7 @@ bits_image_fetch_bilinear_no_repeat_8888 (pixman_iter_t *iter,
bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
br = bottom_row [pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
- distx = (x >> 8) & 0xff;
+ distx = pixman_fixed_to_bilinear_weight (x);
*buffer = bilinear_interpolation (tl, tr, bl, br, distx, disty);
}
@@ -358,7 +358,7 @@ bits_image_fetch_bilinear_no_repeat_8888 (pixman_iter_t *iter,
tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
- distx = (x >> 8) & 0xff;
+ distx = pixman_fixed_to_bilinear_weight (x);
*buffer = bilinear_interpolation (tl, 0, bl, 0, distx, disty);
}
@@ -695,8 +695,8 @@ bits_image_fetch_bilinear_affine (pixman_image_t * image,
x1 = x - pixman_fixed_1 / 2;
y1 = y - pixman_fixed_1 / 2;
- distx = (x1 >> 8) & 0xff;
- disty = (y1 >> 8) & 0xff;
+ distx = pixman_fixed_to_bilinear_weight (x1);
+ disty = pixman_fixed_to_bilinear_weight (y1);
y1 = pixman_fixed_to_int (y1);
y2 = y1 + 1;
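
A quick worked example of the new weight helper, assuming the BILINEAR_INTERPOLATION_BITS value of 7 that this patch sets in pixman-private.h (illustrative, not part of the patch):

/* pixman_fixed_t is 16.16 fixed point, so with BILINEAR_INTERPOLATION_BITS = 7:
 *   x  = 0x14000                                    (1.25)
 *   x1 = x - pixman_fixed_1 / 2 = 0xC000            (0.75, pixel-center adjust)
 *   distx = pixman_fixed_to_bilinear_weight (x1)
 *         = (0xC000 >> 9) & 0x7f = 96               (0.75 * 128)
 * i.e. the old "(x1 >> 8) & 0xff" weight in 0..255 becomes a weight in 0..127. */
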
diff --git a/pixman/pixman/pixman-inlines.h b/pixman/pixman/pixman-inlines.h
index 3532867a4..5517de5a5 100644
--- a/pixman/pixman/pixman-inlines.h
+++ b/pixman/pixman/pixman-inlines.h
@@ -81,6 +81,13 @@ repeat (pixman_repeat_t repeat, int *c, int size)
return TRUE;
}
+static force_inline int
+pixman_fixed_to_bilinear_weight (pixman_fixed_t x)
+{
+ return (x >> (16 - BILINEAR_INTERPOLATION_BITS)) &
+ ((1 << BILINEAR_INTERPOLATION_BITS) - 1);
+}
+
#if SIZEOF_LONG > 4
static force_inline uint32_t
@@ -92,6 +99,9 @@ bilinear_interpolation (uint32_t tl, uint32_t tr,
uint64_t tl64, tr64, bl64, br64;
uint64_t f, r;
+ distx <<= (8 - BILINEAR_INTERPOLATION_BITS);
+ disty <<= (8 - BILINEAR_INTERPOLATION_BITS);
+
distxy = distx * disty;
distxiy = distx * (256 - disty);
distixy = (256 - distx) * disty;
@@ -135,6 +145,9 @@ bilinear_interpolation (uint32_t tl, uint32_t tr,
int distxy, distxiy, distixy, distixiy;
uint32_t f, r;
+ distx <<= (8 - BILINEAR_INTERPOLATION_BITS);
+ disty <<= (8 - BILINEAR_INTERPOLATION_BITS);
+
distxy = distx * disty;
distxiy = (distx << 8) - distxy; /* distx * (256 - disty) */
distixy = (disty << 8) - distxy; /* disty * (256 - distx) */
@@ -758,12 +771,14 @@ bilinear_pad_repeat_get_scanline_bounds (int32_t source_image_width,
* all source pixels are fetched from zero padding
* zone for NONE repeat
*
- * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to 256,
- * but sometimes it may be less than that for NONE repeat when handling
- * fuzzy antialiased top or bottom image edges. Also both top and
- * bottom weight variables are guaranteed to have value in 0-255
- * range and can fit into unsigned byte or be used with 8-bit SIMD
- * multiplication instructions.
+ * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to
+ * BILINEAR_INTERPOLATION_RANGE, but sometimes it may be less than that
+ * for NONE repeat when handling fuzzy antialiased top or bottom image
+ * edges. Also both top and bottom weight variables are guaranteed to
+ * have value, which is less than BILINEAR_INTERPOLATION_RANGE.
+ * For example, the weights can fit into unsigned byte or be used
+ * with 8-bit SIMD multiplication instructions for 8-bit interpolation
+ * precision.
*/
#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t, \
dst_type_t, repeat_mode, flags) \
@@ -877,18 +892,18 @@ fast_composite_scaled_bilinear ## scale_func_name (pixman_implementation_t *imp,
} \
\
y1 = pixman_fixed_to_int (vy); \
- weight2 = (vy >> 8) & 0xff; \
+ weight2 = pixman_fixed_to_bilinear_weight (vy); \
if (weight2) \
{ \
- /* normal case, both row weights are in 0-255 range and fit unsigned byte */ \
+ /* both weight1 and weight2 are smaller than BILINEAR_INTERPOLATION_RANGE */ \
y2 = y1 + 1; \
- weight1 = 256 - weight2; \
+ weight1 = BILINEAR_INTERPOLATION_RANGE - weight2; \
} \
else \
{ \
- /* set both top and bottom row to the same scanline, and weights to 128+128 */ \
+ /* set both top and bottom row to the same scanline and tweak weights */ \
y2 = y1; \
- weight1 = weight2 = 128; \
+ weight1 = weight2 = BILINEAR_INTERPOLATION_RANGE / 2; \
} \
vy += unit_y; \
if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD) \
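
For clarity, a minimal single-channel sketch of the C fallback path, showing why bilinear_interpolation only needs the two new "<<= (8 - BILINEAR_INTERPOLATION_BITS)" lines: the incoming weights are rescaled back to the 0..256 domain that the existing arithmetic already uses (illustrative only; bilinear_channel is a hypothetical name):

#include <stdint.h>

#define BILINEAR_INTERPOLATION_BITS 7  /* matches the default added in pixman-private.h */

/* Single-channel bilinear filter: distx/disty arrive as 0..(2^BITS - 1)
 * weights (see pixman_fixed_to_bilinear_weight above) and are rescaled to
 * 0..255 so that the existing "256 - w" arithmetic still applies. */
static uint32_t
bilinear_channel (uint32_t tl, uint32_t tr, uint32_t bl, uint32_t br,
                  int distx, int disty)
{
    distx <<= (8 - BILINEAR_INTERPOLATION_BITS);
    disty <<= (8 - BILINEAR_INTERPOLATION_BITS);

    return (tl * (256 - distx) * (256 - disty) +
            tr * distx         * (256 - disty) +
            bl * (256 - distx) * disty         +
            br * distx         * disty) >> 16;
}
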
diff --git a/pixman/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman/pixman-mips-dspr2-asm.S
index 87558f032..48f108ed9 100644
--- a/pixman/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman/pixman-mips-dspr2-asm.S
@@ -29,6 +29,7 @@
* Author: Nemanja Lukic (nlukic@mips.com)
*/
+#include "pixman-private.h"
#include "pixman-mips-dspr2-asm.h"
LEAF_MIPS_DSPR2(pixman_fill_buff16_mips)
@@ -771,11 +772,15 @@ LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_mips)
lw s1, 48(sp) /* s1 = wb */
lw s2, 52(sp) /* s2 = vx */
lw s3, 56(sp) /* s3 = unit_x */
- li v0, 256
+ li v0, BILINEAR_INTERPOLATION_RANGE
li s8, 0x00ff00ff
+
+ sll s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
+ sll s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
+
0:
andi t4, s2, 0xffff /* t4 = (short)vx */
- srl t4, t4, 8 /* t4 = vx >> 8 */
+ srl t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */
subu t5, v0, t4 /* t5 = ( 256 - (vx>>8)) */
mul s4, s0, t5 /* s4 = wt*(256-(vx>>8)) */
diff --git a/pixman/pixman/pixman-mmx.c b/pixman/pixman/pixman-mmx.c
index 7c1f4fe24..b3ef2495b 100644
--- a/pixman/pixman/pixman-mmx.c
+++ b/pixman/pixman/pixman-mmx.c
@@ -42,6 +42,7 @@
#endif
#include "pixman-private.h"
#include "pixman-combine32.h"
+#include "pixman-inlines.h"
#define no_vERBOSE
@@ -718,6 +719,24 @@ combine (const uint32_t *src, const uint32_t *mask)
return vsrc;
}
+static force_inline __m64
+core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
+{
+ vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());
+
+ if (is_opaque (vsrc))
+ {
+ return vsrc;
+ }
+ else if (!is_zero (vsrc))
+ {
+ return over (vsrc, expand_alpha (vsrc),
+ _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
+ }
+
+ return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
+}
+
static void
mmx_combine_over_u (pixman_implementation_t *imp,
pixman_op_t op,
@@ -1623,9 +1642,7 @@ mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
- mask &= 0xff000000;
- mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
- vmask = load8888 (&mask);
+ vmask = expand_alpha (load8888 (&mask));
while (height--)
{
@@ -1694,9 +1711,7 @@ mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
- mask &= 0xff000000;
- mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
- vmask = load8888 (&mask);
+ vmask = expand_alpha (load8888 (&mask));
srca = MC (4x00ff);
while (height--)
@@ -3532,6 +3547,242 @@ mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
_mm_empty ();
}
+#define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
+#define BMSK (BSHIFT - 1)
+
+#define BILINEAR_DECLARE_VARIABLES \
+ const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt); \
+ const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb); \
+ const __m64 mm_BSHIFT = _mm_set_pi16 (BSHIFT, BSHIFT, BSHIFT, BSHIFT); \
+ const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1); \
+ const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK); \
+ const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x); \
+ const __m64 mm_zero = _mm_setzero_si64 (); \
+ __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)
+
+#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \
+do { \
+ /* fetch 2x2 pixel block into 2 mmx registers */ \
+ __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]); \
+ __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]); \
+ vx += unit_x; \
+ /* vertical interpolation */ \
+ __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt); \
+ __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt); \
+ __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb); \
+ __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb); \
+ __m64 hi = _mm_add_pi16 (t_hi, b_hi); \
+ __m64 lo = _mm_add_pi16 (t_lo, b_lo); \
+ if (BILINEAR_INTERPOLATION_BITS < 8) \
+ { \
+ /* calculate horizontal weights */ \
+ __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7, \
+ _mm_srli_pi16 (mm_x, \
+ 16 - BILINEAR_INTERPOLATION_BITS))); \
+ mm_x = _mm_add_pi16 (mm_x, mm_ux); \
+ /* horizontal interpolation */ \
+ __m64 p = _mm_unpacklo_pi16 (lo, hi); \
+ __m64 q = _mm_unpackhi_pi16 (lo, hi); \
+ lo = _mm_madd_pi16 (p, mm_wh); \
+ hi = _mm_madd_pi16 (q, mm_wh); \
+ } \
+ else \
+ { \
+ /* calculate horizontal weights */ \
+ __m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x, \
+ 16 - BILINEAR_INTERPOLATION_BITS)); \
+ __m64 mm_wh_hi = _mm_srli_pi16 (mm_x, \
+ 16 - BILINEAR_INTERPOLATION_BITS); \
+ mm_x = _mm_add_pi16 (mm_x, mm_ux); \
+ /* horizontal interpolation */ \
+ __m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo); \
+ __m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi); \
+ __m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo); \
+ __m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi); \
+ lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo), \
+ _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi)); \
+ hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo), \
+ _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi)); \
+ } \
+ /* shift and pack the result */ \
+ hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2); \
+ lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2); \
+ lo = _mm_packs_pi32 (lo, hi); \
+ lo = _mm_packs_pu16 (lo, lo); \
+ pix = lo; \
+} while (0)
+
+#define BILINEAR_SKIP_ONE_PIXEL() \
+do { \
+ vx += unit_x; \
+ mm_x = _mm_add_pi16 (mm_x, mm_ux); \
+} while(0)
+
+static force_inline void
+scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t * dst,
+ const uint32_t * mask,
+ const uint32_t * src_top,
+ const uint32_t * src_bottom,
+ int32_t w,
+ int wt,
+ int wb,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx,
+ pixman_bool_t zero_src)
+{
+ BILINEAR_DECLARE_VARIABLES;
+ __m64 pix;
+
+ while (w--)
+ {
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
+ store (dst, pix);
+ dst++;
+ }
+
+ _mm_empty ();
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
+ scaled_bilinear_scanline_mmx_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
+ scaled_bilinear_scanline_mmx_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
+ scaled_bilinear_scanline_mmx_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
+ scaled_bilinear_scanline_mmx_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ NORMAL, FLAG_NONE)
+
+static force_inline void
+scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t * dst,
+ const uint32_t * mask,
+ const uint32_t * src_top,
+ const uint32_t * src_bottom,
+ int32_t w,
+ int wt,
+ int wb,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx,
+ pixman_bool_t zero_src)
+{
+ BILINEAR_DECLARE_VARIABLES;
+ __m64 pix1, pix2;
+
+ while (w)
+ {
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+
+ if (!is_zero (pix1))
+ {
+ pix2 = load (dst);
+ store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
+ }
+
+ w--;
+ dst++;
+ }
+
+ _mm_empty ();
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
+ scaled_bilinear_scanline_mmx_8888_8888_OVER,
+ uint32_t, uint32_t, uint32_t,
+ COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
+ scaled_bilinear_scanline_mmx_8888_8888_OVER,
+ uint32_t, uint32_t, uint32_t,
+ PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
+ scaled_bilinear_scanline_mmx_8888_8888_OVER,
+ uint32_t, uint32_t, uint32_t,
+ NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
+ scaled_bilinear_scanline_mmx_8888_8888_OVER,
+ uint32_t, uint32_t, uint32_t,
+ NORMAL, FLAG_NONE)
+
+static force_inline void
+scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t * dst,
+ const uint8_t * mask,
+ const uint32_t * src_top,
+ const uint32_t * src_bottom,
+ int32_t w,
+ int wt,
+ int wb,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx,
+ pixman_bool_t zero_src)
+{
+ BILINEAR_DECLARE_VARIABLES;
+ __m64 pix1, pix2;
+ uint32_t m;
+
+ while (w)
+ {
+ m = (uint32_t) *mask++;
+
+ if (m)
+ {
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+
+ if (m == 0xff && is_opaque (pix1))
+ {
+ store (dst, pix1);
+ }
+ else
+ {
+ __m64 ms, md, ma, msa;
+
+ pix2 = load (dst);
+ ma = expand_alpha_rev (to_m64 (m));
+ ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
+ md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());
+
+ msa = expand_alpha (ms);
+
+ store8888 (dst, (in_over (ms, msa, ma, md)));
+ }
+ }
+ else
+ {
+ BILINEAR_SKIP_ONE_PIXEL ();
+ }
+
+ w--;
+ dst++;
+ }
+
+ _mm_empty ();
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
+ scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
+ uint32_t, uint8_t, uint32_t,
+ COVER, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
+ scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
+ uint32_t, uint8_t, uint32_t,
+ PAD, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
+ scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
+ uint32_t, uint8_t, uint32_t,
+ NONE, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
+ scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
+ uint32_t, uint8_t, uint32_t,
+ NORMAL, FLAG_HAVE_NON_SOLID_MASK)
+
static uint32_t *
mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
{
@@ -3787,6 +4038,23 @@ static const pixman_fast_path_t mmx_fast_paths[] =
PIXMAN_STD_FAST_PATH (IN, a8, null, a8, mmx_composite_in_8_8 ),
PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, mmx_composite_in_n_8_8 ),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
+
+ SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
+ SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
+ SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
+ SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ),
+
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888 ),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888 ),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888 ),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888 ),
+
{ PIXMAN_OP_NONE },
};
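
Reading of the BILINEAR_INTERPOLATION_BITS < 8 branch in BILINEAR_INTERPOLATE_ONE_PIXEL above (a sketch of the arithmetic, not additional patch content):

/* For a horizontal weight d in 0..BMSK, (d ^ BMSK) + 1 == BSHIFT - d, so
 *   mm_wh = mm_addc7 + (mm_xorc7 ^ (mm_x >> (16 - BILINEAR_INTERPOLATION_BITS)))
 * produces interleaved (BSHIFT - d, d) lane pairs, and a single _mm_madd_pi16
 * then computes  left * (BSHIFT - d) + right * d  per 32-bit lane, i.e. the
 * whole horizontal interpolation in one multiply-add instruction. */
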
diff --git a/pixman/pixman/pixman-private.h b/pixman/pixman/pixman-private.h
index 72e3b4f6d..0c27798b0 100644
--- a/pixman/pixman/pixman-private.h
+++ b/pixman/pixman/pixman-private.h
@@ -1,10 +1,24 @@
+#ifndef PIXMAN_PRIVATE_H
+#define PIXMAN_PRIVATE_H
+
+/*
+ * The defines which are shared between C and assembly code
+ */
+
+/* bilinear interpolation precision (must be <= 8) */
+#define BILINEAR_INTERPOLATION_BITS 7
+#define BILINEAR_INTERPOLATION_RANGE (1 << BILINEAR_INTERPOLATION_BITS)
+
+/*
+ * C specific part
+ */
+
+#ifndef __ASSEMBLER__
+
#ifndef PACKAGE
# error config.h must be included before pixman-private.h
#endif
-#ifndef PIXMAN_PRIVATE_H
-#define PIXMAN_PRIVATE_H
-
#define PIXMAN_DISABLE_DEPRECATED
#define PIXMAN_USE_INTERNAL_API
@@ -1052,4 +1066,6 @@ void pixman_timer_register (pixman_timer_t *timer);
#endif /* PIXMAN_TIMERS */
+#endif /* __ASSEMBLER__ */
+
#endif /* PIXMAN_PRIVATE_H */
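
One note on the new default: the header only requires BILINEAR_INTERPOLATION_BITS <= 8, and 7 appears to be chosen so that intermediate values keep signed 16-bit headroom for the SIMD fast paths (this headroom argument is an inference from the MMX/SSE2 code above, not something the header states):

/* With 8-bit channels and vertical weights wt + wb <= BILINEAR_INTERPOLATION_RANGE:
 *   max vertical sum = 255 * 128 = 32640 < 32768
 * so the vertically interpolated values still fit a signed 16-bit lane, which
 * the "BILINEAR_INTERPOLATION_BITS < 8" _mm_madd_pi16 (pmaddwd) branches rely on. */
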
diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c
index b656d17d4..ba067bc31 100644
--- a/pixman/pixman/pixman-sse2.c
+++ b/pixman/pixman/pixman-sse2.c
@@ -5364,11 +5364,15 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
scaled_nearest_scanline_sse2_8888_n_8888_OVER,
uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
+#define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1)
+
#define BILINEAR_DECLARE_VARIABLES \
const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \
const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \
- const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);\
- const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1); \
+ const __m128i xmm_xorc8 = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, BMSK);\
+ const __m128i xmm_addc8 = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1); \
+ const __m128i xmm_xorc7 = _mm_set_epi16 (0, BMSK, 0, BMSK, 0, BMSK, 0, BMSK);\
+ const __m128i xmm_addc7 = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \
const __m128i xmm_ux = _mm_set_epi16 (unit_x&0xffff, unit_x&0xffff, unit_x&0xffff, unit_x&0xffff, \
unit_x&0xffff, unit_x&0xffff, unit_x&0xffff, unit_x&0xffff); \
const __m128i xmm_zero = _mm_setzero_si128 (); \
@@ -5388,18 +5392,30 @@ do { \
xmm_wt), \
_mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), \
xmm_wb)); \
- /* calculate horizontal weights */ \
- xmm_wh = _mm_add_epi16 (xmm_addc, \
- _mm_xor_si128 (xmm_xorc, \
- _mm_srli_epi16 (xmm_x, 8))); \
- xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
- /* horizontal interpolation */ \
- xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \
- xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \
- a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \
- _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \
+ if (BILINEAR_INTERPOLATION_BITS < 8) \
+ { \
+ /* calculate horizontal weights */ \
+ xmm_wh = _mm_add_epi16 (xmm_addc7, _mm_xor_si128 (xmm_xorc7, \
+ _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS))); \
+ xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
+ /* horizontal interpolation */ \
+ a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \
+ a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh); \
+ } \
+ else \
+ { \
+ /* calculate horizontal weights */ \
+ xmm_wh = _mm_add_epi16 (xmm_addc8, _mm_xor_si128 (xmm_xorc8, \
+ _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS))); \
+ xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
+ /* horizontal interpolation */ \
+ xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \
+ xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \
+ a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \
+ _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \
+ } \
/* shift and pack the result */ \
- a = _mm_srli_epi32 (a, 16); \
+ a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2); \
a = _mm_packs_epi32 (a, a); \
a = _mm_packus_epi16 (a, a); \
pix = _mm_cvtsi128_si32 (a); \
@@ -5845,6 +5861,9 @@ static const pixman_fast_path_t sse2_fast_paths[] =
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
diff --git a/pixman/test/affine-test.c b/pixman/test/affine-test.c
index a4ceed3da..6827cc3a8 100644
--- a/pixman/test/affine-test.c
+++ b/pixman/test/affine-test.c
@@ -301,11 +301,21 @@ test_composite (int testnum,
return crc32;
}
+#if BILINEAR_INTERPOLATION_BITS == 8
+#define CHECKSUM 0x1EF2175A
+#elif BILINEAR_INTERPOLATION_BITS == 7
+#define CHECKSUM 0x74050F50
+#elif BILINEAR_INTERPOLATION_BITS == 4
+#define CHECKSUM 0x4362EAE8
+#else
+#define CHECKSUM 0x00000000
+#endif
+
int
main (int argc, const char *argv[])
{
pixman_disable_out_of_bounds_workaround ();
- return fuzzer_test_main ("affine", 8000000, 0x1EF2175A,
+ return fuzzer_test_main ("affine", 8000000, CHECKSUM,
test_composite, argc, argv);
}
diff --git a/pixman/test/scaling-test.c b/pixman/test/scaling-test.c
index 6f2da1432..44c4f3de4 100644
--- a/pixman/test/scaling-test.c
+++ b/pixman/test/scaling-test.c
@@ -357,11 +357,21 @@ test_composite (int testnum,
return crc32;
}
+#if BILINEAR_INTERPOLATION_BITS == 8
+#define CHECKSUM 0x80DF1CB2
+#elif BILINEAR_INTERPOLATION_BITS == 7
+#define CHECKSUM 0x2818D5FB
+#elif BILINEAR_INTERPOLATION_BITS == 4
+#define CHECKSUM 0x387540A5
+#else
+#define CHECKSUM 0x00000000
+#endif
+
int
main (int argc, const char *argv[])
{
pixman_disable_out_of_bounds_workaround ();
- return fuzzer_test_main("scaling", 8000000, 0x80DF1CB2,
+ return fuzzer_test_main("scaling", 8000000, CHECKSUM,
test_composite, argc, argv);
}