diff options
Diffstat (limited to 'pixman')
-rw-r--r-- | pixman/pixman/pixman-arm-common.h | 45 | ||||
-rw-r--r-- | pixman/pixman/pixman-arm-neon-asm.S | 292 | ||||
-rw-r--r-- | pixman/pixman/pixman-arm-neon-asm.h | 17 | ||||
-rw-r--r-- | pixman/pixman/pixman-arm-neon.c | 56 | ||||
-rw-r--r-- | pixman/pixman/pixman-arm-simd-asm.S | 66 | ||||
-rw-r--r-- | pixman/pixman/pixman-arm-simd.c | 855 |
6 files changed, 771 insertions, 560 deletions
diff --git a/pixman/pixman/pixman-arm-common.h b/pixman/pixman/pixman-arm-common.h index 7420d9305..2c435041a 100644 --- a/pixman/pixman/pixman-arm-common.h +++ b/pixman/pixman/pixman-arm-common.h @@ -361,4 +361,49 @@ FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_pad_##op, \ SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func), \
SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
+/*****************************************************************************/
+
+#define PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST(flags, cputype, name, op, \
+ src_type, dst_type) \
+void \
+pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype ( \
+ dst_type * dst, \
+ const src_type * top, \
+ const src_type * bottom, \
+ int wt, \
+ int wb, \
+ pixman_fixed_t x, \
+ pixman_fixed_t ux, \
+ int width); \
+ \
+static force_inline void \
+scaled_bilinear_scanline_##cputype##_##name##_##op ( \
+ dst_type * dst, \
+ const uint32_t * mask, \
+ const src_type * src_top, \
+ const src_type * src_bottom, \
+ int32_t w, \
+ int wt, \
+ int wb, \
+ pixman_fixed_t vx, \
+ pixman_fixed_t unit_x, \
+ pixman_fixed_t max_vx, \
+ pixman_bool_t zero_src) \
+{ \
+ if ((flags & SKIP_ZERO_SRC) && zero_src) \
+ return; \
+ pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype ( \
+ dst, src_top, src_bottom, wt, wb, vx, unit_x, w); \
+} \
+ \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op, \
+ scaled_bilinear_scanline_##cputype##_##name##_##op, \
+ src_type, uint32_t, dst_type, COVER, FALSE, FALSE) \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op, \
+ scaled_bilinear_scanline_##cputype##_##name##_##op, \
+ src_type, uint32_t, dst_type, NONE, FALSE, FALSE) \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op, \
+ scaled_bilinear_scanline_##cputype##_##name##_##op, \
+ src_type, uint32_t, dst_type, PAD, FALSE, FALSE)
+
#endif
diff --git a/pixman/pixman/pixman-arm-neon-asm.S b/pixman/pixman/pixman-arm-neon-asm.S index 5804396a4..6cca0f2cc 100644 --- a/pixman/pixman/pixman-arm-neon-asm.S +++ b/pixman/pixman/pixman-arm-neon-asm.S @@ -2405,17 +2405,161 @@ generate_composite_function_nearest_scanline \ fname:
.endm
-.macro bilinear_interpolate_last_pixel
- mov TMP1, X, asr #16
+/*
+ * Bilinear scaling support code which tries to provide pixel fetching, color
+ * format conversion, and interpolation as separate macros which can be used
+ * as the basic building blocks for constructing bilinear scanline functions.
+ */
+
+.macro bilinear_load_8888 reg1, reg2, tmp
mov TMP2, X, asr #16
- add TMP1, TOP, TMP1, asl #2
+ add X, X, UX
+ add TMP1, TOP, TMP2, asl #2
add TMP2, BOTTOM, TMP2, asl #2
- vld1.32 {d0}, [TMP1]
- vshr.u16 d30, d24, #8
- vld1.32 {d1}, [TMP2]
+ vld1.32 {reg1}, [TMP1]
+ vld1.32 {reg2}, [TMP2]
+.endm
+
+.macro bilinear_load_0565 reg1, reg2, tmp
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP2, asl #1
+ add TMP2, BOTTOM, TMP2, asl #1
+ vld1.32 {reg2[0]}, [TMP1]
+ vld1.32 {reg2[1]}, [TMP2]
+ convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_8888 \
+ acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
+
+ bilinear_load_8888 reg1, reg2, tmp1
+ vmull.u8 acc1, reg1, d28
+ vmlal.u8 acc1, reg2, d29
+ bilinear_load_8888 reg3, reg4, tmp2
+ vmull.u8 acc2, reg3, d28
+ vmlal.u8 acc2, reg4, d29
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_8888 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+ bilinear_load_and_vertical_interpolate_two_8888 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
+ bilinear_load_and_vertical_interpolate_two_8888 \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_0565 \
+ acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
+
+ mov TMP2, X, asr #16
+ add X, X, UX
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP2, asl #1
+ add TMP2, BOTTOM, TMP2, asl #1
+ add TMP3, TOP, TMP4, asl #1
+ add TMP4, BOTTOM, TMP4, asl #1
+ vld1.32 {acc2lo[0]}, [TMP1]
+ vld1.32 {acc2hi[0]}, [TMP3]
+ vld1.32 {acc2lo[1]}, [TMP2]
+ vld1.32 {acc2hi[1]}, [TMP4]
+ convert_0565_to_x888 acc2, reg3, reg2, reg1
+ vzip.u8 reg1, reg3
+ vzip.u8 reg2, reg4
+ vzip.u8 reg3, reg4
+ vzip.u8 reg1, reg2
+ vmull.u8 acc1, reg1, d28
+ vmlal.u8 acc1, reg2, d29
+ vmull.u8 acc2, reg3, d28
+ vmlal.u8 acc2, reg4, d29
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_0565 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+ mov TMP2, X, asr #16
+ add X, X, UX
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP2, asl #1
+ add TMP2, BOTTOM, TMP2, asl #1
+ add TMP3, TOP, TMP4, asl #1
+ add TMP4, BOTTOM, TMP4, asl #1
+ vld1.32 {xacc2lo[0]}, [TMP1]
+ vld1.32 {xacc2hi[0]}, [TMP3]
+ vld1.32 {xacc2lo[1]}, [TMP2]
+ vld1.32 {xacc2hi[1]}, [TMP4]
+ convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
+ mov TMP2, X, asr #16
+ add X, X, UX
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP2, asl #1
+ add TMP2, BOTTOM, TMP2, asl #1
+ add TMP3, TOP, TMP4, asl #1
+ add TMP4, BOTTOM, TMP4, asl #1
+ vld1.32 {yacc2lo[0]}, [TMP1]
+ vzip.u8 xreg1, xreg3
+ vld1.32 {yacc2hi[0]}, [TMP3]
+ vzip.u8 xreg2, xreg4
+ vld1.32 {yacc2lo[1]}, [TMP2]
+ vzip.u8 xreg3, xreg4
+ vld1.32 {yacc2hi[1]}, [TMP4]
+ vzip.u8 xreg1, xreg2
+ convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
+ vmull.u8 xacc1, xreg1, d28
+ vzip.u8 yreg1, yreg3
+ vmlal.u8 xacc1, xreg2, d29
+ vzip.u8 yreg2, yreg4
+ vmull.u8 xacc2, xreg3, d28
+ vzip.u8 yreg3, yreg4
+ vmlal.u8 xacc2, xreg4, d29
+ vzip.u8 yreg1, yreg2
+ vmull.u8 yacc1, yreg1, d28
+ vmlal.u8 yacc1, yreg2, d29
+ vmull.u8 yacc2, yreg3, d28
+ vmlal.u8 yacc2, yreg4, d29
+.endm
+
+.macro bilinear_store_8888 numpix, tmp1, tmp2
+.if numpix == 4
+ vst1.32 {d0, d1}, [OUT]!
+.elseif numpix == 2
+ vst1.32 {d0}, [OUT]!
+.elseif numpix == 1
+ vst1.32 {d0[0]}, [OUT, :32]!
+.else
+ .error bilinear_store_8888 numpix is unsupported
+.endif
+.endm
+
+.macro bilinear_store_0565 numpix, tmp1, tmp2
+ vuzp.u8 d0, d1
+ vuzp.u8 d2, d3
+ vuzp.u8 d1, d3
+ vuzp.u8 d0, d2
+ convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
+.if numpix == 4
+ vst1.16 {d2}, [OUT]!
+.elseif numpix == 2
+ vst1.32 {d2[0]}, [OUT]!
+.elseif numpix == 1
+ vst1.16 {d2[0]}, [OUT]!
+.else
+ .error bilinear_store_0565 numpix is unsupported
+.endif
+.endm
+
+.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
+ bilinear_load_&src_fmt d0, d1, d2
vmull.u8 q1, d0, d28
vmlal.u8 q1, d1, d29
- /* 5 cycles bubble */
+ vshr.u16 d30, d24, #8
+ /* 4 cycles bubble */
vshll.u16 q0, d2, #8
vmlsl.u16 q0, d2, d30
vmlal.u16 q0, d3, d30
@@ -2424,28 +2568,12 @@ fname: /* 3 cycles bubble */
vmovn.u16 d0, q0
/* 1 cycle bubble */
- vst1.32 {d0[0]}, [OUT, :32]!
+ bilinear_store_&dst_fmt 1, q2, q3
.endm
-.macro bilinear_interpolate_two_pixels
- mov TMP1, X, asr #16
- mov TMP2, X, asr #16
- add X, X, UX
- add TMP1, TOP, TMP1, asl #2
- add TMP2, BOTTOM, TMP2, asl #2
- vld1.32 {d0}, [TMP1]
- vld1.32 {d1}, [TMP2]
- vmull.u8 q1, d0, d28
- vmlal.u8 q1, d1, d29
- mov TMP1, X, asr #16
- mov TMP2, X, asr #16
- add X, X, UX
- add TMP1, TOP, TMP1, asl #2
- add TMP2, BOTTOM, TMP2, asl #2
- vld1.32 {d20}, [TMP1]
- vld1.32 {d21}, [TMP2]
- vmull.u8 q11, d20, d28
- vmlal.u8 q11, d21, d29
+.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
+ bilinear_load_and_vertical_interpolate_two_&src_fmt \
+ q1, q11, d0, d1, d20, d21, d22, d23
vshr.u16 q15, q12, #8
vadd.u16 q12, q12, q13
vshll.u16 q0, d2, #8
@@ -2457,28 +2585,14 @@ fname: vshrn.u32 d30, q0, #16
vshrn.u32 d31, q10, #16
vmovn.u16 d0, q15
- vst1.32 {d0}, [OUT]!
+ bilinear_store_&dst_fmt 2, q2, q3
.endm
-.macro bilinear_interpolate_four_pixels
- mov TMP1, X, asr #16
- mov TMP2, X, asr #16
- add X, X, UX
- add TMP1, TOP, TMP1, asl #2
- add TMP2, BOTTOM, TMP2, asl #2
- vld1.32 {d0}, [TMP1]
- vld1.32 {d1}, [TMP2]
- vmull.u8 q1, d0, d28
- vmlal.u8 q1, d1, d29
- mov TMP1, X, asr #16
- mov TMP2, X, asr #16
- add X, X, UX
- add TMP1, TOP, TMP1, asl #2
- add TMP2, BOTTOM, TMP2, asl #2
- vld1.32 {d20}, [TMP1]
- vld1.32 {d21}, [TMP2]
- vmull.u8 q11, d20, d28
- vmlal.u8 q11, d21, d29
+.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
+ bilinear_load_and_vertical_interpolate_four_&src_fmt \
+ q1, q11, d0, d1, d20, d21, d22, d23 \
+ q3, q9, d4, d5, d16, d17, d18, d19
+ pld [TMP1, PF_OFFS]
vshr.u16 q15, q12, #8
vadd.u16 q12, q12, q13
vshll.u16 q0, d2, #8
@@ -2487,54 +2601,44 @@ fname: vshll.u16 q10, d22, #8
vmlsl.u16 q10, d22, d31
vmlal.u16 q10, d23, d31
- mov TMP1, X, asr #16
- mov TMP2, X, asr #16
- add X, X, UX
- add TMP1, TOP, TMP1, asl #2
- add TMP2, BOTTOM, TMP2, asl #2
- vld1.32 {d4}, [TMP1]
- vld1.32 {d5}, [TMP2]
- vmull.u8 q3, d4, d28
- vmlal.u8 q3, d5, d29
- mov TMP1, X, asr #16
- mov TMP2, X, asr #16
- add X, X, UX
- add TMP1, TOP, TMP1, asl #2
- add TMP2, BOTTOM, TMP2, asl #2
- vld1.32 {d16}, [TMP1]
- vld1.32 {d17}, [TMP2]
- vmull.u8 q9, d16, d28
- vmlal.u8 q9, d17, d29
vshr.u16 q15, q12, #8
- vadd.u16 q12, q12, q13
vshll.u16 q2, d6, #8
vmlsl.u16 q2, d6, d30
vmlal.u16 q2, d7, d30
vshll.u16 q8, d18, #8
+ pld [TMP2, PF_OFFS]
vmlsl.u16 q8, d18, d31
vmlal.u16 q8, d19, d31
+ vadd.u16 q12, q12, q13
vshrn.u32 d0, q0, #16
vshrn.u32 d1, q10, #16
vshrn.u32 d4, q2, #16
vshrn.u32 d5, q8, #16
vmovn.u16 d0, q0
vmovn.u16 d1, q2
- vst1.32 {d0, d1}, [OUT]!
+ bilinear_store_&dst_fmt 4, q2, q3
.endm
-
/*
- * pixman_scaled_bilinear_scanline_8888_8888_SRC (uint32_t * out,
- * const uint32_t * top,
- * const uint32_t * bottom,
- * int wt,
- * int wb,
- * pixman_fixed_t x,
- * pixman_fixed_t ux,
- * int width)
+ * Main template macro for generating NEON optimized bilinear scanline
+ * functions.
+ *
+ * TODO: use software pipelining and aligned writes to the destination buffer
+ * in order to improve performance
+ *
+ * Bilinear scanline scaler macro template uses the following arguments:
+ * fname - name of the function to generate
+ * src_fmt - source color format (8888 or 0565)
+ * dst_fmt - destination color format (8888 or 0565)
+ * bpp_shift - (1 << bpp_shift) is the size of source pixel in bytes
+ * prefetch_distance - prefetch in the source image by that many
+ * pixels ahead
*/
-pixman_asm_function pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon
+.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
+ bpp_shift, prefetch_distance
+
+pixman_asm_function fname
OUT .req r0
TOP .req r1
BOTTOM .req r2
@@ -2545,13 +2649,19 @@ pixman_asm_function pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon WIDTH .req ip
TMP1 .req r3
TMP2 .req r4
+ PF_OFFS .req r7
+ TMP3 .req r8
+ TMP4 .req r9
mov ip, sp
- push {r4, r5, r6, r7}
+ push {r4, r5, r6, r7, r8, r9}
+ mov PF_OFFS, #prefetch_distance
ldmia ip, {WB, X, UX, WIDTH}
+ mul PF_OFFS, PF_OFFS, UX
cmp WIDTH, #0
ble 3f
+
vdup.u16 q12, X
vdup.u16 q13, UX
vdup.u8 d28, WT
@@ -2561,20 +2671,21 @@ pixman_asm_function pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon subs WIDTH, WIDTH, #4
blt 1f
+ mov PF_OFFS, PF_OFFS, asr #(16 - bpp_shift)
0:
- bilinear_interpolate_four_pixels
+ bilinear_interpolate_four_pixels src_fmt, dst_fmt
subs WIDTH, WIDTH, #4
bge 0b
1:
tst WIDTH, #2
beq 2f
- bilinear_interpolate_two_pixels
+ bilinear_interpolate_two_pixels src_fmt, dst_fmt
2:
tst WIDTH, #1
beq 3f
- bilinear_interpolate_last_pixel
+ bilinear_interpolate_last_pixel src_fmt, dst_fmt
3:
- pop {r4, r5, r6, r7}
+ pop {r4, r5, r6, r7, r8, r9}
bx lr
.unreq OUT
@@ -2587,4 +2698,21 @@ pixman_asm_function pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon .unreq WIDTH
.unreq TMP1
.unreq TMP2
+ .unreq PF_OFFS
+ .unreq TMP3
+ .unreq TMP4
.endfunc
+
+.endm
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, 2, 28
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, 2, 28
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, 1, 28
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, 1, 28
diff --git a/pixman/pixman/pixman-arm-neon-asm.h b/pixman/pixman/pixman-arm-neon-asm.h index 6e3d583f5..0ba67d05f 100644 --- a/pixman/pixman/pixman-arm-neon-asm.h +++ b/pixman/pixman/pixman-arm-neon-asm.h @@ -1158,3 +1158,20 @@ fname: vsri.u16 out, tmp1, #5
vsri.u16 out, tmp2, #11
.endm
+
+/*
+ * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
+ * returned in (out0, out1) registers pair. Requires one temporary
+ * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
+ * value from 'in' is lost
+ */
+.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
+ vshl.u16 out0, in, #5 /* G top 6 bits */
+ vshl.u16 tmp, in, #11 /* B top 5 bits */
+ vsri.u16 in, in, #5 /* R is ready in top bits */
+ vsri.u16 out0, out0, #6 /* G is ready in top bits */
+ vsri.u16 tmp, tmp, #5 /* B is ready in top bits */
+ vshr.u16 out1, in, #8 /* R is in place */
+ vsri.u16 out0, tmp, #8 /* G & B is in place */
+ vzip.u16 out0, out1 /* everything is in place */
+.endm
diff --git a/pixman/pixman/pixman-arm-neon.c b/pixman/pixman/pixman-arm-neon.c index f86a49c53..5213a2007 100644 --- a/pixman/pixman/pixman-arm-neon.c +++ b/pixman/pixman/pixman-arm-neon.c @@ -127,6 +127,15 @@ PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_0565, PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 0565_8_0565,
OVER, uint16_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_8888, SRC,
+ uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_0565, SRC,
+ uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC,
+ uint16_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_0565, SRC,
+ uint16_t, uint16_t)
+
void
pixman_composite_src_n_8_asm_neon (int32_t w,
int32_t h,
@@ -232,47 +241,6 @@ pixman_blt_neon (uint32_t *src_bits, }
}
-void
-pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (uint32_t * out,
- const uint32_t * top,
- const uint32_t * bottom,
- int wt,
- int wb,
- pixman_fixed_t x,
- pixman_fixed_t ux,
- int width);
-
-static force_inline void
-scaled_bilinear_scanline_neon_8888_8888_SRC (uint32_t * dst,
- const uint32_t * mask,
- const uint32_t * src_top,
- const uint32_t * src_bottom,
- int32_t w,
- int wt,
- int wb,
- pixman_fixed_t vx,
- pixman_fixed_t unit_x,
- pixman_fixed_t max_vx,
- pixman_bool_t zero_src)
-{
- pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (dst, src_top,
- src_bottom, wt, wb,
- vx, unit_x, w);
-}
-
-FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_cover_SRC,
- scaled_bilinear_scanline_neon_8888_8888_SRC,
- uint32_t, uint32_t, uint32_t,
- COVER, FALSE, FALSE)
-FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_pad_SRC,
- scaled_bilinear_scanline_neon_8888_8888_SRC,
- uint32_t, uint32_t, uint32_t,
- PAD, FALSE, FALSE)
-FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_none_SRC,
- scaled_bilinear_scanline_neon_8888_8888_SRC,
- uint32_t, uint32_t, uint32_t,
- NONE, FALSE, FALSE)
-
static const pixman_fast_path_t arm_neon_fast_paths[] =
{
PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, neon_composite_src_0565_0565),
@@ -388,6 +356,12 @@ static const pixman_fast_path_t arm_neon_fast_paths[] = SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8888),
SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565),
+
+ SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_x888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_0565),
+
{ PIXMAN_OP_NONE },
};
diff --git a/pixman/pixman/pixman-arm-simd-asm.S b/pixman/pixman/pixman-arm-simd-asm.S index d97545c1b..e00836b6c 100644 --- a/pixman/pixman/pixman-arm-simd-asm.S +++ b/pixman/pixman/pixman-arm-simd-asm.S @@ -331,15 +331,29 @@ pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6 .endfunc
/*
- * Note: This function is only using armv4t instructions (not even armv6),
+ * Note: This code is only using armv5te instructions (not even armv6),
* but is scheduled for ARM Cortex-A8 pipeline. So it might need to
* be split into a few variants, tuned for each microarchitecture.
*
* TODO: In order to get good performance on ARM9/ARM11 cores (which don't
* have efficient write combining), it needs to be changed to use 16-byte
* aligned writes using STM instruction.
+ *
+ * Nearest scanline scaler macro template uses the following arguments:
+ * fname - name of the function to generate
+ * bpp_shift - (1 << bpp_shift) is the size of pixel in bytes
+ * t - type suffix for LDR/STR instructions
+ * prefetch_distance - prefetch in the source image by that many
+ * pixels ahead
+ * prefetch_braking_distance - stop prefetching when that many pixels are
+ * remaining before the end of scanline
*/
-pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
+
+.macro generate_nearest_scanline_func fname, bpp_shift, t, \
+ prefetch_distance, \
+ prefetch_braking_distance
+
+pixman_asm_function fname
W .req r0
DST .req r1
SRC .req r2
@@ -348,30 +362,46 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6 TMP1 .req r4
TMP2 .req r5
VXMASK .req r6
+ PF_OFFS .req r7
ldr UNIT_X, [sp]
push {r4, r5, r6, r7}
- mvn VXMASK, #1
+ mvn VXMASK, #((1 << bpp_shift) - 1)
/* define helper macro */
.macro scale_2_pixels
- ldrh TMP1, [SRC, TMP1]
- and TMP2, VXMASK, VX, lsr #15
+ ldr&t TMP1, [SRC, TMP1]
+ and TMP2, VXMASK, VX, lsr #(16 - bpp_shift)
add VX, VX, UNIT_X
- strh TMP1, [DST], #2
+ str&t TMP1, [DST], #(1 << bpp_shift)
- ldrh TMP2, [SRC, TMP2]
- and TMP1, VXMASK, VX, lsr #15
+ ldr&t TMP2, [SRC, TMP2]
+ and TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
add VX, VX, UNIT_X
- strh TMP2, [DST], #2
+ str&t TMP2, [DST], #(1 << bpp_shift)
.endm
/* now do the scaling */
- and TMP1, VXMASK, VX, lsr #15
+ and TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
add VX, VX, UNIT_X
- subs W, #4
+ subs W, W, #(8 + prefetch_braking_distance)
blt 2f
-1: /* main loop, process 4 pixels per iteration */
+ /* calculate prefetch offset */
+ mov PF_OFFS, #prefetch_distance
+ mla PF_OFFS, UNIT_X, PF_OFFS, VX
+1: /* main loop, process 8 pixels per iteration with prefetch */
+ subs W, W, #8
+ add PF_OFFS, UNIT_X, lsl #3
+ scale_2_pixels
+ scale_2_pixels
+ scale_2_pixels
+ scale_2_pixels
+ pld [SRC, PF_OFFS, lsr #(16 - bpp_shift)]
+ bge 1b
+2:
+ subs W, W, #(4 - 8 - prefetch_braking_distance)
+ blt 2f
+1: /* process the remaining pixels */
scale_2_pixels
scale_2_pixels
subs W, W, #4
@@ -382,8 +412,8 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6 scale_2_pixels
2:
tst W, #1
- ldrneh TMP1, [SRC, TMP1]
- strneh TMP1, [DST], #2
+ ldrne&t TMP1, [SRC, TMP1]
+ strne&t TMP1, [DST]
/* cleanup helper macro */
.purgem scale_2_pixels
.unreq DST
@@ -394,7 +424,15 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6 .unreq TMP1
.unreq TMP2
.unreq VXMASK
+ .unreq PF_OFFS
/* return */
pop {r4, r5, r6, r7}
bx lr
.endfunc
+.endm
+
+generate_nearest_scanline_func \
+ pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32
+
+generate_nearest_scanline_func \
+ pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32
diff --git a/pixman/pixman/pixman-arm-simd.c b/pixman/pixman/pixman-arm-simd.c index 6bbc1094d..45981a6b0 100644 --- a/pixman/pixman/pixman-arm-simd.c +++ b/pixman/pixman/pixman-arm-simd.c @@ -1,423 +1,432 @@ -/* - * Copyright © 2008 Mozilla Corporation - * - * Permission to use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that - * copyright notice and this permission notice appear in supporting - * documentation, and that the name of Mozilla Corporation not be used in - * advertising or publicity pertaining to distribution of the software without - * specific, written prior permission. Mozilla Corporation makes no - * representations about the suitability of this software for any purpose. It - * is provided "as is" without express or implied warranty. - * - * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS - * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY - * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN - * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING - * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS - * SOFTWARE. - * - * Author: Jeff Muizelaar (jeff@infidigm.net) - * - */ -#ifdef HAVE_CONFIG_H -#include <config.h> -#endif - -#include "pixman-private.h" -#include "pixman-arm-common.h" -#include "pixman-fast-path.h" - -#if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */ - -void -pixman_composite_add_8_8_asm_armv6 (int32_t width, - int32_t height, - uint8_t *dst_line, - int32_t dst_stride, - uint8_t *src_line, - int32_t src_stride) -{ - uint8_t *dst, *src; - int32_t w; - uint8_t s, d; - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - - /* ensure both src and dst are properly aligned before doing 32 bit reads - * we'll stay in this loop if src and dst have differing alignments - */ - while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3))) - { - s = *src; - d = *dst; - asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s)); - *dst = d; - - dst++; - src++; - w--; - } - - while (w >= 4) - { - asm ("uqadd8 %0, %1, %2" - : "=r" (*(uint32_t*)dst) - : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst)); - dst += 4; - src += 4; - w -= 4; - } - - while (w) - { - s = *src; - d = *dst; - asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s)); - *dst = d; - - dst++; - src++; - w--; - } - } - -} - -void -pixman_composite_over_8888_8888_asm_armv6 (int32_t width, - int32_t height, - uint32_t *dst_line, - int32_t dst_stride, - uint32_t *src_line, - int32_t src_stride) -{ - uint32_t *dst; - uint32_t *src; - int32_t w; - uint32_t component_half = 0x800080; - uint32_t upper_component_mask = 0xff00ff00; - uint32_t alpha_mask = 0xff; - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - -/* #define inner_branch */ - asm volatile ( - "cmp %[w], #0\n\t" - "beq 2f\n\t" - "1:\n\t" - /* load src */ - "ldr r5, [%[src]], #4\n\t" -#ifdef inner_branch - /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. - * The 0x0 case also allows us to avoid doing an unecessary data - * write which is more valuable so we only check for that - */ - "cmp r5, #0\n\t" - "beq 3f\n\t" - - /* = 255 - alpha */ - "sub r8, %[alpha_mask], r5, lsr #24\n\t" - - "ldr r4, [%[dest]] \n\t" - -#else - "ldr r4, [%[dest]] \n\t" - - /* = 255 - alpha */ - "sub r8, %[alpha_mask], r5, lsr #24\n\t" -#endif - "uxtb16 r6, r4\n\t" - "uxtb16 r7, r4, ror #8\n\t" - - /* multiply by 257 and divide by 65536 */ - "mla r6, r6, r8, %[component_half]\n\t" - "mla r7, r7, r8, %[component_half]\n\t" - - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" - - /* recombine the 0xff00ff00 bytes of r6 and r7 */ - "and r7, r7, %[upper_component_mask]\n\t" - "uxtab16 r6, r7, r6, ror #8\n\t" - - "uqadd8 r5, r6, r5\n\t" - -#ifdef inner_branch - "3:\n\t" - -#endif - "str r5, [%[dest]], #4\n\t" - /* increment counter and jmp to top */ - "subs %[w], %[w], #1\n\t" - "bne 1b\n\t" - "2:\n\t" - : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src) - : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask), - [alpha_mask] "r" (alpha_mask) - : "r4", "r5", "r6", "r7", "r8", "cc", "memory" - ); - } -} - -void -pixman_composite_over_8888_n_8888_asm_armv6 (int32_t width, - int32_t height, - uint32_t *dst_line, - int32_t dst_stride, - uint32_t *src_line, - int32_t src_stride, - uint32_t mask) -{ - uint32_t *dst; - uint32_t *src; - int32_t w; - uint32_t component_half = 0x800080; - uint32_t alpha_mask = 0xff; - - mask = (mask) >> 24; - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - -/* #define inner_branch */ - asm volatile ( - "cmp %[w], #0\n\t" - "beq 2f\n\t" - "1:\n\t" - /* load src */ - "ldr r5, [%[src]], #4\n\t" -#ifdef inner_branch - /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. - * The 0x0 case also allows us to avoid doing an unecessary data - * write which is more valuable so we only check for that - */ - "cmp r5, #0\n\t" - "beq 3f\n\t" - -#endif - "ldr r4, [%[dest]] \n\t" - - "uxtb16 r6, r5\n\t" - "uxtb16 r7, r5, ror #8\n\t" - - /* multiply by alpha (r8) then by 257 and divide by 65536 */ - "mla r6, r6, %[mask_alpha], %[component_half]\n\t" - "mla r7, r7, %[mask_alpha], %[component_half]\n\t" - - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" - - "uxtb16 r6, r6, ror #8\n\t" - "uxtb16 r7, r7, ror #8\n\t" - - /* recombine */ - "orr r5, r6, r7, lsl #8\n\t" - - "uxtb16 r6, r4\n\t" - "uxtb16 r7, r4, ror #8\n\t" - - /* 255 - alpha */ - "sub r8, %[alpha_mask], r5, lsr #24\n\t" - - /* multiply by alpha (r8) then by 257 and divide by 65536 */ - "mla r6, r6, r8, %[component_half]\n\t" - "mla r7, r7, r8, %[component_half]\n\t" - - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" - - "uxtb16 r6, r6, ror #8\n\t" - "uxtb16 r7, r7, ror #8\n\t" - - /* recombine */ - "orr r6, r6, r7, lsl #8\n\t" - - "uqadd8 r5, r6, r5\n\t" - -#ifdef inner_branch - "3:\n\t" - -#endif - "str r5, [%[dest]], #4\n\t" - /* increment counter and jmp to top */ - "subs %[w], %[w], #1\n\t" - "bne 1b\n\t" - "2:\n\t" - : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src) - : [component_half] "r" (component_half), [mask_alpha] "r" (mask), - [alpha_mask] "r" (alpha_mask) - : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory" - ); - } -} - -void -pixman_composite_over_n_8_8888_asm_armv6 (int32_t width, - int32_t height, - uint32_t *dst_line, - int32_t dst_stride, - uint32_t src, - int32_t unused, - uint8_t *mask_line, - int32_t mask_stride) -{ - uint32_t srca; - uint32_t *dst; - uint8_t *mask; - int32_t w; - - srca = src >> 24; - - uint32_t component_mask = 0xff00ff; - uint32_t component_half = 0x800080; - - uint32_t src_hi = (src >> 8) & component_mask; - uint32_t src_lo = src & component_mask; - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - w = width; - -/* #define inner_branch */ - asm volatile ( - "cmp %[w], #0\n\t" - "beq 2f\n\t" - "1:\n\t" - /* load mask */ - "ldrb r5, [%[mask]], #1\n\t" -#ifdef inner_branch - /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. - * The 0x0 case also allows us to avoid doing an unecessary data - * write which is more valuable so we only check for that - */ - "cmp r5, #0\n\t" - "beq 3f\n\t" - -#endif - "ldr r4, [%[dest]] \n\t" - - /* multiply by alpha (r8) then by 257 and divide by 65536 */ - "mla r6, %[src_lo], r5, %[component_half]\n\t" - "mla r7, %[src_hi], r5, %[component_half]\n\t" - - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" - - "uxtb16 r6, r6, ror #8\n\t" - "uxtb16 r7, r7, ror #8\n\t" - - /* recombine */ - "orr r5, r6, r7, lsl #8\n\t" - - "uxtb16 r6, r4\n\t" - "uxtb16 r7, r4, ror #8\n\t" - - /* we could simplify this to use 'sub' if we were - * willing to give up a register for alpha_mask - */ - "mvn r8, r5\n\t" - "mov r8, r8, lsr #24\n\t" - - /* multiply by alpha (r8) then by 257 and divide by 65536 */ - "mla r6, r6, r8, %[component_half]\n\t" - "mla r7, r7, r8, %[component_half]\n\t" - - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" - - "uxtb16 r6, r6, ror #8\n\t" - "uxtb16 r7, r7, ror #8\n\t" - - /* recombine */ - "orr r6, r6, r7, lsl #8\n\t" - - "uqadd8 r5, r6, r5\n\t" - -#ifdef inner_branch - "3:\n\t" - -#endif - "str r5, [%[dest]], #4\n\t" - /* increment counter and jmp to top */ - "subs %[w], %[w], #1\n\t" - "bne 1b\n\t" - "2:\n\t" - : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask) - : [component_half] "r" (component_half), - [src_hi] "r" (src_hi), [src_lo] "r" (src_lo) - : "r4", "r5", "r6", "r7", "r8", "cc", "memory"); - } -} - -#endif - -PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8, - uint8_t, 1, uint8_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888, - uint32_t, 1, uint32_t, 1) - -PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888, - uint32_t, 1, uint32_t, 1) - -PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888, - uint8_t, 1, uint32_t, 1) - -PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC, - uint16_t, uint16_t) - -static const pixman_fast_path_t arm_simd_fast_paths[] = -{ - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, armv6_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, armv6_composite_over_8888_n_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, armv6_composite_over_8888_n_8888), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888), - - PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8_8), - - PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, armv6_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, armv6_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888), - - PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565), - PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565), - - { PIXMAN_OP_NONE }, -}; - -pixman_implementation_t * -_pixman_implementation_create_arm_simd (pixman_implementation_t *fallback) -{ - pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths); - - return imp; -} +/*
+ * Copyright © 2008 Mozilla Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Jeff Muizelaar (jeff@infidigm.net)
+ *
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "pixman-private.h"
+#include "pixman-arm-common.h"
+#include "pixman-fast-path.h"
+
+#if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */
+
+void
+pixman_composite_add_8_8_asm_armv6 (int32_t width,
+ int32_t height,
+ uint8_t *dst_line,
+ int32_t dst_stride,
+ uint8_t *src_line,
+ int32_t src_stride)
+{
+ uint8_t *dst, *src;
+ int32_t w;
+ uint8_t s, d;
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ /* ensure both src and dst are properly aligned before doing 32 bit reads
+ * we'll stay in this loop if src and dst have differing alignments
+ */
+ while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
+ {
+ s = *src;
+ d = *dst;
+ asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
+ *dst = d;
+
+ dst++;
+ src++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ asm ("uqadd8 %0, %1, %2"
+ : "=r" (*(uint32_t*)dst)
+ : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
+ dst += 4;
+ src += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ s = *src;
+ d = *dst;
+ asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
+ *dst = d;
+
+ dst++;
+ src++;
+ w--;
+ }
+ }
+
+}
+
+void
+pixman_composite_over_8888_8888_asm_armv6 (int32_t width,
+ int32_t height,
+ uint32_t *dst_line,
+ int32_t dst_stride,
+ uint32_t *src_line,
+ int32_t src_stride)
+{
+ uint32_t *dst;
+ uint32_t *src;
+ int32_t w;
+ uint32_t component_half = 0x800080;
+ uint32_t upper_component_mask = 0xff00ff00;
+ uint32_t alpha_mask = 0xff;
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+/* #define inner_branch */
+ asm volatile (
+ "cmp %[w], #0\n\t"
+ "beq 2f\n\t"
+ "1:\n\t"
+ /* load src */
+ "ldr r5, [%[src]], #4\n\t"
+#ifdef inner_branch
+ /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
+ * The 0x0 case also allows us to avoid doing an unecessary data
+ * write which is more valuable so we only check for that
+ */
+ "cmp r5, #0\n\t"
+ "beq 3f\n\t"
+
+ /* = 255 - alpha */
+ "sub r8, %[alpha_mask], r5, lsr #24\n\t"
+
+ "ldr r4, [%[dest]] \n\t"
+
+#else
+ "ldr r4, [%[dest]] \n\t"
+
+ /* = 255 - alpha */
+ "sub r8, %[alpha_mask], r5, lsr #24\n\t"
+#endif
+ "uxtb16 r6, r4\n\t"
+ "uxtb16 r7, r4, ror #8\n\t"
+
+ /* multiply by 257 and divide by 65536 */
+ "mla r6, r6, r8, %[component_half]\n\t"
+ "mla r7, r7, r8, %[component_half]\n\t"
+
+ "uxtab16 r6, r6, r6, ror #8\n\t"
+ "uxtab16 r7, r7, r7, ror #8\n\t"
+
+ /* recombine the 0xff00ff00 bytes of r6 and r7 */
+ "and r7, r7, %[upper_component_mask]\n\t"
+ "uxtab16 r6, r7, r6, ror #8\n\t"
+
+ "uqadd8 r5, r6, r5\n\t"
+
+#ifdef inner_branch
+ "3:\n\t"
+
+#endif
+ "str r5, [%[dest]], #4\n\t"
+ /* increment counter and jmp to top */
+ "subs %[w], %[w], #1\n\t"
+ "bne 1b\n\t"
+ "2:\n\t"
+ : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
+ : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
+ [alpha_mask] "r" (alpha_mask)
+ : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
+ );
+ }
+}
+
+void
+pixman_composite_over_8888_n_8888_asm_armv6 (int32_t width,
+ int32_t height,
+ uint32_t *dst_line,
+ int32_t dst_stride,
+ uint32_t *src_line,
+ int32_t src_stride,
+ uint32_t mask)
+{
+ uint32_t *dst;
+ uint32_t *src;
+ int32_t w;
+ uint32_t component_half = 0x800080;
+ uint32_t alpha_mask = 0xff;
+
+ mask = (mask) >> 24;
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+/* #define inner_branch */
+ asm volatile (
+ "cmp %[w], #0\n\t"
+ "beq 2f\n\t"
+ "1:\n\t"
+ /* load src */
+ "ldr r5, [%[src]], #4\n\t"
+#ifdef inner_branch
+ /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
+ * The 0x0 case also allows us to avoid doing an unecessary data
+ * write which is more valuable so we only check for that
+ */
+ "cmp r5, #0\n\t"
+ "beq 3f\n\t"
+
+#endif
+ "ldr r4, [%[dest]] \n\t"
+
+ "uxtb16 r6, r5\n\t"
+ "uxtb16 r7, r5, ror #8\n\t"
+
+ /* multiply by alpha (r8) then by 257 and divide by 65536 */
+ "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
+ "mla r7, r7, %[mask_alpha], %[component_half]\n\t"
+
+ "uxtab16 r6, r6, r6, ror #8\n\t"
+ "uxtab16 r7, r7, r7, ror #8\n\t"
+
+ "uxtb16 r6, r6, ror #8\n\t"
+ "uxtb16 r7, r7, ror #8\n\t"
+
+ /* recombine */
+ "orr r5, r6, r7, lsl #8\n\t"
+
+ "uxtb16 r6, r4\n\t"
+ "uxtb16 r7, r4, ror #8\n\t"
+
+ /* 255 - alpha */
+ "sub r8, %[alpha_mask], r5, lsr #24\n\t"
+
+ /* multiply by alpha (r8) then by 257 and divide by 65536 */
+ "mla r6, r6, r8, %[component_half]\n\t"
+ "mla r7, r7, r8, %[component_half]\n\t"
+
+ "uxtab16 r6, r6, r6, ror #8\n\t"
+ "uxtab16 r7, r7, r7, ror #8\n\t"
+
+ "uxtb16 r6, r6, ror #8\n\t"
+ "uxtb16 r7, r7, ror #8\n\t"
+
+ /* recombine */
+ "orr r6, r6, r7, lsl #8\n\t"
+
+ "uqadd8 r5, r6, r5\n\t"
+
+#ifdef inner_branch
+ "3:\n\t"
+
+#endif
+ "str r5, [%[dest]], #4\n\t"
+ /* increment counter and jmp to top */
+ "subs %[w], %[w], #1\n\t"
+ "bne 1b\n\t"
+ "2:\n\t"
+ : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
+ : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
+ [alpha_mask] "r" (alpha_mask)
+ : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
+ );
+ }
+}
+
+void
+pixman_composite_over_n_8_8888_asm_armv6 (int32_t width,
+ int32_t height,
+ uint32_t *dst_line,
+ int32_t dst_stride,
+ uint32_t src,
+ int32_t unused,
+ uint8_t *mask_line,
+ int32_t mask_stride)
+{
+ uint32_t srca;
+ uint32_t *dst;
+ uint8_t *mask;
+ int32_t w;
+
+ srca = src >> 24;
+
+ uint32_t component_mask = 0xff00ff;
+ uint32_t component_half = 0x800080;
+
+ uint32_t src_hi = (src >> 8) & component_mask;
+ uint32_t src_lo = src & component_mask;
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+/* #define inner_branch */
+ asm volatile (
+ "cmp %[w], #0\n\t"
+ "beq 2f\n\t"
+ "1:\n\t"
+ /* load mask */
+ "ldrb r5, [%[mask]], #1\n\t"
+#ifdef inner_branch
+ /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
+ * The 0x0 case also allows us to avoid doing an unecessary data
+ * write which is more valuable so we only check for that
+ */
+ "cmp r5, #0\n\t"
+ "beq 3f\n\t"
+
+#endif
+ "ldr r4, [%[dest]] \n\t"
+
+ /* multiply by alpha (r8) then by 257 and divide by 65536 */
+ "mla r6, %[src_lo], r5, %[component_half]\n\t"
+ "mla r7, %[src_hi], r5, %[component_half]\n\t"
+
+ "uxtab16 r6, r6, r6, ror #8\n\t"
+ "uxtab16 r7, r7, r7, ror #8\n\t"
+
+ "uxtb16 r6, r6, ror #8\n\t"
+ "uxtb16 r7, r7, ror #8\n\t"
+
+ /* recombine */
+ "orr r5, r6, r7, lsl #8\n\t"
+
+ "uxtb16 r6, r4\n\t"
+ "uxtb16 r7, r4, ror #8\n\t"
+
+ /* we could simplify this to use 'sub' if we were
+ * willing to give up a register for alpha_mask
+ */
+ "mvn r8, r5\n\t"
+ "mov r8, r8, lsr #24\n\t"
+
+ /* multiply by alpha (r8) then by 257 and divide by 65536 */
+ "mla r6, r6, r8, %[component_half]\n\t"
+ "mla r7, r7, r8, %[component_half]\n\t"
+
+ "uxtab16 r6, r6, r6, ror #8\n\t"
+ "uxtab16 r7, r7, r7, ror #8\n\t"
+
+ "uxtb16 r6, r6, ror #8\n\t"
+ "uxtb16 r7, r7, ror #8\n\t"
+
+ /* recombine */
+ "orr r6, r6, r7, lsl #8\n\t"
+
+ "uqadd8 r5, r6, r5\n\t"
+
+#ifdef inner_branch
+ "3:\n\t"
+
+#endif
+ "str r5, [%[dest]], #4\n\t"
+ /* increment counter and jmp to top */
+ "subs %[w], %[w], #1\n\t"
+ "bne 1b\n\t"
+ "2:\n\t"
+ : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)
+ : [component_half] "r" (component_half),
+ [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
+ : "r4", "r5", "r6", "r7", "r8", "cc", "memory");
+ }
+}
+
+#endif
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
+ uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
+ uint32_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888,
+ uint32_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888,
+ uint8_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
+ uint16_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
+ uint32_t, uint32_t)
+
+static const pixman_fast_path_t arm_simd_fast_paths[] =
+{
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, armv6_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, armv6_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, armv6_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888),
+
+ PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8_8),
+
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, armv6_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, armv6_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
+
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),
+
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, armv6_8888_8888),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, armv6_8888_8888),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, armv6_8888_8888),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, armv6_8888_8888),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, armv6_8888_8888),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, armv6_8888_8888),
+
+ { PIXMAN_OP_NONE },
+};
+
+pixman_implementation_t *
+_pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
+{
+ pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths);
+
+ return imp;
+}
|