From 4218fd3a8c41e4d9715cbcd6d855736028835fb8 Mon Sep 17 00:00:00 2001 From: marha Date: Mon, 6 Dec 2010 08:03:07 +0000 Subject: libXext xserver pixman git update 6-12-2010 --- pixman/pixman/pixman-arm-common.h | 15 +- pixman/pixman/pixman-arm-neon-asm.S | 473 +++++++++++++++++++++++++++++------- pixman/pixman/pixman-arm-neon-asm.h | 5 + pixman/pixman/pixman-arm-neon.c | 42 +++- pixman/pixman/pixman-arm-simd.c | 4 +- 5 files changed, 430 insertions(+), 109 deletions(-) (limited to 'pixman') diff --git a/pixman/pixman/pixman-arm-common.h b/pixman/pixman/pixman-arm-common.h index 08f34cc20..ede63a629 100644 --- a/pixman/pixman/pixman-arm-common.h +++ b/pixman/pixman/pixman-arm-common.h @@ -47,6 +47,9 @@ * or mask), the corresponding stride argument is unused. */ +#define SKIP_ZERO_SRC 1 +#define SKIP_ZERO_MASK 2 + #define PIXMAN_ARM_BIND_FAST_PATH_SRC_DST(cputype, name, \ src_type, src_cnt, \ dst_type, dst_cnt) \ @@ -87,7 +90,7 @@ cputype##_composite_##name (pixman_implementation_t *imp, \ src_line, src_stride); \ } -#define PIXMAN_ARM_BIND_FAST_PATH_N_DST(cputype, name, \ +#define PIXMAN_ARM_BIND_FAST_PATH_N_DST(flags, cputype, name, \ dst_type, dst_cnt) \ void \ pixman_composite_##name##_asm_##cputype (int32_t w, \ @@ -117,7 +120,7 @@ cputype##_composite_##name (pixman_implementation_t *imp, \ \ src = _pixman_image_get_solid (src_image, dst_image->bits.format); \ \ - if (src == 0) \ + if ((flags & SKIP_ZERO_SRC) && src == 0) \ return; \ \ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, dst_type, \ @@ -128,7 +131,7 @@ cputype##_composite_##name (pixman_implementation_t *imp, \ src); \ } -#define PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST(cputype, name, \ +#define PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST(flags, cputype, name, \ mask_type, mask_cnt, \ dst_type, dst_cnt) \ void \ @@ -163,7 +166,7 @@ cputype##_composite_##name (pixman_implementation_t *imp, \ \ src = _pixman_image_get_solid (src_image, dst_image->bits.format); \ \ - if (src == 0) \ + if ((flags & SKIP_ZERO_SRC) && src == 0) \ return; \ \ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, dst_type, \ @@ -177,7 +180,7 @@ cputype##_composite_##name (pixman_implementation_t *imp, \ mask_line, mask_stride); \ } -#define PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST(cputype, name, \ +#define PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST(flags, cputype, name, \ src_type, src_cnt, \ dst_type, dst_cnt) \ void \ @@ -211,7 +214,7 @@ cputype##_composite_##name (pixman_implementation_t *imp, \ \ mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);\ \ - if (mask == 0) \ + if ((flags & SKIP_ZERO_MASK) && mask == 0) \ return; \ \ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, dst_type, \ diff --git a/pixman/pixman/pixman-arm-neon-asm.S b/pixman/pixman/pixman-arm-neon-asm.S index 0229bedfa..b6e7c366b 100644 --- a/pixman/pixman/pixman-arm-neon-asm.S +++ b/pixman/pixman/pixman-arm-neon-asm.S @@ -791,61 +791,112 @@ generate_composite_function \ /******************************************************************************/ -.macro pixman_composite_over_n_8_0565_process_pixblock_head - /* in */ - vmull.u8 q0, d24, d8 - vmull.u8 q1, d24, d9 - vmull.u8 q6, d24, d10 - vmull.u8 q7, d24, d11 - vrshr.u16 q10, q0, #8 - vrshr.u16 q11, q1, #8 - vrshr.u16 q12, q6, #8 - vrshr.u16 q13, q7, #8 - vraddhn.u16 d0, q0, q10 - vraddhn.u16 d1, q1, q11 - vraddhn.u16 d2, q6, q12 - vraddhn.u16 d3, q7, q13 - - vshrn.u16 d6, q2, #8 - vshrn.u16 d7, q2, #3 - vsli.u16 q2, q2, #5 - vsri.u8 d6, d6, #5 - vmvn.8 d3, d3 - vsri.u8 d7, d7, #6 - vshrn.u16 d30, q2, #2 - /* now do alpha 
blending */ - vmull.u8 q10, d3, d6 - vmull.u8 q11, d3, d7 - vmull.u8 q12, d3, d30 - vrshr.u16 q13, q10, #8 - vrshr.u16 q3, q11, #8 - vrshr.u16 q15, q12, #8 - vraddhn.u16 d20, q10, q13 - vraddhn.u16 d23, q11, q3 - vraddhn.u16 d22, q12, q15 -.endm - -.macro pixman_composite_over_n_8_0565_process_pixblock_tail - vqadd.u8 d16, d2, d20 - vqadd.u8 q9, q0, q11 - /* convert to r5g6b5 */ - vshll.u8 q14, d16, #8 - vshll.u8 q8, d19, #8 - vshll.u8 q9, d18, #8 - vsri.u16 q14, q8, #5 - vsri.u16 q14, q9, #11 +.macro pixman_composite_over_8888_8_0565_process_pixblock_head + vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */ + vmull.u8 q1, d24, d9 + vmull.u8 q6, d24, d10 + vmull.u8 q7, d24, d11 + vshrn.u16 d6, q2, #8 /* convert DST_R data to 32-bpp (part1) */ + vshrn.u16 d7, q2, #3 + vsli.u16 q2, q2, #5 + vrshr.u16 q8, q0, #8 /* IN for SRC pixels (part2) */ + vrshr.u16 q9, q1, #8 + vrshr.u16 q10, q6, #8 + vrshr.u16 q11, q7, #8 + vraddhn.u16 d0, q0, q8 + vraddhn.u16 d1, q1, q9 + vraddhn.u16 d2, q6, q10 + vraddhn.u16 d3, q7, q11 + vsri.u8 d6, d6, #5 /* convert DST_R data to 32-bpp (part2) */ + vsri.u8 d7, d7, #6 + vmvn.8 d3, d3 + vshrn.u16 d30, q2, #2 + vmull.u8 q8, d3, d6 /* now do alpha blending */ + vmull.u8 q9, d3, d7 + vmull.u8 q10, d3, d30 +.endm + +.macro pixman_composite_over_8888_8_0565_process_pixblock_tail + /* 3 cycle bubble (after vmull.u8) */ + vrshr.u16 q13, q8, #8 + vrshr.u16 q11, q9, #8 + vrshr.u16 q15, q10, #8 + vraddhn.u16 d16, q8, q13 + vraddhn.u16 d27, q9, q11 + vraddhn.u16 d26, q10, q15 + vqadd.u8 d16, d2, d16 + /* 1 cycle bubble */ + vqadd.u8 q9, q0, q13 + vshll.u8 q14, d16, #8 /* convert to 16bpp */ + vshll.u8 q8, d19, #8 + vshll.u8 q9, d18, #8 + vsri.u16 q14, q8, #5 + /* 1 cycle bubble */ + vsri.u16 q14, q9, #11 .endm -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_n_8_0565_process_pixblock_tail_head - pixman_composite_over_n_8_0565_process_pixblock_tail - vst1.16 {d28, d29}, [DST_W, :128]! +.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head vld1.16 {d4, d5}, [DST_R, :128]! - vld1.8 {d24}, [MASK]! + vshrn.u16 d6, q2, #8 + fetch_mask_pixblock + vshrn.u16 d7, q2, #3 + fetch_src_pixblock + vmull.u8 q6, d24, d10 + vrshr.u16 q13, q8, #8 + vrshr.u16 q11, q9, #8 + vrshr.u16 q15, q10, #8 + vraddhn.u16 d16, q8, q13 + vraddhn.u16 d27, q9, q11 + vraddhn.u16 d26, q10, q15 + vqadd.u8 d16, d2, d16 + vmull.u8 q1, d24, d9 + vqadd.u8 q9, q0, q13 + vshll.u8 q14, d16, #8 + vmull.u8 q0, d24, d8 + vshll.u8 q8, d19, #8 + vshll.u8 q9, d18, #8 + vsri.u16 q14, q8, #5 + vmull.u8 q7, d24, d11 + vsri.u16 q14, q9, #11 + cache_preload 8, 8 - pixman_composite_over_n_8_0565_process_pixblock_head + + vsli.u16 q2, q2, #5 + vrshr.u16 q8, q0, #8 + vrshr.u16 q9, q1, #8 + vrshr.u16 q10, q6, #8 + vrshr.u16 q11, q7, #8 + vraddhn.u16 d0, q0, q8 + vraddhn.u16 d1, q1, q9 + vraddhn.u16 d2, q6, q10 + vraddhn.u16 d3, q7, q11 + vsri.u8 d6, d6, #5 + vsri.u8 d7, d7, #6 + vmvn.8 d3, d3 + vshrn.u16 d30, q2, #2 + vst1.16 {d28, d29}, [DST_W, :128]! 
+ vmull.u8 q8, d3, d6 + vmull.u8 q9, d3, d7 + vmull.u8 q10, d3, d30 .endm +generate_composite_function \ + pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_over_8888_8_0565_process_pixblock_head, \ + pixman_composite_over_8888_8_0565_process_pixblock_tail, \ + pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 8, /* src_basereg */ \ + 24 /* mask_basereg */ + +/******************************************************************************/ + /* * This function needs a special initialization of solid mask. * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET @@ -877,32 +928,32 @@ generate_composite_function \ 5, /* prefetch distance */ \ pixman_composite_over_n_8_0565_init, \ pixman_composite_over_n_8_0565_cleanup, \ - pixman_composite_over_n_8_0565_process_pixblock_head, \ - pixman_composite_over_n_8_0565_process_pixblock_tail, \ - pixman_composite_over_n_8_0565_process_pixblock_tail_head + pixman_composite_over_8888_8_0565_process_pixblock_head, \ + pixman_composite_over_8888_8_0565_process_pixblock_tail, \ + pixman_composite_over_8888_8_0565_process_pixblock_tail_head /******************************************************************************/ -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head - vld1.16 {d4, d5}, [DST_R, :128]! - pixman_composite_over_n_8_0565_process_pixblock_tail - fetch_src_pixblock - cache_preload 8, 8 - vld1.8 {d24}, [MASK]! - pixman_composite_over_n_8_0565_process_pixblock_head - vst1.16 {d28, d29}, [DST_W, :128]! +.macro pixman_composite_over_8888_n_0565_init + add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) + vpush {d8-d15} + vld1.32 {d24[0]}, [DUMMY] + vdup.8 d24, d24[3] +.endm + +.macro pixman_composite_over_8888_n_0565_cleanup + vpop {d8-d15} .endm generate_composite_function \ - pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \ + pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 8, /* number of pixels, processed in a single block */ \ 5, /* prefetch distance */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_over_n_8_0565_process_pixblock_head, \ - pixman_composite_over_n_8_0565_process_pixblock_tail, \ + pixman_composite_over_8888_n_0565_init, \ + pixman_composite_over_8888_n_0565_cleanup, \ + pixman_composite_over_8888_8_0565_process_pixblock_head, \ + pixman_composite_over_8888_8_0565_process_pixblock_tail, \ pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ 28, /* dst_w_basereg */ \ 4, /* dst_r_basereg */ \ @@ -1171,7 +1222,7 @@ generate_composite_function \ pixman_composite_over_n_8_8888_process_pixblock_tail vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - vld1.8 {d24}, [MASK]! 
+ fetch_mask_pixblock cache_preload 8, 8 pixman_composite_over_n_8_8888_process_pixblock_head .endm @@ -1203,6 +1254,74 @@ generate_composite_function \ /******************************************************************************/ +.macro pixman_composite_over_n_8_8_process_pixblock_head + vmull.u8 q0, d24, d8 + vmull.u8 q1, d25, d8 + vmull.u8 q6, d26, d8 + vmull.u8 q7, d27, d8 + vrshr.u16 q10, q0, #8 + vrshr.u16 q11, q1, #8 + vrshr.u16 q12, q6, #8 + vrshr.u16 q13, q7, #8 + vraddhn.u16 d0, q0, q10 + vraddhn.u16 d1, q1, q11 + vraddhn.u16 d2, q6, q12 + vraddhn.u16 d3, q7, q13 + vmvn.8 q12, q0 + vmvn.8 q13, q1 + vmull.u8 q8, d24, d4 + vmull.u8 q9, d25, d5 + vmull.u8 q10, d26, d6 + vmull.u8 q11, d27, d7 +.endm + +.macro pixman_composite_over_n_8_8_process_pixblock_tail + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_n_8_8_process_pixblock_tail_head + vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! + pixman_composite_over_n_8_8_process_pixblock_tail + fetch_mask_pixblock + cache_preload 32, 32 + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! + pixman_composite_over_n_8_8_process_pixblock_head +.endm + +.macro pixman_composite_over_n_8_8_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vpush {d8-d15} + vld1.32 {d8[0]}, [DUMMY] + vdup.8 d8, d8[3] +.endm + +.macro pixman_composite_over_n_8_8_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \ + FLAG_DST_READWRITE, \ + 32, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_n_8_8_init, \ + pixman_composite_over_n_8_8_cleanup, \ + pixman_composite_over_n_8_8_process_pixblock_head, \ + pixman_composite_over_n_8_8_process_pixblock_tail, \ + pixman_composite_over_n_8_8_process_pixblock_tail_head + +/******************************************************************************/ + .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head /* * 'combine_mask_ca' replacement @@ -1273,7 +1392,7 @@ generate_composite_function \ vraddhn.u16 d29, q15, q9 vraddhn.u16 d30, q6, q10 vraddhn.u16 d31, q7, q11 - vld4.8 {d24, d25, d26, d27}, [MASK]! + fetch_mask_pixblock vqadd.u8 q14, q0, q14 vqadd.u8 q15, q1, q15 cache_preload 8, 8 @@ -1308,6 +1427,58 @@ generate_composite_function \ /******************************************************************************/ +.macro pixman_composite_in_n_8_process_pixblock_head + /* expecting source data in {d0, d1, d2, d3} */ + /* and destination data in {d4, d5, d6, d7} */ + vmull.u8 q8, d4, d3 + vmull.u8 q9, d5, d3 + vmull.u8 q10, d6, d3 + vmull.u8 q11, d7, d3 +.endm + +.macro pixman_composite_in_n_8_process_pixblock_tail + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + vraddhn.u16 d28, q8, q14 + vraddhn.u16 d29, q9, q15 + vraddhn.u16 d30, q10, q12 + vraddhn.u16 d31, q11, q13 +.endm + +.macro pixman_composite_in_n_8_process_pixblock_tail_head + pixman_composite_in_n_8_process_pixblock_tail + vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! + cache_preload 32, 32 + pixman_composite_in_n_8_process_pixblock_head + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! 
+.endm + +.macro pixman_composite_in_n_8_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d3[0]}, [DUMMY] + vdup.8 d3, d3[3] +.endm + +.macro pixman_composite_in_n_8_cleanup +.endm + +generate_composite_function \ + pixman_composite_in_n_8_asm_neon, 0, 0, 8, \ + FLAG_DST_READWRITE, \ + 32, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_in_n_8_init, \ + pixman_composite_in_n_8_cleanup, \ + pixman_composite_in_n_8_process_pixblock_head, \ + pixman_composite_in_n_8_process_pixblock_tail, \ + pixman_composite_in_n_8_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 24 /* mask_basereg */ + .macro pixman_composite_add_n_8_8_process_pixblock_head /* expecting source data in {d8, d9, d10, d11} */ /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ @@ -1337,7 +1508,7 @@ generate_composite_function \ pixman_composite_add_n_8_8_process_pixblock_tail vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! - vld1.8 {d24, d25, d26, d27}, [MASK]! + fetch_mask_pixblock cache_preload 32, 32 pixman_composite_add_n_8_8_process_pixblock_head .endm @@ -1394,7 +1565,7 @@ generate_composite_function \ pixman_composite_add_8_8_8_process_pixblock_tail vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! - vld1.8 {d24, d25, d26, d27}, [MASK]! + fetch_mask_pixblock fetch_src_pixblock cache_preload 32, 32 pixman_composite_add_8_8_8_process_pixblock_head @@ -1423,34 +1594,50 @@ generate_composite_function \ /* expecting source data in {d0, d1, d2, d3} */ /* destination data in {d4, d5, d6, d7} */ /* mask in {d24, d25, d26, d27} */ - vmull.u8 q8, d27, d0 - vmull.u8 q9, d27, d1 + vmull.u8 q8, d27, d0 + vmull.u8 q9, d27, d1 vmull.u8 q10, d27, d2 vmull.u8 q11, d27, d3 - vrshr.u16 q0, q8, #8 - vrshr.u16 q1, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - vraddhn.u16 d0, q0, q8 - vraddhn.u16 d1, q1, q9 - vraddhn.u16 d2, q12, q10 - vraddhn.u16 d3, q13, q11 - vqadd.u8 q14, q0, q2 - vqadd.u8 q15, q1, q3 + /* 1 cycle bubble */ + vrsra.u16 q8, q8, #8 + vrsra.u16 q9, q9, #8 + vrsra.u16 q10, q10, #8 + vrsra.u16 q11, q11, #8 .endm .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail + /* 2 cycle bubble */ + vrshrn.u16 d28, q8, #8 + vrshrn.u16 d29, q9, #8 + vrshrn.u16 d30, q10, #8 + vrshrn.u16 d31, q11, #8 + vqadd.u8 q14, q2, q14 + /* 1 cycle bubble */ + vqadd.u8 q15, q3, q15 .endm -/* TODO: expand macros and do better instructions scheduling */ .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head - pixman_composite_add_8888_8888_8888_process_pixblock_tail - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - vld4.8 {d24, d25, d26, d27}, [MASK]! fetch_src_pixblock + vrshrn.u16 d28, q8, #8 + fetch_mask_pixblock + vrshrn.u16 d29, q9, #8 + vmull.u8 q8, d27, d0 + vrshrn.u16 d30, q10, #8 + vmull.u8 q9, d27, d1 + vrshrn.u16 d31, q11, #8 + vmull.u8 q10, d27, d2 + vqadd.u8 q14, q2, q14 + vmull.u8 q11, d27, d3 + vqadd.u8 q15, q3, q15 + vrsra.u16 q8, q8, #8 + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + vrsra.u16 q9, q9, #8 + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
+ vrsra.u16 q10, q10, #8 + cache_preload 8, 8 - pixman_composite_add_8888_8888_8888_process_pixblock_head + + vrsra.u16 q11, q11, #8 .endm generate_composite_function \ @@ -1476,6 +1663,78 @@ generate_composite_function_single_scanline \ /******************************************************************************/ +generate_composite_function \ + pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_add_8888_8888_8888_process_pixblock_head, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 27 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_add_n_8_8888_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d3[0]}, [DUMMY] + vdup.8 d0, d3[0] + vdup.8 d1, d3[1] + vdup.8 d2, d3[2] + vdup.8 d3, d3[3] +.endm + +.macro pixman_composite_add_n_8_8888_cleanup +.endm + +generate_composite_function \ + pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_add_n_8_8888_init, \ + pixman_composite_add_n_8_8888_cleanup, \ + pixman_composite_add_8888_8888_8888_process_pixblock_head, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 27 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_add_8888_n_8888_init + add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) + vld1.32 {d27[0]}, [DUMMY] + vdup.8 d27, d27[3] +.endm + +.macro pixman_composite_add_8888_n_8888_cleanup +.endm + +generate_composite_function \ + pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_add_8888_n_8888_init, \ + pixman_composite_add_8888_n_8888_cleanup, \ + pixman_composite_add_8888_8888_8888_process_pixblock_head, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 27 /* mask_basereg */ + +/******************************************************************************/ + .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head /* expecting source data in {d0, d1, d2, d3} */ /* destination data in {d4, d5, d6, d7} */ @@ -1519,7 +1778,7 @@ generate_composite_function_single_scanline \ pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail fetch_src_pixblock cache_preload 8, 8 - vld4.8 {d12, d13, d14, d15}, [MASK]! + fetch_mask_pixblock pixman_composite_out_reverse_8888_n_8888_process_pixblock_head vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! .endm @@ -1590,7 +1849,7 @@ generate_composite_function \ pixman_composite_over_8888_n_8888_process_pixblock_tail fetch_src_pixblock cache_preload 8, 8 - vld4.8 {d12, d13, d14, d15}, [MASK]! 
+ fetch_mask_pixblock pixman_composite_over_8888_n_8888_process_pixblock_head vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! .endm @@ -1632,7 +1891,7 @@ generate_composite_function_single_scanline \ pixman_composite_over_8888_n_8888_process_pixblock_tail fetch_src_pixblock cache_preload 8, 8 - vld1.8 {d15}, [MASK]! + fetch_mask_pixblock pixman_composite_over_8888_n_8888_process_pixblock_head vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! .endm @@ -1849,7 +2108,7 @@ generate_composite_function \ /* TODO: expand macros and do better instructions scheduling */ .macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head - vld1.8 {d15}, [MASK]! + fetch_mask_pixblock pixman_composite_over_0565_8_0565_process_pixblock_tail fetch_src_pixblock vld1.16 {d10, d11}, [DST_R, :128]! @@ -1875,6 +2134,34 @@ generate_composite_function \ /******************************************************************************/ +.macro pixman_composite_over_0565_n_0565_init + add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) + vpush {d8-d15} + vld1.32 {d15[0]}, [DUMMY] + vdup.8 d15, d15[3] +.endm + +.macro pixman_composite_over_0565_n_0565_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \ + FLAG_DST_READWRITE, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_0565_n_0565_init, \ + pixman_composite_over_0565_n_0565_cleanup, \ + pixman_composite_over_0565_8_0565_process_pixblock_head, \ + pixman_composite_over_0565_8_0565_process_pixblock_tail, \ + pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 10, /* dst_r_basereg */ \ + 8, /* src_basereg */ \ + 15 /* mask_basereg */ + +/******************************************************************************/ + .macro pixman_composite_add_0565_8_0565_process_pixblock_head /* mask is in d15 */ convert_0565_to_x888 q4, d2, d1, d0 @@ -1901,7 +2188,7 @@ generate_composite_function \ /* TODO: expand macros and do better instructions scheduling */ .macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head - vld1.8 {d15}, [MASK]! + fetch_mask_pixblock pixman_composite_add_0565_8_0565_process_pixblock_tail fetch_src_pixblock vld1.16 {d10, d11}, [DST_R, :128]! diff --git a/pixman/pixman/pixman-arm-neon-asm.h b/pixman/pixman/pixman-arm-neon-asm.h index 1d8a31c1e..6e3d583f5 100644 --- a/pixman/pixman/pixman-arm-neon-asm.h +++ b/pixman/pixman/pixman-arm-neon-asm.h @@ -431,6 +431,11 @@ .endif .endm +.macro fetch_mask_pixblock + pixld pixblock_size, mask_bpp, \ + (mask_basereg - pixblock_size * mask_bpp / 64), MASK +.endm + /* * Macro which is used to process leading pixels until destination * pointer is properly aligned (at 16 bytes boundary). 
When destination diff --git a/pixman/pixman/pixman-arm-neon.c b/pixman/pixman/pixman-arm-neon.c index 858bb071e..9e879382b 100644 --- a/pixman/pixman/pixman-arm-neon.c +++ b/pixman/pixman/pixman-arm-neon.c @@ -63,29 +63,43 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, over_8888_8888, PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, out_reverse_8_0565, uint8_t, 1, uint16_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_N_DST (neon, over_n_0565, +PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_n_0565, uint16_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_N_DST (neon, over_n_8888, +PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_n_8888, uint32_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_N_DST (neon, over_reverse_n_8888, +PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_reverse_n_8888, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_N_DST (0, neon, in_n_8, + uint8_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, over_n_8_0565, +PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_0565, uint8_t, 1, uint16_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, over_n_8_8888, +PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_8888, uint8_t, 1, uint32_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, over_n_8888_8888_ca, +PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8888_8888_ca, uint32_t, 1, uint32_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, add_n_8_8, +PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_8, uint8_t, 1, uint8_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8, + uint8_t, 1, uint8_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888, + uint8_t, 1, uint32_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (neon, over_8888_n_8888, +PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888, + uint32_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_0565, + uint32_t, 1, uint16_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_0565_n_0565, + uint16_t, 1, uint16_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, add_8888_n_8888, uint32_t, 1, uint32_t, 1) PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8_8_8, uint8_t, 1, uint8_t, 1, uint8_t, 1) PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_0565_8_0565, uint16_t, 1, uint8_t, 1, uint16_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8888_8_8888, + uint32_t, 1, uint8_t, 1, uint32_t, 1) PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8888_8888_8888, uint32_t, 1, uint32_t, 1, uint32_t, 1) PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8_8888, @@ -235,6 +249,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] = PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, x8r8g8b8, neon_composite_src_0888_8888_rev), PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, r5g6b5, neon_composite_src_0888_0565_rev), PIXMAN_STD_FAST_PATH (SRC, pixbuf, pixbuf, a8r8g8b8, neon_composite_src_pixbuf_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8, neon_composite_over_n_8_8), PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, neon_composite_over_n_8_0565), PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, neon_composite_over_n_8_0565), PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, neon_composite_over_n_8_8888), @@ -250,6 +265,10 @@ static const pixman_fast_path_t arm_neon_fast_paths[] = PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, neon_composite_over_n_8888_8888_ca), PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, neon_composite_over_8888_n_8888), 
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, neon_composite_over_8888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, r5g6b5, neon_composite_over_8888_n_0565), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, b5g6r5, neon_composite_over_8888_n_0565), + PIXMAN_STD_FAST_PATH (OVER, r5g6b5, solid, r5g6b5, neon_composite_over_0565_n_0565), + PIXMAN_STD_FAST_PATH (OVER, b5g6r5, solid, b5g6r5, neon_composite_over_0565_n_0565), PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, neon_composite_over_8888_8_8888), PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, neon_composite_over_8888_8_8888), PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, neon_composite_over_8888_8_8888), @@ -268,13 +287,20 @@ static const pixman_fast_path_t arm_neon_fast_paths[] = PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, a8r8g8b8, neon_composite_src_x888_8888), PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, a8b8g8r8, neon_composite_src_x888_8888), PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, neon_composite_add_n_8_8), + PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, neon_composite_add_n_8_8888), + PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, neon_composite_add_n_8_8888), PIXMAN_STD_FAST_PATH (ADD, a8, a8, a8, neon_composite_add_8_8_8), PIXMAN_STD_FAST_PATH (ADD, r5g6b5, a8, r5g6b5, neon_composite_add_0565_8_0565), PIXMAN_STD_FAST_PATH (ADD, b5g6r5, a8, b5g6r5, neon_composite_add_0565_8_0565), + PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8, a8r8g8b8, neon_composite_add_8888_8_8888), + PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, a8, a8b8g8r8, neon_composite_add_8888_8_8888), PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_add_8888_8888_8888), + PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, solid, a8r8g8b8, neon_composite_add_8888_n_8888), + PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, solid, a8b8g8r8, neon_composite_add_8888_n_8888), PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, neon_composite_add_8_8), PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, neon_composite_add_8888_8888), PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, neon_composite_add_8888_8888), + PIXMAN_STD_FAST_PATH (IN, solid, null, a8, neon_composite_in_n_8), PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, neon_composite_over_reverse_n_8888), PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, neon_composite_over_reverse_n_8888), PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, r5g6b5, neon_composite_out_reverse_8_0565), diff --git a/pixman/pixman/pixman-arm-simd.c b/pixman/pixman/pixman-arm-simd.c index 7f789ceab..e03dfe020 100644 --- a/pixman/pixman/pixman-arm-simd.c +++ b/pixman/pixman/pixman-arm-simd.c @@ -381,10 +381,10 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8, PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888, uint32_t, 1, uint32_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (armv6, over_8888_n_8888, +PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888, uint32_t, 1, uint32_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (armv6, over_n_8_8888, +PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888, uint8_t, 1, uint32_t, 1) PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC, -- cgit v1.2.3
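
A note on the new SKIP_ZERO_SRC / SKIP_ZERO_MASK flags in
pixman-arm-common.h: the zero-solid early return used to be
unconditional in the binding macros, which is only safe for operators
where a transparent solid makes the whole composite a no-op. For IN it
is not: a zero source must still clear the destination, which is why
in_n_8 is bound with flags = 0 while the over_* and add_* bindings pass
SKIP_ZERO_SRC or SKIP_ZERO_MASK. A minimal C sketch of the gating as
the macros now expand it (get_solid and run_asm_loop are illustrative
stand-ins, not the real pixman helpers):

    #include <stdint.h>

    #define SKIP_ZERO_SRC  1
    #define SKIP_ZERO_MASK 2

    typedef struct image image_t;           /* stand-in type */
    uint32_t get_solid (image_t *image);    /* hypothetical helper */
    void run_asm_loop (image_t *dst, uint32_t src);  /* hypothetical */

    /* Schematic body of a PIXMAN_ARM_BIND_FAST_PATH_N_DST (flags, ...)
     * expansion: the early return now fires only when the binding
     * explicitly asked for it. */
    void
    composite_n_dst (int flags, image_t *src_image, image_t *dst_image)
    {
        uint32_t src = get_solid (src_image);

        if ((flags & SKIP_ZERO_SRC) && src == 0)
            return;

        run_asm_loop (dst_image, src);
    }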
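
The per-channel multiplies in the new head/tail macros use pixman's
usual rounded x*a/255: vmull.u8 followed by vrshr.u16 #8 and
vraddhn.u16, or, in the rescheduled add_8888_8888_8888 blocks,
vrsra.u16 folding the rounded high byte back into the product with a
final vrshrn.u16 #8 in the tail. Both instruction sequences compute the
same value; a scalar sketch with an exhaustive self-check:

    #include <assert.h>
    #include <stdint.h>

    /* With t = a*b + 128, (t + (t >> 8)) >> 8 is a*b/255 rounded to
     * nearest; this is what vrshr+vraddhn (and vrsra+vrshrn) do. */
    static uint8_t
    mul_un8 (uint8_t a, uint8_t b)
    {
        uint32_t t = (uint32_t) a * b + 128;
        return (uint8_t) ((t + (t >> 8)) >> 8);
    }

    int
    main (void)
    {
        int a, b;

        for (a = 0; a < 256; a++)
            for (b = 0; b < 256; b++)
                assert (mul_un8 (a, b) == (a * b + 127) / 255);
        return 0;
    }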
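
Scalar per-pixel models of the OVER variants touched here, assuming
premultiplied channels and the usual r5g6b5 layout. The NEON code does
the same math eight (or thirty-two) pixels at a time: r5g6b5 is widened
to 8 bits per channel by bit replication (the vshrn/vsli/vsri group)
and the final adds saturate (vqadd.u8). Function names below are mine,
not pixman API:

    #include <stdint.h>
    #include <stdio.h>

    static uint8_t
    mul_un8 (uint8_t a, uint8_t b)      /* rounding helper, as above */
    {
        uint32_t t = (uint32_t) a * b + 128;
        return (uint8_t) ((t + (t >> 8)) >> 8);
    }

    static uint8_t
    sat_add_u8 (uint8_t a, uint8_t b)   /* vqadd.u8 */
    {
        int t = a + b;
        return (uint8_t) (t > 255 ? 255 : t);
    }

    /* over_n_8_8: OVER restricted to the alpha channel; the solid
     * source is reduced to its alpha byte by the init macro. */
    static uint8_t
    over_n_8_8_pixel (uint8_t src_a, uint8_t mask, uint8_t dst)
    {
        uint8_t in = mul_un8 (src_a, mask);
        return sat_add_u8 (in, mul_un8 (dst, 255 - in));
    }

    /* over_8888_8_0565: source IN mask, dest widened, OVER, repack. */
    static uint16_t
    over_8888_8_0565_pixel (uint32_t src, uint8_t mask, uint16_t dst)
    {
        uint8_t sa = mul_un8 (src >> 24,          mask);
        uint8_t sr = mul_un8 ((src >> 16) & 0xff, mask);
        uint8_t sg = mul_un8 ((src >>  8) & 0xff, mask);
        uint8_t sb = mul_un8 (src         & 0xff, mask);

        /* widen r5g6b5 by replicating top bits into the low bits */
        uint8_t dr = ((dst >> 8) & 0xf8) | (dst >> 13);
        uint8_t dg = ((dst >> 3) & 0xfc) | ((dst >> 9) & 0x03);
        uint8_t db = ((dst << 3) & 0xf8) | ((dst >> 2) & 0x07);

        dr = sat_add_u8 (sr, mul_un8 (dr, 255 - sa));
        dg = sat_add_u8 (sg, mul_un8 (dg, 255 - sa));
        db = sat_add_u8 (sb, mul_un8 (db, 255 - sa));

        return (uint16_t) (((dr & 0xf8) << 8) |
                           ((dg & 0xfc) << 3) | (db >> 3));
    }

    int
    main (void)
    {
        /* white at half coverage over mid-grey: prints bdf7 and a0 */
        printf ("%04x\n", over_8888_8_0565_pixel (0xffffffff, 0x80, 0x7bef));
        printf ("%02x\n", over_n_8_8_pixel (0xff, 0x80, 0x40));
        return 0;
    }

over_8888_n_0565 and over_0565_n_0565 compute the same thing while
reusing the pixblock macros of their _8_ counterparts; their init
macros just splat the solid mask's alpha byte into the mask register
(the vdup.8 dN, dN[3] in the new init macros).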
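
The three new masked ADD paths likewise share one pixblock
implementation and differ only in initialization: add_8888_8_8888 reads
an a8 mask per pixel, add_8888_n_8888 splats a solid mask's alpha, and
add_n_8_8888 splats the four channels of a solid source. Per pixel they
all reduce to the sketch below (premultiplied data assumed; the model
name is mine):

    #include <stdint.h>
    #include <stdio.h>

    static uint8_t
    mul_un8 (uint8_t a, uint8_t b)      /* rounding helper, as above */
    {
        uint32_t t = (uint32_t) a * b + 128;
        return (uint8_t) ((t + (t >> 8)) >> 8);
    }

    /* Every source channel is scaled by the mask alpha, then added to
     * the destination channel with unsigned saturation (vqadd.u8). */
    static uint32_t
    add_8888_masked_pixel (uint32_t src, uint8_t mask_a, uint32_t dst)
    {
        uint32_t out = 0;
        int shift;

        for (shift = 0; shift < 32; shift += 8)
        {
            int s = mul_un8 ((src >> shift) & 0xff, mask_a);
            int d = (dst >> shift) & 0xff;
            int t = s + d;

            out |= (uint32_t) (t > 255 ? 255 : t) << shift;
        }
        return out;
    }

    int
    main (void)
    {
        /* prints ffffe0d0: alpha and red saturate */
        printf ("%08x\n",
                add_8888_masked_pixel (0x80402010, 0xff, 0xc0c0c0c0));
        return 0;
    }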