author     marha <marha@users.sourceforge.net>  2011-02-16 16:53:37 +0000
committer  marha <marha@users.sourceforge.net>  2011-02-16 16:53:37 +0000
commit     48d0dcbd5b7f80810ce259bc9ed6f57f99e27ca9 (patch)
tree       de0c183abc0d1a958dc04cdf75f56d20db3f45e8 /pixman
parent     6c2a4cdbdb1024e7069b1c51bb046d64750e30a5 (diff)
pixman mesa git update 16 Feb 2011
Diffstat (limited to 'pixman')
-rw-r--r--  pixman/Makefile.am | 2
-rw-r--r--  pixman/configure.ac | 1
-rw-r--r--  pixman/demos/Makefile.am | 34
-rw-r--r--  pixman/demos/alpha-test.c (renamed from pixman/test/alpha-test.c) | 0
-rw-r--r--  pixman/demos/clip-in.c (renamed from pixman/test/clip-in.c) | 0
-rw-r--r--  pixman/demos/clip-test.c (renamed from pixman/test/clip-test.c) | 0
-rw-r--r--  pixman/demos/composite-test.c (renamed from pixman/test/composite-test.c) | 0
-rw-r--r--  pixman/demos/convolution-test.c (renamed from pixman/test/convolution-test.c) | 0
-rw-r--r--  pixman/demos/gradient-test.c (renamed from pixman/test/gradient-test.c) | 0
-rw-r--r--  pixman/demos/gtk-utils.c (renamed from pixman/test/gtk-utils.c) | 0
-rw-r--r--  pixman/demos/gtk-utils.h (renamed from pixman/test/gtk-utils.h) | 0
-rw-r--r--  pixman/demos/radial-test.c (renamed from pixman/test/radial-test.c) | 2
-rw-r--r--  pixman/demos/screen-test.c (renamed from pixman/test/screen-test.c) | 0
-rw-r--r--  pixman/demos/trap-test.c (renamed from pixman/test/trap-test.c) | 0
-rw-r--r--  pixman/pixman/pixman-arm-common.h | 59
-rw-r--r--  pixman/pixman/pixman-arm-neon-asm.S | 4758
-rw-r--r--  pixman/pixman/pixman-arm-neon.c | 11
-rw-r--r--  pixman/pixman/pixman-fast-path.c | 4455
-rw-r--r--  pixman/pixman/pixman-fast-path.h | 1039
-rw-r--r--  pixman/pixman/pixman-sse2.c | 124
-rw-r--r--  pixman/test/Makefile.am | 104
-rw-r--r--  pixman/test/scaling-test.c | 618
22 files changed, 5805 insertions, 5402 deletions
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index 63b08c1fb..062c58a89 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -1,4 +1,4 @@
-SUBDIRS = pixman test
+SUBDIRS = pixman test demos
pkgconfigdir=$(libdir)/pkgconfig
pkgconfig_DATA=pixman-1.pc
diff --git a/pixman/configure.ac b/pixman/configure.ac
index ab2ecde1b..5242799bb 100644
--- a/pixman/configure.ac
+++ b/pixman/configure.ac
@@ -794,6 +794,7 @@ AC_OUTPUT([pixman-1.pc
Makefile
pixman/Makefile
pixman/pixman-version.h
+ demos/Makefile
test/Makefile])
m4_if(m4_eval(pixman_minor % 2), [1], [
diff --git a/pixman/demos/Makefile.am b/pixman/demos/Makefile.am
new file mode 100644
index 000000000..2dcdfd350
--- /dev/null
+++ b/pixman/demos/Makefile.am
@@ -0,0 +1,34 @@
+if HAVE_GTK
+
+AM_CFLAGS = @OPENMP_CFLAGS@
+AM_LDFLAGS = @OPENMP_CFLAGS@
+
+LDADD = $(GTK_LIBS) $(top_builddir)/pixman/libpixman-1.la -lm
+INCLUDES = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman $(GTK_CFLAGS)
+
+GTK_UTILS = gtk-utils.c gtk-utils.h
+
+DEMOS = \
+ clip-test \
+ clip-in \
+ composite-test \
+ gradient-test \
+ radial-test \
+ alpha-test \
+ screen-test \
+ convolution-test \
+ trap-test
+
+gradient_test_SOURCES = gradient-test.c $(GTK_UTILS)
+alpha_test_SOURCES = alpha-test.c $(GTK_UTILS)
+composite_test_SOURCES = composite-test.c $(GTK_UTILS)
+clip_test_SOURCES = clip-test.c $(GTK_UTILS)
+clip_in_SOURCES = clip-in.c $(GTK_UTILS)
+trap_test_SOURCES = trap-test.c $(GTK_UTILS)
+screen_test_SOURCES = screen-test.c $(GTK_UTILS)
+convolution_test_SOURCES = convolution-test.c $(GTK_UTILS)
+radial_test_SOURCES = radial-test.c ../test/utils.c ../test/utils.h $(GTK_UTILS)
+
+noinst_PROGRAMS = $(DEMOS)
+
+endif
diff --git a/pixman/test/alpha-test.c b/pixman/demos/alpha-test.c
index 92c208142..92c208142 100644
--- a/pixman/test/alpha-test.c
+++ b/pixman/demos/alpha-test.c
diff --git a/pixman/test/clip-in.c b/pixman/demos/clip-in.c
index 51579811f..51579811f 100644
--- a/pixman/test/clip-in.c
+++ b/pixman/demos/clip-in.c
diff --git a/pixman/test/clip-test.c b/pixman/demos/clip-test.c
index aa0df4482..aa0df4482 100644
--- a/pixman/test/clip-test.c
+++ b/pixman/demos/clip-test.c
diff --git a/pixman/test/composite-test.c b/pixman/demos/composite-test.c
index 79d5d5eac..79d5d5eac 100644
--- a/pixman/test/composite-test.c
+++ b/pixman/demos/composite-test.c
diff --git a/pixman/test/convolution-test.c b/pixman/demos/convolution-test.c
index da284af7b..da284af7b 100644
--- a/pixman/test/convolution-test.c
+++ b/pixman/demos/convolution-test.c
diff --git a/pixman/test/gradient-test.c b/pixman/demos/gradient-test.c
index fc84844b0..fc84844b0 100644
--- a/pixman/test/gradient-test.c
+++ b/pixman/demos/gradient-test.c
diff --git a/pixman/test/gtk-utils.c b/pixman/demos/gtk-utils.c
index f45cdc912..f45cdc912 100644
--- a/pixman/test/gtk-utils.c
+++ b/pixman/demos/gtk-utils.c
diff --git a/pixman/test/gtk-utils.h b/pixman/demos/gtk-utils.h
index 2cb13bcf0..2cb13bcf0 100644
--- a/pixman/test/gtk-utils.h
+++ b/pixman/demos/gtk-utils.h
diff --git a/pixman/test/radial-test.c b/pixman/demos/radial-test.c
index 5d716c339..35e90d786 100644
--- a/pixman/test/radial-test.c
+++ b/pixman/demos/radial-test.c
@@ -1,4 +1,4 @@
-#include "utils.h"
+#include "../test/utils.h"
#include "gtk-utils.h"
#define NUM_GRADIENTS 7
diff --git a/pixman/test/screen-test.c b/pixman/demos/screen-test.c
index e69dba3de..e69dba3de 100644
--- a/pixman/test/screen-test.c
+++ b/pixman/demos/screen-test.c
diff --git a/pixman/test/trap-test.c b/pixman/demos/trap-test.c
index 19295e7a5..19295e7a5 100644
--- a/pixman/test/trap-test.c
+++ b/pixman/demos/trap-test.c
diff --git a/pixman/pixman/pixman-arm-common.h b/pixman/pixman/pixman-arm-common.h
index 372e9f9a8..9b1322b6c 100644
--- a/pixman/pixman/pixman-arm-common.h
+++ b/pixman/pixman/pixman-arm-common.h
@@ -282,19 +282,20 @@ cputype##_composite_##name (pixman_implementation_t *imp, \
src_type, dst_type) \
void \
pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype ( \
- int32_t w, \
- dst_type * dst, \
- src_type * src, \
- pixman_fixed_t vx, \
- pixman_fixed_t unit_x);\
+ int32_t w, \
+ dst_type * dst, \
+ const src_type * src, \
+ pixman_fixed_t vx, \
+ pixman_fixed_t unit_x); \
\
static force_inline void \
scaled_nearest_scanline_##cputype##_##name##_##op (dst_type * pd, \
- src_type * ps, \
+ const src_type * ps, \
int32_t w, \
pixman_fixed_t vx, \
pixman_fixed_t unit_x, \
- pixman_fixed_t max_vx) \
+ pixman_fixed_t max_vx, \
+ pixman_bool_t zero_src) \
{ \
pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps, \
vx, unit_x);\
@@ -316,4 +317,48 @@ FAST_NEAREST_MAINLOOP (cputype##_##name##_pad_##op, \
SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func), \
SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func)
+#define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST(flags, cputype, name, op, \
+ src_type, dst_type) \
+void \
+pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype ( \
+ int32_t w, \
+ dst_type * dst, \
+ const src_type * src, \
+ pixman_fixed_t vx, \
+ pixman_fixed_t unit_x, \
+ const uint8_t * mask); \
+ \
+static force_inline void \
+scaled_nearest_scanline_##cputype##_##name##_##op (const uint8_t * mask, \
+ dst_type * pd, \
+ const src_type * ps, \
+ int32_t w, \
+ pixman_fixed_t vx, \
+ pixman_fixed_t unit_x, \
+ pixman_fixed_t max_vx, \
+ pixman_bool_t zero_src) \
+{ \
+ if ((flags & SKIP_ZERO_SRC) && zero_src) \
+ return; \
+ pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps, \
+ vx, unit_x, \
+ mask); \
+} \
+ \
+FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_cover_##op, \
+ scaled_nearest_scanline_##cputype##_##name##_##op,\
+ src_type, uint8_t, dst_type, COVER, TRUE, FALSE)\
+FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_none_##op, \
+ scaled_nearest_scanline_##cputype##_##name##_##op,\
+ src_type, uint8_t, dst_type, NONE, TRUE, FALSE) \
+FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_pad_##op, \
+ scaled_nearest_scanline_##cputype##_##name##_##op,\
+ src_type, uint8_t, dst_type, PAD, TRUE, FALSE)
+
+/* Provide entries for the fast path table */
+#define PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func) \
+ SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func), \
+ SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func), \
+ SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
+
#endif
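
Note on the new binding macro above: PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST declares an assembly scanline routine that additionally takes an a8 mask pointer, and it emits a thin C wrapper that honours the SKIP_ZERO_SRC flag before calling into the assembly. The C sketch below illustrates what such a generated wrapper boils down to; the instantiation name (8888_8_0565, OVER, neon), the simplified fixed-point types and the stub body are assumptions made for illustration only, not part of this patch.

#include <stdint.h>
#include <stdio.h>

typedef int32_t pixman_fixed_t;  /* assumption: pixman's 16.16 fixed-point type */
typedef int     pixman_bool_t;
#define pixman_fixed_to_int(f)  ((int) ((f) >> 16))
#define SKIP_ZERO_SRC 1          /* assumption: the flag bit tested by the wrapper */

/* Stand-in for the routine the macro declares (hypothetical instantiation:
 * OVER, a8r8g8b8 source, a8 mask, r5g6b5 destination).  The real version is
 * NEON assembly; this stub only does a nearest-neighbour fetch and a naive
 * 8888 -> 0565 conversion so the sketch compiles and runs. */
static void
pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon (
    int32_t w, uint16_t *dst, const uint32_t *src,
    pixman_fixed_t vx, pixman_fixed_t unit_x, const uint8_t *mask)
{
    for (int i = 0; i < w; i++, vx += unit_x)
    {
        uint32_t s = src[pixman_fixed_to_int (vx)];
        if (mask[i])  /* placeholder: the real code blends using mask and alpha */
            dst[i] = (uint16_t) (((s >> 8) & 0xf800) |
                                 ((s >> 5) & 0x07e0) |
                                 ((s >> 3) & 0x001f));
    }
}

/* Shape of the wrapper generated by the macro: skip the whole scanline when
 * the solid source is zero and SKIP_ZERO_SRC is set, otherwise forward the
 * arguments straight to the assembly implementation. */
static inline void
scaled_nearest_scanline_neon_8888_8_0565_OVER (const uint8_t  *mask,
                                               uint16_t       *pd,
                                               const uint32_t *ps,
                                               int32_t         w,
                                               pixman_fixed_t  vx,
                                               pixman_fixed_t  unit_x,
                                               pixman_fixed_t  max_vx,
                                               pixman_bool_t   zero_src)
{
    (void) max_vx;  /* not needed by the nearest-neighbour scanline itself */
    if ((SKIP_ZERO_SRC) && zero_src)
        return;
    pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon (w, pd, ps,
                                                              vx, unit_x, mask);
}

int
main (void)
{
    uint32_t src[4]  = { 0xffff0000, 0xff00ff00, 0xff0000ff, 0xffffffff };
    uint16_t dst[4]  = { 0, 0, 0, 0 };
    uint8_t  mask[4] = { 255, 255, 0, 255 };

    scaled_nearest_scanline_neon_8888_8_0565_OVER (mask, dst, src, 4,
                                                   0, 1 << 16, 4 << 16, 0);
    printf ("%04x %04x %04x %04x\n", dst[0], dst[1], dst[2], dst[3]);
    return 0;
}

In the actual patch the equivalent wrapper is emitted by the macro and then handed to FAST_NEAREST_MAINLOOP_COMMON to produce the COVER/NONE/PAD fast-path main loops listed above.
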
diff --git a/pixman/pixman/pixman-arm-neon-asm.S b/pixman/pixman/pixman-arm-neon-asm.S
index 51533d42f..47daf457c 100644
--- a/pixman/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman/pixman-arm-neon-asm.S
@@ -1,2365 +1,2393 @@
-/*
- * Copyright © 2009 Nokia Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
- */
-
-/*
- * This file contains implementations of NEON optimized pixel processing
- * functions. There is no full and detailed tutorial, but some functions
- * (those which are exposing some new or interesting features) are
- * extensively commented and can be used as examples.
- *
- * You may want to have a look at the comments for following functions:
- * - pixman_composite_over_8888_0565_asm_neon
- * - pixman_composite_over_n_8_0565_asm_neon
- */
-
-/* Prevent the stack from becoming executable for no reason... */
-#if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack,"",%progbits
-#endif
-
- .text
- .fpu neon
- .arch armv7a
- .object_arch armv4
- .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
- .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
- .arm
- .altmacro
-
-#include "pixman-arm-neon-asm.h"
-
-/* Global configuration options and preferences */
-
-/*
- * The code can optionally make use of unaligned memory accesses to improve
- * performance of handling leading/trailing pixels for each scanline.
- * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
- * example in linux if unaligned memory accesses are not configured to
- * generate exceptions.
- */
-.set RESPECT_STRICT_ALIGNMENT, 1
-
-/*
- * Set default prefetch type. There is a choice between the following options:
- *
- * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
- * as NOP to workaround some HW bugs or for whatever other reason)
- *
- * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
- * advanced prefetch introduces heavy overhead)
- *
- * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
- * which can run ARM and NEON instructions simultaneously so that extra ARM
- * instructions do not add (many) extra cycles, but improve prefetch efficiency)
- *
- * Note: some types of function can't support advanced prefetch and fallback
- * to simple one (those which handle 24bpp pixels)
- */
-.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
-
-/* Prefetch distance in pixels for simple prefetch */
-.set PREFETCH_DISTANCE_SIMPLE, 64
-
-/*
- * Implementation of pixman_composite_over_8888_0565_asm_neon
- *
- * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
- * performs OVER compositing operation. Function fast_composite_over_8888_0565
- * from pixman-fast-path.c does the same in C and can be used as a reference.
- *
- * First we need to have some NEON assembly code which can do the actual
- * operation on the pixels and provide it to the template macro.
- *
- * Template macro quite conveniently takes care of emitting all the necessary
- * code for memory reading and writing (including quite tricky cases of
- * handling unaligned leading/trailing pixels), so we only need to deal with
- * the data in NEON registers.
- *
- * NEON register allocation in general is recommended to be the following:
- * d0, d1, d2, d3 - contain loaded source pixel data
- * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
- * d24, d25, d26, d27 - contain loading mask pixel data (if mask is used)
- * d28, d29, d30, d31 - place for storing the result (destination pixels)
- *
- * As can be seen above, four 64-bit NEON registers are used for keeping
- * intermediate pixel data and up to 8 pixels can be processed in one step
- * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
- *
- * This particular function uses the following registers allocation:
- * d0, d1, d2, d3 - contain loaded source pixel data
- * d4, d5 - contain loaded destination pixels (they are needed)
- * d28, d29 - place for storing the result (destination pixels)
- */
-
-/*
- * Step one. We need to have some code to do some arithmetics on pixel data.
- * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
- * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
- * perform all the needed calculations and write the result to {d28, d29}.
- * The rationale for having two macros and not just one will be explained
- * later. In practice, any single monolithic function which does the work can
- * be split into two parts in any arbitrary way without affecting correctness.
- *
- * There is one special trick here too. Common template macro can optionally
- * make our life a bit easier by doing R, G, B, A color components
- * deinterleaving for 32bpp pixel formats (and this feature is used in
- * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
- * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
- * actually use d0 register for blue channel (a vector of eight 8-bit
- * values), d1 register for green, d2 for red and d3 for alpha. This
- * simple conversion can be also done with a few NEON instructions:
- *
- * Packed to planar conversion:
- * vuzp.8 d0, d1
- * vuzp.8 d2, d3
- * vuzp.8 d1, d3
- * vuzp.8 d0, d2
- *
- * Planar to packed conversion:
- * vzip.8 d0, d2
- * vzip.8 d1, d3
- * vzip.8 d2, d3
- * vzip.8 d0, d1
- *
- * But pixel can be loaded directly in planar format using VLD4.8 NEON
- * instruction. It is 1 cycle slower than VLD1.32, so this is not always
- * desirable, that's why deinterleaving is optional.
- *
- * But anyway, here is the code:
- */
-.macro pixman_composite_over_8888_0565_process_pixblock_head
- /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
- and put data into d6 - red, d7 - green, d30 - blue */
- vshrn.u16 d6, q2, #8
- vshrn.u16 d7, q2, #3
- vsli.u16 q2, q2, #5
- vsri.u8 d6, d6, #5
- vmvn.8 d3, d3 /* invert source alpha */
- vsri.u8 d7, d7, #6
- vshrn.u16 d30, q2, #2
- /* now do alpha blending, storing results in 8-bit planar format
- into d16 - red, d19 - green, d18 - blue */
- vmull.u8 q10, d3, d6
- vmull.u8 q11, d3, d7
- vmull.u8 q12, d3, d30
- vrshr.u16 q13, q10, #8
- vrshr.u16 q3, q11, #8
- vrshr.u16 q15, q12, #8
- vraddhn.u16 d20, q10, q13
- vraddhn.u16 d23, q11, q3
- vraddhn.u16 d22, q12, q15
-.endm
-
-.macro pixman_composite_over_8888_0565_process_pixblock_tail
- /* ... continue alpha blending */
- vqadd.u8 d16, d2, d20
- vqadd.u8 q9, q0, q11
- /* convert the result to r5g6b5 and store it into {d28, d29} */
- vshll.u8 q14, d16, #8
- vshll.u8 q8, d19, #8
- vshll.u8 q9, d18, #8
- vsri.u16 q14, q8, #5
- vsri.u16 q14, q9, #11
-.endm
-
-/*
- * OK, now we got almost everything that we need. Using the above two
- * macros, the work can be done right. But now we want to optimize
- * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
- * a lot from good code scheduling and software pipelining.
- *
- * Let's construct some code, which will run in the core main loop.
- * Some pseudo-code of the main loop will look like this:
- * head
- * while (...) {
- * tail
- * head
- * }
- * tail
- *
- * It may look a bit weird, but this setup allows to hide instruction
- * latencies better and also utilize dual-issue capability more
- * efficiently (make pairs of load-store and ALU instructions).
- *
- * So what we need now is a '*_tail_head' macro, which will be used
- * in the core main loop. A trivial straightforward implementation
- * of this macro would look like this:
- *
- * pixman_composite_over_8888_0565_process_pixblock_tail
- * vst1.16 {d28, d29}, [DST_W, :128]!
- * vld1.16 {d4, d5}, [DST_R, :128]!
- * vld4.32 {d0, d1, d2, d3}, [SRC]!
- * pixman_composite_over_8888_0565_process_pixblock_head
- * cache_preload 8, 8
- *
- * Now it also got some VLD/VST instructions. We simply can't move from
- * processing one block of pixels to the other one with just arithmetics.
- * The previously processed data needs to be written to memory and new
- * data needs to be fetched. Fortunately, this main loop does not deal
- * with partial leading/trailing pixels and can load/store a full block
- * of pixels in a bulk. Additionally, destination buffer is already
- * 16 bytes aligned here (which is good for performance).
- *
- * New things here are DST_R, DST_W, SRC and MASK identifiers. These
- * are the aliases for ARM registers which are used as pointers for
- * accessing data. We maintain separate pointers for reading and writing
- * destination buffer (DST_R and DST_W).
- *
- * Another new thing is 'cache_preload' macro. It is used for prefetching
- * data into CPU L2 cache and improve performance when dealing with large
- * images which are far larger than cache size. It uses one argument
- * (actually two, but they need to be the same here) - number of pixels
- * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
- * details about this macro. Moreover, if good performance is needed
- * the code from this macro needs to be copied into '*_tail_head' macro
- * and mixed with the rest of code for optimal instructions scheduling.
- * We are actually doing it below.
- *
- * Now after all the explanations, here is the optimized code.
- * Different instruction streams (originating from '*_head', '*_tail'
- * and 'cache_preload' macro) use different indentation levels for
- * better readability. Actually taking the code from one of these
- * indentation levels and ignoring a few VLD/VST instructions would
- * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
- * macro!
- */
-
-#if 1
-
-.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
- vqadd.u8 d16, d2, d20
- vld1.16 {d4, d5}, [DST_R, :128]!
- vqadd.u8 q9, q0, q11
- vshrn.u16 d6, q2, #8
- fetch_src_pixblock
- vshrn.u16 d7, q2, #3
- vsli.u16 q2, q2, #5
- vshll.u8 q14, d16, #8
- PF add PF_X, PF_X, #8
- vshll.u8 q8, d19, #8
- PF tst PF_CTL, #0xF
- vsri.u8 d6, d6, #5
- PF addne PF_X, PF_X, #8
- vmvn.8 d3, d3
- PF subne PF_CTL, PF_CTL, #1
- vsri.u8 d7, d7, #6
- vshrn.u16 d30, q2, #2
- vmull.u8 q10, d3, d6
- PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
- vmull.u8 q11, d3, d7
- vmull.u8 q12, d3, d30
- PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
- vsri.u16 q14, q8, #5
- PF cmp PF_X, ORIG_W
- vshll.u8 q9, d18, #8
- vrshr.u16 q13, q10, #8
- PF subge PF_X, PF_X, ORIG_W
- vrshr.u16 q3, q11, #8
- vrshr.u16 q15, q12, #8
- PF subges PF_CTL, PF_CTL, #0x10
- vsri.u16 q14, q9, #11
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
- vraddhn.u16 d20, q10, q13
- vraddhn.u16 d23, q11, q3
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
- vraddhn.u16 d22, q12, q15
- vst1.16 {d28, d29}, [DST_W, :128]!
-.endm
-
-#else
-
-/* If we did not care much about the performance, we would just use this... */
-.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
- pixman_composite_over_8888_0565_process_pixblock_tail
- vst1.16 {d28, d29}, [DST_W, :128]!
- vld1.16 {d4, d5}, [DST_R, :128]!
- fetch_src_pixblock
- pixman_composite_over_8888_0565_process_pixblock_head
- cache_preload 8, 8
-.endm
-
-#endif
-
-/*
- * And now the final part. We are using 'generate_composite_function' macro
- * to put all the stuff together. We are specifying the name of the function
- * which we want to get, number of bits per pixel for the source, mask and
- * destination (0 if unused, like mask in this case). Next come some bit
- * flags:
- * FLAG_DST_READWRITE - tells that the destination buffer is both read
- * and written, for write-only buffer we would use
- * FLAG_DST_WRITEONLY flag instead
- * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
- * and separate color channels for 32bpp format.
- * The next things are:
- * - the number of pixels processed per iteration (8 in this case, because
- * that's the maximum what can fit into four 64-bit NEON registers).
- * - prefetch distance, measured in pixel blocks. In this case it is 5 times
- * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
- * prefetch distance can be selected by running some benchmarks.
- *
- * After that we specify some macros, these are 'default_init',
- * 'default_cleanup' here which are empty (but it is possible to have custom
- * init/cleanup macros to be able to save/restore some extra NEON registers
- * like d8-d15 or do anything else) followed by
- * 'pixman_composite_over_8888_0565_process_pixblock_head',
- * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
- * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
- * which we got implemented above.
- *
- * The last part is the NEON registers allocation scheme.
- */
-generate_composite_function \
- pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_over_8888_0565_process_pixblock_head, \
- pixman_composite_over_8888_0565_process_pixblock_tail, \
- pixman_composite_over_8888_0565_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 24 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_over_n_0565_process_pixblock_head
- /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
- and put data into d6 - red, d7 - green, d30 - blue */
- vshrn.u16 d6, q2, #8
- vshrn.u16 d7, q2, #3
- vsli.u16 q2, q2, #5
- vsri.u8 d6, d6, #5
- vsri.u8 d7, d7, #6
- vshrn.u16 d30, q2, #2
- /* now do alpha blending, storing results in 8-bit planar format
- into d16 - red, d19 - green, d18 - blue */
- vmull.u8 q10, d3, d6
- vmull.u8 q11, d3, d7
- vmull.u8 q12, d3, d30
- vrshr.u16 q13, q10, #8
- vrshr.u16 q3, q11, #8
- vrshr.u16 q15, q12, #8
- vraddhn.u16 d20, q10, q13
- vraddhn.u16 d23, q11, q3
- vraddhn.u16 d22, q12, q15
-.endm
-
-.macro pixman_composite_over_n_0565_process_pixblock_tail
- /* ... continue alpha blending */
- vqadd.u8 d16, d2, d20
- vqadd.u8 q9, q0, q11
- /* convert the result to r5g6b5 and store it into {d28, d29} */
- vshll.u8 q14, d16, #8
- vshll.u8 q8, d19, #8
- vshll.u8 q9, d18, #8
- vsri.u16 q14, q8, #5
- vsri.u16 q14, q9, #11
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_over_n_0565_process_pixblock_tail_head
- pixman_composite_over_n_0565_process_pixblock_tail
- vld1.16 {d4, d5}, [DST_R, :128]!
- vst1.16 {d28, d29}, [DST_W, :128]!
- pixman_composite_over_n_0565_process_pixblock_head
- cache_preload 8, 8
-.endm
-
-.macro pixman_composite_over_n_0565_init
- add DUMMY, sp, #ARGS_STACK_OFFSET
- vld1.32 {d3[0]}, [DUMMY]
- vdup.8 d0, d3[0]
- vdup.8 d1, d3[1]
- vdup.8 d2, d3[2]
- vdup.8 d3, d3[3]
- vmvn.8 d3, d3 /* invert source alpha */
-.endm
-
-generate_composite_function \
- pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
- FLAG_DST_READWRITE, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_over_n_0565_init, \
- default_cleanup, \
- pixman_composite_over_n_0565_process_pixblock_head, \
- pixman_composite_over_n_0565_process_pixblock_tail, \
- pixman_composite_over_n_0565_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 24 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_8888_0565_process_pixblock_head
- vshll.u8 q8, d1, #8
- vshll.u8 q14, d2, #8
- vshll.u8 q9, d0, #8
-.endm
-
-.macro pixman_composite_src_8888_0565_process_pixblock_tail
- vsri.u16 q14, q8, #5
- vsri.u16 q14, q9, #11
-.endm
-
-.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
- vsri.u16 q14, q8, #5
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
- fetch_src_pixblock
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
- vsri.u16 q14, q9, #11
- PF cmp PF_X, ORIG_W
- PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
- vshll.u8 q8, d1, #8
- vst1.16 {d28, d29}, [DST_W, :128]!
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
- vshll.u8 q14, d2, #8
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
- vshll.u8 q9, d0, #8
-.endm
-
-generate_composite_function \
- pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
- FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_src_8888_0565_process_pixblock_head, \
- pixman_composite_src_8888_0565_process_pixblock_tail, \
- pixman_composite_src_8888_0565_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_src_0565_8888_process_pixblock_head
- vshrn.u16 d30, q0, #8
- vshrn.u16 d29, q0, #3
- vsli.u16 q0, q0, #5
- vmov.u8 d31, #255
- vsri.u8 d30, d30, #5
- vsri.u8 d29, d29, #6
- vshrn.u16 d28, q0, #2
-.endm
-
-.macro pixman_composite_src_0565_8888_process_pixblock_tail
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
- pixman_composite_src_0565_8888_process_pixblock_tail
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- fetch_src_pixblock
- pixman_composite_src_0565_8888_process_pixblock_head
- cache_preload 8, 8
-.endm
-
-generate_composite_function \
- pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
- FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_src_0565_8888_process_pixblock_head, \
- pixman_composite_src_0565_8888_process_pixblock_tail, \
- pixman_composite_src_0565_8888_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_add_8_8_process_pixblock_head
- vqadd.u8 q14, q0, q2
- vqadd.u8 q15, q1, q3
-.endm
-
-.macro pixman_composite_add_8_8_process_pixblock_tail
-.endm
-
-.macro pixman_composite_add_8_8_process_pixblock_tail_head
- fetch_src_pixblock
- PF add PF_X, PF_X, #32
- PF tst PF_CTL, #0xF
- vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
- PF addne PF_X, PF_X, #32
- PF subne PF_CTL, PF_CTL, #1
- vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF cmp PF_X, ORIG_W
- PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
- PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
- vqadd.u8 q14, q0, q2
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
- vqadd.u8 q15, q1, q3
-.endm
-
-generate_composite_function \
- pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
- FLAG_DST_READWRITE, \
- 32, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_add_8_8_process_pixblock_head, \
- pixman_composite_add_8_8_process_pixblock_tail, \
- pixman_composite_add_8_8_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
- fetch_src_pixblock
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
- vld1.32 {d4, d5, d6, d7}, [DST_R, :128]!
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
- vst1.32 {d28, d29, d30, d31}, [DST_W, :128]!
- PF cmp PF_X, ORIG_W
- PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
- PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
- vqadd.u8 q14, q0, q2
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
- vqadd.u8 q15, q1, q3
-.endm
-
-generate_composite_function \
- pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
- FLAG_DST_READWRITE, \
- 8, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_add_8_8_process_pixblock_head, \
- pixman_composite_add_8_8_process_pixblock_tail, \
- pixman_composite_add_8888_8888_process_pixblock_tail_head
-
-generate_composite_function_single_scanline \
- pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
- FLAG_DST_READWRITE, \
- 8, /* number of pixels, processed in a single block */ \
- default_init, \
- default_cleanup, \
- pixman_composite_add_8_8_process_pixblock_head, \
- pixman_composite_add_8_8_process_pixblock_tail, \
- pixman_composite_add_8888_8888_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
- vmvn.8 d24, d3 /* get inverted alpha */
- /* do alpha blending */
- vmull.u8 q8, d24, d4
- vmull.u8 q9, d24, d5
- vmull.u8 q10, d24, d6
- vmull.u8 q11, d24, d7
-.endm
-
-.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
- vrshr.u16 q14, q8, #8
- vrshr.u16 q15, q9, #8
- vrshr.u16 q12, q10, #8
- vrshr.u16 q13, q11, #8
- vraddhn.u16 d28, q14, q8
- vraddhn.u16 d29, q15, q9
- vraddhn.u16 d30, q12, q10
- vraddhn.u16 d31, q13, q11
-.endm
-
-.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
- vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
- vrshr.u16 q14, q8, #8
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
- vrshr.u16 q15, q9, #8
- vrshr.u16 q12, q10, #8
- vrshr.u16 q13, q11, #8
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
- vraddhn.u16 d28, q14, q8
- vraddhn.u16 d29, q15, q9
- PF cmp PF_X, ORIG_W
- vraddhn.u16 d30, q12, q10
- vraddhn.u16 d31, q13, q11
- fetch_src_pixblock
- PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
- vmvn.8 d22, d3
- PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF subge PF_X, PF_X, ORIG_W
- vmull.u8 q8, d22, d4
- PF subges PF_CTL, PF_CTL, #0x10
- vmull.u8 q9, d22, d5
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
- vmull.u8 q10, d22, d6
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
- vmull.u8 q11, d22, d7
-.endm
-
-generate_composite_function_single_scanline \
- pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- default_init, \
- default_cleanup, \
- pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
- pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
- pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_over_8888_8888_process_pixblock_head
- pixman_composite_out_reverse_8888_8888_process_pixblock_head
-.endm
-
-.macro pixman_composite_over_8888_8888_process_pixblock_tail
- pixman_composite_out_reverse_8888_8888_process_pixblock_tail
- vqadd.u8 q14, q0, q14
- vqadd.u8 q15, q1, q15
-.endm
-
-.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
- vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
- vrshr.u16 q14, q8, #8
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
- vrshr.u16 q15, q9, #8
- vrshr.u16 q12, q10, #8
- vrshr.u16 q13, q11, #8
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
- vraddhn.u16 d28, q14, q8
- vraddhn.u16 d29, q15, q9
- PF cmp PF_X, ORIG_W
- vraddhn.u16 d30, q12, q10
- vraddhn.u16 d31, q13, q11
- vqadd.u8 q14, q0, q14
- vqadd.u8 q15, q1, q15
- fetch_src_pixblock
- PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
- vmvn.8 d22, d3
- PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF subge PF_X, PF_X, ORIG_W
- vmull.u8 q8, d22, d4
- PF subges PF_CTL, PF_CTL, #0x10
- vmull.u8 q9, d22, d5
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
- vmull.u8 q10, d22, d6
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
- vmull.u8 q11, d22, d7
-.endm
-
-generate_composite_function \
- pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_over_8888_8888_process_pixblock_head, \
- pixman_composite_over_8888_8888_process_pixblock_tail, \
- pixman_composite_over_8888_8888_process_pixblock_tail_head
-
-generate_composite_function_single_scanline \
- pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- default_init, \
- default_cleanup, \
- pixman_composite_over_8888_8888_process_pixblock_head, \
- pixman_composite_over_8888_8888_process_pixblock_tail, \
- pixman_composite_over_8888_8888_process_pixblock_tail_head
-
-/******************************************************************************/
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_over_n_8888_process_pixblock_tail_head
- pixman_composite_over_8888_8888_process_pixblock_tail
- vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- pixman_composite_over_8888_8888_process_pixblock_head
- cache_preload 8, 8
-.endm
-
-.macro pixman_composite_over_n_8888_init
- add DUMMY, sp, #ARGS_STACK_OFFSET
- vld1.32 {d3[0]}, [DUMMY]
- vdup.8 d0, d3[0]
- vdup.8 d1, d3[1]
- vdup.8 d2, d3[2]
- vdup.8 d3, d3[3]
-.endm
-
-generate_composite_function \
- pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_over_n_8888_init, \
- default_cleanup, \
- pixman_composite_over_8888_8888_process_pixblock_head, \
- pixman_composite_over_8888_8888_process_pixblock_tail, \
- pixman_composite_over_n_8888_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
- vrshr.u16 q14, q8, #8
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
- vrshr.u16 q15, q9, #8
- vrshr.u16 q12, q10, #8
- vrshr.u16 q13, q11, #8
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
- vraddhn.u16 d28, q14, q8
- vraddhn.u16 d29, q15, q9
- PF cmp PF_X, ORIG_W
- vraddhn.u16 d30, q12, q10
- vraddhn.u16 d31, q13, q11
- vqadd.u8 q14, q0, q14
- vqadd.u8 q15, q1, q15
- vld4.8 {d0, d1, d2, d3}, [DST_R, :128]!
- vmvn.8 d22, d3
- PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF subge PF_X, PF_X, ORIG_W
- vmull.u8 q8, d22, d4
- PF subges PF_CTL, PF_CTL, #0x10
- vmull.u8 q9, d22, d5
- vmull.u8 q10, d22, d6
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
- vmull.u8 q11, d22, d7
-.endm
-
-.macro pixman_composite_over_reverse_n_8888_init
- add DUMMY, sp, #ARGS_STACK_OFFSET
- vld1.32 {d7[0]}, [DUMMY]
- vdup.8 d4, d7[0]
- vdup.8 d5, d7[1]
- vdup.8 d6, d7[2]
- vdup.8 d7, d7[3]
-.endm
-
-generate_composite_function \
- pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_over_reverse_n_8888_init, \
- default_cleanup, \
- pixman_composite_over_8888_8888_process_pixblock_head, \
- pixman_composite_over_8888_8888_process_pixblock_tail, \
- pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 0, /* dst_r_basereg */ \
- 4, /* src_basereg */ \
- 24 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_over_8888_8_0565_process_pixblock_head
- vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */
- vmull.u8 q1, d24, d9
- vmull.u8 q6, d24, d10
- vmull.u8 q7, d24, d11
- vshrn.u16 d6, q2, #8 /* convert DST_R data to 32-bpp (part1) */
- vshrn.u16 d7, q2, #3
- vsli.u16 q2, q2, #5
- vrshr.u16 q8, q0, #8 /* IN for SRC pixels (part2) */
- vrshr.u16 q9, q1, #8
- vrshr.u16 q10, q6, #8
- vrshr.u16 q11, q7, #8
- vraddhn.u16 d0, q0, q8
- vraddhn.u16 d1, q1, q9
- vraddhn.u16 d2, q6, q10
- vraddhn.u16 d3, q7, q11
- vsri.u8 d6, d6, #5 /* convert DST_R data to 32-bpp (part2) */
- vsri.u8 d7, d7, #6
- vmvn.8 d3, d3
- vshrn.u16 d30, q2, #2
- vmull.u8 q8, d3, d6 /* now do alpha blending */
- vmull.u8 q9, d3, d7
- vmull.u8 q10, d3, d30
-.endm
-
-.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
- /* 3 cycle bubble (after vmull.u8) */
- vrshr.u16 q13, q8, #8
- vrshr.u16 q11, q9, #8
- vrshr.u16 q15, q10, #8
- vraddhn.u16 d16, q8, q13
- vraddhn.u16 d27, q9, q11
- vraddhn.u16 d26, q10, q15
- vqadd.u8 d16, d2, d16
- /* 1 cycle bubble */
- vqadd.u8 q9, q0, q13
- vshll.u8 q14, d16, #8 /* convert to 16bpp */
- vshll.u8 q8, d19, #8
- vshll.u8 q9, d18, #8
- vsri.u16 q14, q8, #5
- /* 1 cycle bubble */
- vsri.u16 q14, q9, #11
-.endm
-
-.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
- vld1.16 {d4, d5}, [DST_R, :128]!
- vshrn.u16 d6, q2, #8
- fetch_mask_pixblock
- vshrn.u16 d7, q2, #3
- fetch_src_pixblock
- vmull.u8 q6, d24, d10
- vrshr.u16 q13, q8, #8
- vrshr.u16 q11, q9, #8
- vrshr.u16 q15, q10, #8
- vraddhn.u16 d16, q8, q13
- vraddhn.u16 d27, q9, q11
- vraddhn.u16 d26, q10, q15
- vqadd.u8 d16, d2, d16
- vmull.u8 q1, d24, d9
- vqadd.u8 q9, q0, q13
- vshll.u8 q14, d16, #8
- vmull.u8 q0, d24, d8
- vshll.u8 q8, d19, #8
- vshll.u8 q9, d18, #8
- vsri.u16 q14, q8, #5
- vmull.u8 q7, d24, d11
- vsri.u16 q14, q9, #11
-
- cache_preload 8, 8
-
- vsli.u16 q2, q2, #5
- vrshr.u16 q8, q0, #8
- vrshr.u16 q9, q1, #8
- vrshr.u16 q10, q6, #8
- vrshr.u16 q11, q7, #8
- vraddhn.u16 d0, q0, q8
- vraddhn.u16 d1, q1, q9
- vraddhn.u16 d2, q6, q10
- vraddhn.u16 d3, q7, q11
- vsri.u8 d6, d6, #5
- vsri.u8 d7, d7, #6
- vmvn.8 d3, d3
- vshrn.u16 d30, q2, #2
- vst1.16 {d28, d29}, [DST_W, :128]!
- vmull.u8 q8, d3, d6
- vmull.u8 q9, d3, d7
- vmull.u8 q10, d3, d30
-.endm
-
-generate_composite_function \
- pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- default_init_need_all_regs, \
- default_cleanup_need_all_regs, \
- pixman_composite_over_8888_8_0565_process_pixblock_head, \
- pixman_composite_over_8888_8_0565_process_pixblock_tail, \
- pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 8, /* src_basereg */ \
- 24 /* mask_basereg */
-
-/******************************************************************************/
-
-/*
- * This function needs a special initialization of solid mask.
- * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
- * offset, split into color components and replicated in d8-d11
- * registers. Additionally, this function needs all the NEON registers,
- * so it has to save d8-d15 registers which are callee saved according
- * to ABI. These registers are restored from 'cleanup' macro. All the
- * other NEON registers are caller saved, so can be clobbered freely
- * without introducing any problems.
- */
-.macro pixman_composite_over_n_8_0565_init
- add DUMMY, sp, #ARGS_STACK_OFFSET
- vpush {d8-d15}
- vld1.32 {d11[0]}, [DUMMY]
- vdup.8 d8, d11[0]
- vdup.8 d9, d11[1]
- vdup.8 d10, d11[2]
- vdup.8 d11, d11[3]
-.endm
-
-.macro pixman_composite_over_n_8_0565_cleanup
- vpop {d8-d15}
-.endm
-
-generate_composite_function \
- pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
- FLAG_DST_READWRITE, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_over_n_8_0565_init, \
- pixman_composite_over_n_8_0565_cleanup, \
- pixman_composite_over_8888_8_0565_process_pixblock_head, \
- pixman_composite_over_8888_8_0565_process_pixblock_tail, \
- pixman_composite_over_8888_8_0565_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_over_8888_n_0565_init
- add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
- vpush {d8-d15}
- vld1.32 {d24[0]}, [DUMMY]
- vdup.8 d24, d24[3]
-.endm
-
-.macro pixman_composite_over_8888_n_0565_cleanup
- vpop {d8-d15}
-.endm
-
-generate_composite_function \
- pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_over_8888_n_0565_init, \
- pixman_composite_over_8888_n_0565_cleanup, \
- pixman_composite_over_8888_8_0565_process_pixblock_head, \
- pixman_composite_over_8888_8_0565_process_pixblock_tail, \
- pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 8, /* src_basereg */ \
- 24 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_0565_0565_process_pixblock_head
-.endm
-
-.macro pixman_composite_src_0565_0565_process_pixblock_tail
-.endm
-
-.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
- vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
- fetch_src_pixblock
- cache_preload 16, 16
-.endm
-
-generate_composite_function \
- pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
- FLAG_DST_WRITEONLY, \
- 16, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_src_0565_0565_process_pixblock_head, \
- pixman_composite_src_0565_0565_process_pixblock_tail, \
- pixman_composite_src_0565_0565_process_pixblock_tail_head, \
- 0, /* dst_w_basereg */ \
- 0, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 0 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_n_8_process_pixblock_head
-.endm
-
-.macro pixman_composite_src_n_8_process_pixblock_tail
-.endm
-
-.macro pixman_composite_src_n_8_process_pixblock_tail_head
- vst1.8 {d0, d1, d2, d3}, [DST_W, :128]!
-.endm
-
-.macro pixman_composite_src_n_8_init
- add DUMMY, sp, #ARGS_STACK_OFFSET
- vld1.32 {d0[0]}, [DUMMY]
- vsli.u64 d0, d0, #8
- vsli.u64 d0, d0, #16
- vsli.u64 d0, d0, #32
- vorr d1, d0, d0
- vorr q1, q0, q0
-.endm
-
-.macro pixman_composite_src_n_8_cleanup
-.endm
-
-generate_composite_function \
- pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
- FLAG_DST_WRITEONLY, \
- 32, /* number of pixels, processed in a single block */ \
- 0, /* prefetch distance */ \
- pixman_composite_src_n_8_init, \
- pixman_composite_src_n_8_cleanup, \
- pixman_composite_src_n_8_process_pixblock_head, \
- pixman_composite_src_n_8_process_pixblock_tail, \
- pixman_composite_src_n_8_process_pixblock_tail_head, \
- 0, /* dst_w_basereg */ \
- 0, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 0 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_n_0565_process_pixblock_head
-.endm
-
-.macro pixman_composite_src_n_0565_process_pixblock_tail
-.endm
-
-.macro pixman_composite_src_n_0565_process_pixblock_tail_head
- vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
-.endm
-
-.macro pixman_composite_src_n_0565_init
- add DUMMY, sp, #ARGS_STACK_OFFSET
- vld1.32 {d0[0]}, [DUMMY]
- vsli.u64 d0, d0, #16
- vsli.u64 d0, d0, #32
- vorr d1, d0, d0
- vorr q1, q0, q0
-.endm
-
-.macro pixman_composite_src_n_0565_cleanup
-.endm
-
-generate_composite_function \
- pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
- FLAG_DST_WRITEONLY, \
- 16, /* number of pixels, processed in a single block */ \
- 0, /* prefetch distance */ \
- pixman_composite_src_n_0565_init, \
- pixman_composite_src_n_0565_cleanup, \
- pixman_composite_src_n_0565_process_pixblock_head, \
- pixman_composite_src_n_0565_process_pixblock_tail, \
- pixman_composite_src_n_0565_process_pixblock_tail_head, \
- 0, /* dst_w_basereg */ \
- 0, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 0 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_n_8888_process_pixblock_head
-.endm
-
-.macro pixman_composite_src_n_8888_process_pixblock_tail
-.endm
-
-.macro pixman_composite_src_n_8888_process_pixblock_tail_head
- vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
-.endm
-
-.macro pixman_composite_src_n_8888_init
- add DUMMY, sp, #ARGS_STACK_OFFSET
- vld1.32 {d0[0]}, [DUMMY]
- vsli.u64 d0, d0, #32
- vorr d1, d0, d0
- vorr q1, q0, q0
-.endm
-
-.macro pixman_composite_src_n_8888_cleanup
-.endm
-
-generate_composite_function \
- pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
- FLAG_DST_WRITEONLY, \
- 8, /* number of pixels, processed in a single block */ \
- 0, /* prefetch distance */ \
- pixman_composite_src_n_8888_init, \
- pixman_composite_src_n_8888_cleanup, \
- pixman_composite_src_n_8888_process_pixblock_head, \
- pixman_composite_src_n_8888_process_pixblock_tail, \
- pixman_composite_src_n_8888_process_pixblock_tail_head, \
- 0, /* dst_w_basereg */ \
- 0, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 0 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_8888_8888_process_pixblock_head
-.endm
-
-.macro pixman_composite_src_8888_8888_process_pixblock_tail
-.endm
-
-.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
- vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
- fetch_src_pixblock
- cache_preload 8, 8
-.endm
-
-generate_composite_function \
- pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
- FLAG_DST_WRITEONLY, \
- 8, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_src_8888_8888_process_pixblock_head, \
- pixman_composite_src_8888_8888_process_pixblock_tail, \
- pixman_composite_src_8888_8888_process_pixblock_tail_head, \
- 0, /* dst_w_basereg */ \
- 0, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 0 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_x888_8888_process_pixblock_head
- vorr q0, q0, q2
- vorr q1, q1, q2
-.endm
-
-.macro pixman_composite_src_x888_8888_process_pixblock_tail
-.endm
-
-.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
- vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
- fetch_src_pixblock
- vorr q0, q0, q2
- vorr q1, q1, q2
- cache_preload 8, 8
-.endm
-
-.macro pixman_composite_src_x888_8888_init
- vmov.u8 q2, #0xFF
- vshl.u32 q2, q2, #24
-.endm
-
-generate_composite_function \
- pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
- FLAG_DST_WRITEONLY, \
- 8, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
- pixman_composite_src_x888_8888_init, \
- default_cleanup, \
- pixman_composite_src_x888_8888_process_pixblock_head, \
- pixman_composite_src_x888_8888_process_pixblock_tail, \
- pixman_composite_src_x888_8888_process_pixblock_tail_head, \
- 0, /* dst_w_basereg */ \
- 0, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 0 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_over_n_8_8888_process_pixblock_head
- /* expecting deinterleaved source data in {d8, d9, d10, d11} */
- /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
- /* and destination data in {d4, d5, d6, d7} */
- /* mask is in d24 (d25, d26, d27 are unused) */
-
- /* in */
- vmull.u8 q0, d24, d8
- vmull.u8 q1, d24, d9
- vmull.u8 q6, d24, d10
- vmull.u8 q7, d24, d11
- vrshr.u16 q10, q0, #8
- vrshr.u16 q11, q1, #8
- vrshr.u16 q12, q6, #8
- vrshr.u16 q13, q7, #8
- vraddhn.u16 d0, q0, q10
- vraddhn.u16 d1, q1, q11
- vraddhn.u16 d2, q6, q12
- vraddhn.u16 d3, q7, q13
- vmvn.8 d24, d3 /* get inverted alpha */
- /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */
- /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
- /* now do alpha blending */
- vmull.u8 q8, d24, d4
- vmull.u8 q9, d24, d5
- vmull.u8 q10, d24, d6
- vmull.u8 q11, d24, d7
-.endm
-
-.macro pixman_composite_over_n_8_8888_process_pixblock_tail
- vrshr.u16 q14, q8, #8
- vrshr.u16 q15, q9, #8
- vrshr.u16 q12, q10, #8
- vrshr.u16 q13, q11, #8
- vraddhn.u16 d28, q14, q8
- vraddhn.u16 d29, q15, q9
- vraddhn.u16 d30, q12, q10
- vraddhn.u16 d31, q13, q11
- vqadd.u8 q14, q0, q14
- vqadd.u8 q15, q1, q15
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
- pixman_composite_over_n_8_8888_process_pixblock_tail
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
- fetch_mask_pixblock
- cache_preload 8, 8
- pixman_composite_over_n_8_8888_process_pixblock_head
-.endm
-
-.macro pixman_composite_over_n_8_8888_init
- add DUMMY, sp, #ARGS_STACK_OFFSET
- vpush {d8-d15}
- vld1.32 {d11[0]}, [DUMMY]
- vdup.8 d8, d11[0]
- vdup.8 d9, d11[1]
- vdup.8 d10, d11[2]
- vdup.8 d11, d11[3]
-.endm
-
-.macro pixman_composite_over_n_8_8888_cleanup
- vpop {d8-d15}
-.endm
-
-generate_composite_function \
- pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_over_n_8_8888_init, \
- pixman_composite_over_n_8_8888_cleanup, \
- pixman_composite_over_n_8_8888_process_pixblock_head, \
- pixman_composite_over_n_8_8888_process_pixblock_tail, \
- pixman_composite_over_n_8_8888_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_over_n_8_8_process_pixblock_head
- vmull.u8 q0, d24, d8
- vmull.u8 q1, d25, d8
- vmull.u8 q6, d26, d8
- vmull.u8 q7, d27, d8
- vrshr.u16 q10, q0, #8
- vrshr.u16 q11, q1, #8
- vrshr.u16 q12, q6, #8
- vrshr.u16 q13, q7, #8
- vraddhn.u16 d0, q0, q10
- vraddhn.u16 d1, q1, q11
- vraddhn.u16 d2, q6, q12
- vraddhn.u16 d3, q7, q13
- vmvn.8 q12, q0
- vmvn.8 q13, q1
- vmull.u8 q8, d24, d4
- vmull.u8 q9, d25, d5
- vmull.u8 q10, d26, d6
- vmull.u8 q11, d27, d7
-.endm
-
-.macro pixman_composite_over_n_8_8_process_pixblock_tail
- vrshr.u16 q14, q8, #8
- vrshr.u16 q15, q9, #8
- vrshr.u16 q12, q10, #8
- vrshr.u16 q13, q11, #8
- vraddhn.u16 d28, q14, q8
- vraddhn.u16 d29, q15, q9
- vraddhn.u16 d30, q12, q10
- vraddhn.u16 d31, q13, q11
- vqadd.u8 q14, q0, q14
- vqadd.u8 q15, q1, q15
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
- vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
- pixman_composite_over_n_8_8_process_pixblock_tail
- fetch_mask_pixblock
- cache_preload 32, 32
- vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
- pixman_composite_over_n_8_8_process_pixblock_head
-.endm
-
-.macro pixman_composite_over_n_8_8_init
- add DUMMY, sp, #ARGS_STACK_OFFSET
- vpush {d8-d15}
- vld1.32 {d8[0]}, [DUMMY]
- vdup.8 d8, d8[3]
-.endm
-
-.macro pixman_composite_over_n_8_8_cleanup
- vpop {d8-d15}
-.endm
-
-generate_composite_function \
- pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
- FLAG_DST_READWRITE, \
- 32, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_over_n_8_8_init, \
- pixman_composite_over_n_8_8_cleanup, \
- pixman_composite_over_n_8_8_process_pixblock_head, \
- pixman_composite_over_n_8_8_process_pixblock_tail, \
- pixman_composite_over_n_8_8_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
- /*
- * 'combine_mask_ca' replacement
- *
- * input: solid src (n) in {d8, d9, d10, d11}
- * dest in {d4, d5, d6, d7 }
- * mask in {d24, d25, d26, d27}
- * output: updated src in {d0, d1, d2, d3 }
- * updated mask in {d24, d25, d26, d3 }
- */
- vmull.u8 q0, d24, d8
- vmull.u8 q1, d25, d9
- vmull.u8 q6, d26, d10
- vmull.u8 q7, d27, d11
- vmull.u8 q9, d11, d25
- vmull.u8 q12, d11, d24
- vmull.u8 q13, d11, d26
- vrshr.u16 q8, q0, #8
- vrshr.u16 q10, q1, #8
- vrshr.u16 q11, q6, #8
- vraddhn.u16 d0, q0, q8
- vraddhn.u16 d1, q1, q10
- vraddhn.u16 d2, q6, q11
- vrshr.u16 q11, q12, #8
- vrshr.u16 q8, q9, #8
- vrshr.u16 q6, q13, #8
- vrshr.u16 q10, q7, #8
- vraddhn.u16 d24, q12, q11
- vraddhn.u16 d25, q9, q8
- vraddhn.u16 d26, q13, q6
- vraddhn.u16 d3, q7, q10
- /*
- * 'combine_over_ca' replacement
- *
- * output: updated dest in {d28, d29, d30, d31}
- */
- vmvn.8 d24, d24
- vmvn.8 d25, d25
- vmull.u8 q8, d24, d4
- vmull.u8 q9, d25, d5
- vmvn.8 d26, d26
- vmvn.8 d27, d3
- vmull.u8 q10, d26, d6
- vmull.u8 q11, d27, d7
-.endm
-
-.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
- /* ... continue 'combine_over_ca' replacement */
- vrshr.u16 q14, q8, #8
- vrshr.u16 q15, q9, #8
- vrshr.u16 q6, q10, #8
- vrshr.u16 q7, q11, #8
- vraddhn.u16 d28, q14, q8
- vraddhn.u16 d29, q15, q9
- vraddhn.u16 d30, q6, q10
- vraddhn.u16 d31, q7, q11
- vqadd.u8 q14, q0, q14
- vqadd.u8 q15, q1, q15
-.endm
-
-.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
- vrshr.u16 q14, q8, #8
- vrshr.u16 q15, q9, #8
- vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
- vrshr.u16 q6, q10, #8
- vrshr.u16 q7, q11, #8
- vraddhn.u16 d28, q14, q8
- vraddhn.u16 d29, q15, q9
- vraddhn.u16 d30, q6, q10
- vraddhn.u16 d31, q7, q11
- fetch_mask_pixblock
- vqadd.u8 q14, q0, q14
- vqadd.u8 q15, q1, q15
- cache_preload 8, 8
- pixman_composite_over_n_8888_8888_ca_process_pixblock_head
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
-.endm
-
-.macro pixman_composite_over_n_8888_8888_ca_init
- add DUMMY, sp, #ARGS_STACK_OFFSET
- vpush {d8-d15}
- vld1.32 {d11[0]}, [DUMMY]
- vdup.8 d8, d11[0]
- vdup.8 d9, d11[1]
- vdup.8 d10, d11[2]
- vdup.8 d11, d11[3]
-.endm
-
-.macro pixman_composite_over_n_8888_8888_ca_cleanup
- vpop {d8-d15}
-.endm
-
-generate_composite_function \
- pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_over_n_8888_8888_ca_init, \
- pixman_composite_over_n_8888_8888_ca_cleanup, \
- pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
- pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
- pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_in_n_8_process_pixblock_head
- /* expecting source data in {d0, d1, d2, d3} */
- /* and destination data in {d4, d5, d6, d7} */
- vmull.u8 q8, d4, d3
- vmull.u8 q9, d5, d3
- vmull.u8 q10, d6, d3
- vmull.u8 q11, d7, d3
-.endm
-
-.macro pixman_composite_in_n_8_process_pixblock_tail
- vrshr.u16 q14, q8, #8
- vrshr.u16 q15, q9, #8
- vrshr.u16 q12, q10, #8
- vrshr.u16 q13, q11, #8
- vraddhn.u16 d28, q8, q14
- vraddhn.u16 d29, q9, q15
- vraddhn.u16 d30, q10, q12
- vraddhn.u16 d31, q11, q13
-.endm
-
-.macro pixman_composite_in_n_8_process_pixblock_tail_head
- pixman_composite_in_n_8_process_pixblock_tail
- vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
- cache_preload 32, 32
- pixman_composite_in_n_8_process_pixblock_head
- vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
-.endm
-
-.macro pixman_composite_in_n_8_init
- add DUMMY, sp, #ARGS_STACK_OFFSET
- vld1.32 {d3[0]}, [DUMMY]
- vdup.8 d3, d3[3]
-.endm
-
-.macro pixman_composite_in_n_8_cleanup
-.endm
-
-generate_composite_function \
- pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
- FLAG_DST_READWRITE, \
- 32, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_in_n_8_init, \
- pixman_composite_in_n_8_cleanup, \
- pixman_composite_in_n_8_process_pixblock_head, \
- pixman_composite_in_n_8_process_pixblock_tail, \
- pixman_composite_in_n_8_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 24 /* mask_basereg */
-
-.macro pixman_composite_add_n_8_8_process_pixblock_head
- /* expecting source data in {d8, d9, d10, d11} */
- /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
- /* and destination data in {d4, d5, d6, d7} */
- /* mask is in d24, d25, d26, d27 */
- vmull.u8 q0, d24, d11
- vmull.u8 q1, d25, d11
- vmull.u8 q6, d26, d11
- vmull.u8 q7, d27, d11
- vrshr.u16 q10, q0, #8
- vrshr.u16 q11, q1, #8
- vrshr.u16 q12, q6, #8
- vrshr.u16 q13, q7, #8
- vraddhn.u16 d0, q0, q10
- vraddhn.u16 d1, q1, q11
- vraddhn.u16 d2, q6, q12
- vraddhn.u16 d3, q7, q13
- vqadd.u8 q14, q0, q2
- vqadd.u8 q15, q1, q3
-.endm
-
-.macro pixman_composite_add_n_8_8_process_pixblock_tail
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
- pixman_composite_add_n_8_8_process_pixblock_tail
- vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
- vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
- fetch_mask_pixblock
- cache_preload 32, 32
- pixman_composite_add_n_8_8_process_pixblock_head
-.endm
-
-.macro pixman_composite_add_n_8_8_init
- add DUMMY, sp, #ARGS_STACK_OFFSET
- vpush {d8-d15}
- vld1.32 {d11[0]}, [DUMMY]
- vdup.8 d11, d11[3]
-.endm
-
-.macro pixman_composite_add_n_8_8_cleanup
- vpop {d8-d15}
-.endm
-
-generate_composite_function \
- pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
- FLAG_DST_READWRITE, \
- 32, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_add_n_8_8_init, \
- pixman_composite_add_n_8_8_cleanup, \
- pixman_composite_add_n_8_8_process_pixblock_head, \
- pixman_composite_add_n_8_8_process_pixblock_tail, \
- pixman_composite_add_n_8_8_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_add_8_8_8_process_pixblock_head
- /* expecting source data in {d0, d1, d2, d3} */
- /* destination data in {d4, d5, d6, d7} */
- /* mask in {d24, d25, d26, d27} */
- vmull.u8 q8, d24, d0
- vmull.u8 q9, d25, d1
- vmull.u8 q10, d26, d2
- vmull.u8 q11, d27, d3
- vrshr.u16 q0, q8, #8
- vrshr.u16 q1, q9, #8
- vrshr.u16 q12, q10, #8
- vrshr.u16 q13, q11, #8
- vraddhn.u16 d0, q0, q8
- vraddhn.u16 d1, q1, q9
- vraddhn.u16 d2, q12, q10
- vraddhn.u16 d3, q13, q11
- vqadd.u8 q14, q0, q2
- vqadd.u8 q15, q1, q3
-.endm
-
-.macro pixman_composite_add_8_8_8_process_pixblock_tail
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
- pixman_composite_add_8_8_8_process_pixblock_tail
- vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
- vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
- fetch_mask_pixblock
- fetch_src_pixblock
- cache_preload 32, 32
- pixman_composite_add_8_8_8_process_pixblock_head
-.endm
-
-.macro pixman_composite_add_8_8_8_init
-.endm
-
-.macro pixman_composite_add_8_8_8_cleanup
-.endm
-
-generate_composite_function \
- pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
- FLAG_DST_READWRITE, \
- 32, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_add_8_8_8_init, \
- pixman_composite_add_8_8_8_cleanup, \
- pixman_composite_add_8_8_8_process_pixblock_head, \
- pixman_composite_add_8_8_8_process_pixblock_tail, \
- pixman_composite_add_8_8_8_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
- /* expecting source data in {d0, d1, d2, d3} */
- /* destination data in {d4, d5, d6, d7} */
- /* mask in {d24, d25, d26, d27} */
- vmull.u8 q8, d27, d0
- vmull.u8 q9, d27, d1
- vmull.u8 q10, d27, d2
- vmull.u8 q11, d27, d3
- /* 1 cycle bubble */
- vrsra.u16 q8, q8, #8
- vrsra.u16 q9, q9, #8
- vrsra.u16 q10, q10, #8
- vrsra.u16 q11, q11, #8
-.endm
-
-.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
- /* 2 cycle bubble */
- vrshrn.u16 d28, q8, #8
- vrshrn.u16 d29, q9, #8
- vrshrn.u16 d30, q10, #8
- vrshrn.u16 d31, q11, #8
- vqadd.u8 q14, q2, q14
- /* 1 cycle bubble */
- vqadd.u8 q15, q3, q15
-.endm
-
-.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
- fetch_src_pixblock
- vrshrn.u16 d28, q8, #8
- fetch_mask_pixblock
- vrshrn.u16 d29, q9, #8
- vmull.u8 q8, d27, d0
- vrshrn.u16 d30, q10, #8
- vmull.u8 q9, d27, d1
- vrshrn.u16 d31, q11, #8
- vmull.u8 q10, d27, d2
- vqadd.u8 q14, q2, q14
- vmull.u8 q11, d27, d3
- vqadd.u8 q15, q3, q15
- vrsra.u16 q8, q8, #8
- vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
- vrsra.u16 q9, q9, #8
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- vrsra.u16 q10, q10, #8
-
- cache_preload 8, 8
-
- vrsra.u16 q11, q11, #8
-.endm
-
-generate_composite_function \
- pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_add_8888_8888_8888_process_pixblock_head, \
- pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
- pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
-
-generate_composite_function_single_scanline \
- pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- default_init, \
- default_cleanup, \
- pixman_composite_add_8888_8888_8888_process_pixblock_head, \
- pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
- pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
-
-/******************************************************************************/
-
-generate_composite_function \
- pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_add_8888_8888_8888_process_pixblock_head, \
- pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
- pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 27 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_add_n_8_8888_init
- add DUMMY, sp, #ARGS_STACK_OFFSET
- vld1.32 {d3[0]}, [DUMMY]
- vdup.8 d0, d3[0]
- vdup.8 d1, d3[1]
- vdup.8 d2, d3[2]
- vdup.8 d3, d3[3]
-.endm
-
-.macro pixman_composite_add_n_8_8888_cleanup
-.endm
-
-generate_composite_function \
- pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_add_n_8_8888_init, \
- pixman_composite_add_n_8_8888_cleanup, \
- pixman_composite_add_8888_8888_8888_process_pixblock_head, \
- pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
- pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 27 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_add_8888_n_8888_init
- add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
- vld1.32 {d27[0]}, [DUMMY]
- vdup.8 d27, d27[3]
-.endm
-
-.macro pixman_composite_add_8888_n_8888_cleanup
-.endm
-
-generate_composite_function \
- pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_add_8888_n_8888_init, \
- pixman_composite_add_8888_n_8888_cleanup, \
- pixman_composite_add_8888_8888_8888_process_pixblock_head, \
- pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
- pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 27 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
- /* expecting source data in {d0, d1, d2, d3} */
- /* destination data in {d4, d5, d6, d7} */
- /* solid mask is in d15 */
-
- /* 'in' */
- vmull.u8 q8, d15, d3
- vmull.u8 q6, d15, d2
- vmull.u8 q5, d15, d1
- vmull.u8 q4, d15, d0
- vrshr.u16 q13, q8, #8
- vrshr.u16 q12, q6, #8
- vrshr.u16 q11, q5, #8
- vrshr.u16 q10, q4, #8
- vraddhn.u16 d3, q8, q13
- vraddhn.u16 d2, q6, q12
- vraddhn.u16 d1, q5, q11
- vraddhn.u16 d0, q4, q10
- vmvn.8 d24, d3 /* get inverted alpha */
- /* now do alpha blending */
- vmull.u8 q8, d24, d4
- vmull.u8 q9, d24, d5
- vmull.u8 q10, d24, d6
- vmull.u8 q11, d24, d7
-.endm
-
-.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
- vrshr.u16 q14, q8, #8
- vrshr.u16 q15, q9, #8
- vrshr.u16 q12, q10, #8
- vrshr.u16 q13, q11, #8
- vraddhn.u16 d28, q14, q8
- vraddhn.u16 d29, q15, q9
- vraddhn.u16 d30, q12, q10
- vraddhn.u16 d31, q13, q11
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
- vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
- pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
- fetch_src_pixblock
- cache_preload 8, 8
- fetch_mask_pixblock
- pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
-.endm
-
-generate_composite_function_single_scanline \
- pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- default_init_need_all_regs, \
- default_cleanup_need_all_regs, \
- pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
- pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
- pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 12 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_over_8888_n_8888_process_pixblock_head
- pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
-.endm
-
-.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
- pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
- vqadd.u8 q14, q0, q14
- vqadd.u8 q15, q1, q15
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
- vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
- pixman_composite_over_8888_n_8888_process_pixblock_tail
- fetch_src_pixblock
- cache_preload 8, 8
- pixman_composite_over_8888_n_8888_process_pixblock_head
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
-.endm
-
-.macro pixman_composite_over_8888_n_8888_init
- add DUMMY, sp, #48
- vpush {d8-d15}
- vld1.32 {d15[0]}, [DUMMY]
- vdup.8 d15, d15[3]
-.endm
-
-.macro pixman_composite_over_8888_n_8888_cleanup
- vpop {d8-d15}
-.endm
-
-generate_composite_function \
- pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_over_8888_n_8888_init, \
- pixman_composite_over_8888_n_8888_cleanup, \
- pixman_composite_over_8888_n_8888_process_pixblock_head, \
- pixman_composite_over_8888_n_8888_process_pixblock_tail, \
- pixman_composite_over_8888_n_8888_process_pixblock_tail_head
-
-/******************************************************************************/
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
- vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
- pixman_composite_over_8888_n_8888_process_pixblock_tail
- fetch_src_pixblock
- cache_preload 8, 8
- fetch_mask_pixblock
- pixman_composite_over_8888_n_8888_process_pixblock_head
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
-.endm
-
-generate_composite_function \
- pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- default_init_need_all_regs, \
- default_cleanup_need_all_regs, \
- pixman_composite_over_8888_n_8888_process_pixblock_head, \
- pixman_composite_over_8888_n_8888_process_pixblock_tail, \
- pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 12 /* mask_basereg */
-
-generate_composite_function_single_scanline \
- pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- default_init_need_all_regs, \
- default_cleanup_need_all_regs, \
- pixman_composite_over_8888_n_8888_process_pixblock_head, \
- pixman_composite_over_8888_n_8888_process_pixblock_tail, \
- pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 12 /* mask_basereg */
-
-/******************************************************************************/
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
- vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
- pixman_composite_over_8888_n_8888_process_pixblock_tail
- fetch_src_pixblock
- cache_preload 8, 8
- fetch_mask_pixblock
- pixman_composite_over_8888_n_8888_process_pixblock_head
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
-.endm
-
-generate_composite_function \
- pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- default_init_need_all_regs, \
- default_cleanup_need_all_regs, \
- pixman_composite_over_8888_n_8888_process_pixblock_head, \
- pixman_composite_over_8888_n_8888_process_pixblock_tail, \
- pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 15 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_0888_0888_process_pixblock_head
-.endm
-
-.macro pixman_composite_src_0888_0888_process_pixblock_tail
-.endm
-
-.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
- vst3.8 {d0, d1, d2}, [DST_W]!
- fetch_src_pixblock
- cache_preload 8, 8
-.endm
-
-generate_composite_function \
- pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
- FLAG_DST_WRITEONLY, \
- 8, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_src_0888_0888_process_pixblock_head, \
- pixman_composite_src_0888_0888_process_pixblock_tail, \
- pixman_composite_src_0888_0888_process_pixblock_tail_head, \
- 0, /* dst_w_basereg */ \
- 0, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 0 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
- vswp d0, d2
-.endm
-
-.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
-.endm
-
-.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
- vst4.8 {d0, d1, d2, d3}, [DST_W]!
- fetch_src_pixblock
- vswp d0, d2
- cache_preload 8, 8
-.endm
-
-.macro pixman_composite_src_0888_8888_rev_init
- veor d3, d3, d3
-.endm
-
-generate_composite_function \
- pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
- FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
- pixman_composite_src_0888_8888_rev_init, \
- default_cleanup, \
- pixman_composite_src_0888_8888_rev_process_pixblock_head, \
- pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
- pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
- 0, /* dst_w_basereg */ \
- 0, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 0 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
- vshll.u8 q8, d1, #8
- vshll.u8 q9, d2, #8
-.endm
-
-.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
- vshll.u8 q14, d0, #8
- vsri.u16 q14, q8, #5
- vsri.u16 q14, q9, #11
-.endm
-
-.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
- vshll.u8 q14, d0, #8
- fetch_src_pixblock
- vsri.u16 q14, q8, #5
- vsri.u16 q14, q9, #11
- vshll.u8 q8, d1, #8
- vst1.16 {d28, d29}, [DST_W, :128]!
- vshll.u8 q9, d2, #8
-.endm
-
-generate_composite_function \
- pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
- FLAG_DST_WRITEONLY, \
- 8, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_src_0888_0565_rev_process_pixblock_head, \
- pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
- pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 0, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 0 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
- vmull.u8 q8, d3, d0
- vmull.u8 q9, d3, d1
- vmull.u8 q10, d3, d2
-.endm
-
-.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
- vrshr.u16 q11, q8, #8
- vswp d3, d31
- vrshr.u16 q12, q9, #8
- vrshr.u16 q13, q10, #8
- vraddhn.u16 d30, q11, q8
- vraddhn.u16 d29, q12, q9
- vraddhn.u16 d28, q13, q10
-.endm
-
-.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
- vrshr.u16 q11, q8, #8
- vswp d3, d31
- vrshr.u16 q12, q9, #8
- vrshr.u16 q13, q10, #8
- fetch_src_pixblock
- vraddhn.u16 d30, q11, q8
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
- vraddhn.u16 d29, q12, q9
- vraddhn.u16 d28, q13, q10
- vmull.u8 q8, d3, d0
- vmull.u8 q9, d3, d1
- vmull.u8 q10, d3, d2
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF cmp PF_X, ORIG_W
- PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
-.endm
-
-generate_composite_function \
- pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
- FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_src_pixbuf_8888_process_pixblock_head, \
- pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
- pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 0, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 0 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
- vmull.u8 q8, d3, d0
- vmull.u8 q9, d3, d1
- vmull.u8 q10, d3, d2
-.endm
-
-.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
- vrshr.u16 q11, q8, #8
- vswp d3, d31
- vrshr.u16 q12, q9, #8
- vrshr.u16 q13, q10, #8
- vraddhn.u16 d28, q11, q8
- vraddhn.u16 d29, q12, q9
- vraddhn.u16 d30, q13, q10
-.endm
-
-.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
- vrshr.u16 q11, q8, #8
- vswp d3, d31
- vrshr.u16 q12, q9, #8
- vrshr.u16 q13, q10, #8
- fetch_src_pixblock
- vraddhn.u16 d28, q11, q8
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
- vraddhn.u16 d29, q12, q9
- vraddhn.u16 d30, q13, q10
- vmull.u8 q8, d3, d0
- vmull.u8 q9, d3, d1
- vmull.u8 q10, d3, d2
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF cmp PF_X, ORIG_W
- PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
-.endm
-
-generate_composite_function \
- pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
- FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
- pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
- pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 0, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 0 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_over_0565_8_0565_process_pixblock_head
- /* mask is in d15 */
- convert_0565_to_x888 q4, d2, d1, d0
- convert_0565_to_x888 q5, d6, d5, d4
- /* source pixel data is in {d0, d1, d2, XX} */
- /* destination pixel data is in {d4, d5, d6, XX} */
- vmvn.8 d7, d15
- vmull.u8 q6, d15, d2
- vmull.u8 q5, d15, d1
- vmull.u8 q4, d15, d0
- vmull.u8 q8, d7, d4
- vmull.u8 q9, d7, d5
- vmull.u8 q13, d7, d6
- vrshr.u16 q12, q6, #8
- vrshr.u16 q11, q5, #8
- vrshr.u16 q10, q4, #8
- vraddhn.u16 d2, q6, q12
- vraddhn.u16 d1, q5, q11
- vraddhn.u16 d0, q4, q10
-.endm
-
-.macro pixman_composite_over_0565_8_0565_process_pixblock_tail
- vrshr.u16 q14, q8, #8
- vrshr.u16 q15, q9, #8
- vrshr.u16 q12, q13, #8
- vraddhn.u16 d28, q14, q8
- vraddhn.u16 d29, q15, q9
- vraddhn.u16 d30, q12, q13
- vqadd.u8 q0, q0, q14
- vqadd.u8 q1, q1, q15
- /* 32bpp result is in {d0, d1, d2, XX} */
- convert_8888_to_0565 d2, d1, d0, q14, q15, q3
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
- fetch_mask_pixblock
- pixman_composite_over_0565_8_0565_process_pixblock_tail
- fetch_src_pixblock
- vld1.16 {d10, d11}, [DST_R, :128]!
- cache_preload 8, 8
- pixman_composite_over_0565_8_0565_process_pixblock_head
- vst1.16 {d28, d29}, [DST_W, :128]!
-.endm
-
-generate_composite_function \
- pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
- FLAG_DST_READWRITE, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- default_init_need_all_regs, \
- default_cleanup_need_all_regs, \
- pixman_composite_over_0565_8_0565_process_pixblock_head, \
- pixman_composite_over_0565_8_0565_process_pixblock_tail, \
- pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 10, /* dst_r_basereg */ \
- 8, /* src_basereg */ \
- 15 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_over_0565_n_0565_init
- add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
- vpush {d8-d15}
- vld1.32 {d15[0]}, [DUMMY]
- vdup.8 d15, d15[3]
-.endm
-
-.macro pixman_composite_over_0565_n_0565_cleanup
- vpop {d8-d15}
-.endm
-
-generate_composite_function \
- pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
- FLAG_DST_READWRITE, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_over_0565_n_0565_init, \
- pixman_composite_over_0565_n_0565_cleanup, \
- pixman_composite_over_0565_8_0565_process_pixblock_head, \
- pixman_composite_over_0565_8_0565_process_pixblock_tail, \
- pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 10, /* dst_r_basereg */ \
- 8, /* src_basereg */ \
- 15 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_add_0565_8_0565_process_pixblock_head
- /* mask is in d15 */
- convert_0565_to_x888 q4, d2, d1, d0
- convert_0565_to_x888 q5, d6, d5, d4
- /* source pixel data is in {d0, d1, d2, XX} */
- /* destination pixel data is in {d4, d5, d6, XX} */
- vmull.u8 q6, d15, d2
- vmull.u8 q5, d15, d1
- vmull.u8 q4, d15, d0
- vrshr.u16 q12, q6, #8
- vrshr.u16 q11, q5, #8
- vrshr.u16 q10, q4, #8
- vraddhn.u16 d2, q6, q12
- vraddhn.u16 d1, q5, q11
- vraddhn.u16 d0, q4, q10
-.endm
-
-.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
- vqadd.u8 q0, q0, q2
- vqadd.u8 q1, q1, q3
- /* 32bpp result is in {d0, d1, d2, XX} */
- convert_8888_to_0565 d2, d1, d0, q14, q15, q3
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
- fetch_mask_pixblock
- pixman_composite_add_0565_8_0565_process_pixblock_tail
- fetch_src_pixblock
- vld1.16 {d10, d11}, [DST_R, :128]!
- cache_preload 8, 8
- pixman_composite_add_0565_8_0565_process_pixblock_head
- vst1.16 {d28, d29}, [DST_W, :128]!
-.endm
-
-generate_composite_function \
- pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
- FLAG_DST_READWRITE, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- default_init_need_all_regs, \
- default_cleanup_need_all_regs, \
- pixman_composite_add_0565_8_0565_process_pixblock_head, \
- pixman_composite_add_0565_8_0565_process_pixblock_tail, \
- pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 10, /* dst_r_basereg */ \
- 8, /* src_basereg */ \
- 15 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
- /* mask is in d15 */
- convert_0565_to_x888 q5, d6, d5, d4
- /* destination pixel data is in {d4, d5, d6, xx} */
- vmvn.8 d24, d15 /* get inverted alpha */
- /* now do alpha blending */
- vmull.u8 q8, d24, d4
- vmull.u8 q9, d24, d5
- vmull.u8 q10, d24, d6
-.endm
-
-.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
- vrshr.u16 q14, q8, #8
- vrshr.u16 q15, q9, #8
- vrshr.u16 q12, q10, #8
- vraddhn.u16 d0, q14, q8
- vraddhn.u16 d1, q15, q9
- vraddhn.u16 d2, q12, q10
- /* 32bpp result is in {d0, d1, d2, XX} */
- convert_8888_to_0565 d2, d1, d0, q14, q15, q3
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
- fetch_src_pixblock
- pixman_composite_out_reverse_8_0565_process_pixblock_tail
- vld1.16 {d10, d11}, [DST_R, :128]!
- cache_preload 8, 8
- pixman_composite_out_reverse_8_0565_process_pixblock_head
- vst1.16 {d28, d29}, [DST_W, :128]!
-.endm
-
-generate_composite_function \
- pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
- FLAG_DST_READWRITE, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- default_init_need_all_regs, \
- default_cleanup_need_all_regs, \
- pixman_composite_out_reverse_8_0565_process_pixblock_head, \
- pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
- pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 10, /* dst_r_basereg */ \
- 15, /* src_basereg */ \
- 0 /* mask_basereg */
-
-/******************************************************************************/
-
-generate_composite_function_nearest_scanline \
- pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- default_init, \
- default_cleanup, \
- pixman_composite_over_8888_8888_process_pixblock_head, \
- pixman_composite_over_8888_8888_process_pixblock_tail, \
- pixman_composite_over_8888_8888_process_pixblock_tail_head
-
-generate_composite_function_nearest_scanline \
- pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- default_init, \
- default_cleanup, \
- pixman_composite_over_8888_0565_process_pixblock_head, \
- pixman_composite_over_8888_0565_process_pixblock_tail, \
- pixman_composite_over_8888_0565_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 24 /* mask_basereg */
-
-generate_composite_function_nearest_scanline \
- pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
- FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- default_init, \
- default_cleanup, \
- pixman_composite_src_8888_0565_process_pixblock_head, \
- pixman_composite_src_8888_0565_process_pixblock_tail, \
- pixman_composite_src_8888_0565_process_pixblock_tail_head
-
-generate_composite_function_nearest_scanline \
- pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
- FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- default_init, \
- default_cleanup, \
- pixman_composite_src_0565_8888_process_pixblock_head, \
- pixman_composite_src_0565_8888_process_pixblock_tail, \
- pixman_composite_src_0565_8888_process_pixblock_tail_head
+/*
+ * Copyright © 2009 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+/*
+ * This file contains implementations of NEON optimized pixel processing
+ * functions. There is no full and detailed tutorial, but some functions
+ * (those which are exposing some new or interesting features) are
+ * extensively commented and can be used as examples.
+ *
+ * You may want to have a look at the comments for following functions:
+ * - pixman_composite_over_8888_0565_asm_neon
+ * - pixman_composite_over_n_8_0565_asm_neon
+ */
+
+/* Prevent the stack from becoming executable for no reason... */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+ .text
+ .fpu neon
+ .arch armv7a
+ .object_arch armv4
+ .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
+ .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
+ .arm
+ .altmacro
+
+#include "pixman-arm-neon-asm.h"
+
+/* Global configuration options and preferences */
+
+/*
+ * The code can optionally make use of unaligned memory accesses to improve
+ * performance of handling leading/trailing pixels for each scanline.
+ * The configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0,
+ * for example on Linux, if unaligned memory accesses are not configured
+ * to generate exceptions.
+ */
+.set RESPECT_STRICT_ALIGNMENT, 1
+
+/*
+ * Set default prefetch type. There is a choice between the following options:
+ *
+ * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
+ * as NOP to workaround some HW bugs or for whatever other reason)
+ *
+ * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
+ * advanced prefetch introduces heavy overhead)
+ *
+ * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
+ * which can run ARM and NEON instructions simultaneously so that extra ARM
+ * instructions do not add (many) extra cycles, but improve prefetch efficiency)
+ *
+ * Note: some types of functions can't support advanced prefetch and fall
+ * back to the simple one (those which handle 24bpp pixels)
+ */
+.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
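+
+/*
+ * As a sketch (not enabled here): a build targeting a simple single-issue
+ * core that suffers from the advanced prefetch overhead could instead use
+ *
+ *     .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_SIMPLE
+ */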
+
+/* Prefetch distance in pixels for simple prefetch */
+.set PREFETCH_DISTANCE_SIMPLE, 64
+
+/*
+ * Implementation of pixman_composite_over_8888_0565_asm_neon
+ *
+ * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
+ * performs OVER compositing operation. Function fast_composite_over_8888_0565
+ * from pixman-fast-path.c does the same in C and can be used as a reference.
+ *
+ * First we need to have some NEON assembly code which can do the actual
+ * operation on the pixels and provide it to the template macro.
+ *
+ * The template macro quite conveniently takes care of emitting all the
+ * necessary code for memory reads and writes (including the quite tricky
+ * cases of handling unaligned leading/trailing pixels), so we only deal with
+ * the data in NEON registers.
+ *
+ * The recommended general NEON register allocation is the following:
+ * d0, d1, d2, d3 - contain loaded source pixel data
+ * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
+ * d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used)
+ * d28, d29, d30, d31 - place for storing the result (destination pixels)
+ *
+ * As can be seen above, four 64-bit NEON registers are used for keeping
+ * intermediate pixel data and up to 8 pixels can be processed in one step
+ * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
+ *
+ * This particular function uses the following register allocation:
+ * d0, d1, d2, d3 - contain loaded source pixel data
+ * d4, d5 - contain loaded destination pixels (they are needed)
+ * d28, d29 - place for storing the result (destination pixels)
+ */
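+
+/*
+ * For reference (a restatement of the math, nothing new functionally): per
+ * 8-bit color channel the OVER operator computed below is
+ *
+ *     dst' = src + DIV255 (dst * (255 - src_alpha))
+ *
+ * with premultiplied source data, where DIV255 (x) is approximated as
+ * (x + ((x + 128) >> 8) + 128) >> 8. This is what the VRSHR.U16 #8 /
+ * VRADDHN.U16 pairs in the macros below implement.
+ */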
+
+/*
+ * Step one. We need some code to do the arithmetic on the pixel data.
+ * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
+ * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
+ * perform all the needed calculations and write the result to {d28, d29}.
+ * The rationale for having two macros and not just one will be explained
+ * later. In practice, any single monolithic function which does the work
+ * can be split into two parts at an arbitrary point without affecting
+ * correctness.
+ *
+ * There is one special trick here too. The common template macro can
+ * optionally make our life a bit easier by deinterleaving the R, G, B, A
+ * color components for 32bpp pixel formats (this feature is used in the
+ * 'pixman_composite_over_8888_0565_asm_neon' function). It means that
+ * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
+ * actually use d0 register for blue channel (a vector of eight 8-bit
+ * values), d1 register for green, d2 for red and d3 for alpha. This
+ * simple conversion can be also done with a few NEON instructions:
+ *
+ * Packed to planar conversion:
+ * vuzp.8 d0, d1
+ * vuzp.8 d2, d3
+ * vuzp.8 d1, d3
+ * vuzp.8 d0, d2
+ *
+ * Planar to packed conversion:
+ * vzip.8 d0, d2
+ * vzip.8 d1, d3
+ * vzip.8 d2, d3
+ * vzip.8 d0, d1
+ *
+ * But pixels can also be loaded directly in planar format using the VLD4.8
+ * NEON instruction. It is 1 cycle slower than VLD1.32, so this is not
+ * always desirable; that's why deinterleaving is optional.
+ *
+ * But anyway, here is the code:
+ */
+.macro pixman_composite_over_8888_0565_process_pixblock_head
+ /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+ and put data into d6 - red, d7 - green, d30 - blue */
+ vshrn.u16 d6, q2, #8
+ vshrn.u16 d7, q2, #3
+ vsli.u16 q2, q2, #5
+ vsri.u8 d6, d6, #5
+ vmvn.8 d3, d3 /* invert source alpha */
+ vsri.u8 d7, d7, #6
+ vshrn.u16 d30, q2, #2
+ /* now do alpha blending, storing results in 8-bit planar format
+ into d16 - red, d19 - green, d18 - blue */
+ vmull.u8 q10, d3, d6
+ vmull.u8 q11, d3, d7
+ vmull.u8 q12, d3, d30
+ vrshr.u16 q13, q10, #8
+ vrshr.u16 q3, q11, #8
+ vrshr.u16 q15, q12, #8
+ vraddhn.u16 d20, q10, q13
+ vraddhn.u16 d23, q11, q3
+ vraddhn.u16 d22, q12, q15
+.endm
+
+.macro pixman_composite_over_8888_0565_process_pixblock_tail
+ /* ... continue alpha blending */
+ vqadd.u8 d16, d2, d20
+ vqadd.u8 q9, q0, q11
+ /* convert the result to r5g6b5 and store it into {d28, d29} */
+ vshll.u8 q14, d16, #8
+ vshll.u8 q8, d19, #8
+ vshll.u8 q9, d18, #8
+ vsri.u16 q14, q8, #5
+ vsri.u16 q14, q9, #11
+.endm
+
+/*
+ * OK, now we have almost everything that we need. Using the above two
+ * macros, the work can already be done correctly. But now we want to
+ * optimize it a bit. The ARM Cortex-A8 is an in-order core and benefits
+ * a lot from good code scheduling and software pipelining.
+ *
+ * Let's construct some code, which will run in the core main loop.
+ * Some pseudo-code of the main loop will look like this:
+ * head
+ * while (...) {
+ * tail
+ * head
+ * }
+ * tail
+ *
+ * It may look a bit weird, but this setup allows instruction latencies to
+ * be hidden better and also utilizes the dual-issue capability more
+ * efficiently (pairing load-store and ALU instructions).
+ *
+ * So what we need now is a '*_tail_head' macro, which will be used
+ * in the core main loop. A trivial straightforward implementation
+ * of this macro would look like this:
+ *
+ * pixman_composite_over_8888_0565_process_pixblock_tail
+ * vst1.16 {d28, d29}, [DST_W, :128]!
+ * vld1.16 {d4, d5}, [DST_R, :128]!
+ * vld4.32 {d0, d1, d2, d3}, [SRC]!
+ * pixman_composite_over_8888_0565_process_pixblock_head
+ * cache_preload 8, 8
+ *
+ * Note that it now also contains some VLD/VST instructions. We simply can't
+ * move from processing one block of pixels to the next with arithmetic
+ * alone. The previously processed data needs to be written to memory and
+ * new data needs to be fetched. Fortunately, this main loop does not deal
+ * with partial leading/trailing pixels and can load/store a full block of
+ * pixels in bulk. Additionally, the destination buffer is already 16-byte
+ * aligned here (which is good for performance).
+ *
+ * New things here are the DST_R, DST_W, SRC and MASK identifiers. These
+ * are aliases for the ARM registers which are used as pointers for
+ * accessing data. We maintain separate pointers for reading and writing
+ * the destination buffer (DST_R and DST_W).
+ *
+ * Another new thing is the 'cache_preload' macro. It is used to prefetch
+ * data into the CPU L2 cache and improves performance when dealing with
+ * images which are far larger than the cache. It takes one argument
+ * (actually two, but they need to be the same here) - the number of pixels
+ * in a block. Looking into 'pixman-arm-neon-asm.h' provides some more
+ * details about this macro. Moreover, if good performance is needed,
+ * the code from this macro needs to be copied into the '*_tail_head' macro
+ * and mixed with the rest of the code for optimal instruction scheduling.
+ * We actually do that below.
+ *
+ * Now after all the explanations, here is the optimized code.
+ * Different instruction streams (originating from '*_head', '*_tail'
+ * and 'cache_preload' macro) use different indentation levels for
+ * better readability. Actually taking the code from one of these
+ * indentation levels and ignoring a few VLD/VST instructions would
+ * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
+ * macro!
+ */
+
+#if 1
+
+.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
+ vqadd.u8 d16, d2, d20
+ vld1.16 {d4, d5}, [DST_R, :128]!
+ vqadd.u8 q9, q0, q11
+ vshrn.u16 d6, q2, #8
+ fetch_src_pixblock
+ vshrn.u16 d7, q2, #3
+ vsli.u16 q2, q2, #5
+ vshll.u8 q14, d16, #8
+ PF add PF_X, PF_X, #8
+ vshll.u8 q8, d19, #8
+ PF tst PF_CTL, #0xF
+ vsri.u8 d6, d6, #5
+ PF addne PF_X, PF_X, #8
+ vmvn.8 d3, d3
+ PF subne PF_CTL, PF_CTL, #1
+ vsri.u8 d7, d7, #6
+ vshrn.u16 d30, q2, #2
+ vmull.u8 q10, d3, d6
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ vmull.u8 q11, d3, d7
+ vmull.u8 q12, d3, d30
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ vsri.u16 q14, q8, #5
+ PF cmp PF_X, ORIG_W
+ vshll.u8 q9, d18, #8
+ vrshr.u16 q13, q10, #8
+ PF subge PF_X, PF_X, ORIG_W
+ vrshr.u16 q3, q11, #8
+ vrshr.u16 q15, q12, #8
+ PF subges PF_CTL, PF_CTL, #0x10
+ vsri.u16 q14, q9, #11
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ vraddhn.u16 d20, q10, q13
+ vraddhn.u16 d23, q11, q3
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vraddhn.u16 d22, q12, q15
+ vst1.16 {d28, d29}, [DST_W, :128]!
+.endm
+
+#else
+
+/* If we did not care much about the performance, we would just use this... */
+.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
+ pixman_composite_over_8888_0565_process_pixblock_tail
+ vst1.16 {d28, d29}, [DST_W, :128]!
+ vld1.16 {d4, d5}, [DST_R, :128]!
+ fetch_src_pixblock
+ pixman_composite_over_8888_0565_process_pixblock_head
+ cache_preload 8, 8
+.endm
+
+#endif
+
+/*
+ * And now the final part. We are using 'generate_composite_function' macro
+ * to put all the stuff together. We are specifying the name of the function
+ * which we want to get, number of bits per pixel for the source, mask and
+ * destination (0 if unused, like mask in this case). Next come some bit
+ * flags:
+ * FLAG_DST_READWRITE - tells that the destination buffer is both read
+ *                       and written; for a write-only buffer we would use
+ *                       the FLAG_DST_WRITEONLY flag instead
+ * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
+ * and separate color channels for 32bpp format.
+ * The next things are:
+ * - the number of pixels processed per iteration (8 in this case, because
+ *   that's the maximum that can fit into four 64-bit NEON registers).
+ * - prefetch distance, measured in pixel blocks. In this case it is 5 blocks
+ *   of 8 pixels each, i.e. 40 pixels, or up to 160 bytes of 32bpp source
+ *   data. The optimal prefetch distance can be selected by running benchmarks.
+ *
+ * After that we specify some macros: here these are 'default_init' and
+ * 'default_cleanup', which are empty (but custom init/cleanup macros can
+ * be used to save/restore some extra NEON registers like d8-d15 or do
+ * anything else), followed by
+ * 'pixman_composite_over_8888_0565_process_pixblock_head',
+ * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
+ * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
+ * which we got implemented above.
+ *
+ * The last part is the NEON register allocation scheme.
+ */
+generate_composite_function \
+ pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_0565_process_pixblock_head, \
+ pixman_composite_over_8888_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_0565_process_pixblock_head
+ /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+ and put data into d6 - red, d7 - green, d30 - blue */
+ vshrn.u16 d6, q2, #8
+ vshrn.u16 d7, q2, #3
+ vsli.u16 q2, q2, #5
+ vsri.u8 d6, d6, #5
+ vsri.u8 d7, d7, #6
+ vshrn.u16 d30, q2, #2
+ /* now do alpha blending, storing results in 8-bit planar format
+ into d16 - red, d19 - green, d18 - blue */
+ vmull.u8 q10, d3, d6
+ vmull.u8 q11, d3, d7
+ vmull.u8 q12, d3, d30
+ vrshr.u16 q13, q10, #8
+ vrshr.u16 q3, q11, #8
+ vrshr.u16 q15, q12, #8
+ vraddhn.u16 d20, q10, q13
+ vraddhn.u16 d23, q11, q3
+ vraddhn.u16 d22, q12, q15
+.endm
+
+.macro pixman_composite_over_n_0565_process_pixblock_tail
+ /* ... continue alpha blending */
+ vqadd.u8 d16, d2, d20
+ vqadd.u8 q9, q0, q11
+ /* convert the result to r5g6b5 and store it into {d28, d29} */
+ vshll.u8 q14, d16, #8
+ vshll.u8 q8, d19, #8
+ vshll.u8 q9, d18, #8
+ vsri.u16 q14, q8, #5
+ vsri.u16 q14, q9, #11
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_n_0565_process_pixblock_tail_head
+ pixman_composite_over_n_0565_process_pixblock_tail
+ vld1.16 {d4, d5}, [DST_R, :128]!
+ vst1.16 {d28, d29}, [DST_W, :128]!
+ pixman_composite_over_n_0565_process_pixblock_head
+ cache_preload 8, 8
+.endm
+
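+/*
+ * Note on the init macro below: the solid source color is fetched from the
+ * stack only once, its channels are splatted into d0-d3 in planar form, and
+ * the alpha is pre-inverted here so that the per-pixel block does not need
+ * a VMVN on every iteration (compare with the 8888_0565 head macro above).
+ */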
+.macro pixman_composite_over_n_0565_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d3[0]}, [DUMMY]
+ vdup.8 d0, d3[0]
+ vdup.8 d1, d3[1]
+ vdup.8 d2, d3[2]
+ vdup.8 d3, d3[3]
+ vmvn.8 d3, d3 /* invert source alpha */
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_0565_init, \
+ default_cleanup, \
+ pixman_composite_over_n_0565_process_pixblock_head, \
+ pixman_composite_over_n_0565_process_pixblock_tail, \
+ pixman_composite_over_n_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
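+/*
+ * Conversion sketch for the macros below: VSHLL.U8 #8 moves each 8-bit
+ * channel (d0 - blue, d1 - green, d2 - red) into the top byte of a 16-bit
+ * lane, and the two VSRI shift-inserts in the tail keep only the top
+ * 5/6/5 bits of each channel, leaving packed r5g6b5 pixels in {d28, d29}.
+ */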
+.macro pixman_composite_src_8888_0565_process_pixblock_head
+ vshll.u8 q8, d1, #8
+ vshll.u8 q14, d2, #8
+ vshll.u8 q9, d0, #8
+.endm
+
+.macro pixman_composite_src_8888_0565_process_pixblock_tail
+ vsri.u16 q14, q8, #5
+ vsri.u16 q14, q9, #11
+.endm
+
+.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
+ vsri.u16 q14, q8, #5
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
+ fetch_src_pixblock
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vsri.u16 q14, q9, #11
+ PF cmp PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ vshll.u8 q8, d1, #8
+ vst1.16 {d28, d29}, [DST_W, :128]!
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
+ vshll.u8 q14, d2, #8
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ vshll.u8 q9, d0, #8
+.endm
+
+generate_composite_function \
+ pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_8888_0565_process_pixblock_head, \
+ pixman_composite_src_8888_0565_process_pixblock_tail, \
+ pixman_composite_src_8888_0565_process_pixblock_tail_head
+
+/******************************************************************************/
+
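+/*
+ * Conversion sketch for the macros below: each 5- or 6-bit field is widened
+ * to 8 bits by replicating its top bits into the low bits, and the alpha
+ * channel is simply forced to 0xff, leaving planar 8-bit results in
+ * {d28, d29, d30, d31}.
+ */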
+.macro pixman_composite_src_0565_8888_process_pixblock_head
+ vshrn.u16 d30, q0, #8
+ vshrn.u16 d29, q0, #3
+ vsli.u16 q0, q0, #5
+ vmov.u8 d31, #255
+ vsri.u8 d30, d30, #5
+ vsri.u8 d29, d29, #6
+ vshrn.u16 d28, q0, #2
+.endm
+
+.macro pixman_composite_src_0565_8888_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
+ pixman_composite_src_0565_8888_process_pixblock_tail
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ fetch_src_pixblock
+ pixman_composite_src_0565_8888_process_pixblock_head
+ cache_preload 8, 8
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_0565_8888_process_pixblock_head, \
+ pixman_composite_src_0565_8888_process_pixblock_tail, \
+ pixman_composite_src_0565_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
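+/*
+ * The ADD operator on a8 data is just a saturating byte add: VQADD clamps
+ * the result at 255, so no widening or division by 255 is needed. Four
+ * 64-bit registers hold 32 8-bit pixels, hence the 32-pixel block size
+ * used below.
+ */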
+.macro pixman_composite_add_8_8_process_pixblock_head
+ vqadd.u8 q14, q0, q2
+ vqadd.u8 q15, q1, q3
+.endm
+
+.macro pixman_composite_add_8_8_process_pixblock_tail
+.endm
+
+.macro pixman_composite_add_8_8_process_pixblock_tail_head
+ fetch_src_pixblock
+ PF add PF_X, PF_X, #32
+ PF tst PF_CTL, #0xF
+ vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ PF addne PF_X, PF_X, #32
+ PF subne PF_CTL, PF_CTL, #1
+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ PF cmp PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
+ vqadd.u8 q14, q0, q2
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vqadd.u8 q15, q1, q3
+.endm
+
+generate_composite_function \
+ pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8_8_process_pixblock_head, \
+ pixman_composite_add_8_8_process_pixblock_tail, \
+ pixman_composite_add_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
+ fetch_src_pixblock
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
+ vld1.32 {d4, d5, d6, d7}, [DST_R, :128]!
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vst1.32 {d28, d29, d30, d31}, [DST_W, :128]!
+ PF cmp PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
+ vqadd.u8 q14, q0, q2
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vqadd.u8 q15, q1, q3
+.endm
+
+generate_composite_function \
+ pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8_8_process_pixblock_head, \
+ pixman_composite_add_8_8_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8_8_process_pixblock_head, \
+ pixman_composite_add_8_8_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
+ vmvn.8 d24, d3 /* get inverted alpha */
+ /* do alpha blending */
+ vmull.u8 q8, d24, d4
+ vmull.u8 q9, d24, d5
+ vmull.u8 q10, d24, d6
+ vmull.u8 q11, d24, d7
+.endm
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+.endm
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ vrshr.u16 q14, q8, #8
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ PF cmp PF_X, ORIG_W
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+ fetch_src_pixblock
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ vmvn.8 d22, d3
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ PF subge PF_X, PF_X, ORIG_W
+ vmull.u8 q8, d22, d4
+ PF subges PF_CTL, PF_CTL, #0x10
+ vmull.u8 q9, d22, d5
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ vmull.u8 q10, d22, d6
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vmull.u8 q11, d22, d7
+.endm
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
+ pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
+ pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
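+/*
+ * OVER is built here on top of OUT_REVERSE: the head/tail macros compute
+ * dst * (255 - src_alpha) / 255 by reusing the out_reverse macros, and the
+ * over tail then adds the premultiplied source with a saturating VQADD.
+ */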
+.macro pixman_composite_over_8888_8888_process_pixblock_head
+ pixman_composite_out_reverse_8888_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_8888_8888_process_pixblock_tail
+ pixman_composite_out_reverse_8888_8888_process_pixblock_tail
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+.endm
+
+.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ vrshr.u16 q14, q8, #8
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ PF cmp PF_X, ORIG_W
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+ fetch_src_pixblock
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ vmvn.8 d22, d3
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ PF subge PF_X, PF_X, ORIG_W
+ vmull.u8 q8, d22, d4
+ PF subges PF_CTL, PF_CTL, #0x10
+ vmull.u8 q9, d22, d5
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ vmull.u8 q10, d22, d6
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vmull.u8 q11, d22, d7
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_8888_process_pixblock_head, \
+ pixman_composite_over_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_8888_process_pixblock_head, \
+ pixman_composite_over_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_n_8888_process_pixblock_tail_head
+ pixman_composite_over_8888_8888_process_pixblock_tail
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ pixman_composite_over_8888_8888_process_pixblock_head
+ cache_preload 8, 8
+.endm
+
+.macro pixman_composite_over_n_8888_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d3[0]}, [DUMMY]
+ vdup.8 d0, d3[0]
+ vdup.8 d1, d3[1]
+ vdup.8 d2, d3[2]
+ vdup.8 d3, d3[3]
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8888_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_8888_process_pixblock_head, \
+ pixman_composite_over_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_n_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
+ vrshr.u16 q14, q8, #8
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ PF cmp PF_X, ORIG_W
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+ vld4.8 {d0, d1, d2, d3}, [DST_R, :128]!
+ vmvn.8 d22, d3
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ PF subge PF_X, PF_X, ORIG_W
+ vmull.u8 q8, d22, d4
+ PF subges PF_CTL, PF_CTL, #0x10
+ vmull.u8 q9, d22, d5
+ vmull.u8 q10, d22, d6
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vmull.u8 q11, d22, d7
+.endm
+
+.macro pixman_composite_over_reverse_n_8888_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d7[0]}, [DUMMY]
+ vdup.8 d4, d7[0]
+ vdup.8 d5, d7[1]
+ vdup.8 d6, d7[2]
+ vdup.8 d7, d7[3]
+.endm
+
+generate_composite_function \
+ pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_reverse_n_8888_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_8888_process_pixblock_head, \
+ pixman_composite_over_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 4, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_head
+ vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */
+ vmull.u8 q1, d24, d9
+ vmull.u8 q6, d24, d10
+ vmull.u8 q7, d24, d11
+ vshrn.u16 d6, q2, #8 /* convert DST_R data to 32-bpp (part1) */
+ vshrn.u16 d7, q2, #3
+ vsli.u16 q2, q2, #5
+ vrshr.u16 q8, q0, #8 /* IN for SRC pixels (part2) */
+ vrshr.u16 q9, q1, #8
+ vrshr.u16 q10, q6, #8
+ vrshr.u16 q11, q7, #8
+ vraddhn.u16 d0, q0, q8
+ vraddhn.u16 d1, q1, q9
+ vraddhn.u16 d2, q6, q10
+ vraddhn.u16 d3, q7, q11
+ vsri.u8 d6, d6, #5 /* convert DST_R data to 32-bpp (part2) */
+ vsri.u8 d7, d7, #6
+ vmvn.8 d3, d3
+ vshrn.u16 d30, q2, #2
+ vmull.u8 q8, d3, d6 /* now do alpha blending */
+ vmull.u8 q9, d3, d7
+ vmull.u8 q10, d3, d30
+.endm
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
+ /* 3 cycle bubble (after vmull.u8) */
+ vrshr.u16 q13, q8, #8
+ vrshr.u16 q11, q9, #8
+ vrshr.u16 q15, q10, #8
+ vraddhn.u16 d16, q8, q13
+ vraddhn.u16 d27, q9, q11
+ vraddhn.u16 d26, q10, q15
+ vqadd.u8 d16, d2, d16
+ /* 1 cycle bubble */
+ vqadd.u8 q9, q0, q13
+ vshll.u8 q14, d16, #8 /* convert to 16bpp */
+ vshll.u8 q8, d19, #8
+ vshll.u8 q9, d18, #8
+ vsri.u16 q14, q8, #5
+ /* 1 cycle bubble */
+ vsri.u16 q14, q9, #11
+.endm
+
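The head/tail pair above spends several instructions unpacking the r5g6b5 destination into 8-bit channels and repacking the result. A plain C equivalent of those conversions (expand_0565 and pack_0565 are illustrative names; the NEON code performs the same bit replication with vshrn/vsli/vsri on eight pixels at once):

    #include <stdint.h>

    /* r5g6b5 -> 8-bit channels, replicating the top bits into the low bits. */
    static void expand_0565 (uint16_t p, uint8_t *r, uint8_t *g, uint8_t *b)
    {
        *r = (uint8_t) (((p >> 11) << 3) | ((p >> 13) & 0x7));
        *g = (uint8_t) ((((p >> 5) & 0x3f) << 2) | ((p >> 9) & 0x3));
        *b = (uint8_t) (((p & 0x1f) << 3) | ((p >> 2) & 0x7));
    }

    /* 8-bit channels -> r5g6b5 (what the vshll/vsri steps in the tail do). */
    static uint16_t pack_0565 (uint8_t r, uint8_t g, uint8_t b)
    {
        return (uint16_t) (((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
    }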
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
+ vld1.16 {d4, d5}, [DST_R, :128]!
+ vshrn.u16 d6, q2, #8
+ fetch_mask_pixblock
+ vshrn.u16 d7, q2, #3
+ fetch_src_pixblock
+ vmull.u8 q6, d24, d10
+ vrshr.u16 q13, q8, #8
+ vrshr.u16 q11, q9, #8
+ vrshr.u16 q15, q10, #8
+ vraddhn.u16 d16, q8, q13
+ vraddhn.u16 d27, q9, q11
+ vraddhn.u16 d26, q10, q15
+ vqadd.u8 d16, d2, d16
+ vmull.u8 q1, d24, d9
+ vqadd.u8 q9, q0, q13
+ vshll.u8 q14, d16, #8
+ vmull.u8 q0, d24, d8
+ vshll.u8 q8, d19, #8
+ vshll.u8 q9, d18, #8
+ vsri.u16 q14, q8, #5
+ vmull.u8 q7, d24, d11
+ vsri.u16 q14, q9, #11
+
+ cache_preload 8, 8
+
+ vsli.u16 q2, q2, #5
+ vrshr.u16 q8, q0, #8
+ vrshr.u16 q9, q1, #8
+ vrshr.u16 q10, q6, #8
+ vrshr.u16 q11, q7, #8
+ vraddhn.u16 d0, q0, q8
+ vraddhn.u16 d1, q1, q9
+ vraddhn.u16 d2, q6, q10
+ vraddhn.u16 d3, q7, q11
+ vsri.u8 d6, d6, #5
+ vsri.u8 d7, d7, #6
+ vmvn.8 d3, d3
+ vshrn.u16 d30, q2, #2
+ vst1.16 {d28, d29}, [DST_W, :128]!
+ vmull.u8 q8, d3, d6
+ vmull.u8 q9, d3, d7
+ vmull.u8 q10, d3, d30
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_8888_8_0565_process_pixblock_head, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+/*
+ * This function needs special initialization for the solid source.
+ * The solid source pixel data is fetched from the stack at offset
+ * ARGS_STACK_OFFSET, split into color components and replicated into
+ * the d8-d11 registers. Additionally, this function needs all of the
+ * NEON registers, so it has to save the d8-d15 registers, which are
+ * callee-saved according to the ABI. These registers are restored in
+ * the 'cleanup' macro. All the other NEON registers are caller-saved,
+ * so they can be clobbered freely without introducing any problems.
+ */
+.macro pixman_composite_over_n_8_0565_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vpush {d8-d15}
+ vld1.32 {d11[0]}, [DUMMY]
+ vdup.8 d8, d11[0]
+ vdup.8 d9, d11[1]
+ vdup.8 d10, d11[2]
+ vdup.8 d11, d11[3]
+.endm
+
+.macro pixman_composite_over_n_8_0565_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8_0565_init, \
+ pixman_composite_over_n_8_0565_cleanup, \
+ pixman_composite_over_8888_8_0565_process_pixblock_head, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_n_0565_init
+ add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
+ vpush {d8-d15}
+ vld1.32 {d24[0]}, [DUMMY]
+ vdup.8 d24, d24[3]
+.endm
+
+.macro pixman_composite_over_8888_n_0565_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_8888_n_0565_init, \
+ pixman_composite_over_8888_n_0565_cleanup, \
+ pixman_composite_over_8888_8_0565_process_pixblock_head, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0565_0565_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_0565_0565_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
+ vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
+ fetch_src_pixblock
+ cache_preload 16, 16
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
+ FLAG_DST_WRITEONLY, \
+ 16, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_0565_0565_process_pixblock_head, \
+ pixman_composite_src_0565_0565_process_pixblock_tail, \
+ pixman_composite_src_0565_0565_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_8_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_n_8_process_pixblock_tail_head
+ vst1.8 {d0, d1, d2, d3}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_src_n_8_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d0[0]}, [DUMMY]
+ vsli.u64 d0, d0, #8
+ vsli.u64 d0, d0, #16
+ vsli.u64 d0, d0, #32
+ vorr d1, d0, d0
+ vorr q1, q0, q0
+.endm
+
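The init macro above broadcasts the solid 8-bit source value across a whole 64-bit lane using three shift-and-insert steps before fanning it out to the remaining registers. A C sketch of that replication (replicate_u8 is an illustrative name; the plain OR form below assumes the value starts zero-extended, whereas the vsli shift-and-insert form does not care what the upper bits initially contain):

    #include <stdint.h>

    static uint64_t replicate_u8 (uint8_t v)
    {
        uint64_t x = v;
        x |= x << 8;    /* vsli.u64 d0, d0, #8  */
        x |= x << 16;   /* vsli.u64 d0, d0, #16 */
        x |= x << 32;   /* vsli.u64 d0, d0, #32 */
        return x;       /* vorr then copies this into d1 and q1 */
    }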
+.macro pixman_composite_src_n_8_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
+ FLAG_DST_WRITEONLY, \
+ 32, /* number of pixels, processed in a single block */ \
+ 0, /* prefetch distance */ \
+ pixman_composite_src_n_8_init, \
+ pixman_composite_src_n_8_cleanup, \
+ pixman_composite_src_n_8_process_pixblock_head, \
+ pixman_composite_src_n_8_process_pixblock_tail, \
+ pixman_composite_src_n_8_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_0565_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_0565_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_n_0565_process_pixblock_tail_head
+ vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_src_n_0565_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d0[0]}, [DUMMY]
+ vsli.u64 d0, d0, #16
+ vsli.u64 d0, d0, #32
+ vorr d1, d0, d0
+ vorr q1, q0, q0
+.endm
+
+.macro pixman_composite_src_n_0565_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
+ FLAG_DST_WRITEONLY, \
+ 16, /* number of pixels, processed in a single block */ \
+ 0, /* prefetch distance */ \
+ pixman_composite_src_n_0565_init, \
+ pixman_composite_src_n_0565_cleanup, \
+ pixman_composite_src_n_0565_process_pixblock_head, \
+ pixman_composite_src_n_0565_process_pixblock_tail, \
+ pixman_composite_src_n_0565_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_n_8888_process_pixblock_tail_head
+ vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_src_n_8888_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d0[0]}, [DUMMY]
+ vsli.u64 d0, d0, #32
+ vorr d1, d0, d0
+ vorr q1, q0, q0
+.endm
+
+.macro pixman_composite_src_n_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
+ FLAG_DST_WRITEONLY, \
+ 8, /* number of pixels, processed in a single block */ \
+ 0, /* prefetch distance */ \
+ pixman_composite_src_n_8888_init, \
+ pixman_composite_src_n_8888_cleanup, \
+ pixman_composite_src_n_8888_process_pixblock_head, \
+ pixman_composite_src_n_8888_process_pixblock_tail, \
+ pixman_composite_src_n_8888_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_8888_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_8888_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
+ vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
+ fetch_src_pixblock
+ cache_preload 8, 8
+.endm
+
+generate_composite_function \
+ pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_WRITEONLY, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_8888_8888_process_pixblock_head, \
+ pixman_composite_src_8888_8888_process_pixblock_tail, \
+ pixman_composite_src_8888_8888_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_x888_8888_process_pixblock_head
+ vorr q0, q0, q2
+ vorr q1, q1, q2
+.endm
+
+.macro pixman_composite_src_x888_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
+ vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
+ fetch_src_pixblock
+ vorr q0, q0, q2
+ vorr q1, q1, q2
+ cache_preload 8, 8
+.endm
+
+.macro pixman_composite_src_x888_8888_init
+ vmov.u8 q2, #0xFF
+ vshl.u32 q2, q2, #24
+.endm
+
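The init macro builds the constant 0xff000000 in q2, so converting x8r8g8b8 to a8r8g8b8 in the loop is just an OR with that constant. Per pixel this amounts to (x888_to_8888 is an illustrative name):

    #include <stdint.h>

    static uint32_t x888_to_8888 (uint32_t src)
    {
        return src | 0xff000000u;   /* force the undefined alpha byte to opaque */
    }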
+generate_composite_function \
+ pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_WRITEONLY, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ pixman_composite_src_x888_8888_init, \
+ default_cleanup, \
+ pixman_composite_src_x888_8888_process_pixblock_head, \
+ pixman_composite_src_x888_8888_process_pixblock_tail, \
+ pixman_composite_src_x888_8888_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_head
+ /* expecting deinterleaved source data in {d8, d9, d10, d11} */
+ /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
+ /* and destination data in {d4, d5, d6, d7} */
+ /* mask is in d24 (d25, d26, d27 are unused) */
+
+ /* in */
+ vmull.u8 q0, d24, d8
+ vmull.u8 q1, d24, d9
+ vmull.u8 q6, d24, d10
+ vmull.u8 q7, d24, d11
+ vrshr.u16 q10, q0, #8
+ vrshr.u16 q11, q1, #8
+ vrshr.u16 q12, q6, #8
+ vrshr.u16 q13, q7, #8
+ vraddhn.u16 d0, q0, q10
+ vraddhn.u16 d1, q1, q11
+ vraddhn.u16 d2, q6, q12
+ vraddhn.u16 d3, q7, q13
+ vmvn.8 d24, d3 /* get inverted alpha */
+ /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */
+ /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
+ /* now do alpha blending */
+ vmull.u8 q8, d24, d4
+ vmull.u8 q9, d24, d5
+ vmull.u8 q10, d24, d6
+ vmull.u8 q11, d24, d7
+.endm
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_tail
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+.endm
+
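A C sketch of the per-pixel math for this solid-source, a8-mask OVER (over_n_8_8888_pixel and mul_div_255 are illustrative names; the channel order follows the d8=blue ... d11=alpha layout noted in the head macro, so alpha is the most significant byte):

    #include <stdint.h>

    static inline uint8_t mul_div_255 (uint8_t a, uint8_t b)
    {
        uint16_t t = (uint16_t) a * b + 0x80;
        return (uint8_t) ((t + (t >> 8)) >> 8);
    }

    static uint32_t over_n_8_8888_pixel (uint32_t color, uint8_t mask, uint32_t dst)
    {
        uint8_t  s[4];
        uint32_t result = 0;
        int      i;

        for (i = 0; i < 4; i++)                     /* 'in': color IN mask */
            s[i] = mul_div_255 ((uint8_t) (color >> (8 * i)), mask);

        for (i = 0; i < 4; i++)                     /* OVER, s[3] is alpha */
        {
            uint16_t v = s[i] + mul_div_255 ((uint8_t) (dst >> (8 * i)), 255 - s[3]);
            result |= (uint32_t) (v > 255 ? 255 : v) << (8 * i);
        }
        return result;
    }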
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
+ pixman_composite_over_n_8_8888_process_pixblock_tail
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ fetch_mask_pixblock
+ cache_preload 8, 8
+ pixman_composite_over_n_8_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_n_8_8888_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vpush {d8-d15}
+ vld1.32 {d11[0]}, [DUMMY]
+ vdup.8 d8, d11[0]
+ vdup.8 d9, d11[1]
+ vdup.8 d10, d11[2]
+ vdup.8 d11, d11[3]
+.endm
+
+.macro pixman_composite_over_n_8_8888_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8_8888_init, \
+ pixman_composite_over_n_8_8888_cleanup, \
+ pixman_composite_over_n_8_8888_process_pixblock_head, \
+ pixman_composite_over_n_8_8888_process_pixblock_tail, \
+ pixman_composite_over_n_8_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8_8_process_pixblock_head
+ vmull.u8 q0, d24, d8
+ vmull.u8 q1, d25, d8
+ vmull.u8 q6, d26, d8
+ vmull.u8 q7, d27, d8
+ vrshr.u16 q10, q0, #8
+ vrshr.u16 q11, q1, #8
+ vrshr.u16 q12, q6, #8
+ vrshr.u16 q13, q7, #8
+ vraddhn.u16 d0, q0, q10
+ vraddhn.u16 d1, q1, q11
+ vraddhn.u16 d2, q6, q12
+ vraddhn.u16 d3, q7, q13
+ vmvn.8 q12, q0
+ vmvn.8 q13, q1
+ vmull.u8 q8, d24, d4
+ vmull.u8 q9, d25, d5
+ vmull.u8 q10, d26, d6
+ vmull.u8 q11, d27, d7
+.endm
+
+.macro pixman_composite_over_n_8_8_process_pixblock_tail
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
+ vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ pixman_composite_over_n_8_8_process_pixblock_tail
+ fetch_mask_pixblock
+ cache_preload 32, 32
+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ pixman_composite_over_n_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_n_8_8_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vpush {d8-d15}
+ vld1.32 {d8[0]}, [DUMMY]
+ vdup.8 d8, d8[3]
+.endm
+
+.macro pixman_composite_over_n_8_8_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8_8_init, \
+ pixman_composite_over_n_8_8_cleanup, \
+ pixman_composite_over_n_8_8_process_pixblock_head, \
+ pixman_composite_over_n_8_8_process_pixblock_tail, \
+ pixman_composite_over_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
+ /*
+ * 'combine_mask_ca' replacement
+ *
+ * input: solid src (n) in {d8, d9, d10, d11}
+ * dest in {d4, d5, d6, d7 }
+ * mask in {d24, d25, d26, d27}
+ * output: updated src in {d0, d1, d2, d3 }
+ * updated mask in {d24, d25, d26, d3 }
+ */
+ vmull.u8 q0, d24, d8
+ vmull.u8 q1, d25, d9
+ vmull.u8 q6, d26, d10
+ vmull.u8 q7, d27, d11
+ vmull.u8 q9, d11, d25
+ vmull.u8 q12, d11, d24
+ vmull.u8 q13, d11, d26
+ vrshr.u16 q8, q0, #8
+ vrshr.u16 q10, q1, #8
+ vrshr.u16 q11, q6, #8
+ vraddhn.u16 d0, q0, q8
+ vraddhn.u16 d1, q1, q10
+ vraddhn.u16 d2, q6, q11
+ vrshr.u16 q11, q12, #8
+ vrshr.u16 q8, q9, #8
+ vrshr.u16 q6, q13, #8
+ vrshr.u16 q10, q7, #8
+ vraddhn.u16 d24, q12, q11
+ vraddhn.u16 d25, q9, q8
+ vraddhn.u16 d26, q13, q6
+ vraddhn.u16 d3, q7, q10
+ /*
+ * 'combine_over_ca' replacement
+ *
+ * output: updated dest in {d28, d29, d30, d31}
+ */
+ vmvn.8 d24, d24
+ vmvn.8 d25, d25
+ vmull.u8 q8, d24, d4
+ vmull.u8 q9, d25, d5
+ vmvn.8 d26, d26
+ vmvn.8 d27, d3
+ vmull.u8 q10, d26, d6
+ vmull.u8 q11, d27, d7
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
+ /* ... continue 'combine_over_ca' replacement */
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q6, q10, #8
+ vrshr.u16 q7, q11, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q6, q10
+ vraddhn.u16 d31, q7, q11
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+.endm
+
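Per channel, the component-alpha combine and blend above reduce to the following C sketch (over_n_8888_8888_ca_channel and mul_div_255 are illustrative names; the mask supplies a separate coverage value for every channel, which is why both the scaled source and the scaled mask are needed):

    #include <stdint.h>

    static inline uint8_t mul_div_255 (uint8_t a, uint8_t b)
    {
        uint16_t t = (uint16_t) a * b + 0x80;
        return (uint8_t) ((t + (t >> 8)) >> 8);
    }

    static uint8_t over_n_8888_8888_ca_channel (uint8_t src,        /* solid color channel   */
                                                uint8_t src_alpha,  /* solid color alpha     */
                                                uint8_t mask,       /* mask for this channel */
                                                uint8_t dst)
    {
        uint8_t  s = mul_div_255 (src, mask);         /* 'combine_mask_ca': src IN mask  */
        uint8_t  m = mul_div_255 (mask, src_alpha);   /*                    mask IN srca */
        uint16_t v = s + mul_div_255 (dst, 255 - m);  /* 'combine_over_ca'               */
        return (uint8_t) (v > 255 ? 255 : v);         /* vqadd.u8 saturates              */
    }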
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ vrshr.u16 q6, q10, #8
+ vrshr.u16 q7, q11, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q6, q10
+ vraddhn.u16 d31, q7, q11
+ fetch_mask_pixblock
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+ cache_preload 8, 8
+ pixman_composite_over_n_8888_8888_ca_process_pixblock_head
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vpush {d8-d15}
+ vld1.32 {d11[0]}, [DUMMY]
+ vdup.8 d8, d11[0]
+ vdup.8 d9, d11[1]
+ vdup.8 d10, d11[2]
+ vdup.8 d11, d11[3]
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8888_8888_ca_init, \
+ pixman_composite_over_n_8888_8888_ca_cleanup, \
+ pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
+ pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
+ pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_in_n_8_process_pixblock_head
+ /* expecting source data in {d0, d1, d2, d3} */
+ /* and destination data in {d4, d5, d6, d7} */
+ vmull.u8 q8, d4, d3
+ vmull.u8 q9, d5, d3
+ vmull.u8 q10, d6, d3
+ vmull.u8 q11, d7, d3
+.endm
+
+.macro pixman_composite_in_n_8_process_pixblock_tail
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ vraddhn.u16 d28, q8, q14
+ vraddhn.u16 d29, q9, q15
+ vraddhn.u16 d30, q10, q12
+ vraddhn.u16 d31, q11, q13
+.endm
+
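The IN operator here only needs the alpha of the solid source (replicated across d3); each a8 destination byte is simply scaled by it (in_n_8_pixel is an illustrative name):

    #include <stdint.h>

    static uint8_t in_n_8_pixel (uint8_t dst, uint8_t src_alpha)
    {
        uint16_t t = (uint16_t) dst * src_alpha + 0x80;
        return (uint8_t) ((t + (t >> 8)) >> 8);   /* rounded dst * srca / 255 */
    }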
+.macro pixman_composite_in_n_8_process_pixblock_tail_head
+ pixman_composite_in_n_8_process_pixblock_tail
+ vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ cache_preload 32, 32
+ pixman_composite_in_n_8_process_pixblock_head
+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_in_n_8_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d3[0]}, [DUMMY]
+ vdup.8 d3, d3[3]
+.endm
+
+.macro pixman_composite_in_n_8_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_in_n_8_init, \
+ pixman_composite_in_n_8_cleanup, \
+ pixman_composite_in_n_8_process_pixblock_head, \
+ pixman_composite_in_n_8_process_pixblock_tail, \
+ pixman_composite_in_n_8_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+.macro pixman_composite_add_n_8_8_process_pixblock_head
+ /* expecting source data in {d8, d9, d10, d11} */
+ /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
+ /* and destination data in {d4, d5, d6, d7} */
+ /* mask is in d24, d25, d26, d27 */
+ vmull.u8 q0, d24, d11
+ vmull.u8 q1, d25, d11
+ vmull.u8 q6, d26, d11
+ vmull.u8 q7, d27, d11
+ vrshr.u16 q10, q0, #8
+ vrshr.u16 q11, q1, #8
+ vrshr.u16 q12, q6, #8
+ vrshr.u16 q13, q7, #8
+ vraddhn.u16 d0, q0, q10
+ vraddhn.u16 d1, q1, q11
+ vraddhn.u16 d2, q6, q12
+ vraddhn.u16 d3, q7, q13
+ vqadd.u8 q14, q0, q2
+ vqadd.u8 q15, q1, q3
+.endm
+
+.macro pixman_composite_add_n_8_8_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
+ pixman_composite_add_n_8_8_process_pixblock_tail
+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ fetch_mask_pixblock
+ cache_preload 32, 32
+ pixman_composite_add_n_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_add_n_8_8_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vpush {d8-d15}
+ vld1.32 {d11[0]}, [DUMMY]
+ vdup.8 d11, d11[3]
+.endm
+
+.macro pixman_composite_add_n_8_8_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_add_n_8_8_init, \
+ pixman_composite_add_n_8_8_cleanup, \
+ pixman_composite_add_n_8_8_process_pixblock_head, \
+ pixman_composite_add_n_8_8_process_pixblock_tail, \
+ pixman_composite_add_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8_8_8_process_pixblock_head
+ /* expecting source data in {d0, d1, d2, d3} */
+ /* destination data in {d4, d5, d6, d7} */
+ /* mask in {d24, d25, d26, d27} */
+ vmull.u8 q8, d24, d0
+ vmull.u8 q9, d25, d1
+ vmull.u8 q10, d26, d2
+ vmull.u8 q11, d27, d3
+ vrshr.u16 q0, q8, #8
+ vrshr.u16 q1, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ vraddhn.u16 d0, q0, q8
+ vraddhn.u16 d1, q1, q9
+ vraddhn.u16 d2, q12, q10
+ vraddhn.u16 d3, q13, q11
+ vqadd.u8 q14, q0, q2
+ vqadd.u8 q15, q1, q3
+.endm
+
+.macro pixman_composite_add_8_8_8_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
+ pixman_composite_add_8_8_8_process_pixblock_tail
+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ fetch_mask_pixblock
+ fetch_src_pixblock
+ cache_preload 32, 32
+ pixman_composite_add_8_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_add_8_8_8_init
+.endm
+
+.macro pixman_composite_add_8_8_8_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_add_8_8_8_init, \
+ pixman_composite_add_8_8_8_cleanup, \
+ pixman_composite_add_8_8_8_process_pixblock_head, \
+ pixman_composite_add_8_8_8_process_pixblock_tail, \
+ pixman_composite_add_8_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
+ /* expecting source data in {d0, d1, d2, d3} */
+ /* destination data in {d4, d5, d6, d7} */
+ /* mask in {d24, d25, d26, d27} */
+ vmull.u8 q8, d27, d0
+ vmull.u8 q9, d27, d1
+ vmull.u8 q10, d27, d2
+ vmull.u8 q11, d27, d3
+ /* 1 cycle bubble */
+ vrsra.u16 q8, q8, #8
+ vrsra.u16 q9, q9, #8
+ vrsra.u16 q10, q10, #8
+ vrsra.u16 q11, q11, #8
+.endm
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
+ /* 2 cycle bubble */
+ vrshrn.u16 d28, q8, #8
+ vrshrn.u16 d29, q9, #8
+ vrshrn.u16 d30, q10, #8
+ vrshrn.u16 d31, q11, #8
+ vqadd.u8 q14, q2, q14
+ /* 1 cycle bubble */
+ vqadd.u8 q15, q3, q15
+.endm
+
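Per channel, the masked ADD above scales the source by the mask's alpha and adds it to the destination with unsigned saturation. A C sketch (add_masked_channel is an illustrative name; the two shift steps mirror how the rounded /255 division is split between vrsra.u16 in the head and vrshrn.u16 in the tail):

    #include <stdint.h>

    static uint8_t add_masked_channel (uint8_t src, uint8_t mask_alpha, uint8_t dst)
    {
        uint16_t t = (uint16_t) src * mask_alpha;
        t += (t + 128) >> 8;              /* vrsra.u16  q, q, #8 */
        t  = (t + 128) >> 8;              /* vrshrn.u16 d, q, #8 */
        t += dst;                         /* vqadd.u8 saturates  */
        return (uint8_t) (t > 255 ? 255 : t);
    }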
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+ fetch_src_pixblock
+ vrshrn.u16 d28, q8, #8
+ fetch_mask_pixblock
+ vrshrn.u16 d29, q9, #8
+ vmull.u8 q8, d27, d0
+ vrshrn.u16 d30, q10, #8
+ vmull.u8 q9, d27, d1
+ vrshrn.u16 d31, q11, #8
+ vmull.u8 q10, d27, d2
+ vqadd.u8 q14, q2, q14
+ vmull.u8 q11, d27, d3
+ vqadd.u8 q15, q3, q15
+ vrsra.u16 q8, q8, #8
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ vrsra.u16 q9, q9, #8
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ vrsra.u16 q10, q10, #8
+
+ cache_preload 8, 8
+
+ vrsra.u16 q11, q11, #8
+.endm
+
+generate_composite_function \
+ pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+generate_composite_function \
+ pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 27 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_n_8_8888_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d3[0]}, [DUMMY]
+ vdup.8 d0, d3[0]
+ vdup.8 d1, d3[1]
+ vdup.8 d2, d3[2]
+ vdup.8 d3, d3[3]
+.endm
+
+.macro pixman_composite_add_n_8_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_add_n_8_8888_init, \
+ pixman_composite_add_n_8_8888_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 27 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_n_8888_init
+ add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
+ vld1.32 {d27[0]}, [DUMMY]
+ vdup.8 d27, d27[3]
+.endm
+
+.macro pixman_composite_add_8888_n_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_add_8888_n_8888_init, \
+ pixman_composite_add_8888_n_8888_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 27 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+ /* expecting source data in {d0, d1, d2, d3} */
+ /* destination data in {d4, d5, d6, d7} */
+ /* solid mask is in d15 */
+
+ /* 'in' */
+ vmull.u8 q8, d15, d3
+ vmull.u8 q6, d15, d2
+ vmull.u8 q5, d15, d1
+ vmull.u8 q4, d15, d0
+ vrshr.u16 q13, q8, #8
+ vrshr.u16 q12, q6, #8
+ vrshr.u16 q11, q5, #8
+ vrshr.u16 q10, q4, #8
+ vraddhn.u16 d3, q8, q13
+ vraddhn.u16 d2, q6, q12
+ vraddhn.u16 d1, q5, q11
+ vraddhn.u16 d0, q4, q10
+ vmvn.8 d24, d3 /* get inverted alpha */
+ /* now do alpha blending */
+ vmull.u8 q8, d24, d4
+ vmull.u8 q9, d24, d5
+ vmull.u8 q10, d24, d6
+ vmull.u8 q11, d24, d7
+.endm
+
+.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+.endm
+
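For the plain OUT_REVERSE case only the alpha of (source IN mask) is needed: the destination is scaled by its complement. A C sketch with illustrative names (the OVER variants below reuse the same head/tail and then add the scaled source back with vqadd.u8):

    #include <stdint.h>

    static inline uint8_t mul_div_255 (uint8_t a, uint8_t b)
    {
        uint16_t t = (uint16_t) a * b + 0x80;
        return (uint8_t) ((t + (t >> 8)) >> 8);
    }

    static uint32_t out_reverse_8888_n_8888_pixel (uint32_t src, uint8_t mask, uint32_t dst)
    {
        uint8_t  ia = 255 - mul_div_255 ((uint8_t) (src >> 24), mask);
        uint32_t result = 0;
        int      shift;

        for (shift = 0; shift < 32; shift += 8)
            result |= (uint32_t) mul_div_255 ((uint8_t) (dst >> shift), ia) << shift;

        return result;
    }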
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+ fetch_src_pixblock
+ cache_preload 8, 8
+ fetch_mask_pixblock
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 12 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_n_8888_process_pixblock_head
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ pixman_composite_over_8888_n_8888_process_pixblock_tail
+ fetch_src_pixblock
+ cache_preload 8, 8
+ pixman_composite_over_8888_n_8888_process_pixblock_head
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_over_8888_n_8888_init
+ add DUMMY, sp, #48
+ vpush {d8-d15}
+ vld1.32 {d15[0]}, [DUMMY]
+ vdup.8 d15, d15[3]
+.endm
+
+.macro pixman_composite_over_8888_n_8888_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_8888_n_8888_init, \
+ pixman_composite_over_8888_n_8888_cleanup, \
+ pixman_composite_over_8888_n_8888_process_pixblock_head, \
+ pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_n_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ pixman_composite_over_8888_n_8888_process_pixblock_tail
+ fetch_src_pixblock
+ cache_preload 8, 8
+ fetch_mask_pixblock
+ pixman_composite_over_8888_n_8888_process_pixblock_head
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_8888_n_8888_process_pixblock_head, \
+ pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 12 /* mask_basereg */
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_8888_n_8888_process_pixblock_head, \
+ pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 12 /* mask_basereg */
+
+/******************************************************************************/
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ pixman_composite_over_8888_n_8888_process_pixblock_tail
+ fetch_src_pixblock
+ cache_preload 8, 8
+ fetch_mask_pixblock
+ pixman_composite_over_8888_n_8888_process_pixblock_head
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_8888_n_8888_process_pixblock_head, \
+ pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 15 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_0888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_0888_0888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
+ vst3.8 {d0, d1, d2}, [DST_W]!
+ fetch_src_pixblock
+ cache_preload 8, 8
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
+ FLAG_DST_WRITEONLY, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_0888_0888_process_pixblock_head, \
+ pixman_composite_src_0888_0888_process_pixblock_tail, \
+ pixman_composite_src_0888_0888_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
+ vswp d0, d2
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
+ vst4.8 {d0, d1, d2, d3}, [DST_W]!
+ fetch_src_pixblock
+ vswp d0, d2
+ cache_preload 8, 8
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_init
+ veor d3, d3, d3
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ pixman_composite_src_0888_8888_rev_init, \
+ default_cleanup, \
+ pixman_composite_src_0888_8888_rev_process_pixblock_head, \
+ pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
+ pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
+ vshll.u8 q8, d1, #8
+ vshll.u8 q9, d2, #8
+.endm
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
+ vshll.u8 q14, d0, #8
+ vsri.u16 q14, q8, #5
+ vsri.u16 q14, q9, #11
+.endm
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
+ vshll.u8 q14, d0, #8
+ fetch_src_pixblock
+ vsri.u16 q14, q8, #5
+ vsri.u16 q14, q9, #11
+ vshll.u8 q8, d1, #8
+ vst1.16 {d28, d29}, [DST_W, :128]!
+ vshll.u8 q9, d2, #8
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
+ FLAG_DST_WRITEONLY, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_0888_0565_rev_process_pixblock_head, \
+ pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
+ pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d3, d1
+ vmull.u8 q10, d3, d2
+.endm
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
+ vrshr.u16 q11, q8, #8
+ vswp d3, d31
+ vrshr.u16 q12, q9, #8
+ vrshr.u16 q13, q10, #8
+ vraddhn.u16 d30, q11, q8
+ vraddhn.u16 d29, q12, q9
+ vraddhn.u16 d28, q13, q10
+.endm
+
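The pixbuf paths convert a non-premultiplied source to premultiplied a8r8g8b8: each color channel is scaled by the alpha byte while the alpha itself passes through (the vswp d3, d31). A C sketch of the premultiplication (premultiply and mul_div_255 are illustrative names; judging from the reversed d28/d30 assignments, the pixbuf variant also swaps the red and blue channels on store, while the rpixbuf variant below keeps them in place):

    #include <stdint.h>

    static inline uint8_t mul_div_255 (uint8_t a, uint8_t b)
    {
        uint16_t t = (uint16_t) a * b + 0x80;
        return (uint8_t) ((t + (t >> 8)) >> 8);
    }

    static uint32_t premultiply (uint32_t p)
    {
        uint8_t a  = (uint8_t) (p >> 24);
        uint8_t c0 = mul_div_255 ((uint8_t) p,         a);
        uint8_t c1 = mul_div_255 ((uint8_t) (p >> 8),  a);
        uint8_t c2 = mul_div_255 ((uint8_t) (p >> 16), a);
        return ((uint32_t) a << 24) | ((uint32_t) c2 << 16) | ((uint32_t) c1 << 8) | c0;
    }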
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
+ vrshr.u16 q11, q8, #8
+ vswp d3, d31
+ vrshr.u16 q12, q9, #8
+ vrshr.u16 q13, q10, #8
+ fetch_src_pixblock
+ vraddhn.u16 d30, q11, q8
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vraddhn.u16 d29, q12, q9
+ vraddhn.u16 d28, q13, q10
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d3, d1
+ vmull.u8 q10, d3, d2
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ PF cmp PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+.endm
+
+generate_composite_function \
+ pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_pixbuf_8888_process_pixblock_head, \
+ pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
+ pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d3, d1
+ vmull.u8 q10, d3, d2
+.endm
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
+ vrshr.u16 q11, q8, #8
+ vswp d3, d31
+ vrshr.u16 q12, q9, #8
+ vrshr.u16 q13, q10, #8
+ vraddhn.u16 d28, q11, q8
+ vraddhn.u16 d29, q12, q9
+ vraddhn.u16 d30, q13, q10
+.endm
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
+ vrshr.u16 q11, q8, #8
+ vswp d3, d31
+ vrshr.u16 q12, q9, #8
+ vrshr.u16 q13, q10, #8
+ fetch_src_pixblock
+ vraddhn.u16 d28, q11, q8
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vraddhn.u16 d29, q12, q9
+ vraddhn.u16 d30, q13, q10
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d3, d1
+ vmull.u8 q10, d3, d2
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ PF cmp PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+.endm
+
+generate_composite_function \
+ pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
+ pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
+ pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_0565_8_0565_process_pixblock_head
+ /* mask is in d15 */
+ convert_0565_to_x888 q4, d2, d1, d0
+ convert_0565_to_x888 q5, d6, d5, d4
+ /* source pixel data is in {d0, d1, d2, XX} */
+ /* destination pixel data is in {d4, d5, d6, XX} */
+ vmvn.8 d7, d15
+ vmull.u8 q6, d15, d2
+ vmull.u8 q5, d15, d1
+ vmull.u8 q4, d15, d0
+ vmull.u8 q8, d7, d4
+ vmull.u8 q9, d7, d5
+ vmull.u8 q13, d7, d6
+ vrshr.u16 q12, q6, #8
+ vrshr.u16 q11, q5, #8
+ vrshr.u16 q10, q4, #8
+ vraddhn.u16 d2, q6, q12
+ vraddhn.u16 d1, q5, q11
+ vraddhn.u16 d0, q4, q10
+.endm
+
+.macro pixman_composite_over_0565_8_0565_process_pixblock_tail
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q13, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q12, q13
+ vqadd.u8 q0, q0, q14
+ vqadd.u8 q1, q1, q15
+ /* 32bpp result is in {d0, d1, d2, XX} */
+ convert_8888_to_0565 d2, d1, d0, q14, q15, q3
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
+ fetch_mask_pixblock
+ pixman_composite_over_0565_8_0565_process_pixblock_tail
+ fetch_src_pixblock
+ vld1.16 {d10, d11}, [DST_R, :128]!
+ cache_preload 8, 8
+ pixman_composite_over_0565_8_0565_process_pixblock_head
+ vst1.16 {d28, d29}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+ pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_0565_8_0565_process_pixblock_head, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 10, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 15 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_0565_n_0565_init
+ add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
+ vpush {d8-d15}
+ vld1.32 {d15[0]}, [DUMMY]
+ vdup.8 d15, d15[3]
+.endm
+
+.macro pixman_composite_over_0565_n_0565_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_0565_n_0565_init, \
+ pixman_composite_over_0565_n_0565_cleanup, \
+ pixman_composite_over_0565_8_0565_process_pixblock_head, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 10, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 15 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_0565_8_0565_process_pixblock_head
+ /* mask is in d15 */
+ convert_0565_to_x888 q4, d2, d1, d0
+ convert_0565_to_x888 q5, d6, d5, d4
+ /* source pixel data is in {d0, d1, d2, XX} */
+ /* destination pixel data is in {d4, d5, d6, XX} */
+ vmull.u8 q6, d15, d2
+ vmull.u8 q5, d15, d1
+ vmull.u8 q4, d15, d0
+ vrshr.u16 q12, q6, #8
+ vrshr.u16 q11, q5, #8
+ vrshr.u16 q10, q4, #8
+ vraddhn.u16 d2, q6, q12
+ vraddhn.u16 d1, q5, q11
+ vraddhn.u16 d0, q4, q10
+.endm
+
+.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
+ vqadd.u8 q0, q0, q2
+ vqadd.u8 q1, q1, q3
+ /* 32bpp result is in {d0, d1, d2, XX} */
+ convert_8888_to_0565 d2, d1, d0, q14, q15, q3
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
+ fetch_mask_pixblock
+ pixman_composite_add_0565_8_0565_process_pixblock_tail
+ fetch_src_pixblock
+ vld1.16 {d10, d11}, [DST_R, :128]!
+ cache_preload 8, 8
+ pixman_composite_add_0565_8_0565_process_pixblock_head
+ vst1.16 {d28, d29}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+ pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_add_0565_8_0565_process_pixblock_head, \
+ pixman_composite_add_0565_8_0565_process_pixblock_tail, \
+ pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 10, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 15 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
+    /* the a8 source is in d15 */
+ convert_0565_to_x888 q5, d6, d5, d4
+ /* destination pixel data is in {d4, d5, d6, xx} */
+ vmvn.8 d24, d15 /* get inverted alpha */
+ /* now do alpha blending */
+ vmull.u8 q8, d24, d4
+ vmull.u8 q9, d24, d5
+ vmull.u8 q10, d24, d6
+.endm
+
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vraddhn.u16 d0, q14, q8
+ vraddhn.u16 d1, q15, q9
+ vraddhn.u16 d2, q12, q10
+ /* 32bpp result is in {d0, d1, d2, XX} */
+ convert_8888_to_0565 d2, d1, d0, q14, q15, q3
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
+ fetch_src_pixblock
+ pixman_composite_out_reverse_8_0565_process_pixblock_tail
+ vld1.16 {d10, d11}, [DST_R, :128]!
+ cache_preload 8, 8
+ pixman_composite_out_reverse_8_0565_process_pixblock_head
+ vst1.16 {d28, d29}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+ pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_out_reverse_8_0565_process_pixblock_head, \
+ pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
+ pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 10, /* dst_r_basereg */ \
+ 15, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_8888_process_pixblock_head, \
+ pixman_composite_over_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_0565_process_pixblock_head, \
+ pixman_composite_over_8888_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_8888_0565_process_pixblock_head, \
+ pixman_composite_src_8888_0565_process_pixblock_tail, \
+ pixman_composite_src_8888_0565_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_0565_8888_process_pixblock_head, \
+ pixman_composite_src_0565_8888_process_pixblock_tail, \
+ pixman_composite_src_0565_8888_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_8888_8_0565_process_pixblock_head, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_0565_8_0565_process_pixblock_head, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 10, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 15 /* mask_basereg */
diff --git a/pixman/pixman/pixman-arm-neon.c b/pixman/pixman/pixman-arm-neon.c
index 7d6c83775..3e0c0d1c2 100644
--- a/pixman/pixman/pixman-arm-neon.c
+++ b/pixman/pixman/pixman-arm-neon.c
@@ -122,6 +122,11 @@ PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, SRC,
PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 0565_8888, SRC,
uint16_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_0565,
+ OVER, uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 0565_8_0565,
+ OVER, uint16_t, uint16_t)
+
void
pixman_composite_src_n_8_asm_neon (int32_t w,
int32_t h,
@@ -332,6 +337,12 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
SIMPLE_NEAREST_FAST_PATH_PAD (SRC, r5g6b5, a8r8g8b8, neon_0565_8888),
SIMPLE_NEAREST_FAST_PATH_PAD (SRC, b5g6r5, a8b8g8r8, neon_0565_8888),
+ PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_8_0565),
+ PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_8_0565),
+
+ PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, neon_0565_8_0565),
+ PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, neon_0565_8_0565),
+
{ PIXMAN_OP_NONE },
};
diff --git a/pixman/pixman/pixman-fast-path.c b/pixman/pixman/pixman-fast-path.c
index 4cb8321aa..92f030871 100644
--- a/pixman/pixman/pixman-fast-path.c
+++ b/pixman/pixman/pixman-fast-path.c
@@ -1,2227 +1,2228 @@
-/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
-/*
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007 Red Hat, Inc.
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of SuSE not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission. SuSE makes no representations about the
- * suitability of this software for any purpose. It is provided "as is"
- * without express or implied warranty.
- *
- * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
- * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
- * Author: Keith Packard, SuSE, Inc.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include <string.h>
-#include <stdlib.h>
-#include "pixman-private.h"
-#include "pixman-combine32.h"
-#include "pixman-fast-path.h"
-
-static force_inline uint32_t
-fetch_24 (uint8_t *a)
-{
- if (((unsigned long)a) & 1)
- {
-#ifdef WORDS_BIGENDIAN
- return (*a << 16) | (*(uint16_t *)(a + 1));
-#else
- return *a | (*(uint16_t *)(a + 1) << 8);
-#endif
- }
- else
- {
-#ifdef WORDS_BIGENDIAN
- return (*(uint16_t *)a << 8) | *(a + 2);
-#else
- return *(uint16_t *)a | (*(a + 2) << 16);
-#endif
- }
-}
-
-static force_inline void
-store_24 (uint8_t *a,
- uint32_t v)
-{
- if (((unsigned long)a) & 1)
- {
-#ifdef WORDS_BIGENDIAN
- *a = (uint8_t) (v >> 16);
- *(uint16_t *)(a + 1) = (uint16_t) (v);
-#else
- *a = (uint8_t) (v);
- *(uint16_t *)(a + 1) = (uint16_t) (v >> 8);
-#endif
- }
- else
- {
-#ifdef WORDS_BIGENDIAN
- *(uint16_t *)a = (uint16_t)(v >> 8);
- *(a + 2) = (uint8_t)v;
-#else
- *(uint16_t *)a = (uint16_t)v;
- *(a + 2) = (uint8_t)(v >> 16);
-#endif
- }
-}
-
-static force_inline uint32_t
-over (uint32_t src,
- uint32_t dest)
-{
- uint32_t a = ~src >> 24;
-
- UN8x4_MUL_UN8_ADD_UN8x4 (dest, a, src);
-
- return dest;
-}
-
-static uint32_t
-in (uint32_t x,
- uint8_t y)
-{
- uint16_t a = y;
-
- UN8x4_MUL_UN8 (x, a);
-
- return x;
-}
-
-/*
- * Naming convention:
- *
- * op_src_mask_dest
- */
-static void
-fast_composite_over_x888_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *src, *src_line;
- uint32_t *dst, *dst_line;
- uint8_t *mask, *mask_line;
- int src_stride, mask_stride, dst_stride;
- uint8_t m;
- uint32_t s, d;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- src = src_line;
- src_line += src_stride;
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
-
- w = width;
- while (w--)
- {
- m = *mask++;
- if (m)
- {
- s = *src | 0xff000000;
-
- if (m == 0xff)
- {
- *dst = s;
- }
- else
- {
- d = in (s, m);
- *dst = over (d, *dst);
- }
- }
- src++;
- dst++;
- }
- }
-}
-
-static void
-fast_composite_in_n_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dest_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint8_t *dst_line, *dst;
- uint8_t *mask_line, *mask, m;
- int dst_stride, mask_stride;
- int32_t w;
- uint16_t t;
-
- src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
-
- srca = src >> 24;
-
- PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- if (srca == 0xff)
- {
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- m = *mask++;
-
- if (m == 0)
- *dst = 0;
- else if (m != 0xff)
- *dst = MUL_UN8 (m, *dst, t);
-
- dst++;
- }
- }
- }
- else
- {
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- m = *mask++;
- m = MUL_UN8 (m, srca, t);
-
- if (m == 0)
- *dst = 0;
- else if (m != 0xff)
- *dst = MUL_UN8 (m, *dst, t);
-
- dst++;
- }
- }
- }
-}
-
-static void
-fast_composite_in_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dest_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- uint8_t *src_line, *src;
- int dst_stride, src_stride;
- int32_t w;
- uint8_t s;
- uint16_t t;
-
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
-
- if (s == 0)
- *dst = 0;
- else if (s != 0xff)
- *dst = MUL_UN8 (s, *dst, t);
-
- dst++;
- }
- }
-}
-
-static void
-fast_composite_over_n_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint32_t *dst_line, *dst, d;
- uint8_t *mask_line, *mask, m;
- int dst_stride, mask_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- m = *mask++;
- if (m == 0xff)
- {
- if (srca == 0xff)
- *dst = src;
- else
- *dst = over (src, *dst);
- }
- else if (m)
- {
- d = in (src, m);
- *dst = over (d, *dst);
- }
- dst++;
- }
- }
-}
-
-static void
-fast_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, s;
- uint32_t *dst_line, *dst, d;
- uint32_t *mask_line, *mask, ma;
- int dst_stride, mask_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- ma = *mask++;
-
- if (ma)
- {
- d = *dst;
- s = src;
-
- UN8x4_MUL_UN8x4_ADD_UN8x4 (s, ma, d);
-
- *dst = s;
- }
-
- dst++;
- }
- }
-}
-
-static void
-fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca, s;
- uint32_t *dst_line, *dst, d;
- uint32_t *mask_line, *mask, ma;
- int dst_stride, mask_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- ma = *mask++;
- if (ma == 0xffffffff)
- {
- if (srca == 0xff)
- *dst = src;
- else
- *dst = over (src, *dst);
- }
- else if (ma)
- {
- d = *dst;
- s = src;
-
- UN8x4_MUL_UN8x4 (s, ma);
- UN8x4_MUL_UN8 (ma, srca);
- ma = ~ma;
- UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
-
- *dst = d;
- }
-
- dst++;
- }
- }
-}
-
-static void
-fast_composite_over_n_8_0888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint8_t *dst_line, *dst;
- uint32_t d;
- uint8_t *mask_line, *mask, m;
- int dst_stride, mask_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- m = *mask++;
- if (m == 0xff)
- {
- if (srca == 0xff)
- {
- d = src;
- }
- else
- {
- d = fetch_24 (dst);
- d = over (src, d);
- }
- store_24 (dst, d);
- }
- else if (m)
- {
- d = over (in (src, m), fetch_24 (dst));
- store_24 (dst, d);
- }
- dst += 3;
- }
- }
-}
-
-static void
-fast_composite_over_n_8_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint16_t *dst_line, *dst;
- uint32_t d;
- uint8_t *mask_line, *mask, m;
- int dst_stride, mask_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- m = *mask++;
- if (m == 0xff)
- {
- if (srca == 0xff)
- {
- d = src;
- }
- else
- {
- d = *dst;
- d = over (src, CONVERT_0565_TO_0888 (d));
- }
- *dst = CONVERT_8888_TO_0565 (d);
- }
- else if (m)
- {
- d = *dst;
- d = over (in (src, m), CONVERT_0565_TO_0888 (d));
- *dst = CONVERT_8888_TO_0565 (d);
- }
- dst++;
- }
- }
-}
-
-static void
-fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca, s;
- uint16_t src16;
- uint16_t *dst_line, *dst;
- uint32_t d;
- uint32_t *mask_line, *mask, ma;
- int dst_stride, mask_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- src16 = CONVERT_8888_TO_0565 (src);
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- ma = *mask++;
- if (ma == 0xffffffff)
- {
- if (srca == 0xff)
- {
- *dst = src16;
- }
- else
- {
- d = *dst;
- d = over (src, CONVERT_0565_TO_0888 (d));
- *dst = CONVERT_8888_TO_0565 (d);
- }
- }
- else if (ma)
- {
- d = *dst;
- d = CONVERT_0565_TO_0888 (d);
-
- s = src;
-
- UN8x4_MUL_UN8x4 (s, ma);
- UN8x4_MUL_UN8 (ma, srca);
- ma = ~ma;
- UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
-
- *dst = CONVERT_8888_TO_0565 (d);
- }
- dst++;
- }
- }
-}
-
-static void
-fast_composite_over_8888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src, s;
- int dst_stride, src_stride;
- uint8_t a;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
- a = s >> 24;
- if (a == 0xff)
- *dst = s;
- else if (s)
- *dst = over (s, *dst);
- dst++;
- }
- }
-}
-
-static void
-fast_composite_src_x888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src;
- int dst_stride, src_stride;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- *dst++ = (*src++) | 0xff000000;
- }
-}
-
-#if 0
-static void
-fast_composite_over_8888_0888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- uint32_t d;
- uint32_t *src_line, *src, s;
- uint8_t a;
- int dst_stride, src_stride;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
- a = s >> 24;
- if (a)
- {
- if (a == 0xff)
- d = s;
- else
- d = over (s, fetch_24 (dst));
-
- store_24 (dst, d);
- }
- dst += 3;
- }
- }
-}
-#endif
-
-static void
-fast_composite_over_8888_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint16_t *dst_line, *dst;
- uint32_t d;
- uint32_t *src_line, *src, s;
- uint8_t a;
- int dst_stride, src_stride;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
- a = s >> 24;
- if (s)
- {
- if (a == 0xff)
- {
- d = s;
- }
- else
- {
- d = *dst;
- d = over (s, CONVERT_0565_TO_0888 (d));
- }
- *dst = CONVERT_8888_TO_0565 (d);
- }
- dst++;
- }
- }
-}
-
-static void
-fast_composite_src_x888_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint16_t *dst_line, *dst;
- uint32_t *src_line, *src, s;
- int dst_stride, src_stride;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
- *dst = CONVERT_8888_TO_0565 (s);
- dst++;
- }
- }
-}
-
-static void
-fast_composite_add_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- uint8_t *src_line, *src;
- int dst_stride, src_stride;
- int32_t w;
- uint8_t s, d;
- uint16_t t;
-
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
- if (s)
- {
- if (s != 0xff)
- {
- d = *dst;
- t = d + s;
- s = t | (0 - (t >> 8));
- }
- *dst = s;
- }
- dst++;
- }
- }
-}
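Note on the a8 ADD loop above: the line `s = t | (0 - (t >> 8))` is a branch-free saturating add. When the 16-bit sum exceeds 255, `t >> 8` becomes 1, `0 - 1` is all ones, and the OR forces the stored byte to 0xff. A stand-alone check, not part of the patch:

#include <assert.h>
#include <stdint.h>

static uint8_t
add_saturate_un8 (uint8_t d, uint8_t s)
{
    uint16_t t = (uint16_t)d + s;
    return (uint8_t)(t | (0 - (t >> 8)));   /* 0xff whenever t > 0xff */
}

int
main (void)
{
    assert (add_saturate_un8 (100, 100) == 200);
    assert (add_saturate_un8 (200, 100) == 0xff);   /* 300 clamps to 255 */
    return 0;
}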
-
-static void
-fast_composite_add_8888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src;
- int dst_stride, src_stride;
- int32_t w;
- uint32_t s, d;
-
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
- if (s)
- {
- if (s != 0xffffffff)
- {
- d = *dst;
- if (d)
- UN8x4_ADD_UN8x4 (s, d);
- }
- *dst = s;
- }
- dst++;
- }
- }
-}
-
-static void
-fast_composite_add_n_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- uint8_t *mask_line, *mask;
- int dst_stride, mask_stride;
- int32_t w;
- uint32_t src;
- uint8_t sa;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
- sa = (src >> 24);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- uint16_t tmp;
- uint16_t a;
- uint32_t m, d;
- uint32_t r;
-
- a = *mask++;
- d = *dst;
-
- m = MUL_UN8 (sa, a, tmp);
- r = ADD_UN8 (m, d, tmp);
-
- *dst++ = r;
- }
- }
-}
-
-#ifdef WORDS_BIGENDIAN
-#define CREATE_BITMASK(n) (0x80000000 >> (n))
-#define UPDATE_BITMASK(n) ((n) >> 1)
-#else
-#define CREATE_BITMASK(n) (1 << (n))
-#define UPDATE_BITMASK(n) ((n) << 1)
-#endif
-
-#define TEST_BIT(p, n) \
- (*((p) + ((n) >> 5)) & CREATE_BITMASK ((n) & 31))
-#define SET_BIT(p, n) \
- do { *((p) + ((n) >> 5)) |= CREATE_BITMASK ((n) & 31); } while (0);
-
-static void
-fast_composite_add_1000_1000 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src;
- int dst_stride, src_stride;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (src_image, 0, src_y, uint32_t,
- src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (dst_image, 0, dest_y, uint32_t,
- dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- /*
- * TODO: improve performance by processing uint32_t data instead
- * of individual bits
- */
- if (TEST_BIT (src, src_x + w))
- SET_BIT (dst, dest_x + w);
- }
- }
-}
-
-static void
-fast_composite_over_n_1_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint32_t *dst, *dst_line;
- uint32_t *mask, *mask_line;
- int mask_stride, dst_stride;
- uint32_t bitcache, bitmask;
- int32_t w;
-
- if (width <= 0)
- return;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t,
- dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
- mask_stride, mask_line, 1);
- mask_line += mask_x >> 5;
-
- if (srca == 0xff)
- {
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (mask_x & 31);
-
- while (w--)
- {
- if (bitmask == 0)
- {
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (0);
- }
- if (bitcache & bitmask)
- *dst = src;
- bitmask = UPDATE_BITMASK (bitmask);
- dst++;
- }
- }
- }
- else
- {
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (mask_x & 31);
-
- while (w--)
- {
- if (bitmask == 0)
- {
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (0);
- }
- if (bitcache & bitmask)
- *dst = over (src, *dst);
- bitmask = UPDATE_BITMASK (bitmask);
- dst++;
- }
- }
- }
-}
-
-static void
-fast_composite_over_n_1_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint16_t *dst, *dst_line;
- uint32_t *mask, *mask_line;
- int mask_stride, dst_stride;
- uint32_t bitcache, bitmask;
- int32_t w;
- uint32_t d;
- uint16_t src565;
-
- if (width <= 0)
- return;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t,
- dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
- mask_stride, mask_line, 1);
- mask_line += mask_x >> 5;
-
- if (srca == 0xff)
- {
- src565 = CONVERT_8888_TO_0565 (src);
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (mask_x & 31);
-
- while (w--)
- {
- if (bitmask == 0)
- {
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (0);
- }
- if (bitcache & bitmask)
- *dst = src565;
- bitmask = UPDATE_BITMASK (bitmask);
- dst++;
- }
- }
- }
- else
- {
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (mask_x & 31);
-
- while (w--)
- {
- if (bitmask == 0)
- {
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (0);
- }
- if (bitcache & bitmask)
- {
- d = over (src, CONVERT_0565_TO_0888 (*dst));
- *dst = CONVERT_8888_TO_0565 (d);
- }
- bitmask = UPDATE_BITMASK (bitmask);
- dst++;
- }
- }
- }
-}
-
-/*
- * Simple bitblt
- */
-
-static void
-fast_composite_solid_fill (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- if (dst_image->bits.format == PIXMAN_a1)
- {
- src = src >> 31;
- }
- else if (dst_image->bits.format == PIXMAN_a8)
- {
- src = src >> 24;
- }
- else if (dst_image->bits.format == PIXMAN_r5g6b5 ||
- dst_image->bits.format == PIXMAN_b5g6r5)
- {
- src = CONVERT_8888_TO_0565 (src);
- }
-
- pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
- PIXMAN_FORMAT_BPP (dst_image->bits.format),
- dest_x, dest_y,
- width, height,
- src);
-}
-
-static void
-fast_composite_src_memcpy (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- int bpp = PIXMAN_FORMAT_BPP (dst_image->bits.format) / 8;
- uint32_t n_bytes = width * bpp;
- int dst_stride, src_stride;
- uint8_t *dst;
- uint8_t *src;
-
- src_stride = src_image->bits.rowstride * 4;
- dst_stride = dst_image->bits.rowstride * 4;
-
- src = (uint8_t *)src_image->bits.bits + src_y * src_stride + src_x * bpp;
- dst = (uint8_t *)dst_image->bits.bits + dest_y * dst_stride + dest_x * bpp;
-
- while (height--)
- {
- memcpy (dst, src, n_bytes);
-
- dst += dst_stride;
- src += src_stride;
- }
-}
-
-FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER)
-FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE)
-FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD)
-FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL)
-FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER)
-FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE)
-FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD)
-FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL)
-FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER)
-FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE)
-FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD)
-FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL)
-FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL)
-FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER)
-FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE)
-FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD)
-FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL)
-
-/* Use more unrolling for src_0565_0565 because it is typically CPU bound */
-static force_inline void
-scaled_nearest_scanline_565_565_SRC (uint16_t * dst,
- uint16_t * src,
- int32_t w,
- pixman_fixed_t vx,
- pixman_fixed_t unit_x,
- pixman_fixed_t max_vx)
-{
- uint16_t tmp1, tmp2, tmp3, tmp4;
- while ((w -= 4) >= 0)
- {
- tmp1 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- tmp2 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- tmp3 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- tmp4 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- *dst++ = tmp1;
- *dst++ = tmp2;
- *dst++ = tmp3;
- *dst++ = tmp4;
- }
- if (w & 2)
- {
- tmp1 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- tmp2 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- *dst++ = tmp1;
- *dst++ = tmp2;
- }
- if (w & 1)
- *dst++ = src[pixman_fixed_to_int (vx)];
-}
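Note on the tail handling above: after `while ((w -= 4) >= 0)` exits, w holds a value in -4..-1, and in two's-complement representation the low two bits of that value are exactly the 0..3 pixels still to copy, which is what the `w & 2` and `w & 1` tests pick up. A quick stand-alone check under that (universal in practice) representation assumption:

#include <assert.h>

int
main (void)
{
    int w;

    for (w = 0; w < 16; w++)
    {
        int v = w, quads = 0;

        while ((v -= 4) >= 0)
            quads++;

        /* quads * 4 pixels from the unrolled loop, plus the 2- and 1-pixel tails */
        assert (quads * 4 + (v & 2) + (v & 1) == w);
    }
    return 0;
}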
-
-FAST_NEAREST_MAINLOOP (565_565_cover_SRC,
- scaled_nearest_scanline_565_565_SRC,
- uint16_t, uint16_t, COVER)
-FAST_NEAREST_MAINLOOP (565_565_none_SRC,
- scaled_nearest_scanline_565_565_SRC,
- uint16_t, uint16_t, NONE)
-FAST_NEAREST_MAINLOOP (565_565_pad_SRC,
- scaled_nearest_scanline_565_565_SRC,
- uint16_t, uint16_t, PAD)
-
-static force_inline uint32_t
-fetch_nearest (pixman_repeat_t src_repeat,
- pixman_format_code_t format,
- uint32_t *src, int x, int src_width)
-{
- if (repeat (src_repeat, &x, src_width))
- {
- if (format == PIXMAN_x8r8g8b8)
- return *(src + x) | 0xff000000;
- else
- return *(src + x);
- }
- else
- {
- return 0;
- }
-}
-
-static force_inline void
-combine_over (uint32_t s, uint32_t *dst)
-{
- if (s)
- {
- uint8_t ia = 0xff - (s >> 24);
-
- if (ia)
- UN8x4_MUL_UN8_ADD_UN8x4 (*dst, ia, s);
- else
- *dst = s;
- }
-}
-
-static force_inline void
-combine_src (uint32_t s, uint32_t *dst)
-{
- *dst = s;
-}
-
-static void
-fast_composite_scaled_nearest (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line;
- uint32_t *src_line;
- int dst_stride, src_stride;
- int src_width, src_height;
- pixman_repeat_t src_repeat;
- pixman_fixed_t unit_x, unit_y;
- pixman_format_code_t src_format;
- pixman_vector_t v;
- pixman_fixed_t vy;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- /* pass in 0 instead of src_x and src_y because src_x and src_y need to be
- * transformed from destination space to source space
- */
- PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, uint32_t, src_stride, src_line, 1);
-
- /* reference point is the center of the pixel */
- v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;
- v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;
- v.vector[2] = pixman_fixed_1;
-
- if (!pixman_transform_point_3d (src_image->common.transform, &v))
- return;
-
- unit_x = src_image->common.transform->matrix[0][0];
- unit_y = src_image->common.transform->matrix[1][1];
-
- /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */
- v.vector[0] -= pixman_fixed_e;
- v.vector[1] -= pixman_fixed_e;
-
- src_height = src_image->bits.height;
- src_width = src_image->bits.width;
- src_repeat = src_image->common.repeat;
- src_format = src_image->bits.format;
-
- vy = v.vector[1];
- while (height--)
- {
- pixman_fixed_t vx = v.vector[0];
- int y = pixman_fixed_to_int (vy);
- uint32_t *dst = dst_line;
-
- dst_line += dst_stride;
-
- /* adjust the y location by a unit vector in the y direction
- * this is equivalent to transforming y+1 of the destination point to source space */
- vy += unit_y;
-
- if (!repeat (src_repeat, &y, src_height))
- {
- if (op == PIXMAN_OP_SRC)
- memset (dst, 0, sizeof (*dst) * width);
- }
- else
- {
- int w = width;
-
- uint32_t *src = src_line + y * src_stride;
-
- while (w >= 2)
- {
- uint32_t s1, s2;
- int x1, x2;
-
- x1 = pixman_fixed_to_int (vx);
- vx += unit_x;
-
- x2 = pixman_fixed_to_int (vx);
- vx += unit_x;
-
- w -= 2;
-
- s1 = fetch_nearest (src_repeat, src_format, src, x1, src_width);
- s2 = fetch_nearest (src_repeat, src_format, src, x2, src_width);
-
- if (op == PIXMAN_OP_OVER)
- {
- combine_over (s1, dst++);
- combine_over (s2, dst++);
- }
- else
- {
- combine_src (s1, dst++);
- combine_src (s2, dst++);
- }
- }
-
- while (w--)
- {
- uint32_t s;
- int x;
-
- x = pixman_fixed_to_int (vx);
- vx += unit_x;
-
- s = fetch_nearest (src_repeat, src_format, src, x, src_width);
-
- if (op == PIXMAN_OP_OVER)
- combine_over (s, dst++);
- else
- combine_src (s, dst++);
- }
- }
- }
-}
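Note on the coordinate stepping above: vx/vy are 16.16 fixed-point source coordinates; the start point is the transformed centre of the first destination pixel, nudged down by pixman_fixed_e so that an exact .5 rounds to the lower source pixel, and each destination step adds matrix[0][0]/matrix[1][1]. A stand-alone sketch of that walk, assuming the usual 16.16 layout of pixman_fixed_t (FIXED_INT here plays the role of pixman_fixed_to_int):

#include <stdint.h>

typedef int32_t fixed_16_16;              /* same layout as pixman_fixed_t */
#define FIXED_1      0x10000
#define FIXED_E      1
#define FIXED_INT(f) ((int)((f) >> 16))

/* vx0 is the already-transformed source x of the first destination pixel
 * centre, unit_x is matrix[0][0]. */
static void
nearest_sample_indices (fixed_16_16 vx0, fixed_16_16 unit_x,
                        int width, int *out)
{
    fixed_16_16 vx = vx0 - FIXED_E;       /* make an exact .5 round down */
    int i;

    for (i = 0; i < width; i++)
    {
        out[i] = FIXED_INT (vx);
        vx += unit_x;
    }
}

For example, a 2:1 downscale (unit_x = 2 * FIXED_1) starting from vx0 = FIXED_1 (source coordinate 1.0, the transformed centre of destination pixel 0) yields source indices 0, 2, 4, ...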
-
-#define CACHE_LINE_SIZE 64
-
-#define FAST_SIMPLE_ROTATE(suffix, pix_type) \
- \
-static void \
-blt_rotated_90_trivial_##suffix (pix_type *dst, \
- int dst_stride, \
- const pix_type *src, \
- int src_stride, \
- int w, \
- int h) \
-{ \
- int x, y; \
- for (y = 0; y < h; y++) \
- { \
- const pix_type *s = src + (h - y - 1); \
- pix_type *d = dst + dst_stride * y; \
- for (x = 0; x < w; x++) \
- { \
- *d++ = *s; \
- s += src_stride; \
- } \
- } \
-} \
- \
-static void \
-blt_rotated_270_trivial_##suffix (pix_type *dst, \
- int dst_stride, \
- const pix_type *src, \
- int src_stride, \
- int w, \
- int h) \
-{ \
- int x, y; \
- for (y = 0; y < h; y++) \
- { \
- const pix_type *s = src + src_stride * (w - 1) + y; \
- pix_type *d = dst + dst_stride * y; \
- for (x = 0; x < w; x++) \
- { \
- *d++ = *s; \
- s -= src_stride; \
- } \
- } \
-} \
- \
-static void \
-blt_rotated_90_##suffix (pix_type *dst, \
- int dst_stride, \
- const pix_type *src, \
- int src_stride, \
- int W, \
- int H) \
-{ \
- int x; \
- int leading_pixels = 0, trailing_pixels = 0; \
- const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type); \
- \
- /* \
- * split processing into handling destination as TILE_SIZExH cache line \
- * aligned vertical stripes (optimistically assuming that destination \
- * stride is a multiple of cache line, if not - it will be just a bit \
- * slower) \
- */ \
- \
- if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1)) \
- { \
- leading_pixels = TILE_SIZE - (((uintptr_t)dst & \
- (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
- if (leading_pixels > W) \
- leading_pixels = W; \
- \
- /* unaligned leading part NxH (where N < TILE_SIZE) */ \
- blt_rotated_90_trivial_##suffix ( \
- dst, \
- dst_stride, \
- src, \
- src_stride, \
- leading_pixels, \
- H); \
- \
- dst += leading_pixels; \
- src += leading_pixels * src_stride; \
- W -= leading_pixels; \
- } \
- \
- if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1)) \
- { \
- trailing_pixels = (((uintptr_t)(dst + W) & \
- (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
- if (trailing_pixels > W) \
- trailing_pixels = W; \
- W -= trailing_pixels; \
- } \
- \
- for (x = 0; x < W; x += TILE_SIZE) \
- { \
- /* aligned middle part TILE_SIZExH */ \
- blt_rotated_90_trivial_##suffix ( \
- dst + x, \
- dst_stride, \
- src + src_stride * x, \
- src_stride, \
- TILE_SIZE, \
- H); \
- } \
- \
- if (trailing_pixels) \
- { \
- /* unaligned trailing part NxH (where N < TILE_SIZE) */ \
- blt_rotated_90_trivial_##suffix ( \
- dst + W, \
- dst_stride, \
- src + W * src_stride, \
- src_stride, \
- trailing_pixels, \
- H); \
- } \
-} \
- \
-static void \
-blt_rotated_270_##suffix (pix_type *dst, \
- int dst_stride, \
- const pix_type *src, \
- int src_stride, \
- int W, \
- int H) \
-{ \
- int x; \
- int leading_pixels = 0, trailing_pixels = 0; \
- const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type); \
- \
- /* \
- * split processing into handling destination as TILE_SIZExH cache line \
- * aligned vertical stripes (optimistically assuming that destination \
- * stride is a multiple of cache line, if not - it will be just a bit \
- * slower) \
- */ \
- \
- if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1)) \
- { \
- leading_pixels = TILE_SIZE - (((uintptr_t)dst & \
- (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
- if (leading_pixels > W) \
- leading_pixels = W; \
- \
- /* unaligned leading part NxH (where N < TILE_SIZE) */ \
- blt_rotated_270_trivial_##suffix ( \
- dst, \
- dst_stride, \
- src + src_stride * (W - leading_pixels), \
- src_stride, \
- leading_pixels, \
- H); \
- \
- dst += leading_pixels; \
- W -= leading_pixels; \
- } \
- \
- if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1)) \
- { \
- trailing_pixels = (((uintptr_t)(dst + W) & \
- (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
- if (trailing_pixels > W) \
- trailing_pixels = W; \
- W -= trailing_pixels; \
- src += trailing_pixels * src_stride; \
- } \
- \
- for (x = 0; x < W; x += TILE_SIZE) \
- { \
- /* aligned middle part TILE_SIZExH */ \
- blt_rotated_270_trivial_##suffix ( \
- dst + x, \
- dst_stride, \
- src + src_stride * (W - x - TILE_SIZE), \
- src_stride, \
- TILE_SIZE, \
- H); \
- } \
- \
- if (trailing_pixels) \
- { \
- /* unaligned trailing part NxH (where N < TILE_SIZE) */ \
- blt_rotated_270_trivial_##suffix ( \
- dst + W, \
- dst_stride, \
- src - trailing_pixels * src_stride, \
- src_stride, \
- trailing_pixels, \
- H); \
- } \
-} \
- \
-static void \
-fast_composite_rotate_90_##suffix (pixman_implementation_t *imp, \
- pixman_op_t op, \
- pixman_image_t * src_image, \
- pixman_image_t * mask_image, \
- pixman_image_t * dst_image, \
- int32_t src_x, \
- int32_t src_y, \
- int32_t mask_x, \
- int32_t mask_y, \
- int32_t dest_x, \
- int32_t dest_y, \
- int32_t width, \
- int32_t height) \
-{ \
- pix_type *dst_line; \
- pix_type *src_line; \
- int dst_stride, src_stride; \
- int src_x_t, src_y_t; \
- \
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, pix_type, \
- dst_stride, dst_line, 1); \
- src_x_t = -src_y + pixman_fixed_to_int ( \
- src_image->common.transform->matrix[0][2] + \
- pixman_fixed_1 / 2 - pixman_fixed_e) - height;\
- src_y_t = src_x + pixman_fixed_to_int ( \
- src_image->common.transform->matrix[1][2] + \
- pixman_fixed_1 / 2 - pixman_fixed_e); \
- PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type, \
- src_stride, src_line, 1); \
- blt_rotated_90_##suffix (dst_line, dst_stride, src_line, src_stride, \
- width, height); \
-} \
- \
-static void \
-fast_composite_rotate_270_##suffix (pixman_implementation_t *imp, \
- pixman_op_t op, \
- pixman_image_t * src_image, \
- pixman_image_t * mask_image, \
- pixman_image_t * dst_image, \
- int32_t src_x, \
- int32_t src_y, \
- int32_t mask_x, \
- int32_t mask_y, \
- int32_t dest_x, \
- int32_t dest_y, \
- int32_t width, \
- int32_t height) \
-{ \
- pix_type *dst_line; \
- pix_type *src_line; \
- int dst_stride, src_stride; \
- int src_x_t, src_y_t; \
- \
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, pix_type, \
- dst_stride, dst_line, 1); \
- src_x_t = src_y + pixman_fixed_to_int ( \
- src_image->common.transform->matrix[0][2] + \
- pixman_fixed_1 / 2 - pixman_fixed_e); \
- src_y_t = -src_x + pixman_fixed_to_int ( \
- src_image->common.transform->matrix[1][2] + \
- pixman_fixed_1 / 2 - pixman_fixed_e) - width; \
- PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type, \
- src_stride, src_line, 1); \
- blt_rotated_270_##suffix (dst_line, dst_stride, src_line, src_stride, \
- width, height); \
-}
-
-FAST_SIMPLE_ROTATE (8, uint8_t)
-FAST_SIMPLE_ROTATE (565, uint16_t)
-FAST_SIMPLE_ROTATE (8888, uint32_t)
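Note on the rotation blitters above: blt_rotated_90/270 split the destination into an unaligned leading stripe, TILE_SIZE-wide cache-line-aligned stripes, and an unaligned trailing stripe, so the strided column reads from src are paid for with cache-line-aligned writes to dst. A small numeric check of the leading_pixels computation, assuming 64-byte lines and 16-bit (r5g6b5) pixels:

#include <assert.h>
#include <stdint.h>

#define CACHE_LINE_SIZE 64

int
main (void)
{
    /* For 16-bit pixels a cache line holds 32 of them. */
    const int tile = CACHE_LINE_SIZE / (int)sizeof (uint16_t);

    /* A destination starting 38 bytes into a cache line ... */
    uintptr_t dst_addr = 0x1000 + 38;
    int leading = tile - (int)((dst_addr & (CACHE_LINE_SIZE - 1))
                               / sizeof (uint16_t));

    /* ... needs 13 leading pixels before the writes become aligned. */
    assert (leading == 13);
    assert ((dst_addr + leading * sizeof (uint16_t)) % CACHE_LINE_SIZE == 0);
    return 0;
}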
-
-static const pixman_fast_path_t c_fast_paths[] =
-{
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, fast_composite_over_n_8_0565),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, fast_composite_over_n_8_0565),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, fast_composite_over_n_8_0888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, fast_composite_over_n_8_0888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, fast_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, fast_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, fast_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, fast_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, fast_composite_over_n_1_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, fast_composite_over_n_1_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, fast_composite_over_n_1_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, fast_composite_over_n_1_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5, fast_composite_over_n_1_0565),
- PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5, fast_composite_over_n_1_0565),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, fast_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, fast_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, fast_composite_over_n_8888_0565_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, fast_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, fast_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, fast_composite_over_n_8888_0565_ca),
- PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, fast_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, fast_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, fast_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, fast_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, fast_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, fast_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, fast_composite_over_8888_0565),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, fast_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, fast_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565),
- PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888),
- PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888),
- PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8),
- PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1000_1000),
- PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, a8r8g8b8, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, fast_composite_src_x888_8888),
- PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, fast_composite_src_x888_8888),
- PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8x8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8a8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
- PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
- PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
- PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
- PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8),
- PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8),
-
- SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, 8888_8888),
-
- SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, 8888_8888),
-
- SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, 8888_565),
- SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, 8888_565),
-
- SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565),
-
- SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, 8888_8888),
-
- SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565),
-
-#define NEAREST_FAST_PATH(op,s,d) \
- { PIXMAN_OP_ ## op, \
- PIXMAN_ ## s, SCALED_NEAREST_FLAGS, \
- PIXMAN_null, 0, \
- PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
- fast_composite_scaled_nearest, \
- }
-
- NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8),
- NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8),
- NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8),
- NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8),
-
- NEAREST_FAST_PATH (SRC, x8r8g8b8, a8r8g8b8),
- NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8),
- NEAREST_FAST_PATH (SRC, x8b8g8r8, a8b8g8r8),
- NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8),
-
- NEAREST_FAST_PATH (OVER, x8r8g8b8, x8r8g8b8),
- NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8),
- NEAREST_FAST_PATH (OVER, x8b8g8r8, x8b8g8r8),
- NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8),
-
- NEAREST_FAST_PATH (OVER, x8r8g8b8, a8r8g8b8),
- NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8),
- NEAREST_FAST_PATH (OVER, x8b8g8r8, a8b8g8r8),
- NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8),
-
-#define SIMPLE_ROTATE_FLAGS(angle) \
- (FAST_PATH_ROTATE_ ## angle ## _TRANSFORM | \
- FAST_PATH_NEAREST_FILTER | \
- FAST_PATH_SAMPLES_COVER_CLIP | \
- FAST_PATH_STANDARD_FLAGS)
-
-#define SIMPLE_ROTATE_FAST_PATH(op,s,d,suffix) \
- { PIXMAN_OP_ ## op, \
- PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (90), \
- PIXMAN_null, 0, \
- PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
- fast_composite_rotate_90_##suffix, \
- }, \
- { PIXMAN_OP_ ## op, \
- PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (270), \
- PIXMAN_null, 0, \
- PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
- fast_composite_rotate_270_##suffix, \
- }
-
- SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888),
- SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888),
- SIMPLE_ROTATE_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888),
- SIMPLE_ROTATE_FAST_PATH (SRC, r5g6b5, r5g6b5, 565),
- SIMPLE_ROTATE_FAST_PATH (SRC, a8, a8, 8),
-
- { PIXMAN_OP_NONE },
-};
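Note on c_fast_paths above: each PIXMAN_STD_FAST_PATH / SIMPLE_* entry pairs an operator, source/mask/destination formats and requirement flags with a compositing function, and the PIXMAN_OP_NONE entry terminates the table. At composite time the implementation walks such a table and takes the first entry whose format codes match and whose required flags are all satisfied by the flags computed for the actual images, falling back to the delegate otherwise. The sketch below is schematic; the struct and field names are illustrative, not the exact pixman-private.h layout.

#include <stddef.h>
#include <stdint.h>

/* Illustrative only; the real pixman_fast_path_t lives in pixman-private.h. */
typedef struct
{
    int      op;
    uint32_t src_format,  src_flags;
    uint32_t mask_format, mask_flags;
    uint32_t dest_format, dest_flags;
    void   (*func) (void);
} fast_path_entry_t;

static const fast_path_entry_t *
lookup_fast_path (const fast_path_entry_t *table, int op,
                  uint32_t src_fmt,  uint32_t src_flags,
                  uint32_t mask_fmt, uint32_t mask_flags,
                  uint32_t dst_fmt,  uint32_t dst_flags)
{
    const fast_path_entry_t *e;

    for (e = table; e->func != NULL; e++)      /* sentinel ends the table */
    {
        if (e->op == op &&
            e->src_format  == src_fmt  &&
            e->mask_format == mask_fmt &&
            e->dest_format == dst_fmt  &&
            (e->src_flags  & src_flags)  == e->src_flags  &&
            (e->mask_flags & mask_flags) == e->mask_flags &&
            (e->dest_flags & dst_flags)  == e->dest_flags)
            return e;
    }
    return NULL;   /* no fast path: defer to the delegate implementation */
}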
-
-#ifdef WORDS_BIGENDIAN
-#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (32 - (offs) - (n)))
-#else
-#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (offs))
-#endif
-
-static force_inline void
-pixman_fill1_line (uint32_t *dst, int offs, int width, int v)
-{
- if (offs)
- {
- int leading_pixels = 32 - offs;
- if (leading_pixels >= width)
- {
- if (v)
- *dst |= A1_FILL_MASK (width, offs);
- else
- *dst &= ~A1_FILL_MASK (width, offs);
- return;
- }
- else
- {
- if (v)
- *dst++ |= A1_FILL_MASK (leading_pixels, offs);
- else
- *dst++ &= ~A1_FILL_MASK (leading_pixels, offs);
- width -= leading_pixels;
- }
- }
- while (width >= 32)
- {
- if (v)
- *dst++ = 0xFFFFFFFF;
- else
- *dst++ = 0;
- width -= 32;
- }
- if (width > 0)
- {
- if (v)
- *dst |= A1_FILL_MASK (width, 0);
- else
- *dst &= ~A1_FILL_MASK (width, 0);
- }
-}
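Note on pixman_fill1_line above: a 1bpp fill is done word by word as a partial head inside the first uint32_t, full 32-pixel words, and a partial tail, with A1_FILL_MASK building each run of bits (from the opposite end of the word on big-endian). A small little-endian check, not part of the patch:

#include <assert.h>
#include <stdint.h>

/* Little-endian variant: n set bits starting at bit offset offs. */
#define A1_FILL_MASK(n, offs) (((1u << (n)) - 1) << (offs))

int
main (void)
{
    /* Filling 5 pixels starting at bit offset 3 touches bits 3..7. */
    assert (A1_FILL_MASK (5, 3) == 0xF8);

    /* A 70-pixel run starting at offset 3: a 29-pixel head (bits 3..31),
     * one full word of 32 pixels, then a 9-pixel tail (bits 0..8). */
    assert (A1_FILL_MASK (29, 3) == 0xFFFFFFF8);
    assert (A1_FILL_MASK (9, 0)  == 0x1FF);
    return 0;
}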
-
-static void
-pixman_fill1 (uint32_t *bits,
- int stride,
- int x,
- int y,
- int width,
- int height,
- uint32_t xor)
-{
- uint32_t *dst = bits + y * stride + (x >> 5);
- int offs = x & 31;
-
- if (xor & 1)
- {
- while (height--)
- {
- pixman_fill1_line (dst, offs, width, 1);
- dst += stride;
- }
- }
- else
- {
- while (height--)
- {
- pixman_fill1_line (dst, offs, width, 0);
- dst += stride;
- }
- }
-}
-
-static void
-pixman_fill8 (uint32_t *bits,
- int stride,
- int x,
- int y,
- int width,
- int height,
- uint32_t xor)
-{
- int byte_stride = stride * (int) sizeof (uint32_t);
- uint8_t *dst = (uint8_t *) bits;
- uint8_t v = xor & 0xff;
- int i;
-
- dst = dst + y * byte_stride + x;
-
- while (height--)
- {
- for (i = 0; i < width; ++i)
- dst[i] = v;
-
- dst += byte_stride;
- }
-}
-
-static void
-pixman_fill16 (uint32_t *bits,
- int stride,
- int x,
- int y,
- int width,
- int height,
- uint32_t xor)
-{
- int short_stride =
- (stride * (int)sizeof (uint32_t)) / (int)sizeof (uint16_t);
- uint16_t *dst = (uint16_t *)bits;
- uint16_t v = xor & 0xffff;
- int i;
-
- dst = dst + y * short_stride + x;
-
- while (height--)
- {
- for (i = 0; i < width; ++i)
- dst[i] = v;
-
- dst += short_stride;
- }
-}
-
-static void
-pixman_fill32 (uint32_t *bits,
- int stride,
- int x,
- int y,
- int width,
- int height,
- uint32_t xor)
-{
- int i;
-
- bits = bits + y * stride + x;
-
- while (height--)
- {
- for (i = 0; i < width; ++i)
- bits[i] = xor;
-
- bits += stride;
- }
-}
-
-static pixman_bool_t
-fast_path_fill (pixman_implementation_t *imp,
- uint32_t * bits,
- int stride,
- int bpp,
- int x,
- int y,
- int width,
- int height,
- uint32_t xor)
-{
- switch (bpp)
- {
- case 1:
- pixman_fill1 (bits, stride, x, y, width, height, xor);
- break;
-
- case 8:
- pixman_fill8 (bits, stride, x, y, width, height, xor);
- break;
-
- case 16:
- pixman_fill16 (bits, stride, x, y, width, height, xor);
- break;
-
- case 32:
- pixman_fill32 (bits, stride, x, y, width, height, xor);
- break;
-
- default:
- return _pixman_implementation_fill (
- imp->delegate, bits, stride, bpp, x, y, width, height, xor);
- break;
- }
-
- return TRUE;
-}
-
-pixman_implementation_t *
-_pixman_implementation_create_fast_path (pixman_implementation_t *fallback)
-{
- pixman_implementation_t *imp = _pixman_implementation_create (fallback, c_fast_paths);
-
- imp->fill = fast_path_fill;
-
- return imp;
-}
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission. SuSE makes no representations about the
+ * suitability of this software for any purpose. It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author: Keith Packard, SuSE, Inc.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <string.h>
+#include <stdlib.h>
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include "pixman-fast-path.h"
+
+static force_inline uint32_t
+fetch_24 (uint8_t *a)
+{
+ if (((unsigned long)a) & 1)
+ {
+#ifdef WORDS_BIGENDIAN
+ return (*a << 16) | (*(uint16_t *)(a + 1));
+#else
+ return *a | (*(uint16_t *)(a + 1) << 8);
+#endif
+ }
+ else
+ {
+#ifdef WORDS_BIGENDIAN
+ return (*(uint16_t *)a << 8) | *(a + 2);
+#else
+ return *(uint16_t *)a | (*(a + 2) << 16);
+#endif
+ }
+}
+
+static force_inline void
+store_24 (uint8_t *a,
+ uint32_t v)
+{
+ if (((unsigned long)a) & 1)
+ {
+#ifdef WORDS_BIGENDIAN
+ *a = (uint8_t) (v >> 16);
+ *(uint16_t *)(a + 1) = (uint16_t) (v);
+#else
+ *a = (uint8_t) (v);
+ *(uint16_t *)(a + 1) = (uint16_t) (v >> 8);
+#endif
+ }
+ else
+ {
+#ifdef WORDS_BIGENDIAN
+ *(uint16_t *)a = (uint16_t)(v >> 8);
+ *(a + 2) = (uint8_t)v;
+#else
+ *(uint16_t *)a = (uint16_t)v;
+ *(a + 2) = (uint8_t)(v >> 16);
+#endif
+ }
+}
+
+static force_inline uint32_t
+over (uint32_t src,
+ uint32_t dest)
+{
+ uint32_t a = ~src >> 24;
+
+ UN8x4_MUL_UN8_ADD_UN8x4 (dest, a, src);
+
+ return dest;
+}
+
+static uint32_t
+in (uint32_t x,
+ uint8_t y)
+{
+ uint16_t a = y;
+
+ UN8x4_MUL_UN8 (x, a);
+
+ return x;
+}
+
+/*
+ * Naming convention:
+ *
+ * op_src_mask_dest
+ */
+static void
+fast_composite_over_x888_8_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *src, *src_line;
+ uint32_t *dst, *dst_line;
+ uint8_t *mask, *mask_line;
+ int src_stride, mask_stride, dst_stride;
+ uint8_t m;
+ uint32_t s, d;
+ int32_t w;
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ while (height--)
+ {
+ src = src_line;
+ src_line += src_stride;
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+
+ w = width;
+ while (w--)
+ {
+ m = *mask++;
+ if (m)
+ {
+ s = *src | 0xff000000;
+
+ if (m == 0xff)
+ {
+ *dst = s;
+ }
+ else
+ {
+ d = in (s, m);
+ *dst = over (d, *dst);
+ }
+ }
+ src++;
+ dst++;
+ }
+ }
+}
+
+static void
+fast_composite_in_n_8_8 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dest_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca;
+ uint8_t *dst_line, *dst;
+ uint8_t *mask_line, *mask, m;
+ int dst_stride, mask_stride;
+ int32_t w;
+ uint16_t t;
+
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+ srca = src >> 24;
+
+ PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+ if (srca == 0xff)
+ {
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w--)
+ {
+ m = *mask++;
+
+ if (m == 0)
+ *dst = 0;
+ else if (m != 0xff)
+ *dst = MUL_UN8 (m, *dst, t);
+
+ dst++;
+ }
+ }
+ }
+ else
+ {
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w--)
+ {
+ m = *mask++;
+ m = MUL_UN8 (m, srca, t);
+
+ if (m == 0)
+ *dst = 0;
+ else if (m != 0xff)
+ *dst = MUL_UN8 (m, *dst, t);
+
+ dst++;
+ }
+ }
+ }
+}
+
+static void
+fast_composite_in_8_8 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dest_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_line, *dst;
+ uint8_t *src_line, *src;
+ int dst_stride, src_stride;
+ int32_t w;
+ uint8_t s;
+ uint16_t t;
+
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w--)
+ {
+ s = *src++;
+
+ if (s == 0)
+ *dst = 0;
+ else if (s != 0xff)
+ *dst = MUL_UN8 (s, *dst, t);
+
+ dst++;
+ }
+ }
+}
+
+static void
+fast_composite_over_n_8_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca;
+ uint32_t *dst_line, *dst, d;
+ uint8_t *mask_line, *mask, m;
+ int dst_stride, mask_stride;
+ int32_t w;
+
+ src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+ srca = src >> 24;
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w--)
+ {
+ m = *mask++;
+ if (m == 0xff)
+ {
+ if (srca == 0xff)
+ *dst = src;
+ else
+ *dst = over (src, *dst);
+ }
+ else if (m)
+ {
+ d = in (src, m);
+ *dst = over (d, *dst);
+ }
+ dst++;
+ }
+ }
+}
+
+static void
+fast_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, s;
+ uint32_t *dst_line, *dst, d;
+ uint32_t *mask_line, *mask, ma;
+ int dst_stride, mask_stride;
+ int32_t w;
+
+ src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w--)
+ {
+ ma = *mask++;
+
+ if (ma)
+ {
+ d = *dst;
+ s = src;
+
+ UN8x4_MUL_UN8x4_ADD_UN8x4 (s, ma, d);
+
+ *dst = s;
+ }
+
+ dst++;
+ }
+ }
+}
+
+static void
+fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca, s;
+ uint32_t *dst_line, *dst, d;
+ uint32_t *mask_line, *mask, ma;
+ int dst_stride, mask_stride;
+ int32_t w;
+
+ src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+ srca = src >> 24;
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w--)
+ {
+ ma = *mask++;
+ if (ma == 0xffffffff)
+ {
+ if (srca == 0xff)
+ *dst = src;
+ else
+ *dst = over (src, *dst);
+ }
+ else if (ma)
+ {
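+                /* component-alpha OVER: dst = src * mask + dst * (1 - mask * srca),
+                 * evaluated per channel with the UN8x4 fixed point helpers */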
+ d = *dst;
+ s = src;
+
+ UN8x4_MUL_UN8x4 (s, ma);
+ UN8x4_MUL_UN8 (ma, srca);
+ ma = ~ma;
+ UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
+
+ *dst = d;
+ }
+
+ dst++;
+ }
+ }
+}
+
+static void
+fast_composite_over_n_8_0888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca;
+ uint8_t *dst_line, *dst;
+ uint32_t d;
+ uint8_t *mask_line, *mask, m;
+ int dst_stride, mask_stride;
+ int32_t w;
+
+ src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+ srca = src >> 24;
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
+ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w--)
+ {
+ m = *mask++;
+ if (m == 0xff)
+ {
+ if (srca == 0xff)
+ {
+ d = src;
+ }
+ else
+ {
+ d = fetch_24 (dst);
+ d = over (src, d);
+ }
+ store_24 (dst, d);
+ }
+ else if (m)
+ {
+ d = over (in (src, m), fetch_24 (dst));
+ store_24 (dst, d);
+ }
+ dst += 3;
+ }
+ }
+}
+
+static void
+fast_composite_over_n_8_0565 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca;
+ uint16_t *dst_line, *dst;
+ uint32_t d;
+ uint8_t *mask_line, *mask, m;
+ int dst_stride, mask_stride;
+ int32_t w;
+
+ src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+ srca = src >> 24;
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w--)
+ {
+ m = *mask++;
+ if (m == 0xff)
+ {
+ if (srca == 0xff)
+ {
+ d = src;
+ }
+ else
+ {
+ d = *dst;
+ d = over (src, CONVERT_0565_TO_0888 (d));
+ }
+ *dst = CONVERT_8888_TO_0565 (d);
+ }
+ else if (m)
+ {
+ d = *dst;
+ d = over (in (src, m), CONVERT_0565_TO_0888 (d));
+ *dst = CONVERT_8888_TO_0565 (d);
+ }
+ dst++;
+ }
+ }
+}
+
+static void
+fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca, s;
+ uint16_t src16;
+ uint16_t *dst_line, *dst;
+ uint32_t d;
+ uint32_t *mask_line, *mask, ma;
+ int dst_stride, mask_stride;
+ int32_t w;
+
+ src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+ srca = src >> 24;
+ if (src == 0)
+ return;
+
+ src16 = CONVERT_8888_TO_0565 (src);
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w--)
+ {
+ ma = *mask++;
+ if (ma == 0xffffffff)
+ {
+ if (srca == 0xff)
+ {
+ *dst = src16;
+ }
+ else
+ {
+ d = *dst;
+ d = over (src, CONVERT_0565_TO_0888 (d));
+ *dst = CONVERT_8888_TO_0565 (d);
+ }
+ }
+ else if (ma)
+ {
+ d = *dst;
+ d = CONVERT_0565_TO_0888 (d);
+
+ s = src;
+
+ UN8x4_MUL_UN8x4 (s, ma);
+ UN8x4_MUL_UN8 (ma, srca);
+ ma = ~ma;
+ UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
+
+ *dst = CONVERT_8888_TO_0565 (d);
+ }
+ dst++;
+ }
+ }
+}
+
+static void
+fast_composite_over_8888_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src, s;
+ int dst_stride, src_stride;
+ uint8_t a;
+ int32_t w;
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w--)
+ {
+ s = *src++;
+ a = s >> 24;
+ if (a == 0xff)
+ *dst = s;
+ else if (s)
+ *dst = over (s, *dst);
+ dst++;
+ }
+ }
+}
+
+static void
+fast_composite_src_x888_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ int dst_stride, src_stride;
+ int32_t w;
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
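+        /* x888 formats leave the alpha byte undefined, so force it to 0xff while copying */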
+ while (w--)
+ *dst++ = (*src++) | 0xff000000;
+ }
+}
+
+#if 0
+static void
+fast_composite_over_8888_0888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_line, *dst;
+ uint32_t d;
+ uint32_t *src_line, *src, s;
+ uint8_t a;
+ int dst_stride, src_stride;
+ int32_t w;
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w--)
+ {
+ s = *src++;
+ a = s >> 24;
+ if (a)
+ {
+ if (a == 0xff)
+ d = s;
+ else
+ d = over (s, fetch_24 (dst));
+
+ store_24 (dst, d);
+ }
+ dst += 3;
+ }
+ }
+}
+#endif
+
+static void
+fast_composite_over_8888_0565 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint16_t *dst_line, *dst;
+ uint32_t d;
+ uint32_t *src_line, *src, s;
+ uint8_t a;
+ int dst_stride, src_stride;
+ int32_t w;
+
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w--)
+ {
+ s = *src++;
+ a = s >> 24;
+ if (s)
+ {
+ if (a == 0xff)
+ {
+ d = s;
+ }
+ else
+ {
+ d = *dst;
+ d = over (s, CONVERT_0565_TO_0888 (d));
+ }
+ *dst = CONVERT_8888_TO_0565 (d);
+ }
+ dst++;
+ }
+ }
+}
+
+static void
+fast_composite_src_x888_0565 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint16_t *dst_line, *dst;
+ uint32_t *src_line, *src, s;
+ int dst_stride, src_stride;
+ int32_t w;
+
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w--)
+ {
+ s = *src++;
+ *dst = CONVERT_8888_TO_0565 (s);
+ dst++;
+ }
+ }
+}
+
+static void
+fast_composite_add_8_8 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_line, *dst;
+ uint8_t *src_line, *src;
+ int dst_stride, src_stride;
+ int32_t w;
+ uint8_t s, d;
+ uint16_t t;
+
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w--)
+ {
+ s = *src++;
+ if (s)
+ {
+ if (s != 0xff)
+ {
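+                /* saturating add: when d + s overflows 8 bits, (t >> 8) is 1,
+                 * so (0 - (t >> 8)) is an all-ones mask and the OR clamps
+                 * the stored value to 0xff */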
+ d = *dst;
+ t = d + s;
+ s = t | (0 - (t >> 8));
+ }
+ *dst = s;
+ }
+ dst++;
+ }
+ }
+}
+
+static void
+fast_composite_add_8888_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ int dst_stride, src_stride;
+ int32_t w;
+ uint32_t s, d;
+
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w--)
+ {
+ s = *src++;
+ if (s)
+ {
+ if (s != 0xffffffff)
+ {
+ d = *dst;
+ if (d)
+ UN8x4_ADD_UN8x4 (s, d);
+ }
+ *dst = s;
+ }
+ dst++;
+ }
+ }
+}
+
+static void
+fast_composite_add_n_8_8 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_line, *dst;
+ uint8_t *mask_line, *mask;
+ int dst_stride, mask_stride;
+ int32_t w;
+ uint32_t src;
+ uint8_t sa;
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+ src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+ sa = (src >> 24);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w--)
+ {
+ uint16_t tmp;
+ uint16_t a;
+ uint32_t m, d;
+ uint32_t r;
+
+ a = *mask++;
+ d = *dst;
+
+ m = MUL_UN8 (sa, a, tmp);
+ r = ADD_UN8 (m, d, tmp);
+
+ *dst++ = r;
+ }
+ }
+}
+
+#ifdef WORDS_BIGENDIAN
+#define CREATE_BITMASK(n) (0x80000000 >> (n))
+#define UPDATE_BITMASK(n) ((n) >> 1)
+#else
+#define CREATE_BITMASK(n) (1 << (n))
+#define UPDATE_BITMASK(n) ((n) << 1)
+#endif
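+
+/*
+ * a1 images store one pixel per bit inside 32-bit words. CREATE_BITMASK (n)
+ * selects bit n of a word (counted from the MSB on big-endian and from the
+ * LSB on little-endian) and UPDATE_BITMASK advances the mask by one pixel.
+ */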
+
+#define TEST_BIT(p, n) \
+ (*((p) + ((n) >> 5)) & CREATE_BITMASK ((n) & 31))
+#define SET_BIT(p, n) \
+ do { *((p) + ((n) >> 5)) |= CREATE_BITMASK ((n) & 31); } while (0);
+
+static void
+fast_composite_add_1000_1000 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ int dst_stride, src_stride;
+ int32_t w;
+
+ PIXMAN_IMAGE_GET_LINE (src_image, 0, src_y, uint32_t,
+ src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (dst_image, 0, dest_y, uint32_t,
+ dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w--)
+ {
+ /*
+ * TODO: improve performance by processing uint32_t data instead
+ * of individual bits
+ */
+ if (TEST_BIT (src, src_x + w))
+ SET_BIT (dst, dest_x + w);
+ }
+ }
+}
+
+static void
+fast_composite_over_n_1_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca;
+ uint32_t *dst, *dst_line;
+ uint32_t *mask, *mask_line;
+ int mask_stride, dst_stride;
+ uint32_t bitcache, bitmask;
+ int32_t w;
+
+ if (width <= 0)
+ return;
+
+ src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+ srca = src >> 24;
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t,
+ dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
+ mask_stride, mask_line, 1);
+ mask_line += mask_x >> 5;
+
+ if (srca == 0xff)
+ {
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ bitcache = *mask++;
+ bitmask = CREATE_BITMASK (mask_x & 31);
+
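+            /* walk the a1 mask one pixel per iteration, refilling the 32-bit
+             * cache word whenever the shifted bitmask wraps to zero */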
+ while (w--)
+ {
+ if (bitmask == 0)
+ {
+ bitcache = *mask++;
+ bitmask = CREATE_BITMASK (0);
+ }
+ if (bitcache & bitmask)
+ *dst = src;
+ bitmask = UPDATE_BITMASK (bitmask);
+ dst++;
+ }
+ }
+ }
+ else
+ {
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ bitcache = *mask++;
+ bitmask = CREATE_BITMASK (mask_x & 31);
+
+ while (w--)
+ {
+ if (bitmask == 0)
+ {
+ bitcache = *mask++;
+ bitmask = CREATE_BITMASK (0);
+ }
+ if (bitcache & bitmask)
+ *dst = over (src, *dst);
+ bitmask = UPDATE_BITMASK (bitmask);
+ dst++;
+ }
+ }
+ }
+}
+
+static void
+fast_composite_over_n_1_0565 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca;
+ uint16_t *dst, *dst_line;
+ uint32_t *mask, *mask_line;
+ int mask_stride, dst_stride;
+ uint32_t bitcache, bitmask;
+ int32_t w;
+ uint32_t d;
+ uint16_t src565;
+
+ if (width <= 0)
+ return;
+
+ src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+ srca = src >> 24;
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t,
+ dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
+ mask_stride, mask_line, 1);
+ mask_line += mask_x >> 5;
+
+ if (srca == 0xff)
+ {
+ src565 = CONVERT_8888_TO_0565 (src);
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ bitcache = *mask++;
+ bitmask = CREATE_BITMASK (mask_x & 31);
+
+ while (w--)
+ {
+ if (bitmask == 0)
+ {
+ bitcache = *mask++;
+ bitmask = CREATE_BITMASK (0);
+ }
+ if (bitcache & bitmask)
+ *dst = src565;
+ bitmask = UPDATE_BITMASK (bitmask);
+ dst++;
+ }
+ }
+ }
+ else
+ {
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ bitcache = *mask++;
+ bitmask = CREATE_BITMASK (mask_x & 31);
+
+ while (w--)
+ {
+ if (bitmask == 0)
+ {
+ bitcache = *mask++;
+ bitmask = CREATE_BITMASK (0);
+ }
+ if (bitcache & bitmask)
+ {
+ d = over (src, CONVERT_0565_TO_0888 (*dst));
+ *dst = CONVERT_8888_TO_0565 (d);
+ }
+ bitmask = UPDATE_BITMASK (bitmask);
+ dst++;
+ }
+ }
+ }
+}
+
+/*
+ * Simple bitblt
+ */
+
+static void
+fast_composite_solid_fill (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src;
+
+ src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+
+ if (dst_image->bits.format == PIXMAN_a1)
+ {
+ src = src >> 31;
+ }
+ else if (dst_image->bits.format == PIXMAN_a8)
+ {
+ src = src >> 24;
+ }
+ else if (dst_image->bits.format == PIXMAN_r5g6b5 ||
+ dst_image->bits.format == PIXMAN_b5g6r5)
+ {
+ src = CONVERT_8888_TO_0565 (src);
+ }
+
+ pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
+ PIXMAN_FORMAT_BPP (dst_image->bits.format),
+ dest_x, dest_y,
+ width, height,
+ src);
+}
+
+static void
+fast_composite_src_memcpy (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ int bpp = PIXMAN_FORMAT_BPP (dst_image->bits.format) / 8;
+ uint32_t n_bytes = width * bpp;
+ int dst_stride, src_stride;
+ uint8_t *dst;
+ uint8_t *src;
+
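+    /* bits.rowstride is measured in uint32_t units, so multiply by 4 to get bytes */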
+ src_stride = src_image->bits.rowstride * 4;
+ dst_stride = dst_image->bits.rowstride * 4;
+
+ src = (uint8_t *)src_image->bits.bits + src_y * src_stride + src_x * bpp;
+ dst = (uint8_t *)dst_image->bits.bits + dest_y * dst_stride + dest_x * bpp;
+
+ while (height--)
+ {
+ memcpy (dst, src, n_bytes);
+
+ dst += dst_stride;
+ src += src_stride;
+ }
+}
+
+FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER)
+FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE)
+FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD)
+FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL)
+FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER)
+FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE)
+FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD)
+FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL)
+FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER)
+FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE)
+FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD)
+FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL)
+FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL)
+FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER)
+FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE)
+FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD)
+FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL)
+
+/* Use more unrolling for src_0565_0565 because it is typically CPU bound */
+static force_inline void
+scaled_nearest_scanline_565_565_SRC (uint16_t * dst,
+ const uint16_t * src,
+ int32_t w,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx,
+ pixman_bool_t fully_transparent_src)
+{
+ uint16_t tmp1, tmp2, tmp3, tmp4;
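+    /* four pixels per iteration; the (w & 2) and (w & 1) tails below pick up the remainder */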
+ while ((w -= 4) >= 0)
+ {
+ tmp1 = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+ tmp2 = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+ tmp3 = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+ tmp4 = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+ *dst++ = tmp1;
+ *dst++ = tmp2;
+ *dst++ = tmp3;
+ *dst++ = tmp4;
+ }
+ if (w & 2)
+ {
+ tmp1 = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+ tmp2 = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+ *dst++ = tmp1;
+ *dst++ = tmp2;
+ }
+ if (w & 1)
+ *dst++ = src[pixman_fixed_to_int (vx)];
+}
+
+FAST_NEAREST_MAINLOOP (565_565_cover_SRC,
+ scaled_nearest_scanline_565_565_SRC,
+ uint16_t, uint16_t, COVER)
+FAST_NEAREST_MAINLOOP (565_565_none_SRC,
+ scaled_nearest_scanline_565_565_SRC,
+ uint16_t, uint16_t, NONE)
+FAST_NEAREST_MAINLOOP (565_565_pad_SRC,
+ scaled_nearest_scanline_565_565_SRC,
+ uint16_t, uint16_t, PAD)
+
+static force_inline uint32_t
+fetch_nearest (pixman_repeat_t src_repeat,
+ pixman_format_code_t format,
+ uint32_t *src, int x, int src_width)
+{
+ if (repeat (src_repeat, &x, src_width))
+ {
+ if (format == PIXMAN_x8r8g8b8)
+ return *(src + x) | 0xff000000;
+ else
+ return *(src + x);
+ }
+ else
+ {
+ return 0;
+ }
+}
+
+static force_inline void
+combine_over (uint32_t s, uint32_t *dst)
+{
+ if (s)
+ {
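+        /* OVER: dst = src + dst * (255 - src_alpha) / 255; a fully opaque
+         * source simply replaces the destination pixel */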
+ uint8_t ia = 0xff - (s >> 24);
+
+ if (ia)
+ UN8x4_MUL_UN8_ADD_UN8x4 (*dst, ia, s);
+ else
+ *dst = s;
+ }
+}
+
+static force_inline void
+combine_src (uint32_t s, uint32_t *dst)
+{
+ *dst = s;
+}
+
+static void
+fast_composite_scaled_nearest (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *dst_line;
+ uint32_t *src_line;
+ int dst_stride, src_stride;
+ int src_width, src_height;
+ pixman_repeat_t src_repeat;
+ pixman_fixed_t unit_x, unit_y;
+ pixman_format_code_t src_format;
+ pixman_vector_t v;
+ pixman_fixed_t vy;
+
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ /* pass in 0 instead of src_x and src_y because src_x and src_y need to be
+ * transformed from destination space to source space
+ */
+ PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, uint32_t, src_stride, src_line, 1);
+
+ /* reference point is the center of the pixel */
+ v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;
+ v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;
+ v.vector[2] = pixman_fixed_1;
+
+ if (!pixman_transform_point_3d (src_image->common.transform, &v))
+ return;
+
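+    /* the fast path flags restrict this routine to pure scale + translate
+     * transforms, so the diagonal matrix entries are the per-pixel steps
+     * in source space */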
+ unit_x = src_image->common.transform->matrix[0][0];
+ unit_y = src_image->common.transform->matrix[1][1];
+
+ /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */
+ v.vector[0] -= pixman_fixed_e;
+ v.vector[1] -= pixman_fixed_e;
+
+ src_height = src_image->bits.height;
+ src_width = src_image->bits.width;
+ src_repeat = src_image->common.repeat;
+ src_format = src_image->bits.format;
+
+ vy = v.vector[1];
+ while (height--)
+ {
+ pixman_fixed_t vx = v.vector[0];
+ int y = pixman_fixed_to_int (vy);
+ uint32_t *dst = dst_line;
+
+ dst_line += dst_stride;
+
+        /* adjust the y location by a unit vector in the y direction;
+         * this is equivalent to transforming y+1 of the destination point to source space */
+ vy += unit_y;
+
+ if (!repeat (src_repeat, &y, src_height))
+ {
+ if (op == PIXMAN_OP_SRC)
+ memset (dst, 0, sizeof (*dst) * width);
+ }
+ else
+ {
+ int w = width;
+
+ uint32_t *src = src_line + y * src_stride;
+
+ while (w >= 2)
+ {
+ uint32_t s1, s2;
+ int x1, x2;
+
+ x1 = pixman_fixed_to_int (vx);
+ vx += unit_x;
+
+ x2 = pixman_fixed_to_int (vx);
+ vx += unit_x;
+
+ w -= 2;
+
+ s1 = fetch_nearest (src_repeat, src_format, src, x1, src_width);
+ s2 = fetch_nearest (src_repeat, src_format, src, x2, src_width);
+
+ if (op == PIXMAN_OP_OVER)
+ {
+ combine_over (s1, dst++);
+ combine_over (s2, dst++);
+ }
+ else
+ {
+ combine_src (s1, dst++);
+ combine_src (s2, dst++);
+ }
+ }
+
+ while (w--)
+ {
+ uint32_t s;
+ int x;
+
+ x = pixman_fixed_to_int (vx);
+ vx += unit_x;
+
+ s = fetch_nearest (src_repeat, src_format, src, x, src_width);
+
+ if (op == PIXMAN_OP_OVER)
+ combine_over (s, dst++);
+ else
+ combine_src (s, dst++);
+ }
+ }
+ }
+}
+
+#define CACHE_LINE_SIZE 64
+
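+/* TILE_SIZE below is CACHE_LINE_SIZE / sizeof (pix_type): 64 a8 pixels,
+ * 32 r5g6b5 pixels or 16 a8r8g8b8 pixels per cache line */
+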
+#define FAST_SIMPLE_ROTATE(suffix, pix_type) \
+ \
+static void \
+blt_rotated_90_trivial_##suffix (pix_type *dst, \
+ int dst_stride, \
+ const pix_type *src, \
+ int src_stride, \
+ int w, \
+ int h) \
+{ \
+ int x, y; \
+ for (y = 0; y < h; y++) \
+ { \
+ const pix_type *s = src + (h - y - 1); \
+ pix_type *d = dst + dst_stride * y; \
+ for (x = 0; x < w; x++) \
+ { \
+ *d++ = *s; \
+ s += src_stride; \
+ } \
+ } \
+} \
+ \
+static void \
+blt_rotated_270_trivial_##suffix (pix_type *dst, \
+ int dst_stride, \
+ const pix_type *src, \
+ int src_stride, \
+ int w, \
+ int h) \
+{ \
+ int x, y; \
+ for (y = 0; y < h; y++) \
+ { \
+ const pix_type *s = src + src_stride * (w - 1) + y; \
+ pix_type *d = dst + dst_stride * y; \
+ for (x = 0; x < w; x++) \
+ { \
+ *d++ = *s; \
+ s -= src_stride; \
+ } \
+ } \
+} \
+ \
+static void \
+blt_rotated_90_##suffix (pix_type *dst, \
+ int dst_stride, \
+ const pix_type *src, \
+ int src_stride, \
+ int W, \
+ int H) \
+{ \
+ int x; \
+ int leading_pixels = 0, trailing_pixels = 0; \
+ const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type); \
+ \
+ /* \
+ * split processing into handling destination as TILE_SIZExH cache line \
+     * aligned vertical stripes (optimistically assuming that the destination \
+     * stride is a multiple of the cache line size; if it is not, this is     \
+     * just a bit slower)                                                     \
+ */ \
+ \
+ if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1)) \
+ { \
+ leading_pixels = TILE_SIZE - (((uintptr_t)dst & \
+ (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
+ if (leading_pixels > W) \
+ leading_pixels = W; \
+ \
+ /* unaligned leading part NxH (where N < TILE_SIZE) */ \
+ blt_rotated_90_trivial_##suffix ( \
+ dst, \
+ dst_stride, \
+ src, \
+ src_stride, \
+ leading_pixels, \
+ H); \
+ \
+ dst += leading_pixels; \
+ src += leading_pixels * src_stride; \
+ W -= leading_pixels; \
+ } \
+ \
+ if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1)) \
+ { \
+ trailing_pixels = (((uintptr_t)(dst + W) & \
+ (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
+ if (trailing_pixels > W) \
+ trailing_pixels = W; \
+ W -= trailing_pixels; \
+ } \
+ \
+ for (x = 0; x < W; x += TILE_SIZE) \
+ { \
+ /* aligned middle part TILE_SIZExH */ \
+ blt_rotated_90_trivial_##suffix ( \
+ dst + x, \
+ dst_stride, \
+ src + src_stride * x, \
+ src_stride, \
+ TILE_SIZE, \
+ H); \
+ } \
+ \
+ if (trailing_pixels) \
+ { \
+ /* unaligned trailing part NxH (where N < TILE_SIZE) */ \
+ blt_rotated_90_trivial_##suffix ( \
+ dst + W, \
+ dst_stride, \
+ src + W * src_stride, \
+ src_stride, \
+ trailing_pixels, \
+ H); \
+ } \
+} \
+ \
+static void \
+blt_rotated_270_##suffix (pix_type *dst, \
+ int dst_stride, \
+ const pix_type *src, \
+ int src_stride, \
+ int W, \
+ int H) \
+{ \
+ int x; \
+ int leading_pixels = 0, trailing_pixels = 0; \
+ const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type); \
+ \
+ /* \
+ * split processing into handling destination as TILE_SIZExH cache line \
+     * aligned vertical stripes (optimistically assuming that the destination \
+     * stride is a multiple of the cache line size; if it is not, this is     \
+     * just a bit slower)                                                     \
+ */ \
+ \
+ if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1)) \
+ { \
+ leading_pixels = TILE_SIZE - (((uintptr_t)dst & \
+ (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
+ if (leading_pixels > W) \
+ leading_pixels = W; \
+ \
+ /* unaligned leading part NxH (where N < TILE_SIZE) */ \
+ blt_rotated_270_trivial_##suffix ( \
+ dst, \
+ dst_stride, \
+ src + src_stride * (W - leading_pixels), \
+ src_stride, \
+ leading_pixels, \
+ H); \
+ \
+ dst += leading_pixels; \
+ W -= leading_pixels; \
+ } \
+ \
+ if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1)) \
+ { \
+ trailing_pixels = (((uintptr_t)(dst + W) & \
+ (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
+ if (trailing_pixels > W) \
+ trailing_pixels = W; \
+ W -= trailing_pixels; \
+ src += trailing_pixels * src_stride; \
+ } \
+ \
+ for (x = 0; x < W; x += TILE_SIZE) \
+ { \
+ /* aligned middle part TILE_SIZExH */ \
+ blt_rotated_270_trivial_##suffix ( \
+ dst + x, \
+ dst_stride, \
+ src + src_stride * (W - x - TILE_SIZE), \
+ src_stride, \
+ TILE_SIZE, \
+ H); \
+ } \
+ \
+ if (trailing_pixels) \
+ { \
+ /* unaligned trailing part NxH (where N < TILE_SIZE) */ \
+ blt_rotated_270_trivial_##suffix ( \
+ dst + W, \
+ dst_stride, \
+ src - trailing_pixels * src_stride, \
+ src_stride, \
+ trailing_pixels, \
+ H); \
+ } \
+} \
+ \
+static void \
+fast_composite_rotate_90_##suffix (pixman_implementation_t *imp, \
+ pixman_op_t op, \
+ pixman_image_t * src_image, \
+ pixman_image_t * mask_image, \
+ pixman_image_t * dst_image, \
+ int32_t src_x, \
+ int32_t src_y, \
+ int32_t mask_x, \
+ int32_t mask_y, \
+ int32_t dest_x, \
+ int32_t dest_y, \
+ int32_t width, \
+ int32_t height) \
+{ \
+ pix_type *dst_line; \
+ pix_type *src_line; \
+ int dst_stride, src_stride; \
+ int src_x_t, src_y_t; \
+ \
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, pix_type, \
+ dst_stride, dst_line, 1); \
+ src_x_t = -src_y + pixman_fixed_to_int ( \
+ src_image->common.transform->matrix[0][2] + \
+ pixman_fixed_1 / 2 - pixman_fixed_e) - height;\
+ src_y_t = src_x + pixman_fixed_to_int ( \
+ src_image->common.transform->matrix[1][2] + \
+ pixman_fixed_1 / 2 - pixman_fixed_e); \
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type, \
+ src_stride, src_line, 1); \
+ blt_rotated_90_##suffix (dst_line, dst_stride, src_line, src_stride, \
+ width, height); \
+} \
+ \
+static void \
+fast_composite_rotate_270_##suffix (pixman_implementation_t *imp, \
+ pixman_op_t op, \
+ pixman_image_t * src_image, \
+ pixman_image_t * mask_image, \
+ pixman_image_t * dst_image, \
+ int32_t src_x, \
+ int32_t src_y, \
+ int32_t mask_x, \
+ int32_t mask_y, \
+ int32_t dest_x, \
+ int32_t dest_y, \
+ int32_t width, \
+ int32_t height) \
+{ \
+ pix_type *dst_line; \
+ pix_type *src_line; \
+ int dst_stride, src_stride; \
+ int src_x_t, src_y_t; \
+ \
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, pix_type, \
+ dst_stride, dst_line, 1); \
+ src_x_t = src_y + pixman_fixed_to_int ( \
+ src_image->common.transform->matrix[0][2] + \
+ pixman_fixed_1 / 2 - pixman_fixed_e); \
+ src_y_t = -src_x + pixman_fixed_to_int ( \
+ src_image->common.transform->matrix[1][2] + \
+ pixman_fixed_1 / 2 - pixman_fixed_e) - width; \
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type, \
+ src_stride, src_line, 1); \
+ blt_rotated_270_##suffix (dst_line, dst_stride, src_line, src_stride, \
+ width, height); \
+}
+
+FAST_SIMPLE_ROTATE (8, uint8_t)
+FAST_SIMPLE_ROTATE (565, uint16_t)
+FAST_SIMPLE_ROTATE (8888, uint32_t)
+
+static const pixman_fast_path_t c_fast_paths[] =
+{
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, fast_composite_over_n_8_0565),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, fast_composite_over_n_8_0565),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, fast_composite_over_n_8_0888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, fast_composite_over_n_8_0888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, fast_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, fast_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, fast_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, fast_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, fast_composite_over_n_1_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, fast_composite_over_n_1_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, fast_composite_over_n_1_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, fast_composite_over_n_1_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5, fast_composite_over_n_1_0565),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5, fast_composite_over_n_1_0565),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, fast_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, fast_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, fast_composite_over_n_8888_0565_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, fast_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, fast_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, fast_composite_over_n_8888_0565_ca),
+ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, fast_composite_over_x888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, fast_composite_over_x888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, fast_composite_over_x888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, fast_composite_over_x888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, fast_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, fast_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, fast_composite_over_8888_0565),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, fast_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, fast_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8),
+ PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1000_1000),
+ PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8),
+ PIXMAN_STD_FAST_PATH (SRC, solid, null, a8r8g8b8, fast_composite_solid_fill),
+ PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, fast_composite_solid_fill),
+ PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, fast_composite_solid_fill),
+ PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, fast_composite_solid_fill),
+ PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, fast_composite_solid_fill),
+ PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, fast_composite_solid_fill),
+ PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, fast_composite_solid_fill),
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, fast_composite_src_x888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, fast_composite_src_x888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8x8, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8a8, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy),
+ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
+ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
+ PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8),
+ PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8),
+
+ SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, 8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, 8888_8888),
+
+ SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, 8888_8888),
+
+ SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, 8888_565),
+ SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, 8888_565),
+
+ SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565),
+
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, 8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, 8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, 8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, 8888_8888),
+
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565),
+
+#define NEAREST_FAST_PATH(op,s,d) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, SCALED_NEAREST_FLAGS, \
+ PIXMAN_null, 0, \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_nearest, \
+ }
+
+ NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8),
+ NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8),
+ NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8),
+ NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8),
+
+ NEAREST_FAST_PATH (SRC, x8r8g8b8, a8r8g8b8),
+ NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8),
+ NEAREST_FAST_PATH (SRC, x8b8g8r8, a8b8g8r8),
+ NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8),
+
+ NEAREST_FAST_PATH (OVER, x8r8g8b8, x8r8g8b8),
+ NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8),
+ NEAREST_FAST_PATH (OVER, x8b8g8r8, x8b8g8r8),
+ NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8),
+
+ NEAREST_FAST_PATH (OVER, x8r8g8b8, a8r8g8b8),
+ NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8),
+ NEAREST_FAST_PATH (OVER, x8b8g8r8, a8b8g8r8),
+ NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8),
+
+#define SIMPLE_ROTATE_FLAGS(angle) \
+ (FAST_PATH_ROTATE_ ## angle ## _TRANSFORM | \
+ FAST_PATH_NEAREST_FILTER | \
+ FAST_PATH_SAMPLES_COVER_CLIP | \
+ FAST_PATH_STANDARD_FLAGS)
+
+#define SIMPLE_ROTATE_FAST_PATH(op,s,d,suffix) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (90), \
+ PIXMAN_null, 0, \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_rotate_90_##suffix, \
+ }, \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (270), \
+ PIXMAN_null, 0, \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_rotate_270_##suffix, \
+ }
+
+ SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888),
+ SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888),
+ SIMPLE_ROTATE_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888),
+ SIMPLE_ROTATE_FAST_PATH (SRC, r5g6b5, r5g6b5, 565),
+ SIMPLE_ROTATE_FAST_PATH (SRC, a8, a8, 8),
+
+ { PIXMAN_OP_NONE },
+};
+
+#ifdef WORDS_BIGENDIAN
+#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (32 - (offs) - (n)))
+#else
+#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (offs))
+#endif
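+
+/*
+ * A1_FILL_MASK (n, offs) covers n consecutive a1 pixels starting at bit
+ * offset offs within a 32-bit word. pixman_fill1_line uses it for the
+ * partial words at each end of a span and writes whole words in between.
+ */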
+
+static force_inline void
+pixman_fill1_line (uint32_t *dst, int offs, int width, int v)
+{
+ if (offs)
+ {
+ int leading_pixels = 32 - offs;
+ if (leading_pixels >= width)
+ {
+ if (v)
+ *dst |= A1_FILL_MASK (width, offs);
+ else
+ *dst &= ~A1_FILL_MASK (width, offs);
+ return;
+ }
+ else
+ {
+ if (v)
+ *dst++ |= A1_FILL_MASK (leading_pixels, offs);
+ else
+ *dst++ &= ~A1_FILL_MASK (leading_pixels, offs);
+ width -= leading_pixels;
+ }
+ }
+ while (width >= 32)
+ {
+ if (v)
+ *dst++ = 0xFFFFFFFF;
+ else
+ *dst++ = 0;
+ width -= 32;
+ }
+ if (width > 0)
+ {
+ if (v)
+ *dst |= A1_FILL_MASK (width, 0);
+ else
+ *dst &= ~A1_FILL_MASK (width, 0);
+ }
+}
+
+static void
+pixman_fill1 (uint32_t *bits,
+ int stride,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t xor)
+{
+ uint32_t *dst = bits + y * stride + (x >> 5);
+ int offs = x & 31;
+
+ if (xor & 1)
+ {
+ while (height--)
+ {
+ pixman_fill1_line (dst, offs, width, 1);
+ dst += stride;
+ }
+ }
+ else
+ {
+ while (height--)
+ {
+ pixman_fill1_line (dst, offs, width, 0);
+ dst += stride;
+ }
+ }
+}
+
+static void
+pixman_fill8 (uint32_t *bits,
+ int stride,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t xor)
+{
+ int byte_stride = stride * (int) sizeof (uint32_t);
+ uint8_t *dst = (uint8_t *) bits;
+ uint8_t v = xor & 0xff;
+ int i;
+
+ dst = dst + y * byte_stride + x;
+
+ while (height--)
+ {
+ for (i = 0; i < width; ++i)
+ dst[i] = v;
+
+ dst += byte_stride;
+ }
+}
+
+static void
+pixman_fill16 (uint32_t *bits,
+ int stride,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t xor)
+{
+ int short_stride =
+ (stride * (int)sizeof (uint32_t)) / (int)sizeof (uint16_t);
+ uint16_t *dst = (uint16_t *)bits;
+ uint16_t v = xor & 0xffff;
+ int i;
+
+ dst = dst + y * short_stride + x;
+
+ while (height--)
+ {
+ for (i = 0; i < width; ++i)
+ dst[i] = v;
+
+ dst += short_stride;
+ }
+}
+
+static void
+pixman_fill32 (uint32_t *bits,
+ int stride,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t xor)
+{
+ int i;
+
+ bits = bits + y * stride + x;
+
+ while (height--)
+ {
+ for (i = 0; i < width; ++i)
+ bits[i] = xor;
+
+ bits += stride;
+ }
+}
+
+static pixman_bool_t
+fast_path_fill (pixman_implementation_t *imp,
+ uint32_t * bits,
+ int stride,
+ int bpp,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t xor)
+{
+ switch (bpp)
+ {
+ case 1:
+ pixman_fill1 (bits, stride, x, y, width, height, xor);
+ break;
+
+ case 8:
+ pixman_fill8 (bits, stride, x, y, width, height, xor);
+ break;
+
+ case 16:
+ pixman_fill16 (bits, stride, x, y, width, height, xor);
+ break;
+
+ case 32:
+ pixman_fill32 (bits, stride, x, y, width, height, xor);
+ break;
+
+ default:
+ return _pixman_implementation_fill (
+ imp->delegate, bits, stride, bpp, x, y, width, height, xor);
+ break;
+ }
+
+ return TRUE;
+}
+
+pixman_implementation_t *
+_pixman_implementation_create_fast_path (pixman_implementation_t *fallback)
+{
+ pixman_implementation_t *imp = _pixman_implementation_create (fallback, c_fast_paths);
+
+ imp->fill = fast_path_fill;
+
+ return imp;
+}
diff --git a/pixman/pixman/pixman-fast-path.h b/pixman/pixman/pixman-fast-path.h
index bb7032d86..d08122293 100644
--- a/pixman/pixman/pixman-fast-path.h
+++ b/pixman/pixman/pixman-fast-path.h
@@ -1,449 +1,590 @@
-/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
-/*
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007 Red Hat, Inc.
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of SuSE not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission. SuSE makes no representations about the
- * suitability of this software for any purpose. It is provided "as is"
- * without express or implied warranty.
- *
- * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
- * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
- * Author: Keith Packard, SuSE, Inc.
- */
-
-#ifndef PIXMAN_FAST_PATH_H__
-#define PIXMAN_FAST_PATH_H__
-
-#include "pixman-private.h"
-
-#define PIXMAN_REPEAT_COVER -1
-
-static force_inline pixman_bool_t
-repeat (pixman_repeat_t repeat, int *c, int size)
-{
- if (repeat == PIXMAN_REPEAT_NONE)
- {
- if (*c < 0 || *c >= size)
- return FALSE;
- }
- else if (repeat == PIXMAN_REPEAT_NORMAL)
- {
- while (*c >= size)
- *c -= size;
- while (*c < 0)
- *c += size;
- }
- else if (repeat == PIXMAN_REPEAT_PAD)
- {
- *c = CLIP (*c, 0, size - 1);
- }
- else /* REFLECT */
- {
- *c = MOD (*c, size * 2);
- if (*c >= size)
- *c = size * 2 - *c - 1;
- }
- return TRUE;
-}
-
-/*
- * For each scanline fetched from source image with PAD repeat:
- * - calculate how many pixels need to be padded on the left side
- * - calculate how many pixels need to be padded on the right side
- * - update width to only count pixels which are fetched from the image
- * All this information is returned via 'width', 'left_pad', 'right_pad'
- * arguments. The code is assuming that 'unit_x' is positive.
- *
- * Note: 64-bit math is used in order to avoid potential overflows, which
- * is probably excessive in many cases. This particular function
- * may need its own correctness test and performance tuning.
- */
-static force_inline void
-pad_repeat_get_scanline_bounds (int32_t source_image_width,
- pixman_fixed_t vx,
- pixman_fixed_t unit_x,
- int32_t * width,
- int32_t * left_pad,
- int32_t * right_pad)
-{
- int64_t max_vx = (int64_t) source_image_width << 16;
- int64_t tmp;
- if (vx < 0)
- {
- tmp = ((int64_t) unit_x - 1 - vx) / unit_x;
- if (tmp > *width)
- {
- *left_pad = *width;
- *width = 0;
- }
- else
- {
- *left_pad = (int32_t) tmp;
- *width -= (int32_t) tmp;
- }
- }
- else
- {
- *left_pad = 0;
- }
- tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad;
- if (tmp < 0)
- {
- *right_pad = *width;
- *width = 0;
- }
- else if (tmp >= *width)
- {
- *right_pad = 0;
- }
- else
- {
- *right_pad = *width - (int32_t) tmp;
- *width = (int32_t) tmp;
- }
-}
-
-/* A macroified version of specialized nearest scalers for some
- * common 8888 and 565 formats. It supports SRC and OVER ops.
- *
- * There are two repeat versions, one that handles repeat normal,
- * and one without repeat handling that only works if the src region
- * used is completely covered by the pre-repeated source samples.
- *
- * The loops are unrolled to process two pixels per iteration for better
- * performance on most CPU architectures (superscalar processors
- * can issue several operations simultaneously, other processors can hide
- * instructions latencies by pipelining operations). Unrolling more
- * does not make much sense because the compiler will start running out
- * of spare registers soon.
- */
-
-#define GET_8888_ALPHA(s) ((s) >> 24)
- /* This is not actually used since we don't have an OVER with
- 565 source, but it is needed to build. */
-#define GET_0565_ALPHA(s) 0xff
-
-#define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT, \
- src_type_t, dst_type_t, OP, repeat_mode) \
-static force_inline void \
-scanline_func_name (dst_type_t *dst, \
- src_type_t *src, \
- int32_t w, \
- pixman_fixed_t vx, \
- pixman_fixed_t unit_x, \
- pixman_fixed_t max_vx) \
-{ \
- uint32_t d; \
- src_type_t s1, s2; \
- uint8_t a1, a2; \
- int x1, x2; \
- \
- if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER) \
- abort(); \
- \
- while ((w -= 2) >= 0) \
- { \
- x1 = vx >> 16; \
- vx += unit_x; \
- if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
- { \
- /* This works because we know that unit_x is positive */ \
- while (vx >= max_vx) \
- vx -= max_vx; \
- } \
- s1 = src[x1]; \
- \
- x2 = vx >> 16; \
- vx += unit_x; \
- if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
- { \
- /* This works because we know that unit_x is positive */ \
- while (vx >= max_vx) \
- vx -= max_vx; \
- } \
- s2 = src[x2]; \
- \
- if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \
- { \
- a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \
- a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2); \
- \
- if (a1 == 0xff) \
- { \
- *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \
- } \
- else if (s1) \
- { \
- d = CONVERT_ ## DST_FORMAT ## _TO_8888 (*dst); \
- s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \
- a1 ^= 0xff; \
- UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \
- *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \
- } \
- dst++; \
- \
- if (a2 == 0xff) \
- { \
- *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \
- } \
- else if (s2) \
- { \
- d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \
- s2 = CONVERT_## SRC_FORMAT ## _TO_8888 (s2); \
- a2 ^= 0xff; \
- UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2); \
- *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \
- } \
- dst++; \
- } \
- else /* PIXMAN_OP_SRC */ \
- { \
- *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \
- *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \
- } \
- } \
- \
- if (w & 1) \
- { \
- x1 = vx >> 16; \
- s1 = src[x1]; \
- \
- if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \
- { \
- a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \
- \
- if (a1 == 0xff) \
- { \
- *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \
- } \
- else if (s1) \
- { \
- d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \
- s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \
- a1 ^= 0xff; \
- UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \
- *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \
- } \
- dst++; \
- } \
- else /* PIXMAN_OP_SRC */ \
- { \
- *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \
- } \
- } \
-}
-
-#define FAST_NEAREST_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, dst_type_t, \
- repeat_mode) \
-static void \
-fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp, \
- pixman_op_t op, \
- pixman_image_t * src_image, \
- pixman_image_t * mask_image, \
- pixman_image_t * dst_image, \
- int32_t src_x, \
- int32_t src_y, \
- int32_t mask_x, \
- int32_t mask_y, \
- int32_t dst_x, \
- int32_t dst_y, \
- int32_t width, \
- int32_t height) \
-{ \
- dst_type_t *dst_line; \
- src_type_t *src_first_line; \
- int y; \
- pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */ \
- pixman_fixed_t max_vy; \
- pixman_vector_t v; \
- pixman_fixed_t vx, vy; \
- pixman_fixed_t unit_x, unit_y; \
- int32_t left_pad, right_pad; \
- \
- src_type_t *src; \
- dst_type_t *dst; \
- int src_stride, dst_stride; \
- \
- PIXMAN_IMAGE_GET_LINE (dst_image, dst_x, dst_y, dst_type_t, dst_stride, dst_line, 1); \
- /* pass in 0 instead of src_x and src_y because src_x and src_y need to be \
- * transformed from destination space to source space */ \
- PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1); \
- \
- /* reference point is the center of the pixel */ \
- v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2; \
- v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2; \
- v.vector[2] = pixman_fixed_1; \
- \
- if (!pixman_transform_point_3d (src_image->common.transform, &v)) \
- return; \
- \
- unit_x = src_image->common.transform->matrix[0][0]; \
- unit_y = src_image->common.transform->matrix[1][1]; \
- \
- /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */ \
- v.vector[0] -= pixman_fixed_e; \
- v.vector[1] -= pixman_fixed_e; \
- \
- vx = v.vector[0]; \
- vy = v.vector[1]; \
- \
- if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
- { \
- /* Clamp repeating positions inside the actual samples */ \
- max_vx = src_image->bits.width << 16; \
- max_vy = src_image->bits.height << 16; \
- \
- repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx); \
- repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \
- } \
- \
- if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD || \
- PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \
- { \
- pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x, \
- &width, &left_pad, &right_pad); \
- vx += left_pad * unit_x; \
- } \
- \
- while (--height >= 0) \
- { \
- dst = dst_line; \
- dst_line += dst_stride; \
- \
- y = vy >> 16; \
- vy += unit_y; \
- if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
- repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \
- if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD) \
- { \
- repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height); \
- src = src_first_line + src_stride * y; \
- if (left_pad > 0) \
- { \
- scanline_func (dst, src, left_pad, 0, 0, 0); \
- } \
- if (width > 0) \
- { \
- scanline_func (dst + left_pad, src, width, vx, unit_x, 0); \
- } \
- if (right_pad > 0) \
- { \
- scanline_func (dst + left_pad + width, src + src_image->bits.width - 1, \
- right_pad, 0, 0, 0); \
- } \
- } \
- else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \
- { \
- static src_type_t zero[1] = { 0 }; \
- if (y < 0 || y >= src_image->bits.height) \
- { \
- scanline_func (dst, zero, left_pad + width + right_pad, 0, 0, 0); \
- continue; \
- } \
- src = src_first_line + src_stride * y; \
- if (left_pad > 0) \
- { \
- scanline_func (dst, zero, left_pad, 0, 0, 0); \
- } \
- if (width > 0) \
- { \
- scanline_func (dst + left_pad, src, width, vx, unit_x, 0); \
- } \
- if (right_pad > 0) \
- { \
- scanline_func (dst + left_pad + width, zero, right_pad, 0, 0, 0); \
- } \
- } \
- else \
- { \
- src = src_first_line + src_stride * y; \
- scanline_func (dst, src, width, vx, unit_x, max_vx); \
- } \
- } \
-}
-
-/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */
-#define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func, src_type_t, dst_type_t, \
- repeat_mode) \
- FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, dst_type_t, \
- repeat_mode) \
-
-#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT, \
- src_type_t, dst_type_t, OP, repeat_mode) \
- FAST_NEAREST_SCANLINE(scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \
- SRC_FORMAT, DST_FORMAT, src_type_t, dst_type_t, \
- OP, repeat_mode) \
- FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name ## _ ## OP, \
- scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \
- src_type_t, dst_type_t, repeat_mode)
-
-
-#define SCALED_NEAREST_FLAGS \
- (FAST_PATH_SCALE_TRANSFORM | \
- FAST_PATH_NO_ALPHA_MAP | \
- FAST_PATH_NEAREST_FILTER | \
- FAST_PATH_NO_ACCESSORS | \
- FAST_PATH_NARROW_FORMAT)
-
-#define SIMPLE_NEAREST_FAST_PATH_NORMAL(op,s,d,func) \
- { PIXMAN_OP_ ## op, \
- PIXMAN_ ## s, \
- (SCALED_NEAREST_FLAGS | \
- FAST_PATH_NORMAL_REPEAT | \
- FAST_PATH_X_UNIT_POSITIVE), \
- PIXMAN_null, 0, \
- PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
- fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op, \
- }
-
-#define SIMPLE_NEAREST_FAST_PATH_PAD(op,s,d,func) \
- { PIXMAN_OP_ ## op, \
- PIXMAN_ ## s, \
- (SCALED_NEAREST_FLAGS | \
- FAST_PATH_PAD_REPEAT | \
- FAST_PATH_X_UNIT_POSITIVE), \
- PIXMAN_null, 0, \
- PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
- fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op, \
- }
-
-#define SIMPLE_NEAREST_FAST_PATH_NONE(op,s,d,func) \
- { PIXMAN_OP_ ## op, \
- PIXMAN_ ## s, \
- (SCALED_NEAREST_FLAGS | \
- FAST_PATH_NONE_REPEAT | \
- FAST_PATH_X_UNIT_POSITIVE), \
- PIXMAN_null, 0, \
- PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
- fast_composite_scaled_nearest_ ## func ## _none ## _ ## op, \
- }
-
-#define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func) \
- { PIXMAN_OP_ ## op, \
- PIXMAN_ ## s, \
- SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \
- PIXMAN_null, 0, \
- PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
- fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op, \
- }
-
-/* Prefer the use of 'cover' variant, because it is faster */
-#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func) \
- SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func), \
- SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func), \
- SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func), \
- SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
-
-#endif
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission. SuSE makes no representations about the
+ * suitability of this software for any purpose. It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author: Keith Packard, SuSE, Inc.
+ */
+
+#ifndef PIXMAN_FAST_PATH_H__
+#define PIXMAN_FAST_PATH_H__
+
+#include "pixman-private.h"
+
+#define PIXMAN_REPEAT_COVER -1
+
+static force_inline pixman_bool_t
+repeat (pixman_repeat_t repeat, int *c, int size)
+{
+ if (repeat == PIXMAN_REPEAT_NONE)
+ {
+ if (*c < 0 || *c >= size)
+ return FALSE;
+ }
+ else if (repeat == PIXMAN_REPEAT_NORMAL)
+ {
+ while (*c >= size)
+ *c -= size;
+ while (*c < 0)
+ *c += size;
+ }
+ else if (repeat == PIXMAN_REPEAT_PAD)
+ {
+ *c = CLIP (*c, 0, size - 1);
+ }
+ else /* REFLECT */
+ {
+ *c = MOD (*c, size * 2);
+ if (*c >= size)
+ *c = size * 2 - *c - 1;
+ }
+ return TRUE;
+}
+
+/*
+ * For each scanline fetched from the source image with PAD repeat:
+ * - calculate how many pixels need to be padded on the left side
+ * - calculate how many pixels need to be padded on the right side
+ * - update width to only count pixels which are fetched from the image
+ * All this information is returned via 'width', 'left_pad', 'right_pad'
+ * arguments. The code assumes that 'unit_x' is positive.
+ *
+ * Note: 64-bit math is used in order to avoid potential overflows, which
+ * is probably excessive in many cases. This particular function
+ * may need its own correctness test and performance tuning.
+ */
+static force_inline void
+pad_repeat_get_scanline_bounds (int32_t source_image_width,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ int32_t * width,
+ int32_t * left_pad,
+ int32_t * right_pad)
+{
+ int64_t max_vx = (int64_t) source_image_width << 16;
+ int64_t tmp;
+ if (vx < 0)
+ {
+ tmp = ((int64_t) unit_x - 1 - vx) / unit_x;
+ if (tmp > *width)
+ {
+ *left_pad = *width;
+ *width = 0;
+ }
+ else
+ {
+ *left_pad = (int32_t) tmp;
+ *width -= (int32_t) tmp;
+ }
+ }
+ else
+ {
+ *left_pad = 0;
+ }
+ tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad;
+ if (tmp < 0)
+ {
+ *right_pad = *width;
+ *width = 0;
+ }
+ else if (tmp >= *width)
+ {
+ *right_pad = 0;
+ }
+ else
+ {
+ *right_pad = *width - (int32_t) tmp;
+ *width = (int32_t) tmp;
+ }
+}
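As a quick sanity check of the bounds computation above, here is a minimal standalone sketch (illustrative only, not part of the patch; the helper name and the concrete numbers are made up) that repeats the same 16.16 fixed-point arithmetic: with a 10-pixel source, sampling starting 2.5 pixels left of the image and one source pixel per destination pixel, a 20-pixel scanline splits into 3 left-pad, 10 fetched and 7 right-pad pixels.

/* Standalone illustration of pad_repeat_get_scanline_bounds(); vx and
 * unit_x are 16.16 fixed point, exactly as in the function above. */
#include <stdint.h>
#include <stdio.h>

static void
get_bounds (int32_t src_width, int64_t vx, int64_t unit_x,
            int32_t *width, int32_t *left_pad, int32_t *right_pad)
{
    int64_t max_vx = (int64_t) src_width << 16;
    int64_t tmp;

    if (vx < 0)
    {
        /* number of destination pixels that sample left of the image */
        tmp = (unit_x - 1 - vx) / unit_x;
        *left_pad = tmp > *width ? *width : (int32_t) tmp;
        *width -= *left_pad;
    }
    else
    {
        *left_pad = 0;
    }

    /* number of remaining pixels that still sample inside the image */
    tmp = (unit_x - 1 - vx + max_vx) / unit_x - *left_pad;
    if (tmp < 0)
    {
        *right_pad = *width;
        *width = 0;
    }
    else if (tmp < *width)
    {
        *right_pad = *width - (int32_t) tmp;
        *width = (int32_t) tmp;
    }
    else
    {
        *right_pad = 0;
    }
}

int
main (void)
{
    int32_t width = 20, left_pad, right_pad;

    get_bounds (10, -(5 << 15) /* vx = -2.5 */, 1 << 16 /* unit_x = 1.0 */,
                &width, &left_pad, &right_pad);
    printf ("left_pad=%d width=%d right_pad=%d\n", left_pad, width, right_pad);
    /* prints: left_pad=3 width=10 right_pad=7 */
    return 0;
}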
+
+/* A macroified version of specialized nearest scalers for some
+ * common 8888 and 565 formats. It supports SRC and OVER ops.
+ *
+ * There are two repeat versions, one that handles repeat normal,
+ * and one without repeat handling that only works if the src region
+ * used is completely covered by the pre-repeated source samples.
+ *
+ * The loops are unrolled to process two pixels per iteration for better
+ * performance on most CPU architectures (superscalar processors
+ * can issue several operations simultaneously, other processors can hide
+ * instructions latencies by pipelining operations). Unrolling more
+ * does not make much sense because the compiler will start running out
+ * of spare registers soon.
+ */
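Stripped of the token-pasting, the unrolled inner loop described above looks roughly like the following SRC-only sketch (a simplified, hypothetical illustration; the function name is made up, and the real code generated by FAST_NEAREST_SCANLINE below additionally handles OVER and NORMAL repeat):

/* Hypothetical simplified nearest scanline: copy 32-bit pixels, two per
 * iteration; vx and unit_x are 16.16 fixed-point source coordinates. */
#include <stdint.h>

static void
nearest_scanline_8888_src (uint32_t *dst, const uint32_t *src,
                           int32_t w, int32_t vx, int32_t unit_x)
{
    while ((w -= 2) >= 0)
    {
        /* fetch two source pixels so the loads, the coordinate updates
         * and the stores can overlap on a superscalar/pipelined CPU */
        uint32_t s1 = src[vx >> 16];
        vx += unit_x;
        uint32_t s2 = src[vx >> 16];
        vx += unit_x;

        *dst++ = s1;
        *dst++ = s2;
    }

    if (w & 1)   /* handle an odd trailing pixel */
        *dst = src[vx >> 16];
}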
+
+#define GET_8888_ALPHA(s) ((s) >> 24)
+ /* This is not actually used since we don't have an OVER with
+ 565 source, but it is needed to build. */
+#define GET_0565_ALPHA(s) 0xff
+
+#define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT, \
+ src_type_t, dst_type_t, OP, repeat_mode) \
+static force_inline void \
+scanline_func_name (dst_type_t *dst, \
+ const src_type_t *src, \
+ int32_t w, \
+ pixman_fixed_t vx, \
+ pixman_fixed_t unit_x, \
+ pixman_fixed_t max_vx, \
+ pixman_bool_t fully_transparent_src) \
+{ \
+ uint32_t d; \
+ src_type_t s1, s2; \
+ uint8_t a1, a2; \
+ int x1, x2; \
+ \
+ if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER && fully_transparent_src) \
+ return; \
+ \
+ if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER) \
+ abort(); \
+ \
+ while ((w -= 2) >= 0) \
+ { \
+ x1 = vx >> 16; \
+ vx += unit_x; \
+ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
+ { \
+ /* This works because we know that unit_x is positive */ \
+ while (vx >= max_vx) \
+ vx -= max_vx; \
+ } \
+ s1 = src[x1]; \
+ \
+ x2 = vx >> 16; \
+ vx += unit_x; \
+ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
+ { \
+ /* This works because we know that unit_x is positive */ \
+ while (vx >= max_vx) \
+ vx -= max_vx; \
+ } \
+ s2 = src[x2]; \
+ \
+ if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \
+ { \
+ a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \
+ a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2); \
+ \
+ if (a1 == 0xff) \
+ { \
+ *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \
+ } \
+ else if (s1) \
+ { \
+ d = CONVERT_ ## DST_FORMAT ## _TO_8888 (*dst); \
+ s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \
+ a1 ^= 0xff; \
+ UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \
+ *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \
+ } \
+ dst++; \
+ \
+ if (a2 == 0xff) \
+ { \
+ *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \
+ } \
+ else if (s2) \
+ { \
+ d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \
+ s2 = CONVERT_## SRC_FORMAT ## _TO_8888 (s2); \
+ a2 ^= 0xff; \
+ UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2); \
+ *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \
+ } \
+ dst++; \
+ } \
+ else /* PIXMAN_OP_SRC */ \
+ { \
+ *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \
+ *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \
+ } \
+ } \
+ \
+ if (w & 1) \
+ { \
+ x1 = vx >> 16; \
+ s1 = src[x1]; \
+ \
+ if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \
+ { \
+ a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \
+ \
+ if (a1 == 0xff) \
+ { \
+ *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \
+ } \
+ else if (s1) \
+ { \
+ d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \
+ s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \
+ a1 ^= 0xff; \
+ UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \
+ *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \
+ } \
+ dst++; \
+ } \
+ else /* PIXMAN_OP_SRC */ \
+ { \
+ *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \
+ } \
+ } \
+}
+
+#define FAST_NEAREST_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t, \
+ dst_type_t, repeat_mode, have_mask, mask_is_solid) \
+static void \
+fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp, \
+ pixman_op_t op, \
+ pixman_image_t * src_image, \
+ pixman_image_t * mask_image, \
+ pixman_image_t * dst_image, \
+ int32_t src_x, \
+ int32_t src_y, \
+ int32_t mask_x, \
+ int32_t mask_y, \
+ int32_t dst_x, \
+ int32_t dst_y, \
+ int32_t width, \
+ int32_t height) \
+{ \
+ dst_type_t *dst_line; \
+ mask_type_t *mask_line; \
+ src_type_t *src_first_line; \
+ int y; \
+ pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */ \
+ pixman_fixed_t max_vy; \
+ pixman_vector_t v; \
+ pixman_fixed_t vx, vy; \
+ pixman_fixed_t unit_x, unit_y; \
+ int32_t left_pad, right_pad; \
+ \
+ src_type_t *src; \
+ dst_type_t *dst; \
+ mask_type_t solid_mask; \
+ const mask_type_t *mask = &solid_mask; \
+ int src_stride, mask_stride, dst_stride; \
+ \
+ PIXMAN_IMAGE_GET_LINE (dst_image, dst_x, dst_y, dst_type_t, dst_stride, dst_line, 1); \
+ if (have_mask) \
+ { \
+ if (mask_is_solid) \
+ solid_mask = _pixman_image_get_solid (imp, mask_image, dst_image->bits.format); \
+ else \
+ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t, \
+ mask_stride, mask_line, 1); \
+ } \
+ /* pass in 0 instead of src_x and src_y because src_x and src_y need to be \
+ * transformed from destination space to source space */ \
+ PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1); \
+ \
+ /* reference point is the center of the pixel */ \
+ v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2; \
+ v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2; \
+ v.vector[2] = pixman_fixed_1; \
+ \
+ if (!pixman_transform_point_3d (src_image->common.transform, &v)) \
+ return; \
+ \
+ unit_x = src_image->common.transform->matrix[0][0]; \
+ unit_y = src_image->common.transform->matrix[1][1]; \
+ \
+ /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */ \
+ v.vector[0] -= pixman_fixed_e; \
+ v.vector[1] -= pixman_fixed_e; \
+ \
+ vx = v.vector[0]; \
+ vy = v.vector[1]; \
+ \
+ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
+ { \
+ /* Clamp repeating positions inside the actual samples */ \
+ max_vx = src_image->bits.width << 16; \
+ max_vy = src_image->bits.height << 16; \
+ \
+ repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx); \
+ repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \
+ } \
+ \
+ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD || \
+ PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \
+ { \
+ pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x, \
+ &width, &left_pad, &right_pad); \
+ vx += left_pad * unit_x; \
+ } \
+ \
+ while (--height >= 0) \
+ { \
+ dst = dst_line; \
+ dst_line += dst_stride; \
+ if (have_mask && !mask_is_solid) \
+ { \
+ mask = mask_line; \
+ mask_line += mask_stride; \
+ } \
+ \
+ y = vy >> 16; \
+ vy += unit_y; \
+ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
+ repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \
+ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD) \
+ { \
+ repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height); \
+ src = src_first_line + src_stride * y; \
+ if (left_pad > 0) \
+ { \
+ scanline_func (mask, dst, src, left_pad, 0, 0, 0, FALSE); \
+ } \
+ if (width > 0) \
+ { \
+ scanline_func (mask + (mask_is_solid ? 0 : left_pad), \
+ dst + left_pad, src, width, vx, unit_x, 0, FALSE); \
+ } \
+ if (right_pad > 0) \
+ { \
+ scanline_func (mask + (mask_is_solid ? 0 : left_pad + width), \
+ dst + left_pad + width, src + src_image->bits.width - 1, \
+ right_pad, 0, 0, 0, FALSE); \
+ } \
+ } \
+ else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \
+ { \
+ static const src_type_t zero[1] = { 0 }; \
+ if (y < 0 || y >= src_image->bits.height) \
+ { \
+ scanline_func (mask, dst, zero, left_pad + width + right_pad, 0, 0, 0, TRUE); \
+ continue; \
+ } \
+ src = src_first_line + src_stride * y; \
+ if (left_pad > 0) \
+ { \
+ scanline_func (mask, dst, zero, left_pad, 0, 0, 0, TRUE); \
+ } \
+ if (width > 0) \
+ { \
+ scanline_func (mask + (mask_is_solid ? 0 : left_pad), \
+ dst + left_pad, src, width, vx, unit_x, 0, FALSE); \
+ } \
+ if (right_pad > 0) \
+ { \
+ scanline_func (mask + (mask_is_solid ? 0 : left_pad + width), \
+ dst + left_pad + width, zero, right_pad, 0, 0, 0, TRUE); \
+ } \
+ } \
+ else \
+ { \
+ src = src_first_line + src_stride * y; \
+ scanline_func (mask, dst, src, width, vx, unit_x, max_vx, FALSE); \
+ } \
+ } \
+}
+
+/* A workaround for old Sun Studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */
+#define FAST_NEAREST_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t, \
+ dst_type_t, repeat_mode, have_mask, mask_is_solid) \
+ FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t, \
+ dst_type_t, repeat_mode, have_mask, mask_is_solid)
+
+#define FAST_NEAREST_MAINLOOP_NOMASK(scale_func_name, scanline_func, src_type_t, dst_type_t, \
+ repeat_mode) \
+ static force_inline void \
+ scanline_func##scale_func_name##_wrapper ( \
+ const uint8_t *mask, \
+ dst_type_t *dst, \
+ const src_type_t *src, \
+ int32_t w, \
+ pixman_fixed_t vx, \
+ pixman_fixed_t unit_x, \
+ pixman_fixed_t max_vx, \
+ pixman_bool_t fully_transparent_src) \
+ { \
+ scanline_func (dst, src, w, vx, unit_x, max_vx, fully_transparent_src); \
+ } \
+ FAST_NEAREST_MAINLOOP_INT (scale_func_name, scanline_func##scale_func_name##_wrapper, \
+ src_type_t, uint8_t, dst_type_t, repeat_mode, FALSE, FALSE)
+
+#define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func, src_type_t, dst_type_t, \
+ repeat_mode) \
+ FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name, scanline_func, src_type_t, \
+ dst_type_t, repeat_mode)
+
+#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT, \
+ src_type_t, dst_type_t, OP, repeat_mode) \
+ FAST_NEAREST_SCANLINE(scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \
+ SRC_FORMAT, DST_FORMAT, src_type_t, dst_type_t, \
+ OP, repeat_mode) \
+ FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name ## _ ## OP, \
+ scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \
+ src_type_t, dst_type_t, repeat_mode)
+
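For orientation, the macros above are consumed elsewhere in pixman roughly as follows (a hedged example following the pattern visible in the SSE2 hunk later in this diff; the exact instantiations live in pixman-fast-path.c and the SIMD back ends, not in this header):

#include "pixman-fast-path.h"

/* Assumed example: expands to scaled_nearest_scanline_8888_8888_cover_SRC()
 * and fast_composite_scaled_nearest_8888_8888_cover_SRC(), using the
 * CONVERT_* format helpers from pixman-private.h. */
FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER)

/* A matching fast-path table entry would then reference the generated
 * function through the helper defined below:
 *
 *     SIMPLE_NEAREST_FAST_PATH_COVER (SRC, a8r8g8b8, a8r8g8b8, 8888_8888),
 */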
+
+#define SCALED_NEAREST_FLAGS \
+ (FAST_PATH_SCALE_TRANSFORM | \
+ FAST_PATH_NO_ALPHA_MAP | \
+ FAST_PATH_NEAREST_FILTER | \
+ FAST_PATH_NO_ACCESSORS | \
+ FAST_PATH_NARROW_FORMAT)
+
+#define SIMPLE_NEAREST_FAST_PATH_NORMAL(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ (SCALED_NEAREST_FLAGS | \
+ FAST_PATH_NORMAL_REPEAT | \
+ FAST_PATH_X_UNIT_POSITIVE), \
+ PIXMAN_null, 0, \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op, \
+ }
+
+#define SIMPLE_NEAREST_FAST_PATH_PAD(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ (SCALED_NEAREST_FLAGS | \
+ FAST_PATH_PAD_REPEAT | \
+ FAST_PATH_X_UNIT_POSITIVE), \
+ PIXMAN_null, 0, \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op, \
+ }
+
+#define SIMPLE_NEAREST_FAST_PATH_NONE(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ (SCALED_NEAREST_FLAGS | \
+ FAST_PATH_NONE_REPEAT | \
+ FAST_PATH_X_UNIT_POSITIVE), \
+ PIXMAN_null, 0, \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_nearest_ ## func ## _none ## _ ## op, \
+ }
+
+#define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \
+ PIXMAN_null, 0, \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op, \
+ }
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ (SCALED_NEAREST_FLAGS | \
+ FAST_PATH_NORMAL_REPEAT | \
+ FAST_PATH_X_UNIT_POSITIVE), \
+ PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op, \
+ }
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ (SCALED_NEAREST_FLAGS | \
+ FAST_PATH_PAD_REPEAT | \
+ FAST_PATH_X_UNIT_POSITIVE), \
+ PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op, \
+ }
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ (SCALED_NEAREST_FLAGS | \
+ FAST_PATH_NONE_REPEAT | \
+ FAST_PATH_X_UNIT_POSITIVE), \
+ PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_nearest_ ## func ## _none ## _ ## op, \
+ }
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \
+ PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op, \
+ }
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ (SCALED_NEAREST_FLAGS | \
+ FAST_PATH_NORMAL_REPEAT | \
+ FAST_PATH_X_UNIT_POSITIVE), \
+ PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op, \
+ }
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ (SCALED_NEAREST_FLAGS | \
+ FAST_PATH_PAD_REPEAT | \
+ FAST_PATH_X_UNIT_POSITIVE), \
+ PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op, \
+ }
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ (SCALED_NEAREST_FLAGS | \
+ FAST_PATH_NONE_REPEAT | \
+ FAST_PATH_X_UNIT_POSITIVE), \
+ PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_nearest_ ## func ## _none ## _ ## op, \
+ }
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \
+ PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op, \
+ }
+
+/* Prefer the 'cover' variant because it is faster */
+#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func) \
+ SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func), \
+ SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func), \
+ SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func), \
+ SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func) \
+ SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func), \
+ SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func), \
+ SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH(op,s,d,func) \
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER (op,s,d,func), \
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE (op,s,d,func), \
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD (op,s,d,func)
+
+#endif
diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c
index 91adc0560..2e135e2fe 100644
--- a/pixman/pixman/pixman-sse2.c
+++ b/pixman/pixman/pixman-sse2.c
@@ -5800,7 +5800,8 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
int32_t w,
pixman_fixed_t vx,
pixman_fixed_t unit_x,
- pixman_fixed_t max_vx)
+ pixman_fixed_t max_vx,
+ pixman_bool_t fully_transparent_src)
{
uint32_t s, d;
const uint32_t* pm = NULL;
@@ -5809,6 +5810,9 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_alpha_lo, xmm_alpha_hi;
+ if (fully_transparent_src)
+ return;
+
/* Align dst on a 16-byte boundary */
while (w && ((unsigned long)pd & 15))
{
@@ -5894,6 +5898,119 @@ FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
scaled_nearest_scanline_sse2_8888_8888_OVER,
uint32_t, uint32_t, PAD)
+static force_inline void
+scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
+ uint32_t * dst,
+ const uint32_t * src,
+ int32_t w,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx,
+ pixman_bool_t zero_src)
+{
+ __m128i xmm_mask;
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+ if (zero_src || (*mask >> 24) == 0)
+ return;
+
+ xmm_mask = create_mask_16_128 (*mask >> 24);
+
+ while (w && (unsigned long)dst & 15)
+ {
+ uint32_t s = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+
+ if (s)
+ {
+ uint32_t d = *dst;
+
+ __m64 ms = unpack_32_1x64 (s);
+ __m64 alpha = expand_alpha_1x64 (ms);
+ __m64 dest = _mm_movepi64_pi64 (xmm_mask);
+ __m64 alpha_dst = unpack_32_1x64 (d);
+
+ *dst = pack_1x64_32 (
+ in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
+ }
+ dst++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ uint32_t tmp1, tmp2, tmp3, tmp4;
+
+ tmp1 = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+ tmp2 = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+ tmp3 = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+ tmp4 = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+
+ xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
+
+ if (!is_zero (xmm_src))
+ {
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_mask, &xmm_mask,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ dst += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ uint32_t s = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+
+ if (s)
+ {
+ uint32_t d = *dst;
+
+ __m64 ms = unpack_32_1x64 (s);
+ __m64 alpha = expand_alpha_1x64 (ms);
+ __m64 mask = _mm_movepi64_pi64 (xmm_mask);
+ __m64 dest = unpack_32_1x64 (d);
+
+ *dst = pack_1x64_32 (
+ in_over_1x64 (&ms, &alpha, &mask, &dest));
+ }
+
+ dst++;
+ w--;
+ }
+
+ _mm_empty ();
+}
+
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
+ scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+ uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
+ scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+ uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
+ scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+ uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
+
static const pixman_fast_path_t sse2_fast_paths[] =
{
/* PIXMAN_OP_OVER */
@@ -5990,6 +6107,11 @@ static const pixman_fast_path_t sse2_fast_paths[] =
SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
+
{ PIXMAN_OP_NONE },
};
diff --git a/pixman/test/Makefile.am b/pixman/test/Makefile.am
index 8d8471d1c..3ce466eec 100644
--- a/pixman/test/Makefile.am
+++ b/pixman/test/Makefile.am
@@ -1,7 +1,6 @@
AM_CFLAGS = @OPENMP_CFLAGS@
-AM_LDFLAGS = @OPENMP_CFLAGS@
-
-TEST_LDADD = $(top_builddir)/pixman/libpixman-1.la -lm
+AM_LDFLAGS = @OPENMP_CFLAGS@ @TESTPROGS_EXTRA_LDFLAGS@
+LDADD = $(top_builddir)/pixman/libpixman-1.la -lm
INCLUDES = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman
TESTPROGRAMS = \
@@ -22,121 +21,24 @@ TESTPROGRAMS = \
affine-test \
composite
-a1_trap_test_LDADD = $(TEST_LDADD)
-a1_trap_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
-
-fetch_test_LDADD = $(TEST_LDADD)
-fetch_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
-
-trap_crasher_LDADD = $(TEST_LDADD)
-trap_crasher_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
-
-oob_test_LDADD = $(TEST_LDADD)
-oob_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
-
-scaling_crash_test_LDADD = $(TEST_LDADD)
-scaling_crash_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
-
-region_translate_test_LDADD = $(TEST_LDADD)
-region_translate_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
-
-pdf_op_test_LDADD = $(TEST_LDADD)
-pdf_op_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
pdf_op_test_SOURCES = pdf-op-test.c utils.c utils.h
-
-region_test_LDADD = $(TEST_LDADD)
-region_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
region_test_SOURCES = region-test.c utils.c utils.h
-
-blitters_test_LDADD = $(TEST_LDADD)
-blitters_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
blitters_test_SOURCES = blitters-test.c utils.c utils.h
-
-scaling_test_LDADD = $(TEST_LDADD)
-scaling_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
scaling_test_SOURCES = scaling-test.c utils.c utils.h
-
-affine_test_LDADD = $(TEST_LDADD)
-affine_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
affine_test_SOURCES = affine-test.c utils.c utils.h
-
-alphamap_LDADD = $(TEST_LDADD)
-alphamap_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
alphamap_SOURCES = alphamap.c utils.c utils.h
-
-alpha_loop_LDADD = $(TEST_LDADD)
-alpha_loop_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
alpha_loop_SOURCES = alpha-loop.c utils.c utils.h
-
-composite_LDADD = $(TEST_LDADD)
-composite_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
composite_SOURCES = composite.c utils.c utils.h
-
-gradient_crash_test_LDADD = $(TEST_LDADD)
-gradient_crash_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
gradient_crash_test_SOURCES = gradient-crash-test.c utils.c utils.h
-
-stress_test_LDADD = $(TEST_LDADD)
-stress_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
stress_test_SOURCES = stress-test.c utils.c utils.h
-# GTK using test programs
-
-if HAVE_GTK
-
-GTK_LDADD = $(TEST_LDADD) $(GTK_LIBS)
-GTK_UTILS = gtk-utils.c gtk-utils.h
-
-TESTPROGRAMS_GTK = \
- clip-test \
- clip-in \
- composite-test \
- gradient-test \
- radial-test \
- alpha-test \
- screen-test \
- convolution-test \
- trap-test
-
-INCLUDES += $(GTK_CFLAGS)
-
-gradient_test_LDADD = $(GTK_LDADD)
-gradient_test_SOURCES = gradient-test.c $(GTK_UTILS)
-
-radial_test_LDADD = $(GTK_LDADD)
-radial_test_SOURCES = radial-test.c utils.c utils.h $(GTK_UTILS)
-
-alpha_test_LDADD = $(GTK_LDADD)
-alpha_test_SOURCES = alpha-test.c $(GTK_UTILS)
-
-composite_test_LDADD = $(GTK_LDADD)
-composite_test_SOURCES = composite-test.c $(GTK_UTILS)
-
-clip_test_LDADD = $(GTK_LDADD)
-clip_test_SOURCES = clip-test.c $(GTK_UTILS)
-
-clip_in_LDADD = $(GTK_LDADD)
-clip_in_SOURCES = clip-in.c $(GTK_UTILS)
-
-trap_test_LDADD = $(GTK_LDADD)
-trap_test_SOURCES = trap-test.c $(GTK_UTILS)
-
-screen_test_LDADD = $(GTK_LDADD)
-screen_test_SOURCES = screen-test.c $(GTK_UTILS)
-
-convolution_test_LDADD = $(GTK_LDADD)
-convolution_test_SOURCES = convolution-test.c $(GTK_UTILS)
-
-endif
-
# Benchmarks
BENCHMARKS = \
lowlevel-blt-bench
lowlevel_blt_bench_SOURCES = lowlevel-blt-bench.c utils.c utils.h
-lowlevel_blt_bench_LDADD = $(TEST_LDADD)
-noinst_PROGRAMS = $(TESTPROGRAMS) $(TESTPROGRAMS_GTK) $(BENCHMARKS)
+noinst_PROGRAMS = $(TESTPROGRAMS) $(BENCHMARKS)
TESTS = $(TESTPROGRAMS)
diff --git a/pixman/test/scaling-test.c b/pixman/test/scaling-test.c
index 7b78017a3..dbb9d39b0 100644
--- a/pixman/test/scaling-test.c
+++ b/pixman/test/scaling-test.c
@@ -1,250 +1,368 @@
-/*
- * Test program, which can detect some problems with nearest neighbour
- * and bilinear scaling in pixman. Testing is done by running lots
- * of random SRC and OVER compositing operations a8r8g8b8, x8a8r8g8b8
- * and r5g6b5 color formats.
- *
- * Script 'fuzzer-find-diff.pl' can be used to narrow down the problem in
- * the case of test failure.
- */
-#include <assert.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include "utils.h"
-
-#define MAX_SRC_WIDTH 16
-#define MAX_SRC_HEIGHT 16
-#define MAX_DST_WIDTH 16
-#define MAX_DST_HEIGHT 16
-#define MAX_STRIDE 4
-
-/*
- * Composite operation with pseudorandom images
- */
-uint32_t
-test_composite (int testnum,
- int verbose)
-{
- int i;
- pixman_image_t * src_img;
- pixman_image_t * dst_img;
- pixman_transform_t transform;
- pixman_region16_t clip;
- int src_width, src_height;
- int dst_width, dst_height;
- int src_stride, dst_stride;
- int src_x, src_y;
- int dst_x, dst_y;
- int src_bpp;
- int dst_bpp;
- int w, h;
- pixman_fixed_t scale_x = 65536, scale_y = 65536;
- pixman_fixed_t translate_x = 0, translate_y = 0;
- pixman_op_t op;
- pixman_repeat_t repeat = PIXMAN_REPEAT_NONE;
- pixman_format_code_t src_fmt, dst_fmt;
- uint32_t * srcbuf;
- uint32_t * dstbuf;
- uint32_t crc32;
- FLOAT_REGS_CORRUPTION_DETECTOR_START ();
-
- lcg_srand (testnum);
-
- src_bpp = (lcg_rand_n (2) == 0) ? 2 : 4;
- dst_bpp = (lcg_rand_n (2) == 0) ? 2 : 4;
- op = (lcg_rand_n (2) == 0) ? PIXMAN_OP_SRC : PIXMAN_OP_OVER;
-
- src_width = lcg_rand_n (MAX_SRC_WIDTH) + 1;
- src_height = lcg_rand_n (MAX_SRC_HEIGHT) + 1;
- dst_width = lcg_rand_n (MAX_DST_WIDTH) + 1;
- dst_height = lcg_rand_n (MAX_DST_HEIGHT) + 1;
- src_stride = src_width * src_bpp + lcg_rand_n (MAX_STRIDE) * src_bpp;
- dst_stride = dst_width * dst_bpp + lcg_rand_n (MAX_STRIDE) * dst_bpp;
-
- if (src_stride & 3)
- src_stride += 2;
-
- if (dst_stride & 3)
- dst_stride += 2;
-
- src_x = -(src_width / 4) + lcg_rand_n (src_width * 3 / 2);
- src_y = -(src_height / 4) + lcg_rand_n (src_height * 3 / 2);
- dst_x = -(dst_width / 4) + lcg_rand_n (dst_width * 3 / 2);
- dst_y = -(dst_height / 4) + lcg_rand_n (dst_height * 3 / 2);
- w = lcg_rand_n (dst_width * 3 / 2 - dst_x);
- h = lcg_rand_n (dst_height * 3 / 2 - dst_y);
-
- srcbuf = (uint32_t *)malloc (src_stride * src_height);
- dstbuf = (uint32_t *)malloc (dst_stride * dst_height);
-
- for (i = 0; i < src_stride * src_height; i++)
- *((uint8_t *)srcbuf + i) = lcg_rand_n (256);
-
- for (i = 0; i < dst_stride * dst_height; i++)
- *((uint8_t *)dstbuf + i) = lcg_rand_n (256);
-
- src_fmt = src_bpp == 4 ? (lcg_rand_n (2) == 0 ?
- PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5;
-
- dst_fmt = dst_bpp == 4 ? (lcg_rand_n (2) == 0 ?
- PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5;
-
- src_img = pixman_image_create_bits (
- src_fmt, src_width, src_height, srcbuf, src_stride);
-
- dst_img = pixman_image_create_bits (
- dst_fmt, dst_width, dst_height, dstbuf, dst_stride);
-
- image_endian_swap (src_img, src_bpp * 8);
- image_endian_swap (dst_img, dst_bpp * 8);
-
- if (lcg_rand_n (8) > 0)
- {
- scale_x = -32768 * 3 + lcg_rand_N (65536 * 5);
- scale_y = -32768 * 3 + lcg_rand_N (65536 * 5);
- translate_x = lcg_rand_N (65536);
- translate_y = lcg_rand_N (65536);
- pixman_transform_init_scale (&transform, scale_x, scale_y);
- pixman_transform_translate (&transform, NULL, translate_x, translate_y);
- pixman_image_set_transform (src_img, &transform);
- }
-
- switch (lcg_rand_n (4))
- {
- case 0:
- repeat = PIXMAN_REPEAT_NONE;
- break;
-
- case 1:
- repeat = PIXMAN_REPEAT_NORMAL;
- break;
-
- case 2:
- repeat = PIXMAN_REPEAT_PAD;
- break;
-
- case 3:
- repeat = PIXMAN_REPEAT_REFLECT;
- break;
-
- default:
- break;
- }
- pixman_image_set_repeat (src_img, repeat);
-
- if (lcg_rand_n (2))
- pixman_image_set_filter (src_img, PIXMAN_FILTER_NEAREST, NULL, 0);
- else
- pixman_image_set_filter (src_img, PIXMAN_FILTER_BILINEAR, NULL, 0);
-
- if (verbose)
- {
- printf ("src_fmt=%08X, dst_fmt=%08X\n", src_fmt, dst_fmt);
- printf ("op=%d, scale_x=%d, scale_y=%d, repeat=%d\n",
- op, scale_x, scale_y, repeat);
- printf ("translate_x=%d, translate_y=%d\n",
- translate_x, translate_y);
- printf ("src_width=%d, src_height=%d, dst_width=%d, dst_height=%d\n",
- src_width, src_height, dst_width, dst_height);
- printf ("src_x=%d, src_y=%d, dst_x=%d, dst_y=%d\n",
- src_x, src_y, dst_x, dst_y);
- printf ("w=%d, h=%d\n", w, h);
- }
-
- if (lcg_rand_n (8) == 0)
- {
- pixman_box16_t clip_boxes[2];
- int n = lcg_rand_n (2) + 1;
-
- for (i = 0; i < n; i++)
- {
- clip_boxes[i].x1 = lcg_rand_n (src_width);
- clip_boxes[i].y1 = lcg_rand_n (src_height);
- clip_boxes[i].x2 =
- clip_boxes[i].x1 + lcg_rand_n (src_width - clip_boxes[i].x1);
- clip_boxes[i].y2 =
- clip_boxes[i].y1 + lcg_rand_n (src_height - clip_boxes[i].y1);
-
- if (verbose)
- {
- printf ("source clip box: [%d,%d-%d,%d]\n",
- clip_boxes[i].x1, clip_boxes[i].y1,
- clip_boxes[i].x2, clip_boxes[i].y2);
- }
- }
-
- pixman_region_init_rects (&clip, clip_boxes, n);
- pixman_image_set_clip_region (src_img, &clip);
- pixman_image_set_source_clipping (src_img, 1);
- pixman_region_fini (&clip);
- }
-
- if (lcg_rand_n (8) == 0)
- {
- pixman_box16_t clip_boxes[2];
- int n = lcg_rand_n (2) + 1;
- for (i = 0; i < n; i++)
- {
- clip_boxes[i].x1 = lcg_rand_n (dst_width);
- clip_boxes[i].y1 = lcg_rand_n (dst_height);
- clip_boxes[i].x2 =
- clip_boxes[i].x1 + lcg_rand_n (dst_width - clip_boxes[i].x1);
- clip_boxes[i].y2 =
- clip_boxes[i].y1 + lcg_rand_n (dst_height - clip_boxes[i].y1);
-
- if (verbose)
- {
- printf ("destination clip box: [%d,%d-%d,%d]\n",
- clip_boxes[i].x1, clip_boxes[i].y1,
- clip_boxes[i].x2, clip_boxes[i].y2);
- }
- }
- pixman_region_init_rects (&clip, clip_boxes, n);
- pixman_image_set_clip_region (dst_img, &clip);
- pixman_region_fini (&clip);
- }
-
- pixman_image_composite (op, src_img, NULL, dst_img,
- src_x, src_y, 0, 0, dst_x, dst_y, w, h);
-
- if (dst_fmt == PIXMAN_x8r8g8b8)
- {
- /* ignore unused part */
- for (i = 0; i < dst_stride * dst_height / 4; i++)
- dstbuf[i] &= 0xFFFFFF;
- }
-
- image_endian_swap (dst_img, dst_bpp * 8);
-
- if (verbose)
- {
- int j;
-
- for (i = 0; i < dst_height; i++)
- {
- for (j = 0; j < dst_stride; j++)
- printf ("%02X ", *((uint8_t *)dstbuf + i * dst_stride + j));
-
- printf ("\n");
- }
- }
-
- pixman_image_unref (src_img);
- pixman_image_unref (dst_img);
-
- crc32 = compute_crc32 (0, dstbuf, dst_stride * dst_height);
- free (srcbuf);
- free (dstbuf);
-
- FLOAT_REGS_CORRUPTION_DETECTOR_FINISH ();
- return crc32;
-}
-
-int
-main (int argc, const char *argv[])
-{
- pixman_disable_out_of_bounds_workaround ();
-
- return fuzzer_test_main("scaling", 8000000, 0x7F1AB59F,
- test_composite, argc, argv);
-}
+/*
+ * Test program that can detect some problems with nearest neighbour
+ * and bilinear scaling in pixman. Testing is done by running lots
+ * of random SRC, OVER and ADD compositing operations, optionally through
+ * an a8 mask, over a8r8g8b8, x8r8g8b8 and r5g6b5 color formats.
+ *
+ * Script 'fuzzer-find-diff.pl' can be used to narrow down the problem in
+ * the case of test failure.
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+#define MAX_SRC_WIDTH 48
+#define MAX_SRC_HEIGHT 8
+#define MAX_DST_WIDTH 48
+#define MAX_DST_HEIGHT 8
+#define MAX_STRIDE 4
+
+/*
+ * Composite operation with pseudorandom images
+ */
+uint32_t
+test_composite (int testnum,
+ int verbose)
+{
+ int i;
+ pixman_image_t * src_img;
+ pixman_image_t * mask_img;
+ pixman_image_t * dst_img;
+ pixman_transform_t transform;
+ pixman_region16_t clip;
+ int src_width, src_height;
+ int mask_width, mask_height;
+ int dst_width, dst_height;
+ int src_stride, mask_stride, dst_stride;
+ int src_x, src_y;
+ int mask_x, mask_y;
+ int dst_x, dst_y;
+ int src_bpp;
+ int mask_bpp = 1;
+ int dst_bpp;
+ int w, h;
+ pixman_fixed_t scale_x = 65536, scale_y = 65536;
+ pixman_fixed_t translate_x = 0, translate_y = 0;
+ pixman_fixed_t mask_scale_x = 65536, mask_scale_y = 65536;
+ pixman_fixed_t mask_translate_x = 0, mask_translate_y = 0;
+ pixman_op_t op;
+ pixman_repeat_t repeat = PIXMAN_REPEAT_NONE;
+ pixman_repeat_t mask_repeat = PIXMAN_REPEAT_NONE;
+ pixman_format_code_t src_fmt, dst_fmt;
+ uint32_t * srcbuf;
+ uint32_t * dstbuf;
+ uint32_t * maskbuf;
+ uint32_t crc32;
+ FLOAT_REGS_CORRUPTION_DETECTOR_START ();
+
+ lcg_srand (testnum);
+
+ src_bpp = (lcg_rand_n (2) == 0) ? 2 : 4;
+ dst_bpp = (lcg_rand_n (2) == 0) ? 2 : 4;
+ switch (lcg_rand_n (3))
+ {
+ case 0:
+ op = PIXMAN_OP_SRC;
+ break;
+ case 1:
+ op = PIXMAN_OP_OVER;
+ break;
+ default:
+ op = PIXMAN_OP_ADD;
+ break;
+ }
+
+ src_width = lcg_rand_n (MAX_SRC_WIDTH) + 1;
+ src_height = lcg_rand_n (MAX_SRC_HEIGHT) + 1;
+
+ if (lcg_rand_n (2))
+ {
+ mask_width = lcg_rand_n (MAX_SRC_WIDTH) + 1;
+ mask_height = lcg_rand_n (MAX_SRC_HEIGHT) + 1;
+ }
+ else
+ {
+ mask_width = mask_height = 1;
+ }
+
+ dst_width = lcg_rand_n (MAX_DST_WIDTH) + 1;
+ dst_height = lcg_rand_n (MAX_DST_HEIGHT) + 1;
+ src_stride = src_width * src_bpp + lcg_rand_n (MAX_STRIDE) * src_bpp;
+ mask_stride = mask_width * mask_bpp + lcg_rand_n (MAX_STRIDE) * mask_bpp;
+ dst_stride = dst_width * dst_bpp + lcg_rand_n (MAX_STRIDE) * dst_bpp;
+
+ if (src_stride & 3)
+ src_stride += 2;
+
+ if (mask_stride & 1)
+ mask_stride += 1;
+ if (mask_stride & 2)
+ mask_stride += 2;
+
+ if (dst_stride & 3)
+ dst_stride += 2;
+
+ src_x = -(src_width / 4) + lcg_rand_n (src_width * 3 / 2);
+ src_y = -(src_height / 4) + lcg_rand_n (src_height * 3 / 2);
+ mask_x = -(mask_width / 4) + lcg_rand_n (mask_width * 3 / 2);
+ mask_y = -(mask_height / 4) + lcg_rand_n (mask_height * 3 / 2);
+ dst_x = -(dst_width / 4) + lcg_rand_n (dst_width * 3 / 2);
+ dst_y = -(dst_height / 4) + lcg_rand_n (dst_height * 3 / 2);
+ w = lcg_rand_n (dst_width * 3 / 2 - dst_x);
+ h = lcg_rand_n (dst_height * 3 / 2 - dst_y);
+
+ srcbuf = (uint32_t *)malloc (src_stride * src_height);
+ maskbuf = (uint32_t *)malloc (mask_stride * mask_height);
+ dstbuf = (uint32_t *)malloc (dst_stride * dst_height);
+
+ for (i = 0; i < src_stride * src_height; i++)
+ *((uint8_t *)srcbuf + i) = lcg_rand_n (256);
+
+ for (i = 0; i < mask_stride * mask_height; i++)
+ *((uint8_t *)maskbuf + i) = lcg_rand_n (256);
+
+ for (i = 0; i < dst_stride * dst_height; i++)
+ *((uint8_t *)dstbuf + i) = lcg_rand_n (256);
+
+ src_fmt = src_bpp == 4 ? (lcg_rand_n (2) == 0 ?
+ PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5;
+
+ dst_fmt = dst_bpp == 4 ? (lcg_rand_n (2) == 0 ?
+ PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5;
+
+ src_img = pixman_image_create_bits (
+ src_fmt, src_width, src_height, srcbuf, src_stride);
+
+ mask_img = pixman_image_create_bits (
+ PIXMAN_a8, mask_width, mask_height, maskbuf, mask_stride);
+
+ dst_img = pixman_image_create_bits (
+ dst_fmt, dst_width, dst_height, dstbuf, dst_stride);
+
+ image_endian_swap (src_img, src_bpp * 8);
+ image_endian_swap (dst_img, dst_bpp * 8);
+
+ if (lcg_rand_n (4) > 0)
+ {
+ scale_x = -32768 * 3 + lcg_rand_N (65536 * 5);
+ scale_y = -32768 * 3 + lcg_rand_N (65536 * 5);
+ translate_x = lcg_rand_N (65536);
+ translate_y = lcg_rand_N (65536);
+ pixman_transform_init_scale (&transform, scale_x, scale_y);
+ pixman_transform_translate (&transform, NULL, translate_x, translate_y);
+ pixman_image_set_transform (src_img, &transform);
+ }
+
+ if (lcg_rand_n (2) > 0)
+ {
+ mask_scale_x = -32768 * 3 + lcg_rand_N (65536 * 5);
+ mask_scale_y = -32768 * 3 + lcg_rand_N (65536 * 5);
+ mask_translate_x = lcg_rand_N (65536);
+ mask_translate_y = lcg_rand_N (65536);
+ pixman_transform_init_scale (&transform, mask_scale_x, mask_scale_y);
+ pixman_transform_translate (&transform, NULL, mask_translate_x, mask_translate_y);
+ pixman_image_set_transform (mask_img, &transform);
+ }
+
+ switch (lcg_rand_n (4))
+ {
+ case 0:
+ mask_repeat = PIXMAN_REPEAT_NONE;
+ break;
+
+ case 1:
+ mask_repeat = PIXMAN_REPEAT_NORMAL;
+ break;
+
+ case 2:
+ mask_repeat = PIXMAN_REPEAT_PAD;
+ break;
+
+ case 3:
+ mask_repeat = PIXMAN_REPEAT_REFLECT;
+ break;
+
+ default:
+ break;
+ }
+ pixman_image_set_repeat (mask_img, mask_repeat);
+
+ switch (lcg_rand_n (4))
+ {
+ case 0:
+ repeat = PIXMAN_REPEAT_NONE;
+ break;
+
+ case 1:
+ repeat = PIXMAN_REPEAT_NORMAL;
+ break;
+
+ case 2:
+ repeat = PIXMAN_REPEAT_PAD;
+ break;
+
+ case 3:
+ repeat = PIXMAN_REPEAT_REFLECT;
+ break;
+
+ default:
+ break;
+ }
+ pixman_image_set_repeat (src_img, repeat);
+
+ if (lcg_rand_n (2))
+ pixman_image_set_filter (src_img, PIXMAN_FILTER_NEAREST, NULL, 0);
+ else
+ pixman_image_set_filter (src_img, PIXMAN_FILTER_BILINEAR, NULL, 0);
+
+ if (lcg_rand_n (2))
+ pixman_image_set_filter (mask_img, PIXMAN_FILTER_NEAREST, NULL, 0);
+ else
+ pixman_image_set_filter (mask_img, PIXMAN_FILTER_BILINEAR, NULL, 0);
+
+ if (verbose)
+ {
+ printf ("src_fmt=%08X, dst_fmt=%08X\n", src_fmt, dst_fmt);
+ printf ("op=%d, scale_x=%d, scale_y=%d, repeat=%d\n",
+ op, scale_x, scale_y, repeat);
+ printf ("translate_x=%d, translate_y=%d\n",
+ translate_x, translate_y);
+ printf ("src_width=%d, src_height=%d, dst_width=%d, dst_height=%d\n",
+ src_width, src_height, dst_width, dst_height);
+ printf ("src_x=%d, src_y=%d, dst_x=%d, dst_y=%d\n",
+ src_x, src_y, dst_x, dst_y);
+ printf ("w=%d, h=%d\n", w, h);
+ }
+
+ if (lcg_rand_n (8) == 0)
+ {
+ pixman_box16_t clip_boxes[2];
+ int n = lcg_rand_n (2) + 1;
+
+ for (i = 0; i < n; i++)
+ {
+ clip_boxes[i].x1 = lcg_rand_n (src_width);
+ clip_boxes[i].y1 = lcg_rand_n (src_height);
+ clip_boxes[i].x2 =
+ clip_boxes[i].x1 + lcg_rand_n (src_width - clip_boxes[i].x1);
+ clip_boxes[i].y2 =
+ clip_boxes[i].y1 + lcg_rand_n (src_height - clip_boxes[i].y1);
+
+ if (verbose)
+ {
+ printf ("source clip box: [%d,%d-%d,%d]\n",
+ clip_boxes[i].x1, clip_boxes[i].y1,
+ clip_boxes[i].x2, clip_boxes[i].y2);
+ }
+ }
+
+ pixman_region_init_rects (&clip, clip_boxes, n);
+ pixman_image_set_clip_region (src_img, &clip);
+ pixman_image_set_source_clipping (src_img, 1);
+ pixman_region_fini (&clip);
+ }
+
+ if (lcg_rand_n (8) == 0)
+ {
+ pixman_box16_t clip_boxes[2];
+ int n = lcg_rand_n (2) + 1;
+
+ for (i = 0; i < n; i++)
+ {
+ clip_boxes[i].x1 = lcg_rand_n (mask_width);
+ clip_boxes[i].y1 = lcg_rand_n (mask_height);
+ clip_boxes[i].x2 =
+ clip_boxes[i].x1 + lcg_rand_n (mask_width - clip_boxes[i].x1);
+ clip_boxes[i].y2 =
+ clip_boxes[i].y1 + lcg_rand_n (mask_height - clip_boxes[i].y1);
+
+ if (verbose)
+ {
+ printf ("mask clip box: [%d,%d-%d,%d]\n",
+ clip_boxes[i].x1, clip_boxes[i].y1,
+ clip_boxes[i].x2, clip_boxes[i].y2);
+ }
+ }
+
+ pixman_region_init_rects (&clip, clip_boxes, n);
+ pixman_image_set_clip_region (mask_img, &clip);
+ pixman_image_set_source_clipping (mask_img, 1);
+ pixman_region_fini (&clip);
+ }
+
+ if (lcg_rand_n (8) == 0)
+ {
+ pixman_box16_t clip_boxes[2];
+ int n = lcg_rand_n (2) + 1;
+ for (i = 0; i < n; i++)
+ {
+ clip_boxes[i].x1 = lcg_rand_n (dst_width);
+ clip_boxes[i].y1 = lcg_rand_n (dst_height);
+ clip_boxes[i].x2 =
+ clip_boxes[i].x1 + lcg_rand_n (dst_width - clip_boxes[i].x1);
+ clip_boxes[i].y2 =
+ clip_boxes[i].y1 + lcg_rand_n (dst_height - clip_boxes[i].y1);
+
+ if (verbose)
+ {
+ printf ("destination clip box: [%d,%d-%d,%d]\n",
+ clip_boxes[i].x1, clip_boxes[i].y1,
+ clip_boxes[i].x2, clip_boxes[i].y2);
+ }
+ }
+ pixman_region_init_rects (&clip, clip_boxes, n);
+ pixman_image_set_clip_region (dst_img, &clip);
+ pixman_region_fini (&clip);
+ }
+
+ if (lcg_rand_n (2) == 0)
+ pixman_image_composite (op, src_img, NULL, dst_img,
+ src_x, src_y, 0, 0, dst_x, dst_y, w, h);
+ else
+ pixman_image_composite (op, src_img, mask_img, dst_img,
+ src_x, src_y, mask_x, mask_y, dst_x, dst_y, w, h);
+
+ if (dst_fmt == PIXMAN_x8r8g8b8)
+ {
+ /* ignore unused part */
+ for (i = 0; i < dst_stride * dst_height / 4; i++)
+ dstbuf[i] &= 0xFFFFFF;
+ }
+
+ image_endian_swap (dst_img, dst_bpp * 8);
+
+ if (verbose)
+ {
+ int j;
+
+ for (i = 0; i < dst_height; i++)
+ {
+ for (j = 0; j < dst_stride; j++)
+ printf ("%02X ", *((uint8_t *)dstbuf + i * dst_stride + j));
+
+ printf ("\n");
+ }
+ }
+
+ pixman_image_unref (src_img);
+ pixman_image_unref (mask_img);
+ pixman_image_unref (dst_img);
+
+ crc32 = compute_crc32 (0, dstbuf, dst_stride * dst_height);
+ free (srcbuf);
+ free (maskbuf);
+ free (dstbuf);
+
+ FLOAT_REGS_CORRUPTION_DETECTOR_FINISH ();
+ return crc32;
+}
+
+int
+main (int argc, const char *argv[])
+{
+ pixman_disable_out_of_bounds_workaround ();
+
+ return fuzzer_test_main("scaling", 8000000, 0x80DF1CB2,
+ test_composite, argc, argv);
+}