diff options
Diffstat (limited to 'pixman')
-rw-r--r-- | pixman/Makefile.am | 2 | ||||
-rw-r--r-- | pixman/configure.ac | 1 | ||||
-rw-r--r-- | pixman/demos/Makefile.am | 34 | ||||
-rw-r--r-- | pixman/demos/alpha-test.c (renamed from pixman/test/alpha-test.c) | 0 | ||||
-rw-r--r-- | pixman/demos/clip-in.c (renamed from pixman/test/clip-in.c) | 0 | ||||
-rw-r--r-- | pixman/demos/clip-test.c (renamed from pixman/test/clip-test.c) | 0 | ||||
-rw-r--r-- | pixman/demos/composite-test.c (renamed from pixman/test/composite-test.c) | 0 | ||||
-rw-r--r-- | pixman/demos/convolution-test.c (renamed from pixman/test/convolution-test.c) | 0 | ||||
-rw-r--r-- | pixman/demos/gradient-test.c (renamed from pixman/test/gradient-test.c) | 0 | ||||
-rw-r--r-- | pixman/demos/gtk-utils.c (renamed from pixman/test/gtk-utils.c) | 0 | ||||
-rw-r--r-- | pixman/demos/gtk-utils.h (renamed from pixman/test/gtk-utils.h) | 0 | ||||
-rw-r--r-- | pixman/demos/radial-test.c (renamed from pixman/test/radial-test.c) | 2 | ||||
-rw-r--r-- | pixman/demos/screen-test.c (renamed from pixman/test/screen-test.c) | 0 | ||||
-rw-r--r-- | pixman/demos/trap-test.c (renamed from pixman/test/trap-test.c) | 0 | ||||
-rw-r--r-- | pixman/pixman/pixman-arm-common.h | 59 | ||||
-rw-r--r-- | pixman/pixman/pixman-arm-neon-asm.S | 4758 | ||||
-rw-r--r-- | pixman/pixman/pixman-arm-neon.c | 11 | ||||
-rw-r--r-- | pixman/pixman/pixman-fast-path.c | 4455 | ||||
-rw-r--r-- | pixman/pixman/pixman-fast-path.h | 1039 | ||||
-rw-r--r-- | pixman/pixman/pixman-sse2.c | 124 | ||||
-rw-r--r-- | pixman/test/Makefile.am | 104 | ||||
-rw-r--r-- | pixman/test/scaling-test.c | 618 |
22 files changed, 5805 insertions, 5402 deletions
diff --git a/pixman/Makefile.am b/pixman/Makefile.am index 63b08c1fb..062c58a89 100644 --- a/pixman/Makefile.am +++ b/pixman/Makefile.am @@ -1,4 +1,4 @@ -SUBDIRS = pixman test +SUBDIRS = pixman test demos pkgconfigdir=$(libdir)/pkgconfig pkgconfig_DATA=pixman-1.pc diff --git a/pixman/configure.ac b/pixman/configure.ac index ab2ecde1b..5242799bb 100644 --- a/pixman/configure.ac +++ b/pixman/configure.ac @@ -794,6 +794,7 @@ AC_OUTPUT([pixman-1.pc Makefile pixman/Makefile pixman/pixman-version.h + demos/Makefile test/Makefile]) m4_if(m4_eval(pixman_minor % 2), [1], [ diff --git a/pixman/demos/Makefile.am b/pixman/demos/Makefile.am new file mode 100644 index 000000000..2dcdfd350 --- /dev/null +++ b/pixman/demos/Makefile.am @@ -0,0 +1,34 @@ +if HAVE_GTK + +AM_CFLAGS = @OPENMP_CFLAGS@ +AM_LDFLAGS = @OPENMP_CFLAGS@ + +LDADD = $(GTK_LIBS) $(top_builddir)/pixman/libpixman-1.la -lm +INCLUDES = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman $(GTK_CFLAGS) + +GTK_UTILS = gtk-utils.c gtk-utils.h + +DEMOS = \ + clip-test \ + clip-in \ + composite-test \ + gradient-test \ + radial-test \ + alpha-test \ + screen-test \ + convolution-test \ + trap-test + +gradient_test_SOURCES = gradient-test.c $(GTK_UTILS) +alpha_test_SOURCES = alpha-test.c $(GTK_UTILS) +composite_test_SOURCES = composite-test.c $(GTK_UTILS) +clip_test_SOURCES = clip-test.c $(GTK_UTILS) +clip_in_SOURCES = clip-in.c $(GTK_UTILS) +trap_test_SOURCES = trap-test.c $(GTK_UTILS) +screen_test_SOURCES = screen-test.c $(GTK_UTILS) +convolution_test_SOURCES = convolution-test.c $(GTK_UTILS) +radial_test_SOURCES = radial-test.c ../test/utils.c ../test/utils.h $(GTK_UTILS) + +noinst_PROGRAMS = $(DEMOS) + +endif diff --git a/pixman/test/alpha-test.c b/pixman/demos/alpha-test.c index 92c208142..92c208142 100644 --- a/pixman/test/alpha-test.c +++ b/pixman/demos/alpha-test.c diff --git a/pixman/test/clip-in.c b/pixman/demos/clip-in.c index 51579811f..51579811f 100644 --- a/pixman/test/clip-in.c +++ b/pixman/demos/clip-in.c diff --git a/pixman/test/clip-test.c b/pixman/demos/clip-test.c index aa0df4482..aa0df4482 100644 --- a/pixman/test/clip-test.c +++ b/pixman/demos/clip-test.c diff --git a/pixman/test/composite-test.c b/pixman/demos/composite-test.c index 79d5d5eac..79d5d5eac 100644 --- a/pixman/test/composite-test.c +++ b/pixman/demos/composite-test.c diff --git a/pixman/test/convolution-test.c b/pixman/demos/convolution-test.c index da284af7b..da284af7b 100644 --- a/pixman/test/convolution-test.c +++ b/pixman/demos/convolution-test.c diff --git a/pixman/test/gradient-test.c b/pixman/demos/gradient-test.c index fc84844b0..fc84844b0 100644 --- a/pixman/test/gradient-test.c +++ b/pixman/demos/gradient-test.c diff --git a/pixman/test/gtk-utils.c b/pixman/demos/gtk-utils.c index f45cdc912..f45cdc912 100644 --- a/pixman/test/gtk-utils.c +++ b/pixman/demos/gtk-utils.c diff --git a/pixman/test/gtk-utils.h b/pixman/demos/gtk-utils.h index 2cb13bcf0..2cb13bcf0 100644 --- a/pixman/test/gtk-utils.h +++ b/pixman/demos/gtk-utils.h diff --git a/pixman/test/radial-test.c b/pixman/demos/radial-test.c index 5d716c339..35e90d786 100644 --- a/pixman/test/radial-test.c +++ b/pixman/demos/radial-test.c @@ -1,4 +1,4 @@ -#include "utils.h" +#include "../test/utils.h" #include "gtk-utils.h" #define NUM_GRADIENTS 7 diff --git a/pixman/test/screen-test.c b/pixman/demos/screen-test.c index e69dba3de..e69dba3de 100644 --- a/pixman/test/screen-test.c +++ b/pixman/demos/screen-test.c diff --git a/pixman/test/trap-test.c b/pixman/demos/trap-test.c index 19295e7a5..19295e7a5 100644 --- a/pixman/test/trap-test.c +++ b/pixman/demos/trap-test.c diff --git a/pixman/pixman/pixman-arm-common.h b/pixman/pixman/pixman-arm-common.h index 372e9f9a8..9b1322b6c 100644 --- a/pixman/pixman/pixman-arm-common.h +++ b/pixman/pixman/pixman-arm-common.h @@ -282,19 +282,20 @@ cputype##_composite_##name (pixman_implementation_t *imp, \ src_type, dst_type) \ void \ pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype ( \ - int32_t w, \ - dst_type * dst, \ - src_type * src, \ - pixman_fixed_t vx, \ - pixman_fixed_t unit_x);\ + int32_t w, \ + dst_type * dst, \ + const src_type * src, \ + pixman_fixed_t vx, \ + pixman_fixed_t unit_x); \ \ static force_inline void \ scaled_nearest_scanline_##cputype##_##name##_##op (dst_type * pd, \ - src_type * ps, \ + const src_type * ps, \ int32_t w, \ pixman_fixed_t vx, \ pixman_fixed_t unit_x, \ - pixman_fixed_t max_vx) \ + pixman_fixed_t max_vx, \ + pixman_bool_t zero_src) \ { \ pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps, \ vx, unit_x);\ @@ -316,4 +317,48 @@ FAST_NEAREST_MAINLOOP (cputype##_##name##_pad_##op, \ SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func), \ SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func) +#define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST(flags, cputype, name, op, \ + src_type, dst_type) \ +void \ +pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype ( \ + int32_t w, \ + dst_type * dst, \ + const src_type * src, \ + pixman_fixed_t vx, \ + pixman_fixed_t unit_x, \ + const uint8_t * mask); \ + \ +static force_inline void \ +scaled_nearest_scanline_##cputype##_##name##_##op (const uint8_t * mask, \ + dst_type * pd, \ + const src_type * ps, \ + int32_t w, \ + pixman_fixed_t vx, \ + pixman_fixed_t unit_x, \ + pixman_fixed_t max_vx, \ + pixman_bool_t zero_src) \ +{ \ + if ((flags & SKIP_ZERO_SRC) && zero_src) \ + return; \ + pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps, \ + vx, unit_x, \ + mask); \ +} \ + \ +FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_cover_##op, \ + scaled_nearest_scanline_##cputype##_##name##_##op,\ + src_type, uint8_t, dst_type, COVER, TRUE, FALSE)\ +FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_none_##op, \ + scaled_nearest_scanline_##cputype##_##name##_##op,\ + src_type, uint8_t, dst_type, NONE, TRUE, FALSE) \ +FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_pad_##op, \ + scaled_nearest_scanline_##cputype##_##name##_##op,\ + src_type, uint8_t, dst_type, PAD, TRUE, FALSE) + +/* Provide entries for the fast path table */ +#define PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func) \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func), \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func), \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func) + #endif diff --git a/pixman/pixman/pixman-arm-neon-asm.S b/pixman/pixman/pixman-arm-neon-asm.S index 51533d42f..47daf457c 100644 --- a/pixman/pixman/pixman-arm-neon-asm.S +++ b/pixman/pixman/pixman-arm-neon-asm.S @@ -1,2365 +1,2393 @@ -/*
- * Copyright © 2009 Nokia Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
- */
-
-/*
- * This file contains implementations of NEON optimized pixel processing
- * functions. There is no full and detailed tutorial, but some functions
- * (those which are exposing some new or interesting features) are
- * extensively commented and can be used as examples.
- *
- * You may want to have a look at the comments for following functions:
- * - pixman_composite_over_8888_0565_asm_neon
- * - pixman_composite_over_n_8_0565_asm_neon
- */
-
-/* Prevent the stack from becoming executable for no reason... */
-#if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack,"",%progbits
-#endif
-
- .text
- .fpu neon
- .arch armv7a
- .object_arch armv4
- .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
- .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
- .arm
- .altmacro
-
-#include "pixman-arm-neon-asm.h"
-
-/* Global configuration options and preferences */
-
-/*
- * The code can optionally make use of unaligned memory accesses to improve
- * performance of handling leading/trailing pixels for each scanline.
- * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
- * example in linux if unaligned memory accesses are not configured to
- * generate.exceptions.
- */
-.set RESPECT_STRICT_ALIGNMENT, 1
-
-/*
- * Set default prefetch type. There is a choice between the following options:
- *
- * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
- * as NOP to workaround some HW bugs or for whatever other reason)
- *
- * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
- * advanced prefetch intruduces heavy overhead)
- *
- * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
- * which can run ARM and NEON instructions simultaneously so that extra ARM
- * instructions do not add (many) extra cycles, but improve prefetch efficiency)
- *
- * Note: some types of function can't support advanced prefetch and fallback
- * to simple one (those which handle 24bpp pixels)
- */
-.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
-
-/* Prefetch distance in pixels for simple prefetch */
-.set PREFETCH_DISTANCE_SIMPLE, 64
-
-/*
- * Implementation of pixman_composite_over_8888_0565_asm_neon
- *
- * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
- * performs OVER compositing operation. Function fast_composite_over_8888_0565
- * from pixman-fast-path.c does the same in C and can be used as a reference.
- *
- * First we need to have some NEON assembly code which can do the actual
- * operation on the pixels and provide it to the template macro.
- *
- * Template macro quite conveniently takes care of emitting all the necessary
- * code for memory reading and writing (including quite tricky cases of
- * handling unaligned leading/trailing pixels), so we only need to deal with
- * the data in NEON registers.
- *
- * NEON registers allocation in general is recommented to be the following:
- * d0, d1, d2, d3 - contain loaded source pixel data
- * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
- * d24, d25, d26, d27 - contain loading mask pixel data (if mask is used)
- * d28, d29, d30, d31 - place for storing the result (destination pixels)
- *
- * As can be seen above, four 64-bit NEON registers are used for keeping
- * intermediate pixel data and up to 8 pixels can be processed in one step
- * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
- *
- * This particular function uses the following registers allocation:
- * d0, d1, d2, d3 - contain loaded source pixel data
- * d4, d5 - contain loaded destination pixels (they are needed)
- * d28, d29 - place for storing the result (destination pixels)
- */
-
-/*
- * Step one. We need to have some code to do some arithmetics on pixel data.
- * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
- * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
- * perform all the needed calculations and write the result to {d28, d29}.
- * The rationale for having two macros and not just one will be explained
- * later. In practice, any single monolitic function which does the work can
- * be split into two parts in any arbitrary way without affecting correctness.
- *
- * There is one special trick here too. Common template macro can optionally
- * make our life a bit easier by doing R, G, B, A color components
- * deinterleaving for 32bpp pixel formats (and this feature is used in
- * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
- * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
- * actually use d0 register for blue channel (a vector of eight 8-bit
- * values), d1 register for green, d2 for red and d3 for alpha. This
- * simple conversion can be also done with a few NEON instructions:
- *
- * Packed to planar conversion:
- * vuzp.8 d0, d1
- * vuzp.8 d2, d3
- * vuzp.8 d1, d3
- * vuzp.8 d0, d2
- *
- * Planar to packed conversion:
- * vzip.8 d0, d2
- * vzip.8 d1, d3
- * vzip.8 d2, d3
- * vzip.8 d0, d1
- *
- * But pixel can be loaded directly in planar format using VLD4.8 NEON
- * instruction. It is 1 cycle slower than VLD1.32, so this is not always
- * desirable, that's why deinterleaving is optional.
- *
- * But anyway, here is the code:
- */
-.macro pixman_composite_over_8888_0565_process_pixblock_head
- /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
- and put data into d6 - red, d7 - green, d30 - blue */
- vshrn.u16 d6, q2, #8
- vshrn.u16 d7, q2, #3
- vsli.u16 q2, q2, #5
- vsri.u8 d6, d6, #5
- vmvn.8 d3, d3 /* invert source alpha */
- vsri.u8 d7, d7, #6
- vshrn.u16 d30, q2, #2
- /* now do alpha blending, storing results in 8-bit planar format
- into d16 - red, d19 - green, d18 - blue */
- vmull.u8 q10, d3, d6
- vmull.u8 q11, d3, d7
- vmull.u8 q12, d3, d30
- vrshr.u16 q13, q10, #8
- vrshr.u16 q3, q11, #8
- vrshr.u16 q15, q12, #8
- vraddhn.u16 d20, q10, q13
- vraddhn.u16 d23, q11, q3
- vraddhn.u16 d22, q12, q15
-.endm
-
-.macro pixman_composite_over_8888_0565_process_pixblock_tail
- /* ... continue alpha blending */
- vqadd.u8 d16, d2, d20
- vqadd.u8 q9, q0, q11
- /* convert the result to r5g6b5 and store it into {d28, d29} */
- vshll.u8 q14, d16, #8
- vshll.u8 q8, d19, #8
- vshll.u8 q9, d18, #8
- vsri.u16 q14, q8, #5
- vsri.u16 q14, q9, #11
-.endm
-
-/*
- * OK, now we got almost everything that we need. Using the above two
- * macros, the work can be done right. But now we want to optimize
- * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
- * a lot from good code scheduling and software pipelining.
- *
- * Let's construct some code, which will run in the core main loop.
- * Some pseudo-code of the main loop will look like this:
- * head
- * while (...) {
- * tail
- * head
- * }
- * tail
- *
- * It may look a bit weird, but this setup allows to hide instruction
- * latencies better and also utilize dual-issue capability more
- * efficiently (make pairs of load-store and ALU instructions).
- *
- * So what we need now is a '*_tail_head' macro, which will be used
- * in the core main loop. A trivial straightforward implementation
- * of this macro would look like this:
- *
- * pixman_composite_over_8888_0565_process_pixblock_tail
- * vst1.16 {d28, d29}, [DST_W, :128]!
- * vld1.16 {d4, d5}, [DST_R, :128]!
- * vld4.32 {d0, d1, d2, d3}, [SRC]!
- * pixman_composite_over_8888_0565_process_pixblock_head
- * cache_preload 8, 8
- *
- * Now it also got some VLD/VST instructions. We simply can't move from
- * processing one block of pixels to the other one with just arithmetics.
- * The previously processed data needs to be written to memory and new
- * data needs to be fetched. Fortunately, this main loop does not deal
- * with partial leading/trailing pixels and can load/store a full block
- * of pixels in a bulk. Additionally, destination buffer is already
- * 16 bytes aligned here (which is good for performance).
- *
- * New things here are DST_R, DST_W, SRC and MASK identifiers. These
- * are the aliases for ARM registers which are used as pointers for
- * accessing data. We maintain separate pointers for reading and writing
- * destination buffer (DST_R and DST_W).
- *
- * Another new thing is 'cache_preload' macro. It is used for prefetching
- * data into CPU L2 cache and improve performance when dealing with large
- * images which are far larger than cache size. It uses one argument
- * (actually two, but they need to be the same here) - number of pixels
- * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
- * details about this macro. Moreover, if good performance is needed
- * the code from this macro needs to be copied into '*_tail_head' macro
- * and mixed with the rest of code for optimal instructions scheduling.
- * We are actually doing it below.
- *
- * Now after all the explanations, here is the optimized code.
- * Different instruction streams (originaling from '*_head', '*_tail'
- * and 'cache_preload' macro) use different indentation levels for
- * better readability. Actually taking the code from one of these
- * indentation levels and ignoring a few VLD/VST instructions would
- * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
- * macro!
- */
-
-#if 1
-
-.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
- vqadd.u8 d16, d2, d20
- vld1.16 {d4, d5}, [DST_R, :128]!
- vqadd.u8 q9, q0, q11
- vshrn.u16 d6, q2, #8
- fetch_src_pixblock
- vshrn.u16 d7, q2, #3
- vsli.u16 q2, q2, #5
- vshll.u8 q14, d16, #8
- PF add PF_X, PF_X, #8
- vshll.u8 q8, d19, #8
- PF tst PF_CTL, #0xF
- vsri.u8 d6, d6, #5
- PF addne PF_X, PF_X, #8
- vmvn.8 d3, d3
- PF subne PF_CTL, PF_CTL, #1
- vsri.u8 d7, d7, #6
- vshrn.u16 d30, q2, #2
- vmull.u8 q10, d3, d6
- PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
- vmull.u8 q11, d3, d7
- vmull.u8 q12, d3, d30
- PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
- vsri.u16 q14, q8, #5
- PF cmp PF_X, ORIG_W
- vshll.u8 q9, d18, #8
- vrshr.u16 q13, q10, #8
- PF subge PF_X, PF_X, ORIG_W
- vrshr.u16 q3, q11, #8
- vrshr.u16 q15, q12, #8
- PF subges PF_CTL, PF_CTL, #0x10
- vsri.u16 q14, q9, #11
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
- vraddhn.u16 d20, q10, q13
- vraddhn.u16 d23, q11, q3
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
- vraddhn.u16 d22, q12, q15
- vst1.16 {d28, d29}, [DST_W, :128]!
-.endm
-
-#else
-
-/* If we did not care much about the performance, we would just use this... */
-.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
- pixman_composite_over_8888_0565_process_pixblock_tail
- vst1.16 {d28, d29}, [DST_W, :128]!
- vld1.16 {d4, d5}, [DST_R, :128]!
- fetch_src_pixblock
- pixman_composite_over_8888_0565_process_pixblock_head
- cache_preload 8, 8
-.endm
-
-#endif
-
-/*
- * And now the final part. We are using 'generate_composite_function' macro
- * to put all the stuff together. We are specifying the name of the function
- * which we want to get, number of bits per pixel for the source, mask and
- * destination (0 if unused, like mask in this case). Next come some bit
- * flags:
- * FLAG_DST_READWRITE - tells that the destination buffer is both read
- * and written, for write-only buffer we would use
- * FLAG_DST_WRITEONLY flag instead
- * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
- * and separate color channels for 32bpp format.
- * The next things are:
- * - the number of pixels processed per iteration (8 in this case, because
- * that's the maximum what can fit into four 64-bit NEON registers).
- * - prefetch distance, measured in pixel blocks. In this case it is 5 times
- * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
- * prefetch distance can be selected by running some benchmarks.
- *
- * After that we specify some macros, these are 'default_init',
- * 'default_cleanup' here which are empty (but it is possible to have custom
- * init/cleanup macros to be able to save/restore some extra NEON registers
- * like d8-d15 or do anything else) followed by
- * 'pixman_composite_over_8888_0565_process_pixblock_head',
- * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
- * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
- * which we got implemented above.
- *
- * The last part is the NEON registers allocation scheme.
- */
-generate_composite_function \
- pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_over_8888_0565_process_pixblock_head, \
- pixman_composite_over_8888_0565_process_pixblock_tail, \
- pixman_composite_over_8888_0565_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 24 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_over_n_0565_process_pixblock_head
- /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
- and put data into d6 - red, d7 - green, d30 - blue */
- vshrn.u16 d6, q2, #8
- vshrn.u16 d7, q2, #3
- vsli.u16 q2, q2, #5
- vsri.u8 d6, d6, #5
- vsri.u8 d7, d7, #6
- vshrn.u16 d30, q2, #2
- /* now do alpha blending, storing results in 8-bit planar format
- into d16 - red, d19 - green, d18 - blue */
- vmull.u8 q10, d3, d6
- vmull.u8 q11, d3, d7
- vmull.u8 q12, d3, d30
- vrshr.u16 q13, q10, #8
- vrshr.u16 q3, q11, #8
- vrshr.u16 q15, q12, #8
- vraddhn.u16 d20, q10, q13
- vraddhn.u16 d23, q11, q3
- vraddhn.u16 d22, q12, q15
-.endm
-
-.macro pixman_composite_over_n_0565_process_pixblock_tail
- /* ... continue alpha blending */
- vqadd.u8 d16, d2, d20
- vqadd.u8 q9, q0, q11
- /* convert the result to r5g6b5 and store it into {d28, d29} */
- vshll.u8 q14, d16, #8
- vshll.u8 q8, d19, #8
- vshll.u8 q9, d18, #8
- vsri.u16 q14, q8, #5
- vsri.u16 q14, q9, #11
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_over_n_0565_process_pixblock_tail_head
- pixman_composite_over_n_0565_process_pixblock_tail
- vld1.16 {d4, d5}, [DST_R, :128]!
- vst1.16 {d28, d29}, [DST_W, :128]!
- pixman_composite_over_n_0565_process_pixblock_head
- cache_preload 8, 8
-.endm
-
-.macro pixman_composite_over_n_0565_init
- add DUMMY, sp, #ARGS_STACK_OFFSET
- vld1.32 {d3[0]}, [DUMMY]
- vdup.8 d0, d3[0]
- vdup.8 d1, d3[1]
- vdup.8 d2, d3[2]
- vdup.8 d3, d3[3]
- vmvn.8 d3, d3 /* invert source alpha */
-.endm
-
-generate_composite_function \
- pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
- FLAG_DST_READWRITE, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_over_n_0565_init, \
- default_cleanup, \
- pixman_composite_over_n_0565_process_pixblock_head, \
- pixman_composite_over_n_0565_process_pixblock_tail, \
- pixman_composite_over_n_0565_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 24 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_8888_0565_process_pixblock_head
- vshll.u8 q8, d1, #8
- vshll.u8 q14, d2, #8
- vshll.u8 q9, d0, #8
-.endm
-
-.macro pixman_composite_src_8888_0565_process_pixblock_tail
- vsri.u16 q14, q8, #5
- vsri.u16 q14, q9, #11
-.endm
-
-.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
- vsri.u16 q14, q8, #5
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
- fetch_src_pixblock
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
- vsri.u16 q14, q9, #11
- PF cmp PF_X, ORIG_W
- PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
- vshll.u8 q8, d1, #8
- vst1.16 {d28, d29}, [DST_W, :128]!
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
- vshll.u8 q14, d2, #8
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
- vshll.u8 q9, d0, #8
-.endm
-
-generate_composite_function \
- pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
- FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_src_8888_0565_process_pixblock_head, \
- pixman_composite_src_8888_0565_process_pixblock_tail, \
- pixman_composite_src_8888_0565_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_src_0565_8888_process_pixblock_head
- vshrn.u16 d30, q0, #8
- vshrn.u16 d29, q0, #3
- vsli.u16 q0, q0, #5
- vmov.u8 d31, #255
- vsri.u8 d30, d30, #5
- vsri.u8 d29, d29, #6
- vshrn.u16 d28, q0, #2
-.endm
-
-.macro pixman_composite_src_0565_8888_process_pixblock_tail
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
- pixman_composite_src_0565_8888_process_pixblock_tail
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- fetch_src_pixblock
- pixman_composite_src_0565_8888_process_pixblock_head
- cache_preload 8, 8
-.endm
-
-generate_composite_function \
- pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
- FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_src_0565_8888_process_pixblock_head, \
- pixman_composite_src_0565_8888_process_pixblock_tail, \
- pixman_composite_src_0565_8888_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_add_8_8_process_pixblock_head
- vqadd.u8 q14, q0, q2
- vqadd.u8 q15, q1, q3
-.endm
-
-.macro pixman_composite_add_8_8_process_pixblock_tail
-.endm
-
-.macro pixman_composite_add_8_8_process_pixblock_tail_head
- fetch_src_pixblock
- PF add PF_X, PF_X, #32
- PF tst PF_CTL, #0xF
- vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
- PF addne PF_X, PF_X, #32
- PF subne PF_CTL, PF_CTL, #1
- vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF cmp PF_X, ORIG_W
- PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
- PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
- vqadd.u8 q14, q0, q2
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
- vqadd.u8 q15, q1, q3
-.endm
-
-generate_composite_function \
- pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
- FLAG_DST_READWRITE, \
- 32, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_add_8_8_process_pixblock_head, \
- pixman_composite_add_8_8_process_pixblock_tail, \
- pixman_composite_add_8_8_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
- fetch_src_pixblock
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
- vld1.32 {d4, d5, d6, d7}, [DST_R, :128]!
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
- vst1.32 {d28, d29, d30, d31}, [DST_W, :128]!
- PF cmp PF_X, ORIG_W
- PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
- PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
- vqadd.u8 q14, q0, q2
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
- vqadd.u8 q15, q1, q3
-.endm
-
-generate_composite_function \
- pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
- FLAG_DST_READWRITE, \
- 8, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_add_8_8_process_pixblock_head, \
- pixman_composite_add_8_8_process_pixblock_tail, \
- pixman_composite_add_8888_8888_process_pixblock_tail_head
-
-generate_composite_function_single_scanline \
- pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
- FLAG_DST_READWRITE, \
- 8, /* number of pixels, processed in a single block */ \
- default_init, \
- default_cleanup, \
- pixman_composite_add_8_8_process_pixblock_head, \
- pixman_composite_add_8_8_process_pixblock_tail, \
- pixman_composite_add_8888_8888_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
- vmvn.8 d24, d3 /* get inverted alpha */
- /* do alpha blending */
- vmull.u8 q8, d24, d4
- vmull.u8 q9, d24, d5
- vmull.u8 q10, d24, d6
- vmull.u8 q11, d24, d7
-.endm
-
-.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
- vrshr.u16 q14, q8, #8
- vrshr.u16 q15, q9, #8
- vrshr.u16 q12, q10, #8
- vrshr.u16 q13, q11, #8
- vraddhn.u16 d28, q14, q8
- vraddhn.u16 d29, q15, q9
- vraddhn.u16 d30, q12, q10
- vraddhn.u16 d31, q13, q11
-.endm
-
-.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
- vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
- vrshr.u16 q14, q8, #8
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
- vrshr.u16 q15, q9, #8
- vrshr.u16 q12, q10, #8
- vrshr.u16 q13, q11, #8
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
- vraddhn.u16 d28, q14, q8
- vraddhn.u16 d29, q15, q9
- PF cmp PF_X, ORIG_W
- vraddhn.u16 d30, q12, q10
- vraddhn.u16 d31, q13, q11
- fetch_src_pixblock
- PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
- vmvn.8 d22, d3
- PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF subge PF_X, PF_X, ORIG_W
- vmull.u8 q8, d22, d4
- PF subges PF_CTL, PF_CTL, #0x10
- vmull.u8 q9, d22, d5
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
- vmull.u8 q10, d22, d6
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
- vmull.u8 q11, d22, d7
-.endm
-
-generate_composite_function_single_scanline \
- pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- default_init, \
- default_cleanup, \
- pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
- pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
- pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_over_8888_8888_process_pixblock_head
- pixman_composite_out_reverse_8888_8888_process_pixblock_head
-.endm
-
-.macro pixman_composite_over_8888_8888_process_pixblock_tail
- pixman_composite_out_reverse_8888_8888_process_pixblock_tail
- vqadd.u8 q14, q0, q14
- vqadd.u8 q15, q1, q15
-.endm
-
-.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
- vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
- vrshr.u16 q14, q8, #8
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
- vrshr.u16 q15, q9, #8
- vrshr.u16 q12, q10, #8
- vrshr.u16 q13, q11, #8
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
- vraddhn.u16 d28, q14, q8
- vraddhn.u16 d29, q15, q9
- PF cmp PF_X, ORIG_W
- vraddhn.u16 d30, q12, q10
- vraddhn.u16 d31, q13, q11
- vqadd.u8 q14, q0, q14
- vqadd.u8 q15, q1, q15
- fetch_src_pixblock
- PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
- vmvn.8 d22, d3
- PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF subge PF_X, PF_X, ORIG_W
- vmull.u8 q8, d22, d4
- PF subges PF_CTL, PF_CTL, #0x10
- vmull.u8 q9, d22, d5
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
- vmull.u8 q10, d22, d6
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
- vmull.u8 q11, d22, d7
-.endm
-
-generate_composite_function \
- pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_over_8888_8888_process_pixblock_head, \
- pixman_composite_over_8888_8888_process_pixblock_tail, \
- pixman_composite_over_8888_8888_process_pixblock_tail_head
-
-generate_composite_function_single_scanline \
- pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- default_init, \
- default_cleanup, \
- pixman_composite_over_8888_8888_process_pixblock_head, \
- pixman_composite_over_8888_8888_process_pixblock_tail, \
- pixman_composite_over_8888_8888_process_pixblock_tail_head
-
-/******************************************************************************/
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_over_n_8888_process_pixblock_tail_head
- pixman_composite_over_8888_8888_process_pixblock_tail
- vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- pixman_composite_over_8888_8888_process_pixblock_head
- cache_preload 8, 8
-.endm
-
-.macro pixman_composite_over_n_8888_init
- add DUMMY, sp, #ARGS_STACK_OFFSET
- vld1.32 {d3[0]}, [DUMMY]
- vdup.8 d0, d3[0]
- vdup.8 d1, d3[1]
- vdup.8 d2, d3[2]
- vdup.8 d3, d3[3]
-.endm
-
-generate_composite_function \
- pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_over_n_8888_init, \
- default_cleanup, \
- pixman_composite_over_8888_8888_process_pixblock_head, \
- pixman_composite_over_8888_8888_process_pixblock_tail, \
- pixman_composite_over_n_8888_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
- vrshr.u16 q14, q8, #8
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
- vrshr.u16 q15, q9, #8
- vrshr.u16 q12, q10, #8
- vrshr.u16 q13, q11, #8
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
- vraddhn.u16 d28, q14, q8
- vraddhn.u16 d29, q15, q9
- PF cmp PF_X, ORIG_W
- vraddhn.u16 d30, q12, q10
- vraddhn.u16 d31, q13, q11
- vqadd.u8 q14, q0, q14
- vqadd.u8 q15, q1, q15
- vld4.8 {d0, d1, d2, d3}, [DST_R, :128]!
- vmvn.8 d22, d3
- PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF subge PF_X, PF_X, ORIG_W
- vmull.u8 q8, d22, d4
- PF subges PF_CTL, PF_CTL, #0x10
- vmull.u8 q9, d22, d5
- vmull.u8 q10, d22, d6
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
- vmull.u8 q11, d22, d7
-.endm
-
-.macro pixman_composite_over_reverse_n_8888_init
- add DUMMY, sp, #ARGS_STACK_OFFSET
- vld1.32 {d7[0]}, [DUMMY]
- vdup.8 d4, d7[0]
- vdup.8 d5, d7[1]
- vdup.8 d6, d7[2]
- vdup.8 d7, d7[3]
-.endm
-
-generate_composite_function \
- pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_over_reverse_n_8888_init, \
- default_cleanup, \
- pixman_composite_over_8888_8888_process_pixblock_head, \
- pixman_composite_over_8888_8888_process_pixblock_tail, \
- pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 0, /* dst_r_basereg */ \
- 4, /* src_basereg */ \
- 24 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_over_8888_8_0565_process_pixblock_head
- vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */
- vmull.u8 q1, d24, d9
- vmull.u8 q6, d24, d10
- vmull.u8 q7, d24, d11
- vshrn.u16 d6, q2, #8 /* convert DST_R data to 32-bpp (part1) */
- vshrn.u16 d7, q2, #3
- vsli.u16 q2, q2, #5
- vrshr.u16 q8, q0, #8 /* IN for SRC pixels (part2) */
- vrshr.u16 q9, q1, #8
- vrshr.u16 q10, q6, #8
- vrshr.u16 q11, q7, #8
- vraddhn.u16 d0, q0, q8
- vraddhn.u16 d1, q1, q9
- vraddhn.u16 d2, q6, q10
- vraddhn.u16 d3, q7, q11
- vsri.u8 d6, d6, #5 /* convert DST_R data to 32-bpp (part2) */
- vsri.u8 d7, d7, #6
- vmvn.8 d3, d3
- vshrn.u16 d30, q2, #2
- vmull.u8 q8, d3, d6 /* now do alpha blending */
- vmull.u8 q9, d3, d7
- vmull.u8 q10, d3, d30
-.endm
-
-.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
- /* 3 cycle bubble (after vmull.u8) */
- vrshr.u16 q13, q8, #8
- vrshr.u16 q11, q9, #8
- vrshr.u16 q15, q10, #8
- vraddhn.u16 d16, q8, q13
- vraddhn.u16 d27, q9, q11
- vraddhn.u16 d26, q10, q15
- vqadd.u8 d16, d2, d16
- /* 1 cycle bubble */
- vqadd.u8 q9, q0, q13
- vshll.u8 q14, d16, #8 /* convert to 16bpp */
- vshll.u8 q8, d19, #8
- vshll.u8 q9, d18, #8
- vsri.u16 q14, q8, #5
- /* 1 cycle bubble */
- vsri.u16 q14, q9, #11
-.endm
-
-.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
- vld1.16 {d4, d5}, [DST_R, :128]!
- vshrn.u16 d6, q2, #8
- fetch_mask_pixblock
- vshrn.u16 d7, q2, #3
- fetch_src_pixblock
- vmull.u8 q6, d24, d10
- vrshr.u16 q13, q8, #8
- vrshr.u16 q11, q9, #8
- vrshr.u16 q15, q10, #8
- vraddhn.u16 d16, q8, q13
- vraddhn.u16 d27, q9, q11
- vraddhn.u16 d26, q10, q15
- vqadd.u8 d16, d2, d16
- vmull.u8 q1, d24, d9
- vqadd.u8 q9, q0, q13
- vshll.u8 q14, d16, #8
- vmull.u8 q0, d24, d8
- vshll.u8 q8, d19, #8
- vshll.u8 q9, d18, #8
- vsri.u16 q14, q8, #5
- vmull.u8 q7, d24, d11
- vsri.u16 q14, q9, #11
-
- cache_preload 8, 8
-
- vsli.u16 q2, q2, #5
- vrshr.u16 q8, q0, #8
- vrshr.u16 q9, q1, #8
- vrshr.u16 q10, q6, #8
- vrshr.u16 q11, q7, #8
- vraddhn.u16 d0, q0, q8
- vraddhn.u16 d1, q1, q9
- vraddhn.u16 d2, q6, q10
- vraddhn.u16 d3, q7, q11
- vsri.u8 d6, d6, #5
- vsri.u8 d7, d7, #6
- vmvn.8 d3, d3
- vshrn.u16 d30, q2, #2
- vst1.16 {d28, d29}, [DST_W, :128]!
- vmull.u8 q8, d3, d6
- vmull.u8 q9, d3, d7
- vmull.u8 q10, d3, d30
-.endm
-
-generate_composite_function \
- pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- default_init_need_all_regs, \
- default_cleanup_need_all_regs, \
- pixman_composite_over_8888_8_0565_process_pixblock_head, \
- pixman_composite_over_8888_8_0565_process_pixblock_tail, \
- pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 8, /* src_basereg */ \
- 24 /* mask_basereg */
-
-/******************************************************************************/
-
-/*
- * This function needs a special initialization of solid mask.
- * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
- * offset, split into color components and replicated in d8-d11
- * registers. Additionally, this function needs all the NEON registers,
- * so it has to save d8-d15 registers which are callee saved according
- * to ABI. These registers are restored from 'cleanup' macro. All the
- * other NEON registers are caller saved, so can be clobbered freely
- * without introducing any problems.
- */
-.macro pixman_composite_over_n_8_0565_init
- add DUMMY, sp, #ARGS_STACK_OFFSET
- vpush {d8-d15}
- vld1.32 {d11[0]}, [DUMMY]
- vdup.8 d8, d11[0]
- vdup.8 d9, d11[1]
- vdup.8 d10, d11[2]
- vdup.8 d11, d11[3]
-.endm
-
-.macro pixman_composite_over_n_8_0565_cleanup
- vpop {d8-d15}
-.endm
-
-generate_composite_function \
- pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
- FLAG_DST_READWRITE, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_over_n_8_0565_init, \
- pixman_composite_over_n_8_0565_cleanup, \
- pixman_composite_over_8888_8_0565_process_pixblock_head, \
- pixman_composite_over_8888_8_0565_process_pixblock_tail, \
- pixman_composite_over_8888_8_0565_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_over_8888_n_0565_init
- add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
- vpush {d8-d15}
- vld1.32 {d24[0]}, [DUMMY]
- vdup.8 d24, d24[3]
-.endm
-
-.macro pixman_composite_over_8888_n_0565_cleanup
- vpop {d8-d15}
-.endm
-
-generate_composite_function \
- pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_over_8888_n_0565_init, \
- pixman_composite_over_8888_n_0565_cleanup, \
- pixman_composite_over_8888_8_0565_process_pixblock_head, \
- pixman_composite_over_8888_8_0565_process_pixblock_tail, \
- pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 8, /* src_basereg */ \
- 24 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_0565_0565_process_pixblock_head
-.endm
-
-.macro pixman_composite_src_0565_0565_process_pixblock_tail
-.endm
-
-.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
- vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
- fetch_src_pixblock
- cache_preload 16, 16
-.endm
-
-generate_composite_function \
- pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
- FLAG_DST_WRITEONLY, \
- 16, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_src_0565_0565_process_pixblock_head, \
- pixman_composite_src_0565_0565_process_pixblock_tail, \
- pixman_composite_src_0565_0565_process_pixblock_tail_head, \
- 0, /* dst_w_basereg */ \
- 0, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 0 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_n_8_process_pixblock_head
-.endm
-
-.macro pixman_composite_src_n_8_process_pixblock_tail
-.endm
-
-.macro pixman_composite_src_n_8_process_pixblock_tail_head
- vst1.8 {d0, d1, d2, d3}, [DST_W, :128]!
-.endm
-
-.macro pixman_composite_src_n_8_init
- add DUMMY, sp, #ARGS_STACK_OFFSET
- vld1.32 {d0[0]}, [DUMMY]
- vsli.u64 d0, d0, #8
- vsli.u64 d0, d0, #16
- vsli.u64 d0, d0, #32
- vorr d1, d0, d0
- vorr q1, q0, q0
-.endm
-
-.macro pixman_composite_src_n_8_cleanup
-.endm
-
-generate_composite_function \
- pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
- FLAG_DST_WRITEONLY, \
- 32, /* number of pixels, processed in a single block */ \
- 0, /* prefetch distance */ \
- pixman_composite_src_n_8_init, \
- pixman_composite_src_n_8_cleanup, \
- pixman_composite_src_n_8_process_pixblock_head, \
- pixman_composite_src_n_8_process_pixblock_tail, \
- pixman_composite_src_n_8_process_pixblock_tail_head, \
- 0, /* dst_w_basereg */ \
- 0, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 0 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_n_0565_process_pixblock_head
-.endm
-
-.macro pixman_composite_src_n_0565_process_pixblock_tail
-.endm
-
-.macro pixman_composite_src_n_0565_process_pixblock_tail_head
- vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
-.endm
-
-.macro pixman_composite_src_n_0565_init
- add DUMMY, sp, #ARGS_STACK_OFFSET
- vld1.32 {d0[0]}, [DUMMY]
- vsli.u64 d0, d0, #16
- vsli.u64 d0, d0, #32
- vorr d1, d0, d0
- vorr q1, q0, q0
-.endm
-
-.macro pixman_composite_src_n_0565_cleanup
-.endm
-
-generate_composite_function \
- pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
- FLAG_DST_WRITEONLY, \
- 16, /* number of pixels, processed in a single block */ \
- 0, /* prefetch distance */ \
- pixman_composite_src_n_0565_init, \
- pixman_composite_src_n_0565_cleanup, \
- pixman_composite_src_n_0565_process_pixblock_head, \
- pixman_composite_src_n_0565_process_pixblock_tail, \
- pixman_composite_src_n_0565_process_pixblock_tail_head, \
- 0, /* dst_w_basereg */ \
- 0, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 0 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_n_8888_process_pixblock_head
-.endm
-
-.macro pixman_composite_src_n_8888_process_pixblock_tail
-.endm
-
-.macro pixman_composite_src_n_8888_process_pixblock_tail_head
- vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
-.endm
-
-.macro pixman_composite_src_n_8888_init
- add DUMMY, sp, #ARGS_STACK_OFFSET
- vld1.32 {d0[0]}, [DUMMY]
- vsli.u64 d0, d0, #32
- vorr d1, d0, d0
- vorr q1, q0, q0
-.endm
-
-.macro pixman_composite_src_n_8888_cleanup
-.endm
-
-generate_composite_function \
- pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
- FLAG_DST_WRITEONLY, \
- 8, /* number of pixels, processed in a single block */ \
- 0, /* prefetch distance */ \
- pixman_composite_src_n_8888_init, \
- pixman_composite_src_n_8888_cleanup, \
- pixman_composite_src_n_8888_process_pixblock_head, \
- pixman_composite_src_n_8888_process_pixblock_tail, \
- pixman_composite_src_n_8888_process_pixblock_tail_head, \
- 0, /* dst_w_basereg */ \
- 0, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 0 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_8888_8888_process_pixblock_head
-.endm
-
-.macro pixman_composite_src_8888_8888_process_pixblock_tail
-.endm
-
-.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
- vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
- fetch_src_pixblock
- cache_preload 8, 8
-.endm
-
-generate_composite_function \
- pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
- FLAG_DST_WRITEONLY, \
- 8, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_src_8888_8888_process_pixblock_head, \
- pixman_composite_src_8888_8888_process_pixblock_tail, \
- pixman_composite_src_8888_8888_process_pixblock_tail_head, \
- 0, /* dst_w_basereg */ \
- 0, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 0 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_x888_8888_process_pixblock_head
- vorr q0, q0, q2
- vorr q1, q1, q2
-.endm
-
-.macro pixman_composite_src_x888_8888_process_pixblock_tail
-.endm
-
-.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
- vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
- fetch_src_pixblock
- vorr q0, q0, q2
- vorr q1, q1, q2
- cache_preload 8, 8
-.endm
-
-.macro pixman_composite_src_x888_8888_init
- vmov.u8 q2, #0xFF
- vshl.u32 q2, q2, #24
-.endm
-
-generate_composite_function \
- pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
- FLAG_DST_WRITEONLY, \
- 8, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
- pixman_composite_src_x888_8888_init, \
- default_cleanup, \
- pixman_composite_src_x888_8888_process_pixblock_head, \
- pixman_composite_src_x888_8888_process_pixblock_tail, \
- pixman_composite_src_x888_8888_process_pixblock_tail_head, \
- 0, /* dst_w_basereg */ \
- 0, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 0 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_over_n_8_8888_process_pixblock_head
- /* expecting deinterleaved source data in {d8, d9, d10, d11} */
- /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
- /* and destination data in {d4, d5, d6, d7} */
- /* mask is in d24 (d25, d26, d27 are unused) */
-
- /* in */
- vmull.u8 q0, d24, d8
- vmull.u8 q1, d24, d9
- vmull.u8 q6, d24, d10
- vmull.u8 q7, d24, d11
- vrshr.u16 q10, q0, #8
- vrshr.u16 q11, q1, #8
- vrshr.u16 q12, q6, #8
- vrshr.u16 q13, q7, #8
- vraddhn.u16 d0, q0, q10
- vraddhn.u16 d1, q1, q11
- vraddhn.u16 d2, q6, q12
- vraddhn.u16 d3, q7, q13
- vmvn.8 d24, d3 /* get inverted alpha */
- /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */
- /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
- /* now do alpha blending */
- vmull.u8 q8, d24, d4
- vmull.u8 q9, d24, d5
- vmull.u8 q10, d24, d6
- vmull.u8 q11, d24, d7
-.endm
-
-.macro pixman_composite_over_n_8_8888_process_pixblock_tail
- vrshr.u16 q14, q8, #8
- vrshr.u16 q15, q9, #8
- vrshr.u16 q12, q10, #8
- vrshr.u16 q13, q11, #8
- vraddhn.u16 d28, q14, q8
- vraddhn.u16 d29, q15, q9
- vraddhn.u16 d30, q12, q10
- vraddhn.u16 d31, q13, q11
- vqadd.u8 q14, q0, q14
- vqadd.u8 q15, q1, q15
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
- pixman_composite_over_n_8_8888_process_pixblock_tail
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
- fetch_mask_pixblock
- cache_preload 8, 8
- pixman_composite_over_n_8_8888_process_pixblock_head
-.endm
-
-.macro pixman_composite_over_n_8_8888_init
- add DUMMY, sp, #ARGS_STACK_OFFSET
- vpush {d8-d15}
- vld1.32 {d11[0]}, [DUMMY]
- vdup.8 d8, d11[0]
- vdup.8 d9, d11[1]
- vdup.8 d10, d11[2]
- vdup.8 d11, d11[3]
-.endm
-
-.macro pixman_composite_over_n_8_8888_cleanup
- vpop {d8-d15}
-.endm
-
-generate_composite_function \
- pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_over_n_8_8888_init, \
- pixman_composite_over_n_8_8888_cleanup, \
- pixman_composite_over_n_8_8888_process_pixblock_head, \
- pixman_composite_over_n_8_8888_process_pixblock_tail, \
- pixman_composite_over_n_8_8888_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_over_n_8_8_process_pixblock_head
- vmull.u8 q0, d24, d8
- vmull.u8 q1, d25, d8
- vmull.u8 q6, d26, d8
- vmull.u8 q7, d27, d8
- vrshr.u16 q10, q0, #8
- vrshr.u16 q11, q1, #8
- vrshr.u16 q12, q6, #8
- vrshr.u16 q13, q7, #8
- vraddhn.u16 d0, q0, q10
- vraddhn.u16 d1, q1, q11
- vraddhn.u16 d2, q6, q12
- vraddhn.u16 d3, q7, q13
- vmvn.8 q12, q0
- vmvn.8 q13, q1
- vmull.u8 q8, d24, d4
- vmull.u8 q9, d25, d5
- vmull.u8 q10, d26, d6
- vmull.u8 q11, d27, d7
-.endm
-
-.macro pixman_composite_over_n_8_8_process_pixblock_tail
- vrshr.u16 q14, q8, #8
- vrshr.u16 q15, q9, #8
- vrshr.u16 q12, q10, #8
- vrshr.u16 q13, q11, #8
- vraddhn.u16 d28, q14, q8
- vraddhn.u16 d29, q15, q9
- vraddhn.u16 d30, q12, q10
- vraddhn.u16 d31, q13, q11
- vqadd.u8 q14, q0, q14
- vqadd.u8 q15, q1, q15
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
- vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
- pixman_composite_over_n_8_8_process_pixblock_tail
- fetch_mask_pixblock
- cache_preload 32, 32
- vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
- pixman_composite_over_n_8_8_process_pixblock_head
-.endm
-
-.macro pixman_composite_over_n_8_8_init
- add DUMMY, sp, #ARGS_STACK_OFFSET
- vpush {d8-d15}
- vld1.32 {d8[0]}, [DUMMY]
- vdup.8 d8, d8[3]
-.endm
-
-.macro pixman_composite_over_n_8_8_cleanup
- vpop {d8-d15}
-.endm
-
-generate_composite_function \
- pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
- FLAG_DST_READWRITE, \
- 32, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_over_n_8_8_init, \
- pixman_composite_over_n_8_8_cleanup, \
- pixman_composite_over_n_8_8_process_pixblock_head, \
- pixman_composite_over_n_8_8_process_pixblock_tail, \
- pixman_composite_over_n_8_8_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
- /*
- * 'combine_mask_ca' replacement
- *
- * input: solid src (n) in {d8, d9, d10, d11}
- * dest in {d4, d5, d6, d7 }
- * mask in {d24, d25, d26, d27}
- * output: updated src in {d0, d1, d2, d3 }
- * updated mask in {d24, d25, d26, d3 }
- */
- vmull.u8 q0, d24, d8
- vmull.u8 q1, d25, d9
- vmull.u8 q6, d26, d10
- vmull.u8 q7, d27, d11
- vmull.u8 q9, d11, d25
- vmull.u8 q12, d11, d24
- vmull.u8 q13, d11, d26
- vrshr.u16 q8, q0, #8
- vrshr.u16 q10, q1, #8
- vrshr.u16 q11, q6, #8
- vraddhn.u16 d0, q0, q8
- vraddhn.u16 d1, q1, q10
- vraddhn.u16 d2, q6, q11
- vrshr.u16 q11, q12, #8
- vrshr.u16 q8, q9, #8
- vrshr.u16 q6, q13, #8
- vrshr.u16 q10, q7, #8
- vraddhn.u16 d24, q12, q11
- vraddhn.u16 d25, q9, q8
- vraddhn.u16 d26, q13, q6
- vraddhn.u16 d3, q7, q10
- /*
- * 'combine_over_ca' replacement
- *
- * output: updated dest in {d28, d29, d30, d31}
- */
- vmvn.8 d24, d24
- vmvn.8 d25, d25
- vmull.u8 q8, d24, d4
- vmull.u8 q9, d25, d5
- vmvn.8 d26, d26
- vmvn.8 d27, d3
- vmull.u8 q10, d26, d6
- vmull.u8 q11, d27, d7
-.endm
-
-.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
- /* ... continue 'combine_over_ca' replacement */
- vrshr.u16 q14, q8, #8
- vrshr.u16 q15, q9, #8
- vrshr.u16 q6, q10, #8
- vrshr.u16 q7, q11, #8
- vraddhn.u16 d28, q14, q8
- vraddhn.u16 d29, q15, q9
- vraddhn.u16 d30, q6, q10
- vraddhn.u16 d31, q7, q11
- vqadd.u8 q14, q0, q14
- vqadd.u8 q15, q1, q15
-.endm
-
-.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
- vrshr.u16 q14, q8, #8
- vrshr.u16 q15, q9, #8
- vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
- vrshr.u16 q6, q10, #8
- vrshr.u16 q7, q11, #8
- vraddhn.u16 d28, q14, q8
- vraddhn.u16 d29, q15, q9
- vraddhn.u16 d30, q6, q10
- vraddhn.u16 d31, q7, q11
- fetch_mask_pixblock
- vqadd.u8 q14, q0, q14
- vqadd.u8 q15, q1, q15
- cache_preload 8, 8
- pixman_composite_over_n_8888_8888_ca_process_pixblock_head
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
-.endm
-
-.macro pixman_composite_over_n_8888_8888_ca_init
- add DUMMY, sp, #ARGS_STACK_OFFSET
- vpush {d8-d15}
- vld1.32 {d11[0]}, [DUMMY]
- vdup.8 d8, d11[0]
- vdup.8 d9, d11[1]
- vdup.8 d10, d11[2]
- vdup.8 d11, d11[3]
-.endm
-
-.macro pixman_composite_over_n_8888_8888_ca_cleanup
- vpop {d8-d15}
-.endm
-
-generate_composite_function \
- pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_over_n_8888_8888_ca_init, \
- pixman_composite_over_n_8888_8888_ca_cleanup, \
- pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
- pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
- pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_in_n_8_process_pixblock_head
- /* expecting source data in {d0, d1, d2, d3} */
- /* and destination data in {d4, d5, d6, d7} */
- vmull.u8 q8, d4, d3
- vmull.u8 q9, d5, d3
- vmull.u8 q10, d6, d3
- vmull.u8 q11, d7, d3
-.endm
-
-.macro pixman_composite_in_n_8_process_pixblock_tail
- vrshr.u16 q14, q8, #8
- vrshr.u16 q15, q9, #8
- vrshr.u16 q12, q10, #8
- vrshr.u16 q13, q11, #8
- vraddhn.u16 d28, q8, q14
- vraddhn.u16 d29, q9, q15
- vraddhn.u16 d30, q10, q12
- vraddhn.u16 d31, q11, q13
-.endm
-
-.macro pixman_composite_in_n_8_process_pixblock_tail_head
- pixman_composite_in_n_8_process_pixblock_tail
- vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
- cache_preload 32, 32
- pixman_composite_in_n_8_process_pixblock_head
- vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
-.endm
-
-.macro pixman_composite_in_n_8_init
- add DUMMY, sp, #ARGS_STACK_OFFSET
- vld1.32 {d3[0]}, [DUMMY]
- vdup.8 d3, d3[3]
-.endm
-
-.macro pixman_composite_in_n_8_cleanup
-.endm
-
-generate_composite_function \
- pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
- FLAG_DST_READWRITE, \
- 32, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_in_n_8_init, \
- pixman_composite_in_n_8_cleanup, \
- pixman_composite_in_n_8_process_pixblock_head, \
- pixman_composite_in_n_8_process_pixblock_tail, \
- pixman_composite_in_n_8_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 24 /* mask_basereg */
-
-.macro pixman_composite_add_n_8_8_process_pixblock_head
- /* expecting source data in {d8, d9, d10, d11} */
- /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
- /* and destination data in {d4, d5, d6, d7} */
- /* mask is in d24, d25, d26, d27 */
- vmull.u8 q0, d24, d11
- vmull.u8 q1, d25, d11
- vmull.u8 q6, d26, d11
- vmull.u8 q7, d27, d11
- vrshr.u16 q10, q0, #8
- vrshr.u16 q11, q1, #8
- vrshr.u16 q12, q6, #8
- vrshr.u16 q13, q7, #8
- vraddhn.u16 d0, q0, q10
- vraddhn.u16 d1, q1, q11
- vraddhn.u16 d2, q6, q12
- vraddhn.u16 d3, q7, q13
- vqadd.u8 q14, q0, q2
- vqadd.u8 q15, q1, q3
-.endm
-
-.macro pixman_composite_add_n_8_8_process_pixblock_tail
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
- pixman_composite_add_n_8_8_process_pixblock_tail
- vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
- vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
- fetch_mask_pixblock
- cache_preload 32, 32
- pixman_composite_add_n_8_8_process_pixblock_head
-.endm
-
-.macro pixman_composite_add_n_8_8_init
- add DUMMY, sp, #ARGS_STACK_OFFSET
- vpush {d8-d15}
- vld1.32 {d11[0]}, [DUMMY]
- vdup.8 d11, d11[3]
-.endm
-
-.macro pixman_composite_add_n_8_8_cleanup
- vpop {d8-d15}
-.endm
-
-generate_composite_function \
- pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
- FLAG_DST_READWRITE, \
- 32, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_add_n_8_8_init, \
- pixman_composite_add_n_8_8_cleanup, \
- pixman_composite_add_n_8_8_process_pixblock_head, \
- pixman_composite_add_n_8_8_process_pixblock_tail, \
- pixman_composite_add_n_8_8_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_add_8_8_8_process_pixblock_head
- /* expecting source data in {d0, d1, d2, d3} */
- /* destination data in {d4, d5, d6, d7} */
- /* mask in {d24, d25, d26, d27} */
- vmull.u8 q8, d24, d0
- vmull.u8 q9, d25, d1
- vmull.u8 q10, d26, d2
- vmull.u8 q11, d27, d3
- vrshr.u16 q0, q8, #8
- vrshr.u16 q1, q9, #8
- vrshr.u16 q12, q10, #8
- vrshr.u16 q13, q11, #8
- vraddhn.u16 d0, q0, q8
- vraddhn.u16 d1, q1, q9
- vraddhn.u16 d2, q12, q10
- vraddhn.u16 d3, q13, q11
- vqadd.u8 q14, q0, q2
- vqadd.u8 q15, q1, q3
-.endm
-
-.macro pixman_composite_add_8_8_8_process_pixblock_tail
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
- pixman_composite_add_8_8_8_process_pixblock_tail
- vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
- vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
- fetch_mask_pixblock
- fetch_src_pixblock
- cache_preload 32, 32
- pixman_composite_add_8_8_8_process_pixblock_head
-.endm
-
-.macro pixman_composite_add_8_8_8_init
-.endm
-
-.macro pixman_composite_add_8_8_8_cleanup
-.endm
-
-generate_composite_function \
- pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
- FLAG_DST_READWRITE, \
- 32, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_add_8_8_8_init, \
- pixman_composite_add_8_8_8_cleanup, \
- pixman_composite_add_8_8_8_process_pixblock_head, \
- pixman_composite_add_8_8_8_process_pixblock_tail, \
- pixman_composite_add_8_8_8_process_pixblock_tail_head
-
-/******************************************************************************/
-
-.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
- /* expecting source data in {d0, d1, d2, d3} */
- /* destination data in {d4, d5, d6, d7} */
- /* mask in {d24, d25, d26, d27} */
- vmull.u8 q8, d27, d0
- vmull.u8 q9, d27, d1
- vmull.u8 q10, d27, d2
- vmull.u8 q11, d27, d3
- /* 1 cycle bubble */
- vrsra.u16 q8, q8, #8
- vrsra.u16 q9, q9, #8
- vrsra.u16 q10, q10, #8
- vrsra.u16 q11, q11, #8
-.endm
-
-.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
- /* 2 cycle bubble */
- vrshrn.u16 d28, q8, #8
- vrshrn.u16 d29, q9, #8
- vrshrn.u16 d30, q10, #8
- vrshrn.u16 d31, q11, #8
- vqadd.u8 q14, q2, q14
- /* 1 cycle bubble */
- vqadd.u8 q15, q3, q15
-.endm
-
-.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
- fetch_src_pixblock
- vrshrn.u16 d28, q8, #8
- fetch_mask_pixblock
- vrshrn.u16 d29, q9, #8
- vmull.u8 q8, d27, d0
- vrshrn.u16 d30, q10, #8
- vmull.u8 q9, d27, d1
- vrshrn.u16 d31, q11, #8
- vmull.u8 q10, d27, d2
- vqadd.u8 q14, q2, q14
- vmull.u8 q11, d27, d3
- vqadd.u8 q15, q3, q15
- vrsra.u16 q8, q8, #8
- vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
- vrsra.u16 q9, q9, #8
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- vrsra.u16 q10, q10, #8
-
- cache_preload 8, 8
-
- vrsra.u16 q11, q11, #8
-.endm
-
-generate_composite_function \
- pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_add_8888_8888_8888_process_pixblock_head, \
- pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
- pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
-
-generate_composite_function_single_scanline \
- pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- default_init, \
- default_cleanup, \
- pixman_composite_add_8888_8888_8888_process_pixblock_head, \
- pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
- pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
-
-/******************************************************************************/
-
-generate_composite_function \
- pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_add_8888_8888_8888_process_pixblock_head, \
- pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
- pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 27 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_add_n_8_8888_init
- add DUMMY, sp, #ARGS_STACK_OFFSET
- vld1.32 {d3[0]}, [DUMMY]
- vdup.8 d0, d3[0]
- vdup.8 d1, d3[1]
- vdup.8 d2, d3[2]
- vdup.8 d3, d3[3]
-.endm
-
-.macro pixman_composite_add_n_8_8888_cleanup
-.endm
-
-generate_composite_function \
- pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_add_n_8_8888_init, \
- pixman_composite_add_n_8_8888_cleanup, \
- pixman_composite_add_8888_8888_8888_process_pixblock_head, \
- pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
- pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 27 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_add_8888_n_8888_init
- add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
- vld1.32 {d27[0]}, [DUMMY]
- vdup.8 d27, d27[3]
-.endm
-
-.macro pixman_composite_add_8888_n_8888_cleanup
-.endm
-
-generate_composite_function \
- pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_add_8888_n_8888_init, \
- pixman_composite_add_8888_n_8888_cleanup, \
- pixman_composite_add_8888_8888_8888_process_pixblock_head, \
- pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
- pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 27 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
- /* expecting source data in {d0, d1, d2, d3} */
- /* destination data in {d4, d5, d6, d7} */
- /* solid mask is in d15 */
-
- /* 'in' */
- vmull.u8 q8, d15, d3
- vmull.u8 q6, d15, d2
- vmull.u8 q5, d15, d1
- vmull.u8 q4, d15, d0
- vrshr.u16 q13, q8, #8
- vrshr.u16 q12, q6, #8
- vrshr.u16 q11, q5, #8
- vrshr.u16 q10, q4, #8
- vraddhn.u16 d3, q8, q13
- vraddhn.u16 d2, q6, q12
- vraddhn.u16 d1, q5, q11
- vraddhn.u16 d0, q4, q10
- vmvn.8 d24, d3 /* get inverted alpha */
- /* now do alpha blending */
- vmull.u8 q8, d24, d4
- vmull.u8 q9, d24, d5
- vmull.u8 q10, d24, d6
- vmull.u8 q11, d24, d7
-.endm
-
-.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
- vrshr.u16 q14, q8, #8
- vrshr.u16 q15, q9, #8
- vrshr.u16 q12, q10, #8
- vrshr.u16 q13, q11, #8
- vraddhn.u16 d28, q14, q8
- vraddhn.u16 d29, q15, q9
- vraddhn.u16 d30, q12, q10
- vraddhn.u16 d31, q13, q11
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
- vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
- pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
- fetch_src_pixblock
- cache_preload 8, 8
- fetch_mask_pixblock
- pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
-.endm
-
-generate_composite_function_single_scanline \
- pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- default_init_need_all_regs, \
- default_cleanup_need_all_regs, \
- pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
- pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
- pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 12 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_over_8888_n_8888_process_pixblock_head
- pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
-.endm
-
-.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
- pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
- vqadd.u8 q14, q0, q14
- vqadd.u8 q15, q1, q15
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
- vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
- pixman_composite_over_8888_n_8888_process_pixblock_tail
- fetch_src_pixblock
- cache_preload 8, 8
- pixman_composite_over_8888_n_8888_process_pixblock_head
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
-.endm
-
-.macro pixman_composite_over_8888_n_8888_init
- add DUMMY, sp, #48
- vpush {d8-d15}
- vld1.32 {d15[0]}, [DUMMY]
- vdup.8 d15, d15[3]
-.endm
-
-.macro pixman_composite_over_8888_n_8888_cleanup
- vpop {d8-d15}
-.endm
-
-generate_composite_function \
- pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_over_8888_n_8888_init, \
- pixman_composite_over_8888_n_8888_cleanup, \
- pixman_composite_over_8888_n_8888_process_pixblock_head, \
- pixman_composite_over_8888_n_8888_process_pixblock_tail, \
- pixman_composite_over_8888_n_8888_process_pixblock_tail_head
-
-/******************************************************************************/
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
- vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
- pixman_composite_over_8888_n_8888_process_pixblock_tail
- fetch_src_pixblock
- cache_preload 8, 8
- fetch_mask_pixblock
- pixman_composite_over_8888_n_8888_process_pixblock_head
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
-.endm
-
-generate_composite_function \
- pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- default_init_need_all_regs, \
- default_cleanup_need_all_regs, \
- pixman_composite_over_8888_n_8888_process_pixblock_head, \
- pixman_composite_over_8888_n_8888_process_pixblock_tail, \
- pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 12 /* mask_basereg */
-
-generate_composite_function_single_scanline \
- pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- default_init_need_all_regs, \
- default_cleanup_need_all_regs, \
- pixman_composite_over_8888_n_8888_process_pixblock_head, \
- pixman_composite_over_8888_n_8888_process_pixblock_tail, \
- pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 12 /* mask_basereg */
-
-/******************************************************************************/
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
- vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
- pixman_composite_over_8888_n_8888_process_pixblock_tail
- fetch_src_pixblock
- cache_preload 8, 8
- fetch_mask_pixblock
- pixman_composite_over_8888_n_8888_process_pixblock_head
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
-.endm
-
-generate_composite_function \
- pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- default_init_need_all_regs, \
- default_cleanup_need_all_regs, \
- pixman_composite_over_8888_n_8888_process_pixblock_head, \
- pixman_composite_over_8888_n_8888_process_pixblock_tail, \
- pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 15 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_0888_0888_process_pixblock_head
-.endm
-
-.macro pixman_composite_src_0888_0888_process_pixblock_tail
-.endm
-
-.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
- vst3.8 {d0, d1, d2}, [DST_W]!
- fetch_src_pixblock
- cache_preload 8, 8
-.endm
-
-generate_composite_function \
- pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
- FLAG_DST_WRITEONLY, \
- 8, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_src_0888_0888_process_pixblock_head, \
- pixman_composite_src_0888_0888_process_pixblock_tail, \
- pixman_composite_src_0888_0888_process_pixblock_tail_head, \
- 0, /* dst_w_basereg */ \
- 0, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 0 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
- vswp d0, d2
-.endm
-
-.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
-.endm
-
-.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
- vst4.8 {d0, d1, d2, d3}, [DST_W]!
- fetch_src_pixblock
- vswp d0, d2
- cache_preload 8, 8
-.endm
-
-.macro pixman_composite_src_0888_8888_rev_init
- veor d3, d3, d3
-.endm
-
-generate_composite_function \
- pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
- FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
- pixman_composite_src_0888_8888_rev_init, \
- default_cleanup, \
- pixman_composite_src_0888_8888_rev_process_pixblock_head, \
- pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
- pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
- 0, /* dst_w_basereg */ \
- 0, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 0 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
- vshll.u8 q8, d1, #8
- vshll.u8 q9, d2, #8
-.endm
-
-.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
- vshll.u8 q14, d0, #8
- vsri.u16 q14, q8, #5
- vsri.u16 q14, q9, #11
-.endm
-
-.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
- vshll.u8 q14, d0, #8
- fetch_src_pixblock
- vsri.u16 q14, q8, #5
- vsri.u16 q14, q9, #11
- vshll.u8 q8, d1, #8
- vst1.16 {d28, d29}, [DST_W, :128]!
- vshll.u8 q9, d2, #8
-.endm
-
-generate_composite_function \
- pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
- FLAG_DST_WRITEONLY, \
- 8, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_src_0888_0565_rev_process_pixblock_head, \
- pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
- pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 0, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 0 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
- vmull.u8 q8, d3, d0
- vmull.u8 q9, d3, d1
- vmull.u8 q10, d3, d2
-.endm
-
-.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
- vrshr.u16 q11, q8, #8
- vswp d3, d31
- vrshr.u16 q12, q9, #8
- vrshr.u16 q13, q10, #8
- vraddhn.u16 d30, q11, q8
- vraddhn.u16 d29, q12, q9
- vraddhn.u16 d28, q13, q10
-.endm
-
-.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
- vrshr.u16 q11, q8, #8
- vswp d3, d31
- vrshr.u16 q12, q9, #8
- vrshr.u16 q13, q10, #8
- fetch_src_pixblock
- vraddhn.u16 d30, q11, q8
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
- vraddhn.u16 d29, q12, q9
- vraddhn.u16 d28, q13, q10
- vmull.u8 q8, d3, d0
- vmull.u8 q9, d3, d1
- vmull.u8 q10, d3, d2
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF cmp PF_X, ORIG_W
- PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
-.endm
-
-generate_composite_function \
- pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
- FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_src_pixbuf_8888_process_pixblock_head, \
- pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
- pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 0, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 0 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
- vmull.u8 q8, d3, d0
- vmull.u8 q9, d3, d1
- vmull.u8 q10, d3, d2
-.endm
-
-.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
- vrshr.u16 q11, q8, #8
- vswp d3, d31
- vrshr.u16 q12, q9, #8
- vrshr.u16 q13, q10, #8
- vraddhn.u16 d28, q11, q8
- vraddhn.u16 d29, q12, q9
- vraddhn.u16 d30, q13, q10
-.endm
-
-.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
- vrshr.u16 q11, q8, #8
- vswp d3, d31
- vrshr.u16 q12, q9, #8
- vrshr.u16 q13, q10, #8
- fetch_src_pixblock
- vraddhn.u16 d28, q11, q8
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
- vraddhn.u16 d29, q12, q9
- vraddhn.u16 d30, q13, q10
- vmull.u8 q8, d3, d0
- vmull.u8 q9, d3, d1
- vmull.u8 q10, d3, d2
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF cmp PF_X, ORIG_W
- PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
-.endm
-
-generate_composite_function \
- pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
- FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
- default_init, \
- default_cleanup, \
- pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
- pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
- pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 0, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 0 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_over_0565_8_0565_process_pixblock_head
- /* mask is in d15 */
- convert_0565_to_x888 q4, d2, d1, d0
- convert_0565_to_x888 q5, d6, d5, d4
- /* source pixel data is in {d0, d1, d2, XX} */
- /* destination pixel data is in {d4, d5, d6, XX} */
- vmvn.8 d7, d15
- vmull.u8 q6, d15, d2
- vmull.u8 q5, d15, d1
- vmull.u8 q4, d15, d0
- vmull.u8 q8, d7, d4
- vmull.u8 q9, d7, d5
- vmull.u8 q13, d7, d6
- vrshr.u16 q12, q6, #8
- vrshr.u16 q11, q5, #8
- vrshr.u16 q10, q4, #8
- vraddhn.u16 d2, q6, q12
- vraddhn.u16 d1, q5, q11
- vraddhn.u16 d0, q4, q10
-.endm
-
-.macro pixman_composite_over_0565_8_0565_process_pixblock_tail
- vrshr.u16 q14, q8, #8
- vrshr.u16 q15, q9, #8
- vrshr.u16 q12, q13, #8
- vraddhn.u16 d28, q14, q8
- vraddhn.u16 d29, q15, q9
- vraddhn.u16 d30, q12, q13
- vqadd.u8 q0, q0, q14
- vqadd.u8 q1, q1, q15
- /* 32bpp result is in {d0, d1, d2, XX} */
- convert_8888_to_0565 d2, d1, d0, q14, q15, q3
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
- fetch_mask_pixblock
- pixman_composite_over_0565_8_0565_process_pixblock_tail
- fetch_src_pixblock
- vld1.16 {d10, d11}, [DST_R, :128]!
- cache_preload 8, 8
- pixman_composite_over_0565_8_0565_process_pixblock_head
- vst1.16 {d28, d29}, [DST_W, :128]!
-.endm
-
-generate_composite_function \
- pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
- FLAG_DST_READWRITE, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- default_init_need_all_regs, \
- default_cleanup_need_all_regs, \
- pixman_composite_over_0565_8_0565_process_pixblock_head, \
- pixman_composite_over_0565_8_0565_process_pixblock_tail, \
- pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 10, /* dst_r_basereg */ \
- 8, /* src_basereg */ \
- 15 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_over_0565_n_0565_init
- add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
- vpush {d8-d15}
- vld1.32 {d15[0]}, [DUMMY]
- vdup.8 d15, d15[3]
-.endm
-
-.macro pixman_composite_over_0565_n_0565_cleanup
- vpop {d8-d15}
-.endm
-
-generate_composite_function \
- pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
- FLAG_DST_READWRITE, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- pixman_composite_over_0565_n_0565_init, \
- pixman_composite_over_0565_n_0565_cleanup, \
- pixman_composite_over_0565_8_0565_process_pixblock_head, \
- pixman_composite_over_0565_8_0565_process_pixblock_tail, \
- pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 10, /* dst_r_basereg */ \
- 8, /* src_basereg */ \
- 15 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_add_0565_8_0565_process_pixblock_head
- /* mask is in d15 */
- convert_0565_to_x888 q4, d2, d1, d0
- convert_0565_to_x888 q5, d6, d5, d4
- /* source pixel data is in {d0, d1, d2, XX} */
- /* destination pixel data is in {d4, d5, d6, XX} */
- vmull.u8 q6, d15, d2
- vmull.u8 q5, d15, d1
- vmull.u8 q4, d15, d0
- vrshr.u16 q12, q6, #8
- vrshr.u16 q11, q5, #8
- vrshr.u16 q10, q4, #8
- vraddhn.u16 d2, q6, q12
- vraddhn.u16 d1, q5, q11
- vraddhn.u16 d0, q4, q10
-.endm
-
-.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
- vqadd.u8 q0, q0, q2
- vqadd.u8 q1, q1, q3
- /* 32bpp result is in {d0, d1, d2, XX} */
- convert_8888_to_0565 d2, d1, d0, q14, q15, q3
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
- fetch_mask_pixblock
- pixman_composite_add_0565_8_0565_process_pixblock_tail
- fetch_src_pixblock
- vld1.16 {d10, d11}, [DST_R, :128]!
- cache_preload 8, 8
- pixman_composite_add_0565_8_0565_process_pixblock_head
- vst1.16 {d28, d29}, [DST_W, :128]!
-.endm
-
-generate_composite_function \
- pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
- FLAG_DST_READWRITE, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- default_init_need_all_regs, \
- default_cleanup_need_all_regs, \
- pixman_composite_add_0565_8_0565_process_pixblock_head, \
- pixman_composite_add_0565_8_0565_process_pixblock_tail, \
- pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 10, /* dst_r_basereg */ \
- 8, /* src_basereg */ \
- 15 /* mask_basereg */
-
-/******************************************************************************/
-
-.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
- /* mask is in d15 */
- convert_0565_to_x888 q5, d6, d5, d4
- /* destination pixel data is in {d4, d5, d6, xx} */
- vmvn.8 d24, d15 /* get inverted alpha */
- /* now do alpha blending */
- vmull.u8 q8, d24, d4
- vmull.u8 q9, d24, d5
- vmull.u8 q10, d24, d6
-.endm
-
-.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
- vrshr.u16 q14, q8, #8
- vrshr.u16 q15, q9, #8
- vrshr.u16 q12, q10, #8
- vraddhn.u16 d0, q14, q8
- vraddhn.u16 d1, q15, q9
- vraddhn.u16 d2, q12, q10
- /* 32bpp result is in {d0, d1, d2, XX} */
- convert_8888_to_0565 d2, d1, d0, q14, q15, q3
-.endm
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
- fetch_src_pixblock
- pixman_composite_out_reverse_8_0565_process_pixblock_tail
- vld1.16 {d10, d11}, [DST_R, :128]!
- cache_preload 8, 8
- pixman_composite_out_reverse_8_0565_process_pixblock_head
- vst1.16 {d28, d29}, [DST_W, :128]!
-.endm
-
-generate_composite_function \
- pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
- FLAG_DST_READWRITE, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- default_init_need_all_regs, \
- default_cleanup_need_all_regs, \
- pixman_composite_out_reverse_8_0565_process_pixblock_head, \
- pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
- pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 10, /* dst_r_basereg */ \
- 15, /* src_basereg */ \
- 0 /* mask_basereg */
-
-/******************************************************************************/
-
-generate_composite_function_nearest_scanline \
- pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- default_init, \
- default_cleanup, \
- pixman_composite_over_8888_8888_process_pixblock_head, \
- pixman_composite_over_8888_8888_process_pixblock_tail, \
- pixman_composite_over_8888_8888_process_pixblock_tail_head
-
-generate_composite_function_nearest_scanline \
- pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- default_init, \
- default_cleanup, \
- pixman_composite_over_8888_0565_process_pixblock_head, \
- pixman_composite_over_8888_0565_process_pixblock_tail, \
- pixman_composite_over_8888_0565_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 24 /* mask_basereg */
-
-generate_composite_function_nearest_scanline \
- pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
- FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- default_init, \
- default_cleanup, \
- pixman_composite_src_8888_0565_process_pixblock_head, \
- pixman_composite_src_8888_0565_process_pixblock_tail, \
- pixman_composite_src_8888_0565_process_pixblock_tail_head
-
-generate_composite_function_nearest_scanline \
- pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
- FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- default_init, \
- default_cleanup, \
- pixman_composite_src_0565_8888_process_pixblock_head, \
- pixman_composite_src_0565_8888_process_pixblock_tail, \
- pixman_composite_src_0565_8888_process_pixblock_tail_head
+/* + * Copyright © 2009 Nokia Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) + */ + +/* + * This file contains implementations of NEON optimized pixel processing + * functions. There is no full and detailed tutorial, but some functions + * (those which are exposing some new or interesting features) are + * extensively commented and can be used as examples. + * + * You may want to have a look at the comments for following functions: + * - pixman_composite_over_8888_0565_asm_neon + * - pixman_composite_over_n_8_0565_asm_neon + */ + +/* Prevent the stack from becoming executable for no reason... */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + + .text + .fpu neon + .arch armv7a + .object_arch armv4 + .eabi_attribute 10, 0 /* suppress Tag_FP_arch */ + .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */ + .arm + .altmacro + +#include "pixman-arm-neon-asm.h" + +/* Global configuration options and preferences */ + +/* + * The code can optionally make use of unaligned memory accesses to improve + * performance of handling leading/trailing pixels for each scanline. + * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for + * example in linux if unaligned memory accesses are not configured to + * generate.exceptions. + */ +.set RESPECT_STRICT_ALIGNMENT, 1 + +/* + * Set default prefetch type. There is a choice between the following options: + * + * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work + * as NOP to workaround some HW bugs or for whatever other reason) + * + * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where + * advanced prefetch intruduces heavy overhead) + * + * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8 + * which can run ARM and NEON instructions simultaneously so that extra ARM + * instructions do not add (many) extra cycles, but improve prefetch efficiency) + * + * Note: some types of function can't support advanced prefetch and fallback + * to simple one (those which handle 24bpp pixels) + */ +.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED + +/* Prefetch distance in pixels for simple prefetch */ +.set PREFETCH_DISTANCE_SIMPLE, 64 + +/* + * Implementation of pixman_composite_over_8888_0565_asm_neon + * + * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and + * performs OVER compositing operation. Function fast_composite_over_8888_0565 + * from pixman-fast-path.c does the same in C and can be used as a reference. + * + * First we need to have some NEON assembly code which can do the actual + * operation on the pixels and provide it to the template macro. + * + * Template macro quite conveniently takes care of emitting all the necessary + * code for memory reading and writing (including quite tricky cases of + * handling unaligned leading/trailing pixels), so we only need to deal with + * the data in NEON registers. + * + * NEON registers allocation in general is recommented to be the following: + * d0, d1, d2, d3 - contain loaded source pixel data + * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed) + * d24, d25, d26, d27 - contain loading mask pixel data (if mask is used) + * d28, d29, d30, d31 - place for storing the result (destination pixels) + * + * As can be seen above, four 64-bit NEON registers are used for keeping + * intermediate pixel data and up to 8 pixels can be processed in one step + * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp). + * + * This particular function uses the following registers allocation: + * d0, d1, d2, d3 - contain loaded source pixel data + * d4, d5 - contain loaded destination pixels (they are needed) + * d28, d29 - place for storing the result (destination pixels) + */ + +/* + * Step one. We need to have some code to do some arithmetics on pixel data. + * This is implemented as a pair of macros: '*_head' and '*_tail'. When used + * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5}, + * perform all the needed calculations and write the result to {d28, d29}. + * The rationale for having two macros and not just one will be explained + * later. In practice, any single monolitic function which does the work can + * be split into two parts in any arbitrary way without affecting correctness. + * + * There is one special trick here too. Common template macro can optionally + * make our life a bit easier by doing R, G, B, A color components + * deinterleaving for 32bpp pixel formats (and this feature is used in + * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that + * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we + * actually use d0 register for blue channel (a vector of eight 8-bit + * values), d1 register for green, d2 for red and d3 for alpha. This + * simple conversion can be also done with a few NEON instructions: + * + * Packed to planar conversion: + * vuzp.8 d0, d1 + * vuzp.8 d2, d3 + * vuzp.8 d1, d3 + * vuzp.8 d0, d2 + * + * Planar to packed conversion: + * vzip.8 d0, d2 + * vzip.8 d1, d3 + * vzip.8 d2, d3 + * vzip.8 d0, d1 + * + * But pixel can be loaded directly in planar format using VLD4.8 NEON + * instruction. It is 1 cycle slower than VLD1.32, so this is not always + * desirable, that's why deinterleaving is optional. + * + * But anyway, here is the code: + */ +.macro pixman_composite_over_8888_0565_process_pixblock_head + /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format + and put data into d6 - red, d7 - green, d30 - blue */ + vshrn.u16 d6, q2, #8 + vshrn.u16 d7, q2, #3 + vsli.u16 q2, q2, #5 + vsri.u8 d6, d6, #5 + vmvn.8 d3, d3 /* invert source alpha */ + vsri.u8 d7, d7, #6 + vshrn.u16 d30, q2, #2 + /* now do alpha blending, storing results in 8-bit planar format + into d16 - red, d19 - green, d18 - blue */ + vmull.u8 q10, d3, d6 + vmull.u8 q11, d3, d7 + vmull.u8 q12, d3, d30 + vrshr.u16 q13, q10, #8 + vrshr.u16 q3, q11, #8 + vrshr.u16 q15, q12, #8 + vraddhn.u16 d20, q10, q13 + vraddhn.u16 d23, q11, q3 + vraddhn.u16 d22, q12, q15 +.endm + +.macro pixman_composite_over_8888_0565_process_pixblock_tail + /* ... continue alpha blending */ + vqadd.u8 d16, d2, d20 + vqadd.u8 q9, q0, q11 + /* convert the result to r5g6b5 and store it into {d28, d29} */ + vshll.u8 q14, d16, #8 + vshll.u8 q8, d19, #8 + vshll.u8 q9, d18, #8 + vsri.u16 q14, q8, #5 + vsri.u16 q14, q9, #11 +.endm + +/* + * OK, now we got almost everything that we need. Using the above two + * macros, the work can be done right. But now we want to optimize + * it a bit. ARM Cortex-A8 is an in-order core, and benefits really + * a lot from good code scheduling and software pipelining. + * + * Let's construct some code, which will run in the core main loop. + * Some pseudo-code of the main loop will look like this: + * head + * while (...) { + * tail + * head + * } + * tail + * + * It may look a bit weird, but this setup allows to hide instruction + * latencies better and also utilize dual-issue capability more + * efficiently (make pairs of load-store and ALU instructions). + * + * So what we need now is a '*_tail_head' macro, which will be used + * in the core main loop. A trivial straightforward implementation + * of this macro would look like this: + * + * pixman_composite_over_8888_0565_process_pixblock_tail + * vst1.16 {d28, d29}, [DST_W, :128]! + * vld1.16 {d4, d5}, [DST_R, :128]! + * vld4.32 {d0, d1, d2, d3}, [SRC]! + * pixman_composite_over_8888_0565_process_pixblock_head + * cache_preload 8, 8 + * + * Now it also got some VLD/VST instructions. We simply can't move from + * processing one block of pixels to the other one with just arithmetics. + * The previously processed data needs to be written to memory and new + * data needs to be fetched. Fortunately, this main loop does not deal + * with partial leading/trailing pixels and can load/store a full block + * of pixels in a bulk. Additionally, destination buffer is already + * 16 bytes aligned here (which is good for performance). + * + * New things here are DST_R, DST_W, SRC and MASK identifiers. These + * are the aliases for ARM registers which are used as pointers for + * accessing data. We maintain separate pointers for reading and writing + * destination buffer (DST_R and DST_W). + * + * Another new thing is 'cache_preload' macro. It is used for prefetching + * data into CPU L2 cache and improve performance when dealing with large + * images which are far larger than cache size. It uses one argument + * (actually two, but they need to be the same here) - number of pixels + * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some + * details about this macro. Moreover, if good performance is needed + * the code from this macro needs to be copied into '*_tail_head' macro + * and mixed with the rest of code for optimal instructions scheduling. + * We are actually doing it below. + * + * Now after all the explanations, here is the optimized code. + * Different instruction streams (originaling from '*_head', '*_tail' + * and 'cache_preload' macro) use different indentation levels for + * better readability. Actually taking the code from one of these + * indentation levels and ignoring a few VLD/VST instructions would + * result in exactly the code from '*_head', '*_tail' or 'cache_preload' + * macro! + */ + +#if 1 + +.macro pixman_composite_over_8888_0565_process_pixblock_tail_head + vqadd.u8 d16, d2, d20 + vld1.16 {d4, d5}, [DST_R, :128]! + vqadd.u8 q9, q0, q11 + vshrn.u16 d6, q2, #8 + fetch_src_pixblock + vshrn.u16 d7, q2, #3 + vsli.u16 q2, q2, #5 + vshll.u8 q14, d16, #8 + PF add PF_X, PF_X, #8 + vshll.u8 q8, d19, #8 + PF tst PF_CTL, #0xF + vsri.u8 d6, d6, #5 + PF addne PF_X, PF_X, #8 + vmvn.8 d3, d3 + PF subne PF_CTL, PF_CTL, #1 + vsri.u8 d7, d7, #6 + vshrn.u16 d30, q2, #2 + vmull.u8 q10, d3, d6 + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + vmull.u8 q11, d3, d7 + vmull.u8 q12, d3, d30 + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + vsri.u16 q14, q8, #5 + PF cmp PF_X, ORIG_W + vshll.u8 q9, d18, #8 + vrshr.u16 q13, q10, #8 + PF subge PF_X, PF_X, ORIG_W + vrshr.u16 q3, q11, #8 + vrshr.u16 q15, q12, #8 + PF subges PF_CTL, PF_CTL, #0x10 + vsri.u16 q14, q9, #11 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + vraddhn.u16 d20, q10, q13 + vraddhn.u16 d23, q11, q3 + PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vraddhn.u16 d22, q12, q15 + vst1.16 {d28, d29}, [DST_W, :128]! +.endm + +#else + +/* If we did not care much about the performance, we would just use this... */ +.macro pixman_composite_over_8888_0565_process_pixblock_tail_head + pixman_composite_over_8888_0565_process_pixblock_tail + vst1.16 {d28, d29}, [DST_W, :128]! + vld1.16 {d4, d5}, [DST_R, :128]! + fetch_src_pixblock + pixman_composite_over_8888_0565_process_pixblock_head + cache_preload 8, 8 +.endm + +#endif + +/* + * And now the final part. We are using 'generate_composite_function' macro + * to put all the stuff together. We are specifying the name of the function + * which we want to get, number of bits per pixel for the source, mask and + * destination (0 if unused, like mask in this case). Next come some bit + * flags: + * FLAG_DST_READWRITE - tells that the destination buffer is both read + * and written, for write-only buffer we would use + * FLAG_DST_WRITEONLY flag instead + * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data + * and separate color channels for 32bpp format. + * The next things are: + * - the number of pixels processed per iteration (8 in this case, because + * that's the maximum what can fit into four 64-bit NEON registers). + * - prefetch distance, measured in pixel blocks. In this case it is 5 times + * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal + * prefetch distance can be selected by running some benchmarks. + * + * After that we specify some macros, these are 'default_init', + * 'default_cleanup' here which are empty (but it is possible to have custom + * init/cleanup macros to be able to save/restore some extra NEON registers + * like d8-d15 or do anything else) followed by + * 'pixman_composite_over_8888_0565_process_pixblock_head', + * 'pixman_composite_over_8888_0565_process_pixblock_tail' and + * 'pixman_composite_over_8888_0565_process_pixblock_tail_head' + * which we got implemented above. + * + * The last part is the NEON registers allocation scheme. + */ +generate_composite_function \ + pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_over_8888_0565_process_pixblock_head, \ + pixman_composite_over_8888_0565_process_pixblock_tail, \ + pixman_composite_over_8888_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 24 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_over_n_0565_process_pixblock_head + /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format + and put data into d6 - red, d7 - green, d30 - blue */ + vshrn.u16 d6, q2, #8 + vshrn.u16 d7, q2, #3 + vsli.u16 q2, q2, #5 + vsri.u8 d6, d6, #5 + vsri.u8 d7, d7, #6 + vshrn.u16 d30, q2, #2 + /* now do alpha blending, storing results in 8-bit planar format + into d16 - red, d19 - green, d18 - blue */ + vmull.u8 q10, d3, d6 + vmull.u8 q11, d3, d7 + vmull.u8 q12, d3, d30 + vrshr.u16 q13, q10, #8 + vrshr.u16 q3, q11, #8 + vrshr.u16 q15, q12, #8 + vraddhn.u16 d20, q10, q13 + vraddhn.u16 d23, q11, q3 + vraddhn.u16 d22, q12, q15 +.endm + +.macro pixman_composite_over_n_0565_process_pixblock_tail + /* ... continue alpha blending */ + vqadd.u8 d16, d2, d20 + vqadd.u8 q9, q0, q11 + /* convert the result to r5g6b5 and store it into {d28, d29} */ + vshll.u8 q14, d16, #8 + vshll.u8 q8, d19, #8 + vshll.u8 q9, d18, #8 + vsri.u16 q14, q8, #5 + vsri.u16 q14, q9, #11 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_n_0565_process_pixblock_tail_head + pixman_composite_over_n_0565_process_pixblock_tail + vld1.16 {d4, d5}, [DST_R, :128]! + vst1.16 {d28, d29}, [DST_W, :128]! + pixman_composite_over_n_0565_process_pixblock_head + cache_preload 8, 8 +.endm + +.macro pixman_composite_over_n_0565_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d3[0]}, [DUMMY] + vdup.8 d0, d3[0] + vdup.8 d1, d3[1] + vdup.8 d2, d3[2] + vdup.8 d3, d3[3] + vmvn.8 d3, d3 /* invert source alpha */ +.endm + +generate_composite_function \ + pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \ + FLAG_DST_READWRITE, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_n_0565_init, \ + default_cleanup, \ + pixman_composite_over_n_0565_process_pixblock_head, \ + pixman_composite_over_n_0565_process_pixblock_tail, \ + pixman_composite_over_n_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 24 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_8888_0565_process_pixblock_head + vshll.u8 q8, d1, #8 + vshll.u8 q14, d2, #8 + vshll.u8 q9, d0, #8 +.endm + +.macro pixman_composite_src_8888_0565_process_pixblock_tail + vsri.u16 q14, q8, #5 + vsri.u16 q14, q9, #11 +.endm + +.macro pixman_composite_src_8888_0565_process_pixblock_tail_head + vsri.u16 q14, q8, #5 + PF add PF_X, PF_X, #8 + PF tst PF_CTL, #0xF + fetch_src_pixblock + PF addne PF_X, PF_X, #8 + PF subne PF_CTL, PF_CTL, #1 + vsri.u16 q14, q9, #11 + PF cmp PF_X, ORIG_W + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + vshll.u8 q8, d1, #8 + vst1.16 {d28, d29}, [DST_W, :128]! + PF subge PF_X, PF_X, ORIG_W + PF subges PF_CTL, PF_CTL, #0x10 + vshll.u8 q14, d2, #8 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + vshll.u8 q9, d0, #8 +.endm + +generate_composite_function \ + pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_8888_0565_process_pixblock_head, \ + pixman_composite_src_8888_0565_process_pixblock_tail, \ + pixman_composite_src_8888_0565_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_src_0565_8888_process_pixblock_head + vshrn.u16 d30, q0, #8 + vshrn.u16 d29, q0, #3 + vsli.u16 q0, q0, #5 + vmov.u8 d31, #255 + vsri.u8 d30, d30, #5 + vsri.u8 d29, d29, #6 + vshrn.u16 d28, q0, #2 +.endm + +.macro pixman_composite_src_0565_8888_process_pixblock_tail +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_src_0565_8888_process_pixblock_tail_head + pixman_composite_src_0565_8888_process_pixblock_tail + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + fetch_src_pixblock + pixman_composite_src_0565_8888_process_pixblock_head + cache_preload 8, 8 +.endm + +generate_composite_function \ + pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_0565_8888_process_pixblock_head, \ + pixman_composite_src_0565_8888_process_pixblock_tail, \ + pixman_composite_src_0565_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_add_8_8_process_pixblock_head + vqadd.u8 q14, q0, q2 + vqadd.u8 q15, q1, q3 +.endm + +.macro pixman_composite_add_8_8_process_pixblock_tail +.endm + +.macro pixman_composite_add_8_8_process_pixblock_tail_head + fetch_src_pixblock + PF add PF_X, PF_X, #32 + PF tst PF_CTL, #0xF + vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! + PF addne PF_X, PF_X, #32 + PF subne PF_CTL, PF_CTL, #1 + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! + PF cmp PF_X, ORIG_W + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + PF subge PF_X, PF_X, ORIG_W + PF subges PF_CTL, PF_CTL, #0x10 + vqadd.u8 q14, q0, q2 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vqadd.u8 q15, q1, q3 +.endm + +generate_composite_function \ + pixman_composite_add_8_8_asm_neon, 8, 0, 8, \ + FLAG_DST_READWRITE, \ + 32, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_add_8_8_process_pixblock_head, \ + pixman_composite_add_8_8_process_pixblock_tail, \ + pixman_composite_add_8_8_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_add_8888_8888_process_pixblock_tail_head + fetch_src_pixblock + PF add PF_X, PF_X, #8 + PF tst PF_CTL, #0xF + vld1.32 {d4, d5, d6, d7}, [DST_R, :128]! + PF addne PF_X, PF_X, #8 + PF subne PF_CTL, PF_CTL, #1 + vst1.32 {d28, d29, d30, d31}, [DST_W, :128]! + PF cmp PF_X, ORIG_W + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + PF subge PF_X, PF_X, ORIG_W + PF subges PF_CTL, PF_CTL, #0x10 + vqadd.u8 q14, q0, q2 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vqadd.u8 q15, q1, q3 +.endm + +generate_composite_function \ + pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_add_8_8_process_pixblock_head, \ + pixman_composite_add_8_8_process_pixblock_tail, \ + pixman_composite_add_8888_8888_process_pixblock_tail_head + +generate_composite_function_single_scanline \ + pixman_composite_scanline_add_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE, \ + 8, /* number of pixels, processed in a single block */ \ + default_init, \ + default_cleanup, \ + pixman_composite_add_8_8_process_pixblock_head, \ + pixman_composite_add_8_8_process_pixblock_tail, \ + pixman_composite_add_8888_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head + vmvn.8 d24, d3 /* get inverted alpha */ + /* do alpha blending */ + vmull.u8 q8, d24, d4 + vmull.u8 q9, d24, d5 + vmull.u8 q10, d24, d6 + vmull.u8 q11, d24, d7 +.endm + +.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 +.endm + +.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + vrshr.u16 q14, q8, #8 + PF add PF_X, PF_X, #8 + PF tst PF_CTL, #0xF + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + PF addne PF_X, PF_X, #8 + PF subne PF_CTL, PF_CTL, #1 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + PF cmp PF_X, ORIG_W + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 + fetch_src_pixblock + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + vmvn.8 d22, d3 + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + PF subge PF_X, PF_X, ORIG_W + vmull.u8 q8, d22, d4 + PF subges PF_CTL, PF_CTL, #0x10 + vmull.u8 q9, d22, d5 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + vmull.u8 q10, d22, d6 + PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vmull.u8 q11, d22, d7 +.endm + +generate_composite_function_single_scanline \ + pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init, \ + default_cleanup, \ + pixman_composite_out_reverse_8888_8888_process_pixblock_head, \ + pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \ + pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_8888_8888_process_pixblock_head + pixman_composite_out_reverse_8888_8888_process_pixblock_head +.endm + +.macro pixman_composite_over_8888_8888_process_pixblock_tail + pixman_composite_out_reverse_8888_8888_process_pixblock_tail + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 +.endm + +.macro pixman_composite_over_8888_8888_process_pixblock_tail_head + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + vrshr.u16 q14, q8, #8 + PF add PF_X, PF_X, #8 + PF tst PF_CTL, #0xF + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + PF addne PF_X, PF_X, #8 + PF subne PF_CTL, PF_CTL, #1 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + PF cmp PF_X, ORIG_W + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 + fetch_src_pixblock + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + vmvn.8 d22, d3 + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + PF subge PF_X, PF_X, ORIG_W + vmull.u8 q8, d22, d4 + PF subges PF_CTL, PF_CTL, #0x10 + vmull.u8 q9, d22, d5 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + vmull.u8 q10, d22, d6 + PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vmull.u8 q11, d22, d7 +.endm + +generate_composite_function \ + pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_over_8888_8888_process_pixblock_head, \ + pixman_composite_over_8888_8888_process_pixblock_tail, \ + pixman_composite_over_8888_8888_process_pixblock_tail_head + +generate_composite_function_single_scanline \ + pixman_composite_scanline_over_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init, \ + default_cleanup, \ + pixman_composite_over_8888_8888_process_pixblock_head, \ + pixman_composite_over_8888_8888_process_pixblock_tail, \ + pixman_composite_over_8888_8888_process_pixblock_tail_head + +/******************************************************************************/ + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_n_8888_process_pixblock_tail_head + pixman_composite_over_8888_8888_process_pixblock_tail + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + pixman_composite_over_8888_8888_process_pixblock_head + cache_preload 8, 8 +.endm + +.macro pixman_composite_over_n_8888_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d3[0]}, [DUMMY] + vdup.8 d0, d3[0] + vdup.8 d1, d3[1] + vdup.8 d2, d3[2] + vdup.8 d3, d3[3] +.endm + +generate_composite_function \ + pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_n_8888_init, \ + default_cleanup, \ + pixman_composite_over_8888_8888_process_pixblock_head, \ + pixman_composite_over_8888_8888_process_pixblock_tail, \ + pixman_composite_over_n_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head + vrshr.u16 q14, q8, #8 + PF add PF_X, PF_X, #8 + PF tst PF_CTL, #0xF + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + PF addne PF_X, PF_X, #8 + PF subne PF_CTL, PF_CTL, #1 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + PF cmp PF_X, ORIG_W + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 + vld4.8 {d0, d1, d2, d3}, [DST_R, :128]! + vmvn.8 d22, d3 + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + PF subge PF_X, PF_X, ORIG_W + vmull.u8 q8, d22, d4 + PF subges PF_CTL, PF_CTL, #0x10 + vmull.u8 q9, d22, d5 + vmull.u8 q10, d22, d6 + PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vmull.u8 q11, d22, d7 +.endm + +.macro pixman_composite_over_reverse_n_8888_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d7[0]}, [DUMMY] + vdup.8 d4, d7[0] + vdup.8 d5, d7[1] + vdup.8 d6, d7[2] + vdup.8 d7, d7[3] +.endm + +generate_composite_function \ + pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_reverse_n_8888_init, \ + default_cleanup, \ + pixman_composite_over_8888_8888_process_pixblock_head, \ + pixman_composite_over_8888_8888_process_pixblock_tail, \ + pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 4, /* src_basereg */ \ + 24 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_over_8888_8_0565_process_pixblock_head + vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */ + vmull.u8 q1, d24, d9 + vmull.u8 q6, d24, d10 + vmull.u8 q7, d24, d11 + vshrn.u16 d6, q2, #8 /* convert DST_R data to 32-bpp (part1) */ + vshrn.u16 d7, q2, #3 + vsli.u16 q2, q2, #5 + vrshr.u16 q8, q0, #8 /* IN for SRC pixels (part2) */ + vrshr.u16 q9, q1, #8 + vrshr.u16 q10, q6, #8 + vrshr.u16 q11, q7, #8 + vraddhn.u16 d0, q0, q8 + vraddhn.u16 d1, q1, q9 + vraddhn.u16 d2, q6, q10 + vraddhn.u16 d3, q7, q11 + vsri.u8 d6, d6, #5 /* convert DST_R data to 32-bpp (part2) */ + vsri.u8 d7, d7, #6 + vmvn.8 d3, d3 + vshrn.u16 d30, q2, #2 + vmull.u8 q8, d3, d6 /* now do alpha blending */ + vmull.u8 q9, d3, d7 + vmull.u8 q10, d3, d30 +.endm + +.macro pixman_composite_over_8888_8_0565_process_pixblock_tail + /* 3 cycle bubble (after vmull.u8) */ + vrshr.u16 q13, q8, #8 + vrshr.u16 q11, q9, #8 + vrshr.u16 q15, q10, #8 + vraddhn.u16 d16, q8, q13 + vraddhn.u16 d27, q9, q11 + vraddhn.u16 d26, q10, q15 + vqadd.u8 d16, d2, d16 + /* 1 cycle bubble */ + vqadd.u8 q9, q0, q13 + vshll.u8 q14, d16, #8 /* convert to 16bpp */ + vshll.u8 q8, d19, #8 + vshll.u8 q9, d18, #8 + vsri.u16 q14, q8, #5 + /* 1 cycle bubble */ + vsri.u16 q14, q9, #11 +.endm + +.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head + vld1.16 {d4, d5}, [DST_R, :128]! + vshrn.u16 d6, q2, #8 + fetch_mask_pixblock + vshrn.u16 d7, q2, #3 + fetch_src_pixblock + vmull.u8 q6, d24, d10 + vrshr.u16 q13, q8, #8 + vrshr.u16 q11, q9, #8 + vrshr.u16 q15, q10, #8 + vraddhn.u16 d16, q8, q13 + vraddhn.u16 d27, q9, q11 + vraddhn.u16 d26, q10, q15 + vqadd.u8 d16, d2, d16 + vmull.u8 q1, d24, d9 + vqadd.u8 q9, q0, q13 + vshll.u8 q14, d16, #8 + vmull.u8 q0, d24, d8 + vshll.u8 q8, d19, #8 + vshll.u8 q9, d18, #8 + vsri.u16 q14, q8, #5 + vmull.u8 q7, d24, d11 + vsri.u16 q14, q9, #11 + + cache_preload 8, 8 + + vsli.u16 q2, q2, #5 + vrshr.u16 q8, q0, #8 + vrshr.u16 q9, q1, #8 + vrshr.u16 q10, q6, #8 + vrshr.u16 q11, q7, #8 + vraddhn.u16 d0, q0, q8 + vraddhn.u16 d1, q1, q9 + vraddhn.u16 d2, q6, q10 + vraddhn.u16 d3, q7, q11 + vsri.u8 d6, d6, #5 + vsri.u8 d7, d7, #6 + vmvn.8 d3, d3 + vshrn.u16 d30, q2, #2 + vst1.16 {d28, d29}, [DST_W, :128]! + vmull.u8 q8, d3, d6 + vmull.u8 q9, d3, d7 + vmull.u8 q10, d3, d30 +.endm + +generate_composite_function \ + pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_over_8888_8_0565_process_pixblock_head, \ + pixman_composite_over_8888_8_0565_process_pixblock_tail, \ + pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 8, /* src_basereg */ \ + 24 /* mask_basereg */ + +/******************************************************************************/ + +/* + * This function needs a special initialization of solid mask. + * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET + * offset, split into color components and replicated in d8-d11 + * registers. Additionally, this function needs all the NEON registers, + * so it has to save d8-d15 registers which are callee saved according + * to ABI. These registers are restored from 'cleanup' macro. All the + * other NEON registers are caller saved, so can be clobbered freely + * without introducing any problems. + */ +.macro pixman_composite_over_n_8_0565_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vpush {d8-d15} + vld1.32 {d11[0]}, [DUMMY] + vdup.8 d8, d11[0] + vdup.8 d9, d11[1] + vdup.8 d10, d11[2] + vdup.8 d11, d11[3] +.endm + +.macro pixman_composite_over_n_8_0565_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \ + FLAG_DST_READWRITE, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_n_8_0565_init, \ + pixman_composite_over_n_8_0565_cleanup, \ + pixman_composite_over_8888_8_0565_process_pixblock_head, \ + pixman_composite_over_8888_8_0565_process_pixblock_tail, \ + pixman_composite_over_8888_8_0565_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_8888_n_0565_init + add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) + vpush {d8-d15} + vld1.32 {d24[0]}, [DUMMY] + vdup.8 d24, d24[3] +.endm + +.macro pixman_composite_over_8888_n_0565_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_8888_n_0565_init, \ + pixman_composite_over_8888_n_0565_cleanup, \ + pixman_composite_over_8888_8_0565_process_pixblock_head, \ + pixman_composite_over_8888_8_0565_process_pixblock_tail, \ + pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 8, /* src_basereg */ \ + 24 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_0565_0565_process_pixblock_head +.endm + +.macro pixman_composite_src_0565_0565_process_pixblock_tail +.endm + +.macro pixman_composite_src_0565_0565_process_pixblock_tail_head + vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! + fetch_src_pixblock + cache_preload 16, 16 +.endm + +generate_composite_function \ + pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \ + FLAG_DST_WRITEONLY, \ + 16, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_0565_0565_process_pixblock_head, \ + pixman_composite_src_0565_0565_process_pixblock_tail, \ + pixman_composite_src_0565_0565_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_n_8_process_pixblock_head +.endm + +.macro pixman_composite_src_n_8_process_pixblock_tail +.endm + +.macro pixman_composite_src_n_8_process_pixblock_tail_head + vst1.8 {d0, d1, d2, d3}, [DST_W, :128]! +.endm + +.macro pixman_composite_src_n_8_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d0[0]}, [DUMMY] + vsli.u64 d0, d0, #8 + vsli.u64 d0, d0, #16 + vsli.u64 d0, d0, #32 + vorr d1, d0, d0 + vorr q1, q0, q0 +.endm + +.macro pixman_composite_src_n_8_cleanup +.endm + +generate_composite_function \ + pixman_composite_src_n_8_asm_neon, 0, 0, 8, \ + FLAG_DST_WRITEONLY, \ + 32, /* number of pixels, processed in a single block */ \ + 0, /* prefetch distance */ \ + pixman_composite_src_n_8_init, \ + pixman_composite_src_n_8_cleanup, \ + pixman_composite_src_n_8_process_pixblock_head, \ + pixman_composite_src_n_8_process_pixblock_tail, \ + pixman_composite_src_n_8_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_n_0565_process_pixblock_head +.endm + +.macro pixman_composite_src_n_0565_process_pixblock_tail +.endm + +.macro pixman_composite_src_n_0565_process_pixblock_tail_head + vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! +.endm + +.macro pixman_composite_src_n_0565_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d0[0]}, [DUMMY] + vsli.u64 d0, d0, #16 + vsli.u64 d0, d0, #32 + vorr d1, d0, d0 + vorr q1, q0, q0 +.endm + +.macro pixman_composite_src_n_0565_cleanup +.endm + +generate_composite_function \ + pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \ + FLAG_DST_WRITEONLY, \ + 16, /* number of pixels, processed in a single block */ \ + 0, /* prefetch distance */ \ + pixman_composite_src_n_0565_init, \ + pixman_composite_src_n_0565_cleanup, \ + pixman_composite_src_n_0565_process_pixblock_head, \ + pixman_composite_src_n_0565_process_pixblock_tail, \ + pixman_composite_src_n_0565_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_n_8888_process_pixblock_head +.endm + +.macro pixman_composite_src_n_8888_process_pixblock_tail +.endm + +.macro pixman_composite_src_n_8888_process_pixblock_tail_head + vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! +.endm + +.macro pixman_composite_src_n_8888_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d0[0]}, [DUMMY] + vsli.u64 d0, d0, #32 + vorr d1, d0, d0 + vorr q1, q0, q0 +.endm + +.macro pixman_composite_src_n_8888_cleanup +.endm + +generate_composite_function \ + pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \ + FLAG_DST_WRITEONLY, \ + 8, /* number of pixels, processed in a single block */ \ + 0, /* prefetch distance */ \ + pixman_composite_src_n_8888_init, \ + pixman_composite_src_n_8888_cleanup, \ + pixman_composite_src_n_8888_process_pixblock_head, \ + pixman_composite_src_n_8888_process_pixblock_tail, \ + pixman_composite_src_n_8888_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_8888_8888_process_pixblock_head +.endm + +.macro pixman_composite_src_8888_8888_process_pixblock_tail +.endm + +.macro pixman_composite_src_8888_8888_process_pixblock_tail_head + vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! + fetch_src_pixblock + cache_preload 8, 8 +.endm + +generate_composite_function \ + pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_WRITEONLY, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_8888_8888_process_pixblock_head, \ + pixman_composite_src_8888_8888_process_pixblock_tail, \ + pixman_composite_src_8888_8888_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_x888_8888_process_pixblock_head + vorr q0, q0, q2 + vorr q1, q1, q2 +.endm + +.macro pixman_composite_src_x888_8888_process_pixblock_tail +.endm + +.macro pixman_composite_src_x888_8888_process_pixblock_tail_head + vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! + fetch_src_pixblock + vorr q0, q0, q2 + vorr q1, q1, q2 + cache_preload 8, 8 +.endm + +.macro pixman_composite_src_x888_8888_init + vmov.u8 q2, #0xFF + vshl.u32 q2, q2, #24 +.endm + +generate_composite_function \ + pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_WRITEONLY, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + pixman_composite_src_x888_8888_init, \ + default_cleanup, \ + pixman_composite_src_x888_8888_process_pixblock_head, \ + pixman_composite_src_x888_8888_process_pixblock_tail, \ + pixman_composite_src_x888_8888_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_over_n_8_8888_process_pixblock_head + /* expecting deinterleaved source data in {d8, d9, d10, d11} */ + /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ + /* and destination data in {d4, d5, d6, d7} */ + /* mask is in d24 (d25, d26, d27 are unused) */ + + /* in */ + vmull.u8 q0, d24, d8 + vmull.u8 q1, d24, d9 + vmull.u8 q6, d24, d10 + vmull.u8 q7, d24, d11 + vrshr.u16 q10, q0, #8 + vrshr.u16 q11, q1, #8 + vrshr.u16 q12, q6, #8 + vrshr.u16 q13, q7, #8 + vraddhn.u16 d0, q0, q10 + vraddhn.u16 d1, q1, q11 + vraddhn.u16 d2, q6, q12 + vraddhn.u16 d3, q7, q13 + vmvn.8 d24, d3 /* get inverted alpha */ + /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */ + /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */ + /* now do alpha blending */ + vmull.u8 q8, d24, d4 + vmull.u8 q9, d24, d5 + vmull.u8 q10, d24, d6 + vmull.u8 q11, d24, d7 +.endm + +.macro pixman_composite_over_n_8_8888_process_pixblock_tail + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head + pixman_composite_over_n_8_8888_process_pixblock_tail + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + fetch_mask_pixblock + cache_preload 8, 8 + pixman_composite_over_n_8_8888_process_pixblock_head +.endm + +.macro pixman_composite_over_n_8_8888_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vpush {d8-d15} + vld1.32 {d11[0]}, [DUMMY] + vdup.8 d8, d11[0] + vdup.8 d9, d11[1] + vdup.8 d10, d11[2] + vdup.8 d11, d11[3] +.endm + +.macro pixman_composite_over_n_8_8888_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_n_8_8888_init, \ + pixman_composite_over_n_8_8888_cleanup, \ + pixman_composite_over_n_8_8888_process_pixblock_head, \ + pixman_composite_over_n_8_8888_process_pixblock_tail, \ + pixman_composite_over_n_8_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_n_8_8_process_pixblock_head + vmull.u8 q0, d24, d8 + vmull.u8 q1, d25, d8 + vmull.u8 q6, d26, d8 + vmull.u8 q7, d27, d8 + vrshr.u16 q10, q0, #8 + vrshr.u16 q11, q1, #8 + vrshr.u16 q12, q6, #8 + vrshr.u16 q13, q7, #8 + vraddhn.u16 d0, q0, q10 + vraddhn.u16 d1, q1, q11 + vraddhn.u16 d2, q6, q12 + vraddhn.u16 d3, q7, q13 + vmvn.8 q12, q0 + vmvn.8 q13, q1 + vmull.u8 q8, d24, d4 + vmull.u8 q9, d25, d5 + vmull.u8 q10, d26, d6 + vmull.u8 q11, d27, d7 +.endm + +.macro pixman_composite_over_n_8_8_process_pixblock_tail + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_n_8_8_process_pixblock_tail_head + vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! + pixman_composite_over_n_8_8_process_pixblock_tail + fetch_mask_pixblock + cache_preload 32, 32 + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! + pixman_composite_over_n_8_8_process_pixblock_head +.endm + +.macro pixman_composite_over_n_8_8_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vpush {d8-d15} + vld1.32 {d8[0]}, [DUMMY] + vdup.8 d8, d8[3] +.endm + +.macro pixman_composite_over_n_8_8_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \ + FLAG_DST_READWRITE, \ + 32, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_n_8_8_init, \ + pixman_composite_over_n_8_8_cleanup, \ + pixman_composite_over_n_8_8_process_pixblock_head, \ + pixman_composite_over_n_8_8_process_pixblock_tail, \ + pixman_composite_over_n_8_8_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head + /* + * 'combine_mask_ca' replacement + * + * input: solid src (n) in {d8, d9, d10, d11} + * dest in {d4, d5, d6, d7 } + * mask in {d24, d25, d26, d27} + * output: updated src in {d0, d1, d2, d3 } + * updated mask in {d24, d25, d26, d3 } + */ + vmull.u8 q0, d24, d8 + vmull.u8 q1, d25, d9 + vmull.u8 q6, d26, d10 + vmull.u8 q7, d27, d11 + vmull.u8 q9, d11, d25 + vmull.u8 q12, d11, d24 + vmull.u8 q13, d11, d26 + vrshr.u16 q8, q0, #8 + vrshr.u16 q10, q1, #8 + vrshr.u16 q11, q6, #8 + vraddhn.u16 d0, q0, q8 + vraddhn.u16 d1, q1, q10 + vraddhn.u16 d2, q6, q11 + vrshr.u16 q11, q12, #8 + vrshr.u16 q8, q9, #8 + vrshr.u16 q6, q13, #8 + vrshr.u16 q10, q7, #8 + vraddhn.u16 d24, q12, q11 + vraddhn.u16 d25, q9, q8 + vraddhn.u16 d26, q13, q6 + vraddhn.u16 d3, q7, q10 + /* + * 'combine_over_ca' replacement + * + * output: updated dest in {d28, d29, d30, d31} + */ + vmvn.8 d24, d24 + vmvn.8 d25, d25 + vmull.u8 q8, d24, d4 + vmull.u8 q9, d25, d5 + vmvn.8 d26, d26 + vmvn.8 d27, d3 + vmull.u8 q10, d26, d6 + vmull.u8 q11, d27, d7 +.endm + +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail + /* ... continue 'combine_over_ca' replacement */ + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q6, q10, #8 + vrshr.u16 q7, q11, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q6, q10 + vraddhn.u16 d31, q7, q11 + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 +.endm + +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + vrshr.u16 q6, q10, #8 + vrshr.u16 q7, q11, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q6, q10 + vraddhn.u16 d31, q7, q11 + fetch_mask_pixblock + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 + cache_preload 8, 8 + pixman_composite_over_n_8888_8888_ca_process_pixblock_head + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! +.endm + +.macro pixman_composite_over_n_8888_8888_ca_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vpush {d8-d15} + vld1.32 {d11[0]}, [DUMMY] + vdup.8 d8, d11[0] + vdup.8 d9, d11[1] + vdup.8 d10, d11[2] + vdup.8 d11, d11[3] +.endm + +.macro pixman_composite_over_n_8888_8888_ca_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_n_8888_8888_ca_init, \ + pixman_composite_over_n_8888_8888_ca_cleanup, \ + pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \ + pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \ + pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_in_n_8_process_pixblock_head + /* expecting source data in {d0, d1, d2, d3} */ + /* and destination data in {d4, d5, d6, d7} */ + vmull.u8 q8, d4, d3 + vmull.u8 q9, d5, d3 + vmull.u8 q10, d6, d3 + vmull.u8 q11, d7, d3 +.endm + +.macro pixman_composite_in_n_8_process_pixblock_tail + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + vraddhn.u16 d28, q8, q14 + vraddhn.u16 d29, q9, q15 + vraddhn.u16 d30, q10, q12 + vraddhn.u16 d31, q11, q13 +.endm + +.macro pixman_composite_in_n_8_process_pixblock_tail_head + pixman_composite_in_n_8_process_pixblock_tail + vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! + cache_preload 32, 32 + pixman_composite_in_n_8_process_pixblock_head + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! +.endm + +.macro pixman_composite_in_n_8_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d3[0]}, [DUMMY] + vdup.8 d3, d3[3] +.endm + +.macro pixman_composite_in_n_8_cleanup +.endm + +generate_composite_function \ + pixman_composite_in_n_8_asm_neon, 0, 0, 8, \ + FLAG_DST_READWRITE, \ + 32, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_in_n_8_init, \ + pixman_composite_in_n_8_cleanup, \ + pixman_composite_in_n_8_process_pixblock_head, \ + pixman_composite_in_n_8_process_pixblock_tail, \ + pixman_composite_in_n_8_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 24 /* mask_basereg */ + +.macro pixman_composite_add_n_8_8_process_pixblock_head + /* expecting source data in {d8, d9, d10, d11} */ + /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ + /* and destination data in {d4, d5, d6, d7} */ + /* mask is in d24, d25, d26, d27 */ + vmull.u8 q0, d24, d11 + vmull.u8 q1, d25, d11 + vmull.u8 q6, d26, d11 + vmull.u8 q7, d27, d11 + vrshr.u16 q10, q0, #8 + vrshr.u16 q11, q1, #8 + vrshr.u16 q12, q6, #8 + vrshr.u16 q13, q7, #8 + vraddhn.u16 d0, q0, q10 + vraddhn.u16 d1, q1, q11 + vraddhn.u16 d2, q6, q12 + vraddhn.u16 d3, q7, q13 + vqadd.u8 q14, q0, q2 + vqadd.u8 q15, q1, q3 +.endm + +.macro pixman_composite_add_n_8_8_process_pixblock_tail +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_add_n_8_8_process_pixblock_tail_head + pixman_composite_add_n_8_8_process_pixblock_tail + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! + vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! + fetch_mask_pixblock + cache_preload 32, 32 + pixman_composite_add_n_8_8_process_pixblock_head +.endm + +.macro pixman_composite_add_n_8_8_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vpush {d8-d15} + vld1.32 {d11[0]}, [DUMMY] + vdup.8 d11, d11[3] +.endm + +.macro pixman_composite_add_n_8_8_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \ + FLAG_DST_READWRITE, \ + 32, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_add_n_8_8_init, \ + pixman_composite_add_n_8_8_cleanup, \ + pixman_composite_add_n_8_8_process_pixblock_head, \ + pixman_composite_add_n_8_8_process_pixblock_tail, \ + pixman_composite_add_n_8_8_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_add_8_8_8_process_pixblock_head + /* expecting source data in {d0, d1, d2, d3} */ + /* destination data in {d4, d5, d6, d7} */ + /* mask in {d24, d25, d26, d27} */ + vmull.u8 q8, d24, d0 + vmull.u8 q9, d25, d1 + vmull.u8 q10, d26, d2 + vmull.u8 q11, d27, d3 + vrshr.u16 q0, q8, #8 + vrshr.u16 q1, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + vraddhn.u16 d0, q0, q8 + vraddhn.u16 d1, q1, q9 + vraddhn.u16 d2, q12, q10 + vraddhn.u16 d3, q13, q11 + vqadd.u8 q14, q0, q2 + vqadd.u8 q15, q1, q3 +.endm + +.macro pixman_composite_add_8_8_8_process_pixblock_tail +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_add_8_8_8_process_pixblock_tail_head + pixman_composite_add_8_8_8_process_pixblock_tail + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! + vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! + fetch_mask_pixblock + fetch_src_pixblock + cache_preload 32, 32 + pixman_composite_add_8_8_8_process_pixblock_head +.endm + +.macro pixman_composite_add_8_8_8_init +.endm + +.macro pixman_composite_add_8_8_8_cleanup +.endm + +generate_composite_function \ + pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \ + FLAG_DST_READWRITE, \ + 32, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_add_8_8_8_init, \ + pixman_composite_add_8_8_8_cleanup, \ + pixman_composite_add_8_8_8_process_pixblock_head, \ + pixman_composite_add_8_8_8_process_pixblock_tail, \ + pixman_composite_add_8_8_8_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_add_8888_8888_8888_process_pixblock_head + /* expecting source data in {d0, d1, d2, d3} */ + /* destination data in {d4, d5, d6, d7} */ + /* mask in {d24, d25, d26, d27} */ + vmull.u8 q8, d27, d0 + vmull.u8 q9, d27, d1 + vmull.u8 q10, d27, d2 + vmull.u8 q11, d27, d3 + /* 1 cycle bubble */ + vrsra.u16 q8, q8, #8 + vrsra.u16 q9, q9, #8 + vrsra.u16 q10, q10, #8 + vrsra.u16 q11, q11, #8 +.endm + +.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail + /* 2 cycle bubble */ + vrshrn.u16 d28, q8, #8 + vrshrn.u16 d29, q9, #8 + vrshrn.u16 d30, q10, #8 + vrshrn.u16 d31, q11, #8 + vqadd.u8 q14, q2, q14 + /* 1 cycle bubble */ + vqadd.u8 q15, q3, q15 +.endm + +.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head + fetch_src_pixblock + vrshrn.u16 d28, q8, #8 + fetch_mask_pixblock + vrshrn.u16 d29, q9, #8 + vmull.u8 q8, d27, d0 + vrshrn.u16 d30, q10, #8 + vmull.u8 q9, d27, d1 + vrshrn.u16 d31, q11, #8 + vmull.u8 q10, d27, d2 + vqadd.u8 q14, q2, q14 + vmull.u8 q11, d27, d3 + vqadd.u8 q15, q3, q15 + vrsra.u16 q8, q8, #8 + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + vrsra.u16 q9, q9, #8 + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + vrsra.u16 q10, q10, #8 + + cache_preload 8, 8 + + vrsra.u16 q11, q11, #8 +.endm + +generate_composite_function \ + pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_add_8888_8888_8888_process_pixblock_head, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head + +generate_composite_function_single_scanline \ + pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init, \ + default_cleanup, \ + pixman_composite_add_8888_8888_8888_process_pixblock_head, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head + +/******************************************************************************/ + +generate_composite_function \ + pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_add_8888_8888_8888_process_pixblock_head, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 27 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_add_n_8_8888_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d3[0]}, [DUMMY] + vdup.8 d0, d3[0] + vdup.8 d1, d3[1] + vdup.8 d2, d3[2] + vdup.8 d3, d3[3] +.endm + +.macro pixman_composite_add_n_8_8888_cleanup +.endm + +generate_composite_function \ + pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_add_n_8_8888_init, \ + pixman_composite_add_n_8_8888_cleanup, \ + pixman_composite_add_8888_8888_8888_process_pixblock_head, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 27 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_add_8888_n_8888_init + add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) + vld1.32 {d27[0]}, [DUMMY] + vdup.8 d27, d27[3] +.endm + +.macro pixman_composite_add_8888_n_8888_cleanup +.endm + +generate_composite_function \ + pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_add_8888_n_8888_init, \ + pixman_composite_add_8888_n_8888_cleanup, \ + pixman_composite_add_8888_8888_8888_process_pixblock_head, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 27 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head + /* expecting source data in {d0, d1, d2, d3} */ + /* destination data in {d4, d5, d6, d7} */ + /* solid mask is in d15 */ + + /* 'in' */ + vmull.u8 q8, d15, d3 + vmull.u8 q6, d15, d2 + vmull.u8 q5, d15, d1 + vmull.u8 q4, d15, d0 + vrshr.u16 q13, q8, #8 + vrshr.u16 q12, q6, #8 + vrshr.u16 q11, q5, #8 + vrshr.u16 q10, q4, #8 + vraddhn.u16 d3, q8, q13 + vraddhn.u16 d2, q6, q12 + vraddhn.u16 d1, q5, q11 + vraddhn.u16 d0, q4, q10 + vmvn.8 d24, d3 /* get inverted alpha */ + /* now do alpha blending */ + vmull.u8 q8, d24, d4 + vmull.u8 q9, d24, d5 + vmull.u8 q10, d24, d6 + vmull.u8 q11, d24, d7 +.endm + +.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail + fetch_src_pixblock + cache_preload 8, 8 + fetch_mask_pixblock + pixman_composite_out_reverse_8888_n_8888_process_pixblock_head + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! +.endm + +generate_composite_function_single_scanline \ + pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \ + pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \ + pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 12 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_over_8888_n_8888_process_pixblock_head + pixman_composite_out_reverse_8888_n_8888_process_pixblock_head +.endm + +.macro pixman_composite_over_8888_n_8888_process_pixblock_tail + pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + pixman_composite_over_8888_n_8888_process_pixblock_tail + fetch_src_pixblock + cache_preload 8, 8 + pixman_composite_over_8888_n_8888_process_pixblock_head + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! +.endm + +.macro pixman_composite_over_8888_n_8888_init + add DUMMY, sp, #48 + vpush {d8-d15} + vld1.32 {d15[0]}, [DUMMY] + vdup.8 d15, d15[3] +.endm + +.macro pixman_composite_over_8888_n_8888_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_8888_n_8888_init, \ + pixman_composite_over_8888_n_8888_cleanup, \ + pixman_composite_over_8888_n_8888_process_pixblock_head, \ + pixman_composite_over_8888_n_8888_process_pixblock_tail, \ + pixman_composite_over_8888_n_8888_process_pixblock_tail_head + +/******************************************************************************/ + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + pixman_composite_over_8888_n_8888_process_pixblock_tail + fetch_src_pixblock + cache_preload 8, 8 + fetch_mask_pixblock + pixman_composite_over_8888_n_8888_process_pixblock_head + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! +.endm + +generate_composite_function \ + pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_over_8888_n_8888_process_pixblock_head, \ + pixman_composite_over_8888_n_8888_process_pixblock_tail, \ + pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 12 /* mask_basereg */ + +generate_composite_function_single_scanline \ + pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_over_8888_n_8888_process_pixblock_head, \ + pixman_composite_over_8888_n_8888_process_pixblock_tail, \ + pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 12 /* mask_basereg */ + +/******************************************************************************/ + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + pixman_composite_over_8888_n_8888_process_pixblock_tail + fetch_src_pixblock + cache_preload 8, 8 + fetch_mask_pixblock + pixman_composite_over_8888_n_8888_process_pixblock_head + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! +.endm + +generate_composite_function \ + pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_over_8888_n_8888_process_pixblock_head, \ + pixman_composite_over_8888_n_8888_process_pixblock_tail, \ + pixman_composite_over_8888_8_8888_process_pixblock_tail_head \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 15 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_0888_0888_process_pixblock_head +.endm + +.macro pixman_composite_src_0888_0888_process_pixblock_tail +.endm + +.macro pixman_composite_src_0888_0888_process_pixblock_tail_head + vst3.8 {d0, d1, d2}, [DST_W]! + fetch_src_pixblock + cache_preload 8, 8 +.endm + +generate_composite_function \ + pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \ + FLAG_DST_WRITEONLY, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_0888_0888_process_pixblock_head, \ + pixman_composite_src_0888_0888_process_pixblock_tail, \ + pixman_composite_src_0888_0888_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_0888_8888_rev_process_pixblock_head + vswp d0, d2 +.endm + +.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail +.endm + +.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head + vst4.8 {d0, d1, d2, d3}, [DST_W]! + fetch_src_pixblock + vswp d0, d2 + cache_preload 8, 8 +.endm + +.macro pixman_composite_src_0888_8888_rev_init + veor d3, d3, d3 +.endm + +generate_composite_function \ + pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + pixman_composite_src_0888_8888_rev_init, \ + default_cleanup, \ + pixman_composite_src_0888_8888_rev_process_pixblock_head, \ + pixman_composite_src_0888_8888_rev_process_pixblock_tail, \ + pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_0888_0565_rev_process_pixblock_head + vshll.u8 q8, d1, #8 + vshll.u8 q9, d2, #8 +.endm + +.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail + vshll.u8 q14, d0, #8 + vsri.u16 q14, q8, #5 + vsri.u16 q14, q9, #11 +.endm + +.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head + vshll.u8 q14, d0, #8 + fetch_src_pixblock + vsri.u16 q14, q8, #5 + vsri.u16 q14, q9, #11 + vshll.u8 q8, d1, #8 + vst1.16 {d28, d29}, [DST_W, :128]! + vshll.u8 q9, d2, #8 +.endm + +generate_composite_function \ + pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \ + FLAG_DST_WRITEONLY, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_0888_0565_rev_process_pixblock_head, \ + pixman_composite_src_0888_0565_rev_process_pixblock_tail, \ + pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_pixbuf_8888_process_pixblock_head + vmull.u8 q8, d3, d0 + vmull.u8 q9, d3, d1 + vmull.u8 q10, d3, d2 +.endm + +.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail + vrshr.u16 q11, q8, #8 + vswp d3, d31 + vrshr.u16 q12, q9, #8 + vrshr.u16 q13, q10, #8 + vraddhn.u16 d30, q11, q8 + vraddhn.u16 d29, q12, q9 + vraddhn.u16 d28, q13, q10 +.endm + +.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head + vrshr.u16 q11, q8, #8 + vswp d3, d31 + vrshr.u16 q12, q9, #8 + vrshr.u16 q13, q10, #8 + fetch_src_pixblock + vraddhn.u16 d30, q11, q8 + PF add PF_X, PF_X, #8 + PF tst PF_CTL, #0xF + PF addne PF_X, PF_X, #8 + PF subne PF_CTL, PF_CTL, #1 + vraddhn.u16 d29, q12, q9 + vraddhn.u16 d28, q13, q10 + vmull.u8 q8, d3, d0 + vmull.u8 q9, d3, d1 + vmull.u8 q10, d3, d2 + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + PF cmp PF_X, ORIG_W + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + PF subge PF_X, PF_X, ORIG_W + PF subges PF_CTL, PF_CTL, #0x10 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! +.endm + +generate_composite_function \ + pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_pixbuf_8888_process_pixblock_head, \ + pixman_composite_src_pixbuf_8888_process_pixblock_tail, \ + pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head + vmull.u8 q8, d3, d0 + vmull.u8 q9, d3, d1 + vmull.u8 q10, d3, d2 +.endm + +.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail + vrshr.u16 q11, q8, #8 + vswp d3, d31 + vrshr.u16 q12, q9, #8 + vrshr.u16 q13, q10, #8 + vraddhn.u16 d28, q11, q8 + vraddhn.u16 d29, q12, q9 + vraddhn.u16 d30, q13, q10 +.endm + +.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head + vrshr.u16 q11, q8, #8 + vswp d3, d31 + vrshr.u16 q12, q9, #8 + vrshr.u16 q13, q10, #8 + fetch_src_pixblock + vraddhn.u16 d28, q11, q8 + PF add PF_X, PF_X, #8 + PF tst PF_CTL, #0xF + PF addne PF_X, PF_X, #8 + PF subne PF_CTL, PF_CTL, #1 + vraddhn.u16 d29, q12, q9 + vraddhn.u16 d30, q13, q10 + vmull.u8 q8, d3, d0 + vmull.u8 q9, d3, d1 + vmull.u8 q10, d3, d2 + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + PF cmp PF_X, ORIG_W + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + PF subge PF_X, PF_X, ORIG_W + PF subges PF_CTL, PF_CTL, #0x10 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! +.endm + +generate_composite_function \ + pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_rpixbuf_8888_process_pixblock_head, \ + pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \ + pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_over_0565_8_0565_process_pixblock_head + /* mask is in d15 */ + convert_0565_to_x888 q4, d2, d1, d0 + convert_0565_to_x888 q5, d6, d5, d4 + /* source pixel data is in {d0, d1, d2, XX} */ + /* destination pixel data is in {d4, d5, d6, XX} */ + vmvn.8 d7, d15 + vmull.u8 q6, d15, d2 + vmull.u8 q5, d15, d1 + vmull.u8 q4, d15, d0 + vmull.u8 q8, d7, d4 + vmull.u8 q9, d7, d5 + vmull.u8 q13, d7, d6 + vrshr.u16 q12, q6, #8 + vrshr.u16 q11, q5, #8 + vrshr.u16 q10, q4, #8 + vraddhn.u16 d2, q6, q12 + vraddhn.u16 d1, q5, q11 + vraddhn.u16 d0, q4, q10 +.endm + +.macro pixman_composite_over_0565_8_0565_process_pixblock_tail + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q13, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q12, q13 + vqadd.u8 q0, q0, q14 + vqadd.u8 q1, q1, q15 + /* 32bpp result is in {d0, d1, d2, XX} */ + convert_8888_to_0565 d2, d1, d0, q14, q15, q3 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head + fetch_mask_pixblock + pixman_composite_over_0565_8_0565_process_pixblock_tail + fetch_src_pixblock + vld1.16 {d10, d11}, [DST_R, :128]! + cache_preload 8, 8 + pixman_composite_over_0565_8_0565_process_pixblock_head + vst1.16 {d28, d29}, [DST_W, :128]! +.endm + +generate_composite_function \ + pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \ + FLAG_DST_READWRITE, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_over_0565_8_0565_process_pixblock_head, \ + pixman_composite_over_0565_8_0565_process_pixblock_tail, \ + pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 10, /* dst_r_basereg */ \ + 8, /* src_basereg */ \ + 15 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_over_0565_n_0565_init + add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) + vpush {d8-d15} + vld1.32 {d15[0]}, [DUMMY] + vdup.8 d15, d15[3] +.endm + +.macro pixman_composite_over_0565_n_0565_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \ + FLAG_DST_READWRITE, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_0565_n_0565_init, \ + pixman_composite_over_0565_n_0565_cleanup, \ + pixman_composite_over_0565_8_0565_process_pixblock_head, \ + pixman_composite_over_0565_8_0565_process_pixblock_tail, \ + pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 10, /* dst_r_basereg */ \ + 8, /* src_basereg */ \ + 15 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_add_0565_8_0565_process_pixblock_head + /* mask is in d15 */ + convert_0565_to_x888 q4, d2, d1, d0 + convert_0565_to_x888 q5, d6, d5, d4 + /* source pixel data is in {d0, d1, d2, XX} */ + /* destination pixel data is in {d4, d5, d6, XX} */ + vmull.u8 q6, d15, d2 + vmull.u8 q5, d15, d1 + vmull.u8 q4, d15, d0 + vrshr.u16 q12, q6, #8 + vrshr.u16 q11, q5, #8 + vrshr.u16 q10, q4, #8 + vraddhn.u16 d2, q6, q12 + vraddhn.u16 d1, q5, q11 + vraddhn.u16 d0, q4, q10 +.endm + +.macro pixman_composite_add_0565_8_0565_process_pixblock_tail + vqadd.u8 q0, q0, q2 + vqadd.u8 q1, q1, q3 + /* 32bpp result is in {d0, d1, d2, XX} */ + convert_8888_to_0565 d2, d1, d0, q14, q15, q3 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head + fetch_mask_pixblock + pixman_composite_add_0565_8_0565_process_pixblock_tail + fetch_src_pixblock + vld1.16 {d10, d11}, [DST_R, :128]! + cache_preload 8, 8 + pixman_composite_add_0565_8_0565_process_pixblock_head + vst1.16 {d28, d29}, [DST_W, :128]! +.endm + +generate_composite_function \ + pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \ + FLAG_DST_READWRITE, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_add_0565_8_0565_process_pixblock_head, \ + pixman_composite_add_0565_8_0565_process_pixblock_tail, \ + pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 10, /* dst_r_basereg */ \ + 8, /* src_basereg */ \ + 15 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_out_reverse_8_0565_process_pixblock_head + /* mask is in d15 */ + convert_0565_to_x888 q5, d6, d5, d4 + /* destination pixel data is in {d4, d5, d6, xx} */ + vmvn.8 d24, d15 /* get inverted alpha */ + /* now do alpha blending */ + vmull.u8 q8, d24, d4 + vmull.u8 q9, d24, d5 + vmull.u8 q10, d24, d6 +.endm + +.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vraddhn.u16 d0, q14, q8 + vraddhn.u16 d1, q15, q9 + vraddhn.u16 d2, q12, q10 + /* 32bpp result is in {d0, d1, d2, XX} */ + convert_8888_to_0565 d2, d1, d0, q14, q15, q3 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head + fetch_src_pixblock + pixman_composite_out_reverse_8_0565_process_pixblock_tail + vld1.16 {d10, d11}, [DST_R, :128]! + cache_preload 8, 8 + pixman_composite_out_reverse_8_0565_process_pixblock_head + vst1.16 {d28, d29}, [DST_W, :128]! +.endm + +generate_composite_function \ + pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \ + FLAG_DST_READWRITE, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_out_reverse_8_0565_process_pixblock_head, \ + pixman_composite_out_reverse_8_0565_process_pixblock_tail, \ + pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 10, /* dst_r_basereg */ \ + 15, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +generate_composite_function_nearest_scanline \ + pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init, \ + default_cleanup, \ + pixman_composite_over_8888_8888_process_pixblock_head, \ + pixman_composite_over_8888_8888_process_pixblock_tail, \ + pixman_composite_over_8888_8888_process_pixblock_tail_head + +generate_composite_function_nearest_scanline \ + pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init, \ + default_cleanup, \ + pixman_composite_over_8888_0565_process_pixblock_head, \ + pixman_composite_over_8888_0565_process_pixblock_tail, \ + pixman_composite_over_8888_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 24 /* mask_basereg */ + +generate_composite_function_nearest_scanline \ + pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_8888_0565_process_pixblock_head, \ + pixman_composite_src_8888_0565_process_pixblock_tail, \ + pixman_composite_src_8888_0565_process_pixblock_tail_head + +generate_composite_function_nearest_scanline \ + pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_0565_8888_process_pixblock_head, \ + pixman_composite_src_0565_8888_process_pixblock_tail, \ + pixman_composite_src_0565_8888_process_pixblock_tail_head + +generate_composite_function_nearest_scanline \ + pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_over_8888_8_0565_process_pixblock_head, \ + pixman_composite_over_8888_8_0565_process_pixblock_tail, \ + pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 8, /* src_basereg */ \ + 24 /* mask_basereg */ + +generate_composite_function_nearest_scanline \ + pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \ + FLAG_DST_READWRITE, \ + 8, /* number of pixels, processed in a single block */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_over_0565_8_0565_process_pixblock_head, \ + pixman_composite_over_0565_8_0565_process_pixblock_tail, \ + pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 10, /* dst_r_basereg */ \ + 8, /* src_basereg */ \ + 15 /* mask_basereg */ diff --git a/pixman/pixman/pixman-arm-neon.c b/pixman/pixman/pixman-arm-neon.c index 7d6c83775..3e0c0d1c2 100644 --- a/pixman/pixman/pixman-arm-neon.c +++ b/pixman/pixman/pixman-arm-neon.c @@ -122,6 +122,11 @@ PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, SRC, PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 0565_8888, SRC, uint16_t, uint32_t) +PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_0565, + OVER, uint32_t, uint16_t) +PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 0565_8_0565, + OVER, uint16_t, uint16_t) + void pixman_composite_src_n_8_asm_neon (int32_t w, int32_t h, @@ -332,6 +337,12 @@ static const pixman_fast_path_t arm_neon_fast_paths[] = SIMPLE_NEAREST_FAST_PATH_PAD (SRC, r5g6b5, a8r8g8b8, neon_0565_8888), SIMPLE_NEAREST_FAST_PATH_PAD (SRC, b5g6r5, a8b8g8r8, neon_0565_8888), + PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_8_0565), + PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_8_0565), + + PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, neon_0565_8_0565), + PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, neon_0565_8_0565), + { PIXMAN_OP_NONE }, }; diff --git a/pixman/pixman/pixman-fast-path.c b/pixman/pixman/pixman-fast-path.c index 4cb8321aa..92f030871 100644 --- a/pixman/pixman/pixman-fast-path.c +++ b/pixman/pixman/pixman-fast-path.c @@ -1,2227 +1,2228 @@ -/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
-/*
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007 Red Hat, Inc.
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of SuSE not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission. SuSE makes no representations about the
- * suitability of this software for any purpose. It is provided "as is"
- * without express or implied warranty.
- *
- * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
- * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
- * Author: Keith Packard, SuSE, Inc.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include <string.h>
-#include <stdlib.h>
-#include "pixman-private.h"
-#include "pixman-combine32.h"
-#include "pixman-fast-path.h"
-
-static force_inline uint32_t
-fetch_24 (uint8_t *a)
-{
- if (((unsigned long)a) & 1)
- {
-#ifdef WORDS_BIGENDIAN
- return (*a << 16) | (*(uint16_t *)(a + 1));
-#else
- return *a | (*(uint16_t *)(a + 1) << 8);
-#endif
- }
- else
- {
-#ifdef WORDS_BIGENDIAN
- return (*(uint16_t *)a << 8) | *(a + 2);
-#else
- return *(uint16_t *)a | (*(a + 2) << 16);
-#endif
- }
-}
-
-static force_inline void
-store_24 (uint8_t *a,
- uint32_t v)
-{
- if (((unsigned long)a) & 1)
- {
-#ifdef WORDS_BIGENDIAN
- *a = (uint8_t) (v >> 16);
- *(uint16_t *)(a + 1) = (uint16_t) (v);
-#else
- *a = (uint8_t) (v);
- *(uint16_t *)(a + 1) = (uint16_t) (v >> 8);
-#endif
- }
- else
- {
-#ifdef WORDS_BIGENDIAN
- *(uint16_t *)a = (uint16_t)(v >> 8);
- *(a + 2) = (uint8_t)v;
-#else
- *(uint16_t *)a = (uint16_t)v;
- *(a + 2) = (uint8_t)(v >> 16);
-#endif
- }
-}
-
-static force_inline uint32_t
-over (uint32_t src,
- uint32_t dest)
-{
- uint32_t a = ~src >> 24;
-
- UN8x4_MUL_UN8_ADD_UN8x4 (dest, a, src);
-
- return dest;
-}
-
-static uint32_t
-in (uint32_t x,
- uint8_t y)
-{
- uint16_t a = y;
-
- UN8x4_MUL_UN8 (x, a);
-
- return x;
-}
-
-/*
- * Naming convention:
- *
- * op_src_mask_dest
- */
-static void
-fast_composite_over_x888_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *src, *src_line;
- uint32_t *dst, *dst_line;
- uint8_t *mask, *mask_line;
- int src_stride, mask_stride, dst_stride;
- uint8_t m;
- uint32_t s, d;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- src = src_line;
- src_line += src_stride;
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
-
- w = width;
- while (w--)
- {
- m = *mask++;
- if (m)
- {
- s = *src | 0xff000000;
-
- if (m == 0xff)
- {
- *dst = s;
- }
- else
- {
- d = in (s, m);
- *dst = over (d, *dst);
- }
- }
- src++;
- dst++;
- }
- }
-}
-
-static void
-fast_composite_in_n_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dest_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint8_t *dst_line, *dst;
- uint8_t *mask_line, *mask, m;
- int dst_stride, mask_stride;
- int32_t w;
- uint16_t t;
-
- src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
-
- srca = src >> 24;
-
- PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- if (srca == 0xff)
- {
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- m = *mask++;
-
- if (m == 0)
- *dst = 0;
- else if (m != 0xff)
- *dst = MUL_UN8 (m, *dst, t);
-
- dst++;
- }
- }
- }
- else
- {
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- m = *mask++;
- m = MUL_UN8 (m, srca, t);
-
- if (m == 0)
- *dst = 0;
- else if (m != 0xff)
- *dst = MUL_UN8 (m, *dst, t);
-
- dst++;
- }
- }
- }
-}
-
-static void
-fast_composite_in_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dest_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- uint8_t *src_line, *src;
- int dst_stride, src_stride;
- int32_t w;
- uint8_t s;
- uint16_t t;
-
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
-
- if (s == 0)
- *dst = 0;
- else if (s != 0xff)
- *dst = MUL_UN8 (s, *dst, t);
-
- dst++;
- }
- }
-}
-
-static void
-fast_composite_over_n_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint32_t *dst_line, *dst, d;
- uint8_t *mask_line, *mask, m;
- int dst_stride, mask_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- m = *mask++;
- if (m == 0xff)
- {
- if (srca == 0xff)
- *dst = src;
- else
- *dst = over (src, *dst);
- }
- else if (m)
- {
- d = in (src, m);
- *dst = over (d, *dst);
- }
- dst++;
- }
- }
-}
-
-static void
-fast_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, s;
- uint32_t *dst_line, *dst, d;
- uint32_t *mask_line, *mask, ma;
- int dst_stride, mask_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- ma = *mask++;
-
- if (ma)
- {
- d = *dst;
- s = src;
-
- UN8x4_MUL_UN8x4_ADD_UN8x4 (s, ma, d);
-
- *dst = s;
- }
-
- dst++;
- }
- }
-}
-
-static void
-fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca, s;
- uint32_t *dst_line, *dst, d;
- uint32_t *mask_line, *mask, ma;
- int dst_stride, mask_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- ma = *mask++;
- if (ma == 0xffffffff)
- {
- if (srca == 0xff)
- *dst = src;
- else
- *dst = over (src, *dst);
- }
- else if (ma)
- {
- d = *dst;
- s = src;
-
- UN8x4_MUL_UN8x4 (s, ma);
- UN8x4_MUL_UN8 (ma, srca);
- ma = ~ma;
- UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
-
- *dst = d;
- }
-
- dst++;
- }
- }
-}
-
-static void
-fast_composite_over_n_8_0888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint8_t *dst_line, *dst;
- uint32_t d;
- uint8_t *mask_line, *mask, m;
- int dst_stride, mask_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- m = *mask++;
- if (m == 0xff)
- {
- if (srca == 0xff)
- {
- d = src;
- }
- else
- {
- d = fetch_24 (dst);
- d = over (src, d);
- }
- store_24 (dst, d);
- }
- else if (m)
- {
- d = over (in (src, m), fetch_24 (dst));
- store_24 (dst, d);
- }
- dst += 3;
- }
- }
-}
-
-static void
-fast_composite_over_n_8_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint16_t *dst_line, *dst;
- uint32_t d;
- uint8_t *mask_line, *mask, m;
- int dst_stride, mask_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- m = *mask++;
- if (m == 0xff)
- {
- if (srca == 0xff)
- {
- d = src;
- }
- else
- {
- d = *dst;
- d = over (src, CONVERT_0565_TO_0888 (d));
- }
- *dst = CONVERT_8888_TO_0565 (d);
- }
- else if (m)
- {
- d = *dst;
- d = over (in (src, m), CONVERT_0565_TO_0888 (d));
- *dst = CONVERT_8888_TO_0565 (d);
- }
- dst++;
- }
- }
-}
-
-static void
-fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca, s;
- uint16_t src16;
- uint16_t *dst_line, *dst;
- uint32_t d;
- uint32_t *mask_line, *mask, ma;
- int dst_stride, mask_stride;
- int32_t w;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- src16 = CONVERT_8888_TO_0565 (src);
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- ma = *mask++;
- if (ma == 0xffffffff)
- {
- if (srca == 0xff)
- {
- *dst = src16;
- }
- else
- {
- d = *dst;
- d = over (src, CONVERT_0565_TO_0888 (d));
- *dst = CONVERT_8888_TO_0565 (d);
- }
- }
- else if (ma)
- {
- d = *dst;
- d = CONVERT_0565_TO_0888 (d);
-
- s = src;
-
- UN8x4_MUL_UN8x4 (s, ma);
- UN8x4_MUL_UN8 (ma, srca);
- ma = ~ma;
- UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
-
- *dst = CONVERT_8888_TO_0565 (d);
- }
- dst++;
- }
- }
-}
-
-static void
-fast_composite_over_8888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src, s;
- int dst_stride, src_stride;
- uint8_t a;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
- a = s >> 24;
- if (a == 0xff)
- *dst = s;
- else if (s)
- *dst = over (s, *dst);
- dst++;
- }
- }
-}
-
-static void
-fast_composite_src_x888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src;
- int dst_stride, src_stride;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- *dst++ = (*src++) | 0xff000000;
- }
-}
-
-#if 0
-static void
-fast_composite_over_8888_0888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- uint32_t d;
- uint32_t *src_line, *src, s;
- uint8_t a;
- int dst_stride, src_stride;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
- a = s >> 24;
- if (a)
- {
- if (a == 0xff)
- d = s;
- else
- d = over (s, fetch_24 (dst));
-
- store_24 (dst, d);
- }
- dst += 3;
- }
- }
-}
-#endif
-
-static void
-fast_composite_over_8888_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint16_t *dst_line, *dst;
- uint32_t d;
- uint32_t *src_line, *src, s;
- uint8_t a;
- int dst_stride, src_stride;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
- a = s >> 24;
- if (s)
- {
- if (a == 0xff)
- {
- d = s;
- }
- else
- {
- d = *dst;
- d = over (s, CONVERT_0565_TO_0888 (d));
- }
- *dst = CONVERT_8888_TO_0565 (d);
- }
- dst++;
- }
- }
-}
-
-static void
-fast_composite_src_x888_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint16_t *dst_line, *dst;
- uint32_t *src_line, *src, s;
- int dst_stride, src_stride;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
- *dst = CONVERT_8888_TO_0565 (s);
- dst++;
- }
- }
-}
-
-static void
-fast_composite_add_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- uint8_t *src_line, *src;
- int dst_stride, src_stride;
- int32_t w;
- uint8_t s, d;
- uint16_t t;
-
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
- if (s)
- {
- if (s != 0xff)
- {
- d = *dst;
- t = d + s;
- s = t | (0 - (t >> 8));
- }
- *dst = s;
- }
- dst++;
- }
- }
-}
-
-static void
-fast_composite_add_8888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src;
- int dst_stride, src_stride;
- int32_t w;
- uint32_t s, d;
-
- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- s = *src++;
- if (s)
- {
- if (s != 0xffffffff)
- {
- d = *dst;
- if (d)
- UN8x4_ADD_UN8x4 (s, d);
- }
- *dst = s;
- }
- dst++;
- }
- }
-}
-
-static void
-fast_composite_add_n_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint8_t *dst_line, *dst;
- uint8_t *mask_line, *mask;
- int dst_stride, mask_stride;
- int32_t w;
- uint32_t src;
- uint8_t sa;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
- sa = (src >> 24);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- while (w--)
- {
- uint16_t tmp;
- uint16_t a;
- uint32_t m, d;
- uint32_t r;
-
- a = *mask++;
- d = *dst;
-
- m = MUL_UN8 (sa, a, tmp);
- r = ADD_UN8 (m, d, tmp);
-
- *dst++ = r;
- }
- }
-}
-
-#ifdef WORDS_BIGENDIAN
-#define CREATE_BITMASK(n) (0x80000000 >> (n))
-#define UPDATE_BITMASK(n) ((n) >> 1)
-#else
-#define CREATE_BITMASK(n) (1 << (n))
-#define UPDATE_BITMASK(n) ((n) << 1)
-#endif
-
-#define TEST_BIT(p, n) \
- (*((p) + ((n) >> 5)) & CREATE_BITMASK ((n) & 31))
-#define SET_BIT(p, n) \
- do { *((p) + ((n) >> 5)) |= CREATE_BITMASK ((n) & 31); } while (0);
-
-static void
-fast_composite_add_1000_1000 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line, *dst;
- uint32_t *src_line, *src;
- int dst_stride, src_stride;
- int32_t w;
-
- PIXMAN_IMAGE_GET_LINE (src_image, 0, src_y, uint32_t,
- src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (dst_image, 0, dest_y, uint32_t,
- dst_stride, dst_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- while (w--)
- {
- /*
- * TODO: improve performance by processing uint32_t data instead
- * of individual bits
- */
- if (TEST_BIT (src, src_x + w))
- SET_BIT (dst, dest_x + w);
- }
- }
-}
-
-static void
-fast_composite_over_n_1_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint32_t *dst, *dst_line;
- uint32_t *mask, *mask_line;
- int mask_stride, dst_stride;
- uint32_t bitcache, bitmask;
- int32_t w;
-
- if (width <= 0)
- return;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t,
- dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
- mask_stride, mask_line, 1);
- mask_line += mask_x >> 5;
-
- if (srca == 0xff)
- {
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (mask_x & 31);
-
- while (w--)
- {
- if (bitmask == 0)
- {
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (0);
- }
- if (bitcache & bitmask)
- *dst = src;
- bitmask = UPDATE_BITMASK (bitmask);
- dst++;
- }
- }
- }
- else
- {
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (mask_x & 31);
-
- while (w--)
- {
- if (bitmask == 0)
- {
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (0);
- }
- if (bitcache & bitmask)
- *dst = over (src, *dst);
- bitmask = UPDATE_BITMASK (bitmask);
- dst++;
- }
- }
- }
-}
-
-static void
-fast_composite_over_n_1_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint16_t *dst, *dst_line;
- uint32_t *mask, *mask_line;
- int mask_stride, dst_stride;
- uint32_t bitcache, bitmask;
- int32_t w;
- uint32_t d;
- uint16_t src565;
-
- if (width <= 0)
- return;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t,
- dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
- mask_stride, mask_line, 1);
- mask_line += mask_x >> 5;
-
- if (srca == 0xff)
- {
- src565 = CONVERT_8888_TO_0565 (src);
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (mask_x & 31);
-
- while (w--)
- {
- if (bitmask == 0)
- {
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (0);
- }
- if (bitcache & bitmask)
- *dst = src565;
- bitmask = UPDATE_BITMASK (bitmask);
- dst++;
- }
- }
- }
- else
- {
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (mask_x & 31);
-
- while (w--)
- {
- if (bitmask == 0)
- {
- bitcache = *mask++;
- bitmask = CREATE_BITMASK (0);
- }
- if (bitcache & bitmask)
- {
- d = over (src, CONVERT_0565_TO_0888 (*dst));
- *dst = CONVERT_8888_TO_0565 (d);
- }
- bitmask = UPDATE_BITMASK (bitmask);
- dst++;
- }
- }
- }
-}
-
-/*
- * Simple bitblt
- */
-
-static void
-fast_composite_solid_fill (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- if (dst_image->bits.format == PIXMAN_a1)
- {
- src = src >> 31;
- }
- else if (dst_image->bits.format == PIXMAN_a8)
- {
- src = src >> 24;
- }
- else if (dst_image->bits.format == PIXMAN_r5g6b5 ||
- dst_image->bits.format == PIXMAN_b5g6r5)
- {
- src = CONVERT_8888_TO_0565 (src);
- }
-
- pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
- PIXMAN_FORMAT_BPP (dst_image->bits.format),
- dest_x, dest_y,
- width, height,
- src);
-}
-
-static void
-fast_composite_src_memcpy (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- int bpp = PIXMAN_FORMAT_BPP (dst_image->bits.format) / 8;
- uint32_t n_bytes = width * bpp;
- int dst_stride, src_stride;
- uint8_t *dst;
- uint8_t *src;
-
- src_stride = src_image->bits.rowstride * 4;
- dst_stride = dst_image->bits.rowstride * 4;
-
- src = (uint8_t *)src_image->bits.bits + src_y * src_stride + src_x * bpp;
- dst = (uint8_t *)dst_image->bits.bits + dest_y * dst_stride + dest_x * bpp;
-
- while (height--)
- {
- memcpy (dst, src, n_bytes);
-
- dst += dst_stride;
- src += src_stride;
- }
-}
-
-FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER)
-FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE)
-FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD)
-FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL)
-FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER)
-FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE)
-FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD)
-FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL)
-FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER)
-FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE)
-FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD)
-FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL)
-FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL)
-FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER)
-FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE)
-FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD)
-FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL)
-
-/* Use more unrolling for src_0565_0565 because it is typically CPU bound */
-static force_inline void
-scaled_nearest_scanline_565_565_SRC (uint16_t * dst,
- uint16_t * src,
- int32_t w,
- pixman_fixed_t vx,
- pixman_fixed_t unit_x,
- pixman_fixed_t max_vx)
-{
- uint16_t tmp1, tmp2, tmp3, tmp4;
- while ((w -= 4) >= 0)
- {
- tmp1 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- tmp2 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- tmp3 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- tmp4 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- *dst++ = tmp1;
- *dst++ = tmp2;
- *dst++ = tmp3;
- *dst++ = tmp4;
- }
- if (w & 2)
- {
- tmp1 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- tmp2 = src[pixman_fixed_to_int (vx)];
- vx += unit_x;
- *dst++ = tmp1;
- *dst++ = tmp2;
- }
- if (w & 1)
- *dst++ = src[pixman_fixed_to_int (vx)];
-}
-
-FAST_NEAREST_MAINLOOP (565_565_cover_SRC,
- scaled_nearest_scanline_565_565_SRC,
- uint16_t, uint16_t, COVER)
-FAST_NEAREST_MAINLOOP (565_565_none_SRC,
- scaled_nearest_scanline_565_565_SRC,
- uint16_t, uint16_t, NONE)
-FAST_NEAREST_MAINLOOP (565_565_pad_SRC,
- scaled_nearest_scanline_565_565_SRC,
- uint16_t, uint16_t, PAD)
-
-static force_inline uint32_t
-fetch_nearest (pixman_repeat_t src_repeat,
- pixman_format_code_t format,
- uint32_t *src, int x, int src_width)
-{
- if (repeat (src_repeat, &x, src_width))
- {
- if (format == PIXMAN_x8r8g8b8)
- return *(src + x) | 0xff000000;
- else
- return *(src + x);
- }
- else
- {
- return 0;
- }
-}
-
-static force_inline void
-combine_over (uint32_t s, uint32_t *dst)
-{
- if (s)
- {
- uint8_t ia = 0xff - (s >> 24);
-
- if (ia)
- UN8x4_MUL_UN8_ADD_UN8x4 (*dst, ia, s);
- else
- *dst = s;
- }
-}
-
-static force_inline void
-combine_src (uint32_t s, uint32_t *dst)
-{
- *dst = s;
-}
-
-static void
-fast_composite_scaled_nearest (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t *dst_line;
- uint32_t *src_line;
- int dst_stride, src_stride;
- int src_width, src_height;
- pixman_repeat_t src_repeat;
- pixman_fixed_t unit_x, unit_y;
- pixman_format_code_t src_format;
- pixman_vector_t v;
- pixman_fixed_t vy;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- /* pass in 0 instead of src_x and src_y because src_x and src_y need to be
- * transformed from destination space to source space
- */
- PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, uint32_t, src_stride, src_line, 1);
-
- /* reference point is the center of the pixel */
- v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;
- v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;
- v.vector[2] = pixman_fixed_1;
-
- if (!pixman_transform_point_3d (src_image->common.transform, &v))
- return;
-
- unit_x = src_image->common.transform->matrix[0][0];
- unit_y = src_image->common.transform->matrix[1][1];
-
- /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */
- v.vector[0] -= pixman_fixed_e;
- v.vector[1] -= pixman_fixed_e;
-
- src_height = src_image->bits.height;
- src_width = src_image->bits.width;
- src_repeat = src_image->common.repeat;
- src_format = src_image->bits.format;
-
- vy = v.vector[1];
- while (height--)
- {
- pixman_fixed_t vx = v.vector[0];
- int y = pixman_fixed_to_int (vy);
- uint32_t *dst = dst_line;
-
- dst_line += dst_stride;
-
- /* adjust the y location by a unit vector in the y direction
- * this is equivalent to transforming y+1 of the destination point to source space */
- vy += unit_y;
-
- if (!repeat (src_repeat, &y, src_height))
- {
- if (op == PIXMAN_OP_SRC)
- memset (dst, 0, sizeof (*dst) * width);
- }
- else
- {
- int w = width;
-
- uint32_t *src = src_line + y * src_stride;
-
- while (w >= 2)
- {
- uint32_t s1, s2;
- int x1, x2;
-
- x1 = pixman_fixed_to_int (vx);
- vx += unit_x;
-
- x2 = pixman_fixed_to_int (vx);
- vx += unit_x;
-
- w -= 2;
-
- s1 = fetch_nearest (src_repeat, src_format, src, x1, src_width);
- s2 = fetch_nearest (src_repeat, src_format, src, x2, src_width);
-
- if (op == PIXMAN_OP_OVER)
- {
- combine_over (s1, dst++);
- combine_over (s2, dst++);
- }
- else
- {
- combine_src (s1, dst++);
- combine_src (s2, dst++);
- }
- }
-
- while (w--)
- {
- uint32_t s;
- int x;
-
- x = pixman_fixed_to_int (vx);
- vx += unit_x;
-
- s = fetch_nearest (src_repeat, src_format, src, x, src_width);
-
- if (op == PIXMAN_OP_OVER)
- combine_over (s, dst++);
- else
- combine_src (s, dst++);
- }
- }
- }
-}
-
-#define CACHE_LINE_SIZE 64
-
-#define FAST_SIMPLE_ROTATE(suffix, pix_type) \
- \
-static void \
-blt_rotated_90_trivial_##suffix (pix_type *dst, \
- int dst_stride, \
- const pix_type *src, \
- int src_stride, \
- int w, \
- int h) \
-{ \
- int x, y; \
- for (y = 0; y < h; y++) \
- { \
- const pix_type *s = src + (h - y - 1); \
- pix_type *d = dst + dst_stride * y; \
- for (x = 0; x < w; x++) \
- { \
- *d++ = *s; \
- s += src_stride; \
- } \
- } \
-} \
- \
-static void \
-blt_rotated_270_trivial_##suffix (pix_type *dst, \
- int dst_stride, \
- const pix_type *src, \
- int src_stride, \
- int w, \
- int h) \
-{ \
- int x, y; \
- for (y = 0; y < h; y++) \
- { \
- const pix_type *s = src + src_stride * (w - 1) + y; \
- pix_type *d = dst + dst_stride * y; \
- for (x = 0; x < w; x++) \
- { \
- *d++ = *s; \
- s -= src_stride; \
- } \
- } \
-} \
- \
-static void \
-blt_rotated_90_##suffix (pix_type *dst, \
- int dst_stride, \
- const pix_type *src, \
- int src_stride, \
- int W, \
- int H) \
-{ \
- int x; \
- int leading_pixels = 0, trailing_pixels = 0; \
- const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type); \
- \
- /* \
- * split processing into handling destination as TILE_SIZExH cache line \
- * aligned vertical stripes (optimistically assuming that destination \
- * stride is a multiple of cache line, if not - it will be just a bit \
- * slower) \
- */ \
- \
- if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1)) \
- { \
- leading_pixels = TILE_SIZE - (((uintptr_t)dst & \
- (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
- if (leading_pixels > W) \
- leading_pixels = W; \
- \
- /* unaligned leading part NxH (where N < TILE_SIZE) */ \
- blt_rotated_90_trivial_##suffix ( \
- dst, \
- dst_stride, \
- src, \
- src_stride, \
- leading_pixels, \
- H); \
- \
- dst += leading_pixels; \
- src += leading_pixels * src_stride; \
- W -= leading_pixels; \
- } \
- \
- if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1)) \
- { \
- trailing_pixels = (((uintptr_t)(dst + W) & \
- (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
- if (trailing_pixels > W) \
- trailing_pixels = W; \
- W -= trailing_pixels; \
- } \
- \
- for (x = 0; x < W; x += TILE_SIZE) \
- { \
- /* aligned middle part TILE_SIZExH */ \
- blt_rotated_90_trivial_##suffix ( \
- dst + x, \
- dst_stride, \
- src + src_stride * x, \
- src_stride, \
- TILE_SIZE, \
- H); \
- } \
- \
- if (trailing_pixels) \
- { \
- /* unaligned trailing part NxH (where N < TILE_SIZE) */ \
- blt_rotated_90_trivial_##suffix ( \
- dst + W, \
- dst_stride, \
- src + W * src_stride, \
- src_stride, \
- trailing_pixels, \
- H); \
- } \
-} \
- \
-static void \
-blt_rotated_270_##suffix (pix_type *dst, \
- int dst_stride, \
- const pix_type *src, \
- int src_stride, \
- int W, \
- int H) \
-{ \
- int x; \
- int leading_pixels = 0, trailing_pixels = 0; \
- const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type); \
- \
- /* \
- * split processing into handling destination as TILE_SIZExH cache line \
- * aligned vertical stripes (optimistically assuming that destination \
- * stride is a multiple of cache line, if not - it will be just a bit \
- * slower) \
- */ \
- \
- if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1)) \
- { \
- leading_pixels = TILE_SIZE - (((uintptr_t)dst & \
- (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
- if (leading_pixels > W) \
- leading_pixels = W; \
- \
- /* unaligned leading part NxH (where N < TILE_SIZE) */ \
- blt_rotated_270_trivial_##suffix ( \
- dst, \
- dst_stride, \
- src + src_stride * (W - leading_pixels), \
- src_stride, \
- leading_pixels, \
- H); \
- \
- dst += leading_pixels; \
- W -= leading_pixels; \
- } \
- \
- if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1)) \
- { \
- trailing_pixels = (((uintptr_t)(dst + W) & \
- (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
- if (trailing_pixels > W) \
- trailing_pixels = W; \
- W -= trailing_pixels; \
- src += trailing_pixels * src_stride; \
- } \
- \
- for (x = 0; x < W; x += TILE_SIZE) \
- { \
- /* aligned middle part TILE_SIZExH */ \
- blt_rotated_270_trivial_##suffix ( \
- dst + x, \
- dst_stride, \
- src + src_stride * (W - x - TILE_SIZE), \
- src_stride, \
- TILE_SIZE, \
- H); \
- } \
- \
- if (trailing_pixels) \
- { \
- /* unaligned trailing part NxH (where N < TILE_SIZE) */ \
- blt_rotated_270_trivial_##suffix ( \
- dst + W, \
- dst_stride, \
- src - trailing_pixels * src_stride, \
- src_stride, \
- trailing_pixels, \
- H); \
- } \
-} \
- \
-static void \
-fast_composite_rotate_90_##suffix (pixman_implementation_t *imp, \
- pixman_op_t op, \
- pixman_image_t * src_image, \
- pixman_image_t * mask_image, \
- pixman_image_t * dst_image, \
- int32_t src_x, \
- int32_t src_y, \
- int32_t mask_x, \
- int32_t mask_y, \
- int32_t dest_x, \
- int32_t dest_y, \
- int32_t width, \
- int32_t height) \
-{ \
- pix_type *dst_line; \
- pix_type *src_line; \
- int dst_stride, src_stride; \
- int src_x_t, src_y_t; \
- \
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, pix_type, \
- dst_stride, dst_line, 1); \
- src_x_t = -src_y + pixman_fixed_to_int ( \
- src_image->common.transform->matrix[0][2] + \
- pixman_fixed_1 / 2 - pixman_fixed_e) - height;\
- src_y_t = src_x + pixman_fixed_to_int ( \
- src_image->common.transform->matrix[1][2] + \
- pixman_fixed_1 / 2 - pixman_fixed_e); \
- PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type, \
- src_stride, src_line, 1); \
- blt_rotated_90_##suffix (dst_line, dst_stride, src_line, src_stride, \
- width, height); \
-} \
- \
-static void \
-fast_composite_rotate_270_##suffix (pixman_implementation_t *imp, \
- pixman_op_t op, \
- pixman_image_t * src_image, \
- pixman_image_t * mask_image, \
- pixman_image_t * dst_image, \
- int32_t src_x, \
- int32_t src_y, \
- int32_t mask_x, \
- int32_t mask_y, \
- int32_t dest_x, \
- int32_t dest_y, \
- int32_t width, \
- int32_t height) \
-{ \
- pix_type *dst_line; \
- pix_type *src_line; \
- int dst_stride, src_stride; \
- int src_x_t, src_y_t; \
- \
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, pix_type, \
- dst_stride, dst_line, 1); \
- src_x_t = src_y + pixman_fixed_to_int ( \
- src_image->common.transform->matrix[0][2] + \
- pixman_fixed_1 / 2 - pixman_fixed_e); \
- src_y_t = -src_x + pixman_fixed_to_int ( \
- src_image->common.transform->matrix[1][2] + \
- pixman_fixed_1 / 2 - pixman_fixed_e) - width; \
- PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type, \
- src_stride, src_line, 1); \
- blt_rotated_270_##suffix (dst_line, dst_stride, src_line, src_stride, \
- width, height); \
-}
-
-FAST_SIMPLE_ROTATE (8, uint8_t)
-FAST_SIMPLE_ROTATE (565, uint16_t)
-FAST_SIMPLE_ROTATE (8888, uint32_t)
-
-static const pixman_fast_path_t c_fast_paths[] =
-{
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, fast_composite_over_n_8_0565),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, fast_composite_over_n_8_0565),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, fast_composite_over_n_8_0888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, fast_composite_over_n_8_0888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, fast_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, fast_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, fast_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, fast_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, fast_composite_over_n_1_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, fast_composite_over_n_1_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, fast_composite_over_n_1_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, fast_composite_over_n_1_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5, fast_composite_over_n_1_0565),
- PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5, fast_composite_over_n_1_0565),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, fast_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, fast_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, fast_composite_over_n_8888_0565_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, fast_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, fast_composite_over_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, fast_composite_over_n_8888_0565_ca),
- PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, fast_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, fast_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, fast_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, fast_composite_over_x888_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, fast_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, fast_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, fast_composite_over_8888_0565),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, fast_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, fast_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565),
- PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888),
- PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888),
- PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8),
- PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1000_1000),
- PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, a8r8g8b8, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, fast_composite_solid_fill),
- PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, fast_composite_src_x888_8888),
- PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, fast_composite_src_x888_8888),
- PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8x8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8a8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy),
- PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
- PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
- PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
- PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
- PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8),
- PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8),
-
- SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, 8888_8888),
-
- SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, 8888_8888),
-
- SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, 8888_565),
- SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, 8888_565),
-
- SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565),
-
- SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, 8888_8888),
- SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, 8888_8888),
-
- SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565),
-
-#define NEAREST_FAST_PATH(op,s,d) \
- { PIXMAN_OP_ ## op, \
- PIXMAN_ ## s, SCALED_NEAREST_FLAGS, \
- PIXMAN_null, 0, \
- PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
- fast_composite_scaled_nearest, \
- }
-
- NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8),
- NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8),
- NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8),
- NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8),
-
- NEAREST_FAST_PATH (SRC, x8r8g8b8, a8r8g8b8),
- NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8),
- NEAREST_FAST_PATH (SRC, x8b8g8r8, a8b8g8r8),
- NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8),
-
- NEAREST_FAST_PATH (OVER, x8r8g8b8, x8r8g8b8),
- NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8),
- NEAREST_FAST_PATH (OVER, x8b8g8r8, x8b8g8r8),
- NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8),
-
- NEAREST_FAST_PATH (OVER, x8r8g8b8, a8r8g8b8),
- NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8),
- NEAREST_FAST_PATH (OVER, x8b8g8r8, a8b8g8r8),
- NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8),
-
-#define SIMPLE_ROTATE_FLAGS(angle) \
- (FAST_PATH_ROTATE_ ## angle ## _TRANSFORM | \
- FAST_PATH_NEAREST_FILTER | \
- FAST_PATH_SAMPLES_COVER_CLIP | \
- FAST_PATH_STANDARD_FLAGS)
-
-#define SIMPLE_ROTATE_FAST_PATH(op,s,d,suffix) \
- { PIXMAN_OP_ ## op, \
- PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (90), \
- PIXMAN_null, 0, \
- PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
- fast_composite_rotate_90_##suffix, \
- }, \
- { PIXMAN_OP_ ## op, \
- PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (270), \
- PIXMAN_null, 0, \
- PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
- fast_composite_rotate_270_##suffix, \
- }
-
- SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888),
- SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888),
- SIMPLE_ROTATE_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888),
- SIMPLE_ROTATE_FAST_PATH (SRC, r5g6b5, r5g6b5, 565),
- SIMPLE_ROTATE_FAST_PATH (SRC, a8, a8, 8),
-
- { PIXMAN_OP_NONE },
-};
-
-#ifdef WORDS_BIGENDIAN
-#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (32 - (offs) - (n)))
-#else
-#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (offs))
-#endif
-
-static force_inline void
-pixman_fill1_line (uint32_t *dst, int offs, int width, int v)
-{
- if (offs)
- {
- int leading_pixels = 32 - offs;
- if (leading_pixels >= width)
- {
- if (v)
- *dst |= A1_FILL_MASK (width, offs);
- else
- *dst &= ~A1_FILL_MASK (width, offs);
- return;
- }
- else
- {
- if (v)
- *dst++ |= A1_FILL_MASK (leading_pixels, offs);
- else
- *dst++ &= ~A1_FILL_MASK (leading_pixels, offs);
- width -= leading_pixels;
- }
- }
- while (width >= 32)
- {
- if (v)
- *dst++ = 0xFFFFFFFF;
- else
- *dst++ = 0;
- width -= 32;
- }
- if (width > 0)
- {
- if (v)
- *dst |= A1_FILL_MASK (width, 0);
- else
- *dst &= ~A1_FILL_MASK (width, 0);
- }
-}
-
-static void
-pixman_fill1 (uint32_t *bits,
- int stride,
- int x,
- int y,
- int width,
- int height,
- uint32_t xor)
-{
- uint32_t *dst = bits + y * stride + (x >> 5);
- int offs = x & 31;
-
- if (xor & 1)
- {
- while (height--)
- {
- pixman_fill1_line (dst, offs, width, 1);
- dst += stride;
- }
- }
- else
- {
- while (height--)
- {
- pixman_fill1_line (dst, offs, width, 0);
- dst += stride;
- }
- }
-}
-
-static void
-pixman_fill8 (uint32_t *bits,
- int stride,
- int x,
- int y,
- int width,
- int height,
- uint32_t xor)
-{
- int byte_stride = stride * (int) sizeof (uint32_t);
- uint8_t *dst = (uint8_t *) bits;
- uint8_t v = xor & 0xff;
- int i;
-
- dst = dst + y * byte_stride + x;
-
- while (height--)
- {
- for (i = 0; i < width; ++i)
- dst[i] = v;
-
- dst += byte_stride;
- }
-}
-
-static void
-pixman_fill16 (uint32_t *bits,
- int stride,
- int x,
- int y,
- int width,
- int height,
- uint32_t xor)
-{
- int short_stride =
- (stride * (int)sizeof (uint32_t)) / (int)sizeof (uint16_t);
- uint16_t *dst = (uint16_t *)bits;
- uint16_t v = xor & 0xffff;
- int i;
-
- dst = dst + y * short_stride + x;
-
- while (height--)
- {
- for (i = 0; i < width; ++i)
- dst[i] = v;
-
- dst += short_stride;
- }
-}
-
-static void
-pixman_fill32 (uint32_t *bits,
- int stride,
- int x,
- int y,
- int width,
- int height,
- uint32_t xor)
-{
- int i;
-
- bits = bits + y * stride + x;
-
- while (height--)
- {
- for (i = 0; i < width; ++i)
- bits[i] = xor;
-
- bits += stride;
- }
-}
-
-static pixman_bool_t
-fast_path_fill (pixman_implementation_t *imp,
- uint32_t * bits,
- int stride,
- int bpp,
- int x,
- int y,
- int width,
- int height,
- uint32_t xor)
-{
- switch (bpp)
- {
- case 1:
- pixman_fill1 (bits, stride, x, y, width, height, xor);
- break;
-
- case 8:
- pixman_fill8 (bits, stride, x, y, width, height, xor);
- break;
-
- case 16:
- pixman_fill16 (bits, stride, x, y, width, height, xor);
- break;
-
- case 32:
- pixman_fill32 (bits, stride, x, y, width, height, xor);
- break;
-
- default:
- return _pixman_implementation_fill (
- imp->delegate, bits, stride, bpp, x, y, width, height, xor);
- break;
- }
-
- return TRUE;
-}
-
-pixman_implementation_t *
-_pixman_implementation_create_fast_path (pixman_implementation_t *fallback)
-{
- pixman_implementation_t *imp = _pixman_implementation_create (fallback, c_fast_paths);
-
- imp->fill = fast_path_fill;
-
- return imp;
-}
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ +/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of SuSE not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. SuSE makes no representations about the + * suitability of this software for any purpose. It is provided "as is" + * without express or implied warranty. + * + * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE + * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Author: Keith Packard, SuSE, Inc. + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif +#include <string.h> +#include <stdlib.h> +#include "pixman-private.h" +#include "pixman-combine32.h" +#include "pixman-fast-path.h" + +static force_inline uint32_t +fetch_24 (uint8_t *a) +{ + if (((unsigned long)a) & 1) + { +#ifdef WORDS_BIGENDIAN + return (*a << 16) | (*(uint16_t *)(a + 1)); +#else + return *a | (*(uint16_t *)(a + 1) << 8); +#endif + } + else + { +#ifdef WORDS_BIGENDIAN + return (*(uint16_t *)a << 8) | *(a + 2); +#else + return *(uint16_t *)a | (*(a + 2) << 16); +#endif + } +} + +static force_inline void +store_24 (uint8_t *a, + uint32_t v) +{ + if (((unsigned long)a) & 1) + { +#ifdef WORDS_BIGENDIAN + *a = (uint8_t) (v >> 16); + *(uint16_t *)(a + 1) = (uint16_t) (v); +#else + *a = (uint8_t) (v); + *(uint16_t *)(a + 1) = (uint16_t) (v >> 8); +#endif + } + else + { +#ifdef WORDS_BIGENDIAN + *(uint16_t *)a = (uint16_t)(v >> 8); + *(a + 2) = (uint8_t)v; +#else + *(uint16_t *)a = (uint16_t)v; + *(a + 2) = (uint8_t)(v >> 16); +#endif + } +} + +static force_inline uint32_t +over (uint32_t src, + uint32_t dest) +{ + uint32_t a = ~src >> 24; + + UN8x4_MUL_UN8_ADD_UN8x4 (dest, a, src); + + return dest; +} + +static uint32_t +in (uint32_t x, + uint8_t y) +{ + uint16_t a = y; + + UN8x4_MUL_UN8 (x, a); + + return x; +} + +/* + * Naming convention: + * + * op_src_mask_dest + */ +static void +fast_composite_over_x888_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *src, *src_line; + uint32_t *dst, *dst_line; + uint8_t *mask, *mask_line; + int src_stride, mask_stride, dst_stride; + uint8_t m; + uint32_t s, d; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + src = src_line; + src_line += src_stride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + w = width; + while (w--) + { + m = *mask++; + if (m) + { + s = *src | 0xff000000; + + if (m == 0xff) + { + *dst = s; + } + else + { + d = in (s, m); + *dst = over (d, *dst); + } + } + src++; + dst++; + } + } +} + +static void +fast_composite_in_n_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dest_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint8_t *dst_line, *dst; + uint8_t *mask_line, *mask, m; + int dst_stride, mask_stride; + int32_t w; + uint16_t t; + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + + srca = src >> 24; + + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + if (srca == 0xff) + { + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + m = *mask++; + + if (m == 0) + *dst = 0; + else if (m != 0xff) + *dst = MUL_UN8 (m, *dst, t); + + dst++; + } + } + } + else + { + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + m = *mask++; + m = MUL_UN8 (m, srca, t); + + if (m == 0) + *dst = 0; + else if (m != 0xff) + *dst = MUL_UN8 (m, *dst, t); + + dst++; + } + } + } +} + +static void +fast_composite_in_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dest_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + uint8_t s; + uint16_t t; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + + if (s == 0) + *dst = 0; + else if (s != 0xff) + *dst = MUL_UN8 (s, *dst, t); + + dst++; + } + } +} + +static void +fast_composite_over_n_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint32_t *dst_line, *dst, d; + uint8_t *mask_line, *mask, m; + int dst_stride, mask_stride; + int32_t w; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + m = *mask++; + if (m == 0xff) + { + if (srca == 0xff) + *dst = src; + else + *dst = over (src, *dst); + } + else if (m) + { + d = in (src, m); + *dst = over (d, *dst); + } + dst++; + } + } +} + +static void +fast_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, s; + uint32_t *dst_line, *dst, d; + uint32_t *mask_line, *mask, ma; + int dst_stride, mask_stride; + int32_t w; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + ma = *mask++; + + if (ma) + { + d = *dst; + s = src; + + UN8x4_MUL_UN8x4_ADD_UN8x4 (s, ma, d); + + *dst = s; + } + + dst++; + } + } +} + +static void +fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca, s; + uint32_t *dst_line, *dst, d; + uint32_t *mask_line, *mask, ma; + int dst_stride, mask_stride; + int32_t w; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + ma = *mask++; + if (ma == 0xffffffff) + { + if (srca == 0xff) + *dst = src; + else + *dst = over (src, *dst); + } + else if (ma) + { + d = *dst; + s = src; + + UN8x4_MUL_UN8x4 (s, ma); + UN8x4_MUL_UN8 (ma, srca); + ma = ~ma; + UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s); + + *dst = d; + } + + dst++; + } + } +} + +static void +fast_composite_over_n_8_0888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint8_t *dst_line, *dst; + uint32_t d; + uint8_t *mask_line, *mask, m; + int dst_stride, mask_stride; + int32_t w; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + m = *mask++; + if (m == 0xff) + { + if (srca == 0xff) + { + d = src; + } + else + { + d = fetch_24 (dst); + d = over (src, d); + } + store_24 (dst, d); + } + else if (m) + { + d = over (in (src, m), fetch_24 (dst)); + store_24 (dst, d); + } + dst += 3; + } + } +} + +static void +fast_composite_over_n_8_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint16_t *dst_line, *dst; + uint32_t d; + uint8_t *mask_line, *mask, m; + int dst_stride, mask_stride; + int32_t w; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + m = *mask++; + if (m == 0xff) + { + if (srca == 0xff) + { + d = src; + } + else + { + d = *dst; + d = over (src, CONVERT_0565_TO_0888 (d)); + } + *dst = CONVERT_8888_TO_0565 (d); + } + else if (m) + { + d = *dst; + d = over (in (src, m), CONVERT_0565_TO_0888 (d)); + *dst = CONVERT_8888_TO_0565 (d); + } + dst++; + } + } +} + +static void +fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca, s; + uint16_t src16; + uint16_t *dst_line, *dst; + uint32_t d; + uint32_t *mask_line, *mask, ma; + int dst_stride, mask_stride; + int32_t w; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + src16 = CONVERT_8888_TO_0565 (src); + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + ma = *mask++; + if (ma == 0xffffffff) + { + if (srca == 0xff) + { + *dst = src16; + } + else + { + d = *dst; + d = over (src, CONVERT_0565_TO_0888 (d)); + *dst = CONVERT_8888_TO_0565 (d); + } + } + else if (ma) + { + d = *dst; + d = CONVERT_0565_TO_0888 (d); + + s = src; + + UN8x4_MUL_UN8x4 (s, ma); + UN8x4_MUL_UN8 (ma, srca); + ma = ~ma; + UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s); + + *dst = CONVERT_8888_TO_0565 (d); + } + dst++; + } + } +} + +static void +fast_composite_over_8888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + uint8_t a; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + a = s >> 24; + if (a == 0xff) + *dst = s; + else if (s) + *dst = over (s, *dst); + dst++; + } + } +} + +static void +fast_composite_src_x888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + *dst++ = (*src++) | 0xff000000; + } +} + +#if 0 +static void +fast_composite_over_8888_0888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint32_t d; + uint32_t *src_line, *src, s; + uint8_t a; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + a = s >> 24; + if (a) + { + if (a == 0xff) + d = s; + else + d = over (s, fetch_24 (dst)); + + store_24 (dst, d); + } + dst += 3; + } + } +} +#endif + +static void +fast_composite_over_8888_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint16_t *dst_line, *dst; + uint32_t d; + uint32_t *src_line, *src, s; + uint8_t a; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + a = s >> 24; + if (s) + { + if (a == 0xff) + { + d = s; + } + else + { + d = *dst; + d = over (s, CONVERT_0565_TO_0888 (d)); + } + *dst = CONVERT_8888_TO_0565 (d); + } + dst++; + } + } +} + +static void +fast_composite_src_x888_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint16_t *dst_line, *dst; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + *dst = CONVERT_8888_TO_0565 (s); + dst++; + } + } +} + +static void +fast_composite_add_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + uint8_t s, d; + uint16_t t; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + if (s) + { + if (s != 0xff) + { + d = *dst; + t = d + s; + s = t | (0 - (t >> 8)); + } + *dst = s; + } + dst++; + } + } +} + +static void +fast_composite_add_8888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + uint32_t s, d; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + if (s) + { + if (s != 0xffffffff) + { + d = *dst; + if (d) + UN8x4_ADD_UN8x4 (s, d); + } + *dst = s; + } + dst++; + } + } +} + +static void +fast_composite_add_n_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + uint32_t src; + uint8_t sa; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + sa = (src >> 24); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + uint16_t tmp; + uint16_t a; + uint32_t m, d; + uint32_t r; + + a = *mask++; + d = *dst; + + m = MUL_UN8 (sa, a, tmp); + r = ADD_UN8 (m, d, tmp); + + *dst++ = r; + } + } +} + +#ifdef WORDS_BIGENDIAN +#define CREATE_BITMASK(n) (0x80000000 >> (n)) +#define UPDATE_BITMASK(n) ((n) >> 1) +#else +#define CREATE_BITMASK(n) (1 << (n)) +#define UPDATE_BITMASK(n) ((n) << 1) +#endif + +#define TEST_BIT(p, n) \ + (*((p) + ((n) >> 5)) & CREATE_BITMASK ((n) & 31)) +#define SET_BIT(p, n) \ + do { *((p) + ((n) >> 5)) |= CREATE_BITMASK ((n) & 31); } while (0); + +static void +fast_composite_add_1000_1000 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (src_image, 0, src_y, uint32_t, + src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, 0, dest_y, uint32_t, + dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + /* + * TODO: improve performance by processing uint32_t data instead + * of individual bits + */ + if (TEST_BIT (src, src_x + w)) + SET_BIT (dst, dest_x + w); + } + } +} + +static void +fast_composite_over_n_1_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint32_t *dst, *dst_line; + uint32_t *mask, *mask_line; + int mask_stride, dst_stride; + uint32_t bitcache, bitmask; + int32_t w; + + if (width <= 0) + return; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, + dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t, + mask_stride, mask_line, 1); + mask_line += mask_x >> 5; + + if (srca == 0xff) + { + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + bitcache = *mask++; + bitmask = CREATE_BITMASK (mask_x & 31); + + while (w--) + { + if (bitmask == 0) + { + bitcache = *mask++; + bitmask = CREATE_BITMASK (0); + } + if (bitcache & bitmask) + *dst = src; + bitmask = UPDATE_BITMASK (bitmask); + dst++; + } + } + } + else + { + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + bitcache = *mask++; + bitmask = CREATE_BITMASK (mask_x & 31); + + while (w--) + { + if (bitmask == 0) + { + bitcache = *mask++; + bitmask = CREATE_BITMASK (0); + } + if (bitcache & bitmask) + *dst = over (src, *dst); + bitmask = UPDATE_BITMASK (bitmask); + dst++; + } + } + } +} + +static void +fast_composite_over_n_1_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint16_t *dst, *dst_line; + uint32_t *mask, *mask_line; + int mask_stride, dst_stride; + uint32_t bitcache, bitmask; + int32_t w; + uint32_t d; + uint16_t src565; + + if (width <= 0) + return; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, + dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t, + mask_stride, mask_line, 1); + mask_line += mask_x >> 5; + + if (srca == 0xff) + { + src565 = CONVERT_8888_TO_0565 (src); + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + bitcache = *mask++; + bitmask = CREATE_BITMASK (mask_x & 31); + + while (w--) + { + if (bitmask == 0) + { + bitcache = *mask++; + bitmask = CREATE_BITMASK (0); + } + if (bitcache & bitmask) + *dst = src565; + bitmask = UPDATE_BITMASK (bitmask); + dst++; + } + } + } + else + { + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + bitcache = *mask++; + bitmask = CREATE_BITMASK (mask_x & 31); + + while (w--) + { + if (bitmask == 0) + { + bitcache = *mask++; + bitmask = CREATE_BITMASK (0); + } + if (bitcache & bitmask) + { + d = over (src, CONVERT_0565_TO_0888 (*dst)); + *dst = CONVERT_8888_TO_0565 (d); + } + bitmask = UPDATE_BITMASK (bitmask); + dst++; + } + } + } +} + +/* + * Simple bitblt + */ + +static void +fast_composite_solid_fill (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + if (dst_image->bits.format == PIXMAN_a1) + { + src = src >> 31; + } + else if (dst_image->bits.format == PIXMAN_a8) + { + src = src >> 24; + } + else if (dst_image->bits.format == PIXMAN_r5g6b5 || + dst_image->bits.format == PIXMAN_b5g6r5) + { + src = CONVERT_8888_TO_0565 (src); + } + + pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride, + PIXMAN_FORMAT_BPP (dst_image->bits.format), + dest_x, dest_y, + width, height, + src); +} + +static void +fast_composite_src_memcpy (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + int bpp = PIXMAN_FORMAT_BPP (dst_image->bits.format) / 8; + uint32_t n_bytes = width * bpp; + int dst_stride, src_stride; + uint8_t *dst; + uint8_t *src; + + src_stride = src_image->bits.rowstride * 4; + dst_stride = dst_image->bits.rowstride * 4; + + src = (uint8_t *)src_image->bits.bits + src_y * src_stride + src_x * bpp; + dst = (uint8_t *)dst_image->bits.bits + dest_y * dst_stride + dest_x * bpp; + + while (height--) + { + memcpy (dst, src, n_bytes); + + dst += dst_stride; + src += src_stride; + } +} + +FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER) +FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE) +FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD) +FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL) +FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER) +FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE) +FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD) +FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL) +FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER) +FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE) +FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD) +FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL) +FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL) +FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER) +FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE) +FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD) +FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL) + +/* Use more unrolling for src_0565_0565 because it is typically CPU bound */ +static force_inline void +scaled_nearest_scanline_565_565_SRC (uint16_t * dst, + const uint16_t * src, + int32_t w, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx, + pixman_bool_t fully_transparent_src) +{ + uint16_t tmp1, tmp2, tmp3, tmp4; + while ((w -= 4) >= 0) + { + tmp1 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp2 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp3 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp4 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + *dst++ = tmp1; + *dst++ = tmp2; + *dst++ = tmp3; + *dst++ = tmp4; + } + if (w & 2) + { + tmp1 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp2 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + *dst++ = tmp1; + *dst++ = tmp2; + } + if (w & 1) + *dst++ = src[pixman_fixed_to_int (vx)]; +} + +FAST_NEAREST_MAINLOOP (565_565_cover_SRC, + scaled_nearest_scanline_565_565_SRC, + uint16_t, uint16_t, COVER) +FAST_NEAREST_MAINLOOP (565_565_none_SRC, + scaled_nearest_scanline_565_565_SRC, + uint16_t, uint16_t, NONE) +FAST_NEAREST_MAINLOOP (565_565_pad_SRC, + scaled_nearest_scanline_565_565_SRC, + uint16_t, uint16_t, PAD) + +static force_inline uint32_t +fetch_nearest (pixman_repeat_t src_repeat, + pixman_format_code_t format, + uint32_t *src, int x, int src_width) +{ + if (repeat (src_repeat, &x, src_width)) + { + if (format == PIXMAN_x8r8g8b8) + return *(src + x) | 0xff000000; + else + return *(src + x); + } + else + { + return 0; + } +} + +static force_inline void +combine_over (uint32_t s, uint32_t *dst) +{ + if (s) + { + uint8_t ia = 0xff - (s >> 24); + + if (ia) + UN8x4_MUL_UN8_ADD_UN8x4 (*dst, ia, s); + else + *dst = s; + } +} + +static force_inline void +combine_src (uint32_t s, uint32_t *dst) +{ + *dst = s; +} + +static void +fast_composite_scaled_nearest (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line; + uint32_t *src_line; + int dst_stride, src_stride; + int src_width, src_height; + pixman_repeat_t src_repeat; + pixman_fixed_t unit_x, unit_y; + pixman_format_code_t src_format; + pixman_vector_t v; + pixman_fixed_t vy; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + /* pass in 0 instead of src_x and src_y because src_x and src_y need to be + * transformed from destination space to source space + */ + PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, uint32_t, src_stride, src_line, 1); + + /* reference point is the center of the pixel */ + v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2; + v.vector[2] = pixman_fixed_1; + + if (!pixman_transform_point_3d (src_image->common.transform, &v)) + return; + + unit_x = src_image->common.transform->matrix[0][0]; + unit_y = src_image->common.transform->matrix[1][1]; + + /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */ + v.vector[0] -= pixman_fixed_e; + v.vector[1] -= pixman_fixed_e; + + src_height = src_image->bits.height; + src_width = src_image->bits.width; + src_repeat = src_image->common.repeat; + src_format = src_image->bits.format; + + vy = v.vector[1]; + while (height--) + { + pixman_fixed_t vx = v.vector[0]; + int y = pixman_fixed_to_int (vy); + uint32_t *dst = dst_line; + + dst_line += dst_stride; + + /* adjust the y location by a unit vector in the y direction + * this is equivalent to transforming y+1 of the destination point to source space */ + vy += unit_y; + + if (!repeat (src_repeat, &y, src_height)) + { + if (op == PIXMAN_OP_SRC) + memset (dst, 0, sizeof (*dst) * width); + } + else + { + int w = width; + + uint32_t *src = src_line + y * src_stride; + + while (w >= 2) + { + uint32_t s1, s2; + int x1, x2; + + x1 = pixman_fixed_to_int (vx); + vx += unit_x; + + x2 = pixman_fixed_to_int (vx); + vx += unit_x; + + w -= 2; + + s1 = fetch_nearest (src_repeat, src_format, src, x1, src_width); + s2 = fetch_nearest (src_repeat, src_format, src, x2, src_width); + + if (op == PIXMAN_OP_OVER) + { + combine_over (s1, dst++); + combine_over (s2, dst++); + } + else + { + combine_src (s1, dst++); + combine_src (s2, dst++); + } + } + + while (w--) + { + uint32_t s; + int x; + + x = pixman_fixed_to_int (vx); + vx += unit_x; + + s = fetch_nearest (src_repeat, src_format, src, x, src_width); + + if (op == PIXMAN_OP_OVER) + combine_over (s, dst++); + else + combine_src (s, dst++); + } + } + } +} + +#define CACHE_LINE_SIZE 64 + +#define FAST_SIMPLE_ROTATE(suffix, pix_type) \ + \ +static void \ +blt_rotated_90_trivial_##suffix (pix_type *dst, \ + int dst_stride, \ + const pix_type *src, \ + int src_stride, \ + int w, \ + int h) \ +{ \ + int x, y; \ + for (y = 0; y < h; y++) \ + { \ + const pix_type *s = src + (h - y - 1); \ + pix_type *d = dst + dst_stride * y; \ + for (x = 0; x < w; x++) \ + { \ + *d++ = *s; \ + s += src_stride; \ + } \ + } \ +} \ + \ +static void \ +blt_rotated_270_trivial_##suffix (pix_type *dst, \ + int dst_stride, \ + const pix_type *src, \ + int src_stride, \ + int w, \ + int h) \ +{ \ + int x, y; \ + for (y = 0; y < h; y++) \ + { \ + const pix_type *s = src + src_stride * (w - 1) + y; \ + pix_type *d = dst + dst_stride * y; \ + for (x = 0; x < w; x++) \ + { \ + *d++ = *s; \ + s -= src_stride; \ + } \ + } \ +} \ + \ +static void \ +blt_rotated_90_##suffix (pix_type *dst, \ + int dst_stride, \ + const pix_type *src, \ + int src_stride, \ + int W, \ + int H) \ +{ \ + int x; \ + int leading_pixels = 0, trailing_pixels = 0; \ + const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type); \ + \ + /* \ + * split processing into handling destination as TILE_SIZExH cache line \ + * aligned vertical stripes (optimistically assuming that destination \ + * stride is a multiple of cache line, if not - it will be just a bit \ + * slower) \ + */ \ + \ + if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1)) \ + { \ + leading_pixels = TILE_SIZE - (((uintptr_t)dst & \ + (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \ + if (leading_pixels > W) \ + leading_pixels = W; \ + \ + /* unaligned leading part NxH (where N < TILE_SIZE) */ \ + blt_rotated_90_trivial_##suffix ( \ + dst, \ + dst_stride, \ + src, \ + src_stride, \ + leading_pixels, \ + H); \ + \ + dst += leading_pixels; \ + src += leading_pixels * src_stride; \ + W -= leading_pixels; \ + } \ + \ + if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1)) \ + { \ + trailing_pixels = (((uintptr_t)(dst + W) & \ + (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \ + if (trailing_pixels > W) \ + trailing_pixels = W; \ + W -= trailing_pixels; \ + } \ + \ + for (x = 0; x < W; x += TILE_SIZE) \ + { \ + /* aligned middle part TILE_SIZExH */ \ + blt_rotated_90_trivial_##suffix ( \ + dst + x, \ + dst_stride, \ + src + src_stride * x, \ + src_stride, \ + TILE_SIZE, \ + H); \ + } \ + \ + if (trailing_pixels) \ + { \ + /* unaligned trailing part NxH (where N < TILE_SIZE) */ \ + blt_rotated_90_trivial_##suffix ( \ + dst + W, \ + dst_stride, \ + src + W * src_stride, \ + src_stride, \ + trailing_pixels, \ + H); \ + } \ +} \ + \ +static void \ +blt_rotated_270_##suffix (pix_type *dst, \ + int dst_stride, \ + const pix_type *src, \ + int src_stride, \ + int W, \ + int H) \ +{ \ + int x; \ + int leading_pixels = 0, trailing_pixels = 0; \ + const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type); \ + \ + /* \ + * split processing into handling destination as TILE_SIZExH cache line \ + * aligned vertical stripes (optimistically assuming that destination \ + * stride is a multiple of cache line, if not - it will be just a bit \ + * slower) \ + */ \ + \ + if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1)) \ + { \ + leading_pixels = TILE_SIZE - (((uintptr_t)dst & \ + (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \ + if (leading_pixels > W) \ + leading_pixels = W; \ + \ + /* unaligned leading part NxH (where N < TILE_SIZE) */ \ + blt_rotated_270_trivial_##suffix ( \ + dst, \ + dst_stride, \ + src + src_stride * (W - leading_pixels), \ + src_stride, \ + leading_pixels, \ + H); \ + \ + dst += leading_pixels; \ + W -= leading_pixels; \ + } \ + \ + if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1)) \ + { \ + trailing_pixels = (((uintptr_t)(dst + W) & \ + (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \ + if (trailing_pixels > W) \ + trailing_pixels = W; \ + W -= trailing_pixels; \ + src += trailing_pixels * src_stride; \ + } \ + \ + for (x = 0; x < W; x += TILE_SIZE) \ + { \ + /* aligned middle part TILE_SIZExH */ \ + blt_rotated_270_trivial_##suffix ( \ + dst + x, \ + dst_stride, \ + src + src_stride * (W - x - TILE_SIZE), \ + src_stride, \ + TILE_SIZE, \ + H); \ + } \ + \ + if (trailing_pixels) \ + { \ + /* unaligned trailing part NxH (where N < TILE_SIZE) */ \ + blt_rotated_270_trivial_##suffix ( \ + dst + W, \ + dst_stride, \ + src - trailing_pixels * src_stride, \ + src_stride, \ + trailing_pixels, \ + H); \ + } \ +} \ + \ +static void \ +fast_composite_rotate_90_##suffix (pixman_implementation_t *imp, \ + pixman_op_t op, \ + pixman_image_t * src_image, \ + pixman_image_t * mask_image, \ + pixman_image_t * dst_image, \ + int32_t src_x, \ + int32_t src_y, \ + int32_t mask_x, \ + int32_t mask_y, \ + int32_t dest_x, \ + int32_t dest_y, \ + int32_t width, \ + int32_t height) \ +{ \ + pix_type *dst_line; \ + pix_type *src_line; \ + int dst_stride, src_stride; \ + int src_x_t, src_y_t; \ + \ + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, pix_type, \ + dst_stride, dst_line, 1); \ + src_x_t = -src_y + pixman_fixed_to_int ( \ + src_image->common.transform->matrix[0][2] + \ + pixman_fixed_1 / 2 - pixman_fixed_e) - height;\ + src_y_t = src_x + pixman_fixed_to_int ( \ + src_image->common.transform->matrix[1][2] + \ + pixman_fixed_1 / 2 - pixman_fixed_e); \ + PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type, \ + src_stride, src_line, 1); \ + blt_rotated_90_##suffix (dst_line, dst_stride, src_line, src_stride, \ + width, height); \ +} \ + \ +static void \ +fast_composite_rotate_270_##suffix (pixman_implementation_t *imp, \ + pixman_op_t op, \ + pixman_image_t * src_image, \ + pixman_image_t * mask_image, \ + pixman_image_t * dst_image, \ + int32_t src_x, \ + int32_t src_y, \ + int32_t mask_x, \ + int32_t mask_y, \ + int32_t dest_x, \ + int32_t dest_y, \ + int32_t width, \ + int32_t height) \ +{ \ + pix_type *dst_line; \ + pix_type *src_line; \ + int dst_stride, src_stride; \ + int src_x_t, src_y_t; \ + \ + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, pix_type, \ + dst_stride, dst_line, 1); \ + src_x_t = src_y + pixman_fixed_to_int ( \ + src_image->common.transform->matrix[0][2] + \ + pixman_fixed_1 / 2 - pixman_fixed_e); \ + src_y_t = -src_x + pixman_fixed_to_int ( \ + src_image->common.transform->matrix[1][2] + \ + pixman_fixed_1 / 2 - pixman_fixed_e) - width; \ + PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type, \ + src_stride, src_line, 1); \ + blt_rotated_270_##suffix (dst_line, dst_stride, src_line, src_stride, \ + width, height); \ +} + +FAST_SIMPLE_ROTATE (8, uint8_t) +FAST_SIMPLE_ROTATE (565, uint16_t) +FAST_SIMPLE_ROTATE (8888, uint32_t) + +static const pixman_fast_path_t c_fast_paths[] = +{ + PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, fast_composite_over_n_8_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, fast_composite_over_n_8_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, fast_composite_over_n_8_0888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, fast_composite_over_n_8_0888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, fast_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, fast_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, fast_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, fast_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, fast_composite_over_n_1_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, fast_composite_over_n_1_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, fast_composite_over_n_1_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, fast_composite_over_n_1_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5, fast_composite_over_n_1_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5, fast_composite_over_n_1_0565), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, fast_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, fast_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, fast_composite_over_n_8888_0565_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, fast_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, fast_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, fast_composite_over_n_8888_0565_ca), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, fast_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, fast_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, fast_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, fast_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, fast_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, fast_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, fast_composite_over_8888_0565), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, fast_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, fast_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565), + PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888), + PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888), + PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8), + PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1000_1000), + PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8), + PIXMAN_STD_FAST_PATH (SRC, solid, null, a8r8g8b8, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, fast_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, fast_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8x8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8a8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565), + PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8), + PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8), + + SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, 8888_8888), + + SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, 8888_8888), + + SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, 8888_565), + SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, 8888_565), + + SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565), + + SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, 8888_8888), + + SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565), + +#define NEAREST_FAST_PATH(op,s,d) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, SCALED_NEAREST_FLAGS, \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest, \ + } + + NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8), + NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8), + NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8), + NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8), + + NEAREST_FAST_PATH (SRC, x8r8g8b8, a8r8g8b8), + NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8), + NEAREST_FAST_PATH (SRC, x8b8g8r8, a8b8g8r8), + NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8), + + NEAREST_FAST_PATH (OVER, x8r8g8b8, x8r8g8b8), + NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8), + NEAREST_FAST_PATH (OVER, x8b8g8r8, x8b8g8r8), + NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8), + + NEAREST_FAST_PATH (OVER, x8r8g8b8, a8r8g8b8), + NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8), + NEAREST_FAST_PATH (OVER, x8b8g8r8, a8b8g8r8), + NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8), + +#define SIMPLE_ROTATE_FLAGS(angle) \ + (FAST_PATH_ROTATE_ ## angle ## _TRANSFORM | \ + FAST_PATH_NEAREST_FILTER | \ + FAST_PATH_SAMPLES_COVER_CLIP | \ + FAST_PATH_STANDARD_FLAGS) + +#define SIMPLE_ROTATE_FAST_PATH(op,s,d,suffix) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (90), \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_rotate_90_##suffix, \ + }, \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (270), \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_rotate_270_##suffix, \ + } + + SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888), + SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888), + SIMPLE_ROTATE_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888), + SIMPLE_ROTATE_FAST_PATH (SRC, r5g6b5, r5g6b5, 565), + SIMPLE_ROTATE_FAST_PATH (SRC, a8, a8, 8), + + { PIXMAN_OP_NONE }, +}; + +#ifdef WORDS_BIGENDIAN +#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (32 - (offs) - (n))) +#else +#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (offs)) +#endif + +static force_inline void +pixman_fill1_line (uint32_t *dst, int offs, int width, int v) +{ + if (offs) + { + int leading_pixels = 32 - offs; + if (leading_pixels >= width) + { + if (v) + *dst |= A1_FILL_MASK (width, offs); + else + *dst &= ~A1_FILL_MASK (width, offs); + return; + } + else + { + if (v) + *dst++ |= A1_FILL_MASK (leading_pixels, offs); + else + *dst++ &= ~A1_FILL_MASK (leading_pixels, offs); + width -= leading_pixels; + } + } + while (width >= 32) + { + if (v) + *dst++ = 0xFFFFFFFF; + else + *dst++ = 0; + width -= 32; + } + if (width > 0) + { + if (v) + *dst |= A1_FILL_MASK (width, 0); + else + *dst &= ~A1_FILL_MASK (width, 0); + } +} + +static void +pixman_fill1 (uint32_t *bits, + int stride, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + uint32_t *dst = bits + y * stride + (x >> 5); + int offs = x & 31; + + if (xor & 1) + { + while (height--) + { + pixman_fill1_line (dst, offs, width, 1); + dst += stride; + } + } + else + { + while (height--) + { + pixman_fill1_line (dst, offs, width, 0); + dst += stride; + } + } +} + +static void +pixman_fill8 (uint32_t *bits, + int stride, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + int byte_stride = stride * (int) sizeof (uint32_t); + uint8_t *dst = (uint8_t *) bits; + uint8_t v = xor & 0xff; + int i; + + dst = dst + y * byte_stride + x; + + while (height--) + { + for (i = 0; i < width; ++i) + dst[i] = v; + + dst += byte_stride; + } +} + +static void +pixman_fill16 (uint32_t *bits, + int stride, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + int short_stride = + (stride * (int)sizeof (uint32_t)) / (int)sizeof (uint16_t); + uint16_t *dst = (uint16_t *)bits; + uint16_t v = xor & 0xffff; + int i; + + dst = dst + y * short_stride + x; + + while (height--) + { + for (i = 0; i < width; ++i) + dst[i] = v; + + dst += short_stride; + } +} + +static void +pixman_fill32 (uint32_t *bits, + int stride, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + int i; + + bits = bits + y * stride + x; + + while (height--) + { + for (i = 0; i < width; ++i) + bits[i] = xor; + + bits += stride; + } +} + +static pixman_bool_t +fast_path_fill (pixman_implementation_t *imp, + uint32_t * bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + switch (bpp) + { + case 1: + pixman_fill1 (bits, stride, x, y, width, height, xor); + break; + + case 8: + pixman_fill8 (bits, stride, x, y, width, height, xor); + break; + + case 16: + pixman_fill16 (bits, stride, x, y, width, height, xor); + break; + + case 32: + pixman_fill32 (bits, stride, x, y, width, height, xor); + break; + + default: + return _pixman_implementation_fill ( + imp->delegate, bits, stride, bpp, x, y, width, height, xor); + break; + } + + return TRUE; +} + +pixman_implementation_t * +_pixman_implementation_create_fast_path (pixman_implementation_t *fallback) +{ + pixman_implementation_t *imp = _pixman_implementation_create (fallback, c_fast_paths); + + imp->fill = fast_path_fill; + + return imp; +} diff --git a/pixman/pixman/pixman-fast-path.h b/pixman/pixman/pixman-fast-path.h index bb7032d86..d08122293 100644 --- a/pixman/pixman/pixman-fast-path.h +++ b/pixman/pixman/pixman-fast-path.h @@ -1,449 +1,590 @@ -/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
-/*
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007 Red Hat, Inc.
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of SuSE not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission. SuSE makes no representations about the
- * suitability of this software for any purpose. It is provided "as is"
- * without express or implied warranty.
- *
- * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
- * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
- * Author: Keith Packard, SuSE, Inc.
- */
-
-#ifndef PIXMAN_FAST_PATH_H__
-#define PIXMAN_FAST_PATH_H__
-
-#include "pixman-private.h"
-
-#define PIXMAN_REPEAT_COVER -1
-
-static force_inline pixman_bool_t
-repeat (pixman_repeat_t repeat, int *c, int size)
-{
- if (repeat == PIXMAN_REPEAT_NONE)
- {
- if (*c < 0 || *c >= size)
- return FALSE;
- }
- else if (repeat == PIXMAN_REPEAT_NORMAL)
- {
- while (*c >= size)
- *c -= size;
- while (*c < 0)
- *c += size;
- }
- else if (repeat == PIXMAN_REPEAT_PAD)
- {
- *c = CLIP (*c, 0, size - 1);
- }
- else /* REFLECT */
- {
- *c = MOD (*c, size * 2);
- if (*c >= size)
- *c = size * 2 - *c - 1;
- }
- return TRUE;
-}
-
-/*
- * For each scanline fetched from source image with PAD repeat:
- * - calculate how many pixels need to be padded on the left side
- * - calculate how many pixels need to be padded on the right side
- * - update width to only count pixels which are fetched from the image
- * All this information is returned via 'width', 'left_pad', 'right_pad'
- * arguments. The code is assuming that 'unit_x' is positive.
- *
- * Note: 64-bit math is used in order to avoid potential overflows, which
- * is probably excessive in many cases. This particular function
- * may need its own correctness test and performance tuning.
- */
-static force_inline void
-pad_repeat_get_scanline_bounds (int32_t source_image_width,
- pixman_fixed_t vx,
- pixman_fixed_t unit_x,
- int32_t * width,
- int32_t * left_pad,
- int32_t * right_pad)
-{
- int64_t max_vx = (int64_t) source_image_width << 16;
- int64_t tmp;
- if (vx < 0)
- {
- tmp = ((int64_t) unit_x - 1 - vx) / unit_x;
- if (tmp > *width)
- {
- *left_pad = *width;
- *width = 0;
- }
- else
- {
- *left_pad = (int32_t) tmp;
- *width -= (int32_t) tmp;
- }
- }
- else
- {
- *left_pad = 0;
- }
- tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad;
- if (tmp < 0)
- {
- *right_pad = *width;
- *width = 0;
- }
- else if (tmp >= *width)
- {
- *right_pad = 0;
- }
- else
- {
- *right_pad = *width - (int32_t) tmp;
- *width = (int32_t) tmp;
- }
-}
-
-/* A macroified version of specialized nearest scalers for some
- * common 8888 and 565 formats. It supports SRC and OVER ops.
- *
- * There are two repeat versions, one that handles repeat normal,
- * and one without repeat handling that only works if the src region
- * used is completely covered by the pre-repeated source samples.
- *
- * The loops are unrolled to process two pixels per iteration for better
- * performance on most CPU architectures (superscalar processors
- * can issue several operations simultaneously, other processors can hide
- * instructions latencies by pipelining operations). Unrolling more
- * does not make much sense because the compiler will start running out
- * of spare registers soon.
- */
-
-#define GET_8888_ALPHA(s) ((s) >> 24)
- /* This is not actually used since we don't have an OVER with
- 565 source, but it is needed to build. */
-#define GET_0565_ALPHA(s) 0xff
-
-#define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT, \
- src_type_t, dst_type_t, OP, repeat_mode) \
-static force_inline void \
-scanline_func_name (dst_type_t *dst, \
- src_type_t *src, \
- int32_t w, \
- pixman_fixed_t vx, \
- pixman_fixed_t unit_x, \
- pixman_fixed_t max_vx) \
-{ \
- uint32_t d; \
- src_type_t s1, s2; \
- uint8_t a1, a2; \
- int x1, x2; \
- \
- if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER) \
- abort(); \
- \
- while ((w -= 2) >= 0) \
- { \
- x1 = vx >> 16; \
- vx += unit_x; \
- if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
- { \
- /* This works because we know that unit_x is positive */ \
- while (vx >= max_vx) \
- vx -= max_vx; \
- } \
- s1 = src[x1]; \
- \
- x2 = vx >> 16; \
- vx += unit_x; \
- if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
- { \
- /* This works because we know that unit_x is positive */ \
- while (vx >= max_vx) \
- vx -= max_vx; \
- } \
- s2 = src[x2]; \
- \
- if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \
- { \
- a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \
- a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2); \
- \
- if (a1 == 0xff) \
- { \
- *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \
- } \
- else if (s1) \
- { \
- d = CONVERT_ ## DST_FORMAT ## _TO_8888 (*dst); \
- s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \
- a1 ^= 0xff; \
- UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \
- *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \
- } \
- dst++; \
- \
- if (a2 == 0xff) \
- { \
- *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \
- } \
- else if (s2) \
- { \
- d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \
- s2 = CONVERT_## SRC_FORMAT ## _TO_8888 (s2); \
- a2 ^= 0xff; \
- UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2); \
- *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \
- } \
- dst++; \
- } \
- else /* PIXMAN_OP_SRC */ \
- { \
- *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \
- *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \
- } \
- } \
- \
- if (w & 1) \
- { \
- x1 = vx >> 16; \
- s1 = src[x1]; \
- \
- if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \
- { \
- a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \
- \
- if (a1 == 0xff) \
- { \
- *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \
- } \
- else if (s1) \
- { \
- d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \
- s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \
- a1 ^= 0xff; \
- UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \
- *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \
- } \
- dst++; \
- } \
- else /* PIXMAN_OP_SRC */ \
- { \
- *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \
- } \
- } \
-}
-
-#define FAST_NEAREST_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, dst_type_t, \
- repeat_mode) \
-static void \
-fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp, \
- pixman_op_t op, \
- pixman_image_t * src_image, \
- pixman_image_t * mask_image, \
- pixman_image_t * dst_image, \
- int32_t src_x, \
- int32_t src_y, \
- int32_t mask_x, \
- int32_t mask_y, \
- int32_t dst_x, \
- int32_t dst_y, \
- int32_t width, \
- int32_t height) \
-{ \
- dst_type_t *dst_line; \
- src_type_t *src_first_line; \
- int y; \
- pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */ \
- pixman_fixed_t max_vy; \
- pixman_vector_t v; \
- pixman_fixed_t vx, vy; \
- pixman_fixed_t unit_x, unit_y; \
- int32_t left_pad, right_pad; \
- \
- src_type_t *src; \
- dst_type_t *dst; \
- int src_stride, dst_stride; \
- \
- PIXMAN_IMAGE_GET_LINE (dst_image, dst_x, dst_y, dst_type_t, dst_stride, dst_line, 1); \
- /* pass in 0 instead of src_x and src_y because src_x and src_y need to be \
- * transformed from destination space to source space */ \
- PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1); \
- \
- /* reference point is the center of the pixel */ \
- v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2; \
- v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2; \
- v.vector[2] = pixman_fixed_1; \
- \
- if (!pixman_transform_point_3d (src_image->common.transform, &v)) \
- return; \
- \
- unit_x = src_image->common.transform->matrix[0][0]; \
- unit_y = src_image->common.transform->matrix[1][1]; \
- \
- /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */ \
- v.vector[0] -= pixman_fixed_e; \
- v.vector[1] -= pixman_fixed_e; \
- \
- vx = v.vector[0]; \
- vy = v.vector[1]; \
- \
- if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
- { \
- /* Clamp repeating positions inside the actual samples */ \
- max_vx = src_image->bits.width << 16; \
- max_vy = src_image->bits.height << 16; \
- \
- repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx); \
- repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \
- } \
- \
- if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD || \
- PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \
- { \
- pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x, \
- &width, &left_pad, &right_pad); \
- vx += left_pad * unit_x; \
- } \
- \
- while (--height >= 0) \
- { \
- dst = dst_line; \
- dst_line += dst_stride; \
- \
- y = vy >> 16; \
- vy += unit_y; \
- if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
- repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \
- if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD) \
- { \
- repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height); \
- src = src_first_line + src_stride * y; \
- if (left_pad > 0) \
- { \
- scanline_func (dst, src, left_pad, 0, 0, 0); \
- } \
- if (width > 0) \
- { \
- scanline_func (dst + left_pad, src, width, vx, unit_x, 0); \
- } \
- if (right_pad > 0) \
- { \
- scanline_func (dst + left_pad + width, src + src_image->bits.width - 1, \
- right_pad, 0, 0, 0); \
- } \
- } \
- else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \
- { \
- static src_type_t zero[1] = { 0 }; \
- if (y < 0 || y >= src_image->bits.height) \
- { \
- scanline_func (dst, zero, left_pad + width + right_pad, 0, 0, 0); \
- continue; \
- } \
- src = src_first_line + src_stride * y; \
- if (left_pad > 0) \
- { \
- scanline_func (dst, zero, left_pad, 0, 0, 0); \
- } \
- if (width > 0) \
- { \
- scanline_func (dst + left_pad, src, width, vx, unit_x, 0); \
- } \
- if (right_pad > 0) \
- { \
- scanline_func (dst + left_pad + width, zero, right_pad, 0, 0, 0); \
- } \
- } \
- else \
- { \
- src = src_first_line + src_stride * y; \
- scanline_func (dst, src, width, vx, unit_x, max_vx); \
- } \
- } \
-}
-
-/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */
-#define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func, src_type_t, dst_type_t, \
- repeat_mode) \
- FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, dst_type_t, \
- repeat_mode) \
-
-#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT, \
- src_type_t, dst_type_t, OP, repeat_mode) \
- FAST_NEAREST_SCANLINE(scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \
- SRC_FORMAT, DST_FORMAT, src_type_t, dst_type_t, \
- OP, repeat_mode) \
- FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name ## _ ## OP, \
- scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \
- src_type_t, dst_type_t, repeat_mode)
-
-
-#define SCALED_NEAREST_FLAGS \
- (FAST_PATH_SCALE_TRANSFORM | \
- FAST_PATH_NO_ALPHA_MAP | \
- FAST_PATH_NEAREST_FILTER | \
- FAST_PATH_NO_ACCESSORS | \
- FAST_PATH_NARROW_FORMAT)
-
-#define SIMPLE_NEAREST_FAST_PATH_NORMAL(op,s,d,func) \
- { PIXMAN_OP_ ## op, \
- PIXMAN_ ## s, \
- (SCALED_NEAREST_FLAGS | \
- FAST_PATH_NORMAL_REPEAT | \
- FAST_PATH_X_UNIT_POSITIVE), \
- PIXMAN_null, 0, \
- PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
- fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op, \
- }
-
-#define SIMPLE_NEAREST_FAST_PATH_PAD(op,s,d,func) \
- { PIXMAN_OP_ ## op, \
- PIXMAN_ ## s, \
- (SCALED_NEAREST_FLAGS | \
- FAST_PATH_PAD_REPEAT | \
- FAST_PATH_X_UNIT_POSITIVE), \
- PIXMAN_null, 0, \
- PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
- fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op, \
- }
-
-#define SIMPLE_NEAREST_FAST_PATH_NONE(op,s,d,func) \
- { PIXMAN_OP_ ## op, \
- PIXMAN_ ## s, \
- (SCALED_NEAREST_FLAGS | \
- FAST_PATH_NONE_REPEAT | \
- FAST_PATH_X_UNIT_POSITIVE), \
- PIXMAN_null, 0, \
- PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
- fast_composite_scaled_nearest_ ## func ## _none ## _ ## op, \
- }
-
-#define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func) \
- { PIXMAN_OP_ ## op, \
- PIXMAN_ ## s, \
- SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \
- PIXMAN_null, 0, \
- PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
- fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op, \
- }
-
-/* Prefer the use of 'cover' variant, because it is faster */
-#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func) \
- SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func), \
- SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func), \
- SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func), \
- SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
-
-#endif
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ +/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of SuSE not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. SuSE makes no representations about the + * suitability of this software for any purpose. It is provided "as is" + * without express or implied warranty. + * + * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE + * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Author: Keith Packard, SuSE, Inc. + */ + +#ifndef PIXMAN_FAST_PATH_H__ +#define PIXMAN_FAST_PATH_H__ + +#include "pixman-private.h" + +#define PIXMAN_REPEAT_COVER -1 + +static force_inline pixman_bool_t +repeat (pixman_repeat_t repeat, int *c, int size) +{ + if (repeat == PIXMAN_REPEAT_NONE) + { + if (*c < 0 || *c >= size) + return FALSE; + } + else if (repeat == PIXMAN_REPEAT_NORMAL) + { + while (*c >= size) + *c -= size; + while (*c < 0) + *c += size; + } + else if (repeat == PIXMAN_REPEAT_PAD) + { + *c = CLIP (*c, 0, size - 1); + } + else /* REFLECT */ + { + *c = MOD (*c, size * 2); + if (*c >= size) + *c = size * 2 - *c - 1; + } + return TRUE; +} + +/* + * For each scanline fetched from source image with PAD repeat: + * - calculate how many pixels need to be padded on the left side + * - calculate how many pixels need to be padded on the right side + * - update width to only count pixels which are fetched from the image + * All this information is returned via 'width', 'left_pad', 'right_pad' + * arguments. The code is assuming that 'unit_x' is positive. + * + * Note: 64-bit math is used in order to avoid potential overflows, which + * is probably excessive in many cases. This particular function + * may need its own correctness test and performance tuning. + */ +static force_inline void +pad_repeat_get_scanline_bounds (int32_t source_image_width, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + int32_t * width, + int32_t * left_pad, + int32_t * right_pad) +{ + int64_t max_vx = (int64_t) source_image_width << 16; + int64_t tmp; + if (vx < 0) + { + tmp = ((int64_t) unit_x - 1 - vx) / unit_x; + if (tmp > *width) + { + *left_pad = *width; + *width = 0; + } + else + { + *left_pad = (int32_t) tmp; + *width -= (int32_t) tmp; + } + } + else + { + *left_pad = 0; + } + tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad; + if (tmp < 0) + { + *right_pad = *width; + *width = 0; + } + else if (tmp >= *width) + { + *right_pad = 0; + } + else + { + *right_pad = *width - (int32_t) tmp; + *width = (int32_t) tmp; + } +} + +/* A macroified version of specialized nearest scalers for some + * common 8888 and 565 formats. It supports SRC and OVER ops. + * + * There are two repeat versions, one that handles repeat normal, + * and one without repeat handling that only works if the src region + * used is completely covered by the pre-repeated source samples. + * + * The loops are unrolled to process two pixels per iteration for better + * performance on most CPU architectures (superscalar processors + * can issue several operations simultaneously, other processors can hide + * instructions latencies by pipelining operations). Unrolling more + * does not make much sense because the compiler will start running out + * of spare registers soon. + */ + +#define GET_8888_ALPHA(s) ((s) >> 24) + /* This is not actually used since we don't have an OVER with + 565 source, but it is needed to build. */ +#define GET_0565_ALPHA(s) 0xff + +#define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT, \ + src_type_t, dst_type_t, OP, repeat_mode) \ +static force_inline void \ +scanline_func_name (dst_type_t *dst, \ + const src_type_t *src, \ + int32_t w, \ + pixman_fixed_t vx, \ + pixman_fixed_t unit_x, \ + pixman_fixed_t max_vx, \ + pixman_bool_t fully_transparent_src) \ +{ \ + uint32_t d; \ + src_type_t s1, s2; \ + uint8_t a1, a2; \ + int x1, x2; \ + \ + if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER && fully_transparent_src) \ + return; \ + \ + if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER) \ + abort(); \ + \ + while ((w -= 2) >= 0) \ + { \ + x1 = vx >> 16; \ + vx += unit_x; \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ + { \ + /* This works because we know that unit_x is positive */ \ + while (vx >= max_vx) \ + vx -= max_vx; \ + } \ + s1 = src[x1]; \ + \ + x2 = vx >> 16; \ + vx += unit_x; \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ + { \ + /* This works because we know that unit_x is positive */ \ + while (vx >= max_vx) \ + vx -= max_vx; \ + } \ + s2 = src[x2]; \ + \ + if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \ + { \ + a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \ + a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2); \ + \ + if (a1 == 0xff) \ + { \ + *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ + } \ + else if (s1) \ + { \ + d = CONVERT_ ## DST_FORMAT ## _TO_8888 (*dst); \ + s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \ + a1 ^= 0xff; \ + UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \ + *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \ + } \ + dst++; \ + \ + if (a2 == 0xff) \ + { \ + *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \ + } \ + else if (s2) \ + { \ + d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \ + s2 = CONVERT_## SRC_FORMAT ## _TO_8888 (s2); \ + a2 ^= 0xff; \ + UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2); \ + *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \ + } \ + dst++; \ + } \ + else /* PIXMAN_OP_SRC */ \ + { \ + *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ + *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \ + } \ + } \ + \ + if (w & 1) \ + { \ + x1 = vx >> 16; \ + s1 = src[x1]; \ + \ + if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \ + { \ + a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \ + \ + if (a1 == 0xff) \ + { \ + *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ + } \ + else if (s1) \ + { \ + d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \ + s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \ + a1 ^= 0xff; \ + UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \ + *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \ + } \ + dst++; \ + } \ + else /* PIXMAN_OP_SRC */ \ + { \ + *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ + } \ + } \ +} + +#define FAST_NEAREST_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t, \ + dst_type_t, repeat_mode, have_mask, mask_is_solid) \ +static void \ +fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp, \ + pixman_op_t op, \ + pixman_image_t * src_image, \ + pixman_image_t * mask_image, \ + pixman_image_t * dst_image, \ + int32_t src_x, \ + int32_t src_y, \ + int32_t mask_x, \ + int32_t mask_y, \ + int32_t dst_x, \ + int32_t dst_y, \ + int32_t width, \ + int32_t height) \ +{ \ + dst_type_t *dst_line; \ + mask_type_t *mask_line; \ + src_type_t *src_first_line; \ + int y; \ + pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */ \ + pixman_fixed_t max_vy; \ + pixman_vector_t v; \ + pixman_fixed_t vx, vy; \ + pixman_fixed_t unit_x, unit_y; \ + int32_t left_pad, right_pad; \ + \ + src_type_t *src; \ + dst_type_t *dst; \ + mask_type_t solid_mask; \ + const mask_type_t *mask = &solid_mask; \ + int src_stride, mask_stride, dst_stride; \ + \ + PIXMAN_IMAGE_GET_LINE (dst_image, dst_x, dst_y, dst_type_t, dst_stride, dst_line, 1); \ + if (have_mask) \ + { \ + if (mask_is_solid) \ + solid_mask = _pixman_image_get_solid (imp, mask_image, dst_image->bits.format); \ + else \ + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t, \ + mask_stride, mask_line, 1); \ + } \ + /* pass in 0 instead of src_x and src_y because src_x and src_y need to be \ + * transformed from destination space to source space */ \ + PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1); \ + \ + /* reference point is the center of the pixel */ \ + v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2; \ + v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2; \ + v.vector[2] = pixman_fixed_1; \ + \ + if (!pixman_transform_point_3d (src_image->common.transform, &v)) \ + return; \ + \ + unit_x = src_image->common.transform->matrix[0][0]; \ + unit_y = src_image->common.transform->matrix[1][1]; \ + \ + /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */ \ + v.vector[0] -= pixman_fixed_e; \ + v.vector[1] -= pixman_fixed_e; \ + \ + vx = v.vector[0]; \ + vy = v.vector[1]; \ + \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ + { \ + /* Clamp repeating positions inside the actual samples */ \ + max_vx = src_image->bits.width << 16; \ + max_vy = src_image->bits.height << 16; \ + \ + repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx); \ + repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \ + } \ + \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD || \ + PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ + { \ + pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x, \ + &width, &left_pad, &right_pad); \ + vx += left_pad * unit_x; \ + } \ + \ + while (--height >= 0) \ + { \ + dst = dst_line; \ + dst_line += dst_stride; \ + if (have_mask && !mask_is_solid) \ + { \ + mask = mask_line; \ + mask_line += mask_stride; \ + } \ + \ + y = vy >> 16; \ + vy += unit_y; \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ + repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD) \ + { \ + repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height); \ + src = src_first_line + src_stride * y; \ + if (left_pad > 0) \ + { \ + scanline_func (mask, dst, src, left_pad, 0, 0, 0, FALSE); \ + } \ + if (width > 0) \ + { \ + scanline_func (mask + (mask_is_solid ? 0 : left_pad), \ + dst + left_pad, src, width, vx, unit_x, 0, FALSE); \ + } \ + if (right_pad > 0) \ + { \ + scanline_func (mask + (mask_is_solid ? 0 : left_pad + width), \ + dst + left_pad + width, src + src_image->bits.width - 1, \ + right_pad, 0, 0, 0, FALSE); \ + } \ + } \ + else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ + { \ + static const src_type_t zero[1] = { 0 }; \ + if (y < 0 || y >= src_image->bits.height) \ + { \ + scanline_func (mask, dst, zero, left_pad + width + right_pad, 0, 0, 0, TRUE); \ + continue; \ + } \ + src = src_first_line + src_stride * y; \ + if (left_pad > 0) \ + { \ + scanline_func (mask, dst, zero, left_pad, 0, 0, 0, TRUE); \ + } \ + if (width > 0) \ + { \ + scanline_func (mask + (mask_is_solid ? 0 : left_pad), \ + dst + left_pad, src, width, vx, unit_x, 0, FALSE); \ + } \ + if (right_pad > 0) \ + { \ + scanline_func (mask + (mask_is_solid ? 0 : left_pad + width), \ + dst + left_pad + width, zero, right_pad, 0, 0, 0, TRUE); \ + } \ + } \ + else \ + { \ + src = src_first_line + src_stride * y; \ + scanline_func (mask, dst, src, width, vx, unit_x, max_vx, FALSE); \ + } \ + } \ +} + +/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */ +#define FAST_NEAREST_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t, \ + dst_type_t, repeat_mode, have_mask, mask_is_solid) \ + FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t, \ + dst_type_t, repeat_mode, have_mask, mask_is_solid) + +#define FAST_NEAREST_MAINLOOP_NOMASK(scale_func_name, scanline_func, src_type_t, dst_type_t, \ + repeat_mode) \ + static force_inline void \ + scanline_func##scale_func_name##_wrapper ( \ + const uint8_t *mask, \ + dst_type_t *dst, \ + const src_type_t *src, \ + int32_t w, \ + pixman_fixed_t vx, \ + pixman_fixed_t unit_x, \ + pixman_fixed_t max_vx, \ + pixman_bool_t fully_transparent_src) \ + { \ + scanline_func (dst, src, w, vx, unit_x, max_vx, fully_transparent_src); \ + } \ + FAST_NEAREST_MAINLOOP_INT (scale_func_name, scanline_func##scale_func_name##_wrapper, \ + src_type_t, uint8_t, dst_type_t, repeat_mode, FALSE, FALSE) + +#define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func, src_type_t, dst_type_t, \ + repeat_mode) \ + FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name, scanline_func, src_type_t, \ + dst_type_t, repeat_mode) + +#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT, \ + src_type_t, dst_type_t, OP, repeat_mode) \ + FAST_NEAREST_SCANLINE(scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \ + SRC_FORMAT, DST_FORMAT, src_type_t, dst_type_t, \ + OP, repeat_mode) \ + FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name ## _ ## OP, \ + scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \ + src_type_t, dst_type_t, repeat_mode) + + +#define SCALED_NEAREST_FLAGS \ + (FAST_PATH_SCALE_TRANSFORM | \ + FAST_PATH_NO_ALPHA_MAP | \ + FAST_PATH_NEAREST_FILTER | \ + FAST_PATH_NO_ACCESSORS | \ + FAST_PATH_NARROW_FORMAT) + +#define SIMPLE_NEAREST_FAST_PATH_NORMAL(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_NORMAL_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op, \ + } + +#define SIMPLE_NEAREST_FAST_PATH_PAD(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_PAD_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op, \ + } + +#define SIMPLE_NEAREST_FAST_PATH_NONE(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_NONE_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _none ## _ ## op, \ + } + +#define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op, \ + } + +#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_NORMAL_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op, \ + } + +#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_PAD_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op, \ + } + +#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_NONE_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _none ## _ ## op, \ + } + +#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \ + PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op, \ + } + +#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_NORMAL_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op, \ + } + +#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_PAD_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op, \ + } + +#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_NONE_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _none ## _ ## op, \ + } + +#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \ + PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op, \ + } + +/* Prefer the use of 'cover' variant, because it is faster */ +#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func) \ + SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func), \ + SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func), \ + SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func), \ + SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func) + +#define SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func) \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func), \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func), \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func) + +#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH(op,s,d,func) \ + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER (op,s,d,func), \ + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE (op,s,d,func), \ + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD (op,s,d,func) + +#endif diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c index 91adc0560..2e135e2fe 100644 --- a/pixman/pixman/pixman-sse2.c +++ b/pixman/pixman/pixman-sse2.c @@ -5800,7 +5800,8 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, int32_t w, pixman_fixed_t vx, pixman_fixed_t unit_x, - pixman_fixed_t max_vx) + pixman_fixed_t max_vx, + pixman_bool_t fully_transparent_src) { uint32_t s, d; const uint32_t* pm = NULL; @@ -5809,6 +5810,9 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_alpha_lo, xmm_alpha_hi; + if (fully_transparent_src) + return; + /* Align dst on a 16-byte boundary */ while (w && ((unsigned long)pd & 15)) { @@ -5894,6 +5898,119 @@ FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER, scaled_nearest_scanline_sse2_8888_8888_OVER, uint32_t, uint32_t, PAD) +static force_inline void +scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask, + uint32_t * dst, + const uint32_t * src, + int32_t w, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx, + pixman_bool_t zero_src) +{ + __m128i xmm_mask; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_lo, xmm_alpha_hi; + + if (zero_src || (*mask >> 24) == 0) + return; + + xmm_mask = create_mask_16_128 (*mask >> 24); + + while (w && (unsigned long)dst & 15) + { + uint32_t s = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + + if (s) + { + uint32_t d = *dst; + + __m64 ms = unpack_32_1x64 (s); + __m64 alpha = expand_alpha_1x64 (ms); + __m64 dest = _mm_movepi64_pi64 (xmm_mask); + __m64 alpha_dst = unpack_32_1x64 (d); + + *dst = pack_1x64_32 ( + in_over_1x64 (&ms, &alpha, &dest, &alpha_dst)); + } + dst++; + w--; + } + + while (w >= 4) + { + uint32_t tmp1, tmp2, tmp3, tmp4; + + tmp1 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp2 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp3 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp4 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + + xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); + + if (!is_zero (xmm_src)) + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask, &xmm_mask, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + dst += 4; + w -= 4; + } + + while (w) + { + uint32_t s = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + + if (s) + { + uint32_t d = *dst; + + __m64 ms = unpack_32_1x64 (s); + __m64 alpha = expand_alpha_1x64 (ms); + __m64 mask = _mm_movepi64_pi64 (xmm_mask); + __m64 dest = unpack_32_1x64 (d); + + *dst = pack_1x64_32 ( + in_over_1x64 (&ms, &alpha, &mask, &dest)); + } + + dst++; + w--; + } + + _mm_empty (); +} + +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER, + scaled_nearest_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE) +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER, + scaled_nearest_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE) +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, + scaled_nearest_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE) + static const pixman_fast_path_t sse2_fast_paths[] = { /* PIXMAN_OP_OVER */ @@ -5990,6 +6107,11 @@ static const pixman_fast_path_t sse2_fast_paths[] = SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), + { PIXMAN_OP_NONE }, }; diff --git a/pixman/test/Makefile.am b/pixman/test/Makefile.am index 8d8471d1c..3ce466eec 100644 --- a/pixman/test/Makefile.am +++ b/pixman/test/Makefile.am @@ -1,7 +1,6 @@ AM_CFLAGS = @OPENMP_CFLAGS@ -AM_LDFLAGS = @OPENMP_CFLAGS@ - -TEST_LDADD = $(top_builddir)/pixman/libpixman-1.la -lm +AM_LDFLAGS = @OPENMP_CFLAGS@ @TESTPROGS_EXTRA_LDFLAGS@ +LDADD = $(top_builddir)/pixman/libpixman-1.la -lm INCLUDES = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman TESTPROGRAMS = \ @@ -22,121 +21,24 @@ TESTPROGRAMS = \ affine-test \ composite -a1_trap_test_LDADD = $(TEST_LDADD) -a1_trap_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ - -fetch_test_LDADD = $(TEST_LDADD) -fetch_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ - -trap_crasher_LDADD = $(TEST_LDADD) -trap_crasher_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ - -oob_test_LDADD = $(TEST_LDADD) -oob_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ - -scaling_crash_test_LDADD = $(TEST_LDADD) -scaling_crash_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ - -region_translate_test_LDADD = $(TEST_LDADD) -region_translate_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ - -pdf_op_test_LDADD = $(TEST_LDADD) -pdf_op_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ pdf_op_test_SOURCES = pdf-op-test.c utils.c utils.h - -region_test_LDADD = $(TEST_LDADD) -region_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ region_test_SOURCES = region-test.c utils.c utils.h - -blitters_test_LDADD = $(TEST_LDADD) -blitters_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ blitters_test_SOURCES = blitters-test.c utils.c utils.h - -scaling_test_LDADD = $(TEST_LDADD) -scaling_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ scaling_test_SOURCES = scaling-test.c utils.c utils.h - -affine_test_LDADD = $(TEST_LDADD) -affine_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ affine_test_SOURCES = affine-test.c utils.c utils.h - -alphamap_LDADD = $(TEST_LDADD) -alphamap_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ alphamap_SOURCES = alphamap.c utils.c utils.h - -alpha_loop_LDADD = $(TEST_LDADD) -alpha_loop_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ alpha_loop_SOURCES = alpha-loop.c utils.c utils.h - -composite_LDADD = $(TEST_LDADD) -composite_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ composite_SOURCES = composite.c utils.c utils.h - -gradient_crash_test_LDADD = $(TEST_LDADD) -gradient_crash_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ gradient_crash_test_SOURCES = gradient-crash-test.c utils.c utils.h - -stress_test_LDADD = $(TEST_LDADD) -stress_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ stress_test_SOURCES = stress-test.c utils.c utils.h -# GTK using test programs - -if HAVE_GTK - -GTK_LDADD = $(TEST_LDADD) $(GTK_LIBS) -GTK_UTILS = gtk-utils.c gtk-utils.h - -TESTPROGRAMS_GTK = \ - clip-test \ - clip-in \ - composite-test \ - gradient-test \ - radial-test \ - alpha-test \ - screen-test \ - convolution-test \ - trap-test - -INCLUDES += $(GTK_CFLAGS) - -gradient_test_LDADD = $(GTK_LDADD) -gradient_test_SOURCES = gradient-test.c $(GTK_UTILS) - -radial_test_LDADD = $(GTK_LDADD) -radial_test_SOURCES = radial-test.c utils.c utils.h $(GTK_UTILS) - -alpha_test_LDADD = $(GTK_LDADD) -alpha_test_SOURCES = alpha-test.c $(GTK_UTILS) - -composite_test_LDADD = $(GTK_LDADD) -composite_test_SOURCES = composite-test.c $(GTK_UTILS) - -clip_test_LDADD = $(GTK_LDADD) -clip_test_SOURCES = clip-test.c $(GTK_UTILS) - -clip_in_LDADD = $(GTK_LDADD) -clip_in_SOURCES = clip-in.c $(GTK_UTILS) - -trap_test_LDADD = $(GTK_LDADD) -trap_test_SOURCES = trap-test.c $(GTK_UTILS) - -screen_test_LDADD = $(GTK_LDADD) -screen_test_SOURCES = screen-test.c $(GTK_UTILS) - -convolution_test_LDADD = $(GTK_LDADD) -convolution_test_SOURCES = convolution-test.c $(GTK_UTILS) - -endif - # Benchmarks BENCHMARKS = \ lowlevel-blt-bench lowlevel_blt_bench_SOURCES = lowlevel-blt-bench.c utils.c utils.h -lowlevel_blt_bench_LDADD = $(TEST_LDADD) -noinst_PROGRAMS = $(TESTPROGRAMS) $(TESTPROGRAMS_GTK) $(BENCHMARKS) +noinst_PROGRAMS = $(TESTPROGRAMS) $(BENCHMARKS) TESTS = $(TESTPROGRAMS) diff --git a/pixman/test/scaling-test.c b/pixman/test/scaling-test.c index 7b78017a3..dbb9d39b0 100644 --- a/pixman/test/scaling-test.c +++ b/pixman/test/scaling-test.c @@ -1,250 +1,368 @@ -/*
- * Test program, which can detect some problems with nearest neighbour
- * and bilinear scaling in pixman. Testing is done by running lots
- * of random SRC and OVER compositing operations a8r8g8b8, x8a8r8g8b8
- * and r5g6b5 color formats.
- *
- * Script 'fuzzer-find-diff.pl' can be used to narrow down the problem in
- * the case of test failure.
- */
-#include <assert.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include "utils.h"
-
-#define MAX_SRC_WIDTH 16
-#define MAX_SRC_HEIGHT 16
-#define MAX_DST_WIDTH 16
-#define MAX_DST_HEIGHT 16
-#define MAX_STRIDE 4
-
-/*
- * Composite operation with pseudorandom images
- */
-uint32_t
-test_composite (int testnum,
- int verbose)
-{
- int i;
- pixman_image_t * src_img;
- pixman_image_t * dst_img;
- pixman_transform_t transform;
- pixman_region16_t clip;
- int src_width, src_height;
- int dst_width, dst_height;
- int src_stride, dst_stride;
- int src_x, src_y;
- int dst_x, dst_y;
- int src_bpp;
- int dst_bpp;
- int w, h;
- pixman_fixed_t scale_x = 65536, scale_y = 65536;
- pixman_fixed_t translate_x = 0, translate_y = 0;
- pixman_op_t op;
- pixman_repeat_t repeat = PIXMAN_REPEAT_NONE;
- pixman_format_code_t src_fmt, dst_fmt;
- uint32_t * srcbuf;
- uint32_t * dstbuf;
- uint32_t crc32;
- FLOAT_REGS_CORRUPTION_DETECTOR_START ();
-
- lcg_srand (testnum);
-
- src_bpp = (lcg_rand_n (2) == 0) ? 2 : 4;
- dst_bpp = (lcg_rand_n (2) == 0) ? 2 : 4;
- op = (lcg_rand_n (2) == 0) ? PIXMAN_OP_SRC : PIXMAN_OP_OVER;
-
- src_width = lcg_rand_n (MAX_SRC_WIDTH) + 1;
- src_height = lcg_rand_n (MAX_SRC_HEIGHT) + 1;
- dst_width = lcg_rand_n (MAX_DST_WIDTH) + 1;
- dst_height = lcg_rand_n (MAX_DST_HEIGHT) + 1;
- src_stride = src_width * src_bpp + lcg_rand_n (MAX_STRIDE) * src_bpp;
- dst_stride = dst_width * dst_bpp + lcg_rand_n (MAX_STRIDE) * dst_bpp;
-
- if (src_stride & 3)
- src_stride += 2;
-
- if (dst_stride & 3)
- dst_stride += 2;
-
- src_x = -(src_width / 4) + lcg_rand_n (src_width * 3 / 2);
- src_y = -(src_height / 4) + lcg_rand_n (src_height * 3 / 2);
- dst_x = -(dst_width / 4) + lcg_rand_n (dst_width * 3 / 2);
- dst_y = -(dst_height / 4) + lcg_rand_n (dst_height * 3 / 2);
- w = lcg_rand_n (dst_width * 3 / 2 - dst_x);
- h = lcg_rand_n (dst_height * 3 / 2 - dst_y);
-
- srcbuf = (uint32_t *)malloc (src_stride * src_height);
- dstbuf = (uint32_t *)malloc (dst_stride * dst_height);
-
- for (i = 0; i < src_stride * src_height; i++)
- *((uint8_t *)srcbuf + i) = lcg_rand_n (256);
-
- for (i = 0; i < dst_stride * dst_height; i++)
- *((uint8_t *)dstbuf + i) = lcg_rand_n (256);
-
- src_fmt = src_bpp == 4 ? (lcg_rand_n (2) == 0 ?
- PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5;
-
- dst_fmt = dst_bpp == 4 ? (lcg_rand_n (2) == 0 ?
- PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5;
-
- src_img = pixman_image_create_bits (
- src_fmt, src_width, src_height, srcbuf, src_stride);
-
- dst_img = pixman_image_create_bits (
- dst_fmt, dst_width, dst_height, dstbuf, dst_stride);
-
- image_endian_swap (src_img, src_bpp * 8);
- image_endian_swap (dst_img, dst_bpp * 8);
-
- if (lcg_rand_n (8) > 0)
- {
- scale_x = -32768 * 3 + lcg_rand_N (65536 * 5);
- scale_y = -32768 * 3 + lcg_rand_N (65536 * 5);
- translate_x = lcg_rand_N (65536);
- translate_y = lcg_rand_N (65536);
- pixman_transform_init_scale (&transform, scale_x, scale_y);
- pixman_transform_translate (&transform, NULL, translate_x, translate_y);
- pixman_image_set_transform (src_img, &transform);
- }
-
- switch (lcg_rand_n (4))
- {
- case 0:
- repeat = PIXMAN_REPEAT_NONE;
- break;
-
- case 1:
- repeat = PIXMAN_REPEAT_NORMAL;
- break;
-
- case 2:
- repeat = PIXMAN_REPEAT_PAD;
- break;
-
- case 3:
- repeat = PIXMAN_REPEAT_REFLECT;
- break;
-
- default:
- break;
- }
- pixman_image_set_repeat (src_img, repeat);
-
- if (lcg_rand_n (2))
- pixman_image_set_filter (src_img, PIXMAN_FILTER_NEAREST, NULL, 0);
- else
- pixman_image_set_filter (src_img, PIXMAN_FILTER_BILINEAR, NULL, 0);
-
- if (verbose)
- {
- printf ("src_fmt=%08X, dst_fmt=%08X\n", src_fmt, dst_fmt);
- printf ("op=%d, scale_x=%d, scale_y=%d, repeat=%d\n",
- op, scale_x, scale_y, repeat);
- printf ("translate_x=%d, translate_y=%d\n",
- translate_x, translate_y);
- printf ("src_width=%d, src_height=%d, dst_width=%d, dst_height=%d\n",
- src_width, src_height, dst_width, dst_height);
- printf ("src_x=%d, src_y=%d, dst_x=%d, dst_y=%d\n",
- src_x, src_y, dst_x, dst_y);
- printf ("w=%d, h=%d\n", w, h);
- }
-
- if (lcg_rand_n (8) == 0)
- {
- pixman_box16_t clip_boxes[2];
- int n = lcg_rand_n (2) + 1;
-
- for (i = 0; i < n; i++)
- {
- clip_boxes[i].x1 = lcg_rand_n (src_width);
- clip_boxes[i].y1 = lcg_rand_n (src_height);
- clip_boxes[i].x2 =
- clip_boxes[i].x1 + lcg_rand_n (src_width - clip_boxes[i].x1);
- clip_boxes[i].y2 =
- clip_boxes[i].y1 + lcg_rand_n (src_height - clip_boxes[i].y1);
-
- if (verbose)
- {
- printf ("source clip box: [%d,%d-%d,%d]\n",
- clip_boxes[i].x1, clip_boxes[i].y1,
- clip_boxes[i].x2, clip_boxes[i].y2);
- }
- }
-
- pixman_region_init_rects (&clip, clip_boxes, n);
- pixman_image_set_clip_region (src_img, &clip);
- pixman_image_set_source_clipping (src_img, 1);
- pixman_region_fini (&clip);
- }
-
- if (lcg_rand_n (8) == 0)
- {
- pixman_box16_t clip_boxes[2];
- int n = lcg_rand_n (2) + 1;
- for (i = 0; i < n; i++)
- {
- clip_boxes[i].x1 = lcg_rand_n (dst_width);
- clip_boxes[i].y1 = lcg_rand_n (dst_height);
- clip_boxes[i].x2 =
- clip_boxes[i].x1 + lcg_rand_n (dst_width - clip_boxes[i].x1);
- clip_boxes[i].y2 =
- clip_boxes[i].y1 + lcg_rand_n (dst_height - clip_boxes[i].y1);
-
- if (verbose)
- {
- printf ("destination clip box: [%d,%d-%d,%d]\n",
- clip_boxes[i].x1, clip_boxes[i].y1,
- clip_boxes[i].x2, clip_boxes[i].y2);
- }
- }
- pixman_region_init_rects (&clip, clip_boxes, n);
- pixman_image_set_clip_region (dst_img, &clip);
- pixman_region_fini (&clip);
- }
-
- pixman_image_composite (op, src_img, NULL, dst_img,
- src_x, src_y, 0, 0, dst_x, dst_y, w, h);
-
- if (dst_fmt == PIXMAN_x8r8g8b8)
- {
- /* ignore unused part */
- for (i = 0; i < dst_stride * dst_height / 4; i++)
- dstbuf[i] &= 0xFFFFFF;
- }
-
- image_endian_swap (dst_img, dst_bpp * 8);
-
- if (verbose)
- {
- int j;
-
- for (i = 0; i < dst_height; i++)
- {
- for (j = 0; j < dst_stride; j++)
- printf ("%02X ", *((uint8_t *)dstbuf + i * dst_stride + j));
-
- printf ("\n");
- }
- }
-
- pixman_image_unref (src_img);
- pixman_image_unref (dst_img);
-
- crc32 = compute_crc32 (0, dstbuf, dst_stride * dst_height);
- free (srcbuf);
- free (dstbuf);
-
- FLOAT_REGS_CORRUPTION_DETECTOR_FINISH ();
- return crc32;
-}
-
-int
-main (int argc, const char *argv[])
-{
- pixman_disable_out_of_bounds_workaround ();
-
- return fuzzer_test_main("scaling", 8000000, 0x7F1AB59F,
- test_composite, argc, argv);
-}
+/* + * Test program, which can detect some problems with nearest neighbour + * and bilinear scaling in pixman. Testing is done by running lots + * of random SRC and OVER compositing operations a8r8g8b8, x8a8r8g8b8 + * and r5g6b5 color formats. + * + * Script 'fuzzer-find-diff.pl' can be used to narrow down the problem in + * the case of test failure. + */ +#include <assert.h> +#include <stdlib.h> +#include <stdio.h> +#include "utils.h" + +#define MAX_SRC_WIDTH 48 +#define MAX_SRC_HEIGHT 8 +#define MAX_DST_WIDTH 48 +#define MAX_DST_HEIGHT 8 +#define MAX_STRIDE 4 + +/* + * Composite operation with pseudorandom images + */ +uint32_t +test_composite (int testnum, + int verbose) +{ + int i; + pixman_image_t * src_img; + pixman_image_t * mask_img; + pixman_image_t * dst_img; + pixman_transform_t transform; + pixman_region16_t clip; + int src_width, src_height; + int mask_width, mask_height; + int dst_width, dst_height; + int src_stride, mask_stride, dst_stride; + int src_x, src_y; + int mask_x, mask_y; + int dst_x, dst_y; + int src_bpp; + int mask_bpp = 1; + int dst_bpp; + int w, h; + pixman_fixed_t scale_x = 65536, scale_y = 65536; + pixman_fixed_t translate_x = 0, translate_y = 0; + pixman_fixed_t mask_scale_x = 65536, mask_scale_y = 65536; + pixman_fixed_t mask_translate_x = 0, mask_translate_y = 0; + pixman_op_t op; + pixman_repeat_t repeat = PIXMAN_REPEAT_NONE; + pixman_repeat_t mask_repeat = PIXMAN_REPEAT_NONE; + pixman_format_code_t src_fmt, dst_fmt; + uint32_t * srcbuf; + uint32_t * dstbuf; + uint32_t * maskbuf; + uint32_t crc32; + FLOAT_REGS_CORRUPTION_DETECTOR_START (); + + lcg_srand (testnum); + + src_bpp = (lcg_rand_n (2) == 0) ? 2 : 4; + dst_bpp = (lcg_rand_n (2) == 0) ? 2 : 4; + switch (lcg_rand_n (3)) + { + case 0: + op = PIXMAN_OP_SRC; + break; + case 1: + op = PIXMAN_OP_OVER; + break; + default: + op = PIXMAN_OP_ADD; + break; + } + + src_width = lcg_rand_n (MAX_SRC_WIDTH) + 1; + src_height = lcg_rand_n (MAX_SRC_HEIGHT) + 1; + + if (lcg_rand_n (2)) + { + mask_width = lcg_rand_n (MAX_SRC_WIDTH) + 1; + mask_height = lcg_rand_n (MAX_SRC_HEIGHT) + 1; + } + else + { + mask_width = mask_height = 1; + } + + dst_width = lcg_rand_n (MAX_DST_WIDTH) + 1; + dst_height = lcg_rand_n (MAX_DST_HEIGHT) + 1; + src_stride = src_width * src_bpp + lcg_rand_n (MAX_STRIDE) * src_bpp; + mask_stride = mask_width * mask_bpp + lcg_rand_n (MAX_STRIDE) * mask_bpp; + dst_stride = dst_width * dst_bpp + lcg_rand_n (MAX_STRIDE) * dst_bpp; + + if (src_stride & 3) + src_stride += 2; + + if (mask_stride & 1) + mask_stride += 1; + if (mask_stride & 2) + mask_stride += 2; + + if (dst_stride & 3) + dst_stride += 2; + + src_x = -(src_width / 4) + lcg_rand_n (src_width * 3 / 2); + src_y = -(src_height / 4) + lcg_rand_n (src_height * 3 / 2); + mask_x = -(mask_width / 4) + lcg_rand_n (mask_width * 3 / 2); + mask_y = -(mask_height / 4) + lcg_rand_n (mask_height * 3 / 2); + dst_x = -(dst_width / 4) + lcg_rand_n (dst_width * 3 / 2); + dst_y = -(dst_height / 4) + lcg_rand_n (dst_height * 3 / 2); + w = lcg_rand_n (dst_width * 3 / 2 - dst_x); + h = lcg_rand_n (dst_height * 3 / 2 - dst_y); + + srcbuf = (uint32_t *)malloc (src_stride * src_height); + maskbuf = (uint32_t *)malloc (mask_stride * mask_height); + dstbuf = (uint32_t *)malloc (dst_stride * dst_height); + + for (i = 0; i < src_stride * src_height; i++) + *((uint8_t *)srcbuf + i) = lcg_rand_n (256); + + for (i = 0; i < mask_stride * mask_height; i++) + *((uint8_t *)maskbuf + i) = lcg_rand_n (256); + + for (i = 0; i < dst_stride * dst_height; i++) + *((uint8_t *)dstbuf + i) = lcg_rand_n (256); + + src_fmt = src_bpp == 4 ? (lcg_rand_n (2) == 0 ? + PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5; + + dst_fmt = dst_bpp == 4 ? (lcg_rand_n (2) == 0 ? + PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5; + + src_img = pixman_image_create_bits ( + src_fmt, src_width, src_height, srcbuf, src_stride); + + mask_img = pixman_image_create_bits ( + PIXMAN_a8, mask_width, mask_height, maskbuf, mask_stride); + + dst_img = pixman_image_create_bits ( + dst_fmt, dst_width, dst_height, dstbuf, dst_stride); + + image_endian_swap (src_img, src_bpp * 8); + image_endian_swap (dst_img, dst_bpp * 8); + + if (lcg_rand_n (4) > 0) + { + scale_x = -32768 * 3 + lcg_rand_N (65536 * 5); + scale_y = -32768 * 3 + lcg_rand_N (65536 * 5); + translate_x = lcg_rand_N (65536); + translate_y = lcg_rand_N (65536); + pixman_transform_init_scale (&transform, scale_x, scale_y); + pixman_transform_translate (&transform, NULL, translate_x, translate_y); + pixman_image_set_transform (src_img, &transform); + } + + if (lcg_rand_n (2) > 0) + { + mask_scale_x = -32768 * 3 + lcg_rand_N (65536 * 5); + mask_scale_y = -32768 * 3 + lcg_rand_N (65536 * 5); + mask_translate_x = lcg_rand_N (65536); + mask_translate_y = lcg_rand_N (65536); + pixman_transform_init_scale (&transform, mask_scale_x, mask_scale_y); + pixman_transform_translate (&transform, NULL, mask_translate_x, mask_translate_y); + pixman_image_set_transform (mask_img, &transform); + } + + switch (lcg_rand_n (4)) + { + case 0: + mask_repeat = PIXMAN_REPEAT_NONE; + break; + + case 1: + mask_repeat = PIXMAN_REPEAT_NORMAL; + break; + + case 2: + mask_repeat = PIXMAN_REPEAT_PAD; + break; + + case 3: + mask_repeat = PIXMAN_REPEAT_REFLECT; + break; + + default: + break; + } + pixman_image_set_repeat (mask_img, mask_repeat); + + switch (lcg_rand_n (4)) + { + case 0: + repeat = PIXMAN_REPEAT_NONE; + break; + + case 1: + repeat = PIXMAN_REPEAT_NORMAL; + break; + + case 2: + repeat = PIXMAN_REPEAT_PAD; + break; + + case 3: + repeat = PIXMAN_REPEAT_REFLECT; + break; + + default: + break; + } + pixman_image_set_repeat (src_img, repeat); + + if (lcg_rand_n (2)) + pixman_image_set_filter (src_img, PIXMAN_FILTER_NEAREST, NULL, 0); + else + pixman_image_set_filter (src_img, PIXMAN_FILTER_BILINEAR, NULL, 0); + + if (lcg_rand_n (2)) + pixman_image_set_filter (mask_img, PIXMAN_FILTER_NEAREST, NULL, 0); + else + pixman_image_set_filter (mask_img, PIXMAN_FILTER_BILINEAR, NULL, 0); + + if (verbose) + { + printf ("src_fmt=%08X, dst_fmt=%08X\n", src_fmt, dst_fmt); + printf ("op=%d, scale_x=%d, scale_y=%d, repeat=%d\n", + op, scale_x, scale_y, repeat); + printf ("translate_x=%d, translate_y=%d\n", + translate_x, translate_y); + printf ("src_width=%d, src_height=%d, dst_width=%d, dst_height=%d\n", + src_width, src_height, dst_width, dst_height); + printf ("src_x=%d, src_y=%d, dst_x=%d, dst_y=%d\n", + src_x, src_y, dst_x, dst_y); + printf ("w=%d, h=%d\n", w, h); + } + + if (lcg_rand_n (8) == 0) + { + pixman_box16_t clip_boxes[2]; + int n = lcg_rand_n (2) + 1; + + for (i = 0; i < n; i++) + { + clip_boxes[i].x1 = lcg_rand_n (src_width); + clip_boxes[i].y1 = lcg_rand_n (src_height); + clip_boxes[i].x2 = + clip_boxes[i].x1 + lcg_rand_n (src_width - clip_boxes[i].x1); + clip_boxes[i].y2 = + clip_boxes[i].y1 + lcg_rand_n (src_height - clip_boxes[i].y1); + + if (verbose) + { + printf ("source clip box: [%d,%d-%d,%d]\n", + clip_boxes[i].x1, clip_boxes[i].y1, + clip_boxes[i].x2, clip_boxes[i].y2); + } + } + + pixman_region_init_rects (&clip, clip_boxes, n); + pixman_image_set_clip_region (src_img, &clip); + pixman_image_set_source_clipping (src_img, 1); + pixman_region_fini (&clip); + } + + if (lcg_rand_n (8) == 0) + { + pixman_box16_t clip_boxes[2]; + int n = lcg_rand_n (2) + 1; + + for (i = 0; i < n; i++) + { + clip_boxes[i].x1 = lcg_rand_n (mask_width); + clip_boxes[i].y1 = lcg_rand_n (mask_height); + clip_boxes[i].x2 = + clip_boxes[i].x1 + lcg_rand_n (mask_width - clip_boxes[i].x1); + clip_boxes[i].y2 = + clip_boxes[i].y1 + lcg_rand_n (mask_height - clip_boxes[i].y1); + + if (verbose) + { + printf ("mask clip box: [%d,%d-%d,%d]\n", + clip_boxes[i].x1, clip_boxes[i].y1, + clip_boxes[i].x2, clip_boxes[i].y2); + } + } + + pixman_region_init_rects (&clip, clip_boxes, n); + pixman_image_set_clip_region (mask_img, &clip); + pixman_image_set_source_clipping (mask_img, 1); + pixman_region_fini (&clip); + } + + if (lcg_rand_n (8) == 0) + { + pixman_box16_t clip_boxes[2]; + int n = lcg_rand_n (2) + 1; + for (i = 0; i < n; i++) + { + clip_boxes[i].x1 = lcg_rand_n (dst_width); + clip_boxes[i].y1 = lcg_rand_n (dst_height); + clip_boxes[i].x2 = + clip_boxes[i].x1 + lcg_rand_n (dst_width - clip_boxes[i].x1); + clip_boxes[i].y2 = + clip_boxes[i].y1 + lcg_rand_n (dst_height - clip_boxes[i].y1); + + if (verbose) + { + printf ("destination clip box: [%d,%d-%d,%d]\n", + clip_boxes[i].x1, clip_boxes[i].y1, + clip_boxes[i].x2, clip_boxes[i].y2); + } + } + pixman_region_init_rects (&clip, clip_boxes, n); + pixman_image_set_clip_region (dst_img, &clip); + pixman_region_fini (&clip); + } + + if (lcg_rand_n (2) == 0) + pixman_image_composite (op, src_img, NULL, dst_img, + src_x, src_y, 0, 0, dst_x, dst_y, w, h); + else + pixman_image_composite (op, src_img, mask_img, dst_img, + src_x, src_y, mask_x, mask_y, dst_x, dst_y, w, h); + + if (dst_fmt == PIXMAN_x8r8g8b8) + { + /* ignore unused part */ + for (i = 0; i < dst_stride * dst_height / 4; i++) + dstbuf[i] &= 0xFFFFFF; + } + + image_endian_swap (dst_img, dst_bpp * 8); + + if (verbose) + { + int j; + + for (i = 0; i < dst_height; i++) + { + for (j = 0; j < dst_stride; j++) + printf ("%02X ", *((uint8_t *)dstbuf + i * dst_stride + j)); + + printf ("\n"); + } + } + + pixman_image_unref (src_img); + pixman_image_unref (mask_img); + pixman_image_unref (dst_img); + + crc32 = compute_crc32 (0, dstbuf, dst_stride * dst_height); + free (srcbuf); + free (maskbuf); + free (dstbuf); + + FLOAT_REGS_CORRUPTION_DETECTOR_FINISH (); + return crc32; +} + +int +main (int argc, const char *argv[]) +{ + pixman_disable_out_of_bounds_workaround (); + + return fuzzer_test_main("scaling", 8000000, 0x80DF1CB2, + test_composite, argc, argv); +} |