diff options
author | marha <marha@users.sourceforge.net> | 2011-03-13 20:15:26 +0000 |
---|---|---|
committer | marha <marha@users.sourceforge.net> | 2011-03-13 20:15:26 +0000 |
commit | b5d1fd89898edb34f73679b542c754d837d44cf8 (patch) | |
tree | b3d14f22d0c5fd984f5ec1ed71ad5263a46e1583 /pixman | |
parent | 77ec02adbc8f9657e7749b307d3cc86ccbd163ea (diff) | |
download | vcxsrv-b5d1fd89898edb34f73679b542c754d837d44cf8.tar.gz vcxsrv-b5d1fd89898edb34f73679b542c754d837d44cf8.tar.bz2 vcxsrv-b5d1fd89898edb34f73679b542c754d837d44cf8.zip |
xkeyboard-config libxcb pixman mesalib git update 13 Mar 2011
Diffstat (limited to 'pixman')
-rw-r--r-- | pixman/pixman/pixman-arm-common.h | 773 | ||||
-rw-r--r-- | pixman/pixman/pixman-arm-neon-asm.S | 5308 | ||||
-rw-r--r-- | pixman/pixman/pixman-arm-neon-asm.h | 17 | ||||
-rw-r--r-- | pixman/pixman/pixman-arm-neon.c | 946 | ||||
-rw-r--r-- | pixman/pixman/pixman-arm-simd-asm.S | 66 | ||||
-rw-r--r-- | pixman/pixman/pixman-arm-simd.c | 855 |
6 files changed, 4088 insertions, 3877 deletions
diff --git a/pixman/pixman/pixman-arm-common.h b/pixman/pixman/pixman-arm-common.h index 9b1322b6c..2c435041a 100644 --- a/pixman/pixman/pixman-arm-common.h +++ b/pixman/pixman/pixman-arm-common.h @@ -1,364 +1,409 @@ -/* - * Copyright © 2010 Nokia Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - * - * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) - */ - -#ifndef PIXMAN_ARM_COMMON_H -#define PIXMAN_ARM_COMMON_H - -#include "pixman-fast-path.h" - -/* Define some macros which can expand into proxy functions between - * ARM assembly optimized functions and the rest of pixman fast path API. - * - * All the low level ARM assembly functions have to use ARM EABI - * calling convention and take up to 8 arguments: - * width, height, dst, dst_stride, src, src_stride, mask, mask_stride - * - * The arguments are ordered with the most important coming first (the - * first 4 arguments are passed to function in registers, the rest are - * on stack). The last arguments are optional, for example if the - * function is not using mask, then 'mask' and 'mask_stride' can be - * omitted when doing a function call. - * - * Arguments 'src' and 'mask' contain either a pointer to the top left - * pixel of the composited rectangle or a pixel color value depending - * on the function type. In the case of just a color value (solid source - * or mask), the corresponding stride argument is unused. - */ - -#define SKIP_ZERO_SRC 1 -#define SKIP_ZERO_MASK 2 - -#define PIXMAN_ARM_BIND_FAST_PATH_SRC_DST(cputype, name, \ - src_type, src_cnt, \ - dst_type, dst_cnt) \ -void \ -pixman_composite_##name##_asm_##cputype (int32_t w, \ - int32_t h, \ - dst_type *dst, \ - int32_t dst_stride, \ - src_type *src, \ - int32_t src_stride); \ - \ -static void \ -cputype##_composite_##name (pixman_implementation_t *imp, \ - pixman_op_t op, \ - pixman_image_t * src_image, \ - pixman_image_t * mask_image, \ - pixman_image_t * dst_image, \ - int32_t src_x, \ - int32_t src_y, \ - int32_t mask_x, \ - int32_t mask_y, \ - int32_t dest_x, \ - int32_t dest_y, \ - int32_t width, \ - int32_t height) \ -{ \ - dst_type *dst_line; \ - src_type *src_line; \ - int32_t dst_stride, src_stride; \ - \ - PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type, \ - src_stride, src_line, src_cnt); \ - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, dst_type, \ - dst_stride, dst_line, dst_cnt); \ - \ - pixman_composite_##name##_asm_##cputype (width, height, \ - dst_line, dst_stride, \ - src_line, src_stride); \ -} - -#define PIXMAN_ARM_BIND_FAST_PATH_N_DST(flags, cputype, name, \ - dst_type, dst_cnt) \ -void \ -pixman_composite_##name##_asm_##cputype (int32_t w, \ - int32_t h, \ - dst_type *dst, \ - int32_t dst_stride, \ - uint32_t src); \ - \ -static void \ -cputype##_composite_##name (pixman_implementation_t *imp, \ - pixman_op_t op, \ - pixman_image_t * src_image, \ - pixman_image_t * mask_image, \ - pixman_image_t * dst_image, \ - int32_t src_x, \ - int32_t src_y, \ - int32_t mask_x, \ - int32_t mask_y, \ - int32_t dest_x, \ - int32_t dest_y, \ - int32_t width, \ - int32_t height) \ -{ \ - dst_type *dst_line; \ - int32_t dst_stride; \ - uint32_t src; \ - \ - src = _pixman_image_get_solid ( \ - imp, src_image, dst_image->bits.format); \ - \ - if ((flags & SKIP_ZERO_SRC) && src == 0) \ - return; \ - \ - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, dst_type, \ - dst_stride, dst_line, dst_cnt); \ - \ - pixman_composite_##name##_asm_##cputype (width, height, \ - dst_line, dst_stride, \ - src); \ -} - -#define PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST(flags, cputype, name, \ - mask_type, mask_cnt, \ - dst_type, dst_cnt) \ -void \ -pixman_composite_##name##_asm_##cputype (int32_t w, \ - int32_t h, \ - dst_type *dst, \ - int32_t dst_stride, \ - uint32_t src, \ - int32_t unused, \ - mask_type *mask, \ - int32_t mask_stride); \ - \ -static void \ -cputype##_composite_##name (pixman_implementation_t *imp, \ - pixman_op_t op, \ - pixman_image_t * src_image, \ - pixman_image_t * mask_image, \ - pixman_image_t * dst_image, \ - int32_t src_x, \ - int32_t src_y, \ - int32_t mask_x, \ - int32_t mask_y, \ - int32_t dest_x, \ - int32_t dest_y, \ - int32_t width, \ - int32_t height) \ -{ \ - dst_type *dst_line; \ - mask_type *mask_line; \ - int32_t dst_stride, mask_stride; \ - uint32_t src; \ - \ - src = _pixman_image_get_solid ( \ - imp, src_image, dst_image->bits.format); \ - \ - if ((flags & SKIP_ZERO_SRC) && src == 0) \ - return; \ - \ - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, dst_type, \ - dst_stride, dst_line, dst_cnt); \ - PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type, \ - mask_stride, mask_line, mask_cnt); \ - \ - pixman_composite_##name##_asm_##cputype (width, height, \ - dst_line, dst_stride, \ - src, 0, \ - mask_line, mask_stride); \ -} - -#define PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST(flags, cputype, name, \ - src_type, src_cnt, \ - dst_type, dst_cnt) \ -void \ -pixman_composite_##name##_asm_##cputype (int32_t w, \ - int32_t h, \ - dst_type *dst, \ - int32_t dst_stride, \ - src_type *src, \ - int32_t src_stride, \ - uint32_t mask); \ - \ -static void \ -cputype##_composite_##name (pixman_implementation_t *imp, \ - pixman_op_t op, \ - pixman_image_t * src_image, \ - pixman_image_t * mask_image, \ - pixman_image_t * dst_image, \ - int32_t src_x, \ - int32_t src_y, \ - int32_t mask_x, \ - int32_t mask_y, \ - int32_t dest_x, \ - int32_t dest_y, \ - int32_t width, \ - int32_t height) \ -{ \ - dst_type *dst_line; \ - src_type *src_line; \ - int32_t dst_stride, src_stride; \ - uint32_t mask; \ - \ - mask = _pixman_image_get_solid ( \ - imp, mask_image, dst_image->bits.format); \ - \ - if ((flags & SKIP_ZERO_MASK) && mask == 0) \ - return; \ - \ - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, dst_type, \ - dst_stride, dst_line, dst_cnt); \ - PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type, \ - src_stride, src_line, src_cnt); \ - \ - pixman_composite_##name##_asm_##cputype (width, height, \ - dst_line, dst_stride, \ - src_line, src_stride, \ - mask); \ -} - -#define PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST(cputype, name, \ - src_type, src_cnt, \ - mask_type, mask_cnt, \ - dst_type, dst_cnt) \ -void \ -pixman_composite_##name##_asm_##cputype (int32_t w, \ - int32_t h, \ - dst_type *dst, \ - int32_t dst_stride, \ - src_type *src, \ - int32_t src_stride, \ - mask_type *mask, \ - int32_t mask_stride); \ - \ -static void \ -cputype##_composite_##name (pixman_implementation_t *imp, \ - pixman_op_t op, \ - pixman_image_t * src_image, \ - pixman_image_t * mask_image, \ - pixman_image_t * dst_image, \ - int32_t src_x, \ - int32_t src_y, \ - int32_t mask_x, \ - int32_t mask_y, \ - int32_t dest_x, \ - int32_t dest_y, \ - int32_t width, \ - int32_t height) \ -{ \ - dst_type *dst_line; \ - src_type *src_line; \ - mask_type *mask_line; \ - int32_t dst_stride, src_stride, mask_stride; \ - \ - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, dst_type, \ - dst_stride, dst_line, dst_cnt); \ - PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type, \ - src_stride, src_line, src_cnt); \ - PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type, \ - mask_stride, mask_line, mask_cnt); \ - \ - pixman_composite_##name##_asm_##cputype (width, height, \ - dst_line, dst_stride, \ - src_line, src_stride, \ - mask_line, mask_stride); \ -} - -#define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST(cputype, name, op, \ - src_type, dst_type) \ -void \ -pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype ( \ - int32_t w, \ - dst_type * dst, \ - const src_type * src, \ - pixman_fixed_t vx, \ - pixman_fixed_t unit_x); \ - \ -static force_inline void \ -scaled_nearest_scanline_##cputype##_##name##_##op (dst_type * pd, \ - const src_type * ps, \ - int32_t w, \ - pixman_fixed_t vx, \ - pixman_fixed_t unit_x, \ - pixman_fixed_t max_vx, \ - pixman_bool_t zero_src) \ -{ \ - pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps, \ - vx, unit_x);\ -} \ - \ -FAST_NEAREST_MAINLOOP (cputype##_##name##_cover_##op, \ - scaled_nearest_scanline_##cputype##_##name##_##op, \ - src_type, dst_type, COVER) \ -FAST_NEAREST_MAINLOOP (cputype##_##name##_none_##op, \ - scaled_nearest_scanline_##cputype##_##name##_##op, \ - src_type, dst_type, NONE) \ -FAST_NEAREST_MAINLOOP (cputype##_##name##_pad_##op, \ - scaled_nearest_scanline_##cputype##_##name##_##op, \ - src_type, dst_type, PAD) - -/* Provide entries for the fast path table */ -#define PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH(op,s,d,func) \ - SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func), \ - SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func), \ - SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func) - -#define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST(flags, cputype, name, op, \ - src_type, dst_type) \ -void \ -pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype ( \ - int32_t w, \ - dst_type * dst, \ - const src_type * src, \ - pixman_fixed_t vx, \ - pixman_fixed_t unit_x, \ - const uint8_t * mask); \ - \ -static force_inline void \ -scaled_nearest_scanline_##cputype##_##name##_##op (const uint8_t * mask, \ - dst_type * pd, \ - const src_type * ps, \ - int32_t w, \ - pixman_fixed_t vx, \ - pixman_fixed_t unit_x, \ - pixman_fixed_t max_vx, \ - pixman_bool_t zero_src) \ -{ \ - if ((flags & SKIP_ZERO_SRC) && zero_src) \ - return; \ - pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps, \ - vx, unit_x, \ - mask); \ -} \ - \ -FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_cover_##op, \ - scaled_nearest_scanline_##cputype##_##name##_##op,\ - src_type, uint8_t, dst_type, COVER, TRUE, FALSE)\ -FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_none_##op, \ - scaled_nearest_scanline_##cputype##_##name##_##op,\ - src_type, uint8_t, dst_type, NONE, TRUE, FALSE) \ -FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_pad_##op, \ - scaled_nearest_scanline_##cputype##_##name##_##op,\ - src_type, uint8_t, dst_type, PAD, TRUE, FALSE) - -/* Provide entries for the fast path table */ -#define PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func) \ - SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func), \ - SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func), \ - SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func) - -#endif +/*
+ * Copyright © 2010 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+#ifndef PIXMAN_ARM_COMMON_H
+#define PIXMAN_ARM_COMMON_H
+
+#include "pixman-fast-path.h"
+
+/* Define some macros which can expand into proxy functions between
+ * ARM assembly optimized functions and the rest of pixman fast path API.
+ *
+ * All the low level ARM assembly functions have to use ARM EABI
+ * calling convention and take up to 8 arguments:
+ * width, height, dst, dst_stride, src, src_stride, mask, mask_stride
+ *
+ * The arguments are ordered with the most important coming first (the
+ * first 4 arguments are passed to function in registers, the rest are
+ * on stack). The last arguments are optional, for example if the
+ * function is not using mask, then 'mask' and 'mask_stride' can be
+ * omitted when doing a function call.
+ *
+ * Arguments 'src' and 'mask' contain either a pointer to the top left
+ * pixel of the composited rectangle or a pixel color value depending
+ * on the function type. In the case of just a color value (solid source
+ * or mask), the corresponding stride argument is unused.
+ */
+
+#define SKIP_ZERO_SRC 1
+#define SKIP_ZERO_MASK 2
+
+#define PIXMAN_ARM_BIND_FAST_PATH_SRC_DST(cputype, name, \
+ src_type, src_cnt, \
+ dst_type, dst_cnt) \
+void \
+pixman_composite_##name##_asm_##cputype (int32_t w, \
+ int32_t h, \
+ dst_type *dst, \
+ int32_t dst_stride, \
+ src_type *src, \
+ int32_t src_stride); \
+ \
+static void \
+cputype##_composite_##name (pixman_implementation_t *imp, \
+ pixman_op_t op, \
+ pixman_image_t * src_image, \
+ pixman_image_t * mask_image, \
+ pixman_image_t * dst_image, \
+ int32_t src_x, \
+ int32_t src_y, \
+ int32_t mask_x, \
+ int32_t mask_y, \
+ int32_t dest_x, \
+ int32_t dest_y, \
+ int32_t width, \
+ int32_t height) \
+{ \
+ dst_type *dst_line; \
+ src_type *src_line; \
+ int32_t dst_stride, src_stride; \
+ \
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type, \
+ src_stride, src_line, src_cnt); \
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, dst_type, \
+ dst_stride, dst_line, dst_cnt); \
+ \
+ pixman_composite_##name##_asm_##cputype (width, height, \
+ dst_line, dst_stride, \
+ src_line, src_stride); \
+}
+
+#define PIXMAN_ARM_BIND_FAST_PATH_N_DST(flags, cputype, name, \
+ dst_type, dst_cnt) \
+void \
+pixman_composite_##name##_asm_##cputype (int32_t w, \
+ int32_t h, \
+ dst_type *dst, \
+ int32_t dst_stride, \
+ uint32_t src); \
+ \
+static void \
+cputype##_composite_##name (pixman_implementation_t *imp, \
+ pixman_op_t op, \
+ pixman_image_t * src_image, \
+ pixman_image_t * mask_image, \
+ pixman_image_t * dst_image, \
+ int32_t src_x, \
+ int32_t src_y, \
+ int32_t mask_x, \
+ int32_t mask_y, \
+ int32_t dest_x, \
+ int32_t dest_y, \
+ int32_t width, \
+ int32_t height) \
+{ \
+ dst_type *dst_line; \
+ int32_t dst_stride; \
+ uint32_t src; \
+ \
+ src = _pixman_image_get_solid ( \
+ imp, src_image, dst_image->bits.format); \
+ \
+ if ((flags & SKIP_ZERO_SRC) && src == 0) \
+ return; \
+ \
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, dst_type, \
+ dst_stride, dst_line, dst_cnt); \
+ \
+ pixman_composite_##name##_asm_##cputype (width, height, \
+ dst_line, dst_stride, \
+ src); \
+}
+
+#define PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST(flags, cputype, name, \
+ mask_type, mask_cnt, \
+ dst_type, dst_cnt) \
+void \
+pixman_composite_##name##_asm_##cputype (int32_t w, \
+ int32_t h, \
+ dst_type *dst, \
+ int32_t dst_stride, \
+ uint32_t src, \
+ int32_t unused, \
+ mask_type *mask, \
+ int32_t mask_stride); \
+ \
+static void \
+cputype##_composite_##name (pixman_implementation_t *imp, \
+ pixman_op_t op, \
+ pixman_image_t * src_image, \
+ pixman_image_t * mask_image, \
+ pixman_image_t * dst_image, \
+ int32_t src_x, \
+ int32_t src_y, \
+ int32_t mask_x, \
+ int32_t mask_y, \
+ int32_t dest_x, \
+ int32_t dest_y, \
+ int32_t width, \
+ int32_t height) \
+{ \
+ dst_type *dst_line; \
+ mask_type *mask_line; \
+ int32_t dst_stride, mask_stride; \
+ uint32_t src; \
+ \
+ src = _pixman_image_get_solid ( \
+ imp, src_image, dst_image->bits.format); \
+ \
+ if ((flags & SKIP_ZERO_SRC) && src == 0) \
+ return; \
+ \
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, dst_type, \
+ dst_stride, dst_line, dst_cnt); \
+ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type, \
+ mask_stride, mask_line, mask_cnt); \
+ \
+ pixman_composite_##name##_asm_##cputype (width, height, \
+ dst_line, dst_stride, \
+ src, 0, \
+ mask_line, mask_stride); \
+}
+
+#define PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST(flags, cputype, name, \
+ src_type, src_cnt, \
+ dst_type, dst_cnt) \
+void \
+pixman_composite_##name##_asm_##cputype (int32_t w, \
+ int32_t h, \
+ dst_type *dst, \
+ int32_t dst_stride, \
+ src_type *src, \
+ int32_t src_stride, \
+ uint32_t mask); \
+ \
+static void \
+cputype##_composite_##name (pixman_implementation_t *imp, \
+ pixman_op_t op, \
+ pixman_image_t * src_image, \
+ pixman_image_t * mask_image, \
+ pixman_image_t * dst_image, \
+ int32_t src_x, \
+ int32_t src_y, \
+ int32_t mask_x, \
+ int32_t mask_y, \
+ int32_t dest_x, \
+ int32_t dest_y, \
+ int32_t width, \
+ int32_t height) \
+{ \
+ dst_type *dst_line; \
+ src_type *src_line; \
+ int32_t dst_stride, src_stride; \
+ uint32_t mask; \
+ \
+ mask = _pixman_image_get_solid ( \
+ imp, mask_image, dst_image->bits.format); \
+ \
+ if ((flags & SKIP_ZERO_MASK) && mask == 0) \
+ return; \
+ \
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, dst_type, \
+ dst_stride, dst_line, dst_cnt); \
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type, \
+ src_stride, src_line, src_cnt); \
+ \
+ pixman_composite_##name##_asm_##cputype (width, height, \
+ dst_line, dst_stride, \
+ src_line, src_stride, \
+ mask); \
+}
+
+#define PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST(cputype, name, \
+ src_type, src_cnt, \
+ mask_type, mask_cnt, \
+ dst_type, dst_cnt) \
+void \
+pixman_composite_##name##_asm_##cputype (int32_t w, \
+ int32_t h, \
+ dst_type *dst, \
+ int32_t dst_stride, \
+ src_type *src, \
+ int32_t src_stride, \
+ mask_type *mask, \
+ int32_t mask_stride); \
+ \
+static void \
+cputype##_composite_##name (pixman_implementation_t *imp, \
+ pixman_op_t op, \
+ pixman_image_t * src_image, \
+ pixman_image_t * mask_image, \
+ pixman_image_t * dst_image, \
+ int32_t src_x, \
+ int32_t src_y, \
+ int32_t mask_x, \
+ int32_t mask_y, \
+ int32_t dest_x, \
+ int32_t dest_y, \
+ int32_t width, \
+ int32_t height) \
+{ \
+ dst_type *dst_line; \
+ src_type *src_line; \
+ mask_type *mask_line; \
+ int32_t dst_stride, src_stride, mask_stride; \
+ \
+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, dst_type, \
+ dst_stride, dst_line, dst_cnt); \
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type, \
+ src_stride, src_line, src_cnt); \
+ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type, \
+ mask_stride, mask_line, mask_cnt); \
+ \
+ pixman_composite_##name##_asm_##cputype (width, height, \
+ dst_line, dst_stride, \
+ src_line, src_stride, \
+ mask_line, mask_stride); \
+}
+
+#define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST(cputype, name, op, \
+ src_type, dst_type) \
+void \
+pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype ( \
+ int32_t w, \
+ dst_type * dst, \
+ const src_type * src, \
+ pixman_fixed_t vx, \
+ pixman_fixed_t unit_x); \
+ \
+static force_inline void \
+scaled_nearest_scanline_##cputype##_##name##_##op (dst_type * pd, \
+ const src_type * ps, \
+ int32_t w, \
+ pixman_fixed_t vx, \
+ pixman_fixed_t unit_x, \
+ pixman_fixed_t max_vx, \
+ pixman_bool_t zero_src) \
+{ \
+ pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps, \
+ vx, unit_x);\
+} \
+ \
+FAST_NEAREST_MAINLOOP (cputype##_##name##_cover_##op, \
+ scaled_nearest_scanline_##cputype##_##name##_##op, \
+ src_type, dst_type, COVER) \
+FAST_NEAREST_MAINLOOP (cputype##_##name##_none_##op, \
+ scaled_nearest_scanline_##cputype##_##name##_##op, \
+ src_type, dst_type, NONE) \
+FAST_NEAREST_MAINLOOP (cputype##_##name##_pad_##op, \
+ scaled_nearest_scanline_##cputype##_##name##_##op, \
+ src_type, dst_type, PAD)
+
+/* Provide entries for the fast path table */
+#define PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH(op,s,d,func) \
+ SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func), \
+ SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func), \
+ SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func)
+
+#define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST(flags, cputype, name, op, \
+ src_type, dst_type) \
+void \
+pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype ( \
+ int32_t w, \
+ dst_type * dst, \
+ const src_type * src, \
+ pixman_fixed_t vx, \
+ pixman_fixed_t unit_x, \
+ const uint8_t * mask); \
+ \
+static force_inline void \
+scaled_nearest_scanline_##cputype##_##name##_##op (const uint8_t * mask, \
+ dst_type * pd, \
+ const src_type * ps, \
+ int32_t w, \
+ pixman_fixed_t vx, \
+ pixman_fixed_t unit_x, \
+ pixman_fixed_t max_vx, \
+ pixman_bool_t zero_src) \
+{ \
+ if ((flags & SKIP_ZERO_SRC) && zero_src) \
+ return; \
+ pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps, \
+ vx, unit_x, \
+ mask); \
+} \
+ \
+FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_cover_##op, \
+ scaled_nearest_scanline_##cputype##_##name##_##op,\
+ src_type, uint8_t, dst_type, COVER, TRUE, FALSE)\
+FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_none_##op, \
+ scaled_nearest_scanline_##cputype##_##name##_##op,\
+ src_type, uint8_t, dst_type, NONE, TRUE, FALSE) \
+FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_pad_##op, \
+ scaled_nearest_scanline_##cputype##_##name##_##op,\
+ src_type, uint8_t, dst_type, PAD, TRUE, FALSE)
+
+/* Provide entries for the fast path table */
+#define PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func) \
+ SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func), \
+ SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func), \
+ SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
+
+/*****************************************************************************/
+
+#define PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST(flags, cputype, name, op, \
+ src_type, dst_type) \
+void \
+pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype ( \
+ dst_type * dst, \
+ const src_type * top, \
+ const src_type * bottom, \
+ int wt, \
+ int wb, \
+ pixman_fixed_t x, \
+ pixman_fixed_t ux, \
+ int width); \
+ \
+static force_inline void \
+scaled_bilinear_scanline_##cputype##_##name##_##op ( \
+ dst_type * dst, \
+ const uint32_t * mask, \
+ const src_type * src_top, \
+ const src_type * src_bottom, \
+ int32_t w, \
+ int wt, \
+ int wb, \
+ pixman_fixed_t vx, \
+ pixman_fixed_t unit_x, \
+ pixman_fixed_t max_vx, \
+ pixman_bool_t zero_src) \
+{ \
+ if ((flags & SKIP_ZERO_SRC) && zero_src) \
+ return; \
+ pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype ( \
+ dst, src_top, src_bottom, wt, wb, vx, unit_x, w); \
+} \
+ \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op, \
+ scaled_bilinear_scanline_##cputype##_##name##_##op, \
+ src_type, uint32_t, dst_type, COVER, FALSE, FALSE) \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op, \
+ scaled_bilinear_scanline_##cputype##_##name##_##op, \
+ src_type, uint32_t, dst_type, NONE, FALSE, FALSE) \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op, \
+ scaled_bilinear_scanline_##cputype##_##name##_##op, \
+ src_type, uint32_t, dst_type, PAD, FALSE, FALSE)
+
+#endif
diff --git a/pixman/pixman/pixman-arm-neon-asm.S b/pixman/pixman/pixman-arm-neon-asm.S index c168e10c2..6cca0f2cc 100644 --- a/pixman/pixman/pixman-arm-neon-asm.S +++ b/pixman/pixman/pixman-arm-neon-asm.S @@ -1,2590 +1,2718 @@ -/* - * Copyright © 2009 Nokia Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - * - * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) - */ - -/* - * This file contains implementations of NEON optimized pixel processing - * functions. There is no full and detailed tutorial, but some functions - * (those which are exposing some new or interesting features) are - * extensively commented and can be used as examples. - * - * You may want to have a look at the comments for following functions: - * - pixman_composite_over_8888_0565_asm_neon - * - pixman_composite_over_n_8_0565_asm_neon - */ - -/* Prevent the stack from becoming executable for no reason... */ -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - - .text - .fpu neon - .arch armv7a - .object_arch armv4 - .eabi_attribute 10, 0 /* suppress Tag_FP_arch */ - .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */ - .arm - .altmacro - -#include "pixman-arm-neon-asm.h" - -/* Global configuration options and preferences */ - -/* - * The code can optionally make use of unaligned memory accesses to improve - * performance of handling leading/trailing pixels for each scanline. - * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for - * example in linux if unaligned memory accesses are not configured to - * generate.exceptions. - */ -.set RESPECT_STRICT_ALIGNMENT, 1 - -/* - * Set default prefetch type. There is a choice between the following options: - * - * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work - * as NOP to workaround some HW bugs or for whatever other reason) - * - * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where - * advanced prefetch intruduces heavy overhead) - * - * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8 - * which can run ARM and NEON instructions simultaneously so that extra ARM - * instructions do not add (many) extra cycles, but improve prefetch efficiency) - * - * Note: some types of function can't support advanced prefetch and fallback - * to simple one (those which handle 24bpp pixels) - */ -.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED - -/* Prefetch distance in pixels for simple prefetch */ -.set PREFETCH_DISTANCE_SIMPLE, 64 - -/* - * Implementation of pixman_composite_over_8888_0565_asm_neon - * - * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and - * performs OVER compositing operation. Function fast_composite_over_8888_0565 - * from pixman-fast-path.c does the same in C and can be used as a reference. - * - * First we need to have some NEON assembly code which can do the actual - * operation on the pixels and provide it to the template macro. - * - * Template macro quite conveniently takes care of emitting all the necessary - * code for memory reading and writing (including quite tricky cases of - * handling unaligned leading/trailing pixels), so we only need to deal with - * the data in NEON registers. - * - * NEON registers allocation in general is recommented to be the following: - * d0, d1, d2, d3 - contain loaded source pixel data - * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed) - * d24, d25, d26, d27 - contain loading mask pixel data (if mask is used) - * d28, d29, d30, d31 - place for storing the result (destination pixels) - * - * As can be seen above, four 64-bit NEON registers are used for keeping - * intermediate pixel data and up to 8 pixels can be processed in one step - * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp). - * - * This particular function uses the following registers allocation: - * d0, d1, d2, d3 - contain loaded source pixel data - * d4, d5 - contain loaded destination pixels (they are needed) - * d28, d29 - place for storing the result (destination pixels) - */ - -/* - * Step one. We need to have some code to do some arithmetics on pixel data. - * This is implemented as a pair of macros: '*_head' and '*_tail'. When used - * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5}, - * perform all the needed calculations and write the result to {d28, d29}. - * The rationale for having two macros and not just one will be explained - * later. In practice, any single monolitic function which does the work can - * be split into two parts in any arbitrary way without affecting correctness. - * - * There is one special trick here too. Common template macro can optionally - * make our life a bit easier by doing R, G, B, A color components - * deinterleaving for 32bpp pixel formats (and this feature is used in - * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that - * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we - * actually use d0 register for blue channel (a vector of eight 8-bit - * values), d1 register for green, d2 for red and d3 for alpha. This - * simple conversion can be also done with a few NEON instructions: - * - * Packed to planar conversion: - * vuzp.8 d0, d1 - * vuzp.8 d2, d3 - * vuzp.8 d1, d3 - * vuzp.8 d0, d2 - * - * Planar to packed conversion: - * vzip.8 d0, d2 - * vzip.8 d1, d3 - * vzip.8 d2, d3 - * vzip.8 d0, d1 - * - * But pixel can be loaded directly in planar format using VLD4.8 NEON - * instruction. It is 1 cycle slower than VLD1.32, so this is not always - * desirable, that's why deinterleaving is optional. - * - * But anyway, here is the code: - */ -.macro pixman_composite_over_8888_0565_process_pixblock_head - /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format - and put data into d6 - red, d7 - green, d30 - blue */ - vshrn.u16 d6, q2, #8 - vshrn.u16 d7, q2, #3 - vsli.u16 q2, q2, #5 - vsri.u8 d6, d6, #5 - vmvn.8 d3, d3 /* invert source alpha */ - vsri.u8 d7, d7, #6 - vshrn.u16 d30, q2, #2 - /* now do alpha blending, storing results in 8-bit planar format - into d16 - red, d19 - green, d18 - blue */ - vmull.u8 q10, d3, d6 - vmull.u8 q11, d3, d7 - vmull.u8 q12, d3, d30 - vrshr.u16 q13, q10, #8 - vrshr.u16 q3, q11, #8 - vrshr.u16 q15, q12, #8 - vraddhn.u16 d20, q10, q13 - vraddhn.u16 d23, q11, q3 - vraddhn.u16 d22, q12, q15 -.endm - -.macro pixman_composite_over_8888_0565_process_pixblock_tail - /* ... continue alpha blending */ - vqadd.u8 d16, d2, d20 - vqadd.u8 q9, q0, q11 - /* convert the result to r5g6b5 and store it into {d28, d29} */ - vshll.u8 q14, d16, #8 - vshll.u8 q8, d19, #8 - vshll.u8 q9, d18, #8 - vsri.u16 q14, q8, #5 - vsri.u16 q14, q9, #11 -.endm - -/* - * OK, now we got almost everything that we need. Using the above two - * macros, the work can be done right. But now we want to optimize - * it a bit. ARM Cortex-A8 is an in-order core, and benefits really - * a lot from good code scheduling and software pipelining. - * - * Let's construct some code, which will run in the core main loop. - * Some pseudo-code of the main loop will look like this: - * head - * while (...) { - * tail - * head - * } - * tail - * - * It may look a bit weird, but this setup allows to hide instruction - * latencies better and also utilize dual-issue capability more - * efficiently (make pairs of load-store and ALU instructions). - * - * So what we need now is a '*_tail_head' macro, which will be used - * in the core main loop. A trivial straightforward implementation - * of this macro would look like this: - * - * pixman_composite_over_8888_0565_process_pixblock_tail - * vst1.16 {d28, d29}, [DST_W, :128]! - * vld1.16 {d4, d5}, [DST_R, :128]! - * vld4.32 {d0, d1, d2, d3}, [SRC]! - * pixman_composite_over_8888_0565_process_pixblock_head - * cache_preload 8, 8 - * - * Now it also got some VLD/VST instructions. We simply can't move from - * processing one block of pixels to the other one with just arithmetics. - * The previously processed data needs to be written to memory and new - * data needs to be fetched. Fortunately, this main loop does not deal - * with partial leading/trailing pixels and can load/store a full block - * of pixels in a bulk. Additionally, destination buffer is already - * 16 bytes aligned here (which is good for performance). - * - * New things here are DST_R, DST_W, SRC and MASK identifiers. These - * are the aliases for ARM registers which are used as pointers for - * accessing data. We maintain separate pointers for reading and writing - * destination buffer (DST_R and DST_W). - * - * Another new thing is 'cache_preload' macro. It is used for prefetching - * data into CPU L2 cache and improve performance when dealing with large - * images which are far larger than cache size. It uses one argument - * (actually two, but they need to be the same here) - number of pixels - * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some - * details about this macro. Moreover, if good performance is needed - * the code from this macro needs to be copied into '*_tail_head' macro - * and mixed with the rest of code for optimal instructions scheduling. - * We are actually doing it below. - * - * Now after all the explanations, here is the optimized code. - * Different instruction streams (originaling from '*_head', '*_tail' - * and 'cache_preload' macro) use different indentation levels for - * better readability. Actually taking the code from one of these - * indentation levels and ignoring a few VLD/VST instructions would - * result in exactly the code from '*_head', '*_tail' or 'cache_preload' - * macro! - */ - -#if 1 - -.macro pixman_composite_over_8888_0565_process_pixblock_tail_head - vqadd.u8 d16, d2, d20 - vld1.16 {d4, d5}, [DST_R, :128]! - vqadd.u8 q9, q0, q11 - vshrn.u16 d6, q2, #8 - fetch_src_pixblock - vshrn.u16 d7, q2, #3 - vsli.u16 q2, q2, #5 - vshll.u8 q14, d16, #8 - PF add PF_X, PF_X, #8 - vshll.u8 q8, d19, #8 - PF tst PF_CTL, #0xF - vsri.u8 d6, d6, #5 - PF addne PF_X, PF_X, #8 - vmvn.8 d3, d3 - PF subne PF_CTL, PF_CTL, #1 - vsri.u8 d7, d7, #6 - vshrn.u16 d30, q2, #2 - vmull.u8 q10, d3, d6 - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - vmull.u8 q11, d3, d7 - vmull.u8 q12, d3, d30 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vsri.u16 q14, q8, #5 - PF cmp PF_X, ORIG_W - vshll.u8 q9, d18, #8 - vrshr.u16 q13, q10, #8 - PF subge PF_X, PF_X, ORIG_W - vrshr.u16 q3, q11, #8 - vrshr.u16 q15, q12, #8 - PF subges PF_CTL, PF_CTL, #0x10 - vsri.u16 q14, q9, #11 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - vraddhn.u16 d20, q10, q13 - vraddhn.u16 d23, q11, q3 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vraddhn.u16 d22, q12, q15 - vst1.16 {d28, d29}, [DST_W, :128]! -.endm - -#else - -/* If we did not care much about the performance, we would just use this... */ -.macro pixman_composite_over_8888_0565_process_pixblock_tail_head - pixman_composite_over_8888_0565_process_pixblock_tail - vst1.16 {d28, d29}, [DST_W, :128]! - vld1.16 {d4, d5}, [DST_R, :128]! - fetch_src_pixblock - pixman_composite_over_8888_0565_process_pixblock_head - cache_preload 8, 8 -.endm - -#endif - -/* - * And now the final part. We are using 'generate_composite_function' macro - * to put all the stuff together. We are specifying the name of the function - * which we want to get, number of bits per pixel for the source, mask and - * destination (0 if unused, like mask in this case). Next come some bit - * flags: - * FLAG_DST_READWRITE - tells that the destination buffer is both read - * and written, for write-only buffer we would use - * FLAG_DST_WRITEONLY flag instead - * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data - * and separate color channels for 32bpp format. - * The next things are: - * - the number of pixels processed per iteration (8 in this case, because - * that's the maximum what can fit into four 64-bit NEON registers). - * - prefetch distance, measured in pixel blocks. In this case it is 5 times - * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal - * prefetch distance can be selected by running some benchmarks. - * - * After that we specify some macros, these are 'default_init', - * 'default_cleanup' here which are empty (but it is possible to have custom - * init/cleanup macros to be able to save/restore some extra NEON registers - * like d8-d15 or do anything else) followed by - * 'pixman_composite_over_8888_0565_process_pixblock_head', - * 'pixman_composite_over_8888_0565_process_pixblock_tail' and - * 'pixman_composite_over_8888_0565_process_pixblock_tail_head' - * which we got implemented above. - * - * The last part is the NEON registers allocation scheme. - */ -generate_composite_function \ - pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_over_8888_0565_process_pixblock_head, \ - pixman_composite_over_8888_0565_process_pixblock_tail, \ - pixman_composite_over_8888_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 24 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_over_n_0565_process_pixblock_head - /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format - and put data into d6 - red, d7 - green, d30 - blue */ - vshrn.u16 d6, q2, #8 - vshrn.u16 d7, q2, #3 - vsli.u16 q2, q2, #5 - vsri.u8 d6, d6, #5 - vsri.u8 d7, d7, #6 - vshrn.u16 d30, q2, #2 - /* now do alpha blending, storing results in 8-bit planar format - into d16 - red, d19 - green, d18 - blue */ - vmull.u8 q10, d3, d6 - vmull.u8 q11, d3, d7 - vmull.u8 q12, d3, d30 - vrshr.u16 q13, q10, #8 - vrshr.u16 q3, q11, #8 - vrshr.u16 q15, q12, #8 - vraddhn.u16 d20, q10, q13 - vraddhn.u16 d23, q11, q3 - vraddhn.u16 d22, q12, q15 -.endm - -.macro pixman_composite_over_n_0565_process_pixblock_tail - /* ... continue alpha blending */ - vqadd.u8 d16, d2, d20 - vqadd.u8 q9, q0, q11 - /* convert the result to r5g6b5 and store it into {d28, d29} */ - vshll.u8 q14, d16, #8 - vshll.u8 q8, d19, #8 - vshll.u8 q9, d18, #8 - vsri.u16 q14, q8, #5 - vsri.u16 q14, q9, #11 -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_n_0565_process_pixblock_tail_head - pixman_composite_over_n_0565_process_pixblock_tail - vld1.16 {d4, d5}, [DST_R, :128]! - vst1.16 {d28, d29}, [DST_W, :128]! - pixman_composite_over_n_0565_process_pixblock_head - cache_preload 8, 8 -.endm - -.macro pixman_composite_over_n_0565_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d3[0]}, [DUMMY] - vdup.8 d0, d3[0] - vdup.8 d1, d3[1] - vdup.8 d2, d3[2] - vdup.8 d3, d3[3] - vmvn.8 d3, d3 /* invert source alpha */ -.endm - -generate_composite_function \ - pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \ - FLAG_DST_READWRITE, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_n_0565_init, \ - default_cleanup, \ - pixman_composite_over_n_0565_process_pixblock_head, \ - pixman_composite_over_n_0565_process_pixblock_tail, \ - pixman_composite_over_n_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 24 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_8888_0565_process_pixblock_head - vshll.u8 q8, d1, #8 - vshll.u8 q14, d2, #8 - vshll.u8 q9, d0, #8 -.endm - -.macro pixman_composite_src_8888_0565_process_pixblock_tail - vsri.u16 q14, q8, #5 - vsri.u16 q14, q9, #11 -.endm - -.macro pixman_composite_src_8888_0565_process_pixblock_tail_head - vsri.u16 q14, q8, #5 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF - fetch_src_pixblock - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vsri.u16 q14, q9, #11 - PF cmp PF_X, ORIG_W - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - vshll.u8 q8, d1, #8 - vst1.16 {d28, d29}, [DST_W, :128]! - PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 - vshll.u8 q14, d2, #8 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - vshll.u8 q9, d0, #8 -.endm - -generate_composite_function \ - pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \ - FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_8888_0565_process_pixblock_head, \ - pixman_composite_src_8888_0565_process_pixblock_tail, \ - pixman_composite_src_8888_0565_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_src_0565_8888_process_pixblock_head - vshrn.u16 d30, q0, #8 - vshrn.u16 d29, q0, #3 - vsli.u16 q0, q0, #5 - vmov.u8 d31, #255 - vsri.u8 d30, d30, #5 - vsri.u8 d29, d29, #6 - vshrn.u16 d28, q0, #2 -.endm - -.macro pixman_composite_src_0565_8888_process_pixblock_tail -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_src_0565_8888_process_pixblock_tail_head - pixman_composite_src_0565_8888_process_pixblock_tail - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - fetch_src_pixblock - pixman_composite_src_0565_8888_process_pixblock_head - cache_preload 8, 8 -.endm - -generate_composite_function \ - pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \ - FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_0565_8888_process_pixblock_head, \ - pixman_composite_src_0565_8888_process_pixblock_tail, \ - pixman_composite_src_0565_8888_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_add_8_8_process_pixblock_head - vqadd.u8 q14, q0, q2 - vqadd.u8 q15, q1, q3 -.endm - -.macro pixman_composite_add_8_8_process_pixblock_tail -.endm - -.macro pixman_composite_add_8_8_process_pixblock_tail_head - fetch_src_pixblock - PF add PF_X, PF_X, #32 - PF tst PF_CTL, #0xF - vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! - PF addne PF_X, PF_X, #32 - PF subne PF_CTL, PF_CTL, #1 - vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF cmp PF_X, ORIG_W - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 - vqadd.u8 q14, q0, q2 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vqadd.u8 q15, q1, q3 -.endm - -generate_composite_function \ - pixman_composite_add_8_8_asm_neon, 8, 0, 8, \ - FLAG_DST_READWRITE, \ - 32, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_add_8_8_process_pixblock_head, \ - pixman_composite_add_8_8_process_pixblock_tail, \ - pixman_composite_add_8_8_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_add_8888_8888_process_pixblock_tail_head - fetch_src_pixblock - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF - vld1.32 {d4, d5, d6, d7}, [DST_R, :128]! - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vst1.32 {d28, d29, d30, d31}, [DST_W, :128]! - PF cmp PF_X, ORIG_W - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 - vqadd.u8 q14, q0, q2 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vqadd.u8 q15, q1, q3 -.endm - -generate_composite_function \ - pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_add_8_8_process_pixblock_head, \ - pixman_composite_add_8_8_process_pixblock_tail, \ - pixman_composite_add_8888_8888_process_pixblock_tail_head - -generate_composite_function_single_scanline \ - pixman_composite_scanline_add_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE, \ - 8, /* number of pixels, processed in a single block */ \ - default_init, \ - default_cleanup, \ - pixman_composite_add_8_8_process_pixblock_head, \ - pixman_composite_add_8_8_process_pixblock_tail, \ - pixman_composite_add_8888_8888_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head - vmvn.8 d24, d3 /* get inverted alpha */ - /* do alpha blending */ - vmull.u8 q8, d24, d4 - vmull.u8 q9, d24, d5 - vmull.u8 q10, d24, d6 - vmull.u8 q11, d24, d7 -.endm - -.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 -.endm - -.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - vrshr.u16 q14, q8, #8 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - PF cmp PF_X, ORIG_W - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 - fetch_src_pixblock - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - vmvn.8 d22, d3 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF subge PF_X, PF_X, ORIG_W - vmull.u8 q8, d22, d4 - PF subges PF_CTL, PF_CTL, #0x10 - vmull.u8 q9, d22, d5 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - vmull.u8 q10, d22, d6 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vmull.u8 q11, d22, d7 -.endm - -generate_composite_function_single_scanline \ - pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init, \ - default_cleanup, \ - pixman_composite_out_reverse_8888_8888_process_pixblock_head, \ - pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \ - pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_over_8888_8888_process_pixblock_head - pixman_composite_out_reverse_8888_8888_process_pixblock_head -.endm - -.macro pixman_composite_over_8888_8888_process_pixblock_tail - pixman_composite_out_reverse_8888_8888_process_pixblock_tail - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 -.endm - -.macro pixman_composite_over_8888_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - vrshr.u16 q14, q8, #8 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - PF cmp PF_X, ORIG_W - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 - fetch_src_pixblock - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - vmvn.8 d22, d3 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF subge PF_X, PF_X, ORIG_W - vmull.u8 q8, d22, d4 - PF subges PF_CTL, PF_CTL, #0x10 - vmull.u8 q9, d22, d5 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - vmull.u8 q10, d22, d6 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vmull.u8 q11, d22, d7 -.endm - -generate_composite_function \ - pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_over_8888_8888_process_pixblock_head, \ - pixman_composite_over_8888_8888_process_pixblock_tail, \ - pixman_composite_over_8888_8888_process_pixblock_tail_head - -generate_composite_function_single_scanline \ - pixman_composite_scanline_over_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init, \ - default_cleanup, \ - pixman_composite_over_8888_8888_process_pixblock_head, \ - pixman_composite_over_8888_8888_process_pixblock_tail, \ - pixman_composite_over_8888_8888_process_pixblock_tail_head - -/******************************************************************************/ - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_n_8888_process_pixblock_tail_head - pixman_composite_over_8888_8888_process_pixblock_tail - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - pixman_composite_over_8888_8888_process_pixblock_head - cache_preload 8, 8 -.endm - -.macro pixman_composite_over_n_8888_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d3[0]}, [DUMMY] - vdup.8 d0, d3[0] - vdup.8 d1, d3[1] - vdup.8 d2, d3[2] - vdup.8 d3, d3[3] -.endm - -generate_composite_function \ - pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_n_8888_init, \ - default_cleanup, \ - pixman_composite_over_8888_8888_process_pixblock_head, \ - pixman_composite_over_8888_8888_process_pixblock_tail, \ - pixman_composite_over_n_8888_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head - vrshr.u16 q14, q8, #8 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - PF cmp PF_X, ORIG_W - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 - vld4.8 {d0, d1, d2, d3}, [DST_R, :128]! - vmvn.8 d22, d3 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF subge PF_X, PF_X, ORIG_W - vmull.u8 q8, d22, d4 - PF subges PF_CTL, PF_CTL, #0x10 - vmull.u8 q9, d22, d5 - vmull.u8 q10, d22, d6 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vmull.u8 q11, d22, d7 -.endm - -.macro pixman_composite_over_reverse_n_8888_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d7[0]}, [DUMMY] - vdup.8 d4, d7[0] - vdup.8 d5, d7[1] - vdup.8 d6, d7[2] - vdup.8 d7, d7[3] -.endm - -generate_composite_function \ - pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_reverse_n_8888_init, \ - default_cleanup, \ - pixman_composite_over_8888_8888_process_pixblock_head, \ - pixman_composite_over_8888_8888_process_pixblock_tail, \ - pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 4, /* src_basereg */ \ - 24 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_over_8888_8_0565_process_pixblock_head - vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */ - vmull.u8 q1, d24, d9 - vmull.u8 q6, d24, d10 - vmull.u8 q7, d24, d11 - vshrn.u16 d6, q2, #8 /* convert DST_R data to 32-bpp (part1) */ - vshrn.u16 d7, q2, #3 - vsli.u16 q2, q2, #5 - vrshr.u16 q8, q0, #8 /* IN for SRC pixels (part2) */ - vrshr.u16 q9, q1, #8 - vrshr.u16 q10, q6, #8 - vrshr.u16 q11, q7, #8 - vraddhn.u16 d0, q0, q8 - vraddhn.u16 d1, q1, q9 - vraddhn.u16 d2, q6, q10 - vraddhn.u16 d3, q7, q11 - vsri.u8 d6, d6, #5 /* convert DST_R data to 32-bpp (part2) */ - vsri.u8 d7, d7, #6 - vmvn.8 d3, d3 - vshrn.u16 d30, q2, #2 - vmull.u8 q8, d3, d6 /* now do alpha blending */ - vmull.u8 q9, d3, d7 - vmull.u8 q10, d3, d30 -.endm - -.macro pixman_composite_over_8888_8_0565_process_pixblock_tail - /* 3 cycle bubble (after vmull.u8) */ - vrshr.u16 q13, q8, #8 - vrshr.u16 q11, q9, #8 - vrshr.u16 q15, q10, #8 - vraddhn.u16 d16, q8, q13 - vraddhn.u16 d27, q9, q11 - vraddhn.u16 d26, q10, q15 - vqadd.u8 d16, d2, d16 - /* 1 cycle bubble */ - vqadd.u8 q9, q0, q13 - vshll.u8 q14, d16, #8 /* convert to 16bpp */ - vshll.u8 q8, d19, #8 - vshll.u8 q9, d18, #8 - vsri.u16 q14, q8, #5 - /* 1 cycle bubble */ - vsri.u16 q14, q9, #11 -.endm - -.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head - vld1.16 {d4, d5}, [DST_R, :128]! - vshrn.u16 d6, q2, #8 - fetch_mask_pixblock - vshrn.u16 d7, q2, #3 - fetch_src_pixblock - vmull.u8 q6, d24, d10 - vrshr.u16 q13, q8, #8 - vrshr.u16 q11, q9, #8 - vrshr.u16 q15, q10, #8 - vraddhn.u16 d16, q8, q13 - vraddhn.u16 d27, q9, q11 - vraddhn.u16 d26, q10, q15 - vqadd.u8 d16, d2, d16 - vmull.u8 q1, d24, d9 - vqadd.u8 q9, q0, q13 - vshll.u8 q14, d16, #8 - vmull.u8 q0, d24, d8 - vshll.u8 q8, d19, #8 - vshll.u8 q9, d18, #8 - vsri.u16 q14, q8, #5 - vmull.u8 q7, d24, d11 - vsri.u16 q14, q9, #11 - - cache_preload 8, 8 - - vsli.u16 q2, q2, #5 - vrshr.u16 q8, q0, #8 - vrshr.u16 q9, q1, #8 - vrshr.u16 q10, q6, #8 - vrshr.u16 q11, q7, #8 - vraddhn.u16 d0, q0, q8 - vraddhn.u16 d1, q1, q9 - vraddhn.u16 d2, q6, q10 - vraddhn.u16 d3, q7, q11 - vsri.u8 d6, d6, #5 - vsri.u8 d7, d7, #6 - vmvn.8 d3, d3 - vshrn.u16 d30, q2, #2 - vst1.16 {d28, d29}, [DST_W, :128]! - vmull.u8 q8, d3, d6 - vmull.u8 q9, d3, d7 - vmull.u8 q10, d3, d30 -.endm - -generate_composite_function \ - pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_over_8888_8_0565_process_pixblock_head, \ - pixman_composite_over_8888_8_0565_process_pixblock_tail, \ - pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 8, /* src_basereg */ \ - 24 /* mask_basereg */ - -/******************************************************************************/ - -/* - * This function needs a special initialization of solid mask. - * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET - * offset, split into color components and replicated in d8-d11 - * registers. Additionally, this function needs all the NEON registers, - * so it has to save d8-d15 registers which are callee saved according - * to ABI. These registers are restored from 'cleanup' macro. All the - * other NEON registers are caller saved, so can be clobbered freely - * without introducing any problems. - */ -.macro pixman_composite_over_n_8_0565_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vpush {d8-d15} - vld1.32 {d11[0]}, [DUMMY] - vdup.8 d8, d11[0] - vdup.8 d9, d11[1] - vdup.8 d10, d11[2] - vdup.8 d11, d11[3] -.endm - -.macro pixman_composite_over_n_8_0565_cleanup - vpop {d8-d15} -.endm - -generate_composite_function \ - pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \ - FLAG_DST_READWRITE, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_n_8_0565_init, \ - pixman_composite_over_n_8_0565_cleanup, \ - pixman_composite_over_8888_8_0565_process_pixblock_head, \ - pixman_composite_over_8888_8_0565_process_pixblock_tail, \ - pixman_composite_over_8888_8_0565_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_over_8888_n_0565_init - add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) - vpush {d8-d15} - vld1.32 {d24[0]}, [DUMMY] - vdup.8 d24, d24[3] -.endm - -.macro pixman_composite_over_8888_n_0565_cleanup - vpop {d8-d15} -.endm - -generate_composite_function \ - pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_8888_n_0565_init, \ - pixman_composite_over_8888_n_0565_cleanup, \ - pixman_composite_over_8888_8_0565_process_pixblock_head, \ - pixman_composite_over_8888_8_0565_process_pixblock_tail, \ - pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 8, /* src_basereg */ \ - 24 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_0565_0565_process_pixblock_head -.endm - -.macro pixman_composite_src_0565_0565_process_pixblock_tail -.endm - -.macro pixman_composite_src_0565_0565_process_pixblock_tail_head - vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! - fetch_src_pixblock - cache_preload 16, 16 -.endm - -generate_composite_function \ - pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \ - FLAG_DST_WRITEONLY, \ - 16, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_0565_0565_process_pixblock_head, \ - pixman_composite_src_0565_0565_process_pixblock_tail, \ - pixman_composite_src_0565_0565_process_pixblock_tail_head, \ - 0, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_n_8_process_pixblock_head -.endm - -.macro pixman_composite_src_n_8_process_pixblock_tail -.endm - -.macro pixman_composite_src_n_8_process_pixblock_tail_head - vst1.8 {d0, d1, d2, d3}, [DST_W, :128]! -.endm - -.macro pixman_composite_src_n_8_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d0[0]}, [DUMMY] - vsli.u64 d0, d0, #8 - vsli.u64 d0, d0, #16 - vsli.u64 d0, d0, #32 - vorr d1, d0, d0 - vorr q1, q0, q0 -.endm - -.macro pixman_composite_src_n_8_cleanup -.endm - -generate_composite_function \ - pixman_composite_src_n_8_asm_neon, 0, 0, 8, \ - FLAG_DST_WRITEONLY, \ - 32, /* number of pixels, processed in a single block */ \ - 0, /* prefetch distance */ \ - pixman_composite_src_n_8_init, \ - pixman_composite_src_n_8_cleanup, \ - pixman_composite_src_n_8_process_pixblock_head, \ - pixman_composite_src_n_8_process_pixblock_tail, \ - pixman_composite_src_n_8_process_pixblock_tail_head, \ - 0, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_n_0565_process_pixblock_head -.endm - -.macro pixman_composite_src_n_0565_process_pixblock_tail -.endm - -.macro pixman_composite_src_n_0565_process_pixblock_tail_head - vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! -.endm - -.macro pixman_composite_src_n_0565_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d0[0]}, [DUMMY] - vsli.u64 d0, d0, #16 - vsli.u64 d0, d0, #32 - vorr d1, d0, d0 - vorr q1, q0, q0 -.endm - -.macro pixman_composite_src_n_0565_cleanup -.endm - -generate_composite_function \ - pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \ - FLAG_DST_WRITEONLY, \ - 16, /* number of pixels, processed in a single block */ \ - 0, /* prefetch distance */ \ - pixman_composite_src_n_0565_init, \ - pixman_composite_src_n_0565_cleanup, \ - pixman_composite_src_n_0565_process_pixblock_head, \ - pixman_composite_src_n_0565_process_pixblock_tail, \ - pixman_composite_src_n_0565_process_pixblock_tail_head, \ - 0, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_n_8888_process_pixblock_head -.endm - -.macro pixman_composite_src_n_8888_process_pixblock_tail -.endm - -.macro pixman_composite_src_n_8888_process_pixblock_tail_head - vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! -.endm - -.macro pixman_composite_src_n_8888_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d0[0]}, [DUMMY] - vsli.u64 d0, d0, #32 - vorr d1, d0, d0 - vorr q1, q0, q0 -.endm - -.macro pixman_composite_src_n_8888_cleanup -.endm - -generate_composite_function \ - pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \ - FLAG_DST_WRITEONLY, \ - 8, /* number of pixels, processed in a single block */ \ - 0, /* prefetch distance */ \ - pixman_composite_src_n_8888_init, \ - pixman_composite_src_n_8888_cleanup, \ - pixman_composite_src_n_8888_process_pixblock_head, \ - pixman_composite_src_n_8888_process_pixblock_tail, \ - pixman_composite_src_n_8888_process_pixblock_tail_head, \ - 0, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_8888_8888_process_pixblock_head -.endm - -.macro pixman_composite_src_8888_8888_process_pixblock_tail -.endm - -.macro pixman_composite_src_8888_8888_process_pixblock_tail_head - vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! - fetch_src_pixblock - cache_preload 8, 8 -.endm - -generate_composite_function \ - pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_WRITEONLY, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_8888_8888_process_pixblock_head, \ - pixman_composite_src_8888_8888_process_pixblock_tail, \ - pixman_composite_src_8888_8888_process_pixblock_tail_head, \ - 0, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_x888_8888_process_pixblock_head - vorr q0, q0, q2 - vorr q1, q1, q2 -.endm - -.macro pixman_composite_src_x888_8888_process_pixblock_tail -.endm - -.macro pixman_composite_src_x888_8888_process_pixblock_tail_head - vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! - fetch_src_pixblock - vorr q0, q0, q2 - vorr q1, q1, q2 - cache_preload 8, 8 -.endm - -.macro pixman_composite_src_x888_8888_init - vmov.u8 q2, #0xFF - vshl.u32 q2, q2, #24 -.endm - -generate_composite_function \ - pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_WRITEONLY, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - pixman_composite_src_x888_8888_init, \ - default_cleanup, \ - pixman_composite_src_x888_8888_process_pixblock_head, \ - pixman_composite_src_x888_8888_process_pixblock_tail, \ - pixman_composite_src_x888_8888_process_pixblock_tail_head, \ - 0, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_over_n_8_8888_process_pixblock_head - /* expecting deinterleaved source data in {d8, d9, d10, d11} */ - /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ - /* and destination data in {d4, d5, d6, d7} */ - /* mask is in d24 (d25, d26, d27 are unused) */ - - /* in */ - vmull.u8 q0, d24, d8 - vmull.u8 q1, d24, d9 - vmull.u8 q6, d24, d10 - vmull.u8 q7, d24, d11 - vrshr.u16 q10, q0, #8 - vrshr.u16 q11, q1, #8 - vrshr.u16 q12, q6, #8 - vrshr.u16 q13, q7, #8 - vraddhn.u16 d0, q0, q10 - vraddhn.u16 d1, q1, q11 - vraddhn.u16 d2, q6, q12 - vraddhn.u16 d3, q7, q13 - vmvn.8 d24, d3 /* get inverted alpha */ - /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */ - /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */ - /* now do alpha blending */ - vmull.u8 q8, d24, d4 - vmull.u8 q9, d24, d5 - vmull.u8 q10, d24, d6 - vmull.u8 q11, d24, d7 -.endm - -.macro pixman_composite_over_n_8_8888_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head - pixman_composite_over_n_8_8888_process_pixblock_tail - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - fetch_mask_pixblock - cache_preload 8, 8 - pixman_composite_over_n_8_8888_process_pixblock_head -.endm - -.macro pixman_composite_over_n_8_8888_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vpush {d8-d15} - vld1.32 {d11[0]}, [DUMMY] - vdup.8 d8, d11[0] - vdup.8 d9, d11[1] - vdup.8 d10, d11[2] - vdup.8 d11, d11[3] -.endm - -.macro pixman_composite_over_n_8_8888_cleanup - vpop {d8-d15} -.endm - -generate_composite_function \ - pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_n_8_8888_init, \ - pixman_composite_over_n_8_8888_cleanup, \ - pixman_composite_over_n_8_8888_process_pixblock_head, \ - pixman_composite_over_n_8_8888_process_pixblock_tail, \ - pixman_composite_over_n_8_8888_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_over_n_8_8_process_pixblock_head - vmull.u8 q0, d24, d8 - vmull.u8 q1, d25, d8 - vmull.u8 q6, d26, d8 - vmull.u8 q7, d27, d8 - vrshr.u16 q10, q0, #8 - vrshr.u16 q11, q1, #8 - vrshr.u16 q12, q6, #8 - vrshr.u16 q13, q7, #8 - vraddhn.u16 d0, q0, q10 - vraddhn.u16 d1, q1, q11 - vraddhn.u16 d2, q6, q12 - vraddhn.u16 d3, q7, q13 - vmvn.8 q12, q0 - vmvn.8 q13, q1 - vmull.u8 q8, d24, d4 - vmull.u8 q9, d25, d5 - vmull.u8 q10, d26, d6 - vmull.u8 q11, d27, d7 -.endm - -.macro pixman_composite_over_n_8_8_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_n_8_8_process_pixblock_tail_head - vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! - pixman_composite_over_n_8_8_process_pixblock_tail - fetch_mask_pixblock - cache_preload 32, 32 - vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! - pixman_composite_over_n_8_8_process_pixblock_head -.endm - -.macro pixman_composite_over_n_8_8_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vpush {d8-d15} - vld1.32 {d8[0]}, [DUMMY] - vdup.8 d8, d8[3] -.endm - -.macro pixman_composite_over_n_8_8_cleanup - vpop {d8-d15} -.endm - -generate_composite_function \ - pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \ - FLAG_DST_READWRITE, \ - 32, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_n_8_8_init, \ - pixman_composite_over_n_8_8_cleanup, \ - pixman_composite_over_n_8_8_process_pixblock_head, \ - pixman_composite_over_n_8_8_process_pixblock_tail, \ - pixman_composite_over_n_8_8_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head - /* - * 'combine_mask_ca' replacement - * - * input: solid src (n) in {d8, d9, d10, d11} - * dest in {d4, d5, d6, d7 } - * mask in {d24, d25, d26, d27} - * output: updated src in {d0, d1, d2, d3 } - * updated mask in {d24, d25, d26, d3 } - */ - vmull.u8 q0, d24, d8 - vmull.u8 q1, d25, d9 - vmull.u8 q6, d26, d10 - vmull.u8 q7, d27, d11 - vmull.u8 q9, d11, d25 - vmull.u8 q12, d11, d24 - vmull.u8 q13, d11, d26 - vrshr.u16 q8, q0, #8 - vrshr.u16 q10, q1, #8 - vrshr.u16 q11, q6, #8 - vraddhn.u16 d0, q0, q8 - vraddhn.u16 d1, q1, q10 - vraddhn.u16 d2, q6, q11 - vrshr.u16 q11, q12, #8 - vrshr.u16 q8, q9, #8 - vrshr.u16 q6, q13, #8 - vrshr.u16 q10, q7, #8 - vraddhn.u16 d24, q12, q11 - vraddhn.u16 d25, q9, q8 - vraddhn.u16 d26, q13, q6 - vraddhn.u16 d3, q7, q10 - /* - * 'combine_over_ca' replacement - * - * output: updated dest in {d28, d29, d30, d31} - */ - vmvn.8 d24, d24 - vmvn.8 d25, d25 - vmull.u8 q8, d24, d4 - vmull.u8 q9, d25, d5 - vmvn.8 d26, d26 - vmvn.8 d27, d3 - vmull.u8 q10, d26, d6 - vmull.u8 q11, d27, d7 -.endm - -.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail - /* ... continue 'combine_over_ca' replacement */ - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q6, q10, #8 - vrshr.u16 q7, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q6, q10 - vraddhn.u16 d31, q7, q11 - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 -.endm - -.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - vrshr.u16 q6, q10, #8 - vrshr.u16 q7, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q6, q10 - vraddhn.u16 d31, q7, q11 - fetch_mask_pixblock - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 - cache_preload 8, 8 - pixman_composite_over_n_8888_8888_ca_process_pixblock_head - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! -.endm - -.macro pixman_composite_over_n_8888_8888_ca_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vpush {d8-d15} - vld1.32 {d11[0]}, [DUMMY] - vdup.8 d8, d11[0] - vdup.8 d9, d11[1] - vdup.8 d10, d11[2] - vdup.8 d11, d11[3] -.endm - -.macro pixman_composite_over_n_8888_8888_ca_cleanup - vpop {d8-d15} -.endm - -generate_composite_function \ - pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_n_8888_8888_ca_init, \ - pixman_composite_over_n_8888_8888_ca_cleanup, \ - pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \ - pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \ - pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_in_n_8_process_pixblock_head - /* expecting source data in {d0, d1, d2, d3} */ - /* and destination data in {d4, d5, d6, d7} */ - vmull.u8 q8, d4, d3 - vmull.u8 q9, d5, d3 - vmull.u8 q10, d6, d3 - vmull.u8 q11, d7, d3 -.endm - -.macro pixman_composite_in_n_8_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - vraddhn.u16 d28, q8, q14 - vraddhn.u16 d29, q9, q15 - vraddhn.u16 d30, q10, q12 - vraddhn.u16 d31, q11, q13 -.endm - -.macro pixman_composite_in_n_8_process_pixblock_tail_head - pixman_composite_in_n_8_process_pixblock_tail - vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! - cache_preload 32, 32 - pixman_composite_in_n_8_process_pixblock_head - vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! -.endm - -.macro pixman_composite_in_n_8_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d3[0]}, [DUMMY] - vdup.8 d3, d3[3] -.endm - -.macro pixman_composite_in_n_8_cleanup -.endm - -generate_composite_function \ - pixman_composite_in_n_8_asm_neon, 0, 0, 8, \ - FLAG_DST_READWRITE, \ - 32, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_in_n_8_init, \ - pixman_composite_in_n_8_cleanup, \ - pixman_composite_in_n_8_process_pixblock_head, \ - pixman_composite_in_n_8_process_pixblock_tail, \ - pixman_composite_in_n_8_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 24 /* mask_basereg */ - -.macro pixman_composite_add_n_8_8_process_pixblock_head - /* expecting source data in {d8, d9, d10, d11} */ - /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ - /* and destination data in {d4, d5, d6, d7} */ - /* mask is in d24, d25, d26, d27 */ - vmull.u8 q0, d24, d11 - vmull.u8 q1, d25, d11 - vmull.u8 q6, d26, d11 - vmull.u8 q7, d27, d11 - vrshr.u16 q10, q0, #8 - vrshr.u16 q11, q1, #8 - vrshr.u16 q12, q6, #8 - vrshr.u16 q13, q7, #8 - vraddhn.u16 d0, q0, q10 - vraddhn.u16 d1, q1, q11 - vraddhn.u16 d2, q6, q12 - vraddhn.u16 d3, q7, q13 - vqadd.u8 q14, q0, q2 - vqadd.u8 q15, q1, q3 -.endm - -.macro pixman_composite_add_n_8_8_process_pixblock_tail -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_add_n_8_8_process_pixblock_tail_head - pixman_composite_add_n_8_8_process_pixblock_tail - vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! - vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! - fetch_mask_pixblock - cache_preload 32, 32 - pixman_composite_add_n_8_8_process_pixblock_head -.endm - -.macro pixman_composite_add_n_8_8_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vpush {d8-d15} - vld1.32 {d11[0]}, [DUMMY] - vdup.8 d11, d11[3] -.endm - -.macro pixman_composite_add_n_8_8_cleanup - vpop {d8-d15} -.endm - -generate_composite_function \ - pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \ - FLAG_DST_READWRITE, \ - 32, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_add_n_8_8_init, \ - pixman_composite_add_n_8_8_cleanup, \ - pixman_composite_add_n_8_8_process_pixblock_head, \ - pixman_composite_add_n_8_8_process_pixblock_tail, \ - pixman_composite_add_n_8_8_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_add_8_8_8_process_pixblock_head - /* expecting source data in {d0, d1, d2, d3} */ - /* destination data in {d4, d5, d6, d7} */ - /* mask in {d24, d25, d26, d27} */ - vmull.u8 q8, d24, d0 - vmull.u8 q9, d25, d1 - vmull.u8 q10, d26, d2 - vmull.u8 q11, d27, d3 - vrshr.u16 q0, q8, #8 - vrshr.u16 q1, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - vraddhn.u16 d0, q0, q8 - vraddhn.u16 d1, q1, q9 - vraddhn.u16 d2, q12, q10 - vraddhn.u16 d3, q13, q11 - vqadd.u8 q14, q0, q2 - vqadd.u8 q15, q1, q3 -.endm - -.macro pixman_composite_add_8_8_8_process_pixblock_tail -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_add_8_8_8_process_pixblock_tail_head - pixman_composite_add_8_8_8_process_pixblock_tail - vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! - vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! - fetch_mask_pixblock - fetch_src_pixblock - cache_preload 32, 32 - pixman_composite_add_8_8_8_process_pixblock_head -.endm - -.macro pixman_composite_add_8_8_8_init -.endm - -.macro pixman_composite_add_8_8_8_cleanup -.endm - -generate_composite_function \ - pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \ - FLAG_DST_READWRITE, \ - 32, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_add_8_8_8_init, \ - pixman_composite_add_8_8_8_cleanup, \ - pixman_composite_add_8_8_8_process_pixblock_head, \ - pixman_composite_add_8_8_8_process_pixblock_tail, \ - pixman_composite_add_8_8_8_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_add_8888_8888_8888_process_pixblock_head - /* expecting source data in {d0, d1, d2, d3} */ - /* destination data in {d4, d5, d6, d7} */ - /* mask in {d24, d25, d26, d27} */ - vmull.u8 q8, d27, d0 - vmull.u8 q9, d27, d1 - vmull.u8 q10, d27, d2 - vmull.u8 q11, d27, d3 - /* 1 cycle bubble */ - vrsra.u16 q8, q8, #8 - vrsra.u16 q9, q9, #8 - vrsra.u16 q10, q10, #8 - vrsra.u16 q11, q11, #8 -.endm - -.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail - /* 2 cycle bubble */ - vrshrn.u16 d28, q8, #8 - vrshrn.u16 d29, q9, #8 - vrshrn.u16 d30, q10, #8 - vrshrn.u16 d31, q11, #8 - vqadd.u8 q14, q2, q14 - /* 1 cycle bubble */ - vqadd.u8 q15, q3, q15 -.endm - -.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head - fetch_src_pixblock - vrshrn.u16 d28, q8, #8 - fetch_mask_pixblock - vrshrn.u16 d29, q9, #8 - vmull.u8 q8, d27, d0 - vrshrn.u16 d30, q10, #8 - vmull.u8 q9, d27, d1 - vrshrn.u16 d31, q11, #8 - vmull.u8 q10, d27, d2 - vqadd.u8 q14, q2, q14 - vmull.u8 q11, d27, d3 - vqadd.u8 q15, q3, q15 - vrsra.u16 q8, q8, #8 - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - vrsra.u16 q9, q9, #8 - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - vrsra.u16 q10, q10, #8 - - cache_preload 8, 8 - - vrsra.u16 q11, q11, #8 -.endm - -generate_composite_function \ - pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_add_8888_8888_8888_process_pixblock_head, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail_head - -generate_composite_function_single_scanline \ - pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init, \ - default_cleanup, \ - pixman_composite_add_8888_8888_8888_process_pixblock_head, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail_head - -/******************************************************************************/ - -generate_composite_function \ - pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_add_8888_8888_8888_process_pixblock_head, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 27 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_add_n_8_8888_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d3[0]}, [DUMMY] - vdup.8 d0, d3[0] - vdup.8 d1, d3[1] - vdup.8 d2, d3[2] - vdup.8 d3, d3[3] -.endm - -.macro pixman_composite_add_n_8_8888_cleanup -.endm - -generate_composite_function \ - pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_add_n_8_8888_init, \ - pixman_composite_add_n_8_8888_cleanup, \ - pixman_composite_add_8888_8888_8888_process_pixblock_head, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 27 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_add_8888_n_8888_init - add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) - vld1.32 {d27[0]}, [DUMMY] - vdup.8 d27, d27[3] -.endm - -.macro pixman_composite_add_8888_n_8888_cleanup -.endm - -generate_composite_function \ - pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_add_8888_n_8888_init, \ - pixman_composite_add_8888_n_8888_cleanup, \ - pixman_composite_add_8888_8888_8888_process_pixblock_head, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 27 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head - /* expecting source data in {d0, d1, d2, d3} */ - /* destination data in {d4, d5, d6, d7} */ - /* solid mask is in d15 */ - - /* 'in' */ - vmull.u8 q8, d15, d3 - vmull.u8 q6, d15, d2 - vmull.u8 q5, d15, d1 - vmull.u8 q4, d15, d0 - vrshr.u16 q13, q8, #8 - vrshr.u16 q12, q6, #8 - vrshr.u16 q11, q5, #8 - vrshr.u16 q10, q4, #8 - vraddhn.u16 d3, q8, q13 - vraddhn.u16 d2, q6, q12 - vraddhn.u16 d1, q5, q11 - vraddhn.u16 d0, q4, q10 - vmvn.8 d24, d3 /* get inverted alpha */ - /* now do alpha blending */ - vmull.u8 q8, d24, d4 - vmull.u8 q9, d24, d5 - vmull.u8 q10, d24, d6 - vmull.u8 q11, d24, d7 -.endm - -.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail - fetch_src_pixblock - cache_preload 8, 8 - fetch_mask_pixblock - pixman_composite_out_reverse_8888_n_8888_process_pixblock_head - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! -.endm - -generate_composite_function_single_scanline \ - pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \ - pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \ - pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 12 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_over_8888_n_8888_process_pixblock_head - pixman_composite_out_reverse_8888_n_8888_process_pixblock_head -.endm - -.macro pixman_composite_over_8888_n_8888_process_pixblock_tail - pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - pixman_composite_over_8888_n_8888_process_pixblock_tail - fetch_src_pixblock - cache_preload 8, 8 - pixman_composite_over_8888_n_8888_process_pixblock_head - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! -.endm - -.macro pixman_composite_over_8888_n_8888_init - add DUMMY, sp, #48 - vpush {d8-d15} - vld1.32 {d15[0]}, [DUMMY] - vdup.8 d15, d15[3] -.endm - -.macro pixman_composite_over_8888_n_8888_cleanup - vpop {d8-d15} -.endm - -generate_composite_function \ - pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_8888_n_8888_init, \ - pixman_composite_over_8888_n_8888_cleanup, \ - pixman_composite_over_8888_n_8888_process_pixblock_head, \ - pixman_composite_over_8888_n_8888_process_pixblock_tail, \ - pixman_composite_over_8888_n_8888_process_pixblock_tail_head - -/******************************************************************************/ - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - pixman_composite_over_8888_n_8888_process_pixblock_tail - fetch_src_pixblock - cache_preload 8, 8 - fetch_mask_pixblock - pixman_composite_over_8888_n_8888_process_pixblock_head - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! -.endm - -generate_composite_function \ - pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_over_8888_n_8888_process_pixblock_head, \ - pixman_composite_over_8888_n_8888_process_pixblock_tail, \ - pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 12 /* mask_basereg */ - -generate_composite_function_single_scanline \ - pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_over_8888_n_8888_process_pixblock_head, \ - pixman_composite_over_8888_n_8888_process_pixblock_tail, \ - pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 12 /* mask_basereg */ - -/******************************************************************************/ - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - pixman_composite_over_8888_n_8888_process_pixblock_tail - fetch_src_pixblock - cache_preload 8, 8 - fetch_mask_pixblock - pixman_composite_over_8888_n_8888_process_pixblock_head - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! -.endm - -generate_composite_function \ - pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_over_8888_n_8888_process_pixblock_head, \ - pixman_composite_over_8888_n_8888_process_pixblock_tail, \ - pixman_composite_over_8888_8_8888_process_pixblock_tail_head \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 15 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_0888_0888_process_pixblock_head -.endm - -.macro pixman_composite_src_0888_0888_process_pixblock_tail -.endm - -.macro pixman_composite_src_0888_0888_process_pixblock_tail_head - vst3.8 {d0, d1, d2}, [DST_W]! - fetch_src_pixblock - cache_preload 8, 8 -.endm - -generate_composite_function \ - pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \ - FLAG_DST_WRITEONLY, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_0888_0888_process_pixblock_head, \ - pixman_composite_src_0888_0888_process_pixblock_tail, \ - pixman_composite_src_0888_0888_process_pixblock_tail_head, \ - 0, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_0888_8888_rev_process_pixblock_head - vswp d0, d2 -.endm - -.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail -.endm - -.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head - vst4.8 {d0, d1, d2, d3}, [DST_W]! - fetch_src_pixblock - vswp d0, d2 - cache_preload 8, 8 -.endm - -.macro pixman_composite_src_0888_8888_rev_init - veor d3, d3, d3 -.endm - -generate_composite_function \ - pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \ - FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - pixman_composite_src_0888_8888_rev_init, \ - default_cleanup, \ - pixman_composite_src_0888_8888_rev_process_pixblock_head, \ - pixman_composite_src_0888_8888_rev_process_pixblock_tail, \ - pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \ - 0, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_0888_0565_rev_process_pixblock_head - vshll.u8 q8, d1, #8 - vshll.u8 q9, d2, #8 -.endm - -.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail - vshll.u8 q14, d0, #8 - vsri.u16 q14, q8, #5 - vsri.u16 q14, q9, #11 -.endm - -.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head - vshll.u8 q14, d0, #8 - fetch_src_pixblock - vsri.u16 q14, q8, #5 - vsri.u16 q14, q9, #11 - vshll.u8 q8, d1, #8 - vst1.16 {d28, d29}, [DST_W, :128]! - vshll.u8 q9, d2, #8 -.endm - -generate_composite_function \ - pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \ - FLAG_DST_WRITEONLY, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_0888_0565_rev_process_pixblock_head, \ - pixman_composite_src_0888_0565_rev_process_pixblock_tail, \ - pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_pixbuf_8888_process_pixblock_head - vmull.u8 q8, d3, d0 - vmull.u8 q9, d3, d1 - vmull.u8 q10, d3, d2 -.endm - -.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail - vrshr.u16 q11, q8, #8 - vswp d3, d31 - vrshr.u16 q12, q9, #8 - vrshr.u16 q13, q10, #8 - vraddhn.u16 d30, q11, q8 - vraddhn.u16 d29, q12, q9 - vraddhn.u16 d28, q13, q10 -.endm - -.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head - vrshr.u16 q11, q8, #8 - vswp d3, d31 - vrshr.u16 q12, q9, #8 - vrshr.u16 q13, q10, #8 - fetch_src_pixblock - vraddhn.u16 d30, q11, q8 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vraddhn.u16 d29, q12, q9 - vraddhn.u16 d28, q13, q10 - vmull.u8 q8, d3, d0 - vmull.u8 q9, d3, d1 - vmull.u8 q10, d3, d2 - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF cmp PF_X, ORIG_W - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! -.endm - -generate_composite_function \ - pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_pixbuf_8888_process_pixblock_head, \ - pixman_composite_src_pixbuf_8888_process_pixblock_tail, \ - pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head - vmull.u8 q8, d3, d0 - vmull.u8 q9, d3, d1 - vmull.u8 q10, d3, d2 -.endm - -.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail - vrshr.u16 q11, q8, #8 - vswp d3, d31 - vrshr.u16 q12, q9, #8 - vrshr.u16 q13, q10, #8 - vraddhn.u16 d28, q11, q8 - vraddhn.u16 d29, q12, q9 - vraddhn.u16 d30, q13, q10 -.endm - -.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head - vrshr.u16 q11, q8, #8 - vswp d3, d31 - vrshr.u16 q12, q9, #8 - vrshr.u16 q13, q10, #8 - fetch_src_pixblock - vraddhn.u16 d28, q11, q8 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vraddhn.u16 d29, q12, q9 - vraddhn.u16 d30, q13, q10 - vmull.u8 q8, d3, d0 - vmull.u8 q9, d3, d1 - vmull.u8 q10, d3, d2 - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF cmp PF_X, ORIG_W - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! -.endm - -generate_composite_function \ - pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_rpixbuf_8888_process_pixblock_head, \ - pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \ - pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_over_0565_8_0565_process_pixblock_head - /* mask is in d15 */ - convert_0565_to_x888 q4, d2, d1, d0 - convert_0565_to_x888 q5, d6, d5, d4 - /* source pixel data is in {d0, d1, d2, XX} */ - /* destination pixel data is in {d4, d5, d6, XX} */ - vmvn.8 d7, d15 - vmull.u8 q6, d15, d2 - vmull.u8 q5, d15, d1 - vmull.u8 q4, d15, d0 - vmull.u8 q8, d7, d4 - vmull.u8 q9, d7, d5 - vmull.u8 q13, d7, d6 - vrshr.u16 q12, q6, #8 - vrshr.u16 q11, q5, #8 - vrshr.u16 q10, q4, #8 - vraddhn.u16 d2, q6, q12 - vraddhn.u16 d1, q5, q11 - vraddhn.u16 d0, q4, q10 -.endm - -.macro pixman_composite_over_0565_8_0565_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q13, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q12, q13 - vqadd.u8 q0, q0, q14 - vqadd.u8 q1, q1, q15 - /* 32bpp result is in {d0, d1, d2, XX} */ - convert_8888_to_0565 d2, d1, d0, q14, q15, q3 -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head - fetch_mask_pixblock - pixman_composite_over_0565_8_0565_process_pixblock_tail - fetch_src_pixblock - vld1.16 {d10, d11}, [DST_R, :128]! - cache_preload 8, 8 - pixman_composite_over_0565_8_0565_process_pixblock_head - vst1.16 {d28, d29}, [DST_W, :128]! -.endm - -generate_composite_function \ - pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \ - FLAG_DST_READWRITE, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_over_0565_8_0565_process_pixblock_head, \ - pixman_composite_over_0565_8_0565_process_pixblock_tail, \ - pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 10, /* dst_r_basereg */ \ - 8, /* src_basereg */ \ - 15 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_over_0565_n_0565_init - add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) - vpush {d8-d15} - vld1.32 {d15[0]}, [DUMMY] - vdup.8 d15, d15[3] -.endm - -.macro pixman_composite_over_0565_n_0565_cleanup - vpop {d8-d15} -.endm - -generate_composite_function \ - pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \ - FLAG_DST_READWRITE, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_0565_n_0565_init, \ - pixman_composite_over_0565_n_0565_cleanup, \ - pixman_composite_over_0565_8_0565_process_pixblock_head, \ - pixman_composite_over_0565_8_0565_process_pixblock_tail, \ - pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 10, /* dst_r_basereg */ \ - 8, /* src_basereg */ \ - 15 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_add_0565_8_0565_process_pixblock_head - /* mask is in d15 */ - convert_0565_to_x888 q4, d2, d1, d0 - convert_0565_to_x888 q5, d6, d5, d4 - /* source pixel data is in {d0, d1, d2, XX} */ - /* destination pixel data is in {d4, d5, d6, XX} */ - vmull.u8 q6, d15, d2 - vmull.u8 q5, d15, d1 - vmull.u8 q4, d15, d0 - vrshr.u16 q12, q6, #8 - vrshr.u16 q11, q5, #8 - vrshr.u16 q10, q4, #8 - vraddhn.u16 d2, q6, q12 - vraddhn.u16 d1, q5, q11 - vraddhn.u16 d0, q4, q10 -.endm - -.macro pixman_composite_add_0565_8_0565_process_pixblock_tail - vqadd.u8 q0, q0, q2 - vqadd.u8 q1, q1, q3 - /* 32bpp result is in {d0, d1, d2, XX} */ - convert_8888_to_0565 d2, d1, d0, q14, q15, q3 -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head - fetch_mask_pixblock - pixman_composite_add_0565_8_0565_process_pixblock_tail - fetch_src_pixblock - vld1.16 {d10, d11}, [DST_R, :128]! - cache_preload 8, 8 - pixman_composite_add_0565_8_0565_process_pixblock_head - vst1.16 {d28, d29}, [DST_W, :128]! -.endm - -generate_composite_function \ - pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \ - FLAG_DST_READWRITE, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_add_0565_8_0565_process_pixblock_head, \ - pixman_composite_add_0565_8_0565_process_pixblock_tail, \ - pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 10, /* dst_r_basereg */ \ - 8, /* src_basereg */ \ - 15 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_out_reverse_8_0565_process_pixblock_head - /* mask is in d15 */ - convert_0565_to_x888 q5, d6, d5, d4 - /* destination pixel data is in {d4, d5, d6, xx} */ - vmvn.8 d24, d15 /* get inverted alpha */ - /* now do alpha blending */ - vmull.u8 q8, d24, d4 - vmull.u8 q9, d24, d5 - vmull.u8 q10, d24, d6 -.endm - -.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vraddhn.u16 d0, q14, q8 - vraddhn.u16 d1, q15, q9 - vraddhn.u16 d2, q12, q10 - /* 32bpp result is in {d0, d1, d2, XX} */ - convert_8888_to_0565 d2, d1, d0, q14, q15, q3 -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head - fetch_src_pixblock - pixman_composite_out_reverse_8_0565_process_pixblock_tail - vld1.16 {d10, d11}, [DST_R, :128]! - cache_preload 8, 8 - pixman_composite_out_reverse_8_0565_process_pixblock_head - vst1.16 {d28, d29}, [DST_W, :128]! -.endm - -generate_composite_function \ - pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \ - FLAG_DST_READWRITE, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_out_reverse_8_0565_process_pixblock_head, \ - pixman_composite_out_reverse_8_0565_process_pixblock_tail, \ - pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 10, /* dst_r_basereg */ \ - 15, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -generate_composite_function_nearest_scanline \ - pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init, \ - default_cleanup, \ - pixman_composite_over_8888_8888_process_pixblock_head, \ - pixman_composite_over_8888_8888_process_pixblock_tail, \ - pixman_composite_over_8888_8888_process_pixblock_tail_head - -generate_composite_function_nearest_scanline \ - pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init, \ - default_cleanup, \ - pixman_composite_over_8888_0565_process_pixblock_head, \ - pixman_composite_over_8888_0565_process_pixblock_tail, \ - pixman_composite_over_8888_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 24 /* mask_basereg */ - -generate_composite_function_nearest_scanline \ - pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \ - FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_8888_0565_process_pixblock_head, \ - pixman_composite_src_8888_0565_process_pixblock_tail, \ - pixman_composite_src_8888_0565_process_pixblock_tail_head - -generate_composite_function_nearest_scanline \ - pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \ - FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_0565_8888_process_pixblock_head, \ - pixman_composite_src_0565_8888_process_pixblock_tail, \ - pixman_composite_src_0565_8888_process_pixblock_tail_head - -generate_composite_function_nearest_scanline \ - pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_over_8888_8_0565_process_pixblock_head, \ - pixman_composite_over_8888_8_0565_process_pixblock_tail, \ - pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 8, /* src_basereg */ \ - 24 /* mask_basereg */ - -generate_composite_function_nearest_scanline \ - pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \ - FLAG_DST_READWRITE, \ - 8, /* number of pixels, processed in a single block */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_over_0565_8_0565_process_pixblock_head, \ - pixman_composite_over_0565_8_0565_process_pixblock_tail, \ - pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 10, /* dst_r_basereg */ \ - 8, /* src_basereg */ \ - 15 /* mask_basereg */ - -/******************************************************************************/ - -/* Supplementary macro for setting function attributes */ -.macro pixman_asm_function fname - .func fname - .global fname -#ifdef __ELF__ - .hidden fname - .type fname, %function -#endif -fname: -.endm - -.macro bilinear_interpolate_last_pixel - mov TMP1, X, asr #16 - mov TMP2, X, asr #16 - add TMP1, TOP, TMP1, asl #2 - add TMP2, BOTTOM, TMP2, asl #2 - vld1.32 {d0}, [TMP1] - vshr.u16 d30, d24, #8 - vld1.32 {d1}, [TMP2] - vmull.u8 q1, d0, d28 - vmlal.u8 q1, d1, d29 - /* 5 cycles bubble */ - vshll.u16 q0, d2, #8 - vmlsl.u16 q0, d2, d30 - vmlal.u16 q0, d3, d30 - /* 5 cycles bubble */ - vshrn.u32 d0, q0, #16 - /* 3 cycles bubble */ - vmovn.u16 d0, q0 - /* 1 cycle bubble */ - vst1.32 {d0[0]}, [OUT, :32]! -.endm - -.macro bilinear_interpolate_two_pixels - mov TMP1, X, asr #16 - mov TMP2, X, asr #16 - add X, X, UX - add TMP1, TOP, TMP1, asl #2 - add TMP2, BOTTOM, TMP2, asl #2 - vld1.32 {d0}, [TMP1] - vld1.32 {d1}, [TMP2] - vmull.u8 q1, d0, d28 - vmlal.u8 q1, d1, d29 - mov TMP1, X, asr #16 - mov TMP2, X, asr #16 - add X, X, UX - add TMP1, TOP, TMP1, asl #2 - add TMP2, BOTTOM, TMP2, asl #2 - vld1.32 {d20}, [TMP1] - vld1.32 {d21}, [TMP2] - vmull.u8 q11, d20, d28 - vmlal.u8 q11, d21, d29 - vshr.u16 q15, q12, #8 - vadd.u16 q12, q12, q13 - vshll.u16 q0, d2, #8 - vmlsl.u16 q0, d2, d30 - vmlal.u16 q0, d3, d30 - vshll.u16 q10, d22, #8 - vmlsl.u16 q10, d22, d31 - vmlal.u16 q10, d23, d31 - vshrn.u32 d30, q0, #16 - vshrn.u32 d31, q10, #16 - vmovn.u16 d0, q15 - vst1.32 {d0}, [OUT]! -.endm - -.macro bilinear_interpolate_four_pixels - mov TMP1, X, asr #16 - mov TMP2, X, asr #16 - add X, X, UX - add TMP1, TOP, TMP1, asl #2 - add TMP2, BOTTOM, TMP2, asl #2 - vld1.32 {d0}, [TMP1] - vld1.32 {d1}, [TMP2] - vmull.u8 q1, d0, d28 - vmlal.u8 q1, d1, d29 - mov TMP1, X, asr #16 - mov TMP2, X, asr #16 - add X, X, UX - add TMP1, TOP, TMP1, asl #2 - add TMP2, BOTTOM, TMP2, asl #2 - vld1.32 {d20}, [TMP1] - vld1.32 {d21}, [TMP2] - vmull.u8 q11, d20, d28 - vmlal.u8 q11, d21, d29 - vshr.u16 q15, q12, #8 - vadd.u16 q12, q12, q13 - vshll.u16 q0, d2, #8 - vmlsl.u16 q0, d2, d30 - vmlal.u16 q0, d3, d30 - vshll.u16 q10, d22, #8 - vmlsl.u16 q10, d22, d31 - vmlal.u16 q10, d23, d31 - mov TMP1, X, asr #16 - mov TMP2, X, asr #16 - add X, X, UX - add TMP1, TOP, TMP1, asl #2 - add TMP2, BOTTOM, TMP2, asl #2 - vld1.32 {d4}, [TMP1] - vld1.32 {d5}, [TMP2] - vmull.u8 q3, d4, d28 - vmlal.u8 q3, d5, d29 - mov TMP1, X, asr #16 - mov TMP2, X, asr #16 - add X, X, UX - add TMP1, TOP, TMP1, asl #2 - add TMP2, BOTTOM, TMP2, asl #2 - vld1.32 {d16}, [TMP1] - vld1.32 {d17}, [TMP2] - vmull.u8 q9, d16, d28 - vmlal.u8 q9, d17, d29 - vshr.u16 q15, q12, #8 - vadd.u16 q12, q12, q13 - vshll.u16 q2, d6, #8 - vmlsl.u16 q2, d6, d30 - vmlal.u16 q2, d7, d30 - vshll.u16 q8, d18, #8 - vmlsl.u16 q8, d18, d31 - vmlal.u16 q8, d19, d31 - vshrn.u32 d0, q0, #16 - vshrn.u32 d1, q10, #16 - vshrn.u32 d4, q2, #16 - vshrn.u32 d5, q8, #16 - vmovn.u16 d0, q0 - vmovn.u16 d1, q2 - vst1.32 {d0, d1}, [OUT]! -.endm - - -/* - * pixman_scaled_bilinear_scanline_8888_8888_SRC (uint32_t * out, - * const uint32_t * top, - * const uint32_t * bottom, - * int wt, - * int wb, - * pixman_fixed_t x, - * pixman_fixed_t ux, - * int width) - */ - -pixman_asm_function pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon - OUT .req r0 - TOP .req r1 - BOTTOM .req r2 - WT .req r3 - WB .req r4 - X .req r5 - UX .req r6 - WIDTH .req ip - TMP1 .req r3 - TMP2 .req r4 - - mov ip, sp - push {r4, r5, r6, r7} - ldmia ip, {WB, X, UX, WIDTH} - - cmp WIDTH, #0 - ble 3f - vdup.u16 q12, X - vdup.u16 q13, UX - vdup.u8 d28, WT - vdup.u8 d29, WB - vadd.u16 d25, d25, d26 - vadd.u16 q13, q13, q13 - - subs WIDTH, WIDTH, #4 - blt 1f -0: - bilinear_interpolate_four_pixels - subs WIDTH, WIDTH, #4 - bge 0b -1: - tst WIDTH, #2 - beq 2f - bilinear_interpolate_two_pixels -2: - tst WIDTH, #1 - beq 3f - bilinear_interpolate_last_pixel -3: - pop {r4, r5, r6, r7} - bx lr - - .unreq OUT - .unreq TOP - .unreq BOTTOM - .unreq WT - .unreq WB - .unreq X - .unreq UX - .unreq WIDTH - .unreq TMP1 - .unreq TMP2 -.endfunc +/*
+ * Copyright © 2009 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+/*
+ * This file contains implementations of NEON optimized pixel processing
+ * functions. There is no full and detailed tutorial, but some functions
+ * (those which are exposing some new or interesting features) are
+ * extensively commented and can be used as examples.
+ *
+ * You may want to have a look at the comments for following functions:
+ * - pixman_composite_over_8888_0565_asm_neon
+ * - pixman_composite_over_n_8_0565_asm_neon
+ */
+
+/* Prevent the stack from becoming executable for no reason... */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+ .text
+ .fpu neon
+ .arch armv7a
+ .object_arch armv4
+ .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
+ .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
+ .arm
+ .altmacro
+
+#include "pixman-arm-neon-asm.h"
+
+/* Global configuration options and preferences */
+
+/*
+ * The code can optionally make use of unaligned memory accesses to improve
+ * performance of handling leading/trailing pixels for each scanline.
+ * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
+ * example in linux if unaligned memory accesses are not configured to
+ * generate.exceptions.
+ */
+.set RESPECT_STRICT_ALIGNMENT, 1
+
+/*
+ * Set default prefetch type. There is a choice between the following options:
+ *
+ * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
+ * as NOP to workaround some HW bugs or for whatever other reason)
+ *
+ * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
+ * advanced prefetch intruduces heavy overhead)
+ *
+ * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
+ * which can run ARM and NEON instructions simultaneously so that extra ARM
+ * instructions do not add (many) extra cycles, but improve prefetch efficiency)
+ *
+ * Note: some types of function can't support advanced prefetch and fallback
+ * to simple one (those which handle 24bpp pixels)
+ */
+.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
+
+/* Prefetch distance in pixels for simple prefetch */
+.set PREFETCH_DISTANCE_SIMPLE, 64
+
+/*
+ * Implementation of pixman_composite_over_8888_0565_asm_neon
+ *
+ * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
+ * performs OVER compositing operation. Function fast_composite_over_8888_0565
+ * from pixman-fast-path.c does the same in C and can be used as a reference.
+ *
+ * First we need to have some NEON assembly code which can do the actual
+ * operation on the pixels and provide it to the template macro.
+ *
+ * Template macro quite conveniently takes care of emitting all the necessary
+ * code for memory reading and writing (including quite tricky cases of
+ * handling unaligned leading/trailing pixels), so we only need to deal with
+ * the data in NEON registers.
+ *
+ * NEON registers allocation in general is recommented to be the following:
+ * d0, d1, d2, d3 - contain loaded source pixel data
+ * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
+ * d24, d25, d26, d27 - contain loading mask pixel data (if mask is used)
+ * d28, d29, d30, d31 - place for storing the result (destination pixels)
+ *
+ * As can be seen above, four 64-bit NEON registers are used for keeping
+ * intermediate pixel data and up to 8 pixels can be processed in one step
+ * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
+ *
+ * This particular function uses the following registers allocation:
+ * d0, d1, d2, d3 - contain loaded source pixel data
+ * d4, d5 - contain loaded destination pixels (they are needed)
+ * d28, d29 - place for storing the result (destination pixels)
+ */
+
+/*
+ * Step one. We need to have some code to do some arithmetics on pixel data.
+ * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
+ * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
+ * perform all the needed calculations and write the result to {d28, d29}.
+ * The rationale for having two macros and not just one will be explained
+ * later. In practice, any single monolitic function which does the work can
+ * be split into two parts in any arbitrary way without affecting correctness.
+ *
+ * There is one special trick here too. Common template macro can optionally
+ * make our life a bit easier by doing R, G, B, A color components
+ * deinterleaving for 32bpp pixel formats (and this feature is used in
+ * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
+ * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
+ * actually use d0 register for blue channel (a vector of eight 8-bit
+ * values), d1 register for green, d2 for red and d3 for alpha. This
+ * simple conversion can be also done with a few NEON instructions:
+ *
+ * Packed to planar conversion:
+ * vuzp.8 d0, d1
+ * vuzp.8 d2, d3
+ * vuzp.8 d1, d3
+ * vuzp.8 d0, d2
+ *
+ * Planar to packed conversion:
+ * vzip.8 d0, d2
+ * vzip.8 d1, d3
+ * vzip.8 d2, d3
+ * vzip.8 d0, d1
+ *
+ * But pixel can be loaded directly in planar format using VLD4.8 NEON
+ * instruction. It is 1 cycle slower than VLD1.32, so this is not always
+ * desirable, that's why deinterleaving is optional.
+ *
+ * But anyway, here is the code:
+ */
+.macro pixman_composite_over_8888_0565_process_pixblock_head
+ /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+ and put data into d6 - red, d7 - green, d30 - blue */
+ vshrn.u16 d6, q2, #8
+ vshrn.u16 d7, q2, #3
+ vsli.u16 q2, q2, #5
+ vsri.u8 d6, d6, #5
+ vmvn.8 d3, d3 /* invert source alpha */
+ vsri.u8 d7, d7, #6
+ vshrn.u16 d30, q2, #2
+ /* now do alpha blending, storing results in 8-bit planar format
+ into d16 - red, d19 - green, d18 - blue */
+ vmull.u8 q10, d3, d6
+ vmull.u8 q11, d3, d7
+ vmull.u8 q12, d3, d30
+ vrshr.u16 q13, q10, #8
+ vrshr.u16 q3, q11, #8
+ vrshr.u16 q15, q12, #8
+ vraddhn.u16 d20, q10, q13
+ vraddhn.u16 d23, q11, q3
+ vraddhn.u16 d22, q12, q15
+.endm
+
+.macro pixman_composite_over_8888_0565_process_pixblock_tail
+ /* ... continue alpha blending */
+ vqadd.u8 d16, d2, d20
+ vqadd.u8 q9, q0, q11
+ /* convert the result to r5g6b5 and store it into {d28, d29} */
+ vshll.u8 q14, d16, #8
+ vshll.u8 q8, d19, #8
+ vshll.u8 q9, d18, #8
+ vsri.u16 q14, q8, #5
+ vsri.u16 q14, q9, #11
+.endm
+
+/*
+ * OK, now we got almost everything that we need. Using the above two
+ * macros, the work can be done right. But now we want to optimize
+ * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
+ * a lot from good code scheduling and software pipelining.
+ *
+ * Let's construct some code, which will run in the core main loop.
+ * Some pseudo-code of the main loop will look like this:
+ * head
+ * while (...) {
+ * tail
+ * head
+ * }
+ * tail
+ *
+ * It may look a bit weird, but this setup allows to hide instruction
+ * latencies better and also utilize dual-issue capability more
+ * efficiently (make pairs of load-store and ALU instructions).
+ *
+ * So what we need now is a '*_tail_head' macro, which will be used
+ * in the core main loop. A trivial straightforward implementation
+ * of this macro would look like this:
+ *
+ * pixman_composite_over_8888_0565_process_pixblock_tail
+ * vst1.16 {d28, d29}, [DST_W, :128]!
+ * vld1.16 {d4, d5}, [DST_R, :128]!
+ * vld4.32 {d0, d1, d2, d3}, [SRC]!
+ * pixman_composite_over_8888_0565_process_pixblock_head
+ * cache_preload 8, 8
+ *
+ * Now it also got some VLD/VST instructions. We simply can't move from
+ * processing one block of pixels to the other one with just arithmetics.
+ * The previously processed data needs to be written to memory and new
+ * data needs to be fetched. Fortunately, this main loop does not deal
+ * with partial leading/trailing pixels and can load/store a full block
+ * of pixels in a bulk. Additionally, destination buffer is already
+ * 16 bytes aligned here (which is good for performance).
+ *
+ * New things here are DST_R, DST_W, SRC and MASK identifiers. These
+ * are the aliases for ARM registers which are used as pointers for
+ * accessing data. We maintain separate pointers for reading and writing
+ * destination buffer (DST_R and DST_W).
+ *
+ * Another new thing is 'cache_preload' macro. It is used for prefetching
+ * data into CPU L2 cache and improve performance when dealing with large
+ * images which are far larger than cache size. It uses one argument
+ * (actually two, but they need to be the same here) - number of pixels
+ * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
+ * details about this macro. Moreover, if good performance is needed
+ * the code from this macro needs to be copied into '*_tail_head' macro
+ * and mixed with the rest of code for optimal instructions scheduling.
+ * We are actually doing it below.
+ *
+ * Now after all the explanations, here is the optimized code.
+ * Different instruction streams (originaling from '*_head', '*_tail'
+ * and 'cache_preload' macro) use different indentation levels for
+ * better readability. Actually taking the code from one of these
+ * indentation levels and ignoring a few VLD/VST instructions would
+ * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
+ * macro!
+ */
+
+#if 1
+
+.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
+ vqadd.u8 d16, d2, d20
+ vld1.16 {d4, d5}, [DST_R, :128]!
+ vqadd.u8 q9, q0, q11
+ vshrn.u16 d6, q2, #8
+ fetch_src_pixblock
+ vshrn.u16 d7, q2, #3
+ vsli.u16 q2, q2, #5
+ vshll.u8 q14, d16, #8
+ PF add PF_X, PF_X, #8
+ vshll.u8 q8, d19, #8
+ PF tst PF_CTL, #0xF
+ vsri.u8 d6, d6, #5
+ PF addne PF_X, PF_X, #8
+ vmvn.8 d3, d3
+ PF subne PF_CTL, PF_CTL, #1
+ vsri.u8 d7, d7, #6
+ vshrn.u16 d30, q2, #2
+ vmull.u8 q10, d3, d6
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ vmull.u8 q11, d3, d7
+ vmull.u8 q12, d3, d30
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ vsri.u16 q14, q8, #5
+ PF cmp PF_X, ORIG_W
+ vshll.u8 q9, d18, #8
+ vrshr.u16 q13, q10, #8
+ PF subge PF_X, PF_X, ORIG_W
+ vrshr.u16 q3, q11, #8
+ vrshr.u16 q15, q12, #8
+ PF subges PF_CTL, PF_CTL, #0x10
+ vsri.u16 q14, q9, #11
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ vraddhn.u16 d20, q10, q13
+ vraddhn.u16 d23, q11, q3
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vraddhn.u16 d22, q12, q15
+ vst1.16 {d28, d29}, [DST_W, :128]!
+.endm
+
+#else
+
+/* If we did not care much about the performance, we would just use this... */
+.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
+ pixman_composite_over_8888_0565_process_pixblock_tail
+ vst1.16 {d28, d29}, [DST_W, :128]!
+ vld1.16 {d4, d5}, [DST_R, :128]!
+ fetch_src_pixblock
+ pixman_composite_over_8888_0565_process_pixblock_head
+ cache_preload 8, 8
+.endm
+
+#endif
+
+/*
+ * And now the final part. We are using 'generate_composite_function' macro
+ * to put all the stuff together. We are specifying the name of the function
+ * which we want to get, number of bits per pixel for the source, mask and
+ * destination (0 if unused, like mask in this case). Next come some bit
+ * flags:
+ * FLAG_DST_READWRITE - tells that the destination buffer is both read
+ * and written, for write-only buffer we would use
+ * FLAG_DST_WRITEONLY flag instead
+ * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
+ * and separate color channels for 32bpp format.
+ * The next things are:
+ * - the number of pixels processed per iteration (8 in this case, because
+ * that's the maximum what can fit into four 64-bit NEON registers).
+ * - prefetch distance, measured in pixel blocks. In this case it is 5 times
+ * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
+ * prefetch distance can be selected by running some benchmarks.
+ *
+ * After that we specify some macros, these are 'default_init',
+ * 'default_cleanup' here which are empty (but it is possible to have custom
+ * init/cleanup macros to be able to save/restore some extra NEON registers
+ * like d8-d15 or do anything else) followed by
+ * 'pixman_composite_over_8888_0565_process_pixblock_head',
+ * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
+ * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
+ * which we got implemented above.
+ *
+ * The last part is the NEON registers allocation scheme.
+ */
+generate_composite_function \
+ pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_0565_process_pixblock_head, \
+ pixman_composite_over_8888_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_0565_process_pixblock_head
+ /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+ and put data into d6 - red, d7 - green, d30 - blue */
+ vshrn.u16 d6, q2, #8
+ vshrn.u16 d7, q2, #3
+ vsli.u16 q2, q2, #5
+ vsri.u8 d6, d6, #5
+ vsri.u8 d7, d7, #6
+ vshrn.u16 d30, q2, #2
+ /* now do alpha blending, storing results in 8-bit planar format
+ into d16 - red, d19 - green, d18 - blue */
+ vmull.u8 q10, d3, d6
+ vmull.u8 q11, d3, d7
+ vmull.u8 q12, d3, d30
+ vrshr.u16 q13, q10, #8
+ vrshr.u16 q3, q11, #8
+ vrshr.u16 q15, q12, #8
+ vraddhn.u16 d20, q10, q13
+ vraddhn.u16 d23, q11, q3
+ vraddhn.u16 d22, q12, q15
+.endm
+
+.macro pixman_composite_over_n_0565_process_pixblock_tail
+ /* ... continue alpha blending */
+ vqadd.u8 d16, d2, d20
+ vqadd.u8 q9, q0, q11
+ /* convert the result to r5g6b5 and store it into {d28, d29} */
+ vshll.u8 q14, d16, #8
+ vshll.u8 q8, d19, #8
+ vshll.u8 q9, d18, #8
+ vsri.u16 q14, q8, #5
+ vsri.u16 q14, q9, #11
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_n_0565_process_pixblock_tail_head
+ pixman_composite_over_n_0565_process_pixblock_tail
+ vld1.16 {d4, d5}, [DST_R, :128]!
+ vst1.16 {d28, d29}, [DST_W, :128]!
+ pixman_composite_over_n_0565_process_pixblock_head
+ cache_preload 8, 8
+.endm
+
+.macro pixman_composite_over_n_0565_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d3[0]}, [DUMMY]
+ vdup.8 d0, d3[0]
+ vdup.8 d1, d3[1]
+ vdup.8 d2, d3[2]
+ vdup.8 d3, d3[3]
+ vmvn.8 d3, d3 /* invert source alpha */
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_0565_init, \
+ default_cleanup, \
+ pixman_composite_over_n_0565_process_pixblock_head, \
+ pixman_composite_over_n_0565_process_pixblock_tail, \
+ pixman_composite_over_n_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_8888_0565_process_pixblock_head
+ vshll.u8 q8, d1, #8
+ vshll.u8 q14, d2, #8
+ vshll.u8 q9, d0, #8
+.endm
+
+.macro pixman_composite_src_8888_0565_process_pixblock_tail
+ vsri.u16 q14, q8, #5
+ vsri.u16 q14, q9, #11
+.endm
+
+.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
+ vsri.u16 q14, q8, #5
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
+ fetch_src_pixblock
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vsri.u16 q14, q9, #11
+ PF cmp PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ vshll.u8 q8, d1, #8
+ vst1.16 {d28, d29}, [DST_W, :128]!
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
+ vshll.u8 q14, d2, #8
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ vshll.u8 q9, d0, #8
+.endm
+
+generate_composite_function \
+ pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_8888_0565_process_pixblock_head, \
+ pixman_composite_src_8888_0565_process_pixblock_tail, \
+ pixman_composite_src_8888_0565_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0565_8888_process_pixblock_head
+ vshrn.u16 d30, q0, #8
+ vshrn.u16 d29, q0, #3
+ vsli.u16 q0, q0, #5
+ vmov.u8 d31, #255
+ vsri.u8 d30, d30, #5
+ vsri.u8 d29, d29, #6
+ vshrn.u16 d28, q0, #2
+.endm
+
+.macro pixman_composite_src_0565_8888_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
+ pixman_composite_src_0565_8888_process_pixblock_tail
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ fetch_src_pixblock
+ pixman_composite_src_0565_8888_process_pixblock_head
+ cache_preload 8, 8
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_0565_8888_process_pixblock_head, \
+ pixman_composite_src_0565_8888_process_pixblock_tail, \
+ pixman_composite_src_0565_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8_8_process_pixblock_head
+ vqadd.u8 q14, q0, q2
+ vqadd.u8 q15, q1, q3
+.endm
+
+.macro pixman_composite_add_8_8_process_pixblock_tail
+.endm
+
+.macro pixman_composite_add_8_8_process_pixblock_tail_head
+ fetch_src_pixblock
+ PF add PF_X, PF_X, #32
+ PF tst PF_CTL, #0xF
+ vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ PF addne PF_X, PF_X, #32
+ PF subne PF_CTL, PF_CTL, #1
+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ PF cmp PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
+ vqadd.u8 q14, q0, q2
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vqadd.u8 q15, q1, q3
+.endm
+
+generate_composite_function \
+ pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8_8_process_pixblock_head, \
+ pixman_composite_add_8_8_process_pixblock_tail, \
+ pixman_composite_add_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
+ fetch_src_pixblock
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
+ vld1.32 {d4, d5, d6, d7}, [DST_R, :128]!
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vst1.32 {d28, d29, d30, d31}, [DST_W, :128]!
+ PF cmp PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
+ vqadd.u8 q14, q0, q2
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vqadd.u8 q15, q1, q3
+.endm
+
+generate_composite_function \
+ pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8_8_process_pixblock_head, \
+ pixman_composite_add_8_8_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8_8_process_pixblock_head, \
+ pixman_composite_add_8_8_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
+ vmvn.8 d24, d3 /* get inverted alpha */
+ /* do alpha blending */
+ vmull.u8 q8, d24, d4
+ vmull.u8 q9, d24, d5
+ vmull.u8 q10, d24, d6
+ vmull.u8 q11, d24, d7
+.endm
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+.endm
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ vrshr.u16 q14, q8, #8
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ PF cmp PF_X, ORIG_W
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+ fetch_src_pixblock
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ vmvn.8 d22, d3
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ PF subge PF_X, PF_X, ORIG_W
+ vmull.u8 q8, d22, d4
+ PF subges PF_CTL, PF_CTL, #0x10
+ vmull.u8 q9, d22, d5
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ vmull.u8 q10, d22, d6
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vmull.u8 q11, d22, d7
+.endm
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
+ pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
+ pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_8888_process_pixblock_head
+ pixman_composite_out_reverse_8888_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_8888_8888_process_pixblock_tail
+ pixman_composite_out_reverse_8888_8888_process_pixblock_tail
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+.endm
+
+.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ vrshr.u16 q14, q8, #8
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ PF cmp PF_X, ORIG_W
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+ fetch_src_pixblock
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ vmvn.8 d22, d3
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ PF subge PF_X, PF_X, ORIG_W
+ vmull.u8 q8, d22, d4
+ PF subges PF_CTL, PF_CTL, #0x10
+ vmull.u8 q9, d22, d5
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ vmull.u8 q10, d22, d6
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vmull.u8 q11, d22, d7
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_8888_process_pixblock_head, \
+ pixman_composite_over_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_8888_process_pixblock_head, \
+ pixman_composite_over_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_n_8888_process_pixblock_tail_head
+ pixman_composite_over_8888_8888_process_pixblock_tail
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ pixman_composite_over_8888_8888_process_pixblock_head
+ cache_preload 8, 8
+.endm
+
+.macro pixman_composite_over_n_8888_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d3[0]}, [DUMMY]
+ vdup.8 d0, d3[0]
+ vdup.8 d1, d3[1]
+ vdup.8 d2, d3[2]
+ vdup.8 d3, d3[3]
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8888_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_8888_process_pixblock_head, \
+ pixman_composite_over_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_n_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
+ vrshr.u16 q14, q8, #8
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ PF cmp PF_X, ORIG_W
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+ vld4.8 {d0, d1, d2, d3}, [DST_R, :128]!
+ vmvn.8 d22, d3
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ PF subge PF_X, PF_X, ORIG_W
+ vmull.u8 q8, d22, d4
+ PF subges PF_CTL, PF_CTL, #0x10
+ vmull.u8 q9, d22, d5
+ vmull.u8 q10, d22, d6
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vmull.u8 q11, d22, d7
+.endm
+
+.macro pixman_composite_over_reverse_n_8888_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d7[0]}, [DUMMY]
+ vdup.8 d4, d7[0]
+ vdup.8 d5, d7[1]
+ vdup.8 d6, d7[2]
+ vdup.8 d7, d7[3]
+.endm
+
+generate_composite_function \
+ pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_reverse_n_8888_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_8888_process_pixblock_head, \
+ pixman_composite_over_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 4, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_head
+ vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */
+ vmull.u8 q1, d24, d9
+ vmull.u8 q6, d24, d10
+ vmull.u8 q7, d24, d11
+ vshrn.u16 d6, q2, #8 /* convert DST_R data to 32-bpp (part1) */
+ vshrn.u16 d7, q2, #3
+ vsli.u16 q2, q2, #5
+ vrshr.u16 q8, q0, #8 /* IN for SRC pixels (part2) */
+ vrshr.u16 q9, q1, #8
+ vrshr.u16 q10, q6, #8
+ vrshr.u16 q11, q7, #8
+ vraddhn.u16 d0, q0, q8
+ vraddhn.u16 d1, q1, q9
+ vraddhn.u16 d2, q6, q10
+ vraddhn.u16 d3, q7, q11
+ vsri.u8 d6, d6, #5 /* convert DST_R data to 32-bpp (part2) */
+ vsri.u8 d7, d7, #6
+ vmvn.8 d3, d3
+ vshrn.u16 d30, q2, #2
+ vmull.u8 q8, d3, d6 /* now do alpha blending */
+ vmull.u8 q9, d3, d7
+ vmull.u8 q10, d3, d30
+.endm
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
+ /* 3 cycle bubble (after vmull.u8) */
+ vrshr.u16 q13, q8, #8
+ vrshr.u16 q11, q9, #8
+ vrshr.u16 q15, q10, #8
+ vraddhn.u16 d16, q8, q13
+ vraddhn.u16 d27, q9, q11
+ vraddhn.u16 d26, q10, q15
+ vqadd.u8 d16, d2, d16
+ /* 1 cycle bubble */
+ vqadd.u8 q9, q0, q13
+ vshll.u8 q14, d16, #8 /* convert to 16bpp */
+ vshll.u8 q8, d19, #8
+ vshll.u8 q9, d18, #8
+ vsri.u16 q14, q8, #5
+ /* 1 cycle bubble */
+ vsri.u16 q14, q9, #11
+.endm
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
+ vld1.16 {d4, d5}, [DST_R, :128]!
+ vshrn.u16 d6, q2, #8
+ fetch_mask_pixblock
+ vshrn.u16 d7, q2, #3
+ fetch_src_pixblock
+ vmull.u8 q6, d24, d10
+ vrshr.u16 q13, q8, #8
+ vrshr.u16 q11, q9, #8
+ vrshr.u16 q15, q10, #8
+ vraddhn.u16 d16, q8, q13
+ vraddhn.u16 d27, q9, q11
+ vraddhn.u16 d26, q10, q15
+ vqadd.u8 d16, d2, d16
+ vmull.u8 q1, d24, d9
+ vqadd.u8 q9, q0, q13
+ vshll.u8 q14, d16, #8
+ vmull.u8 q0, d24, d8
+ vshll.u8 q8, d19, #8
+ vshll.u8 q9, d18, #8
+ vsri.u16 q14, q8, #5
+ vmull.u8 q7, d24, d11
+ vsri.u16 q14, q9, #11
+
+ cache_preload 8, 8
+
+ vsli.u16 q2, q2, #5
+ vrshr.u16 q8, q0, #8
+ vrshr.u16 q9, q1, #8
+ vrshr.u16 q10, q6, #8
+ vrshr.u16 q11, q7, #8
+ vraddhn.u16 d0, q0, q8
+ vraddhn.u16 d1, q1, q9
+ vraddhn.u16 d2, q6, q10
+ vraddhn.u16 d3, q7, q11
+ vsri.u8 d6, d6, #5
+ vsri.u8 d7, d7, #6
+ vmvn.8 d3, d3
+ vshrn.u16 d30, q2, #2
+ vst1.16 {d28, d29}, [DST_W, :128]!
+ vmull.u8 q8, d3, d6
+ vmull.u8 q9, d3, d7
+ vmull.u8 q10, d3, d30
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_8888_8_0565_process_pixblock_head, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+/*
+ * This function needs a special initialization of solid mask.
+ * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
+ * offset, split into color components and replicated in d8-d11
+ * registers. Additionally, this function needs all the NEON registers,
+ * so it has to save d8-d15 registers which are callee saved according
+ * to ABI. These registers are restored from 'cleanup' macro. All the
+ * other NEON registers are caller saved, so can be clobbered freely
+ * without introducing any problems.
+ */
+.macro pixman_composite_over_n_8_0565_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vpush {d8-d15}
+ vld1.32 {d11[0]}, [DUMMY]
+ vdup.8 d8, d11[0]
+ vdup.8 d9, d11[1]
+ vdup.8 d10, d11[2]
+ vdup.8 d11, d11[3]
+.endm
+
+.macro pixman_composite_over_n_8_0565_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8_0565_init, \
+ pixman_composite_over_n_8_0565_cleanup, \
+ pixman_composite_over_8888_8_0565_process_pixblock_head, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_n_0565_init
+ add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
+ vpush {d8-d15}
+ vld1.32 {d24[0]}, [DUMMY]
+ vdup.8 d24, d24[3]
+.endm
+
+.macro pixman_composite_over_8888_n_0565_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_8888_n_0565_init, \
+ pixman_composite_over_8888_n_0565_cleanup, \
+ pixman_composite_over_8888_8_0565_process_pixblock_head, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0565_0565_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_0565_0565_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
+ vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
+ fetch_src_pixblock
+ cache_preload 16, 16
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
+ FLAG_DST_WRITEONLY, \
+ 16, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_0565_0565_process_pixblock_head, \
+ pixman_composite_src_0565_0565_process_pixblock_tail, \
+ pixman_composite_src_0565_0565_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_8_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_n_8_process_pixblock_tail_head
+ vst1.8 {d0, d1, d2, d3}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_src_n_8_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d0[0]}, [DUMMY]
+ vsli.u64 d0, d0, #8
+ vsli.u64 d0, d0, #16
+ vsli.u64 d0, d0, #32
+ vorr d1, d0, d0
+ vorr q1, q0, q0
+.endm
+
+.macro pixman_composite_src_n_8_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
+ FLAG_DST_WRITEONLY, \
+ 32, /* number of pixels, processed in a single block */ \
+ 0, /* prefetch distance */ \
+ pixman_composite_src_n_8_init, \
+ pixman_composite_src_n_8_cleanup, \
+ pixman_composite_src_n_8_process_pixblock_head, \
+ pixman_composite_src_n_8_process_pixblock_tail, \
+ pixman_composite_src_n_8_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_0565_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_0565_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_n_0565_process_pixblock_tail_head
+ vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_src_n_0565_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d0[0]}, [DUMMY]
+ vsli.u64 d0, d0, #16
+ vsli.u64 d0, d0, #32
+ vorr d1, d0, d0
+ vorr q1, q0, q0
+.endm
+
+.macro pixman_composite_src_n_0565_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
+ FLAG_DST_WRITEONLY, \
+ 16, /* number of pixels, processed in a single block */ \
+ 0, /* prefetch distance */ \
+ pixman_composite_src_n_0565_init, \
+ pixman_composite_src_n_0565_cleanup, \
+ pixman_composite_src_n_0565_process_pixblock_head, \
+ pixman_composite_src_n_0565_process_pixblock_tail, \
+ pixman_composite_src_n_0565_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_n_8888_process_pixblock_tail_head
+ vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_src_n_8888_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d0[0]}, [DUMMY]
+ vsli.u64 d0, d0, #32
+ vorr d1, d0, d0
+ vorr q1, q0, q0
+.endm
+
+.macro pixman_composite_src_n_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
+ FLAG_DST_WRITEONLY, \
+ 8, /* number of pixels, processed in a single block */ \
+ 0, /* prefetch distance */ \
+ pixman_composite_src_n_8888_init, \
+ pixman_composite_src_n_8888_cleanup, \
+ pixman_composite_src_n_8888_process_pixblock_head, \
+ pixman_composite_src_n_8888_process_pixblock_tail, \
+ pixman_composite_src_n_8888_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_8888_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_8888_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
+ vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
+ fetch_src_pixblock
+ cache_preload 8, 8
+.endm
+
+generate_composite_function \
+ pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_WRITEONLY, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_8888_8888_process_pixblock_head, \
+ pixman_composite_src_8888_8888_process_pixblock_tail, \
+ pixman_composite_src_8888_8888_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_x888_8888_process_pixblock_head
+ vorr q0, q0, q2
+ vorr q1, q1, q2
+.endm
+
+.macro pixman_composite_src_x888_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
+ vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
+ fetch_src_pixblock
+ vorr q0, q0, q2
+ vorr q1, q1, q2
+ cache_preload 8, 8
+.endm
+
+.macro pixman_composite_src_x888_8888_init
+ vmov.u8 q2, #0xFF
+ vshl.u32 q2, q2, #24
+.endm
+
+generate_composite_function \
+ pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_WRITEONLY, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ pixman_composite_src_x888_8888_init, \
+ default_cleanup, \
+ pixman_composite_src_x888_8888_process_pixblock_head, \
+ pixman_composite_src_x888_8888_process_pixblock_tail, \
+ pixman_composite_src_x888_8888_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_head
+ /* expecting deinterleaved source data in {d8, d9, d10, d11} */
+ /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
+ /* and destination data in {d4, d5, d6, d7} */
+ /* mask is in d24 (d25, d26, d27 are unused) */
+
+ /* in */
+ vmull.u8 q0, d24, d8
+ vmull.u8 q1, d24, d9
+ vmull.u8 q6, d24, d10
+ vmull.u8 q7, d24, d11
+ vrshr.u16 q10, q0, #8
+ vrshr.u16 q11, q1, #8
+ vrshr.u16 q12, q6, #8
+ vrshr.u16 q13, q7, #8
+ vraddhn.u16 d0, q0, q10
+ vraddhn.u16 d1, q1, q11
+ vraddhn.u16 d2, q6, q12
+ vraddhn.u16 d3, q7, q13
+ vmvn.8 d24, d3 /* get inverted alpha */
+ /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */
+ /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
+ /* now do alpha blending */
+ vmull.u8 q8, d24, d4
+ vmull.u8 q9, d24, d5
+ vmull.u8 q10, d24, d6
+ vmull.u8 q11, d24, d7
+.endm
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_tail
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
+ pixman_composite_over_n_8_8888_process_pixblock_tail
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ fetch_mask_pixblock
+ cache_preload 8, 8
+ pixman_composite_over_n_8_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_n_8_8888_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vpush {d8-d15}
+ vld1.32 {d11[0]}, [DUMMY]
+ vdup.8 d8, d11[0]
+ vdup.8 d9, d11[1]
+ vdup.8 d10, d11[2]
+ vdup.8 d11, d11[3]
+.endm
+
+.macro pixman_composite_over_n_8_8888_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8_8888_init, \
+ pixman_composite_over_n_8_8888_cleanup, \
+ pixman_composite_over_n_8_8888_process_pixblock_head, \
+ pixman_composite_over_n_8_8888_process_pixblock_tail, \
+ pixman_composite_over_n_8_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8_8_process_pixblock_head
+ vmull.u8 q0, d24, d8
+ vmull.u8 q1, d25, d8
+ vmull.u8 q6, d26, d8
+ vmull.u8 q7, d27, d8
+ vrshr.u16 q10, q0, #8
+ vrshr.u16 q11, q1, #8
+ vrshr.u16 q12, q6, #8
+ vrshr.u16 q13, q7, #8
+ vraddhn.u16 d0, q0, q10
+ vraddhn.u16 d1, q1, q11
+ vraddhn.u16 d2, q6, q12
+ vraddhn.u16 d3, q7, q13
+ vmvn.8 q12, q0
+ vmvn.8 q13, q1
+ vmull.u8 q8, d24, d4
+ vmull.u8 q9, d25, d5
+ vmull.u8 q10, d26, d6
+ vmull.u8 q11, d27, d7
+.endm
+
+.macro pixman_composite_over_n_8_8_process_pixblock_tail
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
+ vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ pixman_composite_over_n_8_8_process_pixblock_tail
+ fetch_mask_pixblock
+ cache_preload 32, 32
+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ pixman_composite_over_n_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_n_8_8_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vpush {d8-d15}
+ vld1.32 {d8[0]}, [DUMMY]
+ vdup.8 d8, d8[3]
+.endm
+
+.macro pixman_composite_over_n_8_8_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8_8_init, \
+ pixman_composite_over_n_8_8_cleanup, \
+ pixman_composite_over_n_8_8_process_pixblock_head, \
+ pixman_composite_over_n_8_8_process_pixblock_tail, \
+ pixman_composite_over_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
+ /*
+ * 'combine_mask_ca' replacement
+ *
+ * input: solid src (n) in {d8, d9, d10, d11}
+ * dest in {d4, d5, d6, d7 }
+ * mask in {d24, d25, d26, d27}
+ * output: updated src in {d0, d1, d2, d3 }
+ * updated mask in {d24, d25, d26, d3 }
+ */
+ vmull.u8 q0, d24, d8
+ vmull.u8 q1, d25, d9
+ vmull.u8 q6, d26, d10
+ vmull.u8 q7, d27, d11
+ vmull.u8 q9, d11, d25
+ vmull.u8 q12, d11, d24
+ vmull.u8 q13, d11, d26
+ vrshr.u16 q8, q0, #8
+ vrshr.u16 q10, q1, #8
+ vrshr.u16 q11, q6, #8
+ vraddhn.u16 d0, q0, q8
+ vraddhn.u16 d1, q1, q10
+ vraddhn.u16 d2, q6, q11
+ vrshr.u16 q11, q12, #8
+ vrshr.u16 q8, q9, #8
+ vrshr.u16 q6, q13, #8
+ vrshr.u16 q10, q7, #8
+ vraddhn.u16 d24, q12, q11
+ vraddhn.u16 d25, q9, q8
+ vraddhn.u16 d26, q13, q6
+ vraddhn.u16 d3, q7, q10
+ /*
+ * 'combine_over_ca' replacement
+ *
+ * output: updated dest in {d28, d29, d30, d31}
+ */
+ vmvn.8 d24, d24
+ vmvn.8 d25, d25
+ vmull.u8 q8, d24, d4
+ vmull.u8 q9, d25, d5
+ vmvn.8 d26, d26
+ vmvn.8 d27, d3
+ vmull.u8 q10, d26, d6
+ vmull.u8 q11, d27, d7
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
+ /* ... continue 'combine_over_ca' replacement */
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q6, q10, #8
+ vrshr.u16 q7, q11, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q6, q10
+ vraddhn.u16 d31, q7, q11
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ vrshr.u16 q6, q10, #8
+ vrshr.u16 q7, q11, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q6, q10
+ vraddhn.u16 d31, q7, q11
+ fetch_mask_pixblock
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+ cache_preload 8, 8
+ pixman_composite_over_n_8888_8888_ca_process_pixblock_head
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vpush {d8-d15}
+ vld1.32 {d11[0]}, [DUMMY]
+ vdup.8 d8, d11[0]
+ vdup.8 d9, d11[1]
+ vdup.8 d10, d11[2]
+ vdup.8 d11, d11[3]
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8888_8888_ca_init, \
+ pixman_composite_over_n_8888_8888_ca_cleanup, \
+ pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
+ pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
+ pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_in_n_8_process_pixblock_head
+ /* expecting source data in {d0, d1, d2, d3} */
+ /* and destination data in {d4, d5, d6, d7} */
+ vmull.u8 q8, d4, d3
+ vmull.u8 q9, d5, d3
+ vmull.u8 q10, d6, d3
+ vmull.u8 q11, d7, d3
+.endm
+
+.macro pixman_composite_in_n_8_process_pixblock_tail
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ vraddhn.u16 d28, q8, q14
+ vraddhn.u16 d29, q9, q15
+ vraddhn.u16 d30, q10, q12
+ vraddhn.u16 d31, q11, q13
+.endm
+
+.macro pixman_composite_in_n_8_process_pixblock_tail_head
+ pixman_composite_in_n_8_process_pixblock_tail
+ vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ cache_preload 32, 32
+ pixman_composite_in_n_8_process_pixblock_head
+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_in_n_8_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d3[0]}, [DUMMY]
+ vdup.8 d3, d3[3]
+.endm
+
+.macro pixman_composite_in_n_8_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_in_n_8_init, \
+ pixman_composite_in_n_8_cleanup, \
+ pixman_composite_in_n_8_process_pixblock_head, \
+ pixman_composite_in_n_8_process_pixblock_tail, \
+ pixman_composite_in_n_8_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+.macro pixman_composite_add_n_8_8_process_pixblock_head
+ /* expecting source data in {d8, d9, d10, d11} */
+ /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
+ /* and destination data in {d4, d5, d6, d7} */
+ /* mask is in d24, d25, d26, d27 */
+ vmull.u8 q0, d24, d11
+ vmull.u8 q1, d25, d11
+ vmull.u8 q6, d26, d11
+ vmull.u8 q7, d27, d11
+ vrshr.u16 q10, q0, #8
+ vrshr.u16 q11, q1, #8
+ vrshr.u16 q12, q6, #8
+ vrshr.u16 q13, q7, #8
+ vraddhn.u16 d0, q0, q10
+ vraddhn.u16 d1, q1, q11
+ vraddhn.u16 d2, q6, q12
+ vraddhn.u16 d3, q7, q13
+ vqadd.u8 q14, q0, q2
+ vqadd.u8 q15, q1, q3
+.endm
+
+.macro pixman_composite_add_n_8_8_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
+ pixman_composite_add_n_8_8_process_pixblock_tail
+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ fetch_mask_pixblock
+ cache_preload 32, 32
+ pixman_composite_add_n_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_add_n_8_8_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vpush {d8-d15}
+ vld1.32 {d11[0]}, [DUMMY]
+ vdup.8 d11, d11[3]
+.endm
+
+.macro pixman_composite_add_n_8_8_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_add_n_8_8_init, \
+ pixman_composite_add_n_8_8_cleanup, \
+ pixman_composite_add_n_8_8_process_pixblock_head, \
+ pixman_composite_add_n_8_8_process_pixblock_tail, \
+ pixman_composite_add_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8_8_8_process_pixblock_head
+ /* expecting source data in {d0, d1, d2, d3} */
+ /* destination data in {d4, d5, d6, d7} */
+ /* mask in {d24, d25, d26, d27} */
+ vmull.u8 q8, d24, d0
+ vmull.u8 q9, d25, d1
+ vmull.u8 q10, d26, d2
+ vmull.u8 q11, d27, d3
+ vrshr.u16 q0, q8, #8
+ vrshr.u16 q1, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ vraddhn.u16 d0, q0, q8
+ vraddhn.u16 d1, q1, q9
+ vraddhn.u16 d2, q12, q10
+ vraddhn.u16 d3, q13, q11
+ vqadd.u8 q14, q0, q2
+ vqadd.u8 q15, q1, q3
+.endm
+
+.macro pixman_composite_add_8_8_8_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
+ pixman_composite_add_8_8_8_process_pixblock_tail
+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ fetch_mask_pixblock
+ fetch_src_pixblock
+ cache_preload 32, 32
+ pixman_composite_add_8_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_add_8_8_8_init
+.endm
+
+.macro pixman_composite_add_8_8_8_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_add_8_8_8_init, \
+ pixman_composite_add_8_8_8_cleanup, \
+ pixman_composite_add_8_8_8_process_pixblock_head, \
+ pixman_composite_add_8_8_8_process_pixblock_tail, \
+ pixman_composite_add_8_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
+ /* expecting source data in {d0, d1, d2, d3} */
+ /* destination data in {d4, d5, d6, d7} */
+ /* mask in {d24, d25, d26, d27} */
+ vmull.u8 q8, d27, d0
+ vmull.u8 q9, d27, d1
+ vmull.u8 q10, d27, d2
+ vmull.u8 q11, d27, d3
+ /* 1 cycle bubble */
+ vrsra.u16 q8, q8, #8
+ vrsra.u16 q9, q9, #8
+ vrsra.u16 q10, q10, #8
+ vrsra.u16 q11, q11, #8
+.endm
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
+ /* 2 cycle bubble */
+ vrshrn.u16 d28, q8, #8
+ vrshrn.u16 d29, q9, #8
+ vrshrn.u16 d30, q10, #8
+ vrshrn.u16 d31, q11, #8
+ vqadd.u8 q14, q2, q14
+ /* 1 cycle bubble */
+ vqadd.u8 q15, q3, q15
+.endm
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+ fetch_src_pixblock
+ vrshrn.u16 d28, q8, #8
+ fetch_mask_pixblock
+ vrshrn.u16 d29, q9, #8
+ vmull.u8 q8, d27, d0
+ vrshrn.u16 d30, q10, #8
+ vmull.u8 q9, d27, d1
+ vrshrn.u16 d31, q11, #8
+ vmull.u8 q10, d27, d2
+ vqadd.u8 q14, q2, q14
+ vmull.u8 q11, d27, d3
+ vqadd.u8 q15, q3, q15
+ vrsra.u16 q8, q8, #8
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ vrsra.u16 q9, q9, #8
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ vrsra.u16 q10, q10, #8
+
+ cache_preload 8, 8
+
+ vrsra.u16 q11, q11, #8
+.endm
+
+generate_composite_function \
+ pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+generate_composite_function \
+ pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 27 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_n_8_8888_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d3[0]}, [DUMMY]
+ vdup.8 d0, d3[0]
+ vdup.8 d1, d3[1]
+ vdup.8 d2, d3[2]
+ vdup.8 d3, d3[3]
+.endm
+
+.macro pixman_composite_add_n_8_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_add_n_8_8888_init, \
+ pixman_composite_add_n_8_8888_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 27 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_n_8888_init
+ add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
+ vld1.32 {d27[0]}, [DUMMY]
+ vdup.8 d27, d27[3]
+.endm
+
+.macro pixman_composite_add_8888_n_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_add_8888_n_8888_init, \
+ pixman_composite_add_8888_n_8888_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 27 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+ /* expecting source data in {d0, d1, d2, d3} */
+ /* destination data in {d4, d5, d6, d7} */
+ /* solid mask is in d15 */
+
+ /* 'in' */
+ vmull.u8 q8, d15, d3
+ vmull.u8 q6, d15, d2
+ vmull.u8 q5, d15, d1
+ vmull.u8 q4, d15, d0
+ vrshr.u16 q13, q8, #8
+ vrshr.u16 q12, q6, #8
+ vrshr.u16 q11, q5, #8
+ vrshr.u16 q10, q4, #8
+ vraddhn.u16 d3, q8, q13
+ vraddhn.u16 d2, q6, q12
+ vraddhn.u16 d1, q5, q11
+ vraddhn.u16 d0, q4, q10
+ vmvn.8 d24, d3 /* get inverted alpha */
+ /* now do alpha blending */
+ vmull.u8 q8, d24, d4
+ vmull.u8 q9, d24, d5
+ vmull.u8 q10, d24, d6
+ vmull.u8 q11, d24, d7
+.endm
+
+.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+ fetch_src_pixblock
+ cache_preload 8, 8
+ fetch_mask_pixblock
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
+ pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 12 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_n_8888_process_pixblock_head
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ pixman_composite_over_8888_n_8888_process_pixblock_tail
+ fetch_src_pixblock
+ cache_preload 8, 8
+ pixman_composite_over_8888_n_8888_process_pixblock_head
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_over_8888_n_8888_init
+ add DUMMY, sp, #48
+ vpush {d8-d15}
+ vld1.32 {d15[0]}, [DUMMY]
+ vdup.8 d15, d15[3]
+.endm
+
+.macro pixman_composite_over_8888_n_8888_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_8888_n_8888_init, \
+ pixman_composite_over_8888_n_8888_cleanup, \
+ pixman_composite_over_8888_n_8888_process_pixblock_head, \
+ pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_n_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ pixman_composite_over_8888_n_8888_process_pixblock_tail
+ fetch_src_pixblock
+ cache_preload 8, 8
+ fetch_mask_pixblock
+ pixman_composite_over_8888_n_8888_process_pixblock_head
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_8888_n_8888_process_pixblock_head, \
+ pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 12 /* mask_basereg */
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_8888_n_8888_process_pixblock_head, \
+ pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 12 /* mask_basereg */
+
+/******************************************************************************/
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ pixman_composite_over_8888_n_8888_process_pixblock_tail
+ fetch_src_pixblock
+ cache_preload 8, 8
+ fetch_mask_pixblock
+ pixman_composite_over_8888_n_8888_process_pixblock_head
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_8888_n_8888_process_pixblock_head, \
+ pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 15 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_0888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_0888_0888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
+ vst3.8 {d0, d1, d2}, [DST_W]!
+ fetch_src_pixblock
+ cache_preload 8, 8
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
+ FLAG_DST_WRITEONLY, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_0888_0888_process_pixblock_head, \
+ pixman_composite_src_0888_0888_process_pixblock_tail, \
+ pixman_composite_src_0888_0888_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
+ vswp d0, d2
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
+ vst4.8 {d0, d1, d2, d3}, [DST_W]!
+ fetch_src_pixblock
+ vswp d0, d2
+ cache_preload 8, 8
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_init
+ veor d3, d3, d3
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ pixman_composite_src_0888_8888_rev_init, \
+ default_cleanup, \
+ pixman_composite_src_0888_8888_rev_process_pixblock_head, \
+ pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
+ pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
+ vshll.u8 q8, d1, #8
+ vshll.u8 q9, d2, #8
+.endm
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
+ vshll.u8 q14, d0, #8
+ vsri.u16 q14, q8, #5
+ vsri.u16 q14, q9, #11
+.endm
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
+ vshll.u8 q14, d0, #8
+ fetch_src_pixblock
+ vsri.u16 q14, q8, #5
+ vsri.u16 q14, q9, #11
+ vshll.u8 q8, d1, #8
+ vst1.16 {d28, d29}, [DST_W, :128]!
+ vshll.u8 q9, d2, #8
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
+ FLAG_DST_WRITEONLY, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_0888_0565_rev_process_pixblock_head, \
+ pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
+ pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d3, d1
+ vmull.u8 q10, d3, d2
+.endm
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
+ vrshr.u16 q11, q8, #8
+ vswp d3, d31
+ vrshr.u16 q12, q9, #8
+ vrshr.u16 q13, q10, #8
+ vraddhn.u16 d30, q11, q8
+ vraddhn.u16 d29, q12, q9
+ vraddhn.u16 d28, q13, q10
+.endm
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
+ vrshr.u16 q11, q8, #8
+ vswp d3, d31
+ vrshr.u16 q12, q9, #8
+ vrshr.u16 q13, q10, #8
+ fetch_src_pixblock
+ vraddhn.u16 d30, q11, q8
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vraddhn.u16 d29, q12, q9
+ vraddhn.u16 d28, q13, q10
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d3, d1
+ vmull.u8 q10, d3, d2
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ PF cmp PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+.endm
+
+generate_composite_function \
+ pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_pixbuf_8888_process_pixblock_head, \
+ pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
+ pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d3, d1
+ vmull.u8 q10, d3, d2
+.endm
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
+ vrshr.u16 q11, q8, #8
+ vswp d3, d31
+ vrshr.u16 q12, q9, #8
+ vrshr.u16 q13, q10, #8
+ vraddhn.u16 d28, q11, q8
+ vraddhn.u16 d29, q12, q9
+ vraddhn.u16 d30, q13, q10
+.endm
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
+ vrshr.u16 q11, q8, #8
+ vswp d3, d31
+ vrshr.u16 q12, q9, #8
+ vrshr.u16 q13, q10, #8
+ fetch_src_pixblock
+ vraddhn.u16 d28, q11, q8
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vraddhn.u16 d29, q12, q9
+ vraddhn.u16 d30, q13, q10
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d3, d1
+ vmull.u8 q10, d3, d2
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ PF cmp PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+.endm
+
+generate_composite_function \
+ pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
+ pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
+ pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_0565_8_0565_process_pixblock_head
+ /* mask is in d15 */
+ convert_0565_to_x888 q4, d2, d1, d0
+ convert_0565_to_x888 q5, d6, d5, d4
+ /* source pixel data is in {d0, d1, d2, XX} */
+ /* destination pixel data is in {d4, d5, d6, XX} */
+ vmvn.8 d7, d15
+ vmull.u8 q6, d15, d2
+ vmull.u8 q5, d15, d1
+ vmull.u8 q4, d15, d0
+ vmull.u8 q8, d7, d4
+ vmull.u8 q9, d7, d5
+ vmull.u8 q13, d7, d6
+ vrshr.u16 q12, q6, #8
+ vrshr.u16 q11, q5, #8
+ vrshr.u16 q10, q4, #8
+ vraddhn.u16 d2, q6, q12
+ vraddhn.u16 d1, q5, q11
+ vraddhn.u16 d0, q4, q10
+.endm
+
+.macro pixman_composite_over_0565_8_0565_process_pixblock_tail
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q13, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q12, q13
+ vqadd.u8 q0, q0, q14
+ vqadd.u8 q1, q1, q15
+ /* 32bpp result is in {d0, d1, d2, XX} */
+ convert_8888_to_0565 d2, d1, d0, q14, q15, q3
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
+ fetch_mask_pixblock
+ pixman_composite_over_0565_8_0565_process_pixblock_tail
+ fetch_src_pixblock
+ vld1.16 {d10, d11}, [DST_R, :128]!
+ cache_preload 8, 8
+ pixman_composite_over_0565_8_0565_process_pixblock_head
+ vst1.16 {d28, d29}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+ pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_0565_8_0565_process_pixblock_head, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 10, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 15 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_0565_n_0565_init
+ add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
+ vpush {d8-d15}
+ vld1.32 {d15[0]}, [DUMMY]
+ vdup.8 d15, d15[3]
+.endm
+
+.macro pixman_composite_over_0565_n_0565_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_0565_n_0565_init, \
+ pixman_composite_over_0565_n_0565_cleanup, \
+ pixman_composite_over_0565_8_0565_process_pixblock_head, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 10, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 15 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_0565_8_0565_process_pixblock_head
+ /* mask is in d15 */
+ convert_0565_to_x888 q4, d2, d1, d0
+ convert_0565_to_x888 q5, d6, d5, d4
+ /* source pixel data is in {d0, d1, d2, XX} */
+ /* destination pixel data is in {d4, d5, d6, XX} */
+ vmull.u8 q6, d15, d2
+ vmull.u8 q5, d15, d1
+ vmull.u8 q4, d15, d0
+ vrshr.u16 q12, q6, #8
+ vrshr.u16 q11, q5, #8
+ vrshr.u16 q10, q4, #8
+ vraddhn.u16 d2, q6, q12
+ vraddhn.u16 d1, q5, q11
+ vraddhn.u16 d0, q4, q10
+.endm
+
+.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
+ vqadd.u8 q0, q0, q2
+ vqadd.u8 q1, q1, q3
+ /* 32bpp result is in {d0, d1, d2, XX} */
+ convert_8888_to_0565 d2, d1, d0, q14, q15, q3
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
+ fetch_mask_pixblock
+ pixman_composite_add_0565_8_0565_process_pixblock_tail
+ fetch_src_pixblock
+ vld1.16 {d10, d11}, [DST_R, :128]!
+ cache_preload 8, 8
+ pixman_composite_add_0565_8_0565_process_pixblock_head
+ vst1.16 {d28, d29}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+ pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_add_0565_8_0565_process_pixblock_head, \
+ pixman_composite_add_0565_8_0565_process_pixblock_tail, \
+ pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 10, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 15 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
+ /* mask is in d15 */
+ convert_0565_to_x888 q5, d6, d5, d4
+ /* destination pixel data is in {d4, d5, d6, xx} */
+ vmvn.8 d24, d15 /* get inverted alpha */
+ /* now do alpha blending */
+ vmull.u8 q8, d24, d4
+ vmull.u8 q9, d24, d5
+ vmull.u8 q10, d24, d6
+.endm
+
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vraddhn.u16 d0, q14, q8
+ vraddhn.u16 d1, q15, q9
+ vraddhn.u16 d2, q12, q10
+ /* 32bpp result is in {d0, d1, d2, XX} */
+ convert_8888_to_0565 d2, d1, d0, q14, q15, q3
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
+ fetch_src_pixblock
+ pixman_composite_out_reverse_8_0565_process_pixblock_tail
+ vld1.16 {d10, d11}, [DST_R, :128]!
+ cache_preload 8, 8
+ pixman_composite_out_reverse_8_0565_process_pixblock_head
+ vst1.16 {d28, d29}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+ pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_out_reverse_8_0565_process_pixblock_head, \
+ pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
+ pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 10, /* dst_r_basereg */ \
+ 15, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_8888_process_pixblock_head, \
+ pixman_composite_over_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_0565_process_pixblock_head, \
+ pixman_composite_over_8888_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_8888_0565_process_pixblock_head, \
+ pixman_composite_src_8888_0565_process_pixblock_tail, \
+ pixman_composite_src_8888_0565_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_0565_8888_process_pixblock_head, \
+ pixman_composite_src_0565_8888_process_pixblock_tail, \
+ pixman_composite_src_0565_8888_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_8888_8_0565_process_pixblock_head, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_0565_8_0565_process_pixblock_head, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 10, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 15 /* mask_basereg */
+
+/******************************************************************************/
+
+/* Supplementary macro for setting function attributes */
+.macro pixman_asm_function fname
+ .func fname
+ .global fname
+#ifdef __ELF__
+ .hidden fname
+ .type fname, %function
+#endif
+fname:
+.endm
+
+/*
+ * Bilinear scaling support code which tries to provide pixel fetching, color
+ * format conversion, and interpolation as separate macros which can be used
+ * as the basic building blocks for constructing bilinear scanline functions.
+ */
+
+.macro bilinear_load_8888 reg1, reg2, tmp
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP2, asl #2
+ add TMP2, BOTTOM, TMP2, asl #2
+ vld1.32 {reg1}, [TMP1]
+ vld1.32 {reg2}, [TMP2]
+.endm
+
+.macro bilinear_load_0565 reg1, reg2, tmp
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP2, asl #1
+ add TMP2, BOTTOM, TMP2, asl #1
+ vld1.32 {reg2[0]}, [TMP1]
+ vld1.32 {reg2[1]}, [TMP2]
+ convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_8888 \
+ acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
+
+ bilinear_load_8888 reg1, reg2, tmp1
+ vmull.u8 acc1, reg1, d28
+ vmlal.u8 acc1, reg2, d29
+ bilinear_load_8888 reg3, reg4, tmp2
+ vmull.u8 acc2, reg3, d28
+ vmlal.u8 acc2, reg4, d29
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_8888 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+ bilinear_load_and_vertical_interpolate_two_8888 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
+ bilinear_load_and_vertical_interpolate_two_8888 \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_0565 \
+ acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
+
+ mov TMP2, X, asr #16
+ add X, X, UX
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP2, asl #1
+ add TMP2, BOTTOM, TMP2, asl #1
+ add TMP3, TOP, TMP4, asl #1
+ add TMP4, BOTTOM, TMP4, asl #1
+ vld1.32 {acc2lo[0]}, [TMP1]
+ vld1.32 {acc2hi[0]}, [TMP3]
+ vld1.32 {acc2lo[1]}, [TMP2]
+ vld1.32 {acc2hi[1]}, [TMP4]
+ convert_0565_to_x888 acc2, reg3, reg2, reg1
+ vzip.u8 reg1, reg3
+ vzip.u8 reg2, reg4
+ vzip.u8 reg3, reg4
+ vzip.u8 reg1, reg2
+ vmull.u8 acc1, reg1, d28
+ vmlal.u8 acc1, reg2, d29
+ vmull.u8 acc2, reg3, d28
+ vmlal.u8 acc2, reg4, d29
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_0565 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+ mov TMP2, X, asr #16
+ add X, X, UX
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP2, asl #1
+ add TMP2, BOTTOM, TMP2, asl #1
+ add TMP3, TOP, TMP4, asl #1
+ add TMP4, BOTTOM, TMP4, asl #1
+ vld1.32 {xacc2lo[0]}, [TMP1]
+ vld1.32 {xacc2hi[0]}, [TMP3]
+ vld1.32 {xacc2lo[1]}, [TMP2]
+ vld1.32 {xacc2hi[1]}, [TMP4]
+ convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
+ mov TMP2, X, asr #16
+ add X, X, UX
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP2, asl #1
+ add TMP2, BOTTOM, TMP2, asl #1
+ add TMP3, TOP, TMP4, asl #1
+ add TMP4, BOTTOM, TMP4, asl #1
+ vld1.32 {yacc2lo[0]}, [TMP1]
+ vzip.u8 xreg1, xreg3
+ vld1.32 {yacc2hi[0]}, [TMP3]
+ vzip.u8 xreg2, xreg4
+ vld1.32 {yacc2lo[1]}, [TMP2]
+ vzip.u8 xreg3, xreg4
+ vld1.32 {yacc2hi[1]}, [TMP4]
+ vzip.u8 xreg1, xreg2
+ convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
+ vmull.u8 xacc1, xreg1, d28
+ vzip.u8 yreg1, yreg3
+ vmlal.u8 xacc1, xreg2, d29
+ vzip.u8 yreg2, yreg4
+ vmull.u8 xacc2, xreg3, d28
+ vzip.u8 yreg3, yreg4
+ vmlal.u8 xacc2, xreg4, d29
+ vzip.u8 yreg1, yreg2
+ vmull.u8 yacc1, yreg1, d28
+ vmlal.u8 yacc1, yreg2, d29
+ vmull.u8 yacc2, yreg3, d28
+ vmlal.u8 yacc2, yreg4, d29
+.endm
+
+.macro bilinear_store_8888 numpix, tmp1, tmp2
+.if numpix == 4
+ vst1.32 {d0, d1}, [OUT]!
+.elseif numpix == 2
+ vst1.32 {d0}, [OUT]!
+.elseif numpix == 1
+ vst1.32 {d0[0]}, [OUT, :32]!
+.else
+ .error bilinear_store_8888 numpix is unsupported
+.endif
+.endm
+
+.macro bilinear_store_0565 numpix, tmp1, tmp2
+ vuzp.u8 d0, d1
+ vuzp.u8 d2, d3
+ vuzp.u8 d1, d3
+ vuzp.u8 d0, d2
+ convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
+.if numpix == 4
+ vst1.16 {d2}, [OUT]!
+.elseif numpix == 2
+ vst1.32 {d2[0]}, [OUT]!
+.elseif numpix == 1
+ vst1.16 {d2[0]}, [OUT]!
+.else
+ .error bilinear_store_0565 numpix is unsupported
+.endif
+.endm
+
+.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
+ bilinear_load_&src_fmt d0, d1, d2
+ vmull.u8 q1, d0, d28
+ vmlal.u8 q1, d1, d29
+ vshr.u16 d30, d24, #8
+ /* 4 cycles bubble */
+ vshll.u16 q0, d2, #8
+ vmlsl.u16 q0, d2, d30
+ vmlal.u16 q0, d3, d30
+ /* 5 cycles bubble */
+ vshrn.u32 d0, q0, #16
+ /* 3 cycles bubble */
+ vmovn.u16 d0, q0
+ /* 1 cycle bubble */
+ bilinear_store_&dst_fmt 1, q2, q3
+.endm
+
+.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
+ bilinear_load_and_vertical_interpolate_two_&src_fmt \
+ q1, q11, d0, d1, d20, d21, d22, d23
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+ vshll.u16 q0, d2, #8
+ vmlsl.u16 q0, d2, d30
+ vmlal.u16 q0, d3, d30
+ vshll.u16 q10, d22, #8
+ vmlsl.u16 q10, d22, d31
+ vmlal.u16 q10, d23, d31
+ vshrn.u32 d30, q0, #16
+ vshrn.u32 d31, q10, #16
+ vmovn.u16 d0, q15
+ bilinear_store_&dst_fmt 2, q2, q3
+.endm
+
+.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
+ bilinear_load_and_vertical_interpolate_four_&src_fmt \
+ q1, q11, d0, d1, d20, d21, d22, d23 \
+ q3, q9, d4, d5, d16, d17, d18, d19
+ pld [TMP1, PF_OFFS]
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+ vshll.u16 q0, d2, #8
+ vmlsl.u16 q0, d2, d30
+ vmlal.u16 q0, d3, d30
+ vshll.u16 q10, d22, #8
+ vmlsl.u16 q10, d22, d31
+ vmlal.u16 q10, d23, d31
+ vshr.u16 q15, q12, #8
+ vshll.u16 q2, d6, #8
+ vmlsl.u16 q2, d6, d30
+ vmlal.u16 q2, d7, d30
+ vshll.u16 q8, d18, #8
+ pld [TMP2, PF_OFFS]
+ vmlsl.u16 q8, d18, d31
+ vmlal.u16 q8, d19, d31
+ vadd.u16 q12, q12, q13
+ vshrn.u32 d0, q0, #16
+ vshrn.u32 d1, q10, #16
+ vshrn.u32 d4, q2, #16
+ vshrn.u32 d5, q8, #16
+ vmovn.u16 d0, q0
+ vmovn.u16 d1, q2
+ bilinear_store_&dst_fmt 4, q2, q3
+.endm
+
+/*
+ * Main template macro for generating NEON optimized bilinear scanline
+ * functions.
+ *
+ * TODO: use software pipelining and aligned writes to the destination buffer
+ * in order to improve performance
+ *
+ * Bilinear scanline scaler macro template uses the following arguments:
+ * fname - name of the function to generate
+ * src_fmt - source color format (8888 or 0565)
+ * dst_fmt - destination color format (8888 or 0565)
+ * bpp_shift - (1 << bpp_shift) is the size of source pixel in bytes
+ * prefetch_distance - prefetch in the source image by that many
+ * pixels ahead
+ */
+
+.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
+ bpp_shift, prefetch_distance
+
+pixman_asm_function fname
+ OUT .req r0
+ TOP .req r1
+ BOTTOM .req r2
+ WT .req r3
+ WB .req r4
+ X .req r5
+ UX .req r6
+ WIDTH .req ip
+ TMP1 .req r3
+ TMP2 .req r4
+ PF_OFFS .req r7
+ TMP3 .req r8
+ TMP4 .req r9
+
+ mov ip, sp
+ push {r4, r5, r6, r7, r8, r9}
+ mov PF_OFFS, #prefetch_distance
+ ldmia ip, {WB, X, UX, WIDTH}
+ mul PF_OFFS, PF_OFFS, UX
+
+ cmp WIDTH, #0
+ ble 3f
+
+ vdup.u16 q12, X
+ vdup.u16 q13, UX
+ vdup.u8 d28, WT
+ vdup.u8 d29, WB
+ vadd.u16 d25, d25, d26
+ vadd.u16 q13, q13, q13
+
+ subs WIDTH, WIDTH, #4
+ blt 1f
+ mov PF_OFFS, PF_OFFS, asr #(16 - bpp_shift)
+0:
+ bilinear_interpolate_four_pixels src_fmt, dst_fmt
+ subs WIDTH, WIDTH, #4
+ bge 0b
+1:
+ tst WIDTH, #2
+ beq 2f
+ bilinear_interpolate_two_pixels src_fmt, dst_fmt
+2:
+ tst WIDTH, #1
+ beq 3f
+ bilinear_interpolate_last_pixel src_fmt, dst_fmt
+3:
+ pop {r4, r5, r6, r7, r8, r9}
+ bx lr
+
+ .unreq OUT
+ .unreq TOP
+ .unreq BOTTOM
+ .unreq WT
+ .unreq WB
+ .unreq X
+ .unreq UX
+ .unreq WIDTH
+ .unreq TMP1
+ .unreq TMP2
+ .unreq PF_OFFS
+ .unreq TMP3
+ .unreq TMP4
+.endfunc
+
+.endm
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, 2, 28
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, 2, 28
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, 1, 28
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, 1, 28
diff --git a/pixman/pixman/pixman-arm-neon-asm.h b/pixman/pixman/pixman-arm-neon-asm.h index 6e3d583f5..0ba67d05f 100644 --- a/pixman/pixman/pixman-arm-neon-asm.h +++ b/pixman/pixman/pixman-arm-neon-asm.h @@ -1158,3 +1158,20 @@ fname: vsri.u16 out, tmp1, #5
vsri.u16 out, tmp2, #11
.endm
+
+/*
+ * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
+ * returned in (out0, out1) registers pair. Requires one temporary
+ * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
+ * value from 'in' is lost
+ */
+.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
+ vshl.u16 out0, in, #5 /* G top 6 bits */
+ vshl.u16 tmp, in, #11 /* B top 5 bits */
+ vsri.u16 in, in, #5 /* R is ready in top bits */
+ vsri.u16 out0, out0, #6 /* G is ready in top bits */
+ vsri.u16 tmp, tmp, #5 /* B is ready in top bits */
+ vshr.u16 out1, in, #8 /* R is in place */
+ vsri.u16 out0, tmp, #8 /* G & B is in place */
+ vzip.u16 out0, out1 /* everything is in place */
+.endm
diff --git a/pixman/pixman/pixman-arm-neon.c b/pixman/pixman/pixman-arm-neon.c index c7c025418..5213a2007 100644 --- a/pixman/pixman/pixman-arm-neon.c +++ b/pixman/pixman/pixman-arm-neon.c @@ -1,486 +1,460 @@ -/* - * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy - * - * Permission to use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that - * copyright notice and this permission notice appear in supporting - * documentation, and that the name of ARM Ltd not be used in - * advertising or publicity pertaining to distribution of the software without - * specific, written prior permission. ARM Ltd makes no - * representations about the suitability of this software for any purpose. It - * is provided "as is" without express or implied warranty. - * - * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS - * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY - * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN - * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING - * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS - * SOFTWARE. - * - * Author: Ian Rickards (ian.rickards@arm.com) - * Author: Jonathan Morton (jonathan.morton@movial.com) - * Author: Markku Vire (markku.vire@movial.com) - * - */ - -#ifdef HAVE_CONFIG_H -#include <config.h> -#endif - -#include <string.h> -#include "pixman-private.h" -#include "pixman-arm-common.h" - -PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_8888_8888, - uint32_t, 1, uint32_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_x888_8888, - uint32_t, 1, uint32_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0565_0565, - uint16_t, 1, uint16_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_0888, - uint8_t, 3, uint8_t, 3) -PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_8888_0565, - uint32_t, 1, uint16_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0565_8888, - uint16_t, 1, uint32_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_8888_rev, - uint8_t, 3, uint32_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_0565_rev, - uint8_t, 3, uint16_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_pixbuf_8888, - uint32_t, 1, uint32_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_rpixbuf_8888, - uint32_t, 1, uint32_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8_8, - uint8_t, 1, uint8_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8888_8888, - uint32_t, 1, uint32_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, over_8888_0565, - uint32_t, 1, uint16_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, over_8888_8888, - uint32_t, 1, uint32_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, out_reverse_8_0565, - uint8_t, 1, uint16_t, 1) - -PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_n_0565, - uint16_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_n_8888, - uint32_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_reverse_n_8888, - uint32_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_N_DST (0, neon, in_n_8, - uint8_t, 1) - -PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_0565, - uint8_t, 1, uint16_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_8888, - uint8_t, 1, uint32_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8888_8888_ca, - uint32_t, 1, uint32_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_8, - uint8_t, 1, uint8_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8, - uint8_t, 1, uint8_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888, - uint8_t, 1, uint32_t, 1) - -PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888, - uint32_t, 1, uint32_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_0565, - uint32_t, 1, uint16_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_0565_n_0565, - uint16_t, 1, uint16_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, add_8888_n_8888, - uint32_t, 1, uint32_t, 1) - -PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8_8_8, - uint8_t, 1, uint8_t, 1, uint8_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_0565_8_0565, - uint16_t, 1, uint8_t, 1, uint16_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8888_8_8888, - uint32_t, 1, uint8_t, 1, uint32_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8888_8888_8888, - uint32_t, 1, uint32_t, 1, uint32_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8_8888, - uint32_t, 1, uint8_t, 1, uint32_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8888_8888, - uint32_t, 1, uint32_t, 1, uint32_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8_0565, - uint32_t, 1, uint8_t, 1, uint16_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_0565_8_0565, - uint16_t, 1, uint8_t, 1, uint16_t, 1) - -PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_8888, OVER, - uint32_t, uint32_t) -PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, OVER, - uint32_t, uint16_t) -PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, SRC, - uint32_t, uint16_t) -PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 0565_8888, SRC, - uint16_t, uint32_t) - -PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_0565, - OVER, uint32_t, uint16_t) -PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 0565_8_0565, - OVER, uint16_t, uint16_t) - -void -pixman_composite_src_n_8_asm_neon (int32_t w, - int32_t h, - uint8_t *dst, - int32_t dst_stride, - uint8_t src); - -void -pixman_composite_src_n_0565_asm_neon (int32_t w, - int32_t h, - uint16_t *dst, - int32_t dst_stride, - uint16_t src); - -void -pixman_composite_src_n_8888_asm_neon (int32_t w, - int32_t h, - uint32_t *dst, - int32_t dst_stride, - uint32_t src); - -static pixman_bool_t -pixman_fill_neon (uint32_t *bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t _xor) -{ - /* stride is always multiple of 32bit units in pixman */ - uint32_t byte_stride = stride * sizeof(uint32_t); - - switch (bpp) - { - case 8: - pixman_composite_src_n_8_asm_neon ( - width, - height, - (uint8_t *)(((char *) bits) + y * byte_stride + x), - byte_stride, - _xor & 0xff); - return TRUE; - case 16: - pixman_composite_src_n_0565_asm_neon ( - width, - height, - (uint16_t *)(((char *) bits) + y * byte_stride + x * 2), - byte_stride / 2, - _xor & 0xffff); - return TRUE; - case 32: - pixman_composite_src_n_8888_asm_neon ( - width, - height, - (uint32_t *)(((char *) bits) + y * byte_stride + x * 4), - byte_stride / 4, - _xor); - return TRUE; - default: - return FALSE; - } -} - -static pixman_bool_t -pixman_blt_neon (uint32_t *src_bits, - uint32_t *dst_bits, - int src_stride, - int dst_stride, - int src_bpp, - int dst_bpp, - int src_x, - int src_y, - int dst_x, - int dst_y, - int width, - int height) -{ - if (src_bpp != dst_bpp) - return FALSE; - - switch (src_bpp) - { - case 16: - pixman_composite_src_0565_0565_asm_neon ( - width, height, - (uint16_t *)(((char *) dst_bits) + - dst_y * dst_stride * 4 + dst_x * 2), dst_stride * 2, - (uint16_t *)(((char *) src_bits) + - src_y * src_stride * 4 + src_x * 2), src_stride * 2); - return TRUE; - case 32: - pixman_composite_src_8888_8888_asm_neon ( - width, height, - (uint32_t *)(((char *) dst_bits) + - dst_y * dst_stride * 4 + dst_x * 4), dst_stride, - (uint32_t *)(((char *) src_bits) + - src_y * src_stride * 4 + src_x * 4), src_stride); - return TRUE; - default: - return FALSE; - } -} - -void -pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (uint32_t * out, - const uint32_t * top, - const uint32_t * bottom, - int wt, - int wb, - pixman_fixed_t x, - pixman_fixed_t ux, - int width); - -static force_inline void -scaled_bilinear_scanline_neon_8888_8888_SRC (uint32_t * dst, - const uint32_t * mask, - const uint32_t * src_top, - const uint32_t * src_bottom, - int32_t w, - int wt, - int wb, - pixman_fixed_t vx, - pixman_fixed_t unit_x, - pixman_fixed_t max_vx, - pixman_bool_t zero_src) -{ - pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (dst, src_top, - src_bottom, wt, wb, - vx, unit_x, w); -} - -FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_cover_SRC, - scaled_bilinear_scanline_neon_8888_8888_SRC, - uint32_t, uint32_t, uint32_t, - COVER, FALSE, FALSE) -FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_pad_SRC, - scaled_bilinear_scanline_neon_8888_8888_SRC, - uint32_t, uint32_t, uint32_t, - PAD, FALSE, FALSE) -FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_none_SRC, - scaled_bilinear_scanline_neon_8888_8888_SRC, - uint32_t, uint32_t, uint32_t, - NONE, FALSE, FALSE) - -static const pixman_fast_path_t arm_neon_fast_paths[] = -{ - PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, neon_composite_src_0565_0565), - PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, neon_composite_src_0565_0565), - PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, neon_composite_src_8888_0565), - PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, neon_composite_src_8888_0565), - PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, neon_composite_src_8888_0565), - PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, neon_composite_src_8888_0565), - PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, a8r8g8b8, neon_composite_src_0565_8888), - PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, x8r8g8b8, neon_composite_src_0565_8888), - PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, a8b8g8r8, neon_composite_src_0565_8888), - PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, x8b8g8r8, neon_composite_src_0565_8888), - PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, neon_composite_src_8888_8888), - PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, neon_composite_src_8888_8888), - PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, neon_composite_src_8888_8888), - PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, neon_composite_src_8888_8888), - PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, neon_composite_src_8888_8888), - PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, neon_composite_src_8888_8888), - PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, neon_composite_src_x888_8888), - PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, neon_composite_src_x888_8888), - PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, neon_composite_src_0888_0888), - PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, x8r8g8b8, neon_composite_src_0888_8888_rev), - PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, r5g6b5, neon_composite_src_0888_0565_rev), - PIXMAN_STD_FAST_PATH (SRC, pixbuf, pixbuf, a8r8g8b8, neon_composite_src_pixbuf_8888), - PIXMAN_STD_FAST_PATH (SRC, pixbuf, pixbuf, a8b8g8r8, neon_composite_src_rpixbuf_8888), - PIXMAN_STD_FAST_PATH (SRC, rpixbuf, rpixbuf, a8r8g8b8, neon_composite_src_rpixbuf_8888), - PIXMAN_STD_FAST_PATH (SRC, rpixbuf, rpixbuf, a8b8g8r8, neon_composite_src_pixbuf_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8, neon_composite_over_n_8_8), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, neon_composite_over_n_8_0565), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, neon_composite_over_n_8_0565), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, neon_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, neon_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, neon_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, neon_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, neon_composite_over_n_0565), - PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, neon_composite_over_n_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, neon_composite_over_n_8888), - PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, neon_composite_over_n_8888_8888_ca), - PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, neon_composite_over_n_8888_8888_ca), - PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, neon_composite_over_n_8888_8888_ca), - PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, neon_composite_over_n_8888_8888_ca), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, neon_composite_over_8888_n_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, neon_composite_over_8888_n_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, r5g6b5, neon_composite_over_8888_n_0565), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, b5g6r5, neon_composite_over_8888_n_0565), - PIXMAN_STD_FAST_PATH (OVER, r5g6b5, solid, r5g6b5, neon_composite_over_0565_n_0565), - PIXMAN_STD_FAST_PATH (OVER, b5g6r5, solid, b5g6r5, neon_composite_over_0565_n_0565), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, neon_composite_over_8888_8_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, neon_composite_over_8888_8_8888), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, neon_composite_over_8888_8_8888), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, neon_composite_over_8888_8_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, r5g6b5, neon_composite_over_8888_8_0565), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, b5g6r5, neon_composite_over_8888_8_0565), - PIXMAN_STD_FAST_PATH (OVER, r5g6b5, a8, r5g6b5, neon_composite_over_0565_8_0565), - PIXMAN_STD_FAST_PATH (OVER, b5g6r5, a8, b5g6r5, neon_composite_over_0565_8_0565), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_over_8888_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, neon_composite_over_8888_0565), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, neon_composite_over_8888_0565), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, neon_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, neon_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, neon_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, neon_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, a8r8g8b8, neon_composite_src_x888_8888), - PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, a8b8g8r8, neon_composite_src_x888_8888), - PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, neon_composite_add_n_8_8), - PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, neon_composite_add_n_8_8888), - PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, neon_composite_add_n_8_8888), - PIXMAN_STD_FAST_PATH (ADD, a8, a8, a8, neon_composite_add_8_8_8), - PIXMAN_STD_FAST_PATH (ADD, r5g6b5, a8, r5g6b5, neon_composite_add_0565_8_0565), - PIXMAN_STD_FAST_PATH (ADD, b5g6r5, a8, b5g6r5, neon_composite_add_0565_8_0565), - PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8, a8r8g8b8, neon_composite_add_8888_8_8888), - PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, a8, a8b8g8r8, neon_composite_add_8888_8_8888), - PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_add_8888_8888_8888), - PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, solid, a8r8g8b8, neon_composite_add_8888_n_8888), - PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, solid, a8b8g8r8, neon_composite_add_8888_n_8888), - PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, neon_composite_add_8_8), - PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, neon_composite_add_8888_8888), - PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, neon_composite_add_8888_8888), - PIXMAN_STD_FAST_PATH (IN, solid, null, a8, neon_composite_in_n_8), - PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, neon_composite_over_reverse_n_8888), - PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, neon_composite_over_reverse_n_8888), - PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, r5g6b5, neon_composite_out_reverse_8_0565), - PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, b5g6r5, neon_composite_out_reverse_8_0565), - - PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888), - PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, neon_8888_8888), - PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888), - PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, neon_8888_8888), - - PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_0565), - PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_0565), - - PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565), - PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565), - PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, b5g6r5, neon_8888_0565), - PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, b5g6r5, neon_8888_0565), - - PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, x8b8g8r8, neon_0565_8888), - PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_8888), - /* Note: NONE repeat is not supported yet */ - SIMPLE_NEAREST_FAST_PATH_COVER (SRC, r5g6b5, a8r8g8b8, neon_0565_8888), - SIMPLE_NEAREST_FAST_PATH_COVER (SRC, b5g6r5, a8b8g8r8, neon_0565_8888), - SIMPLE_NEAREST_FAST_PATH_PAD (SRC, r5g6b5, a8r8g8b8, neon_0565_8888), - SIMPLE_NEAREST_FAST_PATH_PAD (SRC, b5g6r5, a8b8g8r8, neon_0565_8888), - - PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_8_0565), - PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_8_0565), - - PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, neon_0565_8_0565), - PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, neon_0565_8_0565), - - SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8888), - SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8888), - SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8888), - - { PIXMAN_OP_NONE }, -}; - -static pixman_bool_t -arm_neon_blt (pixman_implementation_t *imp, - uint32_t * src_bits, - uint32_t * dst_bits, - int src_stride, - int dst_stride, - int src_bpp, - int dst_bpp, - int src_x, - int src_y, - int dst_x, - int dst_y, - int width, - int height) -{ - if (!pixman_blt_neon ( - src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, - src_x, src_y, dst_x, dst_y, width, height)) - - { - return _pixman_implementation_blt ( - imp->delegate, - src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, - src_x, src_y, dst_x, dst_y, width, height); - } - - return TRUE; -} - -static pixman_bool_t -arm_neon_fill (pixman_implementation_t *imp, - uint32_t * bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t xor) -{ - if (pixman_fill_neon (bits, stride, bpp, x, y, width, height, xor)) - return TRUE; - - return _pixman_implementation_fill ( - imp->delegate, bits, stride, bpp, x, y, width, height, xor); -} - -#define BIND_COMBINE_U(name) \ -void \ -pixman_composite_scanline_##name##_mask_asm_neon (int32_t w, \ - const uint32_t *dst, \ - const uint32_t *src, \ - const uint32_t *mask); \ - \ -void \ -pixman_composite_scanline_##name##_asm_neon (int32_t w, \ - const uint32_t *dst, \ - const uint32_t *src); \ - \ -static void \ -neon_combine_##name##_u (pixman_implementation_t *imp, \ - pixman_op_t op, \ - uint32_t * dest, \ - const uint32_t * src, \ - const uint32_t * mask, \ - int width) \ -{ \ - if (mask) \ - pixman_composite_scanline_##name##_mask_asm_neon (width, dest, \ - src, mask); \ - else \ - pixman_composite_scanline_##name##_asm_neon (width, dest, src); \ -} - -BIND_COMBINE_U (over) -BIND_COMBINE_U (add) -BIND_COMBINE_U (out_reverse) - -pixman_implementation_t * -_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback) -{ - pixman_implementation_t *imp = - _pixman_implementation_create (fallback, arm_neon_fast_paths); - - imp->combine_32[PIXMAN_OP_OVER] = neon_combine_over_u; - imp->combine_32[PIXMAN_OP_ADD] = neon_combine_add_u; - imp->combine_32[PIXMAN_OP_OUT_REVERSE] = neon_combine_out_reverse_u; - - imp->blt = arm_neon_blt; - imp->fill = arm_neon_fill; - - return imp; -} +/*
+ * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of ARM Ltd not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. ARM Ltd makes no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Ian Rickards (ian.rickards@arm.com)
+ * Author: Jonathan Morton (jonathan.morton@movial.com)
+ * Author: Markku Vire (markku.vire@movial.com)
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <string.h>
+#include "pixman-private.h"
+#include "pixman-arm-common.h"
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_8888_8888,
+ uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_x888_8888,
+ uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0565_0565,
+ uint16_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_0888,
+ uint8_t, 3, uint8_t, 3)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_8888_0565,
+ uint32_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0565_8888,
+ uint16_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_8888_rev,
+ uint8_t, 3, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_0565_rev,
+ uint8_t, 3, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_pixbuf_8888,
+ uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_rpixbuf_8888,
+ uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8_8,
+ uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8888_8888,
+ uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, over_8888_0565,
+ uint32_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, over_8888_8888,
+ uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, out_reverse_8_0565,
+ uint8_t, 1, uint16_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_n_0565,
+ uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_n_8888,
+ uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_reverse_n_8888,
+ uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (0, neon, in_n_8,
+ uint8_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_0565,
+ uint8_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_8888,
+ uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8888_8888_ca,
+ uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_8,
+ uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8,
+ uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888,
+ uint8_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888,
+ uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_0565,
+ uint32_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_0565_n_0565,
+ uint16_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, add_8888_n_8888,
+ uint32_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8_8_8,
+ uint8_t, 1, uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_0565_8_0565,
+ uint16_t, 1, uint8_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8888_8_8888,
+ uint32_t, 1, uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8888_8888_8888,
+ uint32_t, 1, uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8_8888,
+ uint32_t, 1, uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8888_8888,
+ uint32_t, 1, uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8_0565,
+ uint32_t, 1, uint8_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_0565_8_0565,
+ uint16_t, 1, uint8_t, 1, uint16_t, 1)
+
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_8888, OVER,
+ uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, OVER,
+ uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, SRC,
+ uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 0565_8888, SRC,
+ uint16_t, uint32_t)
+
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_0565,
+ OVER, uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 0565_8_0565,
+ OVER, uint16_t, uint16_t)
+
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_8888, SRC,
+ uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_0565, SRC,
+ uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC,
+ uint16_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_0565, SRC,
+ uint16_t, uint16_t)
+
+void
+pixman_composite_src_n_8_asm_neon (int32_t w,
+ int32_t h,
+ uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t src);
+
+void
+pixman_composite_src_n_0565_asm_neon (int32_t w,
+ int32_t h,
+ uint16_t *dst,
+ int32_t dst_stride,
+ uint16_t src);
+
+void
+pixman_composite_src_n_8888_asm_neon (int32_t w,
+ int32_t h,
+ uint32_t *dst,
+ int32_t dst_stride,
+ uint32_t src);
+
+static pixman_bool_t
+pixman_fill_neon (uint32_t *bits,
+ int stride,
+ int bpp,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t _xor)
+{
+ /* stride is always multiple of 32bit units in pixman */
+ uint32_t byte_stride = stride * sizeof(uint32_t);
+
+ switch (bpp)
+ {
+ case 8:
+ pixman_composite_src_n_8_asm_neon (
+ width,
+ height,
+ (uint8_t *)(((char *) bits) + y * byte_stride + x),
+ byte_stride,
+ _xor & 0xff);
+ return TRUE;
+ case 16:
+ pixman_composite_src_n_0565_asm_neon (
+ width,
+ height,
+ (uint16_t *)(((char *) bits) + y * byte_stride + x * 2),
+ byte_stride / 2,
+ _xor & 0xffff);
+ return TRUE;
+ case 32:
+ pixman_composite_src_n_8888_asm_neon (
+ width,
+ height,
+ (uint32_t *)(((char *) bits) + y * byte_stride + x * 4),
+ byte_stride / 4,
+ _xor);
+ return TRUE;
+ default:
+ return FALSE;
+ }
+}
+
+static pixman_bool_t
+pixman_blt_neon (uint32_t *src_bits,
+ uint32_t *dst_bits,
+ int src_stride,
+ int dst_stride,
+ int src_bpp,
+ int dst_bpp,
+ int src_x,
+ int src_y,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height)
+{
+ if (src_bpp != dst_bpp)
+ return FALSE;
+
+ switch (src_bpp)
+ {
+ case 16:
+ pixman_composite_src_0565_0565_asm_neon (
+ width, height,
+ (uint16_t *)(((char *) dst_bits) +
+ dst_y * dst_stride * 4 + dst_x * 2), dst_stride * 2,
+ (uint16_t *)(((char *) src_bits) +
+ src_y * src_stride * 4 + src_x * 2), src_stride * 2);
+ return TRUE;
+ case 32:
+ pixman_composite_src_8888_8888_asm_neon (
+ width, height,
+ (uint32_t *)(((char *) dst_bits) +
+ dst_y * dst_stride * 4 + dst_x * 4), dst_stride,
+ (uint32_t *)(((char *) src_bits) +
+ src_y * src_stride * 4 + src_x * 4), src_stride);
+ return TRUE;
+ default:
+ return FALSE;
+ }
+}
+
+static const pixman_fast_path_t arm_neon_fast_paths[] =
+{
+ PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, neon_composite_src_0565_0565),
+ PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, neon_composite_src_0565_0565),
+ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, neon_composite_src_8888_0565),
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, neon_composite_src_8888_0565),
+ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, neon_composite_src_8888_0565),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, neon_composite_src_8888_0565),
+ PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, a8r8g8b8, neon_composite_src_0565_8888),
+ PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, x8r8g8b8, neon_composite_src_0565_8888),
+ PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, a8b8g8r8, neon_composite_src_0565_8888),
+ PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, x8b8g8r8, neon_composite_src_0565_8888),
+ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, neon_composite_src_8888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, neon_composite_src_8888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, neon_composite_src_8888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, neon_composite_src_8888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, neon_composite_src_8888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, neon_composite_src_8888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, neon_composite_src_x888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, neon_composite_src_x888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, neon_composite_src_0888_0888),
+ PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, x8r8g8b8, neon_composite_src_0888_8888_rev),
+ PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, r5g6b5, neon_composite_src_0888_0565_rev),
+ PIXMAN_STD_FAST_PATH (SRC, pixbuf, pixbuf, a8r8g8b8, neon_composite_src_pixbuf_8888),
+ PIXMAN_STD_FAST_PATH (SRC, pixbuf, pixbuf, a8b8g8r8, neon_composite_src_rpixbuf_8888),
+ PIXMAN_STD_FAST_PATH (SRC, rpixbuf, rpixbuf, a8r8g8b8, neon_composite_src_rpixbuf_8888),
+ PIXMAN_STD_FAST_PATH (SRC, rpixbuf, rpixbuf, a8b8g8r8, neon_composite_src_pixbuf_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8, neon_composite_over_n_8_8),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, neon_composite_over_n_8_0565),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, neon_composite_over_n_8_0565),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, neon_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, neon_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, neon_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, neon_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, neon_composite_over_n_0565),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, neon_composite_over_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, neon_composite_over_n_8888),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, neon_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, neon_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, neon_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, neon_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, neon_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, neon_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, r5g6b5, neon_composite_over_8888_n_0565),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, b5g6r5, neon_composite_over_8888_n_0565),
+ PIXMAN_STD_FAST_PATH (OVER, r5g6b5, solid, r5g6b5, neon_composite_over_0565_n_0565),
+ PIXMAN_STD_FAST_PATH (OVER, b5g6r5, solid, b5g6r5, neon_composite_over_0565_n_0565),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, neon_composite_over_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, neon_composite_over_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, neon_composite_over_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, neon_composite_over_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, r5g6b5, neon_composite_over_8888_8_0565),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, b5g6r5, neon_composite_over_8888_8_0565),
+ PIXMAN_STD_FAST_PATH (OVER, r5g6b5, a8, r5g6b5, neon_composite_over_0565_8_0565),
+ PIXMAN_STD_FAST_PATH (OVER, b5g6r5, a8, b5g6r5, neon_composite_over_0565_8_0565),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_over_8888_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, neon_composite_over_8888_0565),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, neon_composite_over_8888_0565),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, neon_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, neon_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, neon_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, neon_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, a8r8g8b8, neon_composite_src_x888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, a8b8g8r8, neon_composite_src_x888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, neon_composite_add_n_8_8),
+ PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, neon_composite_add_n_8_8888),
+ PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, neon_composite_add_n_8_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8, a8, a8, neon_composite_add_8_8_8),
+ PIXMAN_STD_FAST_PATH (ADD, r5g6b5, a8, r5g6b5, neon_composite_add_0565_8_0565),
+ PIXMAN_STD_FAST_PATH (ADD, b5g6r5, a8, b5g6r5, neon_composite_add_0565_8_0565),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8, a8r8g8b8, neon_composite_add_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, a8, a8b8g8r8, neon_composite_add_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_add_8888_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, solid, a8r8g8b8, neon_composite_add_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, solid, a8b8g8r8, neon_composite_add_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, neon_composite_add_8_8),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, neon_composite_add_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, neon_composite_add_8888_8888),
+ PIXMAN_STD_FAST_PATH (IN, solid, null, a8, neon_composite_in_n_8),
+ PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, neon_composite_over_reverse_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, neon_composite_over_reverse_n_8888),
+ PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, r5g6b5, neon_composite_out_reverse_8_0565),
+ PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, b5g6r5, neon_composite_out_reverse_8_0565),
+
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, neon_8888_8888),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, neon_8888_8888),
+
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_0565),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_0565),
+
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, b5g6r5, neon_8888_0565),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, b5g6r5, neon_8888_0565),
+
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, x8b8g8r8, neon_0565_8888),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_8888),
+ /* Note: NONE repeat is not supported yet */
+ SIMPLE_NEAREST_FAST_PATH_COVER (SRC, r5g6b5, a8r8g8b8, neon_0565_8888),
+ SIMPLE_NEAREST_FAST_PATH_COVER (SRC, b5g6r5, a8b8g8r8, neon_0565_8888),
+ SIMPLE_NEAREST_FAST_PATH_PAD (SRC, r5g6b5, a8r8g8b8, neon_0565_8888),
+ SIMPLE_NEAREST_FAST_PATH_PAD (SRC, b5g6r5, a8b8g8r8, neon_0565_8888),
+
+ PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_8_0565),
+ PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_8_0565),
+
+ PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, neon_0565_8_0565),
+ PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, neon_0565_8_0565),
+
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8888),
+
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565),
+
+ SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_x888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_0565),
+
+ { PIXMAN_OP_NONE },
+};
+
+static pixman_bool_t
+arm_neon_blt (pixman_implementation_t *imp,
+ uint32_t * src_bits,
+ uint32_t * dst_bits,
+ int src_stride,
+ int dst_stride,
+ int src_bpp,
+ int dst_bpp,
+ int src_x,
+ int src_y,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height)
+{
+ if (!pixman_blt_neon (
+ src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+ src_x, src_y, dst_x, dst_y, width, height))
+
+ {
+ return _pixman_implementation_blt (
+ imp->delegate,
+ src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+ src_x, src_y, dst_x, dst_y, width, height);
+ }
+
+ return TRUE;
+}
+
+static pixman_bool_t
+arm_neon_fill (pixman_implementation_t *imp,
+ uint32_t * bits,
+ int stride,
+ int bpp,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t xor)
+{
+ if (pixman_fill_neon (bits, stride, bpp, x, y, width, height, xor))
+ return TRUE;
+
+ return _pixman_implementation_fill (
+ imp->delegate, bits, stride, bpp, x, y, width, height, xor);
+}
+
+#define BIND_COMBINE_U(name) \
+void \
+pixman_composite_scanline_##name##_mask_asm_neon (int32_t w, \
+ const uint32_t *dst, \
+ const uint32_t *src, \
+ const uint32_t *mask); \
+ \
+void \
+pixman_composite_scanline_##name##_asm_neon (int32_t w, \
+ const uint32_t *dst, \
+ const uint32_t *src); \
+ \
+static void \
+neon_combine_##name##_u (pixman_implementation_t *imp, \
+ pixman_op_t op, \
+ uint32_t * dest, \
+ const uint32_t * src, \
+ const uint32_t * mask, \
+ int width) \
+{ \
+ if (mask) \
+ pixman_composite_scanline_##name##_mask_asm_neon (width, dest, \
+ src, mask); \
+ else \
+ pixman_composite_scanline_##name##_asm_neon (width, dest, src); \
+}
+
+BIND_COMBINE_U (over)
+BIND_COMBINE_U (add)
+BIND_COMBINE_U (out_reverse)
+
+pixman_implementation_t *
+_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback)
+{
+ pixman_implementation_t *imp =
+ _pixman_implementation_create (fallback, arm_neon_fast_paths);
+
+ imp->combine_32[PIXMAN_OP_OVER] = neon_combine_over_u;
+ imp->combine_32[PIXMAN_OP_ADD] = neon_combine_add_u;
+ imp->combine_32[PIXMAN_OP_OUT_REVERSE] = neon_combine_out_reverse_u;
+
+ imp->blt = arm_neon_blt;
+ imp->fill = arm_neon_fill;
+
+ return imp;
+}
diff --git a/pixman/pixman/pixman-arm-simd-asm.S b/pixman/pixman/pixman-arm-simd-asm.S index d97545c1b..e00836b6c 100644 --- a/pixman/pixman/pixman-arm-simd-asm.S +++ b/pixman/pixman/pixman-arm-simd-asm.S @@ -331,15 +331,29 @@ pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6 .endfunc
/*
- * Note: This function is only using armv4t instructions (not even armv6),
+ * Note: This code is only using armv5te instructions (not even armv6),
* but is scheduled for ARM Cortex-A8 pipeline. So it might need to
* be split into a few variants, tuned for each microarchitecture.
*
* TODO: In order to get good performance on ARM9/ARM11 cores (which don't
* have efficient write combining), it needs to be changed to use 16-byte
* aligned writes using STM instruction.
+ *
+ * Nearest scanline scaler macro template uses the following arguments:
+ * fname - name of the function to generate
+ * bpp_shift - (1 << bpp_shift) is the size of pixel in bytes
+ * t - type suffix for LDR/STR instructions
+ * prefetch_distance - prefetch in the source image by that many
+ * pixels ahead
+ * prefetch_braking_distance - stop prefetching when that many pixels are
+ * remaining before the end of scanline
*/
-pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
+
+.macro generate_nearest_scanline_func fname, bpp_shift, t, \
+ prefetch_distance, \
+ prefetch_braking_distance
+
+pixman_asm_function fname
W .req r0
DST .req r1
SRC .req r2
@@ -348,30 +362,46 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6 TMP1 .req r4
TMP2 .req r5
VXMASK .req r6
+ PF_OFFS .req r7
ldr UNIT_X, [sp]
push {r4, r5, r6, r7}
- mvn VXMASK, #1
+ mvn VXMASK, #((1 << bpp_shift) - 1)
/* define helper macro */
.macro scale_2_pixels
- ldrh TMP1, [SRC, TMP1]
- and TMP2, VXMASK, VX, lsr #15
+ ldr&t TMP1, [SRC, TMP1]
+ and TMP2, VXMASK, VX, lsr #(16 - bpp_shift)
add VX, VX, UNIT_X
- strh TMP1, [DST], #2
+ str&t TMP1, [DST], #(1 << bpp_shift)
- ldrh TMP2, [SRC, TMP2]
- and TMP1, VXMASK, VX, lsr #15
+ ldr&t TMP2, [SRC, TMP2]
+ and TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
add VX, VX, UNIT_X
- strh TMP2, [DST], #2
+ str&t TMP2, [DST], #(1 << bpp_shift)
.endm
/* now do the scaling */
- and TMP1, VXMASK, VX, lsr #15
+ and TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
add VX, VX, UNIT_X
- subs W, #4
+ subs W, W, #(8 + prefetch_braking_distance)
blt 2f
-1: /* main loop, process 4 pixels per iteration */
+ /* calculate prefetch offset */
+ mov PF_OFFS, #prefetch_distance
+ mla PF_OFFS, UNIT_X, PF_OFFS, VX
+1: /* main loop, process 8 pixels per iteration with prefetch */
+ subs W, W, #8
+ add PF_OFFS, UNIT_X, lsl #3
+ scale_2_pixels
+ scale_2_pixels
+ scale_2_pixels
+ scale_2_pixels
+ pld [SRC, PF_OFFS, lsr #(16 - bpp_shift)]
+ bge 1b
+2:
+ subs W, W, #(4 - 8 - prefetch_braking_distance)
+ blt 2f
+1: /* process the remaining pixels */
scale_2_pixels
scale_2_pixels
subs W, W, #4
@@ -382,8 +412,8 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6 scale_2_pixels
2:
tst W, #1
- ldrneh TMP1, [SRC, TMP1]
- strneh TMP1, [DST], #2
+ ldrne&t TMP1, [SRC, TMP1]
+ strne&t TMP1, [DST]
/* cleanup helper macro */
.purgem scale_2_pixels
.unreq DST
@@ -394,7 +424,15 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6 .unreq TMP1
.unreq TMP2
.unreq VXMASK
+ .unreq PF_OFFS
/* return */
pop {r4, r5, r6, r7}
bx lr
.endfunc
+.endm
+
+generate_nearest_scanline_func \
+ pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32
+
+generate_nearest_scanline_func \
+ pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32
diff --git a/pixman/pixman/pixman-arm-simd.c b/pixman/pixman/pixman-arm-simd.c index 6bbc1094d..45981a6b0 100644 --- a/pixman/pixman/pixman-arm-simd.c +++ b/pixman/pixman/pixman-arm-simd.c @@ -1,423 +1,432 @@ -/* - * Copyright © 2008 Mozilla Corporation - * - * Permission to use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that - * copyright notice and this permission notice appear in supporting - * documentation, and that the name of Mozilla Corporation not be used in - * advertising or publicity pertaining to distribution of the software without - * specific, written prior permission. Mozilla Corporation makes no - * representations about the suitability of this software for any purpose. It - * is provided "as is" without express or implied warranty. - * - * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS - * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY - * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN - * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING - * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS - * SOFTWARE. - * - * Author: Jeff Muizelaar (jeff@infidigm.net) - * - */ -#ifdef HAVE_CONFIG_H -#include <config.h> -#endif - -#include "pixman-private.h" -#include "pixman-arm-common.h" -#include "pixman-fast-path.h" - -#if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */ - -void -pixman_composite_add_8_8_asm_armv6 (int32_t width, - int32_t height, - uint8_t *dst_line, - int32_t dst_stride, - uint8_t *src_line, - int32_t src_stride) -{ - uint8_t *dst, *src; - int32_t w; - uint8_t s, d; - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - - /* ensure both src and dst are properly aligned before doing 32 bit reads - * we'll stay in this loop if src and dst have differing alignments - */ - while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3))) - { - s = *src; - d = *dst; - asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s)); - *dst = d; - - dst++; - src++; - w--; - } - - while (w >= 4) - { - asm ("uqadd8 %0, %1, %2" - : "=r" (*(uint32_t*)dst) - : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst)); - dst += 4; - src += 4; - w -= 4; - } - - while (w) - { - s = *src; - d = *dst; - asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s)); - *dst = d; - - dst++; - src++; - w--; - } - } - -} - -void -pixman_composite_over_8888_8888_asm_armv6 (int32_t width, - int32_t height, - uint32_t *dst_line, - int32_t dst_stride, - uint32_t *src_line, - int32_t src_stride) -{ - uint32_t *dst; - uint32_t *src; - int32_t w; - uint32_t component_half = 0x800080; - uint32_t upper_component_mask = 0xff00ff00; - uint32_t alpha_mask = 0xff; - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - -/* #define inner_branch */ - asm volatile ( - "cmp %[w], #0\n\t" - "beq 2f\n\t" - "1:\n\t" - /* load src */ - "ldr r5, [%[src]], #4\n\t" -#ifdef inner_branch - /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. - * The 0x0 case also allows us to avoid doing an unecessary data - * write which is more valuable so we only check for that - */ - "cmp r5, #0\n\t" - "beq 3f\n\t" - - /* = 255 - alpha */ - "sub r8, %[alpha_mask], r5, lsr #24\n\t" - - "ldr r4, [%[dest]] \n\t" - -#else - "ldr r4, [%[dest]] \n\t" - - /* = 255 - alpha */ - "sub r8, %[alpha_mask], r5, lsr #24\n\t" -#endif - "uxtb16 r6, r4\n\t" - "uxtb16 r7, r4, ror #8\n\t" - - /* multiply by 257 and divide by 65536 */ - "mla r6, r6, r8, %[component_half]\n\t" - "mla r7, r7, r8, %[component_half]\n\t" - - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" - - /* recombine the 0xff00ff00 bytes of r6 and r7 */ - "and r7, r7, %[upper_component_mask]\n\t" - "uxtab16 r6, r7, r6, ror #8\n\t" - - "uqadd8 r5, r6, r5\n\t" - -#ifdef inner_branch - "3:\n\t" - -#endif - "str r5, [%[dest]], #4\n\t" - /* increment counter and jmp to top */ - "subs %[w], %[w], #1\n\t" - "bne 1b\n\t" - "2:\n\t" - : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src) - : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask), - [alpha_mask] "r" (alpha_mask) - : "r4", "r5", "r6", "r7", "r8", "cc", "memory" - ); - } -} - -void -pixman_composite_over_8888_n_8888_asm_armv6 (int32_t width, - int32_t height, - uint32_t *dst_line, - int32_t dst_stride, - uint32_t *src_line, - int32_t src_stride, - uint32_t mask) -{ - uint32_t *dst; - uint32_t *src; - int32_t w; - uint32_t component_half = 0x800080; - uint32_t alpha_mask = 0xff; - - mask = (mask) >> 24; - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - -/* #define inner_branch */ - asm volatile ( - "cmp %[w], #0\n\t" - "beq 2f\n\t" - "1:\n\t" - /* load src */ - "ldr r5, [%[src]], #4\n\t" -#ifdef inner_branch - /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. - * The 0x0 case also allows us to avoid doing an unecessary data - * write which is more valuable so we only check for that - */ - "cmp r5, #0\n\t" - "beq 3f\n\t" - -#endif - "ldr r4, [%[dest]] \n\t" - - "uxtb16 r6, r5\n\t" - "uxtb16 r7, r5, ror #8\n\t" - - /* multiply by alpha (r8) then by 257 and divide by 65536 */ - "mla r6, r6, %[mask_alpha], %[component_half]\n\t" - "mla r7, r7, %[mask_alpha], %[component_half]\n\t" - - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" - - "uxtb16 r6, r6, ror #8\n\t" - "uxtb16 r7, r7, ror #8\n\t" - - /* recombine */ - "orr r5, r6, r7, lsl #8\n\t" - - "uxtb16 r6, r4\n\t" - "uxtb16 r7, r4, ror #8\n\t" - - /* 255 - alpha */ - "sub r8, %[alpha_mask], r5, lsr #24\n\t" - - /* multiply by alpha (r8) then by 257 and divide by 65536 */ - "mla r6, r6, r8, %[component_half]\n\t" - "mla r7, r7, r8, %[component_half]\n\t" - - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" - - "uxtb16 r6, r6, ror #8\n\t" - "uxtb16 r7, r7, ror #8\n\t" - - /* recombine */ - "orr r6, r6, r7, lsl #8\n\t" - - "uqadd8 r5, r6, r5\n\t" - -#ifdef inner_branch - "3:\n\t" - -#endif - "str r5, [%[dest]], #4\n\t" - /* increment counter and jmp to top */ - "subs %[w], %[w], #1\n\t" - "bne 1b\n\t" - "2:\n\t" - : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src) - : [component_half] "r" (component_half), [mask_alpha] "r" (mask), - [alpha_mask] "r" (alpha_mask) - : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory" - ); - } -} - -void -pixman_composite_over_n_8_8888_asm_armv6 (int32_t width, - int32_t height, - uint32_t *dst_line, - int32_t dst_stride, - uint32_t src, - int32_t unused, - uint8_t *mask_line, - int32_t mask_stride) -{ - uint32_t srca; - uint32_t *dst; - uint8_t *mask; - int32_t w; - - srca = src >> 24; - - uint32_t component_mask = 0xff00ff; - uint32_t component_half = 0x800080; - - uint32_t src_hi = (src >> 8) & component_mask; - uint32_t src_lo = src & component_mask; - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - w = width; - -/* #define inner_branch */ - asm volatile ( - "cmp %[w], #0\n\t" - "beq 2f\n\t" - "1:\n\t" - /* load mask */ - "ldrb r5, [%[mask]], #1\n\t" -#ifdef inner_branch - /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. - * The 0x0 case also allows us to avoid doing an unecessary data - * write which is more valuable so we only check for that - */ - "cmp r5, #0\n\t" - "beq 3f\n\t" - -#endif - "ldr r4, [%[dest]] \n\t" - - /* multiply by alpha (r8) then by 257 and divide by 65536 */ - "mla r6, %[src_lo], r5, %[component_half]\n\t" - "mla r7, %[src_hi], r5, %[component_half]\n\t" - - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" - - "uxtb16 r6, r6, ror #8\n\t" - "uxtb16 r7, r7, ror #8\n\t" - - /* recombine */ - "orr r5, r6, r7, lsl #8\n\t" - - "uxtb16 r6, r4\n\t" - "uxtb16 r7, r4, ror #8\n\t" - - /* we could simplify this to use 'sub' if we were - * willing to give up a register for alpha_mask - */ - "mvn r8, r5\n\t" - "mov r8, r8, lsr #24\n\t" - - /* multiply by alpha (r8) then by 257 and divide by 65536 */ - "mla r6, r6, r8, %[component_half]\n\t" - "mla r7, r7, r8, %[component_half]\n\t" - - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" - - "uxtb16 r6, r6, ror #8\n\t" - "uxtb16 r7, r7, ror #8\n\t" - - /* recombine */ - "orr r6, r6, r7, lsl #8\n\t" - - "uqadd8 r5, r6, r5\n\t" - -#ifdef inner_branch - "3:\n\t" - -#endif - "str r5, [%[dest]], #4\n\t" - /* increment counter and jmp to top */ - "subs %[w], %[w], #1\n\t" - "bne 1b\n\t" - "2:\n\t" - : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask) - : [component_half] "r" (component_half), - [src_hi] "r" (src_hi), [src_lo] "r" (src_lo) - : "r4", "r5", "r6", "r7", "r8", "cc", "memory"); - } -} - -#endif - -PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8, - uint8_t, 1, uint8_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888, - uint32_t, 1, uint32_t, 1) - -PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888, - uint32_t, 1, uint32_t, 1) - -PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888, - uint8_t, 1, uint32_t, 1) - -PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC, - uint16_t, uint16_t) - -static const pixman_fast_path_t arm_simd_fast_paths[] = -{ - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, armv6_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, armv6_composite_over_8888_n_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, armv6_composite_over_8888_n_8888), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888), - - PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8_8), - - PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, armv6_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, armv6_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888), - - PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565), - PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565), - - { PIXMAN_OP_NONE }, -}; - -pixman_implementation_t * -_pixman_implementation_create_arm_simd (pixman_implementation_t *fallback) -{ - pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths); - - return imp; -} +/*
+ * Copyright © 2008 Mozilla Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Jeff Muizelaar (jeff@infidigm.net)
+ *
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "pixman-private.h"
+#include "pixman-arm-common.h"
+#include "pixman-fast-path.h"
+
+#if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */
+
+void
+pixman_composite_add_8_8_asm_armv6 (int32_t width,
+ int32_t height,
+ uint8_t *dst_line,
+ int32_t dst_stride,
+ uint8_t *src_line,
+ int32_t src_stride)
+{
+ uint8_t *dst, *src;
+ int32_t w;
+ uint8_t s, d;
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ /* ensure both src and dst are properly aligned before doing 32 bit reads
+ * we'll stay in this loop if src and dst have differing alignments
+ */
+ while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
+ {
+ s = *src;
+ d = *dst;
+ asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
+ *dst = d;
+
+ dst++;
+ src++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ asm ("uqadd8 %0, %1, %2"
+ : "=r" (*(uint32_t*)dst)
+ : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
+ dst += 4;
+ src += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ s = *src;
+ d = *dst;
+ asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
+ *dst = d;
+
+ dst++;
+ src++;
+ w--;
+ }
+ }
+
+}
+
+void
+pixman_composite_over_8888_8888_asm_armv6 (int32_t width,
+ int32_t height,
+ uint32_t *dst_line,
+ int32_t dst_stride,
+ uint32_t *src_line,
+ int32_t src_stride)
+{
+ uint32_t *dst;
+ uint32_t *src;
+ int32_t w;
+ uint32_t component_half = 0x800080;
+ uint32_t upper_component_mask = 0xff00ff00;
+ uint32_t alpha_mask = 0xff;
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+/* #define inner_branch */
+ asm volatile (
+ "cmp %[w], #0\n\t"
+ "beq 2f\n\t"
+ "1:\n\t"
+ /* load src */
+ "ldr r5, [%[src]], #4\n\t"
+#ifdef inner_branch
+ /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
+ * The 0x0 case also allows us to avoid doing an unecessary data
+ * write which is more valuable so we only check for that
+ */
+ "cmp r5, #0\n\t"
+ "beq 3f\n\t"
+
+ /* = 255 - alpha */
+ "sub r8, %[alpha_mask], r5, lsr #24\n\t"
+
+ "ldr r4, [%[dest]] \n\t"
+
+#else
+ "ldr r4, [%[dest]] \n\t"
+
+ /* = 255 - alpha */
+ "sub r8, %[alpha_mask], r5, lsr #24\n\t"
+#endif
+ "uxtb16 r6, r4\n\t"
+ "uxtb16 r7, r4, ror #8\n\t"
+
+ /* multiply by 257 and divide by 65536 */
+ "mla r6, r6, r8, %[component_half]\n\t"
+ "mla r7, r7, r8, %[component_half]\n\t"
+
+ "uxtab16 r6, r6, r6, ror #8\n\t"
+ "uxtab16 r7, r7, r7, ror #8\n\t"
+
+ /* recombine the 0xff00ff00 bytes of r6 and r7 */
+ "and r7, r7, %[upper_component_mask]\n\t"
+ "uxtab16 r6, r7, r6, ror #8\n\t"
+
+ "uqadd8 r5, r6, r5\n\t"
+
+#ifdef inner_branch
+ "3:\n\t"
+
+#endif
+ "str r5, [%[dest]], #4\n\t"
+ /* increment counter and jmp to top */
+ "subs %[w], %[w], #1\n\t"
+ "bne 1b\n\t"
+ "2:\n\t"
+ : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
+ : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
+ [alpha_mask] "r" (alpha_mask)
+ : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
+ );
+ }
+}
+
+void
+pixman_composite_over_8888_n_8888_asm_armv6 (int32_t width,
+ int32_t height,
+ uint32_t *dst_line,
+ int32_t dst_stride,
+ uint32_t *src_line,
+ int32_t src_stride,
+ uint32_t mask)
+{
+ uint32_t *dst;
+ uint32_t *src;
+ int32_t w;
+ uint32_t component_half = 0x800080;
+ uint32_t alpha_mask = 0xff;
+
+ mask = (mask) >> 24;
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+/* #define inner_branch */
+ asm volatile (
+ "cmp %[w], #0\n\t"
+ "beq 2f\n\t"
+ "1:\n\t"
+ /* load src */
+ "ldr r5, [%[src]], #4\n\t"
+#ifdef inner_branch
+ /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
+ * The 0x0 case also allows us to avoid doing an unecessary data
+ * write which is more valuable so we only check for that
+ */
+ "cmp r5, #0\n\t"
+ "beq 3f\n\t"
+
+#endif
+ "ldr r4, [%[dest]] \n\t"
+
+ "uxtb16 r6, r5\n\t"
+ "uxtb16 r7, r5, ror #8\n\t"
+
+ /* multiply by alpha (r8) then by 257 and divide by 65536 */
+ "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
+ "mla r7, r7, %[mask_alpha], %[component_half]\n\t"
+
+ "uxtab16 r6, r6, r6, ror #8\n\t"
+ "uxtab16 r7, r7, r7, ror #8\n\t"
+
+ "uxtb16 r6, r6, ror #8\n\t"
+ "uxtb16 r7, r7, ror #8\n\t"
+
+ /* recombine */
+ "orr r5, r6, r7, lsl #8\n\t"
+
+ "uxtb16 r6, r4\n\t"
+ "uxtb16 r7, r4, ror #8\n\t"
+
+ /* 255 - alpha */
+ "sub r8, %[alpha_mask], r5, lsr #24\n\t"
+
+ /* multiply by alpha (r8) then by 257 and divide by 65536 */
+ "mla r6, r6, r8, %[component_half]\n\t"
+ "mla r7, r7, r8, %[component_half]\n\t"
+
+ "uxtab16 r6, r6, r6, ror #8\n\t"
+ "uxtab16 r7, r7, r7, ror #8\n\t"
+
+ "uxtb16 r6, r6, ror #8\n\t"
+ "uxtb16 r7, r7, ror #8\n\t"
+
+ /* recombine */
+ "orr r6, r6, r7, lsl #8\n\t"
+
+ "uqadd8 r5, r6, r5\n\t"
+
+#ifdef inner_branch
+ "3:\n\t"
+
+#endif
+ "str r5, [%[dest]], #4\n\t"
+ /* increment counter and jmp to top */
+ "subs %[w], %[w], #1\n\t"
+ "bne 1b\n\t"
+ "2:\n\t"
+ : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
+ : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
+ [alpha_mask] "r" (alpha_mask)
+ : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
+ );
+ }
+}
+
+void
+pixman_composite_over_n_8_8888_asm_armv6 (int32_t width,
+ int32_t height,
+ uint32_t *dst_line,
+ int32_t dst_stride,
+ uint32_t src,
+ int32_t unused,
+ uint8_t *mask_line,
+ int32_t mask_stride)
+{
+ uint32_t srca;
+ uint32_t *dst;
+ uint8_t *mask;
+ int32_t w;
+
+ srca = src >> 24;
+
+ uint32_t component_mask = 0xff00ff;
+ uint32_t component_half = 0x800080;
+
+ uint32_t src_hi = (src >> 8) & component_mask;
+ uint32_t src_lo = src & component_mask;
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+/* #define inner_branch */
+ asm volatile (
+ "cmp %[w], #0\n\t"
+ "beq 2f\n\t"
+ "1:\n\t"
+ /* load mask */
+ "ldrb r5, [%[mask]], #1\n\t"
+#ifdef inner_branch
+ /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
+ * The 0x0 case also allows us to avoid doing an unecessary data
+ * write which is more valuable so we only check for that
+ */
+ "cmp r5, #0\n\t"
+ "beq 3f\n\t"
+
+#endif
+ "ldr r4, [%[dest]] \n\t"
+
+ /* multiply by alpha (r8) then by 257 and divide by 65536 */
+ "mla r6, %[src_lo], r5, %[component_half]\n\t"
+ "mla r7, %[src_hi], r5, %[component_half]\n\t"
+
+ "uxtab16 r6, r6, r6, ror #8\n\t"
+ "uxtab16 r7, r7, r7, ror #8\n\t"
+
+ "uxtb16 r6, r6, ror #8\n\t"
+ "uxtb16 r7, r7, ror #8\n\t"
+
+ /* recombine */
+ "orr r5, r6, r7, lsl #8\n\t"
+
+ "uxtb16 r6, r4\n\t"
+ "uxtb16 r7, r4, ror #8\n\t"
+
+ /* we could simplify this to use 'sub' if we were
+ * willing to give up a register for alpha_mask
+ */
+ "mvn r8, r5\n\t"
+ "mov r8, r8, lsr #24\n\t"
+
+ /* multiply by alpha (r8) then by 257 and divide by 65536 */
+ "mla r6, r6, r8, %[component_half]\n\t"
+ "mla r7, r7, r8, %[component_half]\n\t"
+
+ "uxtab16 r6, r6, r6, ror #8\n\t"
+ "uxtab16 r7, r7, r7, ror #8\n\t"
+
+ "uxtb16 r6, r6, ror #8\n\t"
+ "uxtb16 r7, r7, ror #8\n\t"
+
+ /* recombine */
+ "orr r6, r6, r7, lsl #8\n\t"
+
+ "uqadd8 r5, r6, r5\n\t"
+
+#ifdef inner_branch
+ "3:\n\t"
+
+#endif
+ "str r5, [%[dest]], #4\n\t"
+ /* increment counter and jmp to top */
+ "subs %[w], %[w], #1\n\t"
+ "bne 1b\n\t"
+ "2:\n\t"
+ : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)
+ : [component_half] "r" (component_half),
+ [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
+ : "r4", "r5", "r6", "r7", "r8", "cc", "memory");
+ }
+}
+
+#endif
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
+ uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
+ uint32_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888,
+ uint32_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888,
+ uint8_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
+ uint16_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
+ uint32_t, uint32_t)
+
+static const pixman_fast_path_t arm_simd_fast_paths[] =
+{
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, armv6_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, armv6_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, armv6_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888),
+
+ PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8_8),
+
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, armv6_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, armv6_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
+
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),
+
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, armv6_8888_8888),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, armv6_8888_8888),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, armv6_8888_8888),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, armv6_8888_8888),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, armv6_8888_8888),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, armv6_8888_8888),
+
+ { PIXMAN_OP_NONE },
+};
+
+pixman_implementation_t *
+_pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
+{
+ pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths);
+
+ return imp;
+}
|