From 48d0dcbd5b7f80810ce259bc9ed6f57f99e27ca9 Mon Sep 17 00:00:00 2001 From: marha Date: Wed, 16 Feb 2011 16:53:37 +0000 Subject: pixman mesa git update 16 Feb 2011 --- pixman/Makefile.am | 2 +- pixman/configure.ac | 1 + pixman/demos/Makefile.am | 34 + pixman/demos/alpha-test.c | 117 + pixman/demos/clip-in.c | 50 + pixman/demos/clip-test.c | 97 + pixman/demos/composite-test.c | 191 ++ pixman/demos/convolution-test.c | 47 + pixman/demos/gradient-test.c | 89 + pixman/demos/gtk-utils.c | 115 + pixman/demos/gtk-utils.h | 13 + pixman/demos/radial-test.c | 198 ++ pixman/demos/screen-test.c | 44 + pixman/demos/trap-test.c | 49 + pixman/pixman/pixman-arm-common.h | 59 +- pixman/pixman/pixman-arm-neon-asm.S | 4758 ++++++++++++++++++----------------- pixman/pixman/pixman-arm-neon.c | 11 + pixman/pixman/pixman-fast-path.c | 4455 ++++++++++++++++---------------- pixman/pixman/pixman-fast-path.h | 1039 ++++---- pixman/pixman/pixman-sse2.c | 124 +- pixman/test/Makefile.am | 104 +- pixman/test/alpha-test.c | 117 - pixman/test/clip-in.c | 50 - pixman/test/clip-test.c | 97 - pixman/test/composite-test.c | 191 -- pixman/test/convolution-test.c | 47 - pixman/test/gradient-test.c | 89 - pixman/test/gtk-utils.c | 115 - pixman/test/gtk-utils.h | 13 - pixman/test/radial-test.c | 198 -- pixman/test/scaling-test.c | 618 +++-- pixman/test/screen-test.c | 44 - pixman/test/trap-test.c | 49 - 33 files changed, 6814 insertions(+), 6411 deletions(-) create mode 100644 pixman/demos/Makefile.am create mode 100644 pixman/demos/alpha-test.c create mode 100644 pixman/demos/clip-in.c create mode 100644 pixman/demos/clip-test.c create mode 100644 pixman/demos/composite-test.c create mode 100644 pixman/demos/convolution-test.c create mode 100644 pixman/demos/gradient-test.c create mode 100644 pixman/demos/gtk-utils.c create mode 100644 pixman/demos/gtk-utils.h create mode 100644 pixman/demos/radial-test.c create mode 100644 pixman/demos/screen-test.c create mode 100644 pixman/demos/trap-test.c delete mode 100644 pixman/test/alpha-test.c delete mode 100644 pixman/test/clip-in.c delete mode 100644 pixman/test/clip-test.c delete mode 100644 pixman/test/composite-test.c delete mode 100644 pixman/test/convolution-test.c delete mode 100644 pixman/test/gradient-test.c delete mode 100644 pixman/test/gtk-utils.c delete mode 100644 pixman/test/gtk-utils.h delete mode 100644 pixman/test/radial-test.c delete mode 100644 pixman/test/screen-test.c delete mode 100644 pixman/test/trap-test.c (limited to 'pixman') diff --git a/pixman/Makefile.am b/pixman/Makefile.am index 63b08c1fb..062c58a89 100644 --- a/pixman/Makefile.am +++ b/pixman/Makefile.am @@ -1,4 +1,4 @@ -SUBDIRS = pixman test +SUBDIRS = pixman test demos pkgconfigdir=$(libdir)/pkgconfig pkgconfig_DATA=pixman-1.pc diff --git a/pixman/configure.ac b/pixman/configure.ac index ab2ecde1b..5242799bb 100644 --- a/pixman/configure.ac +++ b/pixman/configure.ac @@ -794,6 +794,7 @@ AC_OUTPUT([pixman-1.pc Makefile pixman/Makefile pixman/pixman-version.h + demos/Makefile test/Makefile]) m4_if(m4_eval(pixman_minor % 2), [1], [ diff --git a/pixman/demos/Makefile.am b/pixman/demos/Makefile.am new file mode 100644 index 000000000..2dcdfd350 --- /dev/null +++ b/pixman/demos/Makefile.am @@ -0,0 +1,34 @@ +if HAVE_GTK + +AM_CFLAGS = @OPENMP_CFLAGS@ +AM_LDFLAGS = @OPENMP_CFLAGS@ + +LDADD = $(GTK_LIBS) $(top_builddir)/pixman/libpixman-1.la -lm +INCLUDES = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman $(GTK_CFLAGS) + +GTK_UTILS = gtk-utils.c gtk-utils.h + +DEMOS = \ + clip-test \ + clip-in \ + 
composite-test \ + gradient-test \ + radial-test \ + alpha-test \ + screen-test \ + convolution-test \ + trap-test + +gradient_test_SOURCES = gradient-test.c $(GTK_UTILS) +alpha_test_SOURCES = alpha-test.c $(GTK_UTILS) +composite_test_SOURCES = composite-test.c $(GTK_UTILS) +clip_test_SOURCES = clip-test.c $(GTK_UTILS) +clip_in_SOURCES = clip-in.c $(GTK_UTILS) +trap_test_SOURCES = trap-test.c $(GTK_UTILS) +screen_test_SOURCES = screen-test.c $(GTK_UTILS) +convolution_test_SOURCES = convolution-test.c $(GTK_UTILS) +radial_test_SOURCES = radial-test.c ../test/utils.c ../test/utils.h $(GTK_UTILS) + +noinst_PROGRAMS = $(DEMOS) + +endif diff --git a/pixman/demos/alpha-test.c b/pixman/demos/alpha-test.c new file mode 100644 index 000000000..92c208142 --- /dev/null +++ b/pixman/demos/alpha-test.c @@ -0,0 +1,117 @@ +#include +#include +#include "pixman.h" +#include "gtk-utils.h" + +int +main (int argc, char **argv) +{ +#define WIDTH 400 +#define HEIGHT 200 + + uint32_t *alpha = malloc (WIDTH * HEIGHT * 4); + uint32_t *dest = malloc (WIDTH * HEIGHT * 4); + uint32_t *src = malloc (WIDTH * HEIGHT * 4); + pixman_image_t *grad_img; + pixman_image_t *alpha_img; + pixman_image_t *dest_img; + pixman_image_t *src_img; + int i; + pixman_gradient_stop_t stops[2] = + { + { pixman_int_to_fixed (0), { 0x0000, 0x0000, 0x0000, 0x0000 } }, + { pixman_int_to_fixed (1), { 0xffff, 0x0000, 0x1111, 0xffff } } + }; + pixman_point_fixed_t p1 = { pixman_double_to_fixed (0), 0 }; + pixman_point_fixed_t p2 = { pixman_double_to_fixed (WIDTH), + pixman_int_to_fixed (0) }; +#if 0 + pixman_transform_t trans = { + { { pixman_double_to_fixed (2), pixman_double_to_fixed (0.5), pixman_double_to_fixed (-100), }, + { pixman_double_to_fixed (0), pixman_double_to_fixed (3), pixman_double_to_fixed (0), }, + { pixman_double_to_fixed (0), pixman_double_to_fixed (0.000), pixman_double_to_fixed (1.0) } + } + }; +#else + pixman_transform_t trans = { + { { pixman_fixed_1, 0, 0 }, + { 0, pixman_fixed_1, 0 }, + { 0, 0, pixman_fixed_1 } } + }; +#endif + + pixman_point_fixed_t c_inner; + pixman_point_fixed_t c_outer; + pixman_fixed_t r_inner; + pixman_fixed_t r_outer; + + for (i = 0; i < WIDTH * HEIGHT; ++i) + alpha[i] = 0x4f00004f; /* pale blue */ + + alpha_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, + WIDTH, HEIGHT, + alpha, + WIDTH * 4); + + for (i = 0; i < WIDTH * HEIGHT; ++i) + dest[i] = 0xffffff00; /* yellow */ + + dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, + WIDTH, HEIGHT, + dest, + WIDTH * 4); + + for (i = 0; i < WIDTH * HEIGHT; ++i) + src[i] = 0xffff0000; + + src_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, + WIDTH, HEIGHT, + src, + WIDTH * 4); + + c_inner.x = pixman_double_to_fixed (50.0); + c_inner.y = pixman_double_to_fixed (50.0); + c_outer.x = pixman_double_to_fixed (50.0); + c_outer.y = pixman_double_to_fixed (50.0); + r_inner = 0; + r_outer = pixman_double_to_fixed (50.0); + +#if 0 + grad_img = pixman_image_create_conical_gradient (&c_inner, r_inner, + stops, 2); +#endif +#if 0 + grad_img = pixman_image_create_conical_gradient (&c_inner, r_inner, + stops, 2); + grad_img = pixman_image_create_linear_gradient (&c_inner, &c_outer, + r_inner, r_outer, + stops, 2); +#endif + + grad_img = pixman_image_create_linear_gradient (&p1, &p2, + stops, 2); + + pixman_image_set_transform (grad_img, &trans); + pixman_image_set_repeat (grad_img, PIXMAN_REPEAT_PAD); + + pixman_image_composite (PIXMAN_OP_OVER, grad_img, NULL, alpha_img, + 0, 0, 0, 0, 0, 0, 10 * WIDTH, HEIGHT); + + pixman_image_set_alpha_map (src_img, alpha_img, 
10, 10); + + pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dest_img, + 0, 0, 0, 0, 0, 0, 10 * WIDTH, HEIGHT); + + printf ("0, 0: %x\n", dest[0]); + printf ("10, 10: %x\n", dest[10 * 10 + 10]); + printf ("w, h: %x\n", dest[(HEIGHT - 1) * 100 + (WIDTH - 1)]); + + show_image (dest_img); + + pixman_image_unref (src_img); + pixman_image_unref (grad_img); + pixman_image_unref (alpha_img); + free (dest); + + return 0; +} diff --git a/pixman/demos/clip-in.c b/pixman/demos/clip-in.c new file mode 100644 index 000000000..51579811f --- /dev/null +++ b/pixman/demos/clip-in.c @@ -0,0 +1,50 @@ +#include +#include +#include +#include "pixman.h" +#include "gtk-utils.h" + +/* This test demonstrates that clipping is done totally different depending + * on whether the source is transformed or not. + */ +int +main (int argc, char **argv) +{ +#define WIDTH 200 +#define HEIGHT 200 + +#define SMALL 25 + + uint32_t *sbits = malloc (SMALL * SMALL * 4); + uint32_t *bits = malloc (WIDTH * HEIGHT * 4); + pixman_transform_t trans = { + { + { pixman_double_to_fixed (1.0), pixman_double_to_fixed (0), pixman_double_to_fixed (-0.1), }, + { pixman_double_to_fixed (0), pixman_double_to_fixed (1), pixman_double_to_fixed (-0.1), }, + { pixman_double_to_fixed (0), pixman_double_to_fixed (0), pixman_double_to_fixed (1.0) } + } }; + + pixman_image_t *src_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, SMALL, SMALL, sbits, 4 * SMALL); + pixman_image_t *dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, bits, 4 * WIDTH); + + memset (bits, 0xff, WIDTH * HEIGHT * 4); + memset (sbits, 0x00, SMALL * SMALL * 4); + + pixman_image_composite (PIXMAN_OP_IN, + src_img, NULL, dest_img, + 0, 0, 0, 0, SMALL, SMALL, 200, 200); + + pixman_image_set_transform (src_img, &trans); + + pixman_image_composite (PIXMAN_OP_IN, + src_img, NULL, dest_img, + 0, 0, 0, 0, SMALL * 2, SMALL * 2, 200, 200); + + show_image (dest_img); + + pixman_image_unref (src_img); + pixman_image_unref (dest_img); + free (bits); + + return 0; +} diff --git a/pixman/demos/clip-test.c b/pixman/demos/clip-test.c new file mode 100644 index 000000000..aa0df4482 --- /dev/null +++ b/pixman/demos/clip-test.c @@ -0,0 +1,97 @@ +#include +#include +#include "pixman.h" +#include "gtk-utils.h" + +#define WIDTH 200 +#define HEIGHT 200 + +static pixman_image_t * +create_solid_bits (uint32_t pixel) +{ + uint32_t *pixels = malloc (WIDTH * HEIGHT * 4); + int i; + + for (i = 0; i < WIDTH * HEIGHT; ++i) + pixels[i] = pixel; + + return pixman_image_create_bits (PIXMAN_a8r8g8b8, + WIDTH, HEIGHT, + pixels, + WIDTH * 4); +} + +int +main (int argc, char **argv) +{ + pixman_image_t *gradient_img; + pixman_image_t *src_img, *dst_img; + pixman_gradient_stop_t stops[2] = + { + { pixman_int_to_fixed (0), { 0xffff, 0x0000, 0x0000, 0xffff } }, + { pixman_int_to_fixed (1), { 0xffff, 0xffff, 0x0000, 0xffff } } + }; +#if 0 + pixman_point_fixed_t p1 = { 0, 0 }; + pixman_point_fixed_t p2 = { pixman_int_to_fixed (WIDTH), + pixman_int_to_fixed (HEIGHT) }; +#endif + pixman_point_fixed_t c_inner; + pixman_point_fixed_t c_outer; + pixman_fixed_t r_inner; + pixman_fixed_t r_outer; + pixman_region32_t clip_region; + pixman_transform_t trans = { + { { pixman_double_to_fixed (1.3), pixman_double_to_fixed (0), pixman_double_to_fixed (-0.5), }, + { pixman_double_to_fixed (0), pixman_double_to_fixed (1), pixman_double_to_fixed (-0.5), }, + { pixman_double_to_fixed (0), pixman_double_to_fixed (0), pixman_double_to_fixed (1.0) } + } + }; + + src_img = create_solid_bits (0xff0000ff); + + c_inner.x = 
pixman_double_to_fixed (100.0); + c_inner.y = pixman_double_to_fixed (100.0); + c_outer.x = pixman_double_to_fixed (100.0); + c_outer.y = pixman_double_to_fixed (100.0); + r_inner = 0; + r_outer = pixman_double_to_fixed (100.0); + + gradient_img = pixman_image_create_radial_gradient (&c_inner, &c_outer, + r_inner, r_outer, + stops, 2); + +#if 0 + gradient_img = pixman_image_create_linear_gradient (&p1, &p2, + stops, 2); + +#endif + + pixman_image_composite (PIXMAN_OP_OVER, gradient_img, NULL, src_img, + 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT); + + pixman_region32_init_rect (&clip_region, 50, 0, 100, 200); + pixman_image_set_clip_region32 (src_img, &clip_region); + pixman_image_set_source_clipping (src_img, TRUE); + pixman_image_set_has_client_clip (src_img, TRUE); + pixman_image_set_transform (src_img, &trans); + pixman_image_set_repeat (src_img, PIXMAN_REPEAT_NORMAL); + + dst_img = create_solid_bits (0xffff0000); + pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dst_img, + 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT); + + +#if 0 + printf ("0, 0: %x\n", src[0]); + printf ("10, 10: %x\n", src[10 * 10 + 10]); + printf ("w, h: %x\n", src[(HEIGHT - 1) * 100 + (WIDTH - 1)]); +#endif + + show_image (dst_img); + + pixman_image_unref (gradient_img); + pixman_image_unref (src_img); + + return 0; +} diff --git a/pixman/demos/composite-test.c b/pixman/demos/composite-test.c new file mode 100644 index 000000000..79d5d5eac --- /dev/null +++ b/pixman/demos/composite-test.c @@ -0,0 +1,191 @@ +#include +#include +#include +#include "pixman.h" +#include "gtk-utils.h" + +#define WIDTH 60 +#define HEIGHT 60 + +typedef struct { + const char *name; + pixman_op_t op; +} operator_t; + +static const operator_t operators[] = { + { "CLEAR", PIXMAN_OP_CLEAR }, + { "SRC", PIXMAN_OP_SRC }, + { "DST", PIXMAN_OP_DST }, + { "OVER", PIXMAN_OP_OVER }, + { "OVER_REVERSE", PIXMAN_OP_OVER_REVERSE }, + { "IN", PIXMAN_OP_IN }, + { "IN_REVERSE", PIXMAN_OP_IN_REVERSE }, + { "OUT", PIXMAN_OP_OUT }, + { "OUT_REVERSE", PIXMAN_OP_OUT_REVERSE }, + { "ATOP", PIXMAN_OP_ATOP }, + { "ATOP_REVERSE", PIXMAN_OP_ATOP_REVERSE }, + { "XOR", PIXMAN_OP_XOR }, + { "ADD", PIXMAN_OP_ADD }, + { "SATURATE", PIXMAN_OP_SATURATE }, + + { "MULTIPLY", PIXMAN_OP_MULTIPLY }, + { "SCREEN", PIXMAN_OP_SCREEN }, + { "OVERLAY", PIXMAN_OP_OVERLAY }, + { "DARKEN", PIXMAN_OP_DARKEN }, + { "LIGHTEN", PIXMAN_OP_LIGHTEN }, + { "COLOR_DODGE", PIXMAN_OP_COLOR_DODGE }, + { "COLOR_BURN", PIXMAN_OP_COLOR_BURN }, + { "HARD_LIGHT", PIXMAN_OP_HARD_LIGHT }, + { "SOFT_LIGHT", PIXMAN_OP_SOFT_LIGHT }, + { "DIFFERENCE", PIXMAN_OP_DIFFERENCE }, + { "EXCLUSION", PIXMAN_OP_EXCLUSION }, + { "HSL_HUE", PIXMAN_OP_HSL_HUE }, + { "HSL_SATURATION", PIXMAN_OP_HSL_SATURATION }, + { "HSL_COLOR", PIXMAN_OP_HSL_COLOR }, + { "HSL_LUMINOSITY", PIXMAN_OP_HSL_LUMINOSITY }, +}; + +static uint32_t +reader (const void *src, int size) +{ + switch (size) + { + case 1: + return *(uint8_t *)src; + case 2: + return *(uint16_t *)src; + case 4: + return *(uint32_t *)src; + default: + g_assert_not_reached(); + } +} + +static void +writer (void *src, uint32_t value, int size) +{ + switch (size) + { + case 1: + *(uint8_t *)src = value; + break; + + case 2: + *(uint16_t *)src = value; + break; + + case 4: + *(uint32_t *)src = value; + break; + + default: + break; + } +} + +int +main (int argc, char **argv) +{ +#define d2f pixman_double_to_fixed + + GtkWidget *window, *swindow; + GtkWidget *table; + uint32_t *dest = malloc (WIDTH * HEIGHT * 4); + uint32_t *src = malloc (WIDTH * HEIGHT * 4); + pixman_image_t *src_img; + 
pixman_image_t *dest_img; + pixman_point_fixed_t p1 = { -10 << 0, 0 }; + pixman_point_fixed_t p2 = { WIDTH << 16, (HEIGHT - 10) << 16 }; + uint16_t full = 0xcfff; + uint16_t low = 0x5000; + uint16_t alpha = 0xffff; + pixman_gradient_stop_t stops[6] = + { + { d2f (0.0), { full, low, low, alpha } }, + { d2f (0.25), { full, full, low, alpha } }, + { d2f (0.4), { low, full, low, alpha } }, + { d2f (0.6), { low, full, full, alpha } }, + { d2f (0.8), { low, low, full, alpha } }, + { d2f (1.0), { full, low, full, alpha } }, + }; + + int i; + + gtk_init (&argc, &argv); + + window = gtk_window_new (GTK_WINDOW_TOPLEVEL); + + gtk_window_set_default_size (GTK_WINDOW (window), 800, 600); + + g_signal_connect (window, "delete-event", + G_CALLBACK (gtk_main_quit), + NULL); + table = gtk_table_new (G_N_ELEMENTS (operators) / 6, 6, TRUE); + + src_img = pixman_image_create_linear_gradient (&p1, &p2, stops, + sizeof (stops) / sizeof (stops[0])); + + pixman_image_set_repeat (src_img, PIXMAN_REPEAT_PAD); + + dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, + WIDTH, HEIGHT, + dest, + WIDTH * 4); + pixman_image_set_accessors (dest_img, reader, writer); + + for (i = 0; i < G_N_ELEMENTS (operators); ++i) + { + GtkWidget *image; + GdkPixbuf *pixbuf; + GtkWidget *vbox; + GtkWidget *label; + int j, k; + + vbox = gtk_vbox_new (FALSE, 0); + + label = gtk_label_new (operators[i].name); + gtk_box_pack_start (GTK_BOX (vbox), label, FALSE, FALSE, 6); + gtk_widget_show (label); + + for (j = 0; j < HEIGHT; ++j) + { + for (k = 0; k < WIDTH; ++k) + dest[j * WIDTH + k] = 0x7f6f6f00; + } + pixman_image_composite (operators[i].op, src_img, NULL, dest_img, + 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT); + pixbuf = pixbuf_from_argb32 (pixman_image_get_data (dest_img), TRUE, + WIDTH, HEIGHT, WIDTH * 4); + image = gtk_image_new_from_pixbuf (pixbuf); + gtk_box_pack_start (GTK_BOX (vbox), image, FALSE, FALSE, 0); + gtk_widget_show (image); + + gtk_table_attach_defaults (GTK_TABLE (table), vbox, + i % 6, (i % 6) + 1, i / 6, (i / 6) + 1); + gtk_widget_show (vbox); + + g_object_unref (pixbuf); + } + + pixman_image_unref (src_img); + free (src); + pixman_image_unref (dest_img); + free (dest); + + swindow = gtk_scrolled_window_new (NULL, NULL); + gtk_scrolled_window_set_policy (GTK_SCROLLED_WINDOW (swindow), + GTK_POLICY_AUTOMATIC, + GTK_POLICY_AUTOMATIC); + + gtk_scrolled_window_add_with_viewport (GTK_SCROLLED_WINDOW (swindow), table); + gtk_widget_show (table); + + gtk_container_add (GTK_CONTAINER (window), swindow); + gtk_widget_show (swindow); + + gtk_widget_show (window); + + gtk_main (); + + return 0; +} diff --git a/pixman/demos/convolution-test.c b/pixman/demos/convolution-test.c new file mode 100644 index 000000000..da284af7b --- /dev/null +++ b/pixman/demos/convolution-test.c @@ -0,0 +1,47 @@ +#include +#include +#include "pixman.h" +#include "gtk-utils.h" + +int +main (int argc, char **argv) +{ +#define WIDTH 200 +#define HEIGHT 200 + +#define d2f pixman_double_to_fixed + + uint32_t *src = malloc (WIDTH * HEIGHT * 4); + uint32_t *mask = malloc (WIDTH * HEIGHT * 4); + uint32_t *dest = malloc (WIDTH * HEIGHT * 4); + pixman_fixed_t convolution[] = + { + d2f (3), d2f (3), + d2f (0.5), d2f (0.5), d2f (0.5), + d2f (0.5), d2f (0.5), d2f (0.5), + d2f (0.5), d2f (0.5), d2f (0.5), + }; + pixman_image_t *simg, *mimg, *dimg; + + int i; + + for (i = 0; i < WIDTH * HEIGHT; ++i) + { + src[i] = 0x7f007f00; + mask[i] = (i % 256) * 0x01000000; + dest[i] = 0; + } + + simg = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src, WIDTH * 4); + mimg 
= pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, mask, WIDTH * 4); + dimg = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, dest, WIDTH * 4); + + pixman_image_set_filter (mimg, PIXMAN_FILTER_CONVOLUTION, + convolution, 11); + + pixman_image_composite (PIXMAN_OP_OVER, simg, mimg, dimg, 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT); + + show_image (dimg); + + return 0; +} diff --git a/pixman/demos/gradient-test.c b/pixman/demos/gradient-test.c new file mode 100644 index 000000000..fc84844b0 --- /dev/null +++ b/pixman/demos/gradient-test.c @@ -0,0 +1,89 @@ +#include +#include +#include "pixman.h" +#include "gtk-utils.h" + +int +main (int argc, char **argv) +{ +#define WIDTH 400 +#define HEIGHT 200 + + uint32_t *dest = malloc (WIDTH * HEIGHT * 4); + pixman_image_t *src_img; + pixman_image_t *dest_img; + int i; + pixman_gradient_stop_t stops[2] = + { + { pixman_int_to_fixed (0), { 0xffff, 0xeeee, 0xeeee, 0xeeee } }, + { pixman_int_to_fixed (1), { 0xffff, 0x1111, 0x1111, 0x1111 } } + }; + pixman_point_fixed_t p1 = { pixman_double_to_fixed (0), 0 }; + pixman_point_fixed_t p2 = { pixman_double_to_fixed (WIDTH / 8.), + pixman_int_to_fixed (0) }; +#if 0 + pixman_transform_t trans = { + { { pixman_double_to_fixed (2), pixman_double_to_fixed (0.5), pixman_double_to_fixed (-100), }, + { pixman_double_to_fixed (0), pixman_double_to_fixed (3), pixman_double_to_fixed (0), }, + { pixman_double_to_fixed (0), pixman_double_to_fixed (0.000), pixman_double_to_fixed (1.0) } + } + }; +#else + pixman_transform_t trans = { + { { pixman_fixed_1, 0, 0 }, + { 0, pixman_fixed_1, 0 }, + { 0, 0, pixman_fixed_1 } } + }; +#endif + + pixman_point_fixed_t c_inner; + pixman_point_fixed_t c_outer; + pixman_fixed_t r_inner; + pixman_fixed_t r_outer; + + for (i = 0; i < WIDTH * HEIGHT; ++i) + dest[i] = 0x4f00004f; /* pale blue */ + + dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, + WIDTH, HEIGHT, + dest, + WIDTH * 4); + + c_inner.x = pixman_double_to_fixed (50.0); + c_inner.y = pixman_double_to_fixed (50.0); + c_outer.x = pixman_double_to_fixed (50.0); + c_outer.y = pixman_double_to_fixed (50.0); + r_inner = 0; + r_outer = pixman_double_to_fixed (50.0); + + src_img = pixman_image_create_conical_gradient (&c_inner, r_inner, + stops, 2); +#if 0 + src_img = pixman_image_create_conical_gradient (&c_inner, r_inner, + stops, 2); + src_img = pixman_image_create_linear_gradient (&c_inner, &c_outer, + r_inner, r_outer, + stops, 2); +#endif + + src_img = pixman_image_create_linear_gradient (&p1, &p2, + stops, 2); + + pixman_image_set_transform (src_img, &trans); + pixman_image_set_repeat (src_img, PIXMAN_REPEAT_PAD); + + pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dest_img, + 0, 0, 0, 0, 0, 0, 10 * WIDTH, HEIGHT); + + printf ("0, 0: %x\n", dest[0]); + printf ("10, 10: %x\n", dest[10 * 10 + 10]); + printf ("w, h: %x\n", dest[(HEIGHT - 1) * 100 + (WIDTH - 1)]); + + show_image (dest_img); + + pixman_image_unref (src_img); + pixman_image_unref (dest_img); + free (dest); + + return 0; +} diff --git a/pixman/demos/gtk-utils.c b/pixman/demos/gtk-utils.c new file mode 100644 index 000000000..f45cdc912 --- /dev/null +++ b/pixman/demos/gtk-utils.c @@ -0,0 +1,115 @@ +#include +#include +#include "pixman-private.h" /* For image->bits.format + * FIXME: there should probably be public API for this + */ +#include "gtk-utils.h" + +GdkPixbuf * +pixbuf_from_argb32 (uint32_t *bits, + gboolean has_alpha, + int width, + int height, + int stride) +{ + GdkPixbuf *pixbuf = gdk_pixbuf_new (GDK_COLORSPACE_RGB, TRUE, + 8, width, height); 
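+
+    /* pixman's a8r8g8b8 pixels carry premultiplied alpha, while
+     * GdkPixbuf expects non-premultiplied RGBA, so the loop below
+     * un-premultiplies each color channel (r * 255 / a) while
+     * repacking the pixels.
+     */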
+ int p_stride = gdk_pixbuf_get_rowstride (pixbuf); + guint32 *p_bits = (guint32 *)gdk_pixbuf_get_pixels (pixbuf); + int w, h; + + for (h = 0; h < height; ++h) + { + for (w = 0; w < width; ++w) + { + uint32_t argb = bits[h * (stride / 4) + w]; + guint r, g, b, a; + char *pb = (char *)p_bits; + + pb += h * p_stride + w * 4; + + r = (argb & 0x00ff0000) >> 16; + g = (argb & 0x0000ff00) >> 8; + b = (argb & 0x000000ff) >> 0; + a = has_alpha? (argb & 0xff000000) >> 24 : 0xff; + + if (a) + { + r = (r * 255) / a; + g = (g * 255) / a; + b = (b * 255) / a; + } + + if (r > 255) r = 255; + if (g > 255) g = 255; + if (b > 255) b = 255; + + pb[0] = r; + pb[1] = g; + pb[2] = b; + pb[3] = a; + } + } + + return pixbuf; +} + + +static gboolean +on_expose (GtkWidget *widget, GdkEventExpose *expose, gpointer data) +{ + GdkPixbuf *pixbuf = data; + + gdk_draw_pixbuf (widget->window, NULL, + pixbuf, 0, 0, 0, 0, + gdk_pixbuf_get_width (pixbuf), + gdk_pixbuf_get_height (pixbuf), + GDK_RGB_DITHER_NONE, + 0, 0); + + return TRUE; +} + +void +show_image (pixman_image_t *image) +{ + GtkWidget *window; + GdkPixbuf *pixbuf; + int width, height, stride; + int argc; + char **argv; + char *arg0 = g_strdup ("pixman-test-program"); + gboolean has_alpha; + pixman_format_code_t format; + + argc = 1; + argv = (char **)&arg0; + + gtk_init (&argc, &argv); + + window = gtk_window_new (GTK_WINDOW_TOPLEVEL); + width = pixman_image_get_width (image); + height = pixman_image_get_height (image); + stride = pixman_image_get_stride (image); + + gtk_window_set_default_size (GTK_WINDOW (window), width, height); + + format = image->bits.format; + + if (format == PIXMAN_a8r8g8b8) + has_alpha = TRUE; + else if (format == PIXMAN_x8r8g8b8) + has_alpha = FALSE; + else + g_error ("Can't deal with this format: %x\n", format); + + pixbuf = pixbuf_from_argb32 (pixman_image_get_data (image), has_alpha, + width, height, stride); + + g_signal_connect (window, "expose_event", G_CALLBACK (on_expose), pixbuf); + g_signal_connect (window, "delete_event", G_CALLBACK (gtk_main_quit), NULL); + + gtk_widget_show (window); + + gtk_main (); +} diff --git a/pixman/demos/gtk-utils.h b/pixman/demos/gtk-utils.h new file mode 100644 index 000000000..2cb13bcf0 --- /dev/null +++ b/pixman/demos/gtk-utils.h @@ -0,0 +1,13 @@ +#include +#include +#include +#include +#include "pixman.h" + +void show_image (pixman_image_t *image); + +GdkPixbuf *pixbuf_from_argb32 (uint32_t *bits, + gboolean has_alpha, + int width, + int height, + int stride); diff --git a/pixman/demos/radial-test.c b/pixman/demos/radial-test.c new file mode 100644 index 000000000..35e90d786 --- /dev/null +++ b/pixman/demos/radial-test.c @@ -0,0 +1,198 @@ +#include "../test/utils.h" +#include "gtk-utils.h" + +#define NUM_GRADIENTS 7 +#define NUM_STOPS 3 +#define NUM_REPEAT 4 +#define SIZE 128 +#define WIDTH (SIZE * NUM_GRADIENTS) +#define HEIGHT (SIZE * NUM_REPEAT) + +/* + * We want to test all the possible relative positions of the start + * and end circle: + * + * - The start circle can be smaller/equal/bigger than the end + * circle. A radial gradient can be classified in one of these + * three cases depending on the sign of dr. + * + * - The smaller circle can be completely inside/internally + * tangent/outside (at least in part) of the bigger circle. This + * classification is the same as the one which can be computed by + * examining the sign of a = (dx^2 + dy^2 - dr^2). 
+ * + * - If the two circles have the same size, neither can be inside or + * internally tangent + * + * This test draws radial gradients whose circles always have the same + * centers (0, 0) and (1, 0), but with different radiuses. From left + * to right: + * + * - Small start circle completely inside the end circle + * 0.25 -> 1.75; dr = 1.5 > 0; a = 1 - 1.50^2 < 0 + * + * - Small start circle internally tangent to the end circle + * 0.50 -> 1.50; dr = 1.0 > 0; a = 1 - 1.00^2 = 0 + * + * - Small start circle outside of the end circle + * 0.50 -> 1.00; dr = 0.5 > 0; a = 1 - 0.50^2 > 0 + * + * - Start circle with the same size as the end circle + * 1.00 -> 1.00; dr = 0.0 = 0; a = 1 - 0.00^2 > 0 + * + * - Small end circle outside of the start circle + * 1.00 -> 0.50; dr = -0.5 > 0; a = 1 - 0.50^2 > 0 + * + * - Small end circle internally tangent to the start circle + * 1.50 -> 0.50; dr = -1.0 > 0; a = 1 - 1.00^2 = 0 + * + * - Small end circle completely inside the start circle + * 1.75 -> 0.25; dr = -1.5 > 0; a = 1 - 1.50^2 < 0 + * + */ + +const static double radiuses[NUM_GRADIENTS] = { + 0.25, + 0.50, + 0.50, + 1.00, + 1.00, + 1.50, + 1.75 +}; + +#define double_to_color(x) \ + (((uint32_t) ((x)*65536)) - (((uint32_t) ((x)*65536)) >> 16)) + +#define PIXMAN_STOP(offset,r,g,b,a) \ + { pixman_double_to_fixed (offset), \ + { \ + double_to_color (r), \ + double_to_color (g), \ + double_to_color (b), \ + double_to_color (a) \ + } \ + } + +static const pixman_gradient_stop_t stops[NUM_STOPS] = { + PIXMAN_STOP (0.0, 1, 0, 0, 0.75), + PIXMAN_STOP (0.70710678, 0, 1, 0, 0), + PIXMAN_STOP (1.0, 0, 0, 1, 1) +}; + +static pixman_image_t * +create_radial (int index) +{ + pixman_point_fixed_t p0, p1; + pixman_fixed_t r0, r1; + double x0, x1, radius0, radius1, left, right, center; + + x0 = 0; + x1 = 1; + radius0 = radiuses[index]; + radius1 = radiuses[NUM_GRADIENTS - index - 1]; + + /* center the gradient */ + left = MIN (x0 - radius0, x1 - radius1); + right = MAX (x0 + radius0, x1 + radius1); + center = (left + right) * 0.5; + x0 -= center; + x1 -= center; + + /* scale to make it fit within a 1x1 rect centered in (0,0) */ + x0 *= 0.25; + x1 *= 0.25; + radius0 *= 0.25; + radius1 *= 0.25; + + p0.x = pixman_double_to_fixed (x0); + p0.y = pixman_double_to_fixed (0); + + p1.x = pixman_double_to_fixed (x1); + p1.y = pixman_double_to_fixed (0); + + r0 = pixman_double_to_fixed (radius0); + r1 = pixman_double_to_fixed (radius1); + + return pixman_image_create_radial_gradient (&p0, &p1, + r0, r1, + stops, NUM_STOPS); +} + +static const pixman_repeat_t repeat[NUM_REPEAT] = { + PIXMAN_REPEAT_NONE, + PIXMAN_REPEAT_NORMAL, + PIXMAN_REPEAT_REFLECT, + PIXMAN_REPEAT_PAD +}; + +int +main (int argc, char **argv) +{ + pixman_transform_t transform; + pixman_image_t *src_img, *dest_img; + int i, j; + + enable_fp_exceptions (); + + dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, + WIDTH, HEIGHT, + NULL, 0); + + pixman_transform_init_identity (&transform); + + /* + * The create_radial() function returns gradients centered in the + * origin and whose interesting part fits a 1x1 square. We want to + * paint these gradients on a SIZExSIZE square and to make things + * easier we want the origin in the top-left corner of the square + * we want to see. 
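+     *
+     * In other words, the translate below moves the top-left corner of
+     * that 1x1 square to the origin, and the scale then maps it onto a
+     * SIZExSIZE cell of the output grid.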
+ */ + pixman_transform_translate (NULL, &transform, + pixman_double_to_fixed (0.5), + pixman_double_to_fixed (0.5)); + + pixman_transform_scale (NULL, &transform, + pixman_double_to_fixed (SIZE), + pixman_double_to_fixed (SIZE)); + + /* + * Gradients are evaluated at the center of each pixel, so we need + * to translate by half a pixel to trigger some interesting + * cornercases. In particular, the original implementation of PDF + * radial gradients tried to divide by 0 when using this transform + * on the "tangent circles" cases. + */ + pixman_transform_translate (NULL, &transform, + pixman_double_to_fixed (0.5), + pixman_double_to_fixed (0.5)); + + for (i = 0; i < NUM_GRADIENTS; i++) + { + src_img = create_radial (i); + pixman_image_set_transform (src_img, &transform); + + for (j = 0; j < NUM_REPEAT; j++) + { + pixman_image_set_repeat (src_img, repeat[j]); + + pixman_image_composite32 (PIXMAN_OP_OVER, + src_img, + NULL, + dest_img, + 0, 0, + 0, 0, + i * SIZE, j * SIZE, + SIZE, SIZE); + + } + + pixman_image_unref (src_img); + } + + show_image (dest_img); + + pixman_image_unref (dest_img); + + return 0; +} diff --git a/pixman/demos/screen-test.c b/pixman/demos/screen-test.c new file mode 100644 index 000000000..e69dba3de --- /dev/null +++ b/pixman/demos/screen-test.c @@ -0,0 +1,44 @@ +#include +#include +#include "pixman.h" +#include "gtk-utils.h" + +int +main (int argc, char **argv) +{ +#define WIDTH 40 +#define HEIGHT 40 + + uint32_t *src1 = malloc (WIDTH * HEIGHT * 4); + uint32_t *src2 = malloc (WIDTH * HEIGHT * 4); + uint32_t *src3 = malloc (WIDTH * HEIGHT * 4); + uint32_t *dest = malloc (3 * WIDTH * 2 * HEIGHT * 4); + pixman_image_t *simg1, *simg2, *simg3, *dimg; + + int i; + + for (i = 0; i < WIDTH * HEIGHT; ++i) + { + src1[i] = 0x7ff00000; + src2[i] = 0x7f00ff00; + src3[i] = 0x7f0000ff; + } + + for (i = 0; i < 3 * WIDTH * 2 * HEIGHT; ++i) + { + dest[i] = 0x0; + } + + simg1 = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src1, WIDTH * 4); + simg2 = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src2, WIDTH * 4); + simg3 = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src3, WIDTH * 4); + dimg = pixman_image_create_bits (PIXMAN_a8r8g8b8, 3 * WIDTH, 2 * HEIGHT, dest, 3 * WIDTH * 4); + + pixman_image_composite (PIXMAN_OP_SCREEN, simg1, NULL, dimg, 0, 0, 0, 0, WIDTH, HEIGHT / 4, WIDTH, HEIGHT); + pixman_image_composite (PIXMAN_OP_SCREEN, simg2, NULL, dimg, 0, 0, 0, 0, (WIDTH/2), HEIGHT / 4 + HEIGHT / 2, WIDTH, HEIGHT); + pixman_image_composite (PIXMAN_OP_SCREEN, simg3, NULL, dimg, 0, 0, 0, 0, (4 * WIDTH) / 3, HEIGHT, WIDTH, HEIGHT); + + show_image (dimg); + + return 0; +} diff --git a/pixman/demos/trap-test.c b/pixman/demos/trap-test.c new file mode 100644 index 000000000..19295e7a5 --- /dev/null +++ b/pixman/demos/trap-test.c @@ -0,0 +1,49 @@ +#include +#include +#include +#include "pixman.h" +#include "gtk-utils.h" + +int +main (int argc, char **argv) +{ +#define WIDTH 200 +#define HEIGHT 200 + + pixman_image_t *src_img; + pixman_image_t *mask_img; + pixman_image_t *dest_img; + pixman_trap_t trap; + pixman_color_t white = { 0x0000, 0xffff, 0x0000, 0xffff }; + uint32_t *bits = malloc (WIDTH * HEIGHT * 4); + uint32_t *mbits = malloc (WIDTH * HEIGHT); + + memset (mbits, 0, WIDTH * HEIGHT); + memset (bits, 0xff, WIDTH * HEIGHT * 4); + + trap.top.l = pixman_int_to_fixed (50) + 0x8000; + trap.top.r = pixman_int_to_fixed (150) + 0x8000; + trap.top.y = pixman_int_to_fixed (30); + + trap.bot.l = pixman_int_to_fixed (50) + 0x8000; + trap.bot.r = 
pixman_int_to_fixed (150) + 0x8000; + trap.bot.y = pixman_int_to_fixed (150); + + mask_img = pixman_image_create_bits (PIXMAN_a8, WIDTH, HEIGHT, mbits, WIDTH); + src_img = pixman_image_create_solid_fill (&white); + dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, bits, WIDTH * 4); + + pixman_add_traps (mask_img, 0, 0, 1, &trap); + + pixman_image_composite (PIXMAN_OP_OVER, + src_img, mask_img, dest_img, + 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT); + + show_image (dest_img); + + pixman_image_unref (src_img); + pixman_image_unref (dest_img); + free (bits); + + return 0; +} diff --git a/pixman/pixman/pixman-arm-common.h b/pixman/pixman/pixman-arm-common.h index 372e9f9a8..9b1322b6c 100644 --- a/pixman/pixman/pixman-arm-common.h +++ b/pixman/pixman/pixman-arm-common.h @@ -282,19 +282,20 @@ cputype##_composite_##name (pixman_implementation_t *imp, \ src_type, dst_type) \ void \ pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype ( \ - int32_t w, \ - dst_type * dst, \ - src_type * src, \ - pixman_fixed_t vx, \ - pixman_fixed_t unit_x);\ + int32_t w, \ + dst_type * dst, \ + const src_type * src, \ + pixman_fixed_t vx, \ + pixman_fixed_t unit_x); \ \ static force_inline void \ scaled_nearest_scanline_##cputype##_##name##_##op (dst_type * pd, \ - src_type * ps, \ + const src_type * ps, \ int32_t w, \ pixman_fixed_t vx, \ pixman_fixed_t unit_x, \ - pixman_fixed_t max_vx) \ + pixman_fixed_t max_vx, \ + pixman_bool_t zero_src) \ { \ pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps, \ vx, unit_x);\ @@ -316,4 +317,48 @@ FAST_NEAREST_MAINLOOP (cputype##_##name##_pad_##op, \ SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func), \ SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func) +#define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST(flags, cputype, name, op, \ + src_type, dst_type) \ +void \ +pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype ( \ + int32_t w, \ + dst_type * dst, \ + const src_type * src, \ + pixman_fixed_t vx, \ + pixman_fixed_t unit_x, \ + const uint8_t * mask); \ + \ +static force_inline void \ +scaled_nearest_scanline_##cputype##_##name##_##op (const uint8_t * mask, \ + dst_type * pd, \ + const src_type * ps, \ + int32_t w, \ + pixman_fixed_t vx, \ + pixman_fixed_t unit_x, \ + pixman_fixed_t max_vx, \ + pixman_bool_t zero_src) \ +{ \ + if ((flags & SKIP_ZERO_SRC) && zero_src) \ + return; \ + pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps, \ + vx, unit_x, \ + mask); \ +} \ + \ +FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_cover_##op, \ + scaled_nearest_scanline_##cputype##_##name##_##op,\ + src_type, uint8_t, dst_type, COVER, TRUE, FALSE)\ +FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_none_##op, \ + scaled_nearest_scanline_##cputype##_##name##_##op,\ + src_type, uint8_t, dst_type, NONE, TRUE, FALSE) \ +FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_pad_##op, \ + scaled_nearest_scanline_##cputype##_##name##_##op,\ + src_type, uint8_t, dst_type, PAD, TRUE, FALSE) + +/* Provide entries for the fast path table */ +#define PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func) \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func), \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func), \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func) + #endif diff --git a/pixman/pixman/pixman-arm-neon-asm.S b/pixman/pixman/pixman-arm-neon-asm.S index 51533d42f..47daf457c 100644 --- a/pixman/pixman/pixman-arm-neon-asm.S +++ b/pixman/pixman/pixman-arm-neon-asm.S @@ -1,2365 +1,2393 @@ -/* - * Copyright © 2009 Nokia Corporation - * - 
* Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - * - * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) - */ - -/* - * This file contains implementations of NEON optimized pixel processing - * functions. There is no full and detailed tutorial, but some functions - * (those which are exposing some new or interesting features) are - * extensively commented and can be used as examples. - * - * You may want to have a look at the comments for following functions: - * - pixman_composite_over_8888_0565_asm_neon - * - pixman_composite_over_n_8_0565_asm_neon - */ - -/* Prevent the stack from becoming executable for no reason... */ -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - - .text - .fpu neon - .arch armv7a - .object_arch armv4 - .eabi_attribute 10, 0 /* suppress Tag_FP_arch */ - .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */ - .arm - .altmacro - -#include "pixman-arm-neon-asm.h" - -/* Global configuration options and preferences */ - -/* - * The code can optionally make use of unaligned memory accesses to improve - * performance of handling leading/trailing pixels for each scanline. - * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for - * example in linux if unaligned memory accesses are not configured to - * generate.exceptions. - */ -.set RESPECT_STRICT_ALIGNMENT, 1 - -/* - * Set default prefetch type. There is a choice between the following options: - * - * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work - * as NOP to workaround some HW bugs or for whatever other reason) - * - * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where - * advanced prefetch intruduces heavy overhead) - * - * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8 - * which can run ARM and NEON instructions simultaneously so that extra ARM - * instructions do not add (many) extra cycles, but improve prefetch efficiency) - * - * Note: some types of function can't support advanced prefetch and fallback - * to simple one (those which handle 24bpp pixels) - */ -.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED - -/* Prefetch distance in pixels for simple prefetch */ -.set PREFETCH_DISTANCE_SIMPLE, 64 - -/* - * Implementation of pixman_composite_over_8888_0565_asm_neon - * - * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and - * performs OVER compositing operation. 
Function fast_composite_over_8888_0565 - * from pixman-fast-path.c does the same in C and can be used as a reference. - * - * First we need to have some NEON assembly code which can do the actual - * operation on the pixels and provide it to the template macro. - * - * Template macro quite conveniently takes care of emitting all the necessary - * code for memory reading and writing (including quite tricky cases of - * handling unaligned leading/trailing pixels), so we only need to deal with - * the data in NEON registers. - * - * NEON registers allocation in general is recommented to be the following: - * d0, d1, d2, d3 - contain loaded source pixel data - * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed) - * d24, d25, d26, d27 - contain loading mask pixel data (if mask is used) - * d28, d29, d30, d31 - place for storing the result (destination pixels) - * - * As can be seen above, four 64-bit NEON registers are used for keeping - * intermediate pixel data and up to 8 pixels can be processed in one step - * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp). - * - * This particular function uses the following registers allocation: - * d0, d1, d2, d3 - contain loaded source pixel data - * d4, d5 - contain loaded destination pixels (they are needed) - * d28, d29 - place for storing the result (destination pixels) - */ - -/* - * Step one. We need to have some code to do some arithmetics on pixel data. - * This is implemented as a pair of macros: '*_head' and '*_tail'. When used - * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5}, - * perform all the needed calculations and write the result to {d28, d29}. - * The rationale for having two macros and not just one will be explained - * later. In practice, any single monolitic function which does the work can - * be split into two parts in any arbitrary way without affecting correctness. - * - * There is one special trick here too. Common template macro can optionally - * make our life a bit easier by doing R, G, B, A color components - * deinterleaving for 32bpp pixel formats (and this feature is used in - * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that - * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we - * actually use d0 register for blue channel (a vector of eight 8-bit - * values), d1 register for green, d2 for red and d3 for alpha. This - * simple conversion can be also done with a few NEON instructions: - * - * Packed to planar conversion: - * vuzp.8 d0, d1 - * vuzp.8 d2, d3 - * vuzp.8 d1, d3 - * vuzp.8 d0, d2 - * - * Planar to packed conversion: - * vzip.8 d0, d2 - * vzip.8 d1, d3 - * vzip.8 d2, d3 - * vzip.8 d0, d1 - * - * But pixel can be loaded directly in planar format using VLD4.8 NEON - * instruction. It is 1 cycle slower than VLD1.32, so this is not always - * desirable, that's why deinterleaving is optional. 
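- *
- * Concretely, after deinterleaving, eight a8r8g8b8 pixels end up with
- * one color channel per register, each register holding a vector of
- * eight 8-bit values: d0 = {B0..B7}, d1 = {G0..G7}, d2 = {R0..R7}
- * and d3 = {A0..A7}.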
- * - * But anyway, here is the code: - */ -.macro pixman_composite_over_8888_0565_process_pixblock_head - /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format - and put data into d6 - red, d7 - green, d30 - blue */ - vshrn.u16 d6, q2, #8 - vshrn.u16 d7, q2, #3 - vsli.u16 q2, q2, #5 - vsri.u8 d6, d6, #5 - vmvn.8 d3, d3 /* invert source alpha */ - vsri.u8 d7, d7, #6 - vshrn.u16 d30, q2, #2 - /* now do alpha blending, storing results in 8-bit planar format - into d16 - red, d19 - green, d18 - blue */ - vmull.u8 q10, d3, d6 - vmull.u8 q11, d3, d7 - vmull.u8 q12, d3, d30 - vrshr.u16 q13, q10, #8 - vrshr.u16 q3, q11, #8 - vrshr.u16 q15, q12, #8 - vraddhn.u16 d20, q10, q13 - vraddhn.u16 d23, q11, q3 - vraddhn.u16 d22, q12, q15 -.endm - -.macro pixman_composite_over_8888_0565_process_pixblock_tail - /* ... continue alpha blending */ - vqadd.u8 d16, d2, d20 - vqadd.u8 q9, q0, q11 - /* convert the result to r5g6b5 and store it into {d28, d29} */ - vshll.u8 q14, d16, #8 - vshll.u8 q8, d19, #8 - vshll.u8 q9, d18, #8 - vsri.u16 q14, q8, #5 - vsri.u16 q14, q9, #11 -.endm - -/* - * OK, now we got almost everything that we need. Using the above two - * macros, the work can be done right. But now we want to optimize - * it a bit. ARM Cortex-A8 is an in-order core, and benefits really - * a lot from good code scheduling and software pipelining. - * - * Let's construct some code, which will run in the core main loop. - * Some pseudo-code of the main loop will look like this: - * head - * while (...) { - * tail - * head - * } - * tail - * - * It may look a bit weird, but this setup allows to hide instruction - * latencies better and also utilize dual-issue capability more - * efficiently (make pairs of load-store and ALU instructions). - * - * So what we need now is a '*_tail_head' macro, which will be used - * in the core main loop. A trivial straightforward implementation - * of this macro would look like this: - * - * pixman_composite_over_8888_0565_process_pixblock_tail - * vst1.16 {d28, d29}, [DST_W, :128]! - * vld1.16 {d4, d5}, [DST_R, :128]! - * vld4.32 {d0, d1, d2, d3}, [SRC]! - * pixman_composite_over_8888_0565_process_pixblock_head - * cache_preload 8, 8 - * - * Now it also got some VLD/VST instructions. We simply can't move from - * processing one block of pixels to the other one with just arithmetics. - * The previously processed data needs to be written to memory and new - * data needs to be fetched. Fortunately, this main loop does not deal - * with partial leading/trailing pixels and can load/store a full block - * of pixels in a bulk. Additionally, destination buffer is already - * 16 bytes aligned here (which is good for performance). - * - * New things here are DST_R, DST_W, SRC and MASK identifiers. These - * are the aliases for ARM registers which are used as pointers for - * accessing data. We maintain separate pointers for reading and writing - * destination buffer (DST_R and DST_W). - * - * Another new thing is 'cache_preload' macro. It is used for prefetching - * data into CPU L2 cache and improve performance when dealing with large - * images which are far larger than cache size. It uses one argument - * (actually two, but they need to be the same here) - number of pixels - * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some - * details about this macro. Moreover, if good performance is needed - * the code from this macro needs to be copied into '*_tail_head' macro - * and mixed with the rest of code for optimal instructions scheduling. 
- * We are actually doing it below. - * - * Now after all the explanations, here is the optimized code. - * Different instruction streams (originaling from '*_head', '*_tail' - * and 'cache_preload' macro) use different indentation levels for - * better readability. Actually taking the code from one of these - * indentation levels and ignoring a few VLD/VST instructions would - * result in exactly the code from '*_head', '*_tail' or 'cache_preload' - * macro! - */ - -#if 1 - -.macro pixman_composite_over_8888_0565_process_pixblock_tail_head - vqadd.u8 d16, d2, d20 - vld1.16 {d4, d5}, [DST_R, :128]! - vqadd.u8 q9, q0, q11 - vshrn.u16 d6, q2, #8 - fetch_src_pixblock - vshrn.u16 d7, q2, #3 - vsli.u16 q2, q2, #5 - vshll.u8 q14, d16, #8 - PF add PF_X, PF_X, #8 - vshll.u8 q8, d19, #8 - PF tst PF_CTL, #0xF - vsri.u8 d6, d6, #5 - PF addne PF_X, PF_X, #8 - vmvn.8 d3, d3 - PF subne PF_CTL, PF_CTL, #1 - vsri.u8 d7, d7, #6 - vshrn.u16 d30, q2, #2 - vmull.u8 q10, d3, d6 - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - vmull.u8 q11, d3, d7 - vmull.u8 q12, d3, d30 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vsri.u16 q14, q8, #5 - PF cmp PF_X, ORIG_W - vshll.u8 q9, d18, #8 - vrshr.u16 q13, q10, #8 - PF subge PF_X, PF_X, ORIG_W - vrshr.u16 q3, q11, #8 - vrshr.u16 q15, q12, #8 - PF subges PF_CTL, PF_CTL, #0x10 - vsri.u16 q14, q9, #11 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - vraddhn.u16 d20, q10, q13 - vraddhn.u16 d23, q11, q3 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vraddhn.u16 d22, q12, q15 - vst1.16 {d28, d29}, [DST_W, :128]! -.endm - -#else - -/* If we did not care much about the performance, we would just use this... */ -.macro pixman_composite_over_8888_0565_process_pixblock_tail_head - pixman_composite_over_8888_0565_process_pixblock_tail - vst1.16 {d28, d29}, [DST_W, :128]! - vld1.16 {d4, d5}, [DST_R, :128]! - fetch_src_pixblock - pixman_composite_over_8888_0565_process_pixblock_head - cache_preload 8, 8 -.endm - -#endif - -/* - * And now the final part. We are using 'generate_composite_function' macro - * to put all the stuff together. We are specifying the name of the function - * which we want to get, number of bits per pixel for the source, mask and - * destination (0 if unused, like mask in this case). Next come some bit - * flags: - * FLAG_DST_READWRITE - tells that the destination buffer is both read - * and written, for write-only buffer we would use - * FLAG_DST_WRITEONLY flag instead - * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data - * and separate color channels for 32bpp format. - * The next things are: - * - the number of pixels processed per iteration (8 in this case, because - * that's the maximum what can fit into four 64-bit NEON registers). - * - prefetch distance, measured in pixel blocks. In this case it is 5 times - * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal - * prefetch distance can be selected by running some benchmarks. - * - * After that we specify some macros, these are 'default_init', - * 'default_cleanup' here which are empty (but it is possible to have custom - * init/cleanup macros to be able to save/restore some extra NEON registers - * like d8-d15 or do anything else) followed by - * 'pixman_composite_over_8888_0565_process_pixblock_head', - * 'pixman_composite_over_8888_0565_process_pixblock_tail' and - * 'pixman_composite_over_8888_0565_process_pixblock_tail_head' - * which we got implemented above. - * - * The last part is the NEON registers allocation scheme. 
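- *
- * (On the C side, the pixman_composite_*_asm_neon entry points that
- * this macro emits are wrapped by the PIXMAN_ARM_BIND_* macros from
- * pixman-arm-common.h -- see the hunk of that header earlier in this
- * patch -- and registered in the fast path tables of
- * pixman-arm-neon.c.)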
- */ -generate_composite_function \ - pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_over_8888_0565_process_pixblock_head, \ - pixman_composite_over_8888_0565_process_pixblock_tail, \ - pixman_composite_over_8888_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 24 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_over_n_0565_process_pixblock_head - /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format - and put data into d6 - red, d7 - green, d30 - blue */ - vshrn.u16 d6, q2, #8 - vshrn.u16 d7, q2, #3 - vsli.u16 q2, q2, #5 - vsri.u8 d6, d6, #5 - vsri.u8 d7, d7, #6 - vshrn.u16 d30, q2, #2 - /* now do alpha blending, storing results in 8-bit planar format - into d16 - red, d19 - green, d18 - blue */ - vmull.u8 q10, d3, d6 - vmull.u8 q11, d3, d7 - vmull.u8 q12, d3, d30 - vrshr.u16 q13, q10, #8 - vrshr.u16 q3, q11, #8 - vrshr.u16 q15, q12, #8 - vraddhn.u16 d20, q10, q13 - vraddhn.u16 d23, q11, q3 - vraddhn.u16 d22, q12, q15 -.endm - -.macro pixman_composite_over_n_0565_process_pixblock_tail - /* ... continue alpha blending */ - vqadd.u8 d16, d2, d20 - vqadd.u8 q9, q0, q11 - /* convert the result to r5g6b5 and store it into {d28, d29} */ - vshll.u8 q14, d16, #8 - vshll.u8 q8, d19, #8 - vshll.u8 q9, d18, #8 - vsri.u16 q14, q8, #5 - vsri.u16 q14, q9, #11 -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_n_0565_process_pixblock_tail_head - pixman_composite_over_n_0565_process_pixblock_tail - vld1.16 {d4, d5}, [DST_R, :128]! - vst1.16 {d28, d29}, [DST_W, :128]! - pixman_composite_over_n_0565_process_pixblock_head - cache_preload 8, 8 -.endm - -.macro pixman_composite_over_n_0565_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d3[0]}, [DUMMY] - vdup.8 d0, d3[0] - vdup.8 d1, d3[1] - vdup.8 d2, d3[2] - vdup.8 d3, d3[3] - vmvn.8 d3, d3 /* invert source alpha */ -.endm - -generate_composite_function \ - pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \ - FLAG_DST_READWRITE, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_n_0565_init, \ - default_cleanup, \ - pixman_composite_over_n_0565_process_pixblock_head, \ - pixman_composite_over_n_0565_process_pixblock_tail, \ - pixman_composite_over_n_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 24 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_8888_0565_process_pixblock_head - vshll.u8 q8, d1, #8 - vshll.u8 q14, d2, #8 - vshll.u8 q9, d0, #8 -.endm - -.macro pixman_composite_src_8888_0565_process_pixblock_tail - vsri.u16 q14, q8, #5 - vsri.u16 q14, q9, #11 -.endm - -.macro pixman_composite_src_8888_0565_process_pixblock_tail_head - vsri.u16 q14, q8, #5 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF - fetch_src_pixblock - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vsri.u16 q14, q9, #11 - PF cmp PF_X, ORIG_W - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - vshll.u8 q8, d1, #8 - vst1.16 {d28, d29}, [DST_W, :128]! 
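-    /* PF: once PF_X has passed ORIG_W (compared above), wrap it and
-     * advance the prefetch pointer to the next scanline */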
- PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 - vshll.u8 q14, d2, #8 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - vshll.u8 q9, d0, #8 -.endm - -generate_composite_function \ - pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \ - FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_8888_0565_process_pixblock_head, \ - pixman_composite_src_8888_0565_process_pixblock_tail, \ - pixman_composite_src_8888_0565_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_src_0565_8888_process_pixblock_head - vshrn.u16 d30, q0, #8 - vshrn.u16 d29, q0, #3 - vsli.u16 q0, q0, #5 - vmov.u8 d31, #255 - vsri.u8 d30, d30, #5 - vsri.u8 d29, d29, #6 - vshrn.u16 d28, q0, #2 -.endm - -.macro pixman_composite_src_0565_8888_process_pixblock_tail -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_src_0565_8888_process_pixblock_tail_head - pixman_composite_src_0565_8888_process_pixblock_tail - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - fetch_src_pixblock - pixman_composite_src_0565_8888_process_pixblock_head - cache_preload 8, 8 -.endm - -generate_composite_function \ - pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \ - FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_0565_8888_process_pixblock_head, \ - pixman_composite_src_0565_8888_process_pixblock_tail, \ - pixman_composite_src_0565_8888_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_add_8_8_process_pixblock_head - vqadd.u8 q14, q0, q2 - vqadd.u8 q15, q1, q3 -.endm - -.macro pixman_composite_add_8_8_process_pixblock_tail -.endm - -.macro pixman_composite_add_8_8_process_pixblock_tail_head - fetch_src_pixblock - PF add PF_X, PF_X, #32 - PF tst PF_CTL, #0xF - vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! - PF addne PF_X, PF_X, #32 - PF subne PF_CTL, PF_CTL, #1 - vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF cmp PF_X, ORIG_W - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 - vqadd.u8 q14, q0, q2 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vqadd.u8 q15, q1, q3 -.endm - -generate_composite_function \ - pixman_composite_add_8_8_asm_neon, 8, 0, 8, \ - FLAG_DST_READWRITE, \ - 32, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_add_8_8_process_pixblock_head, \ - pixman_composite_add_8_8_process_pixblock_tail, \ - pixman_composite_add_8_8_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_add_8888_8888_process_pixblock_tail_head - fetch_src_pixblock - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF - vld1.32 {d4, d5, d6, d7}, [DST_R, :128]! - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vst1.32 {d28, d29, d30, d31}, [DST_W, :128]! 
- PF cmp PF_X, ORIG_W - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 - vqadd.u8 q14, q0, q2 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vqadd.u8 q15, q1, q3 -.endm - -generate_composite_function \ - pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_add_8_8_process_pixblock_head, \ - pixman_composite_add_8_8_process_pixblock_tail, \ - pixman_composite_add_8888_8888_process_pixblock_tail_head - -generate_composite_function_single_scanline \ - pixman_composite_scanline_add_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE, \ - 8, /* number of pixels, processed in a single block */ \ - default_init, \ - default_cleanup, \ - pixman_composite_add_8_8_process_pixblock_head, \ - pixman_composite_add_8_8_process_pixblock_tail, \ - pixman_composite_add_8888_8888_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head - vmvn.8 d24, d3 /* get inverted alpha */ - /* do alpha blending */ - vmull.u8 q8, d24, d4 - vmull.u8 q9, d24, d5 - vmull.u8 q10, d24, d6 - vmull.u8 q11, d24, d7 -.endm - -.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 -.endm - -.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - vrshr.u16 q14, q8, #8 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - PF cmp PF_X, ORIG_W - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 - fetch_src_pixblock - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - vmvn.8 d22, d3 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF subge PF_X, PF_X, ORIG_W - vmull.u8 q8, d22, d4 - PF subges PF_CTL, PF_CTL, #0x10 - vmull.u8 q9, d22, d5 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - vmull.u8 q10, d22, d6 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 
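-    /* the interleaved vmull.u8 instructions (ending below) are the
-     * 'head' stage of the next pixel block -- see the software
-     * pipelining notes at the top of this file */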
- vmull.u8 q11, d22, d7 -.endm - -generate_composite_function_single_scanline \ - pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init, \ - default_cleanup, \ - pixman_composite_out_reverse_8888_8888_process_pixblock_head, \ - pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \ - pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_over_8888_8888_process_pixblock_head - pixman_composite_out_reverse_8888_8888_process_pixblock_head -.endm - -.macro pixman_composite_over_8888_8888_process_pixblock_tail - pixman_composite_out_reverse_8888_8888_process_pixblock_tail - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 -.endm - -.macro pixman_composite_over_8888_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - vrshr.u16 q14, q8, #8 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - PF cmp PF_X, ORIG_W - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 - fetch_src_pixblock - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - vmvn.8 d22, d3 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF subge PF_X, PF_X, ORIG_W - vmull.u8 q8, d22, d4 - PF subges PF_CTL, PF_CTL, #0x10 - vmull.u8 q9, d22, d5 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - vmull.u8 q10, d22, d6 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vmull.u8 q11, d22, d7 -.endm - -generate_composite_function \ - pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_over_8888_8888_process_pixblock_head, \ - pixman_composite_over_8888_8888_process_pixblock_tail, \ - pixman_composite_over_8888_8888_process_pixblock_tail_head - -generate_composite_function_single_scanline \ - pixman_composite_scanline_over_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init, \ - default_cleanup, \ - pixman_composite_over_8888_8888_process_pixblock_head, \ - pixman_composite_over_8888_8888_process_pixblock_tail, \ - pixman_composite_over_8888_8888_process_pixblock_tail_head - -/******************************************************************************/ - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_n_8888_process_pixblock_tail_head - pixman_composite_over_8888_8888_process_pixblock_tail - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
- pixman_composite_over_8888_8888_process_pixblock_head - cache_preload 8, 8 -.endm - -.macro pixman_composite_over_n_8888_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d3[0]}, [DUMMY] - vdup.8 d0, d3[0] - vdup.8 d1, d3[1] - vdup.8 d2, d3[2] - vdup.8 d3, d3[3] -.endm - -generate_composite_function \ - pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_n_8888_init, \ - default_cleanup, \ - pixman_composite_over_8888_8888_process_pixblock_head, \ - pixman_composite_over_8888_8888_process_pixblock_tail, \ - pixman_composite_over_n_8888_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head - vrshr.u16 q14, q8, #8 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - PF cmp PF_X, ORIG_W - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 - vld4.8 {d0, d1, d2, d3}, [DST_R, :128]! - vmvn.8 d22, d3 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF subge PF_X, PF_X, ORIG_W - vmull.u8 q8, d22, d4 - PF subges PF_CTL, PF_CTL, #0x10 - vmull.u8 q9, d22, d5 - vmull.u8 q10, d22, d6 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vmull.u8 q11, d22, d7 -.endm - -.macro pixman_composite_over_reverse_n_8888_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d7[0]}, [DUMMY] - vdup.8 d4, d7[0] - vdup.8 d5, d7[1] - vdup.8 d6, d7[2] - vdup.8 d7, d7[3] -.endm - -generate_composite_function \ - pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_reverse_n_8888_init, \ - default_cleanup, \ - pixman_composite_over_8888_8888_process_pixblock_head, \ - pixman_composite_over_8888_8888_process_pixblock_tail, \ - pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 4, /* src_basereg */ \ - 24 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_over_8888_8_0565_process_pixblock_head - vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */ - vmull.u8 q1, d24, d9 - vmull.u8 q6, d24, d10 - vmull.u8 q7, d24, d11 - vshrn.u16 d6, q2, #8 /* convert DST_R data to 32-bpp (part1) */ - vshrn.u16 d7, q2, #3 - vsli.u16 q2, q2, #5 - vrshr.u16 q8, q0, #8 /* IN for SRC pixels (part2) */ - vrshr.u16 q9, q1, #8 - vrshr.u16 q10, q6, #8 - vrshr.u16 q11, q7, #8 - vraddhn.u16 d0, q0, q8 - vraddhn.u16 d1, q1, q9 - vraddhn.u16 d2, q6, q10 - vraddhn.u16 d3, q7, q11 - vsri.u8 d6, d6, #5 /* convert DST_R data to 32-bpp (part2) */ - vsri.u8 d7, d7, #6 - vmvn.8 d3, d3 - vshrn.u16 d30, q2, #2 - vmull.u8 q8, d3, d6 /* now do alpha blending */ - vmull.u8 q9, d3, d7 - vmull.u8 q10, d3, d30 -.endm - -.macro pixman_composite_over_8888_8_0565_process_pixblock_tail - /* 3 cycle bubble (after vmull.u8) */ - vrshr.u16 q13, q8, #8 - vrshr.u16 q11, q9, #8 - vrshr.u16 q15, q10, #8 - vraddhn.u16 d16, q8, q13 - vraddhn.u16 d27, q9, q11 - vraddhn.u16 d26, q10, q15 - 
vqadd.u8 d16, d2, d16 - /* 1 cycle bubble */ - vqadd.u8 q9, q0, q13 - vshll.u8 q14, d16, #8 /* convert to 16bpp */ - vshll.u8 q8, d19, #8 - vshll.u8 q9, d18, #8 - vsri.u16 q14, q8, #5 - /* 1 cycle bubble */ - vsri.u16 q14, q9, #11 -.endm - -.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head - vld1.16 {d4, d5}, [DST_R, :128]! - vshrn.u16 d6, q2, #8 - fetch_mask_pixblock - vshrn.u16 d7, q2, #3 - fetch_src_pixblock - vmull.u8 q6, d24, d10 - vrshr.u16 q13, q8, #8 - vrshr.u16 q11, q9, #8 - vrshr.u16 q15, q10, #8 - vraddhn.u16 d16, q8, q13 - vraddhn.u16 d27, q9, q11 - vraddhn.u16 d26, q10, q15 - vqadd.u8 d16, d2, d16 - vmull.u8 q1, d24, d9 - vqadd.u8 q9, q0, q13 - vshll.u8 q14, d16, #8 - vmull.u8 q0, d24, d8 - vshll.u8 q8, d19, #8 - vshll.u8 q9, d18, #8 - vsri.u16 q14, q8, #5 - vmull.u8 q7, d24, d11 - vsri.u16 q14, q9, #11 - - cache_preload 8, 8 - - vsli.u16 q2, q2, #5 - vrshr.u16 q8, q0, #8 - vrshr.u16 q9, q1, #8 - vrshr.u16 q10, q6, #8 - vrshr.u16 q11, q7, #8 - vraddhn.u16 d0, q0, q8 - vraddhn.u16 d1, q1, q9 - vraddhn.u16 d2, q6, q10 - vraddhn.u16 d3, q7, q11 - vsri.u8 d6, d6, #5 - vsri.u8 d7, d7, #6 - vmvn.8 d3, d3 - vshrn.u16 d30, q2, #2 - vst1.16 {d28, d29}, [DST_W, :128]! - vmull.u8 q8, d3, d6 - vmull.u8 q9, d3, d7 - vmull.u8 q10, d3, d30 -.endm - -generate_composite_function \ - pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_over_8888_8_0565_process_pixblock_head, \ - pixman_composite_over_8888_8_0565_process_pixblock_tail, \ - pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 8, /* src_basereg */ \ - 24 /* mask_basereg */ - -/******************************************************************************/ - -/* - * This function needs a special initialization of solid mask. - * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET - * offset, split into color components and replicated in d8-d11 - * registers. Additionally, this function needs all the NEON registers, - * so it has to save d8-d15 registers which are callee saved according - * to ABI. These registers are restored from 'cleanup' macro. All the - * other NEON registers are caller saved, so can be clobbered freely - * without introducing any problems. 
- */ -.macro pixman_composite_over_n_8_0565_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vpush {d8-d15} - vld1.32 {d11[0]}, [DUMMY] - vdup.8 d8, d11[0] - vdup.8 d9, d11[1] - vdup.8 d10, d11[2] - vdup.8 d11, d11[3] -.endm - -.macro pixman_composite_over_n_8_0565_cleanup - vpop {d8-d15} -.endm - -generate_composite_function \ - pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \ - FLAG_DST_READWRITE, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_n_8_0565_init, \ - pixman_composite_over_n_8_0565_cleanup, \ - pixman_composite_over_8888_8_0565_process_pixblock_head, \ - pixman_composite_over_8888_8_0565_process_pixblock_tail, \ - pixman_composite_over_8888_8_0565_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_over_8888_n_0565_init - add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) - vpush {d8-d15} - vld1.32 {d24[0]}, [DUMMY] - vdup.8 d24, d24[3] -.endm - -.macro pixman_composite_over_8888_n_0565_cleanup - vpop {d8-d15} -.endm - -generate_composite_function \ - pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_8888_n_0565_init, \ - pixman_composite_over_8888_n_0565_cleanup, \ - pixman_composite_over_8888_8_0565_process_pixblock_head, \ - pixman_composite_over_8888_8_0565_process_pixblock_tail, \ - pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 8, /* src_basereg */ \ - 24 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_0565_0565_process_pixblock_head -.endm - -.macro pixman_composite_src_0565_0565_process_pixblock_tail -.endm - -.macro pixman_composite_src_0565_0565_process_pixblock_tail_head - vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! - fetch_src_pixblock - cache_preload 16, 16 -.endm - -generate_composite_function \ - pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \ - FLAG_DST_WRITEONLY, \ - 16, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_0565_0565_process_pixblock_head, \ - pixman_composite_src_0565_0565_process_pixblock_tail, \ - pixman_composite_src_0565_0565_process_pixblock_tail_head, \ - 0, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_n_8_process_pixblock_head -.endm - -.macro pixman_composite_src_n_8_process_pixblock_tail -.endm - -.macro pixman_composite_src_n_8_process_pixblock_tail_head - vst1.8 {d0, d1, d2, d3}, [DST_W, :128]! 
-.endm - -.macro pixman_composite_src_n_8_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d0[0]}, [DUMMY] - vsli.u64 d0, d0, #8 - vsli.u64 d0, d0, #16 - vsli.u64 d0, d0, #32 - vorr d1, d0, d0 - vorr q1, q0, q0 -.endm - -.macro pixman_composite_src_n_8_cleanup -.endm - -generate_composite_function \ - pixman_composite_src_n_8_asm_neon, 0, 0, 8, \ - FLAG_DST_WRITEONLY, \ - 32, /* number of pixels, processed in a single block */ \ - 0, /* prefetch distance */ \ - pixman_composite_src_n_8_init, \ - pixman_composite_src_n_8_cleanup, \ - pixman_composite_src_n_8_process_pixblock_head, \ - pixman_composite_src_n_8_process_pixblock_tail, \ - pixman_composite_src_n_8_process_pixblock_tail_head, \ - 0, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_n_0565_process_pixblock_head -.endm - -.macro pixman_composite_src_n_0565_process_pixblock_tail -.endm - -.macro pixman_composite_src_n_0565_process_pixblock_tail_head - vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! -.endm - -.macro pixman_composite_src_n_0565_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d0[0]}, [DUMMY] - vsli.u64 d0, d0, #16 - vsli.u64 d0, d0, #32 - vorr d1, d0, d0 - vorr q1, q0, q0 -.endm - -.macro pixman_composite_src_n_0565_cleanup -.endm - -generate_composite_function \ - pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \ - FLAG_DST_WRITEONLY, \ - 16, /* number of pixels, processed in a single block */ \ - 0, /* prefetch distance */ \ - pixman_composite_src_n_0565_init, \ - pixman_composite_src_n_0565_cleanup, \ - pixman_composite_src_n_0565_process_pixblock_head, \ - pixman_composite_src_n_0565_process_pixblock_tail, \ - pixman_composite_src_n_0565_process_pixblock_tail_head, \ - 0, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_n_8888_process_pixblock_head -.endm - -.macro pixman_composite_src_n_8888_process_pixblock_tail -.endm - -.macro pixman_composite_src_n_8888_process_pixblock_tail_head - vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! -.endm - -.macro pixman_composite_src_n_8888_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d0[0]}, [DUMMY] - vsli.u64 d0, d0, #32 - vorr d1, d0, d0 - vorr q1, q0, q0 -.endm - -.macro pixman_composite_src_n_8888_cleanup -.endm - -generate_composite_function \ - pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \ - FLAG_DST_WRITEONLY, \ - 8, /* number of pixels, processed in a single block */ \ - 0, /* prefetch distance */ \ - pixman_composite_src_n_8888_init, \ - pixman_composite_src_n_8888_cleanup, \ - pixman_composite_src_n_8888_process_pixblock_head, \ - pixman_composite_src_n_8888_process_pixblock_tail, \ - pixman_composite_src_n_8888_process_pixblock_tail_head, \ - 0, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_8888_8888_process_pixblock_head -.endm - -.macro pixman_composite_src_8888_8888_process_pixblock_tail -.endm - -.macro pixman_composite_src_8888_8888_process_pixblock_tail_head - vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! 
- fetch_src_pixblock - cache_preload 8, 8 -.endm - -generate_composite_function \ - pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_WRITEONLY, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_8888_8888_process_pixblock_head, \ - pixman_composite_src_8888_8888_process_pixblock_tail, \ - pixman_composite_src_8888_8888_process_pixblock_tail_head, \ - 0, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_x888_8888_process_pixblock_head - vorr q0, q0, q2 - vorr q1, q1, q2 -.endm - -.macro pixman_composite_src_x888_8888_process_pixblock_tail -.endm - -.macro pixman_composite_src_x888_8888_process_pixblock_tail_head - vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! - fetch_src_pixblock - vorr q0, q0, q2 - vorr q1, q1, q2 - cache_preload 8, 8 -.endm - -.macro pixman_composite_src_x888_8888_init - vmov.u8 q2, #0xFF - vshl.u32 q2, q2, #24 -.endm - -generate_composite_function \ - pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_WRITEONLY, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - pixman_composite_src_x888_8888_init, \ - default_cleanup, \ - pixman_composite_src_x888_8888_process_pixblock_head, \ - pixman_composite_src_x888_8888_process_pixblock_tail, \ - pixman_composite_src_x888_8888_process_pixblock_tail_head, \ - 0, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_over_n_8_8888_process_pixblock_head - /* expecting deinterleaved source data in {d8, d9, d10, d11} */ - /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ - /* and destination data in {d4, d5, d6, d7} */ - /* mask is in d24 (d25, d26, d27 are unused) */ - - /* in */ - vmull.u8 q0, d24, d8 - vmull.u8 q1, d24, d9 - vmull.u8 q6, d24, d10 - vmull.u8 q7, d24, d11 - vrshr.u16 q10, q0, #8 - vrshr.u16 q11, q1, #8 - vrshr.u16 q12, q6, #8 - vrshr.u16 q13, q7, #8 - vraddhn.u16 d0, q0, q10 - vraddhn.u16 d1, q1, q11 - vraddhn.u16 d2, q6, q12 - vraddhn.u16 d3, q7, q13 - vmvn.8 d24, d3 /* get inverted alpha */ - /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */ - /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */ - /* now do alpha blending */ - vmull.u8 q8, d24, d4 - vmull.u8 q9, d24, d5 - vmull.u8 q10, d24, d6 - vmull.u8 q11, d24, d7 -.endm - -.macro pixman_composite_over_n_8_8888_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head - pixman_composite_over_n_8_8888_process_pixblock_tail - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 
- fetch_mask_pixblock - cache_preload 8, 8 - pixman_composite_over_n_8_8888_process_pixblock_head -.endm - -.macro pixman_composite_over_n_8_8888_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vpush {d8-d15} - vld1.32 {d11[0]}, [DUMMY] - vdup.8 d8, d11[0] - vdup.8 d9, d11[1] - vdup.8 d10, d11[2] - vdup.8 d11, d11[3] -.endm - -.macro pixman_composite_over_n_8_8888_cleanup - vpop {d8-d15} -.endm - -generate_composite_function \ - pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_n_8_8888_init, \ - pixman_composite_over_n_8_8888_cleanup, \ - pixman_composite_over_n_8_8888_process_pixblock_head, \ - pixman_composite_over_n_8_8888_process_pixblock_tail, \ - pixman_composite_over_n_8_8888_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_over_n_8_8_process_pixblock_head - vmull.u8 q0, d24, d8 - vmull.u8 q1, d25, d8 - vmull.u8 q6, d26, d8 - vmull.u8 q7, d27, d8 - vrshr.u16 q10, q0, #8 - vrshr.u16 q11, q1, #8 - vrshr.u16 q12, q6, #8 - vrshr.u16 q13, q7, #8 - vraddhn.u16 d0, q0, q10 - vraddhn.u16 d1, q1, q11 - vraddhn.u16 d2, q6, q12 - vraddhn.u16 d3, q7, q13 - vmvn.8 q12, q0 - vmvn.8 q13, q1 - vmull.u8 q8, d24, d4 - vmull.u8 q9, d25, d5 - vmull.u8 q10, d26, d6 - vmull.u8 q11, d27, d7 -.endm - -.macro pixman_composite_over_n_8_8_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_n_8_8_process_pixblock_tail_head - vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! - pixman_composite_over_n_8_8_process_pixblock_tail - fetch_mask_pixblock - cache_preload 32, 32 - vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! 
- pixman_composite_over_n_8_8_process_pixblock_head -.endm - -.macro pixman_composite_over_n_8_8_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vpush {d8-d15} - vld1.32 {d8[0]}, [DUMMY] - vdup.8 d8, d8[3] -.endm - -.macro pixman_composite_over_n_8_8_cleanup - vpop {d8-d15} -.endm - -generate_composite_function \ - pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \ - FLAG_DST_READWRITE, \ - 32, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_n_8_8_init, \ - pixman_composite_over_n_8_8_cleanup, \ - pixman_composite_over_n_8_8_process_pixblock_head, \ - pixman_composite_over_n_8_8_process_pixblock_tail, \ - pixman_composite_over_n_8_8_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head - /* - * 'combine_mask_ca' replacement - * - * input: solid src (n) in {d8, d9, d10, d11} - * dest in {d4, d5, d6, d7 } - * mask in {d24, d25, d26, d27} - * output: updated src in {d0, d1, d2, d3 } - * updated mask in {d24, d25, d26, d3 } - */ - vmull.u8 q0, d24, d8 - vmull.u8 q1, d25, d9 - vmull.u8 q6, d26, d10 - vmull.u8 q7, d27, d11 - vmull.u8 q9, d11, d25 - vmull.u8 q12, d11, d24 - vmull.u8 q13, d11, d26 - vrshr.u16 q8, q0, #8 - vrshr.u16 q10, q1, #8 - vrshr.u16 q11, q6, #8 - vraddhn.u16 d0, q0, q8 - vraddhn.u16 d1, q1, q10 - vraddhn.u16 d2, q6, q11 - vrshr.u16 q11, q12, #8 - vrshr.u16 q8, q9, #8 - vrshr.u16 q6, q13, #8 - vrshr.u16 q10, q7, #8 - vraddhn.u16 d24, q12, q11 - vraddhn.u16 d25, q9, q8 - vraddhn.u16 d26, q13, q6 - vraddhn.u16 d3, q7, q10 - /* - * 'combine_over_ca' replacement - * - * output: updated dest in {d28, d29, d30, d31} - */ - vmvn.8 d24, d24 - vmvn.8 d25, d25 - vmull.u8 q8, d24, d4 - vmull.u8 q9, d25, d5 - vmvn.8 d26, d26 - vmvn.8 d27, d3 - vmull.u8 q10, d26, d6 - vmull.u8 q11, d27, d7 -.endm - -.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail - /* ... continue 'combine_over_ca' replacement */ - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q6, q10, #8 - vrshr.u16 q7, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q6, q10 - vraddhn.u16 d31, q7, q11 - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 -.endm - -.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - vrshr.u16 q6, q10, #8 - vrshr.u16 q7, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q6, q10 - vraddhn.u16 d31, q7, q11 - fetch_mask_pixblock - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 - cache_preload 8, 8 - pixman_composite_over_n_8888_8888_ca_process_pixblock_head - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
-.endm - -.macro pixman_composite_over_n_8888_8888_ca_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vpush {d8-d15} - vld1.32 {d11[0]}, [DUMMY] - vdup.8 d8, d11[0] - vdup.8 d9, d11[1] - vdup.8 d10, d11[2] - vdup.8 d11, d11[3] -.endm - -.macro pixman_composite_over_n_8888_8888_ca_cleanup - vpop {d8-d15} -.endm - -generate_composite_function \ - pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_n_8888_8888_ca_init, \ - pixman_composite_over_n_8888_8888_ca_cleanup, \ - pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \ - pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \ - pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_in_n_8_process_pixblock_head - /* expecting source data in {d0, d1, d2, d3} */ - /* and destination data in {d4, d5, d6, d7} */ - vmull.u8 q8, d4, d3 - vmull.u8 q9, d5, d3 - vmull.u8 q10, d6, d3 - vmull.u8 q11, d7, d3 -.endm - -.macro pixman_composite_in_n_8_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - vraddhn.u16 d28, q8, q14 - vraddhn.u16 d29, q9, q15 - vraddhn.u16 d30, q10, q12 - vraddhn.u16 d31, q11, q13 -.endm - -.macro pixman_composite_in_n_8_process_pixblock_tail_head - pixman_composite_in_n_8_process_pixblock_tail - vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! - cache_preload 32, 32 - pixman_composite_in_n_8_process_pixblock_head - vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! -.endm - -.macro pixman_composite_in_n_8_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d3[0]}, [DUMMY] - vdup.8 d3, d3[3] -.endm - -.macro pixman_composite_in_n_8_cleanup -.endm - -generate_composite_function \ - pixman_composite_in_n_8_asm_neon, 0, 0, 8, \ - FLAG_DST_READWRITE, \ - 32, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_in_n_8_init, \ - pixman_composite_in_n_8_cleanup, \ - pixman_composite_in_n_8_process_pixblock_head, \ - pixman_composite_in_n_8_process_pixblock_tail, \ - pixman_composite_in_n_8_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 24 /* mask_basereg */ - -.macro pixman_composite_add_n_8_8_process_pixblock_head - /* expecting source data in {d8, d9, d10, d11} */ - /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ - /* and destination data in {d4, d5, d6, d7} */ - /* mask is in d24, d25, d26, d27 */ - vmull.u8 q0, d24, d11 - vmull.u8 q1, d25, d11 - vmull.u8 q6, d26, d11 - vmull.u8 q7, d27, d11 - vrshr.u16 q10, q0, #8 - vrshr.u16 q11, q1, #8 - vrshr.u16 q12, q6, #8 - vrshr.u16 q13, q7, #8 - vraddhn.u16 d0, q0, q10 - vraddhn.u16 d1, q1, q11 - vraddhn.u16 d2, q6, q12 - vraddhn.u16 d3, q7, q13 - vqadd.u8 q14, q0, q2 - vqadd.u8 q15, q1, q3 -.endm - -.macro pixman_composite_add_n_8_8_process_pixblock_tail -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_add_n_8_8_process_pixblock_tail_head - pixman_composite_add_n_8_8_process_pixblock_tail - vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! - vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! 
- fetch_mask_pixblock - cache_preload 32, 32 - pixman_composite_add_n_8_8_process_pixblock_head -.endm - -.macro pixman_composite_add_n_8_8_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vpush {d8-d15} - vld1.32 {d11[0]}, [DUMMY] - vdup.8 d11, d11[3] -.endm - -.macro pixman_composite_add_n_8_8_cleanup - vpop {d8-d15} -.endm - -generate_composite_function \ - pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \ - FLAG_DST_READWRITE, \ - 32, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_add_n_8_8_init, \ - pixman_composite_add_n_8_8_cleanup, \ - pixman_composite_add_n_8_8_process_pixblock_head, \ - pixman_composite_add_n_8_8_process_pixblock_tail, \ - pixman_composite_add_n_8_8_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_add_8_8_8_process_pixblock_head - /* expecting source data in {d0, d1, d2, d3} */ - /* destination data in {d4, d5, d6, d7} */ - /* mask in {d24, d25, d26, d27} */ - vmull.u8 q8, d24, d0 - vmull.u8 q9, d25, d1 - vmull.u8 q10, d26, d2 - vmull.u8 q11, d27, d3 - vrshr.u16 q0, q8, #8 - vrshr.u16 q1, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - vraddhn.u16 d0, q0, q8 - vraddhn.u16 d1, q1, q9 - vraddhn.u16 d2, q12, q10 - vraddhn.u16 d3, q13, q11 - vqadd.u8 q14, q0, q2 - vqadd.u8 q15, q1, q3 -.endm - -.macro pixman_composite_add_8_8_8_process_pixblock_tail -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_add_8_8_8_process_pixblock_tail_head - pixman_composite_add_8_8_8_process_pixblock_tail - vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! - vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! - fetch_mask_pixblock - fetch_src_pixblock - cache_preload 32, 32 - pixman_composite_add_8_8_8_process_pixblock_head -.endm - -.macro pixman_composite_add_8_8_8_init -.endm - -.macro pixman_composite_add_8_8_8_cleanup -.endm - -generate_composite_function \ - pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \ - FLAG_DST_READWRITE, \ - 32, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_add_8_8_8_init, \ - pixman_composite_add_8_8_8_cleanup, \ - pixman_composite_add_8_8_8_process_pixblock_head, \ - pixman_composite_add_8_8_8_process_pixblock_tail, \ - pixman_composite_add_8_8_8_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_add_8888_8888_8888_process_pixblock_head - /* expecting source data in {d0, d1, d2, d3} */ - /* destination data in {d4, d5, d6, d7} */ - /* mask in {d24, d25, d26, d27} */ - vmull.u8 q8, d27, d0 - vmull.u8 q9, d27, d1 - vmull.u8 q10, d27, d2 - vmull.u8 q11, d27, d3 - /* 1 cycle bubble */ - vrsra.u16 q8, q8, #8 - vrsra.u16 q9, q9, #8 - vrsra.u16 q10, q10, #8 - vrsra.u16 q11, q11, #8 -.endm - -.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail - /* 2 cycle bubble */ - vrshrn.u16 d28, q8, #8 - vrshrn.u16 d29, q9, #8 - vrshrn.u16 d30, q10, #8 - vrshrn.u16 d31, q11, #8 - vqadd.u8 q14, q2, q14 - /* 1 cycle bubble */ - vqadd.u8 q15, q3, q15 -.endm - -.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head - fetch_src_pixblock - vrshrn.u16 d28, q8, #8 - fetch_mask_pixblock - vrshrn.u16 d29, q9, #8 - vmull.u8 q8, d27, d0 - vrshrn.u16 d30, q10, #8 - vmull.u8 q9, d27, d1 - vrshrn.u16 d31, q11, #8 - vmull.u8 q10, d27, d2 - vqadd.u8 q14, q2, q14 - vmull.u8 q11, d27, d3 - vqadd.u8 q15, q3, q15 - vrsra.u16 q8, q8, #8 - vld4.8 {d4, 
d5, d6, d7}, [DST_R, :128]! - vrsra.u16 q9, q9, #8 - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - vrsra.u16 q10, q10, #8 - - cache_preload 8, 8 - - vrsra.u16 q11, q11, #8 -.endm - -generate_composite_function \ - pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_add_8888_8888_8888_process_pixblock_head, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail_head - -generate_composite_function_single_scanline \ - pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init, \ - default_cleanup, \ - pixman_composite_add_8888_8888_8888_process_pixblock_head, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail_head - -/******************************************************************************/ - -generate_composite_function \ - pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_add_8888_8888_8888_process_pixblock_head, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 27 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_add_n_8_8888_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d3[0]}, [DUMMY] - vdup.8 d0, d3[0] - vdup.8 d1, d3[1] - vdup.8 d2, d3[2] - vdup.8 d3, d3[3] -.endm - -.macro pixman_composite_add_n_8_8888_cleanup -.endm - -generate_composite_function \ - pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_add_n_8_8888_init, \ - pixman_composite_add_n_8_8888_cleanup, \ - pixman_composite_add_8888_8888_8888_process_pixblock_head, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 27 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_add_8888_n_8888_init - add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) - vld1.32 {d27[0]}, [DUMMY] - vdup.8 d27, d27[3] -.endm - -.macro pixman_composite_add_8888_n_8888_cleanup -.endm - -generate_composite_function \ - pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_add_8888_n_8888_init, \ - pixman_composite_add_8888_n_8888_cleanup, \ - pixman_composite_add_8888_8888_8888_process_pixblock_head, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* 
src_basereg */ \ - 27 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head - /* expecting source data in {d0, d1, d2, d3} */ - /* destination data in {d4, d5, d6, d7} */ - /* solid mask is in d15 */ - - /* 'in' */ - vmull.u8 q8, d15, d3 - vmull.u8 q6, d15, d2 - vmull.u8 q5, d15, d1 - vmull.u8 q4, d15, d0 - vrshr.u16 q13, q8, #8 - vrshr.u16 q12, q6, #8 - vrshr.u16 q11, q5, #8 - vrshr.u16 q10, q4, #8 - vraddhn.u16 d3, q8, q13 - vraddhn.u16 d2, q6, q12 - vraddhn.u16 d1, q5, q11 - vraddhn.u16 d0, q4, q10 - vmvn.8 d24, d3 /* get inverted alpha */ - /* now do alpha blending */ - vmull.u8 q8, d24, d4 - vmull.u8 q9, d24, d5 - vmull.u8 q10, d24, d6 - vmull.u8 q11, d24, d7 -.endm - -.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail - fetch_src_pixblock - cache_preload 8, 8 - fetch_mask_pixblock - pixman_composite_out_reverse_8888_n_8888_process_pixblock_head - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! -.endm - -generate_composite_function_single_scanline \ - pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \ - pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \ - pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 12 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_over_8888_n_8888_process_pixblock_head - pixman_composite_out_reverse_8888_n_8888_process_pixblock_head -.endm - -.macro pixman_composite_over_8888_n_8888_process_pixblock_tail - pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - pixman_composite_over_8888_n_8888_process_pixblock_tail - fetch_src_pixblock - cache_preload 8, 8 - pixman_composite_over_8888_n_8888_process_pixblock_head - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
-.endm - -.macro pixman_composite_over_8888_n_8888_init - add DUMMY, sp, #48 - vpush {d8-d15} - vld1.32 {d15[0]}, [DUMMY] - vdup.8 d15, d15[3] -.endm - -.macro pixman_composite_over_8888_n_8888_cleanup - vpop {d8-d15} -.endm - -generate_composite_function \ - pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_8888_n_8888_init, \ - pixman_composite_over_8888_n_8888_cleanup, \ - pixman_composite_over_8888_n_8888_process_pixblock_head, \ - pixman_composite_over_8888_n_8888_process_pixblock_tail, \ - pixman_composite_over_8888_n_8888_process_pixblock_tail_head - -/******************************************************************************/ - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - pixman_composite_over_8888_n_8888_process_pixblock_tail - fetch_src_pixblock - cache_preload 8, 8 - fetch_mask_pixblock - pixman_composite_over_8888_n_8888_process_pixblock_head - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! -.endm - -generate_composite_function \ - pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_over_8888_n_8888_process_pixblock_head, \ - pixman_composite_over_8888_n_8888_process_pixblock_tail, \ - pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 12 /* mask_basereg */ - -generate_composite_function_single_scanline \ - pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_over_8888_n_8888_process_pixblock_head, \ - pixman_composite_over_8888_n_8888_process_pixblock_tail, \ - pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 12 /* mask_basereg */ - -/******************************************************************************/ - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - pixman_composite_over_8888_n_8888_process_pixblock_tail - fetch_src_pixblock - cache_preload 8, 8 - fetch_mask_pixblock - pixman_composite_over_8888_n_8888_process_pixblock_head - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
-.endm - -generate_composite_function \ - pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_over_8888_n_8888_process_pixblock_head, \ - pixman_composite_over_8888_n_8888_process_pixblock_tail, \ - pixman_composite_over_8888_8_8888_process_pixblock_tail_head \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 15 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_0888_0888_process_pixblock_head -.endm - -.macro pixman_composite_src_0888_0888_process_pixblock_tail -.endm - -.macro pixman_composite_src_0888_0888_process_pixblock_tail_head - vst3.8 {d0, d1, d2}, [DST_W]! - fetch_src_pixblock - cache_preload 8, 8 -.endm - -generate_composite_function \ - pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \ - FLAG_DST_WRITEONLY, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_0888_0888_process_pixblock_head, \ - pixman_composite_src_0888_0888_process_pixblock_tail, \ - pixman_composite_src_0888_0888_process_pixblock_tail_head, \ - 0, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_0888_8888_rev_process_pixblock_head - vswp d0, d2 -.endm - -.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail -.endm - -.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head - vst4.8 {d0, d1, d2, d3}, [DST_W]! - fetch_src_pixblock - vswp d0, d2 - cache_preload 8, 8 -.endm - -.macro pixman_composite_src_0888_8888_rev_init - veor d3, d3, d3 -.endm - -generate_composite_function \ - pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \ - FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - pixman_composite_src_0888_8888_rev_init, \ - default_cleanup, \ - pixman_composite_src_0888_8888_rev_process_pixblock_head, \ - pixman_composite_src_0888_8888_rev_process_pixblock_tail, \ - pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \ - 0, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_0888_0565_rev_process_pixblock_head - vshll.u8 q8, d1, #8 - vshll.u8 q9, d2, #8 -.endm - -.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail - vshll.u8 q14, d0, #8 - vsri.u16 q14, q8, #5 - vsri.u16 q14, q9, #11 -.endm - -.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head - vshll.u8 q14, d0, #8 - fetch_src_pixblock - vsri.u16 q14, q8, #5 - vsri.u16 q14, q9, #11 - vshll.u8 q8, d1, #8 - vst1.16 {d28, d29}, [DST_W, :128]! 
- vshll.u8 q9, d2, #8 -.endm - -generate_composite_function \ - pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \ - FLAG_DST_WRITEONLY, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_0888_0565_rev_process_pixblock_head, \ - pixman_composite_src_0888_0565_rev_process_pixblock_tail, \ - pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_pixbuf_8888_process_pixblock_head - vmull.u8 q8, d3, d0 - vmull.u8 q9, d3, d1 - vmull.u8 q10, d3, d2 -.endm - -.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail - vrshr.u16 q11, q8, #8 - vswp d3, d31 - vrshr.u16 q12, q9, #8 - vrshr.u16 q13, q10, #8 - vraddhn.u16 d30, q11, q8 - vraddhn.u16 d29, q12, q9 - vraddhn.u16 d28, q13, q10 -.endm - -.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head - vrshr.u16 q11, q8, #8 - vswp d3, d31 - vrshr.u16 q12, q9, #8 - vrshr.u16 q13, q10, #8 - fetch_src_pixblock - vraddhn.u16 d30, q11, q8 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vraddhn.u16 d29, q12, q9 - vraddhn.u16 d28, q13, q10 - vmull.u8 q8, d3, d0 - vmull.u8 q9, d3, d1 - vmull.u8 q10, d3, d2 - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF cmp PF_X, ORIG_W - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! -.endm - -generate_composite_function \ - pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_pixbuf_8888_process_pixblock_head, \ - pixman_composite_src_pixbuf_8888_process_pixblock_tail, \ - pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head - vmull.u8 q8, d3, d0 - vmull.u8 q9, d3, d1 - vmull.u8 q10, d3, d2 -.endm - -.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail - vrshr.u16 q11, q8, #8 - vswp d3, d31 - vrshr.u16 q12, q9, #8 - vrshr.u16 q13, q10, #8 - vraddhn.u16 d28, q11, q8 - vraddhn.u16 d29, q12, q9 - vraddhn.u16 d30, q13, q10 -.endm - -.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head - vrshr.u16 q11, q8, #8 - vswp d3, d31 - vrshr.u16 q12, q9, #8 - vrshr.u16 q13, q10, #8 - fetch_src_pixblock - vraddhn.u16 d28, q11, q8 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vraddhn.u16 d29, q12, q9 - vraddhn.u16 d30, q13, q10 - vmull.u8 q8, d3, d0 - vmull.u8 q9, d3, d1 - vmull.u8 q10, d3, d2 - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF cmp PF_X, ORIG_W - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 
-.endm - -generate_composite_function \ - pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_rpixbuf_8888_process_pixblock_head, \ - pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \ - pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_over_0565_8_0565_process_pixblock_head - /* mask is in d15 */ - convert_0565_to_x888 q4, d2, d1, d0 - convert_0565_to_x888 q5, d6, d5, d4 - /* source pixel data is in {d0, d1, d2, XX} */ - /* destination pixel data is in {d4, d5, d6, XX} */ - vmvn.8 d7, d15 - vmull.u8 q6, d15, d2 - vmull.u8 q5, d15, d1 - vmull.u8 q4, d15, d0 - vmull.u8 q8, d7, d4 - vmull.u8 q9, d7, d5 - vmull.u8 q13, d7, d6 - vrshr.u16 q12, q6, #8 - vrshr.u16 q11, q5, #8 - vrshr.u16 q10, q4, #8 - vraddhn.u16 d2, q6, q12 - vraddhn.u16 d1, q5, q11 - vraddhn.u16 d0, q4, q10 -.endm - -.macro pixman_composite_over_0565_8_0565_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q13, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q12, q13 - vqadd.u8 q0, q0, q14 - vqadd.u8 q1, q1, q15 - /* 32bpp result is in {d0, d1, d2, XX} */ - convert_8888_to_0565 d2, d1, d0, q14, q15, q3 -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head - fetch_mask_pixblock - pixman_composite_over_0565_8_0565_process_pixblock_tail - fetch_src_pixblock - vld1.16 {d10, d11}, [DST_R, :128]! - cache_preload 8, 8 - pixman_composite_over_0565_8_0565_process_pixblock_head - vst1.16 {d28, d29}, [DST_W, :128]! 
-.endm - -generate_composite_function \ - pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \ - FLAG_DST_READWRITE, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_over_0565_8_0565_process_pixblock_head, \ - pixman_composite_over_0565_8_0565_process_pixblock_tail, \ - pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 10, /* dst_r_basereg */ \ - 8, /* src_basereg */ \ - 15 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_over_0565_n_0565_init - add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) - vpush {d8-d15} - vld1.32 {d15[0]}, [DUMMY] - vdup.8 d15, d15[3] -.endm - -.macro pixman_composite_over_0565_n_0565_cleanup - vpop {d8-d15} -.endm - -generate_composite_function \ - pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \ - FLAG_DST_READWRITE, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_0565_n_0565_init, \ - pixman_composite_over_0565_n_0565_cleanup, \ - pixman_composite_over_0565_8_0565_process_pixblock_head, \ - pixman_composite_over_0565_8_0565_process_pixblock_tail, \ - pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 10, /* dst_r_basereg */ \ - 8, /* src_basereg */ \ - 15 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_add_0565_8_0565_process_pixblock_head - /* mask is in d15 */ - convert_0565_to_x888 q4, d2, d1, d0 - convert_0565_to_x888 q5, d6, d5, d4 - /* source pixel data is in {d0, d1, d2, XX} */ - /* destination pixel data is in {d4, d5, d6, XX} */ - vmull.u8 q6, d15, d2 - vmull.u8 q5, d15, d1 - vmull.u8 q4, d15, d0 - vrshr.u16 q12, q6, #8 - vrshr.u16 q11, q5, #8 - vrshr.u16 q10, q4, #8 - vraddhn.u16 d2, q6, q12 - vraddhn.u16 d1, q5, q11 - vraddhn.u16 d0, q4, q10 -.endm - -.macro pixman_composite_add_0565_8_0565_process_pixblock_tail - vqadd.u8 q0, q0, q2 - vqadd.u8 q1, q1, q3 - /* 32bpp result is in {d0, d1, d2, XX} */ - convert_8888_to_0565 d2, d1, d0, q14, q15, q3 -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head - fetch_mask_pixblock - pixman_composite_add_0565_8_0565_process_pixblock_tail - fetch_src_pixblock - vld1.16 {d10, d11}, [DST_R, :128]! - cache_preload 8, 8 - pixman_composite_add_0565_8_0565_process_pixblock_head - vst1.16 {d28, d29}, [DST_W, :128]! 
-.endm - -generate_composite_function \ - pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \ - FLAG_DST_READWRITE, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_add_0565_8_0565_process_pixblock_head, \ - pixman_composite_add_0565_8_0565_process_pixblock_tail, \ - pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 10, /* dst_r_basereg */ \ - 8, /* src_basereg */ \ - 15 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_out_reverse_8_0565_process_pixblock_head - /* mask is in d15 */ - convert_0565_to_x888 q5, d6, d5, d4 - /* destination pixel data is in {d4, d5, d6, xx} */ - vmvn.8 d24, d15 /* get inverted alpha */ - /* now do alpha blending */ - vmull.u8 q8, d24, d4 - vmull.u8 q9, d24, d5 - vmull.u8 q10, d24, d6 -.endm - -.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vraddhn.u16 d0, q14, q8 - vraddhn.u16 d1, q15, q9 - vraddhn.u16 d2, q12, q10 - /* 32bpp result is in {d0, d1, d2, XX} */ - convert_8888_to_0565 d2, d1, d0, q14, q15, q3 -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head - fetch_src_pixblock - pixman_composite_out_reverse_8_0565_process_pixblock_tail - vld1.16 {d10, d11}, [DST_R, :128]! - cache_preload 8, 8 - pixman_composite_out_reverse_8_0565_process_pixblock_head - vst1.16 {d28, d29}, [DST_W, :128]! -.endm - -generate_composite_function \ - pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \ - FLAG_DST_READWRITE, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_out_reverse_8_0565_process_pixblock_head, \ - pixman_composite_out_reverse_8_0565_process_pixblock_tail, \ - pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 10, /* dst_r_basereg */ \ - 15, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -generate_composite_function_nearest_scanline \ - pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init, \ - default_cleanup, \ - pixman_composite_over_8888_8888_process_pixblock_head, \ - pixman_composite_over_8888_8888_process_pixblock_tail, \ - pixman_composite_over_8888_8888_process_pixblock_tail_head - -generate_composite_function_nearest_scanline \ - pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init, \ - default_cleanup, \ - pixman_composite_over_8888_0565_process_pixblock_head, \ - pixman_composite_over_8888_0565_process_pixblock_tail, \ - pixman_composite_over_8888_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 24 /* mask_basereg */ - -generate_composite_function_nearest_scanline \ - pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \ - FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in 
a single block */ \
-    default_init, \
-    default_cleanup, \
-    pixman_composite_src_8888_0565_process_pixblock_head, \
-    pixman_composite_src_8888_0565_process_pixblock_tail, \
-    pixman_composite_src_8888_0565_process_pixblock_tail_head
-
-generate_composite_function_nearest_scanline \
-    pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
-    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    default_init, \
-    default_cleanup, \
-    pixman_composite_src_0565_8888_process_pixblock_head, \
-    pixman_composite_src_0565_8888_process_pixblock_tail, \
-    pixman_composite_src_0565_8888_process_pixblock_tail_head
+/*
+ * Copyright © 2009 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+/*
+ * This file contains implementations of NEON optimized pixel processing
+ * functions. There is no full and detailed tutorial, but some functions
+ * (those which expose some new or interesting features) are
+ * extensively commented and can be used as examples.
+ *
+ * You may want to have a look at the comments for the following functions:
+ * - pixman_composite_over_8888_0565_asm_neon
+ * - pixman_composite_over_n_8_0565_asm_neon
+ */
+
+/* Prevent the stack from becoming executable for no reason... */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+    .text
+    .fpu neon
+    .arch armv7a
+    .object_arch armv4
+    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
+    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
+    .arm
+    .altmacro
+
+#include "pixman-arm-neon-asm.h"
+
+/* Global configuration options and preferences */
+
+/*
+ * The code can optionally make use of unaligned memory accesses to improve
+ * the performance of handling leading/trailing pixels for each scanline.
+ * The configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0,
+ * for example on Linux, if unaligned memory accesses are not configured
+ * to generate exceptions.
+ */
+.set RESPECT_STRICT_ALIGNMENT, 1
+
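+/*
+ * To see why leading/trailing pixels need special treatment at all, here
+ * is an illustrative C sketch of how a scanline is typically split into
+ * an unaligned head, an aligned bulk and a partial tail (process_one and
+ * process_block_of_8 are hypothetical helpers made up for this comment,
+ * not pixman functions):
+ *
+ * void process_scanline (uint16_t *dst, int width)
+ * {
+ *     // leading pixels, one by one, until dst is 16-byte aligned
+ *     while (((uintptr_t) dst & 15) && width > 0)
+ *     {
+ *         process_one (dst++);
+ *         width--;
+ *     }
+ *     // aligned bulk: full blocks of pixels, the fast path
+ *     while (width >= 8)
+ *     {
+ *         process_block_of_8 (dst);
+ *         dst += 8;
+ *         width -= 8;
+ *     }
+ *     // trailing pixels that do not fill a whole block
+ *     while (width-- > 0)
+ *         process_one (dst++);
+ * }
+ *
+ * Allowing unaligned accesses lets the head/tail parts use wider loads
+ * and stores instead of strictly pixel-by-pixel processing.
+ */
+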
+
+/*
+ * Implementation of pixman_composite_over_8888_0565_asm_neon
+ *
+ * This function takes an a8r8g8b8 source buffer, an r5g6b5 destination
+ * buffer and performs the OVER compositing operation. The function
+ * fast_composite_over_8888_0565 from pixman-fast-path.c does the same
+ * in C and can be used as a reference.
+ *
+ * First we need to have some NEON assembly code which can do the actual
+ * operation on the pixels and provide it to the template macro.
+ *
+ * The template macro quite conveniently takes care of emitting all the
+ * necessary code for memory reading and writing (including quite tricky
+ * cases of handling unaligned leading/trailing pixels), so we only need
+ * to deal with the data in NEON registers.
+ *
+ * The recommended general NEON register allocation is the following:
+ * d0, d1, d2, d3 - contain loaded source pixel data
+ * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
+ * d24, d25, d26, d27 - contain loaded mask pixel data (if a mask is used)
+ * d28, d29, d30, d31 - place for storing the result (destination pixels)
+ *
+ * As can be seen above, four 64-bit NEON registers are used for keeping
+ * intermediate pixel data and up to 8 pixels can be processed in one step
+ * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
+ *
+ * This particular function uses the following register allocation:
+ * d0, d1, d2, d3 - contain loaded source pixel data
+ * d4, d5 - contain loaded destination pixels (they are needed)
+ * d28, d29 - place for storing the result (destination pixels)
+ */
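+
+/*
+ * Before diving into the NEON code, it may help to see what is being
+ * computed for a single pixel in plain C. This is only an illustrative
+ * sketch with made-up helper names; the real C reference is
+ * fast_composite_over_8888_0565 in pixman-fast-path.c:
+ *
+ *     uint32_t s  = *src++;                       // a8r8g8b8, premultiplied
+ *     uint32_t d  = convert_0565_to_8888 (*dst);  // hypothetical helper
+ *     uint32_t ia = 255 - (s >> 24);              // inverted source alpha
+ *
+ *     // per channel (shown for one 8-bit channel 'c'):
+ *     //     d.c = s.c + (d.c * ia + 127) / 255;  (with a saturating add)
+ *
+ *     *dst++ = convert_8888_to_0565 (d);          // hypothetical helper
+ *
+ * The NEON code below computes the division by 255 approximately, using
+ * the vmull + vrshr + vraddhn rounding sequence, and uses vqadd for the
+ * final saturating addition.
+ */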
+
+/*
+ * Step one. We need to have some code to do the arithmetic on pixel data.
+ * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
+ * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
+ * perform all the needed calculations and write the result to {d28, d29}.
+ * The rationale for having two macros and not just one will be explained
+ * later. In practice, any single monolithic function which does the work can
+ * be split into two parts in any arbitrary way without affecting correctness.
+ *
+ * There is one special trick here too. The common template macro can
+ * optionally make our life a bit easier by doing R, G, B, A color component
+ * deinterleaving for 32bpp pixel formats (and this feature is used in the
+ * 'pixman_composite_over_8888_0565_asm_neon' function). It means that
+ * instead of having 8 packed pixels in the {d0, d1, d2, d3} registers, we
+ * actually use the d0 register for the blue channel (a vector of eight
+ * 8-bit values), d1 for green, d2 for red and d3 for alpha. This simple
+ * conversion can also be done with a few NEON instructions:
+ *
+ * Packed to planar conversion:
+ *   vuzp.8 d0, d1
+ *   vuzp.8 d2, d3
+ *   vuzp.8 d1, d3
+ *   vuzp.8 d0, d2
+ *
+ * Planar to packed conversion:
+ *   vzip.8 d0, d2
+ *   vzip.8 d1, d3
+ *   vzip.8 d2, d3
+ *   vzip.8 d0, d1
+ *
+ * But pixels can be loaded directly in planar format using the VLD4.8 NEON
+ * instruction. It is 1 cycle slower than VLD1.32, so this is not always
+ * desirable; that's why deinterleaving is optional.
+ *
+ * But anyway, here is the code:
+ */
+.macro pixman_composite_over_8888_0565_process_pixblock_head
+    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+       and put data into d6 - red, d7 - green, d30 - blue */
+    vshrn.u16   d6, q2, #8
+    vshrn.u16   d7, q2, #3
+    vsli.u16    q2, q2, #5
+    vsri.u8     d6, d6, #5
+    vmvn.8      d3, d3      /* invert source alpha */
+    vsri.u8     d7, d7, #6
+    vshrn.u16   d30, q2, #2
+    /* now do alpha blending, storing results in 8-bit planar format
+       into d16 - red, d19 - green, d18 - blue */
+    vmull.u8    q10, d3, d6
+    vmull.u8    q11, d3, d7
+    vmull.u8    q12, d3, d30
+    vrshr.u16   q13, q10, #8
+    vrshr.u16   q3, q11, #8
+    vrshr.u16   q15, q12, #8
+    vraddhn.u16 d20, q10, q13
+    vraddhn.u16 d23, q11, q3
+    vraddhn.u16 d22, q12, q15
+.endm
+
+.macro pixman_composite_over_8888_0565_process_pixblock_tail
+    /* ... continue alpha blending */
+    vqadd.u8    d16, d2, d20
+    vqadd.u8    q9, q0, q11
+    /* convert the result to r5g6b5 and store it into {d28, d29} */
+    vshll.u8    q14, d16, #8
+    vshll.u8    q8, d19, #8
+    vshll.u8    q9, d18, #8
+    vsri.u16    q14, q8, #5
+    vsri.u16    q14, q9, #11
+.endm
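+
+/*
+ * The r5g6b5 -> planar 8-bit conversion at the top of the head macro above
+ * is easier to follow in scalar C (one pixel shown; the NEON code does 8 at
+ * once, and the low bits are filled by replicating the high bits so that,
+ * for example, 5-bit 0x1f expands to 8-bit 0xff):
+ *
+ *     r = (pix >> 8) & 0xf8;  r |= r >> 5;    // vshrn #8 + vsri #5
+ *     g = (pix >> 3) & 0xfc;  g |= g >> 6;    // vshrn #3 + vsri #6
+ *     b = (pix << 3) & 0xf8;  b |= b >> 5;    // vsli #5 + vshrn #2
+ */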
+
+/*
+ * OK, now we have almost everything that we need. Using the above two
+ * macros, the work can be done correctly. But now we want to optimize
+ * it a bit. The ARM Cortex-A8 is an in-order core, and benefits a lot
+ * from good code scheduling and software pipelining.
+ *
+ * Let's construct some code which will run in the core main loop.
+ * Pseudo-code for the main loop looks like this:
+ *   head
+ *   while (...) {
+ *     tail
+ *     head
+ *   }
+ *   tail
+ *
+ * It may look a bit weird, but this setup makes it possible to hide
+ * instruction latencies better and also to utilize the dual-issue
+ * capability more efficiently (by making pairs of load-store and ALU
+ * instructions).
+ *
+ * So what we need now is a '*_tail_head' macro, which will be used
+ * in the core main loop. A trivial straightforward implementation
+ * of this macro would look like this:
+ *
+ *   pixman_composite_over_8888_0565_process_pixblock_tail
+ *   vst1.16     {d28, d29}, [DST_W, :128]!
+ *   vld1.16     {d4, d5}, [DST_R, :128]!
+ *   vld4.32     {d0, d1, d2, d3}, [SRC]!
+ *   pixman_composite_over_8888_0565_process_pixblock_head
+ *   cache_preload 8, 8
+ *
+ * Note that it also gained some VLD/VST instructions. We simply can't move
+ * from processing one block of pixels to the next one with just arithmetic.
+ * The previously processed data needs to be written to memory and new
+ * data needs to be fetched. Fortunately, this main loop does not deal
+ * with partial leading/trailing pixels and can load/store a full block
+ * of pixels in bulk. Additionally, the destination buffer is already
+ * 16-byte aligned here (which is good for performance).
+ *
+ * The new things here are the DST_R, DST_W, SRC and MASK identifiers.
+ * These are aliases for the ARM registers which are used as pointers
+ * for accessing data. We maintain separate pointers for reading and
+ * writing the destination buffer (DST_R and DST_W).
+ *
+ * Another new thing is the 'cache_preload' macro. It is used for
+ * prefetching data into the CPU L2 cache and improves performance when
+ * dealing with large images which are far larger than the cache size.
+ * It takes one argument (actually two, but they need to be the same
+ * here) - the number of pixels in a block. Looking into
+ * 'pixman-arm-neon-asm.h' can provide some details about this macro.
+ * Moreover, if good performance is needed, the code from this macro
+ * needs to be copied into the '*_tail_head' macro and mixed with the
+ * rest of the code for optimal instruction scheduling. We actually do
+ * that below.
+ *
+ * Now, after all the explanations, here is the optimized code.
+ * Different instruction streams (originating from the '*_head', '*_tail'
+ * and 'cache_preload' macros) use different indentation levels for
+ * better readability. Actually taking the code from one of these
+ * indentation levels and ignoring a few VLD/VST instructions would
+ * result in exactly the code from the '*_head', '*_tail' or
+ * 'cache_preload' macro!
+ */
+
+#if 1
+
+.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
+        vqadd.u8    d16, d2, d20
+    vld1.16     {d4, d5}, [DST_R, :128]!
+        vqadd.u8    q9, q0, q11
+            vshrn.u16   d6, q2, #8
+    fetch_src_pixblock
+            vshrn.u16   d7, q2, #3
+            vsli.u16    q2, q2, #5
+        vshll.u8    q14, d16, #8
+                                    PF add PF_X, PF_X, #8
+        vshll.u8    q8, d19, #8
+                                    PF tst PF_CTL, #0xF
+            vsri.u8     d6, d6, #5
+                                    PF addne PF_X, PF_X, #8
+        vmvn.8      d3, d3
+                                    PF subne PF_CTL, PF_CTL, #1
+            vsri.u8     d7, d7, #6
+            vshrn.u16   d30, q2, #2
+        vmull.u8    q10, d3, d6
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+        vmull.u8    q11, d3, d7
+        vmull.u8    q12, d3, d30
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+            vsri.u16    q14, q8, #5
+                                    PF cmp PF_X, ORIG_W
+        vshll.u8    q9, d18, #8
+        vrshr.u16   q13, q10, #8
+                                    PF subge PF_X, PF_X, ORIG_W
+        vrshr.u16   q3, q11, #8
+        vrshr.u16   q15, q12, #8
+                                    PF subges PF_CTL, PF_CTL, #0x10
+            vsri.u16    q14, q9, #11
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+        vraddhn.u16 d20, q10, q13
+        vraddhn.u16 d23, q11, q3
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+        vraddhn.u16 d22, q12, q15
+    vst1.16     {d28, d29}, [DST_W, :128]!
+.endm
+
+#else
+
+/* If we did not care much about the performance, we would just use this... */
+.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
+    pixman_composite_over_8888_0565_process_pixblock_tail
+    vst1.16     {d28, d29}, [DST_W, :128]!
+    vld1.16     {d4, d5}, [DST_R, :128]!
+    fetch_src_pixblock
+    pixman_composite_over_8888_0565_process_pixblock_head
+    cache_preload 8, 8
+.endm
+
+#endif
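+
+/*
+ * The PF-prefixed ARM instructions interleaved above implement the advanced
+ * prefetcher; they are only emitted when PREFETCH_TYPE_ADVANCED is selected.
+ * Together they behave roughly like this C sketch (simplified; the PF_CTL
+ * block counting is omitted):
+ *
+ *     pf_x += 8;                        // walk ahead of the current block
+ *     prefetch (pf_src + pf_x);         // pld at the lookahead position
+ *     prefetch (pf_dst + pf_x);
+ *     if (pf_x >= width)                // wrapped past the scanline end?
+ *     {
+ *         pf_x   -= width;
+ *         pf_src += src_stride;         // move on to the next scanline
+ *         pf_dst += dst_stride;
+ *     }
+ */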
+
+/*
+ * And now the final part. We use the 'generate_composite_function' macro
+ * to put all the stuff together. We specify the name of the function
+ * which we want to get, the number of bits per pixel for the source, mask
+ * and destination (0 if unused, like the mask in this case). Next come
+ * some bit flags:
+ *   FLAG_DST_READWRITE      - tells that the destination buffer is both read
+ *                             and written; for a write-only buffer we would
+ *                             use the FLAG_DST_WRITEONLY flag instead
+ *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
+ *                             and separate color channels for 32bpp formats.
+ * The next things are:
+ *  - the number of pixels processed per iteration (8 in this case, because
+ *    that's the maximum that can fit into four 64-bit NEON registers).
+ *  - the prefetch distance, measured in pixel blocks. In this case it is
+ *    5 blocks of 8 pixels each. That would be 40 pixels, or up to 160 bytes.
+ *    The optimal prefetch distance can be selected by running some
+ *    benchmarks.
+ *
+ * After that we specify some macros. These are 'default_init' and
+ * 'default_cleanup' here, which are empty (but it is possible to have
+ * custom init/cleanup macros to be able to save/restore some extra NEON
+ * registers like d8-d15 or do anything else), followed by
+ * 'pixman_composite_over_8888_0565_process_pixblock_head',
+ * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
+ * 'pixman_composite_over_8888_0565_process_pixblock_tail_head',
+ * which we implemented above.
+ *
+ * The last part is the NEON register allocation scheme.
+ */
+generate_composite_function \
+    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_0565_process_pixblock_head, \
+    pixman_composite_over_8888_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
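+
+/*
+ * On the C side, pixman-arm-neon.c picks up the function generated above
+ * through the PIXMAN_ARM_BIND_FAST_PATH_SRC_DST macro from
+ * pixman-arm-common.h, which declares it roughly as:
+ *
+ *     void
+ *     pixman_composite_over_8888_0565_asm_neon (int32_t   w,
+ *                                               int32_t   h,
+ *                                               uint16_t *dst,
+ *                                               int32_t   dst_stride,
+ *                                               uint32_t *src,
+ *                                               int32_t   src_stride);
+ *
+ * with the strides measured in elements of the respective pixel type
+ * rather than in bytes.
+ */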
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_0565_process_pixblock_head
+    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+       and put data into d6 - red, d7 - green, d30 - blue */
+    vshrn.u16   d6, q2, #8
+    vshrn.u16   d7, q2, #3
+    vsli.u16    q2, q2, #5
+    vsri.u8     d6, d6, #5
+    vsri.u8     d7, d7, #6
+    vshrn.u16   d30, q2, #2
+    /* now do alpha blending, storing results in 8-bit planar format
+       into d16 - red, d19 - green, d18 - blue */
+    vmull.u8    q10, d3, d6
+    vmull.u8    q11, d3, d7
+    vmull.u8    q12, d3, d30
+    vrshr.u16   q13, q10, #8
+    vrshr.u16   q3, q11, #8
+    vrshr.u16   q15, q12, #8
+    vraddhn.u16 d20, q10, q13
+    vraddhn.u16 d23, q11, q3
+    vraddhn.u16 d22, q12, q15
+.endm
+
+.macro pixman_composite_over_n_0565_process_pixblock_tail
+    /* ... continue alpha blending */
+    vqadd.u8    d16, d2, d20
+    vqadd.u8    q9, q0, q11
+    /* convert the result to r5g6b5 and store it into {d28, d29} */
+    vshll.u8    q14, d16, #8
+    vshll.u8    q8, d19, #8
+    vshll.u8    q9, d18, #8
+    vsri.u16    q14, q8, #5
+    vsri.u16    q14, q9, #11
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_n_0565_process_pixblock_tail_head
+    pixman_composite_over_n_0565_process_pixblock_tail
+    vld1.16     {d4, d5}, [DST_R, :128]!
+    vst1.16     {d28, d29}, [DST_W, :128]!
+    pixman_composite_over_n_0565_process_pixblock_head
+    cache_preload 8, 8
+.endm
+
+.macro pixman_composite_over_n_0565_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d3[0]}, [DUMMY]
+    vdup.8      d0, d3[0]
+    vdup.8      d1, d3[1]
+    vdup.8      d2, d3[2]
+    vdup.8      d3, d3[3]
+    vmvn.8      d3, d3      /* invert source alpha */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_0565_init, \
+    default_cleanup, \
+    pixman_composite_over_n_0565_process_pixblock_head, \
+    pixman_composite_over_n_0565_process_pixblock_tail, \
+    pixman_composite_over_n_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_8888_0565_process_pixblock_head
+    vshll.u8    q8, d1, #8
+    vshll.u8    q14, d2, #8
+    vshll.u8    q9, d0, #8
+.endm
+
+.macro pixman_composite_src_8888_0565_process_pixblock_tail
+    vsri.u16    q14, q8, #5
+    vsri.u16    q14, q9, #11
+.endm
+
+.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
+        vsri.u16    q14, q8, #5
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+    fetch_src_pixblock
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vsri.u16    q14, q9, #11
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+        vshll.u8    q8, d1, #8
+    vst1.16     {d28, d29}, [DST_W, :128]!
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+        vshll.u8    q14, d2, #8
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+        vshll.u8    q9, d0, #8
+.endm
+
+generate_composite_function \
+    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_8888_0565_process_pixblock_head, \
+    pixman_composite_src_8888_0565_process_pixblock_tail, \
+    pixman_composite_src_8888_0565_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0565_8888_process_pixblock_head
+    vshrn.u16   d30, q0, #8
+    vshrn.u16   d29, q0, #3
+    vsli.u16    q0, q0, #5
+    vmov.u8     d31, #255
+    vsri.u8     d30, d30, #5
+    vsri.u8     d29, d29, #6
+    vshrn.u16   d28, q0, #2
+.endm
+
+.macro pixman_composite_src_0565_8888_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
+    pixman_composite_src_0565_8888_process_pixblock_tail
+    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+ fetch_src_pixblock + pixman_composite_src_0565_8888_process_pixblock_head + cache_preload 8, 8 +.endm + +generate_composite_function \ + pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_0565_8888_process_pixblock_head, \ + pixman_composite_src_0565_8888_process_pixblock_tail, \ + pixman_composite_src_0565_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_add_8_8_process_pixblock_head + vqadd.u8 q14, q0, q2 + vqadd.u8 q15, q1, q3 +.endm + +.macro pixman_composite_add_8_8_process_pixblock_tail +.endm + +.macro pixman_composite_add_8_8_process_pixblock_tail_head + fetch_src_pixblock + PF add PF_X, PF_X, #32 + PF tst PF_CTL, #0xF + vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! + PF addne PF_X, PF_X, #32 + PF subne PF_CTL, PF_CTL, #1 + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! + PF cmp PF_X, ORIG_W + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + PF subge PF_X, PF_X, ORIG_W + PF subges PF_CTL, PF_CTL, #0x10 + vqadd.u8 q14, q0, q2 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vqadd.u8 q15, q1, q3 +.endm + +generate_composite_function \ + pixman_composite_add_8_8_asm_neon, 8, 0, 8, \ + FLAG_DST_READWRITE, \ + 32, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_add_8_8_process_pixblock_head, \ + pixman_composite_add_8_8_process_pixblock_tail, \ + pixman_composite_add_8_8_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_add_8888_8888_process_pixblock_tail_head + fetch_src_pixblock + PF add PF_X, PF_X, #8 + PF tst PF_CTL, #0xF + vld1.32 {d4, d5, d6, d7}, [DST_R, :128]! + PF addne PF_X, PF_X, #8 + PF subne PF_CTL, PF_CTL, #1 + vst1.32 {d28, d29, d30, d31}, [DST_W, :128]! + PF cmp PF_X, ORIG_W + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + PF subge PF_X, PF_X, ORIG_W + PF subges PF_CTL, PF_CTL, #0x10 + vqadd.u8 q14, q0, q2 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 
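+    /* the two conditional loads above are there only for their post-index
+       writeback: they advance PF_SRC and PF_DST to the next scanline when
+       the prefetch position wraps past the end of the current one */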
+ vqadd.u8 q15, q1, q3 +.endm + +generate_composite_function \ + pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_add_8_8_process_pixblock_head, \ + pixman_composite_add_8_8_process_pixblock_tail, \ + pixman_composite_add_8888_8888_process_pixblock_tail_head + +generate_composite_function_single_scanline \ + pixman_composite_scanline_add_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE, \ + 8, /* number of pixels, processed in a single block */ \ + default_init, \ + default_cleanup, \ + pixman_composite_add_8_8_process_pixblock_head, \ + pixman_composite_add_8_8_process_pixblock_tail, \ + pixman_composite_add_8888_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head + vmvn.8 d24, d3 /* get inverted alpha */ + /* do alpha blending */ + vmull.u8 q8, d24, d4 + vmull.u8 q9, d24, d5 + vmull.u8 q10, d24, d6 + vmull.u8 q11, d24, d7 +.endm + +.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 +.endm + +.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + vrshr.u16 q14, q8, #8 + PF add PF_X, PF_X, #8 + PF tst PF_CTL, #0xF + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + PF addne PF_X, PF_X, #8 + PF subne PF_CTL, PF_CTL, #1 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + PF cmp PF_X, ORIG_W + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 + fetch_src_pixblock + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + vmvn.8 d22, d3 + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + PF subge PF_X, PF_X, ORIG_W + vmull.u8 q8, d22, d4 + PF subges PF_CTL, PF_CTL, #0x10 + vmull.u8 q9, d22, d5 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + vmull.u8 q10, d22, d6 + PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vmull.u8 q11, d22, d7 +.endm + +generate_composite_function_single_scanline \ + pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init, \ + default_cleanup, \ + pixman_composite_out_reverse_8888_8888_process_pixblock_head, \ + pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \ + pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_8888_8888_process_pixblock_head + pixman_composite_out_reverse_8888_8888_process_pixblock_head +.endm + +.macro pixman_composite_over_8888_8888_process_pixblock_tail + pixman_composite_out_reverse_8888_8888_process_pixblock_tail + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 +.endm + +.macro pixman_composite_over_8888_8888_process_pixblock_tail_head + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 
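+    /* VLD4.8 delivers the destination pixels already deinterleaved:
+       d4 - blue, d5 - green, d6 - red, d7 - alpha */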
+ vrshr.u16 q14, q8, #8 + PF add PF_X, PF_X, #8 + PF tst PF_CTL, #0xF + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + PF addne PF_X, PF_X, #8 + PF subne PF_CTL, PF_CTL, #1 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + PF cmp PF_X, ORIG_W + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 + fetch_src_pixblock + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + vmvn.8 d22, d3 + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + PF subge PF_X, PF_X, ORIG_W + vmull.u8 q8, d22, d4 + PF subges PF_CTL, PF_CTL, #0x10 + vmull.u8 q9, d22, d5 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + vmull.u8 q10, d22, d6 + PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vmull.u8 q11, d22, d7 +.endm + +generate_composite_function \ + pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_over_8888_8888_process_pixblock_head, \ + pixman_composite_over_8888_8888_process_pixblock_tail, \ + pixman_composite_over_8888_8888_process_pixblock_tail_head + +generate_composite_function_single_scanline \ + pixman_composite_scanline_over_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init, \ + default_cleanup, \ + pixman_composite_over_8888_8888_process_pixblock_head, \ + pixman_composite_over_8888_8888_process_pixblock_tail, \ + pixman_composite_over_8888_8888_process_pixblock_tail_head + +/******************************************************************************/ + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_n_8888_process_pixblock_tail_head + pixman_composite_over_8888_8888_process_pixblock_tail + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + pixman_composite_over_8888_8888_process_pixblock_head + cache_preload 8, 8 +.endm + +.macro pixman_composite_over_n_8888_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d3[0]}, [DUMMY] + vdup.8 d0, d3[0] + vdup.8 d1, d3[1] + vdup.8 d2, d3[2] + vdup.8 d3, d3[3] +.endm + +generate_composite_function \ + pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_n_8888_init, \ + default_cleanup, \ + pixman_composite_over_8888_8888_process_pixblock_head, \ + pixman_composite_over_8888_8888_process_pixblock_tail, \ + pixman_composite_over_n_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head + vrshr.u16 q14, q8, #8 + PF add PF_X, PF_X, #8 + PF tst PF_CTL, #0xF + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + PF addne PF_X, PF_X, #8 + PF subne PF_CTL, PF_CTL, #1 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + PF cmp PF_X, ORIG_W + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 + vld4.8 {d0, d1, d2, d3}, [DST_R, :128]! + vmvn.8 d22, d3 + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
+ PF subge PF_X, PF_X, ORIG_W + vmull.u8 q8, d22, d4 + PF subges PF_CTL, PF_CTL, #0x10 + vmull.u8 q9, d22, d5 + vmull.u8 q10, d22, d6 + PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vmull.u8 q11, d22, d7 +.endm + +.macro pixman_composite_over_reverse_n_8888_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d7[0]}, [DUMMY] + vdup.8 d4, d7[0] + vdup.8 d5, d7[1] + vdup.8 d6, d7[2] + vdup.8 d7, d7[3] +.endm + +generate_composite_function \ + pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_reverse_n_8888_init, \ + default_cleanup, \ + pixman_composite_over_8888_8888_process_pixblock_head, \ + pixman_composite_over_8888_8888_process_pixblock_tail, \ + pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 4, /* src_basereg */ \ + 24 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_over_8888_8_0565_process_pixblock_head + vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */ + vmull.u8 q1, d24, d9 + vmull.u8 q6, d24, d10 + vmull.u8 q7, d24, d11 + vshrn.u16 d6, q2, #8 /* convert DST_R data to 32-bpp (part1) */ + vshrn.u16 d7, q2, #3 + vsli.u16 q2, q2, #5 + vrshr.u16 q8, q0, #8 /* IN for SRC pixels (part2) */ + vrshr.u16 q9, q1, #8 + vrshr.u16 q10, q6, #8 + vrshr.u16 q11, q7, #8 + vraddhn.u16 d0, q0, q8 + vraddhn.u16 d1, q1, q9 + vraddhn.u16 d2, q6, q10 + vraddhn.u16 d3, q7, q11 + vsri.u8 d6, d6, #5 /* convert DST_R data to 32-bpp (part2) */ + vsri.u8 d7, d7, #6 + vmvn.8 d3, d3 + vshrn.u16 d30, q2, #2 + vmull.u8 q8, d3, d6 /* now do alpha blending */ + vmull.u8 q9, d3, d7 + vmull.u8 q10, d3, d30 +.endm + +.macro pixman_composite_over_8888_8_0565_process_pixblock_tail + /* 3 cycle bubble (after vmull.u8) */ + vrshr.u16 q13, q8, #8 + vrshr.u16 q11, q9, #8 + vrshr.u16 q15, q10, #8 + vraddhn.u16 d16, q8, q13 + vraddhn.u16 d27, q9, q11 + vraddhn.u16 d26, q10, q15 + vqadd.u8 d16, d2, d16 + /* 1 cycle bubble */ + vqadd.u8 q9, q0, q13 + vshll.u8 q14, d16, #8 /* convert to 16bpp */ + vshll.u8 q8, d19, #8 + vshll.u8 q9, d18, #8 + vsri.u16 q14, q8, #5 + /* 1 cycle bubble */ + vsri.u16 q14, q9, #11 +.endm + +.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head + vld1.16 {d4, d5}, [DST_R, :128]! + vshrn.u16 d6, q2, #8 + fetch_mask_pixblock + vshrn.u16 d7, q2, #3 + fetch_src_pixblock + vmull.u8 q6, d24, d10 + vrshr.u16 q13, q8, #8 + vrshr.u16 q11, q9, #8 + vrshr.u16 q15, q10, #8 + vraddhn.u16 d16, q8, q13 + vraddhn.u16 d27, q9, q11 + vraddhn.u16 d26, q10, q15 + vqadd.u8 d16, d2, d16 + vmull.u8 q1, d24, d9 + vqadd.u8 q9, q0, q13 + vshll.u8 q14, d16, #8 + vmull.u8 q0, d24, d8 + vshll.u8 q8, d19, #8 + vshll.u8 q9, d18, #8 + vsri.u16 q14, q8, #5 + vmull.u8 q7, d24, d11 + vsri.u16 q14, q9, #11 + + cache_preload 8, 8 + + vsli.u16 q2, q2, #5 + vrshr.u16 q8, q0, #8 + vrshr.u16 q9, q1, #8 + vrshr.u16 q10, q6, #8 + vrshr.u16 q11, q7, #8 + vraddhn.u16 d0, q0, q8 + vraddhn.u16 d1, q1, q9 + vraddhn.u16 d2, q6, q10 + vraddhn.u16 d3, q7, q11 + vsri.u8 d6, d6, #5 + vsri.u8 d7, d7, #6 + vmvn.8 d3, d3 + vshrn.u16 d30, q2, #2 + vst1.16 {d28, d29}, [DST_W, :128]! 
+    vmull.u8    q8, d3, d6
+    vmull.u8    q9, d3, d7
+    vmull.u8    q10, d3, d30
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+/*
+ * This function needs a special initialization of the solid mask.
+ * The solid source pixel data is fetched from the stack at offset
+ * ARGS_STACK_OFFSET, split into color components and replicated into the
+ * d8-d11 registers. Additionally, this function needs all the NEON
+ * registers, so it has to save the d8-d15 registers, which are callee
+ * saved according to the ABI. These registers are restored in the
+ * 'cleanup' macro. All the other NEON registers are caller saved, so
+ * they can be clobbered freely without introducing any problems.
+ */
+.macro pixman_composite_over_n_8_0565_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vpush       {d8-d15}
+    vld1.32     {d11[0]}, [DUMMY]
+    vdup.8      d8, d11[0]
+    vdup.8      d9, d11[1]
+    vdup.8      d10, d11[2]
+    vdup.8      d11, d11[3]
+.endm
+
+.macro pixman_composite_over_n_8_0565_cleanup
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8_0565_init, \
+    pixman_composite_over_n_8_0565_cleanup, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_n_0565_init
+    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
+    vpush       {d8-d15}
+    vld1.32     {d24[0]}, [DUMMY]
+    vdup.8      d24, d24[3]
+.endm
+
+.macro pixman_composite_over_8888_n_0565_cleanup
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_8888_n_0565_init, \
+    pixman_composite_over_8888_n_0565_cleanup, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0565_0565_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_0565_0565_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
+    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
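+    /* source and destination are both r5g6b5, so this is a plain copy:
+       store the previous block of 16 pixels while the next one is fetched */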
+ fetch_src_pixblock + cache_preload 16, 16 +.endm + +generate_composite_function \ + pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \ + FLAG_DST_WRITEONLY, \ + 16, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_0565_0565_process_pixblock_head, \ + pixman_composite_src_0565_0565_process_pixblock_tail, \ + pixman_composite_src_0565_0565_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_n_8_process_pixblock_head +.endm + +.macro pixman_composite_src_n_8_process_pixblock_tail +.endm + +.macro pixman_composite_src_n_8_process_pixblock_tail_head + vst1.8 {d0, d1, d2, d3}, [DST_W, :128]! +.endm + +.macro pixman_composite_src_n_8_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d0[0]}, [DUMMY] + vsli.u64 d0, d0, #8 + vsli.u64 d0, d0, #16 + vsli.u64 d0, d0, #32 + vorr d1, d0, d0 + vorr q1, q0, q0 +.endm + +.macro pixman_composite_src_n_8_cleanup +.endm + +generate_composite_function \ + pixman_composite_src_n_8_asm_neon, 0, 0, 8, \ + FLAG_DST_WRITEONLY, \ + 32, /* number of pixels, processed in a single block */ \ + 0, /* prefetch distance */ \ + pixman_composite_src_n_8_init, \ + pixman_composite_src_n_8_cleanup, \ + pixman_composite_src_n_8_process_pixblock_head, \ + pixman_composite_src_n_8_process_pixblock_tail, \ + pixman_composite_src_n_8_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_n_0565_process_pixblock_head +.endm + +.macro pixman_composite_src_n_0565_process_pixblock_tail +.endm + +.macro pixman_composite_src_n_0565_process_pixblock_tail_head + vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! +.endm + +.macro pixman_composite_src_n_0565_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d0[0]}, [DUMMY] + vsli.u64 d0, d0, #16 + vsli.u64 d0, d0, #32 + vorr d1, d0, d0 + vorr q1, q0, q0 +.endm + +.macro pixman_composite_src_n_0565_cleanup +.endm + +generate_composite_function \ + pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \ + FLAG_DST_WRITEONLY, \ + 16, /* number of pixels, processed in a single block */ \ + 0, /* prefetch distance */ \ + pixman_composite_src_n_0565_init, \ + pixman_composite_src_n_0565_cleanup, \ + pixman_composite_src_n_0565_process_pixblock_head, \ + pixman_composite_src_n_0565_process_pixblock_tail, \ + pixman_composite_src_n_0565_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_n_8888_process_pixblock_head +.endm + +.macro pixman_composite_src_n_8888_process_pixblock_tail +.endm + +.macro pixman_composite_src_n_8888_process_pixblock_tail_head + vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! 
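+    /* solid fill: the init macro has already replicated the color into
+       q0/q1, so every block is just a single 32-byte store */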
+.endm + +.macro pixman_composite_src_n_8888_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d0[0]}, [DUMMY] + vsli.u64 d0, d0, #32 + vorr d1, d0, d0 + vorr q1, q0, q0 +.endm + +.macro pixman_composite_src_n_8888_cleanup +.endm + +generate_composite_function \ + pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \ + FLAG_DST_WRITEONLY, \ + 8, /* number of pixels, processed in a single block */ \ + 0, /* prefetch distance */ \ + pixman_composite_src_n_8888_init, \ + pixman_composite_src_n_8888_cleanup, \ + pixman_composite_src_n_8888_process_pixblock_head, \ + pixman_composite_src_n_8888_process_pixblock_tail, \ + pixman_composite_src_n_8888_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_8888_8888_process_pixblock_head +.endm + +.macro pixman_composite_src_8888_8888_process_pixblock_tail +.endm + +.macro pixman_composite_src_8888_8888_process_pixblock_tail_head + vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! + fetch_src_pixblock + cache_preload 8, 8 +.endm + +generate_composite_function \ + pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_WRITEONLY, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_8888_8888_process_pixblock_head, \ + pixman_composite_src_8888_8888_process_pixblock_tail, \ + pixman_composite_src_8888_8888_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_x888_8888_process_pixblock_head + vorr q0, q0, q2 + vorr q1, q1, q2 +.endm + +.macro pixman_composite_src_x888_8888_process_pixblock_tail +.endm + +.macro pixman_composite_src_x888_8888_process_pixblock_tail_head + vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! 
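+    /* the VORR instructions below force the alpha byte of every pixel to
+       0xff (the init macro preloads each 32-bit lane of q2 with 0xff000000) */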
+ fetch_src_pixblock + vorr q0, q0, q2 + vorr q1, q1, q2 + cache_preload 8, 8 +.endm + +.macro pixman_composite_src_x888_8888_init + vmov.u8 q2, #0xFF + vshl.u32 q2, q2, #24 +.endm + +generate_composite_function \ + pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_WRITEONLY, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + pixman_composite_src_x888_8888_init, \ + default_cleanup, \ + pixman_composite_src_x888_8888_process_pixblock_head, \ + pixman_composite_src_x888_8888_process_pixblock_tail, \ + pixman_composite_src_x888_8888_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_over_n_8_8888_process_pixblock_head + /* expecting deinterleaved source data in {d8, d9, d10, d11} */ + /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ + /* and destination data in {d4, d5, d6, d7} */ + /* mask is in d24 (d25, d26, d27 are unused) */ + + /* in */ + vmull.u8 q0, d24, d8 + vmull.u8 q1, d24, d9 + vmull.u8 q6, d24, d10 + vmull.u8 q7, d24, d11 + vrshr.u16 q10, q0, #8 + vrshr.u16 q11, q1, #8 + vrshr.u16 q12, q6, #8 + vrshr.u16 q13, q7, #8 + vraddhn.u16 d0, q0, q10 + vraddhn.u16 d1, q1, q11 + vraddhn.u16 d2, q6, q12 + vraddhn.u16 d3, q7, q13 + vmvn.8 d24, d3 /* get inverted alpha */ + /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */ + /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */ + /* now do alpha blending */ + vmull.u8 q8, d24, d4 + vmull.u8 q9, d24, d5 + vmull.u8 q10, d24, d6 + vmull.u8 q11, d24, d7 +.endm + +.macro pixman_composite_over_n_8_8888_process_pixblock_tail + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head + pixman_composite_over_n_8_8888_process_pixblock_tail + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 
+ fetch_mask_pixblock + cache_preload 8, 8 + pixman_composite_over_n_8_8888_process_pixblock_head +.endm + +.macro pixman_composite_over_n_8_8888_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vpush {d8-d15} + vld1.32 {d11[0]}, [DUMMY] + vdup.8 d8, d11[0] + vdup.8 d9, d11[1] + vdup.8 d10, d11[2] + vdup.8 d11, d11[3] +.endm + +.macro pixman_composite_over_n_8_8888_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_n_8_8888_init, \ + pixman_composite_over_n_8_8888_cleanup, \ + pixman_composite_over_n_8_8888_process_pixblock_head, \ + pixman_composite_over_n_8_8888_process_pixblock_tail, \ + pixman_composite_over_n_8_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_n_8_8_process_pixblock_head + vmull.u8 q0, d24, d8 + vmull.u8 q1, d25, d8 + vmull.u8 q6, d26, d8 + vmull.u8 q7, d27, d8 + vrshr.u16 q10, q0, #8 + vrshr.u16 q11, q1, #8 + vrshr.u16 q12, q6, #8 + vrshr.u16 q13, q7, #8 + vraddhn.u16 d0, q0, q10 + vraddhn.u16 d1, q1, q11 + vraddhn.u16 d2, q6, q12 + vraddhn.u16 d3, q7, q13 + vmvn.8 q12, q0 + vmvn.8 q13, q1 + vmull.u8 q8, d24, d4 + vmull.u8 q9, d25, d5 + vmull.u8 q10, d26, d6 + vmull.u8 q11, d27, d7 +.endm + +.macro pixman_composite_over_n_8_8_process_pixblock_tail + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_n_8_8_process_pixblock_tail_head + vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! + pixman_composite_over_n_8_8_process_pixblock_tail + fetch_mask_pixblock + cache_preload 32, 32 + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! 
+ pixman_composite_over_n_8_8_process_pixblock_head +.endm + +.macro pixman_composite_over_n_8_8_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vpush {d8-d15} + vld1.32 {d8[0]}, [DUMMY] + vdup.8 d8, d8[3] +.endm + +.macro pixman_composite_over_n_8_8_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \ + FLAG_DST_READWRITE, \ + 32, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_n_8_8_init, \ + pixman_composite_over_n_8_8_cleanup, \ + pixman_composite_over_n_8_8_process_pixblock_head, \ + pixman_composite_over_n_8_8_process_pixblock_tail, \ + pixman_composite_over_n_8_8_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head + /* + * 'combine_mask_ca' replacement + * + * input: solid src (n) in {d8, d9, d10, d11} + * dest in {d4, d5, d6, d7 } + * mask in {d24, d25, d26, d27} + * output: updated src in {d0, d1, d2, d3 } + * updated mask in {d24, d25, d26, d3 } + */ + vmull.u8 q0, d24, d8 + vmull.u8 q1, d25, d9 + vmull.u8 q6, d26, d10 + vmull.u8 q7, d27, d11 + vmull.u8 q9, d11, d25 + vmull.u8 q12, d11, d24 + vmull.u8 q13, d11, d26 + vrshr.u16 q8, q0, #8 + vrshr.u16 q10, q1, #8 + vrshr.u16 q11, q6, #8 + vraddhn.u16 d0, q0, q8 + vraddhn.u16 d1, q1, q10 + vraddhn.u16 d2, q6, q11 + vrshr.u16 q11, q12, #8 + vrshr.u16 q8, q9, #8 + vrshr.u16 q6, q13, #8 + vrshr.u16 q10, q7, #8 + vraddhn.u16 d24, q12, q11 + vraddhn.u16 d25, q9, q8 + vraddhn.u16 d26, q13, q6 + vraddhn.u16 d3, q7, q10 + /* + * 'combine_over_ca' replacement + * + * output: updated dest in {d28, d29, d30, d31} + */ + vmvn.8 d24, d24 + vmvn.8 d25, d25 + vmull.u8 q8, d24, d4 + vmull.u8 q9, d25, d5 + vmvn.8 d26, d26 + vmvn.8 d27, d3 + vmull.u8 q10, d26, d6 + vmull.u8 q11, d27, d7 +.endm + +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail + /* ... continue 'combine_over_ca' replacement */ + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q6, q10, #8 + vrshr.u16 q7, q11, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q6, q10 + vraddhn.u16 d31, q7, q11 + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 +.endm + +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + vrshr.u16 q6, q10, #8 + vrshr.u16 q7, q11, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q6, q10 + vraddhn.u16 d31, q7, q11 + fetch_mask_pixblock + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 + cache_preload 8, 8 + pixman_composite_over_n_8888_8888_ca_process_pixblock_head + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
+.endm + +.macro pixman_composite_over_n_8888_8888_ca_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vpush {d8-d15} + vld1.32 {d11[0]}, [DUMMY] + vdup.8 d8, d11[0] + vdup.8 d9, d11[1] + vdup.8 d10, d11[2] + vdup.8 d11, d11[3] +.endm + +.macro pixman_composite_over_n_8888_8888_ca_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_n_8888_8888_ca_init, \ + pixman_composite_over_n_8888_8888_ca_cleanup, \ + pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \ + pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \ + pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_in_n_8_process_pixblock_head + /* expecting source data in {d0, d1, d2, d3} */ + /* and destination data in {d4, d5, d6, d7} */ + vmull.u8 q8, d4, d3 + vmull.u8 q9, d5, d3 + vmull.u8 q10, d6, d3 + vmull.u8 q11, d7, d3 +.endm + +.macro pixman_composite_in_n_8_process_pixblock_tail + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + vraddhn.u16 d28, q8, q14 + vraddhn.u16 d29, q9, q15 + vraddhn.u16 d30, q10, q12 + vraddhn.u16 d31, q11, q13 +.endm + +.macro pixman_composite_in_n_8_process_pixblock_tail_head + pixman_composite_in_n_8_process_pixblock_tail + vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! + cache_preload 32, 32 + pixman_composite_in_n_8_process_pixblock_head + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! +.endm + +.macro pixman_composite_in_n_8_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d3[0]}, [DUMMY] + vdup.8 d3, d3[3] +.endm + +.macro pixman_composite_in_n_8_cleanup +.endm + +generate_composite_function \ + pixman_composite_in_n_8_asm_neon, 0, 0, 8, \ + FLAG_DST_READWRITE, \ + 32, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_in_n_8_init, \ + pixman_composite_in_n_8_cleanup, \ + pixman_composite_in_n_8_process_pixblock_head, \ + pixman_composite_in_n_8_process_pixblock_tail, \ + pixman_composite_in_n_8_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 24 /* mask_basereg */ + +.macro pixman_composite_add_n_8_8_process_pixblock_head + /* expecting source data in {d8, d9, d10, d11} */ + /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ + /* and destination data in {d4, d5, d6, d7} */ + /* mask is in d24, d25, d26, d27 */ + vmull.u8 q0, d24, d11 + vmull.u8 q1, d25, d11 + vmull.u8 q6, d26, d11 + vmull.u8 q7, d27, d11 + vrshr.u16 q10, q0, #8 + vrshr.u16 q11, q1, #8 + vrshr.u16 q12, q6, #8 + vrshr.u16 q13, q7, #8 + vraddhn.u16 d0, q0, q10 + vraddhn.u16 d1, q1, q11 + vraddhn.u16 d2, q6, q12 + vraddhn.u16 d3, q7, q13 + vqadd.u8 q14, q0, q2 + vqadd.u8 q15, q1, q3 +.endm + +.macro pixman_composite_add_n_8_8_process_pixblock_tail +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_add_n_8_8_process_pixblock_tail_head + pixman_composite_add_n_8_8_process_pixblock_tail + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! + vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! 
+ fetch_mask_pixblock + cache_preload 32, 32 + pixman_composite_add_n_8_8_process_pixblock_head +.endm + +.macro pixman_composite_add_n_8_8_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vpush {d8-d15} + vld1.32 {d11[0]}, [DUMMY] + vdup.8 d11, d11[3] +.endm + +.macro pixman_composite_add_n_8_8_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \ + FLAG_DST_READWRITE, \ + 32, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_add_n_8_8_init, \ + pixman_composite_add_n_8_8_cleanup, \ + pixman_composite_add_n_8_8_process_pixblock_head, \ + pixman_composite_add_n_8_8_process_pixblock_tail, \ + pixman_composite_add_n_8_8_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_add_8_8_8_process_pixblock_head + /* expecting source data in {d0, d1, d2, d3} */ + /* destination data in {d4, d5, d6, d7} */ + /* mask in {d24, d25, d26, d27} */ + vmull.u8 q8, d24, d0 + vmull.u8 q9, d25, d1 + vmull.u8 q10, d26, d2 + vmull.u8 q11, d27, d3 + vrshr.u16 q0, q8, #8 + vrshr.u16 q1, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + vraddhn.u16 d0, q0, q8 + vraddhn.u16 d1, q1, q9 + vraddhn.u16 d2, q12, q10 + vraddhn.u16 d3, q13, q11 + vqadd.u8 q14, q0, q2 + vqadd.u8 q15, q1, q3 +.endm + +.macro pixman_composite_add_8_8_8_process_pixblock_tail +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_add_8_8_8_process_pixblock_tail_head + pixman_composite_add_8_8_8_process_pixblock_tail + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! + vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! + fetch_mask_pixblock + fetch_src_pixblock + cache_preload 32, 32 + pixman_composite_add_8_8_8_process_pixblock_head +.endm + +.macro pixman_composite_add_8_8_8_init +.endm + +.macro pixman_composite_add_8_8_8_cleanup +.endm + +generate_composite_function \ + pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \ + FLAG_DST_READWRITE, \ + 32, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_add_8_8_8_init, \ + pixman_composite_add_8_8_8_cleanup, \ + pixman_composite_add_8_8_8_process_pixblock_head, \ + pixman_composite_add_8_8_8_process_pixblock_tail, \ + pixman_composite_add_8_8_8_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_add_8888_8888_8888_process_pixblock_head + /* expecting source data in {d0, d1, d2, d3} */ + /* destination data in {d4, d5, d6, d7} */ + /* mask in {d24, d25, d26, d27} */ + vmull.u8 q8, d27, d0 + vmull.u8 q9, d27, d1 + vmull.u8 q10, d27, d2 + vmull.u8 q11, d27, d3 + /* 1 cycle bubble */ + vrsra.u16 q8, q8, #8 + vrsra.u16 q9, q9, #8 + vrsra.u16 q10, q10, #8 + vrsra.u16 q11, q11, #8 +.endm + +.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail + /* 2 cycle bubble */ + vrshrn.u16 d28, q8, #8 + vrshrn.u16 d29, q9, #8 + vrshrn.u16 d30, q10, #8 + vrshrn.u16 d31, q11, #8 + vqadd.u8 q14, q2, q14 + /* 1 cycle bubble */ + vqadd.u8 q15, q3, q15 +.endm + +.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head + fetch_src_pixblock + vrshrn.u16 d28, q8, #8 + fetch_mask_pixblock + vrshrn.u16 d29, q9, #8 + vmull.u8 q8, d27, d0 + vrshrn.u16 d30, q10, #8 + vmull.u8 q9, d27, d1 + vrshrn.u16 d31, q11, #8 + vmull.u8 q10, d27, d2 + vqadd.u8 q14, q2, q14 + vmull.u8 q11, d27, d3 + vqadd.u8 q15, q3, q15 + vrsra.u16 q8, q8, #8 + vld4.8 {d4, 
d5, d6, d7}, [DST_R, :128]! + vrsra.u16 q9, q9, #8 + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + vrsra.u16 q10, q10, #8 + + cache_preload 8, 8 + + vrsra.u16 q11, q11, #8 +.endm + +generate_composite_function \ + pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_add_8888_8888_8888_process_pixblock_head, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head + +generate_composite_function_single_scanline \ + pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init, \ + default_cleanup, \ + pixman_composite_add_8888_8888_8888_process_pixblock_head, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head + +/******************************************************************************/ + +generate_composite_function \ + pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_add_8888_8888_8888_process_pixblock_head, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 27 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_add_n_8_8888_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d3[0]}, [DUMMY] + vdup.8 d0, d3[0] + vdup.8 d1, d3[1] + vdup.8 d2, d3[2] + vdup.8 d3, d3[3] +.endm + +.macro pixman_composite_add_n_8_8888_cleanup +.endm + +generate_composite_function \ + pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_add_n_8_8888_init, \ + pixman_composite_add_n_8_8888_cleanup, \ + pixman_composite_add_8888_8888_8888_process_pixblock_head, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 27 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_add_8888_n_8888_init + add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) + vld1.32 {d27[0]}, [DUMMY] + vdup.8 d27, d27[3] +.endm + +.macro pixman_composite_add_8888_n_8888_cleanup +.endm + +generate_composite_function \ + pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_add_8888_n_8888_init, \ + pixman_composite_add_8888_n_8888_cleanup, \ + pixman_composite_add_8888_8888_8888_process_pixblock_head, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* 
src_basereg */ \ + 27 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head + /* expecting source data in {d0, d1, d2, d3} */ + /* destination data in {d4, d5, d6, d7} */ + /* solid mask is in d15 */ + + /* 'in' */ + vmull.u8 q8, d15, d3 + vmull.u8 q6, d15, d2 + vmull.u8 q5, d15, d1 + vmull.u8 q4, d15, d0 + vrshr.u16 q13, q8, #8 + vrshr.u16 q12, q6, #8 + vrshr.u16 q11, q5, #8 + vrshr.u16 q10, q4, #8 + vraddhn.u16 d3, q8, q13 + vraddhn.u16 d2, q6, q12 + vraddhn.u16 d1, q5, q11 + vraddhn.u16 d0, q4, q10 + vmvn.8 d24, d3 /* get inverted alpha */ + /* now do alpha blending */ + vmull.u8 q8, d24, d4 + vmull.u8 q9, d24, d5 + vmull.u8 q10, d24, d6 + vmull.u8 q11, d24, d7 +.endm + +.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail + fetch_src_pixblock + cache_preload 8, 8 + fetch_mask_pixblock + pixman_composite_out_reverse_8888_n_8888_process_pixblock_head + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! +.endm + +generate_composite_function_single_scanline \ + pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \ + pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \ + pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 12 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_over_8888_n_8888_process_pixblock_head + pixman_composite_out_reverse_8888_n_8888_process_pixblock_head +.endm + +.macro pixman_composite_over_8888_n_8888_process_pixblock_tail + pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + pixman_composite_over_8888_n_8888_process_pixblock_tail + fetch_src_pixblock + cache_preload 8, 8 + pixman_composite_over_8888_n_8888_process_pixblock_head + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
+.macro pixman_composite_over_8888_n_8888_init
+    add         DUMMY, sp, #48
+    vpush       {d8-d15}
+    vld1.32     {d15[0]}, [DUMMY]
+    vdup.8      d15, d15[3]
+.endm
+
+.macro pixman_composite_over_8888_n_8888_cleanup
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_8888_n_8888_init, \
+    pixman_composite_over_8888_n_8888_cleanup, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
+    pixman_composite_over_8888_n_8888_process_pixblock_tail
+    fetch_src_pixblock
+    cache_preload 8, 8
+    fetch_mask_pixblock
+    pixman_composite_over_8888_n_8888_process_pixblock_head
+    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
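+/*
+ * With FLAG_DEINTERLEAVE_32BPP an a8r8g8b8 mask is fetched as planar
+ * data, and mask_basereg 12 puts its alpha plane into d15.  Only the
+ * mask alpha takes part in the blending, so the solid-mask head/tail
+ * macros above can be reused unchanged for a full 32bpp mask.
+ */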
+generate_composite_function \
+    pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    12  /* mask_basereg  */
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    12  /* mask_basereg  */
+
+/******************************************************************************/
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
+    pixman_composite_over_8888_n_8888_process_pixblock_tail
+    fetch_src_pixblock
+    cache_preload 8, 8
+    fetch_mask_pixblock
+    pixman_composite_over_8888_n_8888_process_pixblock_head
+    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_0888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_0888_0888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
+    vst3.8      {d0, d1, d2}, [DST_W]!
+    fetch_src_pixblock
+    cache_preload 8, 8
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0888_0888_process_pixblock_head, \
+    pixman_composite_src_0888_0888_process_pixblock_tail, \
+    pixman_composite_src_0888_0888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
+    vswp        d0, d2
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
+    vst4.8      {d0, d1, d2, d3}, [DST_W]!
+    fetch_src_pixblock
+    vswp        d0, d2
+    cache_preload 8, 8
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_init
+    veor        d3, d3, d3
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    pixman_composite_src_0888_8888_rev_init, \
+    default_cleanup, \
+    pixman_composite_src_0888_8888_rev_process_pixblock_head, \
+    pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
+    pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
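+/*
+ * x888 -> r5g6b5 packing: two of the channels are widened into the top
+ * byte of 16-bit lanes with vshll, then merged into the third with
+ * vsri (shift right and insert), leaving the top 5, 6 and 5 bits of
+ * the three channels packed together in each 16-bit lane.
+ */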
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
+    vshll.u8    q8, d1, #8
+    vshll.u8    q9, d2, #8
+.endm
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
+    vshll.u8    q14, d0, #8
+    vsri.u16    q14, q8, #5
+    vsri.u16    q14, q9, #11
+.endm
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
+    vshll.u8    q14, d0, #8
+    fetch_src_pixblock
+    vsri.u16    q14, q8, #5
+    vsri.u16    q14, q9, #11
+    vshll.u8    q8, d1, #8
+    vst1.16     {d28, d29}, [DST_W, :128]!
+    vshll.u8    q9, d2, #8
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0888_0565_rev_process_pixblock_head, \
+    pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
+    pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    0   /* mask_basereg  */
+
+/******************************************************************************/
+
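+/*
+ * src_pixbuf/src_rpixbuf premultiply the source by its own alpha while
+ * converting; the two variants differ only in the channel order of the
+ * stored result (d28 and d30 swapped).  The vswp of d3 and d31 moves
+ * the alpha of the current block into the output registers ahead of
+ * the store.
+ */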
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
+    vmull.u8    q8, d3, d0
+    vmull.u8    q9, d3, d1
+    vmull.u8    q10, d3, d2
+.endm
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
+    vrshr.u16   q11, q8, #8
+    vswp        d3, d31
+    vrshr.u16   q12, q9, #8
+    vrshr.u16   q13, q10, #8
+    vraddhn.u16 d30, q11, q8
+    vraddhn.u16 d29, q12, q9
+    vraddhn.u16 d28, q13, q10
+.endm
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
+    vrshr.u16   q11, q8, #8
+    vswp        d3, d31
+    vrshr.u16   q12, q9, #8
+    vrshr.u16   q13, q10, #8
+    fetch_src_pixblock
+    vraddhn.u16 d30, q11, q8
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+    vraddhn.u16 d29, q12, q9
+    vraddhn.u16 d28, q13, q10
+    vmull.u8    q8, d3, d0
+    vmull.u8    q9, d3, d1
+    vmull.u8    q10, d3, d2
+    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+.endm
+
+generate_composite_function \
+    pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_pixbuf_8888_process_pixblock_head, \
+    pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
+    pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    0   /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
+    vmull.u8    q8, d3, d0
+    vmull.u8    q9, d3, d1
+    vmull.u8    q10, d3, d2
+.endm
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
+    vrshr.u16   q11, q8, #8
+    vswp        d3, d31
+    vrshr.u16   q12, q9, #8
+    vrshr.u16   q13, q10, #8
+    vraddhn.u16 d28, q11, q8
+    vraddhn.u16 d29, q12, q9
+    vraddhn.u16 d30, q13, q10
+.endm
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
+    vrshr.u16   q11, q8, #8
+    vswp        d3, d31
+    vrshr.u16   q12, q9, #8
+    vrshr.u16   q13, q10, #8
+    fetch_src_pixblock
+    vraddhn.u16 d28, q11, q8
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+    vraddhn.u16 d29, q12, q9
+    vraddhn.u16 d30, q13, q10
+    vmull.u8    q8, d3, d0
+    vmull.u8    q9, d3, d1
+    vmull.u8    q10, d3, d2
+    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+.endm
+
+generate_composite_function \
+    pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
+    pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
+    pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    0   /* mask_basereg  */
+
+/******************************************************************************/
+
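+/*
+ * Compositing with r5g6b5 surfaces: the 565 scanlines are unpacked to
+ * planar 8-bit data with convert_0565_to_x888, blended with the same
+ * arithmetic as the 8888 paths, and repacked with convert_8888_to_0565.
+ */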
+.macro pixman_composite_over_0565_8_0565_process_pixblock_head
+    /* mask is in d15 */
+    convert_0565_to_x888 q4, d2, d1, d0
+    convert_0565_to_x888 q5, d6, d5, d4
+    /* source pixel data is in      {d0, d1, d2, XX} */
+    /* destination pixel data is in {d4, d5, d6, XX} */
+    vmvn.8      d7, d15
+    vmull.u8    q6, d15, d2
+    vmull.u8    q5, d15, d1
+    vmull.u8    q4, d15, d0
+    vmull.u8    q8, d7, d4
+    vmull.u8    q9, d7, d5
+    vmull.u8    q13, d7, d6
+    vrshr.u16   q12, q6, #8
+    vrshr.u16   q11, q5, #8
+    vrshr.u16   q10, q4, #8
+    vraddhn.u16 d2, q6, q12
+    vraddhn.u16 d1, q5, q11
+    vraddhn.u16 d0, q4, q10
+.endm
+
+.macro pixman_composite_over_0565_8_0565_process_pixblock_tail
+    vrshr.u16   q14, q8, #8
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q12, q13, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q12, q13
+    vqadd.u8    q0, q0, q14
+    vqadd.u8    q1, q1, q15
+    /* 32bpp result is in {d0, d1, d2, XX} */
+    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
+    fetch_mask_pixblock
+    pixman_composite_over_0565_8_0565_process_pixblock_tail
+    fetch_src_pixblock
+    vld1.16     {d10, d11}, [DST_R, :128]!
+    cache_preload 8, 8
+    pixman_composite_over_0565_8_0565_process_pixblock_head
+    vst1.16     {d28, d29}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+    pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_0565_8_0565_process_pixblock_head, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10, /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_0565_n_0565_init
+    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
+    vpush       {d8-d15}
+    vld1.32     {d15[0]}, [DUMMY]
+    vdup.8      d15, d15[3]
+.endm
+
+.macro pixman_composite_over_0565_n_0565_cleanup
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_0565_n_0565_init, \
+    pixman_composite_over_0565_n_0565_cleanup, \
+    pixman_composite_over_0565_8_0565_process_pixblock_head, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10, /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_0565_8_0565_process_pixblock_head
+    /* mask is in d15 */
+    convert_0565_to_x888 q4, d2, d1, d0
+    convert_0565_to_x888 q5, d6, d5, d4
+    /* source pixel data is in      {d0, d1, d2, XX} */
+    /* destination pixel data is in {d4, d5, d6, XX} */
+    vmull.u8    q6, d15, d2
+    vmull.u8    q5, d15, d1
+    vmull.u8    q4, d15, d0
+    vrshr.u16   q12, q6, #8
+    vrshr.u16   q11, q5, #8
+    vrshr.u16   q10, q4, #8
+    vraddhn.u16 d2, q6, q12
+    vraddhn.u16 d1, q5, q11
+    vraddhn.u16 d0, q4, q10
+.endm
+
+.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
+    vqadd.u8    q0, q0, q2
+    vqadd.u8    q1, q1, q3
+    /* 32bpp result is in {d0, d1, d2, XX} */
+    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
+    fetch_mask_pixblock
+    pixman_composite_add_0565_8_0565_process_pixblock_tail
+    fetch_src_pixblock
+    vld1.16     {d10, d11}, [DST_R, :128]!
+    cache_preload 8, 8
+    pixman_composite_add_0565_8_0565_process_pixblock_head
+    vst1.16     {d28, d29}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+    pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_add_0565_8_0565_process_pixblock_head, \
+    pixman_composite_add_0565_8_0565_process_pixblock_tail, \
+    pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10, /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
+    /* mask is in d15 */
+    convert_0565_to_x888 q5, d6, d5, d4
+    /* destination pixel data is in {d4, d5, d6, xx} */
+    vmvn.8      d24, d15 /* get inverted alpha */
+    /* now do alpha blending */
+    vmull.u8    q8, d24, d4
+    vmull.u8    q9, d24, d5
+    vmull.u8    q10, d24, d6
+.endm
+
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
+    vrshr.u16   q14, q8, #8
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q12, q10, #8
+    vraddhn.u16 d0, q14, q8
+    vraddhn.u16 d1, q15, q9
+    vraddhn.u16 d2, q12, q10
+    /* 32bpp result is in {d0, d1, d2, XX} */
+    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
+    fetch_src_pixblock
+    pixman_composite_out_reverse_8_0565_process_pixblock_tail
+    vld1.16     {d10, d11}, [DST_R, :128]!
+    cache_preload 8, 8
+    pixman_composite_out_reverse_8_0565_process_pixblock_head
+    vst1.16     {d28, d29}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+    pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_out_reverse_8_0565_process_pixblock_head, \
+    pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
+    pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10, /* dst_r_basereg */ \
+    15, /* src_basereg   */ \
+    0   /* mask_basereg  */
+
+/******************************************************************************/
+
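+/*
+ * Nearest-neighbour scaling entry points.  These reuse the regular
+ * pixblock head/tail macros; the per-pixel source fetching along the
+ * scanline is done by the generate_composite_function_nearest_scanline
+ * wrapper from pixman-arm-neon-asm.h.
+ */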
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_0565_process_pixblock_head, \
+    pixman_composite_over_8888_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_8888_0565_process_pixblock_head, \
+    pixman_composite_src_8888_0565_process_pixblock_tail, \
+    pixman_composite_src_8888_0565_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0565_8888_process_pixblock_head, \
+    pixman_composite_src_0565_8888_process_pixblock_tail, \
+    pixman_composite_src_0565_8888_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_0565_8_0565_process_pixblock_head, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10, /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    15  /* mask_basereg  */
diff --git a/pixman/pixman/pixman-arm-neon.c b/pixman/pixman/pixman-arm-neon.c
index 7d6c83775..3e0c0d1c2 100644
--- a/pixman/pixman/pixman-arm-neon.c
+++ b/pixman/pixman/pixman-arm-neon.c
@@ -122,6 +122,11 @@ PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, SRC,
 PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 0565_8888, SRC,
                                         uint16_t, uint32_t)
 
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_0565,
+                                           OVER, uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 0565_8_0565,
+                                           OVER, uint16_t, uint16_t)
+
 void
 pixman_composite_src_n_8_asm_neon (int32_t   w,
                                    int32_t   h,
@@ -332,6 +337,12 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
     SIMPLE_NEAREST_FAST_PATH_PAD (SRC, r5g6b5, a8r8g8b8, neon_0565_8888),
     SIMPLE_NEAREST_FAST_PATH_PAD (SRC, b5g6r5, a8b8g8r8, neon_0565_8888),
 
+    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_8_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_8_0565),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, neon_0565_8_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, neon_0565_8_0565),
+
     { PIXMAN_OP_NONE },
 };
 
diff --git a/pixman/pixman/pixman-fast-path.c b/pixman/pixman/pixman-fast-path.c
index 4cb8321aa..92f030871 100644
--- a/pixman/pixman/pixman-fast-path.c
+++ b/pixman/pixman/pixman-fast-path.c
@@ -1,2227 +1,2228 @@
-/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
-/*
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007 Red Hat, Inc.
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of SuSE not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission.  SuSE makes no representations about the
- * suitability of this software for any purpose.  It is provided "as is"
- * without express or implied warranty.
- *
- * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
- * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
- * Author:  Keith Packard, SuSE, Inc.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include <string.h>
-#include <stdlib.h>
-#include "pixman-private.h"
-#include "pixman-combine32.h"
-#include "pixman-fast-path.h"
-
-static force_inline uint32_t
-fetch_24 (uint8_t *a)
-{
-    if (((unsigned long)a) & 1)
-    {
-#ifdef WORDS_BIGENDIAN
-        return (*a << 16) | (*(uint16_t *)(a + 1));
-#else
-        return *a | (*(uint16_t *)(a + 1) << 8);
-#endif
-    }
-    else
-    {
-#ifdef WORDS_BIGENDIAN
-        return (*(uint16_t *)a << 8) | *(a + 2);
-#else
-        return *(uint16_t *)a | (*(a + 2) << 16);
-#endif
-    }
-}
-
-static force_inline void
-store_24 (uint8_t *a,
-          uint32_t v)
-{
-    if (((unsigned long)a) & 1)
-    {
-#ifdef WORDS_BIGENDIAN
-        *a = (uint8_t) (v >> 16);
-        *(uint16_t *)(a + 1) = (uint16_t) (v);
-#else
-        *a = (uint8_t) (v);
-        *(uint16_t *)(a + 1) = (uint16_t) (v >> 8);
-#endif
-    }
-    else
-    {
-#ifdef WORDS_BIGENDIAN
-        *(uint16_t *)a = (uint16_t)(v >> 8);
-        *(a + 2) = (uint8_t)v;
-#else
-        *(uint16_t *)a = (uint16_t)v;
-        *(a + 2) = (uint8_t)(v >> 16);
-#endif
-    }
-}
-
-static force_inline uint32_t
-over (uint32_t src,
-      uint32_t dest)
-{
-    uint32_t a = ~src >> 24;
-
-    UN8x4_MUL_UN8_ADD_UN8x4 (dest, a, src);
-
-    return dest;
-}
-
-static uint32_t
-in (uint32_t x,
-    uint8_t  y)
-{
-    uint16_t a = y;
-
-    UN8x4_MUL_UN8 (x, a);
-
-    return x;
-}
-
-/*
- * Naming convention:
- *
- *  op_src_mask_dest
- */
-static void
-fast_composite_over_x888_8_8888 (pixman_implementation_t *imp,
-                                 pixman_op_t op,
-                                 pixman_image_t *src_image,
-                                 pixman_image_t *mask_image,
-                                 pixman_image_t *dst_image,
-                                 int32_t src_x,
-                                 int32_t src_y,
-                                 int32_t mask_x,
-                                 int32_t mask_y,
-                                 int32_t dest_x,
-                                 int32_t dest_y,
-                                 int32_t width,
-                                 int32_t height)
-{
-    uint32_t *src, *src_line;
-    uint32_t *dst, *dst_line;
-    uint8_t *mask, *mask_line;
-    int src_stride, mask_stride, dst_stride;
-    uint8_t m;
-    uint32_t s, d;
-    int32_t w;
-
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-    while (height--)
-    {
-        src = src_line;
-        src_line += src_stride;
-        dst = dst_line;
-        dst_line += dst_stride;
-        mask = mask_line;
-        mask_line += mask_stride;
-
-        w = width;
-        while (w--)
-        {
-            m = *mask++;
-            if (m)
-            {
-                s = *src | 0xff000000;
-
-                if (m == 0xff)
-                {
-                    *dst = s;
-                }
-                else
-                {
-                    d = in (s, m);
-                    *dst = over (d, *dst);
-                }
-            }
-            src++;
-            dst++;
-        }
-    }
-}
-
-static void
-fast_composite_in_n_8_8 (pixman_implementation_t *imp,
-                         pixman_op_t op,
-                         pixman_image_t *src_image,
-                         pixman_image_t *mask_image,
-                         pixman_image_t *dest_image,
-                         int32_t src_x,
-                         int32_t src_y,
-                         int32_t mask_x,
-                         int32_t mask_y,
-                         int32_t dest_x,
-                         int32_t dest_y,
-                         int32_t width,
-                         int32_t height)
-{
-    uint32_t src, srca;
-    uint8_t *dst_line, *dst;
-    uint8_t *mask_line, *mask, m;
-    int dst_stride, mask_stride;
-    int32_t w;
-    uint16_t t;
-
-    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
-
-    srca = src >> 24;
-
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
-    if (srca == 0xff)
-    {
-        while (height--)
-        {
-            dst = dst_line;
-            dst_line += dst_stride;
-            mask = mask_line;
-            mask_line += mask_stride;
-            w = width;
-
-            while (w--)
-            {
-                m = *mask++;
-
-                if (m == 0)
-                    *dst = 0;
-                else if (m != 0xff)
-                    *dst = MUL_UN8 (m, *dst, t);
-
-                dst++;
-            }
-        }
-    }
-    else
-    {
-        while (height--)
-        {
-            dst = dst_line;
-            dst_line += dst_stride;
-            mask = mask_line;
-            mask_line += mask_stride;
-            w = width;
-
-            while (w--)
-            {
-                m = *mask++;
-                m = MUL_UN8 (m, srca, t);
-
-                if (m == 0)
-                    *dst = 0;
-                else if (m != 0xff)
-                    *dst = MUL_UN8 (m, *dst, t);
-
-                dst++;
-            }
-        }
-    }
-}
-
-static void
-fast_composite_in_8_8 (pixman_implementation_t *imp,
-                       pixman_op_t op,
-                       pixman_image_t *src_image,
-                       pixman_image_t *mask_image,
-                       pixman_image_t *dest_image,
-                       int32_t src_x,
-                       int32_t src_y,
-                       int32_t mask_x,
-                       int32_t mask_y,
-                       int32_t dest_x,
-                       int32_t dest_y,
-                       int32_t width,
-                       int32_t height)
-{
-    uint8_t *dst_line, *dst;
-    uint8_t *src_line, *src;
-    int dst_stride, src_stride;
-    int32_t w;
-    uint8_t s;
-    uint16_t t;
-
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-
-    while (height--)
-    {
-        dst = dst_line;
-        dst_line += dst_stride;
-        src = src_line;
-        src_line += src_stride;
-        w = width;
-
-        while (w--)
-        {
-            s = *src++;
-
-            if (s == 0)
-                *dst = 0;
-            else if (s != 0xff)
-                *dst = MUL_UN8 (s, *dst, t);
-
-            dst++;
-        }
-    }
-}
-
-static void
-fast_composite_over_n_8_8888 (pixman_implementation_t *imp,
-                              pixman_op_t op,
-                              pixman_image_t *src_image,
-                              pixman_image_t *mask_image,
-                              pixman_image_t *dst_image,
-                              int32_t src_x,
-                              int32_t src_y,
-                              int32_t mask_x,
-                              int32_t mask_y,
-                              int32_t dest_x,
-                              int32_t dest_y,
-                              int32_t width,
-                              int32_t height)
-{
-    uint32_t src, srca;
-    uint32_t *dst_line, *dst, d;
-    uint8_t *mask_line, *mask, m;
-    int dst_stride, mask_stride;
-    int32_t w;
-
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
-    srca = src >> 24;
-    if (src == 0)
-        return;
-
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
-    while (height--)
-    {
-        dst = dst_line;
-        dst_line += dst_stride;
-        mask = mask_line;
-        mask_line += mask_stride;
-        w = width;
-
-        while (w--)
-        {
-            m = *mask++;
-            if (m == 0xff)
-            {
-                if (srca == 0xff)
-                    *dst = src;
-                else
-                    *dst = over (src, *dst);
-            }
-            else if (m)
-            {
-                d = in (src, m);
-                *dst = over (d, *dst);
-            }
-            dst++;
-        }
-    }
-}
-
-static void
-fast_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
-                                   pixman_op_t op,
-                                   pixman_image_t *src_image,
-                                   pixman_image_t *mask_image,
-                                   pixman_image_t *dst_image,
-                                   int32_t src_x,
-                                   int32_t src_y,
-                                   int32_t mask_x,
-                                   int32_t mask_y,
-                                   int32_t dest_x,
-                                   int32_t dest_y,
-                                   int32_t width,
-                                   int32_t height)
-{
-    uint32_t src, s;
-    uint32_t *dst_line, *dst, d;
-    uint32_t *mask_line, *mask, ma;
-    int dst_stride, mask_stride;
-    int32_t w;
-
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
-    if (src == 0)
-        return;
-
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
-    while (height--)
-    {
-        dst = dst_line;
-        dst_line += dst_stride;
-        mask = mask_line;
-        mask_line += mask_stride;
-        w = width;
-
-        while (w--)
-        {
-            ma = *mask++;
-
-            if (ma)
-            {
-                d = *dst;
-                s = src;
-
-                UN8x4_MUL_UN8x4_ADD_UN8x4 (s, ma, d);
-
-                *dst = s;
-            }
-
-            dst++;
-        }
-    }
-}
-
-static void
-fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
-                                    pixman_op_t op,
-                                    pixman_image_t *src_image,
-                                    pixman_image_t *mask_image,
-                                    pixman_image_t *dst_image,
-                                    int32_t src_x,
-                                    int32_t src_y,
-                                    int32_t mask_x,
-                                    int32_t mask_y,
-                                    int32_t dest_x,
-                                    int32_t dest_y,
-                                    int32_t width,
-                                    int32_t height)
-{
-    uint32_t src, srca, s;
-    uint32_t *dst_line, *dst, d;
-    uint32_t *mask_line, *mask, ma;
-    int dst_stride, mask_stride;
-    int32_t w;
-
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
-    srca = src >> 24;
-    if (src == 0)
-        return;
-
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
-    while (height--)
-    {
-        dst = dst_line;
-        dst_line += dst_stride;
-        mask = mask_line;
-        mask_line += mask_stride;
-        w = width;
-
-        while (w--)
-        {
-            ma = *mask++;
-            if (ma == 0xffffffff)
-            {
-                if (srca == 0xff)
-                    *dst = src;
-                else
-                    *dst = over (src, *dst);
-            }
-            else if (ma)
-            {
-                d = *dst;
-                s = src;
-
-                UN8x4_MUL_UN8x4 (s, ma);
-                UN8x4_MUL_UN8 (ma, srca);
-                ma = ~ma;
-                UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
-
-                *dst = d;
-            }
-
-            dst++;
-        }
-    }
-}
-
-static void
-fast_composite_over_n_8_0888 (pixman_implementation_t *imp,
-                              pixman_op_t op,
-                              pixman_image_t *src_image,
-                              pixman_image_t *mask_image,
-                              pixman_image_t *dst_image,
-                              int32_t src_x,
-                              int32_t src_y,
-                              int32_t mask_x,
-                              int32_t mask_y,
-                              int32_t dest_x,
-                              int32_t dest_y,
-                              int32_t width,
-                              int32_t height)
-{
-    uint32_t src, srca;
-    uint8_t *dst_line, *dst;
-    uint32_t d;
-    uint8_t *mask_line, *mask, m;
-    int dst_stride, mask_stride;
-    int32_t w;
-
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
-    srca = src >> 24;
-    if (src == 0)
-        return;
-
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
-    while (height--)
-    {
-        dst = dst_line;
-        dst_line += dst_stride;
-        mask = mask_line;
-        mask_line += mask_stride;
-        w = width;
-
-        while (w--)
-        {
-            m = *mask++;
-            if (m == 0xff)
-            {
-                if (srca == 0xff)
-                {
-                    d = src;
-                }
-                else
-                {
-                    d = fetch_24 (dst);
-                    d = over (src, d);
-                }
-                store_24 (dst, d);
-            }
-            else if (m)
-            {
-                d = over (in (src, m), fetch_24 (dst));
-                store_24 (dst, d);
-            }
-            dst += 3;
-        }
-    }
-}
-
-static void
-fast_composite_over_n_8_0565 (pixman_implementation_t *imp,
-                              pixman_op_t op,
-                              pixman_image_t *src_image,
-                              pixman_image_t *mask_image,
-                              pixman_image_t *dst_image,
-                              int32_t src_x,
-                              int32_t src_y,
-                              int32_t mask_x,
-                              int32_t mask_y,
-                              int32_t dest_x,
-                              int32_t dest_y,
-                              int32_t width,
-                              int32_t height)
-{
-    uint32_t src, srca;
-    uint16_t *dst_line, *dst;
-    uint32_t d;
-    uint8_t *mask_line, *mask, m;
-    int dst_stride, mask_stride;
-    int32_t w;
-
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
-    srca = src >> 24;
-    if (src == 0)
-        return;
-
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
-    while (height--)
-    {
-        dst = dst_line;
-        dst_line += dst_stride;
-        mask = mask_line;
-        mask_line += mask_stride;
-        w = width;
-
-        while (w--)
-        {
-            m = *mask++;
-            if (m == 0xff)
-            {
-                if (srca == 0xff)
-                {
-                    d = src;
-                }
-                else
-                {
-                    d = *dst;
-                    d = over (src, CONVERT_0565_TO_0888 (d));
-                }
-                *dst = CONVERT_8888_TO_0565 (d);
-            }
-            else if (m)
-            {
-                d = *dst;
-                d = over (in (src, m), CONVERT_0565_TO_0888 (d));
-                *dst = CONVERT_8888_TO_0565 (d);
-            }
-            dst++;
-        }
-    }
-}
-
-static void
-fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
-                                    pixman_op_t op,
-                                    pixman_image_t *src_image,
-                                    pixman_image_t *mask_image,
-                                    pixman_image_t *dst_image,
-                                    int32_t src_x,
-                                    int32_t src_y,
-                                    int32_t mask_x,
-                                    int32_t mask_y,
-                                    int32_t dest_x,
-                                    int32_t dest_y,
-                                    int32_t width,
-                                    int32_t height)
-{
-    uint32_t src, srca, s;
-    uint16_t src16;
-    uint16_t *dst_line, *dst;
-    uint32_t d;
-    uint32_t *mask_line, *mask, ma;
-    int dst_stride, mask_stride;
-    int32_t w;
-
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
-    srca = src >> 24;
-    if (src == 0)
-        return;
-
-    src16 = CONVERT_8888_TO_0565 (src);
-
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-
-    while (height--)
-    {
-        dst = dst_line;
-        dst_line += dst_stride;
-        mask = mask_line;
-        mask_line += mask_stride;
-        w = width;
-
-        while (w--)
-        {
-            ma = *mask++;
-            if (ma == 0xffffffff)
-            {
-                if (srca == 0xff)
-                {
-                    *dst = src16;
-                }
-                else
-                {
-                    d = *dst;
-                    d = over (src, CONVERT_0565_TO_0888 (d));
-                    *dst = CONVERT_8888_TO_0565 (d);
-                }
-            }
-            else if (ma)
-            {
-                d = *dst;
-                d = CONVERT_0565_TO_0888 (d);
-
-                s = src;
-
-                UN8x4_MUL_UN8x4 (s, ma);
-                UN8x4_MUL_UN8 (ma, srca);
-                ma = ~ma;
-                UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
-
-                *dst = CONVERT_8888_TO_0565 (d);
-            }
-            dst++;
-        }
-    }
-}
-
-static void
-fast_composite_over_8888_8888 (pixman_implementation_t *imp,
-                               pixman_op_t op,
-                               pixman_image_t *src_image,
-                               pixman_image_t *mask_image,
-                               pixman_image_t *dst_image,
-                               int32_t src_x,
-                               int32_t src_y,
-                               int32_t mask_x,
-                               int32_t mask_y,
-                               int32_t dest_x,
-                               int32_t dest_y,
-                               int32_t width,
-                               int32_t height)
-{
-    uint32_t *dst_line, *dst;
-    uint32_t *src_line, *src, s;
-    int dst_stride, src_stride;
-    uint8_t a;
-    int32_t w;
-
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-    while (height--)
-    {
-        dst = dst_line;
-        dst_line += dst_stride;
-        src = src_line;
-        src_line += src_stride;
-        w = width;
-
-        while (w--)
-        {
-            s = *src++;
-            a = s >> 24;
-            if (a == 0xff)
-                *dst = s;
-            else if (s)
-                *dst = over (s, *dst);
-            dst++;
-        }
-    }
-}
-
-static void
-fast_composite_src_x888_8888 (pixman_implementation_t *imp,
-                              pixman_op_t op,
-                              pixman_image_t *src_image,
-                              pixman_image_t *mask_image,
-                              pixman_image_t *dst_image,
-                              int32_t src_x,
-                              int32_t src_y,
-                              int32_t mask_x,
-                              int32_t mask_y,
-                              int32_t dest_x,
-                              int32_t dest_y,
-                              int32_t width,
-                              int32_t height)
-{
-    uint32_t *dst_line, *dst;
-    uint32_t *src_line, *src;
-    int dst_stride, src_stride;
-    int32_t w;
-
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-    while (height--)
-    {
-        dst = dst_line;
-        dst_line += dst_stride;
-        src = src_line;
-        src_line += src_stride;
-        w = width;
-
-        while (w--)
-            *dst++ = (*src++) | 0xff000000;
-    }
-}
-
-#if 0
-static void
-fast_composite_over_8888_0888 (pixman_implementation_t *imp,
-                               pixman_op_t op,
-                               pixman_image_t *src_image,
-                               pixman_image_t *mask_image,
-                               pixman_image_t *dst_image,
-                               int32_t src_x,
-                               int32_t src_y,
-                               int32_t mask_x,
-                               int32_t mask_y,
-                               int32_t dest_x,
-                               int32_t dest_y,
-                               int32_t width,
-                               int32_t height)
-{
-    uint8_t *dst_line, *dst;
-    uint32_t d;
-    uint32_t *src_line, *src, s;
-    uint8_t a;
-    int dst_stride, src_stride;
-    int32_t w;
-
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-    while (height--)
-    {
-        dst = dst_line;
-        dst_line += dst_stride;
-        src = src_line;
-        src_line += src_stride;
-        w = width;
-
-        while (w--)
-        {
-            s = *src++;
-            a = s >> 24;
-            if (a)
-            {
-                if (a == 0xff)
-                    d = s;
-                else
-                    d = over (s, fetch_24 (dst));
-
-                store_24 (dst, d);
-            }
-            dst += 3;
-        }
-    }
-}
-#endif
-
-static void
-fast_composite_over_8888_0565 (pixman_implementation_t *imp,
-                               pixman_op_t op,
-                               pixman_image_t *src_image,
-                               pixman_image_t *mask_image,
-                               pixman_image_t *dst_image,
-                               int32_t src_x,
-                               int32_t src_y,
-                               int32_t mask_x,
-                               int32_t mask_y,
-                               int32_t dest_x,
-                               int32_t dest_y,
-                               int32_t width,
-                               int32_t height)
-{
-    uint16_t *dst_line, *dst;
-    uint32_t d;
-    uint32_t *src_line, *src, s;
-    uint8_t a;
-    int dst_stride, src_stride;
-    int32_t w;
-
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-
-    while (height--)
-    {
-        dst = dst_line;
-        dst_line += dst_stride;
-        src = src_line;
-        src_line += src_stride;
-        w = width;
-
-        while (w--)
-        {
-            s = *src++;
-            a = s >> 24;
-            if (s)
-            {
-                if (a == 0xff)
-                {
-                    d = s;
-                }
-                else
-                {
-                    d = *dst;
-                    d = over (s, CONVERT_0565_TO_0888 (d));
-                }
-                *dst = CONVERT_8888_TO_0565 (d);
-            }
-            dst++;
-        }
-    }
-}
-
-static void
-fast_composite_src_x888_0565 (pixman_implementation_t *imp,
-                              pixman_op_t op,
-                              pixman_image_t *src_image,
-                              pixman_image_t *mask_image,
-                              pixman_image_t *dst_image,
-                              int32_t src_x,
-                              int32_t src_y,
-                              int32_t mask_x,
-                              int32_t mask_y,
-                              int32_t dest_x,
-                              int32_t dest_y,
-                              int32_t width,
-                              int32_t height)
-{
-    uint16_t *dst_line, *dst;
-    uint32_t *src_line, *src, s;
-    int dst_stride, src_stride;
-    int32_t w;
-
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-
-    while (height--)
-    {
-        dst = dst_line;
-        dst_line += dst_stride;
-        src = src_line;
-        src_line += src_stride;
-        w = width;
-
-        while (w--)
-        {
-            s = *src++;
-            *dst = CONVERT_8888_TO_0565 (s);
-            dst++;
-        }
-    }
-}
-
-static void
-fast_composite_add_8_8 (pixman_implementation_t *imp,
-                        pixman_op_t op,
-                        pixman_image_t *src_image,
-                        pixman_image_t *mask_image,
-                        pixman_image_t *dst_image,
-                        int32_t src_x,
-                        int32_t src_y,
-                        int32_t mask_x,
-                        int32_t mask_y,
-                        int32_t dest_x,
-                        int32_t dest_y,
-                        int32_t width,
-                        int32_t height)
-{
-    uint8_t *dst_line, *dst;
-    uint8_t *src_line, *src;
-    int dst_stride, src_stride;
-    int32_t w;
-    uint8_t s, d;
-    uint16_t t;
-
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-
-    while (height--)
-    {
-        dst = dst_line;
-        dst_line += dst_stride;
-        src = src_line;
-        src_line += src_stride;
-        w = width;
-
-        while (w--)
-        {
-            s = *src++;
-            if (s)
-            {
-                if (s != 0xff)
-                {
-                    d = *dst;
-                    t = d + s;
-                    s = t | (0 - (t >> 8));
-                }
-                *dst = s;
-            }
-            dst++;
-        }
-    }
-}
-
-static void
-fast_composite_add_8888_8888 (pixman_implementation_t *imp,
-                              pixman_op_t op,
-                              pixman_image_t *src_image,
-                              pixman_image_t *mask_image,
-                              pixman_image_t *dst_image,
-                              int32_t src_x,
-                              int32_t src_y,
-                              int32_t mask_x,
-                              int32_t mask_y,
-                              int32_t dest_x,
-                              int32_t dest_y,
-                              int32_t width,
-                              int32_t height)
-{
-    uint32_t *dst_line, *dst;
-    uint32_t *src_line, *src;
-    int dst_stride, src_stride;
-    int32_t w;
-    uint32_t s, d;
-
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-
-    while (height--)
-    {
-        dst = dst_line;
-        dst_line += dst_stride;
-        src = src_line;
-        src_line += src_stride;
-        w = width;
-
-        while (w--)
-        {
-            s = *src++;
-            if (s)
-            {
-                if (s != 0xffffffff)
-                {
-                    d = *dst;
-                    if (d)
-                        UN8x4_ADD_UN8x4 (s, d);
-                }
-                *dst = s;
-            }
-            dst++;
-        }
-    }
-}
-
-static void
-fast_composite_add_n_8_8 (pixman_implementation_t *imp,
-                          pixman_op_t op,
-                          pixman_image_t *src_image,
-                          pixman_image_t *mask_image,
-                          pixman_image_t *dst_image,
-                          int32_t src_x,
-                          int32_t src_y,
-                          int32_t mask_x,
-                          int32_t mask_y,
-                          int32_t dest_x,
-                          int32_t dest_y,
-                          int32_t width,
-                          int32_t height)
-{
-    uint8_t *dst_line, *dst;
-    uint8_t *mask_line, *mask;
-    int dst_stride, mask_stride;
-    int32_t w;
-    uint32_t src;
-    uint8_t sa;
-
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-    sa = (src >> 24);
-
-    while (height--)
-    {
-        dst = dst_line;
-        dst_line += dst_stride;
-        mask = mask_line;
-        mask_line += mask_stride;
-        w = width;
-
-        while (w--)
-        {
-            uint16_t tmp;
-            uint16_t a;
-            uint32_t m, d;
-            uint32_t r;
-
-            a = *mask++;
-            d = *dst;
-
-            m = MUL_UN8 (sa, a, tmp);
-            r = ADD_UN8 (m, d, tmp);
-
-            *dst++ = r;
-        }
-    }
-}
-
-#ifdef WORDS_BIGENDIAN
-#define CREATE_BITMASK(n) (0x80000000 >> (n))
-#define UPDATE_BITMASK(n) ((n) >> 1)
-#else
-#define CREATE_BITMASK(n) (1 << (n))
-#define UPDATE_BITMASK(n) ((n) << 1)
-#endif
-
-#define TEST_BIT(p, n)                                  \
-    (*((p) + ((n) >> 5)) & CREATE_BITMASK ((n) & 31))
-#define SET_BIT(p, n)                                                   \
-    do { *((p) + ((n) >> 5)) |= CREATE_BITMASK ((n) & 31); } while (0);
-
-static void
-fast_composite_add_1000_1000 (pixman_implementation_t *imp,
-                              pixman_op_t op,
-                              pixman_image_t *src_image,
-                              pixman_image_t *mask_image,
-                              pixman_image_t *dst_image,
-                              int32_t src_x,
-                              int32_t src_y,
-                              int32_t mask_x,
-                              int32_t mask_y,
-                              int32_t dest_x,
-                              int32_t dest_y,
-                              int32_t width,
-                              int32_t height)
-{
-    uint32_t *dst_line, *dst;
-    uint32_t *src_line, *src;
-    int dst_stride, src_stride;
-    int32_t w;
-
-    PIXMAN_IMAGE_GET_LINE (src_image, 0, src_y, uint32_t,
-                           src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (dst_image, 0, dest_y, uint32_t,
-                           dst_stride, dst_line, 1);
-
-    while (height--)
-    {
-        dst = dst_line;
-        dst_line += dst_stride;
-        src = src_line;
-        src_line += src_stride;
-        w = width;
-
-        while (w--)
-        {
-            /*
-             * TODO: improve performance by processing uint32_t data instead
-             * of individual bits
-             */
-            if (TEST_BIT (src, src_x + w))
-                SET_BIT (dst, dest_x + w);
-        }
-    }
-}
-
-static void
-fast_composite_over_n_1_8888 (pixman_implementation_t *imp,
-                              pixman_op_t op,
-                              pixman_image_t *src_image,
-                              pixman_image_t *mask_image,
-                              pixman_image_t *dst_image,
-                              int32_t src_x,
-                              int32_t src_y,
-                              int32_t mask_x,
-                              int32_t mask_y,
-                              int32_t dest_x,
-                              int32_t dest_y,
-                              int32_t width,
-                              int32_t height)
-{
-    uint32_t src, srca;
-    uint32_t *dst, *dst_line;
-    uint32_t *mask, *mask_line;
-    int mask_stride, dst_stride;
-    uint32_t bitcache, bitmask;
-    int32_t w;
-
-    if (width <= 0)
-        return;
-
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-    srca = src >> 24;
-    if (src == 0)
-        return;
-
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t,
-                           dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
-                           mask_stride, mask_line, 1);
-    mask_line += mask_x >> 5;
-
-    if (srca == 0xff)
-    {
-        while (height--)
-        {
-            dst = dst_line;
-            dst_line += dst_stride;
-            mask = mask_line;
-            mask_line += mask_stride;
-            w = width;
-
-            bitcache = *mask++;
-            bitmask = CREATE_BITMASK (mask_x & 31);
-
-            while (w--)
-            {
-                if (bitmask == 0)
-                {
-                    bitcache = *mask++;
-                    bitmask = CREATE_BITMASK (0);
-                }
-                if (bitcache & bitmask)
-                    *dst = src;
-                bitmask = UPDATE_BITMASK (bitmask);
-                dst++;
-            }
-        }
-    }
-    else
-    {
-        while (height--)
-        {
-            dst = dst_line;
-            dst_line += dst_stride;
-            mask = mask_line;
-            mask_line += mask_stride;
-            w = width;
-
-            bitcache = *mask++;
-            bitmask = CREATE_BITMASK (mask_x & 31);
-
-            while (w--)
-            {
-                if (bitmask == 0)
-                {
-                    bitcache = *mask++;
-                    bitmask = CREATE_BITMASK (0);
-                }
-                if (bitcache & bitmask)
-                    *dst = over (src, *dst);
-                bitmask = UPDATE_BITMASK (bitmask);
-                dst++;
-            }
-        }
-    }
-}
-
-static void
-fast_composite_over_n_1_0565 (pixman_implementation_t *imp,
-                              pixman_op_t op,
-                              pixman_image_t *src_image,
-                              pixman_image_t *mask_image,
-                              pixman_image_t *dst_image,
-                              int32_t src_x,
-                              int32_t src_y,
-                              int32_t mask_x,
-                              int32_t mask_y,
-                              int32_t dest_x,
-                              int32_t dest_y,
-                              int32_t width,
-                              int32_t height)
-{
-    uint32_t src, srca;
-    uint16_t *dst, *dst_line;
-    uint32_t *mask, *mask_line;
-    int mask_stride, dst_stride;
-    uint32_t bitcache, bitmask;
-    int32_t w;
-    uint32_t d;
-    uint16_t src565;
-
-    if (width <= 0)
-        return;
-
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-    srca = src >> 24;
-    if (src == 0)
-        return;
-
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t,
-                           dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
-                           mask_stride, mask_line, 1);
-    mask_line += mask_x >> 5;
-
-    if (srca == 0xff)
-    {
-        src565 = CONVERT_8888_TO_0565 (src);
-        while (height--)
-        {
-            dst = dst_line;
-            dst_line += dst_stride;
-            mask = mask_line;
-            mask_line += mask_stride;
-            w = width;
-
-            bitcache = *mask++;
-            bitmask = CREATE_BITMASK (mask_x & 31);
-
-            while (w--)
-            {
-                if (bitmask == 0)
-                {
-                    bitcache = *mask++;
-                    bitmask = CREATE_BITMASK (0);
-                }
-                if (bitcache & bitmask)
-                    *dst = src565;
-                bitmask = UPDATE_BITMASK (bitmask);
-                dst++;
-            }
-        }
-    }
-    else
-    {
-        while (height--)
-        {
-            dst = dst_line;
-            dst_line += dst_stride;
-            mask = mask_line;
-            mask_line += mask_stride;
-            w = width;
-
-            bitcache = *mask++;
-            bitmask = CREATE_BITMASK (mask_x & 31);
-
-            while (w--)
-            {
-                if (bitmask == 0)
-                {
-                    bitcache = *mask++;
-                    bitmask = CREATE_BITMASK (0);
-                }
-                if (bitcache & bitmask)
-                {
-                    d = over (src, CONVERT_0565_TO_0888 (*dst));
-                    *dst = CONVERT_8888_TO_0565 (d);
-                }
-                bitmask = UPDATE_BITMASK (bitmask);
-                dst++;
-            }
-        }
-    }
-}
-
-/*
- * Simple bitblt
- */
-
-static void
-fast_composite_solid_fill (pixman_implementation_t *imp,
-                           pixman_op_t op,
-                           pixman_image_t *src_image,
-                           pixman_image_t *mask_image,
-                           pixman_image_t *dst_image,
-                           int32_t src_x,
-                           int32_t src_y,
-                           int32_t mask_x,
-                           int32_t mask_y,
-                           int32_t dest_x,
-                           int32_t dest_y,
-                           int32_t width,
-                           int32_t height)
-{
-    uint32_t src;
-
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
-    if (dst_image->bits.format == PIXMAN_a1)
-    {
-        src = src >> 31;
-    }
-    else if (dst_image->bits.format == PIXMAN_a8)
-    {
-        src = src >> 24;
-    }
-    else if (dst_image->bits.format == PIXMAN_r5g6b5 ||
-             dst_image->bits.format == PIXMAN_b5g6r5)
-    {
-        src = CONVERT_8888_TO_0565 (src);
-    }
-
-    pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
-                 PIXMAN_FORMAT_BPP (dst_image->bits.format),
-                 dest_x, dest_y,
-                 width, height,
-                 src);
-}
-
-static void
-fast_composite_src_memcpy (pixman_implementation_t *imp,
-                           pixman_op_t op,
-                           pixman_image_t *src_image,
-                           pixman_image_t *mask_image,
-                           pixman_image_t *dst_image,
-                           int32_t src_x,
-                           int32_t src_y,
-                           int32_t mask_x,
-                           int32_t mask_y,
-                           int32_t dest_x,
-                           int32_t dest_y,
-                           int32_t width,
-                           int32_t height)
-{
-    int bpp = PIXMAN_FORMAT_BPP (dst_image->bits.format) / 8;
-    uint32_t n_bytes = width * bpp;
-    int dst_stride, src_stride;
-    uint8_t *dst;
-    uint8_t *src;
-
-    src_stride = src_image->bits.rowstride * 4;
-    dst_stride = dst_image->bits.rowstride * 4;
-
-    src = (uint8_t *)src_image->bits.bits + src_y * src_stride + src_x * bpp;
-    dst = (uint8_t *)dst_image->bits.bits + dest_y * dst_stride + dest_x * bpp;
-
-    while (height--)
-    {
-        memcpy (dst, src, n_bytes);
-
-        dst += dst_stride;
-        src += src_stride;
-    }
-}
-
-FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER)
-FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE)
-FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD)
-FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL)
-FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER)
-FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE)
-FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD)
-FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL)
-FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER)
-FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE)
-FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD)
-FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL)
-FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL)
-FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER)
-FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE)
-FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD)
-FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL)
-
-/* Use more unrolling for src_0565_0565 because it is typically CPU bound */
-static force_inline void
-scaled_nearest_scanline_565_565_SRC (uint16_t *dst,
-                                     uint16_t *src,
-                                     int32_t w,
-                                     pixman_fixed_t vx,
-                                     pixman_fixed_t unit_x,
-                                     pixman_fixed_t max_vx)
-{
-    uint16_t tmp1, tmp2, tmp3, tmp4;
-    while ((w -= 4) >= 0)
-    {
-        tmp1 = src[pixman_fixed_to_int (vx)];
-        vx += unit_x;
-        tmp2 = src[pixman_fixed_to_int (vx)];
-        vx += unit_x;
-        tmp3 = src[pixman_fixed_to_int (vx)];
-        vx += unit_x;
-        tmp4 = src[pixman_fixed_to_int (vx)];
-        vx += unit_x;
-        *dst++ = tmp1;
-        *dst++ = tmp2;
-        *dst++ = tmp3;
-        *dst++ = tmp4;
-    }
-    if (w & 2)
-    {
-        tmp1 = src[pixman_fixed_to_int (vx)];
-        vx += unit_x;
-        tmp2 = src[pixman_fixed_to_int (vx)];
-        vx += unit_x;
-        *dst++ = tmp1;
-        *dst++ = tmp2;
-    }
-    if (w & 1)
-        *dst++ = src[pixman_fixed_to_int (vx)];
-}
-
-FAST_NEAREST_MAINLOOP (565_565_cover_SRC,
-                       scaled_nearest_scanline_565_565_SRC,
-                       uint16_t, uint16_t, COVER)
-FAST_NEAREST_MAINLOOP (565_565_none_SRC,
-                       scaled_nearest_scanline_565_565_SRC,
-                       uint16_t, uint16_t, NONE)
-FAST_NEAREST_MAINLOOP (565_565_pad_SRC,
-                       scaled_nearest_scanline_565_565_SRC,
-                       uint16_t, uint16_t, PAD)
-
-static force_inline uint32_t
-fetch_nearest (pixman_repeat_t src_repeat,
-               pixman_format_code_t format,
-               uint32_t *src, int x, int src_width)
-{
-    if (repeat (src_repeat, &x, src_width))
-    {
-        if (format == PIXMAN_x8r8g8b8)
-            return *(src + x) | 0xff000000;
-        else
-            return *(src + x);
-    }
-    else
-    {
-        return 0;
-    }
-}
-
-static force_inline void
-combine_over (uint32_t s, uint32_t *dst)
-{
-    if (s)
-    {
-        uint8_t ia = 0xff - (s >> 24);
-
-        if (ia)
-            UN8x4_MUL_UN8_ADD_UN8x4 (*dst, ia, s);
-        else
-            *dst = s;
-    }
-}
-
-static force_inline void
-combine_src (uint32_t s, uint32_t *dst)
-{
-    *dst = s;
-}
-
-static void
-fast_composite_scaled_nearest (pixman_implementation_t *imp,
-                               pixman_op_t op,
-                               pixman_image_t *src_image,
-                               pixman_image_t *mask_image,
-                               pixman_image_t *dst_image,
-                               int32_t src_x,
-                               int32_t src_y,
-                               int32_t mask_x,
-                               int32_t mask_y,
-                               int32_t dest_x,
-                               int32_t dest_y,
-                               int32_t width,
-                               int32_t height)
-{
-    uint32_t *dst_line;
-    uint32_t *src_line;
-    int dst_stride, src_stride;
-    int src_width, src_height;
-    pixman_repeat_t src_repeat;
-    pixman_fixed_t unit_x, unit_y;
-    pixman_format_code_t src_format;
-    pixman_vector_t v;
-    pixman_fixed_t vy;
-
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be
-     * transformed from destination space to source space
-     */
-    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, uint32_t, src_stride, src_line, 1);
-
-    /* reference point is the center of the pixel */
-    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    if (!pixman_transform_point_3d (src_image->common.transform, &v))
-        return;
-
-    unit_x = src_image->common.transform->matrix[0][0];
-    unit_y = src_image->common.transform->matrix[1][1];
-
-    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */
-    v.vector[0] -= pixman_fixed_e;
-    v.vector[1] -= pixman_fixed_e;
-
-    src_height = src_image->bits.height;
-    src_width = src_image->bits.width;
-    src_repeat = src_image->common.repeat;
-    src_format = src_image->bits.format;
-
-    vy = v.vector[1];
-    while (height--)
-    {
-        pixman_fixed_t vx = v.vector[0];
-        int y = pixman_fixed_to_int (vy);
-        uint32_t *dst = dst_line;
-
-        dst_line += dst_stride;
-
-        /* adjust the y location by a unit vector in the y direction
-         * this is equivalent to transforming y+1 of the destination point to source space */
-        vy += unit_y;
-
-        if (!repeat (src_repeat, &y, src_height))
-        {
-            if (op == PIXMAN_OP_SRC)
-                memset (dst, 0, sizeof (*dst) * width);
-        }
-        else
-        {
-            int w = width;
-
-            uint32_t *src = src_line + y * src_stride;
-
-            while (w >= 2)
-            {
-                uint32_t s1, s2;
-                int x1, x2;
-
-                x1 = pixman_fixed_to_int (vx);
-                vx += unit_x;
-
-                x2 = pixman_fixed_to_int (vx);
-                vx += unit_x;
-
-                w -= 2;
-
-                s1 = fetch_nearest (src_repeat, src_format, src, x1, src_width);
-                s2 = fetch_nearest (src_repeat, src_format, src, x2, src_width);
-
-                if (op == PIXMAN_OP_OVER)
-                {
-                    combine_over (s1, dst++);
-                    combine_over (s2, dst++);
-                }
-                else
-                {
-                    combine_src (s1, dst++);
-                    combine_src (s2, dst++);
-                }
-            }
-
-            while (w--)
-            {
-                uint32_t s;
-                int x;
-
-                x = pixman_fixed_to_int (vx);
-                vx += unit_x;
-
-                s = fetch_nearest (src_repeat, src_format, src, x, src_width);
-
-                if (op == PIXMAN_OP_OVER)
-                    combine_over (s, dst++);
-                else
-                    combine_src (s, dst++);
-            }
-        }
-    }
-}
-
-#define CACHE_LINE_SIZE 64
-
-#define FAST_SIMPLE_ROTATE(suffix, pix_type) \
- \
-static void \
-blt_rotated_90_trivial_##suffix (pix_type *dst, \
-                                 int dst_stride, \
-                                 const pix_type *src, \
-                                 int src_stride, \
-                                 int w, \
-                                 int h) \
-{ \
-    int x, y; \
-    for (y = 0; y < h; y++) \
-    { \
-        const pix_type *s = src + (h - y - 1); \
-        pix_type *d = dst + dst_stride * y; \
-        for (x = 0; x < w; x++) \
-        { \
-            *d++ = *s; \
-            s += src_stride; \
-        } \
-    } \
-} \
- \
-static void \
-blt_rotated_270_trivial_##suffix (pix_type *dst, \
-                                  int dst_stride, \
-                                  const pix_type *src, \
-                                  int src_stride, \
-                                  int w, \
-                                  int h) \
-{ \
-    int x, y; \
-    for (y = 0; y < h; y++) \
-    { \
-        const pix_type *s = src + src_stride * (w - 1) + y; \
-        pix_type *d = dst + dst_stride * y; \
-        for (x = 0; x < w; x++) \
-        { \
-            *d++ = *s; \
-            s -= src_stride; \
-        } \
-    } \
-} \
- \
-static void \
-blt_rotated_90_##suffix (pix_type *dst, \
-                         int dst_stride, \
-                         const pix_type *src, \
-                         int src_stride, \
-                         int W, \
-                         int H) \
-{ \
-    int x; \
-    int leading_pixels = 0, trailing_pixels = 0; \
-    const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type); \
- \
-    /* \
-     * split processing into handling destination as TILE_SIZExH cache line \
-     * aligned vertical stripes (optimistically assuming that destination \
-     * stride is a multiple of cache line, if not - it will be just a bit \
-     * slower) \
-     */ \
- \
-    if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1)) \
-    { \
-        leading_pixels = TILE_SIZE - (((uintptr_t)dst & \
-                            (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
-        if (leading_pixels > W) \
-            leading_pixels = W; \
- \
-        /* unaligned leading part NxH (where N < TILE_SIZE) */ \
-        blt_rotated_90_trivial_##suffix ( \
-            dst, \
-            dst_stride, \
-            src, \
-            src_stride, \
-            leading_pixels, \
-            H); \
- \
-        dst += leading_pixels; \
-        src += leading_pixels * src_stride; \
-        W -= leading_pixels; \
-    } \
- \
-    if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1)) \
-    { \
-        trailing_pixels = (((uintptr_t)(dst + W) & \
-                            (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
-        if (trailing_pixels > W) \
-            trailing_pixels = W; \
-        W -= trailing_pixels; \
-    } \
- \
-    for (x = 0; x < W; x += TILE_SIZE) \
-    { \
-        /* aligned middle part TILE_SIZExH */ \
-        blt_rotated_90_trivial_##suffix ( \
-            dst + x, \
-            dst_stride, \
-            src + src_stride * x, \
-            src_stride, \
-            TILE_SIZE, \
-            H); \
-    } \
- \
-    if (trailing_pixels) \
-    { \
-        /* unaligned trailing part NxH (where N < TILE_SIZE) */ \
-        blt_rotated_90_trivial_##suffix ( \
-            dst + W, \
-            dst_stride, \
-            src + W * src_stride, \
-            src_stride, \
-            trailing_pixels, \
-            H); \
-    } \
-} \
- \
-static void \
-blt_rotated_270_##suffix (pix_type *dst, \
-                          int dst_stride, \
-                          const pix_type *src, \
-                          int src_stride, \
-                          int W, \
-                          int H) \
-{ \
-    int x; \
-    int leading_pixels = 0, trailing_pixels = 0; \
-    const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type); \
- \
-    /* \
-     * split processing into handling destination as TILE_SIZExH cache line \
-     * aligned vertical stripes (optimistically assuming that destination \
-     * stride is a multiple of cache line, if not - it will be just a bit \
-     * slower) \
-     */ \
- \
-    if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1)) \
-    { \
-        leading_pixels = TILE_SIZE - (((uintptr_t)dst & \
-                            (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
-        if (leading_pixels > W) \
-            leading_pixels = W; \
- \
-        /* unaligned leading part NxH (where N < TILE_SIZE) */ \
-        blt_rotated_270_trivial_##suffix ( \
-            dst, \
-            dst_stride, \
-            src + src_stride * (W - leading_pixels), \
-            src_stride, \
-            leading_pixels, \
-            H); \
- \
-        dst += leading_pixels; \
-        W -= leading_pixels; \
-    } \
- \
-    if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1)) \
-    { \
-        trailing_pixels = (((uintptr_t)(dst + W) & \
-                            (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
-        if (trailing_pixels > W) \
-            trailing_pixels = W; \
-        W -= trailing_pixels; \
-        src += trailing_pixels * src_stride; \
-    } \
- \
-    for (x = 0; x < W; x += TILE_SIZE) \
-    { \
-        /* aligned middle part TILE_SIZExH */ \
-        blt_rotated_270_trivial_##suffix ( \
-            dst + x, \
-            dst_stride, \
-            src + src_stride * (W - x - TILE_SIZE), \
-            src_stride, \
-            TILE_SIZE, \
-            H); \
-    } \
- \
-    if (trailing_pixels) \
-    { \
-        /* unaligned trailing part NxH (where N < TILE_SIZE) */ \
-        blt_rotated_270_trivial_##suffix ( \
-            dst + W, \
-            dst_stride, \
-            src - trailing_pixels * src_stride, \
-            src_stride, \
-            trailing_pixels, \
-            H); \
-    } \
-} \
- \
-static void \
-fast_composite_rotate_90_##suffix (pixman_implementation_t *imp, \
-                                   pixman_op_t op, \
-                                   pixman_image_t *src_image, \
-                                   pixman_image_t *mask_image, \
-                                   pixman_image_t *dst_image, \
-                                   int32_t src_x, \
-                                   int32_t src_y, \
-                                   int32_t mask_x, \
-                                   int32_t mask_y, \
-                                   int32_t dest_x, \
-                                   int32_t dest_y, \
-                                   int32_t width, \
-                                   int32_t height) \
-{ \
-    pix_type *dst_line; \
-    pix_type *src_line; \
-    int dst_stride, src_stride; \
-    int src_x_t, src_y_t; \
- \
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, pix_type, \
-                           dst_stride, dst_line, 1); \
-    src_x_t = -src_y + pixman_fixed_to_int ( \
-                            src_image->common.transform->matrix[0][2] + \
-                            pixman_fixed_1 / 2 - pixman_fixed_e) - height; \
-    src_y_t = src_x + pixman_fixed_to_int ( \
-                          src_image->common.transform->matrix[1][2] + \
-                          pixman_fixed_1 / 2 - pixman_fixed_e); \
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type, \
-                           src_stride, src_line, 1); \
-    blt_rotated_90_##suffix (dst_line, dst_stride, src_line, src_stride, \
-                             width, height); \
-} \
- \
-static void \
-fast_composite_rotate_270_##suffix (pixman_implementation_t *imp, \
-                                    pixman_op_t op, \
-                                    pixman_image_t *src_image, \
-                                    pixman_image_t *mask_image, \
-                                    pixman_image_t *dst_image, \
-                                    int32_t src_x, \
-                                    int32_t src_y, \
-                                    int32_t mask_x, \
-                                    int32_t mask_y, \
-                                    int32_t dest_x, \
-                                    int32_t dest_y, \
-                                    int32_t width, \
-                                    int32_t height) \
-{ \
-    pix_type *dst_line; \
-    pix_type *src_line; \
-    int dst_stride, src_stride; \
-    int src_x_t, src_y_t; \
- \
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, pix_type, \
-                           dst_stride, dst_line, 1); \
-    src_x_t = src_y + pixman_fixed_to_int ( \
-                          src_image->common.transform->matrix[0][2] + \
-                          pixman_fixed_1 / 2 - pixman_fixed_e); \
-    src_y_t = -src_x + pixman_fixed_to_int ( \
src_image->common.transform->matrix[1][2] + \ - pixman_fixed_1 / 2 - pixman_fixed_e) - width; \ - PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type, \ - src_stride, src_line, 1); \ - blt_rotated_270_##suffix (dst_line, dst_stride, src_line, src_stride, \ - width, height); \ -} - -FAST_SIMPLE_ROTATE (8, uint8_t) -FAST_SIMPLE_ROTATE (565, uint16_t) -FAST_SIMPLE_ROTATE (8888, uint32_t) - -static const pixman_fast_path_t c_fast_paths[] = -{ - PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, fast_composite_over_n_8_0565), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, fast_composite_over_n_8_0565), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, fast_composite_over_n_8_0888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, fast_composite_over_n_8_0888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, fast_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, fast_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, fast_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, fast_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, fast_composite_over_n_1_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, fast_composite_over_n_1_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, fast_composite_over_n_1_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, fast_composite_over_n_1_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5, fast_composite_over_n_1_0565), - PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5, fast_composite_over_n_1_0565), - PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, fast_composite_over_n_8888_8888_ca), - PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, fast_composite_over_n_8888_8888_ca), - PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, fast_composite_over_n_8888_0565_ca), - PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, fast_composite_over_n_8888_8888_ca), - PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, fast_composite_over_n_8888_8888_ca), - PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, fast_composite_over_n_8888_0565_ca), - PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, fast_composite_over_x888_8_8888), - PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, fast_composite_over_x888_8_8888), - PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, fast_composite_over_x888_8_8888), - PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, fast_composite_over_x888_8_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, fast_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, fast_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, fast_composite_over_8888_0565), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, fast_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, fast_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565), - PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888), - PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888), - PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8), - PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1000_1000), - PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca), - PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8), - PIXMAN_STD_FAST_PATH (SRC, 
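/* Illustrative sketch (not part of this patch) of how a table like
 * c_fast_paths is consulted: each entry pins an operator plus source, mask
 * and destination formats, and a set of flags the images must satisfy; the
 * first matching entry wins and the PIXMAN_OP_NONE sentinel terminates the
 * scan. Types and field names below are hypothetical, not pixman's
 * internals.
 */
typedef struct
{
    int op;
    int src_format, src_flags;
    int mask_format, mask_flags;
    int dest_format, dest_flags;
    void (*func) (void);
} path_sketch_t;

static const path_sketch_t *
lookup_path_sketch (const path_sketch_t *table, int op,
                    int src, int src_fl, int mask, int mask_fl,
                    int dest, int dest_fl)
{
    for (; table->op != 0 /* PIXMAN_OP_NONE */; table++)
    {
        /* an entry matches when its required flags are a subset of the
         * flags actually computed for the images */
        if (table->op == op &&
            table->src_format == src && (table->src_flags & ~src_fl) == 0 &&
            table->mask_format == mask && (table->mask_flags & ~mask_fl) == 0 &&
            table->dest_format == dest && (table->dest_flags & ~dest_fl) == 0)
            return table;
    }
    return 0; /* no fast path; fall back to the general implementation */
}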
solid, null, a8r8g8b8, fast_composite_solid_fill), - PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, fast_composite_solid_fill), - PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, fast_composite_solid_fill), - PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, fast_composite_solid_fill), - PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, fast_composite_solid_fill), - PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, fast_composite_solid_fill), - PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, fast_composite_solid_fill), - PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, fast_composite_src_x888_8888), - PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, fast_composite_src_x888_8888), - PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8x8, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8a8, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565), - PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565), - PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565), - PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565), - PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8), - PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8), - - SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888_8888), - SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888_8888), - SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, 8888_8888), - SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, 8888_8888), - - SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888_8888), - SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, 8888_8888), - - SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, 8888_565), - SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, 8888_565), - - SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565), - - SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, 8888_8888), - SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, 8888_8888), - SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, 8888_8888), - SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, 8888_8888), - - SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565), - -#define NEAREST_FAST_PATH(op,s,d) \ - { PIXMAN_OP_ ## op, \ - PIXMAN_ ## s, SCALED_NEAREST_FLAGS, \ - PIXMAN_null, 0, \ - PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ - 
fast_composite_scaled_nearest, \ - } - - NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8), - NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8), - NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8), - NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8), - - NEAREST_FAST_PATH (SRC, x8r8g8b8, a8r8g8b8), - NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8), - NEAREST_FAST_PATH (SRC, x8b8g8r8, a8b8g8r8), - NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8), - - NEAREST_FAST_PATH (OVER, x8r8g8b8, x8r8g8b8), - NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8), - NEAREST_FAST_PATH (OVER, x8b8g8r8, x8b8g8r8), - NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8), - - NEAREST_FAST_PATH (OVER, x8r8g8b8, a8r8g8b8), - NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8), - NEAREST_FAST_PATH (OVER, x8b8g8r8, a8b8g8r8), - NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8), - -#define SIMPLE_ROTATE_FLAGS(angle) \ - (FAST_PATH_ROTATE_ ## angle ## _TRANSFORM | \ - FAST_PATH_NEAREST_FILTER | \ - FAST_PATH_SAMPLES_COVER_CLIP | \ - FAST_PATH_STANDARD_FLAGS) - -#define SIMPLE_ROTATE_FAST_PATH(op,s,d,suffix) \ - { PIXMAN_OP_ ## op, \ - PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (90), \ - PIXMAN_null, 0, \ - PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ - fast_composite_rotate_90_##suffix, \ - }, \ - { PIXMAN_OP_ ## op, \ - PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (270), \ - PIXMAN_null, 0, \ - PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ - fast_composite_rotate_270_##suffix, \ - } - - SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888), - SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888), - SIMPLE_ROTATE_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888), - SIMPLE_ROTATE_FAST_PATH (SRC, r5g6b5, r5g6b5, 565), - SIMPLE_ROTATE_FAST_PATH (SRC, a8, a8, 8), - - { PIXMAN_OP_NONE }, -}; - -#ifdef WORDS_BIGENDIAN -#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (32 - (offs) - (n))) -#else -#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (offs)) -#endif - -static force_inline void -pixman_fill1_line (uint32_t *dst, int offs, int width, int v) -{ - if (offs) - { - int leading_pixels = 32 - offs; - if (leading_pixels >= width) - { - if (v) - *dst |= A1_FILL_MASK (width, offs); - else - *dst &= ~A1_FILL_MASK (width, offs); - return; - } - else - { - if (v) - *dst++ |= A1_FILL_MASK (leading_pixels, offs); - else - *dst++ &= ~A1_FILL_MASK (leading_pixels, offs); - width -= leading_pixels; - } - } - while (width >= 32) - { - if (v) - *dst++ = 0xFFFFFFFF; - else - *dst++ = 0; - width -= 32; - } - if (width > 0) - { - if (v) - *dst |= A1_FILL_MASK (width, 0); - else - *dst &= ~A1_FILL_MASK (width, 0); - } -} - -static void -pixman_fill1 (uint32_t *bits, - int stride, - int x, - int y, - int width, - int height, - uint32_t xor) -{ - uint32_t *dst = bits + y * stride + (x >> 5); - int offs = x & 31; - - if (xor & 1) - { - while (height--) - { - pixman_fill1_line (dst, offs, width, 1); - dst += stride; - } - } - else - { - while (height--) - { - pixman_fill1_line (dst, offs, width, 0); - dst += stride; - } - } -} - -static void -pixman_fill8 (uint32_t *bits, - int stride, - int x, - int y, - int width, - int height, - uint32_t xor) -{ - int byte_stride = stride * (int) sizeof (uint32_t); - uint8_t *dst = (uint8_t *) bits; - uint8_t v = xor & 0xff; - int i; - - dst = dst + y * byte_stride + x; - - while (height--) - { - for (i = 0; i < width; ++i) - dst[i] = v; - - dst += byte_stride; - } -} - -static void -pixman_fill16 (uint32_t *bits, - int stride, - int x, - int y, - int width, - int height, - uint32_t xor) -{ - int short_stride = - (stride * (int)sizeof (uint32_t)) / (int)sizeof (uint16_t); - uint16_t *dst = 
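/* Illustrative sketch (not part of this patch): the mask construction that
 * pixman_fill1_line relies on, shown for the little-endian branch of
 * A1_FILL_MASK. ((1 << n) - 1) is n consecutive one bits; shifting by offs
 * moves them to bit positions offs..offs+n-1, so a partial word of an a1
 * image can be set or cleared with a single OR or AND-NOT. Assumes
 * 0 < n < 32, which holds because full words take the 32-pixel loop.
 */
#include <stdint.h>

static uint32_t
a1_fill_mask_sketch (int n, int offs)
{
    return ((1u << n) - 1) << offs;
}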
(uint16_t *)bits; - uint16_t v = xor & 0xffff; - int i; - - dst = dst + y * short_stride + x; - - while (height--) - { - for (i = 0; i < width; ++i) - dst[i] = v; - - dst += short_stride; - } -} - -static void -pixman_fill32 (uint32_t *bits, - int stride, - int x, - int y, - int width, - int height, - uint32_t xor) -{ - int i; - - bits = bits + y * stride + x; - - while (height--) - { - for (i = 0; i < width; ++i) - bits[i] = xor; - - bits += stride; - } -} - -static pixman_bool_t -fast_path_fill (pixman_implementation_t *imp, - uint32_t * bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t xor) -{ - switch (bpp) - { - case 1: - pixman_fill1 (bits, stride, x, y, width, height, xor); - break; - - case 8: - pixman_fill8 (bits, stride, x, y, width, height, xor); - break; - - case 16: - pixman_fill16 (bits, stride, x, y, width, height, xor); - break; - - case 32: - pixman_fill32 (bits, stride, x, y, width, height, xor); - break; - - default: - return _pixman_implementation_fill ( - imp->delegate, bits, stride, bpp, x, y, width, height, xor); - break; - } - - return TRUE; -} - -pixman_implementation_t * -_pixman_implementation_create_fast_path (pixman_implementation_t *fallback) -{ - pixman_implementation_t *imp = _pixman_implementation_create (fallback, c_fast_paths); - - imp->fill = fast_path_fill; - - return imp; -} +/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ +/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of SuSE not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. SuSE makes no representations about the + * suitability of this software for any purpose. It is provided "as is" + * without express or implied warranty. + * + * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE + * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Author: Keith Packard, SuSE, Inc. 
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <string.h>
+#include <stdlib.h>
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include "pixman-fast-path.h"
+
+static force_inline uint32_t
+fetch_24 (uint8_t *a)
+{
+    if (((unsigned long)a) & 1)
+    {
+#ifdef WORDS_BIGENDIAN
+        return (*a << 16) | (*(uint16_t *)(a + 1));
+#else
+        return *a | (*(uint16_t *)(a + 1) << 8);
+#endif
+    }
+    else
+    {
+#ifdef WORDS_BIGENDIAN
+        return (*(uint16_t *)a << 8) | *(a + 2);
+#else
+        return *(uint16_t *)a | (*(a + 2) << 16);
+#endif
+    }
+}
+
+static force_inline void
+store_24 (uint8_t *a,
+          uint32_t v)
+{
+    if (((unsigned long)a) & 1)
+    {
+#ifdef WORDS_BIGENDIAN
+        *a = (uint8_t) (v >> 16);
+        *(uint16_t *)(a + 1) = (uint16_t) (v);
+#else
+        *a = (uint8_t) (v);
+        *(uint16_t *)(a + 1) = (uint16_t) (v >> 8);
+#endif
+    }
+    else
+    {
+#ifdef WORDS_BIGENDIAN
+        *(uint16_t *)a = (uint16_t)(v >> 8);
+        *(a + 2) = (uint8_t)v;
+#else
+        *(uint16_t *)a = (uint16_t)v;
+        *(a + 2) = (uint8_t)(v >> 16);
+#endif
+    }
+}
+
+static force_inline uint32_t
+over (uint32_t src,
+      uint32_t dest)
+{
+    uint32_t a = ~src >> 24;
+
+    UN8x4_MUL_UN8_ADD_UN8x4 (dest, a, src);
+
+    return dest;
+}
+
+static uint32_t
+in (uint32_t x,
+    uint8_t y)
+{
+    uint16_t a = y;
+
+    UN8x4_MUL_UN8 (x, a);
+
+    return x;
+}
+
+/*
+ * Naming convention:
+ *
+ *  op_src_mask_dest
+ */
+static void
+fast_composite_over_x888_8_8888 (pixman_implementation_t *imp,
+                                 pixman_op_t              op,
+                                 pixman_image_t *         src_image,
+                                 pixman_image_t *         mask_image,
+                                 pixman_image_t *         dst_image,
+                                 int32_t                  src_x,
+                                 int32_t                  src_y,
+                                 int32_t                  mask_x,
+                                 int32_t                  mask_y,
+                                 int32_t                  dest_x,
+                                 int32_t                  dest_y,
+                                 int32_t                  width,
+                                 int32_t                  height)
+{
+    uint32_t *src, *src_line;
+    uint32_t *dst, *dst_line;
+    uint8_t *mask, *mask_line;
+    int src_stride, mask_stride, dst_stride;
+    uint8_t m;
+    uint32_t s, d;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+        src = src_line;
+        src_line += src_stride;
+        dst = dst_line;
+        dst_line += dst_stride;
+        mask = mask_line;
+        mask_line += mask_stride;
+
+        w = width;
+        while (w--)
+        {
+            m = *mask++;
+            if (m)
+            {
+                s = *src | 0xff000000;
+
+                if (m == 0xff)
+                {
+                    *dst = s;
+                }
+                else
+                {
+                    d = in (s, m);
+                    *dst = over (d, *dst);
+                }
+            }
+            src++;
+            dst++;
+        }
+    }
+}
+
+static void
+fast_composite_in_n_8_8 (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         pixman_image_t *         src_image,
+                         pixman_image_t *         mask_image,
+                         pixman_image_t *         dest_image,
+                         int32_t                  src_x,
+                         int32_t                  src_y,
+                         int32_t                  mask_x,
+                         int32_t                  mask_y,
+                         int32_t                  dest_x,
+                         int32_t                  dest_y,
+                         int32_t                  width,
+                         int32_t                  height)
+{
+    uint32_t src, srca;
+    uint8_t *dst_line, *dst;
+    uint8_t *mask_line, *mask, m;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint16_t t;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    if (srca == 0xff)
+    {
+        while (height--)
+        {
+            dst = dst_line;
+            dst_line += dst_stride;
+            mask = mask_line;
+            mask_line += mask_stride;
+            w = width;
+
+            while (w--)
+            {
+                m = *mask++;
+
+                if (m == 0)
+                    *dst = 0;
+                else if (m != 0xff)
+                    *dst = MUL_UN8 (m, *dst, t);
+
+                dst++;
+            }
+        }
+    }
+    else
+    {
+        while (height--)
+        {
+            dst = 
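/* Illustrative sketch (not part of this patch): the per-channel math that
 * over() above performs via UN8x4_MUL_UN8_ADD_UN8x4, written out long-hand.
 * With premultiplied alpha, OVER is dest = src + dest * (255 - src_alpha) / 255
 * in every byte lane; adding 128 and then computing (t + (t >> 8)) >> 8 is
 * the standard exact-rounding substitute for dividing by 255.
 */
#include <stdint.h>

static uint32_t
over_sketch (uint32_t src, uint32_t dest)
{
    uint32_t ia = 255 - (src >> 24); /* inverse source alpha */
    uint32_t result = 0;
    int shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        uint32_t t = ((dest >> shift) & 0xff) * ia + 128;
        t = (t + (t >> 8)) >> 8;        /* dest_c * (1 - alpha) */
        t += (src >> shift) & 0xff;     /* + src_c (premultiplied) */
        if (t > 255)
            t = 255;                    /* clamp; only reachable for
                                           non-premultiplied input */
        result |= t << shift;
    }
    return result;
}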
dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + m = *mask++; + m = MUL_UN8 (m, srca, t); + + if (m == 0) + *dst = 0; + else if (m != 0xff) + *dst = MUL_UN8 (m, *dst, t); + + dst++; + } + } + } +} + +static void +fast_composite_in_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dest_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + uint8_t s; + uint16_t t; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + + if (s == 0) + *dst = 0; + else if (s != 0xff) + *dst = MUL_UN8 (s, *dst, t); + + dst++; + } + } +} + +static void +fast_composite_over_n_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint32_t *dst_line, *dst, d; + uint8_t *mask_line, *mask, m; + int dst_stride, mask_stride; + int32_t w; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + m = *mask++; + if (m == 0xff) + { + if (srca == 0xff) + *dst = src; + else + *dst = over (src, *dst); + } + else if (m) + { + d = in (src, m); + *dst = over (d, *dst); + } + dst++; + } + } +} + +static void +fast_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, s; + uint32_t *dst_line, *dst, d; + uint32_t *mask_line, *mask, ma; + int dst_stride, mask_stride; + int32_t w; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + ma = *mask++; + + if (ma) + { + d = *dst; + s = src; + + UN8x4_MUL_UN8x4_ADD_UN8x4 (s, ma, d); + + *dst = s; + } + + dst++; + } + } +} + +static void +fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + 
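/* Illustrative sketch (not part of this patch): the per-pixel composition
 * implemented by fast_composite_over_n_8_8888 above. An a8 mask value first
 * scales the solid source (the IN step), and the result is OVER-composited
 * into the destination; the m == 0 and m == 0xff branches in the real code
 * merely short-circuit this general formula. over_sketch() is the
 * hypothetical helper from the previous sketch.
 */
static uint32_t
over_n_8_sketch (uint32_t solid, uint8_t m, uint32_t dest)
{
    uint32_t s = 0;
    int shift;

    if (m == 0)
        return dest;

    for (shift = 0; shift < 32; shift += 8)
    {
        /* IN: scale each channel of the solid color by m / 255 */
        uint32_t t = ((solid >> shift) & 0xff) * m + 128;
        s |= (((t + (t >> 8)) >> 8) & 0xff) << shift;
    }
    return over_sketch (s, dest);
}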
int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca, s; + uint32_t *dst_line, *dst, d; + uint32_t *mask_line, *mask, ma; + int dst_stride, mask_stride; + int32_t w; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + ma = *mask++; + if (ma == 0xffffffff) + { + if (srca == 0xff) + *dst = src; + else + *dst = over (src, *dst); + } + else if (ma) + { + d = *dst; + s = src; + + UN8x4_MUL_UN8x4 (s, ma); + UN8x4_MUL_UN8 (ma, srca); + ma = ~ma; + UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s); + + *dst = d; + } + + dst++; + } + } +} + +static void +fast_composite_over_n_8_0888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint8_t *dst_line, *dst; + uint32_t d; + uint8_t *mask_line, *mask, m; + int dst_stride, mask_stride; + int32_t w; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + m = *mask++; + if (m == 0xff) + { + if (srca == 0xff) + { + d = src; + } + else + { + d = fetch_24 (dst); + d = over (src, d); + } + store_24 (dst, d); + } + else if (m) + { + d = over (in (src, m), fetch_24 (dst)); + store_24 (dst, d); + } + dst += 3; + } + } +} + +static void +fast_composite_over_n_8_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint16_t *dst_line, *dst; + uint32_t d; + uint8_t *mask_line, *mask, m; + int dst_stride, mask_stride; + int32_t w; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + m = *mask++; + if (m == 0xff) + { + if (srca == 0xff) + { + d = src; + } + else + { + d = *dst; + d = over (src, CONVERT_0565_TO_0888 (d)); + } + *dst = CONVERT_8888_TO_0565 (d); + } + else if (m) + { + d = *dst; + d = over (in (src, m), CONVERT_0565_TO_0888 (d)); + *dst = CONVERT_8888_TO_0565 (d); + } + dst++; + } + } +} + +static void +fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * 
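/* Illustrative sketch (not part of this patch): the component-alpha OVER
 * math from fast_composite_over_n_8888_8888_ca above, long-hand. Each
 * channel of the 32-bit mask scales the matching source channel, and the
 * effective per-channel alpha is mask_c * src_alpha, i.e.
 * dest_c = src_c * ma_c + dest_c * (1 - ma_c * srca) with all values in
 * [0, 1] -- which is what the UN8x4_* macro sequence computes.
 */
static uint32_t
over_ca_sketch (uint32_t src, uint32_t ma, uint32_t dest)
{
    uint32_t srca = src >> 24;
    uint32_t result = 0;
    int shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        uint32_t m = (ma >> shift) & 0xff;
        uint32_t s = (src >> shift) & 0xff;
        uint32_t d = (dest >> shift) & 0xff;
        uint32_t t;

        t = s * m + 128; s = (t + (t >> 8)) >> 8;          /* src_c * ma_c */
        t = m * srca + 128; t = (t + (t >> 8)) >> 8;       /* channel alpha */
        t = d * (255 - t) + 128; t = (t + (t >> 8)) >> 8;  /* dest_c * (1-a) */
        t += s;
        if (t > 255)
            t = 255;
        result |= t << shift;
    }
    return result;
}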
mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca, s; + uint16_t src16; + uint16_t *dst_line, *dst; + uint32_t d; + uint32_t *mask_line, *mask, ma; + int dst_stride, mask_stride; + int32_t w; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + src16 = CONVERT_8888_TO_0565 (src); + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + ma = *mask++; + if (ma == 0xffffffff) + { + if (srca == 0xff) + { + *dst = src16; + } + else + { + d = *dst; + d = over (src, CONVERT_0565_TO_0888 (d)); + *dst = CONVERT_8888_TO_0565 (d); + } + } + else if (ma) + { + d = *dst; + d = CONVERT_0565_TO_0888 (d); + + s = src; + + UN8x4_MUL_UN8x4 (s, ma); + UN8x4_MUL_UN8 (ma, srca); + ma = ~ma; + UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s); + + *dst = CONVERT_8888_TO_0565 (d); + } + dst++; + } + } +} + +static void +fast_composite_over_8888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + uint8_t a; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + a = s >> 24; + if (a == 0xff) + *dst = s; + else if (s) + *dst = over (s, *dst); + dst++; + } + } +} + +static void +fast_composite_src_x888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + *dst++ = (*src++) | 0xff000000; + } +} + +#if 0 +static void +fast_composite_over_8888_0888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint32_t d; + uint32_t *src_line, *src, s; + uint8_t a; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3); + PIXMAN_IMAGE_GET_LINE (src_image, 
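/* Illustrative sketch (not part of this patch): hypothetical stand-ins for
 * the CONVERT_8888_TO_0565 / CONVERT_0565_TO_0888 macros used by the r5g6b5
 * paths above. Narrowing keeps the top 5/6/5 bits of each channel; widening
 * replicates a channel's high bits into its low bits so 0 stays 0x00 and
 * the maximum value expands to 0xff.
 */
#include <stdint.h>

static uint16_t
to_0565_sketch (uint32_t p)
{
    return ((p >> 8) & 0xf800) |  /* top 5 bits of red -> bits 15..11 */
           ((p >> 5) & 0x07e0) |  /* top 6 bits of green -> bits 10..5 */
           ((p >> 3) & 0x001f);   /* top 5 bits of blue -> bits 4..0 */
}

static uint32_t
to_0888_sketch (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5) & 0x3f;
    uint32_t b = p & 0x1f;

    r = (r << 3) | (r >> 2); /* 5 -> 8 bits */
    g = (g << 2) | (g >> 4); /* 6 -> 8 bits */
    b = (b << 3) | (b >> 2);

    return (r << 16) | (g << 8) | b;
}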
src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + a = s >> 24; + if (a) + { + if (a == 0xff) + d = s; + else + d = over (s, fetch_24 (dst)); + + store_24 (dst, d); + } + dst += 3; + } + } +} +#endif + +static void +fast_composite_over_8888_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint16_t *dst_line, *dst; + uint32_t d; + uint32_t *src_line, *src, s; + uint8_t a; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + a = s >> 24; + if (s) + { + if (a == 0xff) + { + d = s; + } + else + { + d = *dst; + d = over (s, CONVERT_0565_TO_0888 (d)); + } + *dst = CONVERT_8888_TO_0565 (d); + } + dst++; + } + } +} + +static void +fast_composite_src_x888_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint16_t *dst_line, *dst; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + *dst = CONVERT_8888_TO_0565 (s); + dst++; + } + } +} + +static void +fast_composite_add_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + uint8_t s, d; + uint16_t t; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + if (s) + { + if (s != 0xff) + { + d = *dst; + t = d + s; + s = t | (0 - (t >> 8)); + } + *dst = s; + } + dst++; + } + } +} + +static void +fast_composite_add_8888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + uint32_t s, d; + + PIXMAN_IMAGE_GET_LINE 
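/* Illustrative sketch (not part of this patch): the branch-free clamp used
 * by fast_composite_add_8_8 above. After summing two 8-bit values into a
 * wider temporary, (t >> 8) is 1 exactly when the sum exceeded 255;
 * negating it yields either all-zero or all-one bits, so the OR saturates
 * the result at 0xff without a conditional.
 */
#include <stdint.h>

static uint8_t
add_saturate_u8_sketch (uint8_t a, uint8_t b)
{
    uint16_t t = (uint16_t) (a + b);

    return (uint8_t) (t | (0 - (t >> 8)));
}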
(src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + if (s) + { + if (s != 0xffffffff) + { + d = *dst; + if (d) + UN8x4_ADD_UN8x4 (s, d); + } + *dst = s; + } + dst++; + } + } +} + +static void +fast_composite_add_n_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + uint32_t src; + uint8_t sa; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + sa = (src >> 24); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + uint16_t tmp; + uint16_t a; + uint32_t m, d; + uint32_t r; + + a = *mask++; + d = *dst; + + m = MUL_UN8 (sa, a, tmp); + r = ADD_UN8 (m, d, tmp); + + *dst++ = r; + } + } +} + +#ifdef WORDS_BIGENDIAN +#define CREATE_BITMASK(n) (0x80000000 >> (n)) +#define UPDATE_BITMASK(n) ((n) >> 1) +#else +#define CREATE_BITMASK(n) (1 << (n)) +#define UPDATE_BITMASK(n) ((n) << 1) +#endif + +#define TEST_BIT(p, n) \ + (*((p) + ((n) >> 5)) & CREATE_BITMASK ((n) & 31)) +#define SET_BIT(p, n) \ + do { *((p) + ((n) >> 5)) |= CREATE_BITMASK ((n) & 31); } while (0); + +static void +fast_composite_add_1000_1000 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (src_image, 0, src_y, uint32_t, + src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, 0, dest_y, uint32_t, + dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + /* + * TODO: improve performance by processing uint32_t data instead + * of individual bits + */ + if (TEST_BIT (src, src_x + w)) + SET_BIT (dst, dest_x + w); + } + } +} + +static void +fast_composite_over_n_1_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint32_t *dst, *dst_line; + uint32_t *mask, *mask_line; + int mask_stride, dst_stride; + uint32_t bitcache, bitmask; + int32_t w; + + if (width <= 0) + return; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, + dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, 0, 
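/* Illustrative sketch (not part of this patch): the addressing behind the
 * TEST_BIT / SET_BIT macros above, for the little-endian case. Pixel n of
 * an a1 image lives in 32-bit word n / 32 at bit n % 32; big-endian targets
 * count bits from the most significant end instead, which is what the
 * WORDS_BIGENDIAN variant of CREATE_BITMASK handles.
 */
#include <stdint.h>

static int
test_bit_sketch (const uint32_t *bits, int n)
{
    return (bits[n >> 5] >> (n & 31)) & 1;
}

static void
set_bit_sketch (uint32_t *bits, int n)
{
    bits[n >> 5] |= (uint32_t) 1 << (n & 31);
}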
mask_y, uint32_t, + mask_stride, mask_line, 1); + mask_line += mask_x >> 5; + + if (srca == 0xff) + { + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + bitcache = *mask++; + bitmask = CREATE_BITMASK (mask_x & 31); + + while (w--) + { + if (bitmask == 0) + { + bitcache = *mask++; + bitmask = CREATE_BITMASK (0); + } + if (bitcache & bitmask) + *dst = src; + bitmask = UPDATE_BITMASK (bitmask); + dst++; + } + } + } + else + { + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + bitcache = *mask++; + bitmask = CREATE_BITMASK (mask_x & 31); + + while (w--) + { + if (bitmask == 0) + { + bitcache = *mask++; + bitmask = CREATE_BITMASK (0); + } + if (bitcache & bitmask) + *dst = over (src, *dst); + bitmask = UPDATE_BITMASK (bitmask); + dst++; + } + } + } +} + +static void +fast_composite_over_n_1_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint16_t *dst, *dst_line; + uint32_t *mask, *mask_line; + int mask_stride, dst_stride; + uint32_t bitcache, bitmask; + int32_t w; + uint32_t d; + uint16_t src565; + + if (width <= 0) + return; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, + dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t, + mask_stride, mask_line, 1); + mask_line += mask_x >> 5; + + if (srca == 0xff) + { + src565 = CONVERT_8888_TO_0565 (src); + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + bitcache = *mask++; + bitmask = CREATE_BITMASK (mask_x & 31); + + while (w--) + { + if (bitmask == 0) + { + bitcache = *mask++; + bitmask = CREATE_BITMASK (0); + } + if (bitcache & bitmask) + *dst = src565; + bitmask = UPDATE_BITMASK (bitmask); + dst++; + } + } + } + else + { + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + bitcache = *mask++; + bitmask = CREATE_BITMASK (mask_x & 31); + + while (w--) + { + if (bitmask == 0) + { + bitcache = *mask++; + bitmask = CREATE_BITMASK (0); + } + if (bitcache & bitmask) + { + d = over (src, CONVERT_0565_TO_0888 (*dst)); + *dst = CONVERT_8888_TO_0565 (d); + } + bitmask = UPDATE_BITMASK (bitmask); + dst++; + } + } + } +} + +/* + * Simple bitblt + */ + +static void +fast_composite_solid_fill (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + if (dst_image->bits.format == PIXMAN_a1) + { + src = src >> 31; + } + else if (dst_image->bits.format == PIXMAN_a8) + { + src = src >> 24; + } + else if (dst_image->bits.format == PIXMAN_r5g6b5 || + dst_image->bits.format == PIXMAN_b5g6r5) + { + src = CONVERT_8888_TO_0565 (src); + } + + pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride, + 
PIXMAN_FORMAT_BPP (dst_image->bits.format), + dest_x, dest_y, + width, height, + src); +} + +static void +fast_composite_src_memcpy (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + int bpp = PIXMAN_FORMAT_BPP (dst_image->bits.format) / 8; + uint32_t n_bytes = width * bpp; + int dst_stride, src_stride; + uint8_t *dst; + uint8_t *src; + + src_stride = src_image->bits.rowstride * 4; + dst_stride = dst_image->bits.rowstride * 4; + + src = (uint8_t *)src_image->bits.bits + src_y * src_stride + src_x * bpp; + dst = (uint8_t *)dst_image->bits.bits + dest_y * dst_stride + dest_x * bpp; + + while (height--) + { + memcpy (dst, src, n_bytes); + + dst += dst_stride; + src += src_stride; + } +} + +FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER) +FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE) +FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD) +FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL) +FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER) +FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE) +FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD) +FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL) +FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER) +FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE) +FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD) +FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL) +FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL) +FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER) +FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE) +FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD) +FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL) + +/* Use more unrolling for src_0565_0565 because it is typically CPU bound */ +static force_inline void +scaled_nearest_scanline_565_565_SRC (uint16_t * dst, + const uint16_t * src, + int32_t w, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx, + pixman_bool_t fully_transparent_src) +{ + uint16_t tmp1, tmp2, tmp3, tmp4; + while ((w -= 4) >= 0) + { + tmp1 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp2 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp3 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp4 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + *dst++ = tmp1; + *dst++ = tmp2; + *dst++ = tmp3; + *dst++ = tmp4; + } + if (w & 2) + { + tmp1 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp2 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + *dst++ = tmp1; + *dst++ = tmp2; + } + if (w & 1) + *dst++ = src[pixman_fixed_to_int (vx)]; +} + +FAST_NEAREST_MAINLOOP (565_565_cover_SRC, + scaled_nearest_scanline_565_565_SRC, + uint16_t, uint16_t, COVER) +FAST_NEAREST_MAINLOOP (565_565_none_SRC, + scaled_nearest_scanline_565_565_SRC, + uint16_t, uint16_t, NONE) +FAST_NEAREST_MAINLOOP (565_565_pad_SRC, + scaled_nearest_scanline_565_565_SRC, + uint16_t, uint16_t, PAD) + +static force_inline uint32_t +fetch_nearest (pixman_repeat_t src_repeat, + pixman_format_code_t format, + uint32_t *src, 
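/* Illustrative sketch (not part of this patch): the 16.16 fixed-point
 * stepping pattern that scaled_nearest_scanline_565_565_SRC above unrolls
 * four ways. vx accumulates the source x coordinate; each destination pixel
 * truncates vx to an integer index and advances by unit_x, so scaling needs
 * no per-pixel multiply or divide. Assumes vx stays non-negative, as the
 * COVER main loop guarantees.
 */
#include <stdint.h>

static void
scale_row_nearest_sketch (uint16_t *dst, const uint16_t *src, int w,
                          int32_t vx, int32_t unit_x)
{
    while (w--)
    {
        *dst++ = src[vx >> 16]; /* truncate 16.16 to the source column */
        vx += unit_x;           /* one fixed-point step per dest pixel */
    }
}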
int x, int src_width) +{ + if (repeat (src_repeat, &x, src_width)) + { + if (format == PIXMAN_x8r8g8b8) + return *(src + x) | 0xff000000; + else + return *(src + x); + } + else + { + return 0; + } +} + +static force_inline void +combine_over (uint32_t s, uint32_t *dst) +{ + if (s) + { + uint8_t ia = 0xff - (s >> 24); + + if (ia) + UN8x4_MUL_UN8_ADD_UN8x4 (*dst, ia, s); + else + *dst = s; + } +} + +static force_inline void +combine_src (uint32_t s, uint32_t *dst) +{ + *dst = s; +} + +static void +fast_composite_scaled_nearest (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line; + uint32_t *src_line; + int dst_stride, src_stride; + int src_width, src_height; + pixman_repeat_t src_repeat; + pixman_fixed_t unit_x, unit_y; + pixman_format_code_t src_format; + pixman_vector_t v; + pixman_fixed_t vy; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + /* pass in 0 instead of src_x and src_y because src_x and src_y need to be + * transformed from destination space to source space + */ + PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, uint32_t, src_stride, src_line, 1); + + /* reference point is the center of the pixel */ + v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2; + v.vector[2] = pixman_fixed_1; + + if (!pixman_transform_point_3d (src_image->common.transform, &v)) + return; + + unit_x = src_image->common.transform->matrix[0][0]; + unit_y = src_image->common.transform->matrix[1][1]; + + /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */ + v.vector[0] -= pixman_fixed_e; + v.vector[1] -= pixman_fixed_e; + + src_height = src_image->bits.height; + src_width = src_image->bits.width; + src_repeat = src_image->common.repeat; + src_format = src_image->bits.format; + + vy = v.vector[1]; + while (height--) + { + pixman_fixed_t vx = v.vector[0]; + int y = pixman_fixed_to_int (vy); + uint32_t *dst = dst_line; + + dst_line += dst_stride; + + /* adjust the y location by a unit vector in the y direction + * this is equivalent to transforming y+1 of the destination point to source space */ + vy += unit_y; + + if (!repeat (src_repeat, &y, src_height)) + { + if (op == PIXMAN_OP_SRC) + memset (dst, 0, sizeof (*dst) * width); + } + else + { + int w = width; + + uint32_t *src = src_line + y * src_stride; + + while (w >= 2) + { + uint32_t s1, s2; + int x1, x2; + + x1 = pixman_fixed_to_int (vx); + vx += unit_x; + + x2 = pixman_fixed_to_int (vx); + vx += unit_x; + + w -= 2; + + s1 = fetch_nearest (src_repeat, src_format, src, x1, src_width); + s2 = fetch_nearest (src_repeat, src_format, src, x2, src_width); + + if (op == PIXMAN_OP_OVER) + { + combine_over (s1, dst++); + combine_over (s2, dst++); + } + else + { + combine_src (s1, dst++); + combine_src (s2, dst++); + } + } + + while (w--) + { + uint32_t s; + int x; + + x = pixman_fixed_to_int (vx); + vx += unit_x; + + s = fetch_nearest (src_repeat, src_format, src, x, src_width); + + if (op == PIXMAN_OP_OVER) + combine_over (s, dst++); + else + combine_src (s, dst++); + } + } + } +} + +#define CACHE_LINE_SIZE 64 + +#define FAST_SIMPLE_ROTATE(suffix, pix_type) \ + \ +static void \ +blt_rotated_90_trivial_##suffix (pix_type *dst, \ + int dst_stride, \ + const pix_type *src, \ + int 
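/* Illustrative sketch (not part of this patch, simplified to a pure
 * scale-plus-translate matrix): the coordinate setup performed by
 * fast_composite_scaled_nearest above. The destination pixel center
 * (x + 0.5) is transformed into source space, and one pixman_fixed_e (the
 * smallest 16.16 increment) is subtracted so a source coordinate landing
 * exactly on .5 rounds down when later truncated.
 */
#include <stdint.h>

static int32_t
source_coord_sketch (int dst_x, int32_t scale, int32_t translate)
{
    int32_t center = ((int32_t) dst_x << 16) + (1 << 16) / 2;
    int32_t v = (int32_t) (((int64_t) center * scale) >> 16) + translate;

    return v - 1; /* minus pixman_fixed_e, for round-half-down */
}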
src_stride, \ + int w, \ + int h) \ +{ \ + int x, y; \ + for (y = 0; y < h; y++) \ + { \ + const pix_type *s = src + (h - y - 1); \ + pix_type *d = dst + dst_stride * y; \ + for (x = 0; x < w; x++) \ + { \ + *d++ = *s; \ + s += src_stride; \ + } \ + } \ +} \ + \ +static void \ +blt_rotated_270_trivial_##suffix (pix_type *dst, \ + int dst_stride, \ + const pix_type *src, \ + int src_stride, \ + int w, \ + int h) \ +{ \ + int x, y; \ + for (y = 0; y < h; y++) \ + { \ + const pix_type *s = src + src_stride * (w - 1) + y; \ + pix_type *d = dst + dst_stride * y; \ + for (x = 0; x < w; x++) \ + { \ + *d++ = *s; \ + s -= src_stride; \ + } \ + } \ +} \ + \ +static void \ +blt_rotated_90_##suffix (pix_type *dst, \ + int dst_stride, \ + const pix_type *src, \ + int src_stride, \ + int W, \ + int H) \ +{ \ + int x; \ + int leading_pixels = 0, trailing_pixels = 0; \ + const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type); \ + \ + /* \ + * split processing into handling destination as TILE_SIZExH cache line \ + * aligned vertical stripes (optimistically assuming that destination \ + * stride is a multiple of cache line, if not - it will be just a bit \ + * slower) \ + */ \ + \ + if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1)) \ + { \ + leading_pixels = TILE_SIZE - (((uintptr_t)dst & \ + (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \ + if (leading_pixels > W) \ + leading_pixels = W; \ + \ + /* unaligned leading part NxH (where N < TILE_SIZE) */ \ + blt_rotated_90_trivial_##suffix ( \ + dst, \ + dst_stride, \ + src, \ + src_stride, \ + leading_pixels, \ + H); \ + \ + dst += leading_pixels; \ + src += leading_pixels * src_stride; \ + W -= leading_pixels; \ + } \ + \ + if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1)) \ + { \ + trailing_pixels = (((uintptr_t)(dst + W) & \ + (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \ + if (trailing_pixels > W) \ + trailing_pixels = W; \ + W -= trailing_pixels; \ + } \ + \ + for (x = 0; x < W; x += TILE_SIZE) \ + { \ + /* aligned middle part TILE_SIZExH */ \ + blt_rotated_90_trivial_##suffix ( \ + dst + x, \ + dst_stride, \ + src + src_stride * x, \ + src_stride, \ + TILE_SIZE, \ + H); \ + } \ + \ + if (trailing_pixels) \ + { \ + /* unaligned trailing part NxH (where N < TILE_SIZE) */ \ + blt_rotated_90_trivial_##suffix ( \ + dst + W, \ + dst_stride, \ + src + W * src_stride, \ + src_stride, \ + trailing_pixels, \ + H); \ + } \ +} \ + \ +static void \ +blt_rotated_270_##suffix (pix_type *dst, \ + int dst_stride, \ + const pix_type *src, \ + int src_stride, \ + int W, \ + int H) \ +{ \ + int x; \ + int leading_pixels = 0, trailing_pixels = 0; \ + const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type); \ + \ + /* \ + * split processing into handling destination as TILE_SIZExH cache line \ + * aligned vertical stripes (optimistically assuming that destination \ + * stride is a multiple of cache line, if not - it will be just a bit \ + * slower) \ + */ \ + \ + if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1)) \ + { \ + leading_pixels = TILE_SIZE - (((uintptr_t)dst & \ + (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \ + if (leading_pixels > W) \ + leading_pixels = W; \ + \ + /* unaligned leading part NxH (where N < TILE_SIZE) */ \ + blt_rotated_270_trivial_##suffix ( \ + dst, \ + dst_stride, \ + src + src_stride * (W - leading_pixels), \ + src_stride, \ + leading_pixels, \ + H); \ + \ + dst += leading_pixels; \ + W -= leading_pixels; \ + } \ + \ + if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1)) \ + { \ + trailing_pixels = (((uintptr_t)(dst + W) & \ + (CACHE_LINE_SIZE - 1)) / 
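/* Illustrative sketch (not part of this patch): the head/middle/tail split
 * that blt_rotated_90_##suffix above applies to each destination row. The
 * row is divided into an unaligned head, cache-line-aligned middle tiles
 * and an unaligned tail, so the hot middle loop always writes whole cache
 * lines; rotation reads the source in strided, cache-unfriendly order, so
 * keeping the writes aligned is where the tiling pays off.
 */
#include <stdint.h>

#define CACHE_LINE 64

static void
split_row_sketch (const uint32_t *dst, int width,
                  int *head, int *middle, int *tail)
{
    int tile = CACHE_LINE / (int) sizeof (*dst); /* pixels per cache line */

    *head = 0;
    if ((uintptr_t) dst & (CACHE_LINE - 1))
    {
        *head = tile - (int) (((uintptr_t) dst & (CACHE_LINE - 1))
                              / sizeof (*dst));
        if (*head > width)
            *head = width;
        width -= *head;
    }

    *tail = (int) (((uintptr_t) (dst + *head + width) & (CACHE_LINE - 1))
                   / sizeof (*dst));
    if (*tail > width)
        *tail = width;

    *middle = width - *tail; /* a multiple of tile once the head is peeled */
}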
sizeof(pix_type)); \ + if (trailing_pixels > W) \ + trailing_pixels = W; \ + W -= trailing_pixels; \ + src += trailing_pixels * src_stride; \ + } \ + \ + for (x = 0; x < W; x += TILE_SIZE) \ + { \ + /* aligned middle part TILE_SIZExH */ \ + blt_rotated_270_trivial_##suffix ( \ + dst + x, \ + dst_stride, \ + src + src_stride * (W - x - TILE_SIZE), \ + src_stride, \ + TILE_SIZE, \ + H); \ + } \ + \ + if (trailing_pixels) \ + { \ + /* unaligned trailing part NxH (where N < TILE_SIZE) */ \ + blt_rotated_270_trivial_##suffix ( \ + dst + W, \ + dst_stride, \ + src - trailing_pixels * src_stride, \ + src_stride, \ + trailing_pixels, \ + H); \ + } \ +} \ + \ +static void \ +fast_composite_rotate_90_##suffix (pixman_implementation_t *imp, \ + pixman_op_t op, \ + pixman_image_t * src_image, \ + pixman_image_t * mask_image, \ + pixman_image_t * dst_image, \ + int32_t src_x, \ + int32_t src_y, \ + int32_t mask_x, \ + int32_t mask_y, \ + int32_t dest_x, \ + int32_t dest_y, \ + int32_t width, \ + int32_t height) \ +{ \ + pix_type *dst_line; \ + pix_type *src_line; \ + int dst_stride, src_stride; \ + int src_x_t, src_y_t; \ + \ + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, pix_type, \ + dst_stride, dst_line, 1); \ + src_x_t = -src_y + pixman_fixed_to_int ( \ + src_image->common.transform->matrix[0][2] + \ + pixman_fixed_1 / 2 - pixman_fixed_e) - height;\ + src_y_t = src_x + pixman_fixed_to_int ( \ + src_image->common.transform->matrix[1][2] + \ + pixman_fixed_1 / 2 - pixman_fixed_e); \ + PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type, \ + src_stride, src_line, 1); \ + blt_rotated_90_##suffix (dst_line, dst_stride, src_line, src_stride, \ + width, height); \ +} \ + \ +static void \ +fast_composite_rotate_270_##suffix (pixman_implementation_t *imp, \ + pixman_op_t op, \ + pixman_image_t * src_image, \ + pixman_image_t * mask_image, \ + pixman_image_t * dst_image, \ + int32_t src_x, \ + int32_t src_y, \ + int32_t mask_x, \ + int32_t mask_y, \ + int32_t dest_x, \ + int32_t dest_y, \ + int32_t width, \ + int32_t height) \ +{ \ + pix_type *dst_line; \ + pix_type *src_line; \ + int dst_stride, src_stride; \ + int src_x_t, src_y_t; \ + \ + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, pix_type, \ + dst_stride, dst_line, 1); \ + src_x_t = src_y + pixman_fixed_to_int ( \ + src_image->common.transform->matrix[0][2] + \ + pixman_fixed_1 / 2 - pixman_fixed_e); \ + src_y_t = -src_x + pixman_fixed_to_int ( \ + src_image->common.transform->matrix[1][2] + \ + pixman_fixed_1 / 2 - pixman_fixed_e) - width; \ + PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type, \ + src_stride, src_line, 1); \ + blt_rotated_270_##suffix (dst_line, dst_stride, src_line, src_stride, \ + width, height); \ +} + +FAST_SIMPLE_ROTATE (8, uint8_t) +FAST_SIMPLE_ROTATE (565, uint16_t) +FAST_SIMPLE_ROTATE (8888, uint32_t) + +static const pixman_fast_path_t c_fast_paths[] = +{ + PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, fast_composite_over_n_8_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, fast_composite_over_n_8_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, fast_composite_over_n_8_0888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, fast_composite_over_n_8_0888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, fast_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, fast_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, fast_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, fast_composite_over_n_8_8888), + 
PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, fast_composite_over_n_1_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, fast_composite_over_n_1_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, fast_composite_over_n_1_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, fast_composite_over_n_1_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5, fast_composite_over_n_1_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5, fast_composite_over_n_1_0565), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, fast_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, fast_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, fast_composite_over_n_8888_0565_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, fast_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, fast_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, fast_composite_over_n_8888_0565_ca), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, fast_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, fast_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, fast_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, fast_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, fast_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, fast_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, fast_composite_over_8888_0565), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, fast_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, fast_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565), + PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888), + PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888), + PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8), + PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1000_1000), + PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8), + PIXMAN_STD_FAST_PATH (SRC, solid, null, a8r8g8b8, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, fast_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, fast_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, fast_composite_src_memcpy), + 
PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8x8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8a8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565), + PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8), + PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8), + + SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, 8888_8888), + + SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, 8888_8888), + + SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, 8888_565), + SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, 8888_565), + + SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565), + + SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, 8888_8888), + + SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565), + +#define NEAREST_FAST_PATH(op,s,d) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, SCALED_NEAREST_FLAGS, \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest, \ + } + + NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8), + NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8), + NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8), + NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8), + + NEAREST_FAST_PATH (SRC, x8r8g8b8, a8r8g8b8), + NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8), + NEAREST_FAST_PATH (SRC, x8b8g8r8, a8b8g8r8), + NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8), + + NEAREST_FAST_PATH (OVER, x8r8g8b8, x8r8g8b8), + NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8), + NEAREST_FAST_PATH (OVER, x8b8g8r8, x8b8g8r8), + NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8), + + NEAREST_FAST_PATH (OVER, x8r8g8b8, a8r8g8b8), + NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8), + NEAREST_FAST_PATH (OVER, x8b8g8r8, a8b8g8r8), + NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8), + +#define SIMPLE_ROTATE_FLAGS(angle) \ + (FAST_PATH_ROTATE_ ## angle ## _TRANSFORM | \ + FAST_PATH_NEAREST_FILTER | \ + FAST_PATH_SAMPLES_COVER_CLIP | \ + FAST_PATH_STANDARD_FLAGS) + +#define SIMPLE_ROTATE_FAST_PATH(op,s,d,suffix) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (90), \ + 
PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_rotate_90_##suffix, \ + }, \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (270), \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_rotate_270_##suffix, \ + } + + SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888), + SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888), + SIMPLE_ROTATE_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888), + SIMPLE_ROTATE_FAST_PATH (SRC, r5g6b5, r5g6b5, 565), + SIMPLE_ROTATE_FAST_PATH (SRC, a8, a8, 8), + + { PIXMAN_OP_NONE }, +}; + +#ifdef WORDS_BIGENDIAN +#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (32 - (offs) - (n))) +#else +#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (offs)) +#endif + +static force_inline void +pixman_fill1_line (uint32_t *dst, int offs, int width, int v) +{ + if (offs) + { + int leading_pixels = 32 - offs; + if (leading_pixels >= width) + { + if (v) + *dst |= A1_FILL_MASK (width, offs); + else + *dst &= ~A1_FILL_MASK (width, offs); + return; + } + else + { + if (v) + *dst++ |= A1_FILL_MASK (leading_pixels, offs); + else + *dst++ &= ~A1_FILL_MASK (leading_pixels, offs); + width -= leading_pixels; + } + } + while (width >= 32) + { + if (v) + *dst++ = 0xFFFFFFFF; + else + *dst++ = 0; + width -= 32; + } + if (width > 0) + { + if (v) + *dst |= A1_FILL_MASK (width, 0); + else + *dst &= ~A1_FILL_MASK (width, 0); + } +} + +static void +pixman_fill1 (uint32_t *bits, + int stride, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + uint32_t *dst = bits + y * stride + (x >> 5); + int offs = x & 31; + + if (xor & 1) + { + while (height--) + { + pixman_fill1_line (dst, offs, width, 1); + dst += stride; + } + } + else + { + while (height--) + { + pixman_fill1_line (dst, offs, width, 0); + dst += stride; + } + } +} + +static void +pixman_fill8 (uint32_t *bits, + int stride, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + int byte_stride = stride * (int) sizeof (uint32_t); + uint8_t *dst = (uint8_t *) bits; + uint8_t v = xor & 0xff; + int i; + + dst = dst + y * byte_stride + x; + + while (height--) + { + for (i = 0; i < width; ++i) + dst[i] = v; + + dst += byte_stride; + } +} + +static void +pixman_fill16 (uint32_t *bits, + int stride, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + int short_stride = + (stride * (int)sizeof (uint32_t)) / (int)sizeof (uint16_t); + uint16_t *dst = (uint16_t *)bits; + uint16_t v = xor & 0xffff; + int i; + + dst = dst + y * short_stride + x; + + while (height--) + { + for (i = 0; i < width; ++i) + dst[i] = v; + + dst += short_stride; + } +} + +static void +pixman_fill32 (uint32_t *bits, + int stride, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + int i; + + bits = bits + y * stride + x; + + while (height--) + { + for (i = 0; i < width; ++i) + bits[i] = xor; + + bits += stride; + } +} + +static pixman_bool_t +fast_path_fill (pixman_implementation_t *imp, + uint32_t * bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + switch (bpp) + { + case 1: + pixman_fill1 (bits, stride, x, y, width, height, xor); + break; + + case 8: + pixman_fill8 (bits, stride, x, y, width, height, xor); + break; + + case 16: + pixman_fill16 (bits, stride, x, y, width, height, xor); + break; + + case 32: + pixman_fill32 (bits, stride, x, y, width, height, xor); + break; + + default: + return _pixman_implementation_fill ( + imp->delegate, bits, stride, bpp, x, y, width, height, xor); + break; + } + + 
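
Note: a quick standalone illustration of the A1_FILL_MASK arithmetic used by pixman_fill1_line above (little-endian variant; the WORDS_BIGENDIAN definition mirrors the same mask from the top of the word). Notice that the full-word loop in pixman_fill1_line writes 0xFFFFFFFF directly instead of using the macro, which keeps the shift count below 32 and avoids undefined behavior for n == 32:

#include <stdio.h>
#include <stdint.h>

/* n mask bits starting at bit offset 'offs' within a 32-bit word. */
#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (offs))

int
main (void)
{
    uint32_t word = 0;

    /* Set 5 one-bit pixels starting at pixel 3 of the word. */
    word |= A1_FILL_MASK (5, 3);
    printf ("%08x\n", (unsigned) word);  /* 000000f8: bits 3..7 set */

    /* Clearing the same pixels uses the inverted mask. */
    word &= ~A1_FILL_MASK (5, 3);
    printf ("%08x\n", (unsigned) word);  /* 00000000 */

    return 0;
}
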
return TRUE; +} + +pixman_implementation_t * +_pixman_implementation_create_fast_path (pixman_implementation_t *fallback) +{ + pixman_implementation_t *imp = _pixman_implementation_create (fallback, c_fast_paths); + + imp->fill = fast_path_fill; + + return imp; +} diff --git a/pixman/pixman/pixman-fast-path.h b/pixman/pixman/pixman-fast-path.h index bb7032d86..d08122293 100644 --- a/pixman/pixman/pixman-fast-path.h +++ b/pixman/pixman/pixman-fast-path.h @@ -1,449 +1,590 @@ -/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ -/* - * Copyright © 2000 SuSE, Inc. - * Copyright © 2007 Red Hat, Inc. - * - * Permission to use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that - * copyright notice and this permission notice appear in supporting - * documentation, and that the name of SuSE not be used in advertising or - * publicity pertaining to distribution of the software without specific, - * written prior permission. SuSE makes no representations about the - * suitability of this software for any purpose. It is provided "as is" - * without express or implied warranty. - * - * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE - * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - * - * Author: Keith Packard, SuSE, Inc. - */ - -#ifndef PIXMAN_FAST_PATH_H__ -#define PIXMAN_FAST_PATH_H__ - -#include "pixman-private.h" - -#define PIXMAN_REPEAT_COVER -1 - -static force_inline pixman_bool_t -repeat (pixman_repeat_t repeat, int *c, int size) -{ - if (repeat == PIXMAN_REPEAT_NONE) - { - if (*c < 0 || *c >= size) - return FALSE; - } - else if (repeat == PIXMAN_REPEAT_NORMAL) - { - while (*c >= size) - *c -= size; - while (*c < 0) - *c += size; - } - else if (repeat == PIXMAN_REPEAT_PAD) - { - *c = CLIP (*c, 0, size - 1); - } - else /* REFLECT */ - { - *c = MOD (*c, size * 2); - if (*c >= size) - *c = size * 2 - *c - 1; - } - return TRUE; -} - -/* - * For each scanline fetched from source image with PAD repeat: - * - calculate how many pixels need to be padded on the left side - * - calculate how many pixels need to be padded on the right side - * - update width to only count pixels which are fetched from the image - * All this information is returned via 'width', 'left_pad', 'right_pad' - * arguments. The code is assuming that 'unit_x' is positive. - * - * Note: 64-bit math is used in order to avoid potential overflows, which - * is probably excessive in many cases. This particular function - * may need its own correctness test and performance tuning. 
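
Note: to make the bounds computation this comment describes concrete, here is a standalone re-derivation (same arithmetic as pad_repeat_get_scanline_bounds below) for one sample case: a 100 pixel wide source, 1:1 scale (unit_x = 1.0 in 16.16 fixed point), and a 120 pixel output scanline that starts 5 pixels left of the image. The 5 leftmost outputs replicate the left edge, 100 are fetched from the image, and 15 replicate the right edge:

#include <stdio.h>
#include <stdint.h>

int
main (void)
{
    int32_t src_width = 100, width = 120, left_pad, right_pad;
    int64_t vx = -5 * 65536, unit_x = 65536;
    int64_t max_vx = (int64_t) src_width << 16, tmp;

    if (vx < 0)
    {
        /* Outputs sampled before x == 0 are left padding. */
        tmp = (unit_x - 1 - vx) / unit_x;
        left_pad = tmp > width ? width : (int32_t) tmp;
        width -= left_pad;
    }
    else
        left_pad = 0;

    /* Outputs sampled at or beyond x == src_width are right padding. */
    tmp = (unit_x - 1 - vx + max_vx) / unit_x - left_pad;
    if (tmp < 0)
    {
        right_pad = width;
        width = 0;
    }
    else if (tmp >= width)
        right_pad = 0;
    else
    {
        right_pad = width - (int32_t) tmp;
        width = (int32_t) tmp;
    }

    /* Prints: left_pad=5 width=100 right_pad=15 */
    printf ("left_pad=%d width=%d right_pad=%d\n",
            left_pad, width, right_pad);
    return 0;
}
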
- */ -static force_inline void -pad_repeat_get_scanline_bounds (int32_t source_image_width, - pixman_fixed_t vx, - pixman_fixed_t unit_x, - int32_t * width, - int32_t * left_pad, - int32_t * right_pad) -{ - int64_t max_vx = (int64_t) source_image_width << 16; - int64_t tmp; - if (vx < 0) - { - tmp = ((int64_t) unit_x - 1 - vx) / unit_x; - if (tmp > *width) - { - *left_pad = *width; - *width = 0; - } - else - { - *left_pad = (int32_t) tmp; - *width -= (int32_t) tmp; - } - } - else - { - *left_pad = 0; - } - tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad; - if (tmp < 0) - { - *right_pad = *width; - *width = 0; - } - else if (tmp >= *width) - { - *right_pad = 0; - } - else - { - *right_pad = *width - (int32_t) tmp; - *width = (int32_t) tmp; - } -} - -/* A macroified version of specialized nearest scalers for some - * common 8888 and 565 formats. It supports SRC and OVER ops. - * - * There are two repeat versions, one that handles repeat normal, - * and one without repeat handling that only works if the src region - * used is completely covered by the pre-repeated source samples. - * - * The loops are unrolled to process two pixels per iteration for better - * performance on most CPU architectures (superscalar processors - * can issue several operations simultaneously, other processors can hide - * instructions latencies by pipelining operations). Unrolling more - * does not make much sense because the compiler will start running out - * of spare registers soon. - */ - -#define GET_8888_ALPHA(s) ((s) >> 24) - /* This is not actually used since we don't have an OVER with - 565 source, but it is needed to build. */ -#define GET_0565_ALPHA(s) 0xff - -#define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT, \ - src_type_t, dst_type_t, OP, repeat_mode) \ -static force_inline void \ -scanline_func_name (dst_type_t *dst, \ - src_type_t *src, \ - int32_t w, \ - pixman_fixed_t vx, \ - pixman_fixed_t unit_x, \ - pixman_fixed_t max_vx) \ -{ \ - uint32_t d; \ - src_type_t s1, s2; \ - uint8_t a1, a2; \ - int x1, x2; \ - \ - if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER) \ - abort(); \ - \ - while ((w -= 2) >= 0) \ - { \ - x1 = vx >> 16; \ - vx += unit_x; \ - if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ - { \ - /* This works because we know that unit_x is positive */ \ - while (vx >= max_vx) \ - vx -= max_vx; \ - } \ - s1 = src[x1]; \ - \ - x2 = vx >> 16; \ - vx += unit_x; \ - if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ - { \ - /* This works because we know that unit_x is positive */ \ - while (vx >= max_vx) \ - vx -= max_vx; \ - } \ - s2 = src[x2]; \ - \ - if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \ - { \ - a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \ - a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2); \ - \ - if (a1 == 0xff) \ - { \ - *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ - } \ - else if (s1) \ - { \ - d = CONVERT_ ## DST_FORMAT ## _TO_8888 (*dst); \ - s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \ - a1 ^= 0xff; \ - UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \ - *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \ - } \ - dst++; \ - \ - if (a2 == 0xff) \ - { \ - *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \ - } \ - else if (s2) \ - { \ - d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \ - s2 = CONVERT_## SRC_FORMAT ## _TO_8888 (s2); \ - a2 ^= 0xff; \ - UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2); \ - *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \ - } \ - dst++; \ - } \ - else /* PIXMAN_OP_SRC */ \ - { \ - *dst++ = 
CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ - *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \ - } \ - } \ - \ - if (w & 1) \ - { \ - x1 = vx >> 16; \ - s1 = src[x1]; \ - \ - if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \ - { \ - a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \ - \ - if (a1 == 0xff) \ - { \ - *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ - } \ - else if (s1) \ - { \ - d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \ - s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \ - a1 ^= 0xff; \ - UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \ - *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \ - } \ - dst++; \ - } \ - else /* PIXMAN_OP_SRC */ \ - { \ - *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ - } \ - } \ -} - -#define FAST_NEAREST_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, dst_type_t, \ - repeat_mode) \ -static void \ -fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp, \ - pixman_op_t op, \ - pixman_image_t * src_image, \ - pixman_image_t * mask_image, \ - pixman_image_t * dst_image, \ - int32_t src_x, \ - int32_t src_y, \ - int32_t mask_x, \ - int32_t mask_y, \ - int32_t dst_x, \ - int32_t dst_y, \ - int32_t width, \ - int32_t height) \ -{ \ - dst_type_t *dst_line; \ - src_type_t *src_first_line; \ - int y; \ - pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */ \ - pixman_fixed_t max_vy; \ - pixman_vector_t v; \ - pixman_fixed_t vx, vy; \ - pixman_fixed_t unit_x, unit_y; \ - int32_t left_pad, right_pad; \ - \ - src_type_t *src; \ - dst_type_t *dst; \ - int src_stride, dst_stride; \ - \ - PIXMAN_IMAGE_GET_LINE (dst_image, dst_x, dst_y, dst_type_t, dst_stride, dst_line, 1); \ - /* pass in 0 instead of src_x and src_y because src_x and src_y need to be \ - * transformed from destination space to source space */ \ - PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1); \ - \ - /* reference point is the center of the pixel */ \ - v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2; \ - v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2; \ - v.vector[2] = pixman_fixed_1; \ - \ - if (!pixman_transform_point_3d (src_image->common.transform, &v)) \ - return; \ - \ - unit_x = src_image->common.transform->matrix[0][0]; \ - unit_y = src_image->common.transform->matrix[1][1]; \ - \ - /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */ \ - v.vector[0] -= pixman_fixed_e; \ - v.vector[1] -= pixman_fixed_e; \ - \ - vx = v.vector[0]; \ - vy = v.vector[1]; \ - \ - if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ - { \ - /* Clamp repeating positions inside the actual samples */ \ - max_vx = src_image->bits.width << 16; \ - max_vy = src_image->bits.height << 16; \ - \ - repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx); \ - repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \ - } \ - \ - if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD || \ - PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ - { \ - pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x, \ - &width, &left_pad, &right_pad); \ - vx += left_pad * unit_x; \ - } \ - \ - while (--height >= 0) \ - { \ - dst = dst_line; \ - dst_line += dst_stride; \ - \ - y = vy >> 16; \ - vy += unit_y; \ - if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ - repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \ - if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD) \ - { \ - repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height); \ - src = src_first_line + 
src_stride * y; \ - if (left_pad > 0) \ - { \ - scanline_func (dst, src, left_pad, 0, 0, 0); \ - } \ - if (width > 0) \ - { \ - scanline_func (dst + left_pad, src, width, vx, unit_x, 0); \ - } \ - if (right_pad > 0) \ - { \ - scanline_func (dst + left_pad + width, src + src_image->bits.width - 1, \ - right_pad, 0, 0, 0); \ - } \ - } \ - else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ - { \ - static src_type_t zero[1] = { 0 }; \ - if (y < 0 || y >= src_image->bits.height) \ - { \ - scanline_func (dst, zero, left_pad + width + right_pad, 0, 0, 0); \ - continue; \ - } \ - src = src_first_line + src_stride * y; \ - if (left_pad > 0) \ - { \ - scanline_func (dst, zero, left_pad, 0, 0, 0); \ - } \ - if (width > 0) \ - { \ - scanline_func (dst + left_pad, src, width, vx, unit_x, 0); \ - } \ - if (right_pad > 0) \ - { \ - scanline_func (dst + left_pad + width, zero, right_pad, 0, 0, 0); \ - } \ - } \ - else \ - { \ - src = src_first_line + src_stride * y; \ - scanline_func (dst, src, width, vx, unit_x, max_vx); \ - } \ - } \ -} - -/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */ -#define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func, src_type_t, dst_type_t, \ - repeat_mode) \ - FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, dst_type_t, \ - repeat_mode) \ - -#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT, \ - src_type_t, dst_type_t, OP, repeat_mode) \ - FAST_NEAREST_SCANLINE(scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \ - SRC_FORMAT, DST_FORMAT, src_type_t, dst_type_t, \ - OP, repeat_mode) \ - FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name ## _ ## OP, \ - scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \ - src_type_t, dst_type_t, repeat_mode) - - -#define SCALED_NEAREST_FLAGS \ - (FAST_PATH_SCALE_TRANSFORM | \ - FAST_PATH_NO_ALPHA_MAP | \ - FAST_PATH_NEAREST_FILTER | \ - FAST_PATH_NO_ACCESSORS | \ - FAST_PATH_NARROW_FORMAT) - -#define SIMPLE_NEAREST_FAST_PATH_NORMAL(op,s,d,func) \ - { PIXMAN_OP_ ## op, \ - PIXMAN_ ## s, \ - (SCALED_NEAREST_FLAGS | \ - FAST_PATH_NORMAL_REPEAT | \ - FAST_PATH_X_UNIT_POSITIVE), \ - PIXMAN_null, 0, \ - PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ - fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op, \ - } - -#define SIMPLE_NEAREST_FAST_PATH_PAD(op,s,d,func) \ - { PIXMAN_OP_ ## op, \ - PIXMAN_ ## s, \ - (SCALED_NEAREST_FLAGS | \ - FAST_PATH_PAD_REPEAT | \ - FAST_PATH_X_UNIT_POSITIVE), \ - PIXMAN_null, 0, \ - PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ - fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op, \ - } - -#define SIMPLE_NEAREST_FAST_PATH_NONE(op,s,d,func) \ - { PIXMAN_OP_ ## op, \ - PIXMAN_ ## s, \ - (SCALED_NEAREST_FLAGS | \ - FAST_PATH_NONE_REPEAT | \ - FAST_PATH_X_UNIT_POSITIVE), \ - PIXMAN_null, 0, \ - PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ - fast_composite_scaled_nearest_ ## func ## _none ## _ ## op, \ - } - -#define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func) \ - { PIXMAN_OP_ ## op, \ - PIXMAN_ ## s, \ - SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \ - PIXMAN_null, 0, \ - PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ - fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op, \ - } - -/* Prefer the use of 'cover' variant, because it is faster */ -#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func) \ - SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func), \ - SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func), \ - SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func), \ - SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func) - -#endif +/* -*- 
Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ +/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of SuSE not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. SuSE makes no representations about the + * suitability of this software for any purpose. It is provided "as is" + * without express or implied warranty. + * + * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE + * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Author: Keith Packard, SuSE, Inc. + */ + +#ifndef PIXMAN_FAST_PATH_H__ +#define PIXMAN_FAST_PATH_H__ + +#include "pixman-private.h" + +#define PIXMAN_REPEAT_COVER -1 + +static force_inline pixman_bool_t +repeat (pixman_repeat_t repeat, int *c, int size) +{ + if (repeat == PIXMAN_REPEAT_NONE) + { + if (*c < 0 || *c >= size) + return FALSE; + } + else if (repeat == PIXMAN_REPEAT_NORMAL) + { + while (*c >= size) + *c -= size; + while (*c < 0) + *c += size; + } + else if (repeat == PIXMAN_REPEAT_PAD) + { + *c = CLIP (*c, 0, size - 1); + } + else /* REFLECT */ + { + *c = MOD (*c, size * 2); + if (*c >= size) + *c = size * 2 - *c - 1; + } + return TRUE; +} + +/* + * For each scanline fetched from source image with PAD repeat: + * - calculate how many pixels need to be padded on the left side + * - calculate how many pixels need to be padded on the right side + * - update width to only count pixels which are fetched from the image + * All this information is returned via 'width', 'left_pad', 'right_pad' + * arguments. The code is assuming that 'unit_x' is positive. + * + * Note: 64-bit math is used in order to avoid potential overflows, which + * is probably excessive in many cases. This particular function + * may need its own correctness test and performance tuning. + */ +static force_inline void +pad_repeat_get_scanline_bounds (int32_t source_image_width, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + int32_t * width, + int32_t * left_pad, + int32_t * right_pad) +{ + int64_t max_vx = (int64_t) source_image_width << 16; + int64_t tmp; + if (vx < 0) + { + tmp = ((int64_t) unit_x - 1 - vx) / unit_x; + if (tmp > *width) + { + *left_pad = *width; + *width = 0; + } + else + { + *left_pad = (int32_t) tmp; + *width -= (int32_t) tmp; + } + } + else + { + *left_pad = 0; + } + tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad; + if (tmp < 0) + { + *right_pad = *width; + *width = 0; + } + else if (tmp >= *width) + { + *right_pad = 0; + } + else + { + *right_pad = *width - (int32_t) tmp; + *width = (int32_t) tmp; + } +} + +/* A macroified version of specialized nearest scalers for some + * common 8888 and 565 formats. It supports SRC and OVER ops. 
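
Note: before the macro itself, it may help to spell out the fixed-point walk it performs. vx is a 16.16 source coordinate, vx >> 16 is the integer pixel to sample, and adding unit_x (the x scale factor taken from the transform matrix) advances to the next output pixel. A tiny demo, assuming only that pixman_fixed_t is a 16.16 int32_t as in pixman.h:

#include <stdio.h>
#include <stdint.h>

typedef int32_t pixman_fixed_t;  /* 16.16 fixed point, as in pixman.h */

int
main (void)
{
    /* Downscaling 2x: unit_x = 2.0 in 16.16 fixed point, so every
     * output pixel advances two source pixels.  Start at the center
     * of source pixel 0 (vx = 0.5). */
    pixman_fixed_t vx = 0x8000, unit_x = 0x20000;
    int i;

    for (i = 0; i < 5; ++i)
    {
        printf ("%d ", vx >> 16);  /* prints: 0 2 4 6 8 */
        vx += unit_x;
    }
    printf ("\n");
    return 0;
}
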
+ * + * There are two repeat versions, one that handles repeat normal, + * and one without repeat handling that only works if the src region + * used is completely covered by the pre-repeated source samples. + * + * The loops are unrolled to process two pixels per iteration for better + * performance on most CPU architectures (superscalar processors + * can issue several operations simultaneously, other processors can hide + * instructions latencies by pipelining operations). Unrolling more + * does not make much sense because the compiler will start running out + * of spare registers soon. + */ + +#define GET_8888_ALPHA(s) ((s) >> 24) + /* This is not actually used since we don't have an OVER with + 565 source, but it is needed to build. */ +#define GET_0565_ALPHA(s) 0xff + +#define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT, \ + src_type_t, dst_type_t, OP, repeat_mode) \ +static force_inline void \ +scanline_func_name (dst_type_t *dst, \ + const src_type_t *src, \ + int32_t w, \ + pixman_fixed_t vx, \ + pixman_fixed_t unit_x, \ + pixman_fixed_t max_vx, \ + pixman_bool_t fully_transparent_src) \ +{ \ + uint32_t d; \ + src_type_t s1, s2; \ + uint8_t a1, a2; \ + int x1, x2; \ + \ + if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER && fully_transparent_src) \ + return; \ + \ + if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER) \ + abort(); \ + \ + while ((w -= 2) >= 0) \ + { \ + x1 = vx >> 16; \ + vx += unit_x; \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ + { \ + /* This works because we know that unit_x is positive */ \ + while (vx >= max_vx) \ + vx -= max_vx; \ + } \ + s1 = src[x1]; \ + \ + x2 = vx >> 16; \ + vx += unit_x; \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ + { \ + /* This works because we know that unit_x is positive */ \ + while (vx >= max_vx) \ + vx -= max_vx; \ + } \ + s2 = src[x2]; \ + \ + if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \ + { \ + a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \ + a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2); \ + \ + if (a1 == 0xff) \ + { \ + *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ + } \ + else if (s1) \ + { \ + d = CONVERT_ ## DST_FORMAT ## _TO_8888 (*dst); \ + s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \ + a1 ^= 0xff; \ + UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \ + *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \ + } \ + dst++; \ + \ + if (a2 == 0xff) \ + { \ + *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \ + } \ + else if (s2) \ + { \ + d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \ + s2 = CONVERT_## SRC_FORMAT ## _TO_8888 (s2); \ + a2 ^= 0xff; \ + UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2); \ + *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \ + } \ + dst++; \ + } \ + else /* PIXMAN_OP_SRC */ \ + { \ + *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ + *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \ + } \ + } \ + \ + if (w & 1) \ + { \ + x1 = vx >> 16; \ + s1 = src[x1]; \ + \ + if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \ + { \ + a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \ + \ + if (a1 == 0xff) \ + { \ + *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ + } \ + else if (s1) \ + { \ + d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \ + s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \ + a1 ^= 0xff; \ + UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \ + *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \ + } \ + dst++; \ + } \ + else /* PIXMAN_OP_SRC */ \ + { \ + *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ + } \ + } \ +} + +#define 
FAST_NEAREST_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t, \ + dst_type_t, repeat_mode, have_mask, mask_is_solid) \ +static void \ +fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp, \ + pixman_op_t op, \ + pixman_image_t * src_image, \ + pixman_image_t * mask_image, \ + pixman_image_t * dst_image, \ + int32_t src_x, \ + int32_t src_y, \ + int32_t mask_x, \ + int32_t mask_y, \ + int32_t dst_x, \ + int32_t dst_y, \ + int32_t width, \ + int32_t height) \ +{ \ + dst_type_t *dst_line; \ + mask_type_t *mask_line; \ + src_type_t *src_first_line; \ + int y; \ + pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */ \ + pixman_fixed_t max_vy; \ + pixman_vector_t v; \ + pixman_fixed_t vx, vy; \ + pixman_fixed_t unit_x, unit_y; \ + int32_t left_pad, right_pad; \ + \ + src_type_t *src; \ + dst_type_t *dst; \ + mask_type_t solid_mask; \ + const mask_type_t *mask = &solid_mask; \ + int src_stride, mask_stride, dst_stride; \ + \ + PIXMAN_IMAGE_GET_LINE (dst_image, dst_x, dst_y, dst_type_t, dst_stride, dst_line, 1); \ + if (have_mask) \ + { \ + if (mask_is_solid) \ + solid_mask = _pixman_image_get_solid (imp, mask_image, dst_image->bits.format); \ + else \ + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t, \ + mask_stride, mask_line, 1); \ + } \ + /* pass in 0 instead of src_x and src_y because src_x and src_y need to be \ + * transformed from destination space to source space */ \ + PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1); \ + \ + /* reference point is the center of the pixel */ \ + v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2; \ + v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2; \ + v.vector[2] = pixman_fixed_1; \ + \ + if (!pixman_transform_point_3d (src_image->common.transform, &v)) \ + return; \ + \ + unit_x = src_image->common.transform->matrix[0][0]; \ + unit_y = src_image->common.transform->matrix[1][1]; \ + \ + /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */ \ + v.vector[0] -= pixman_fixed_e; \ + v.vector[1] -= pixman_fixed_e; \ + \ + vx = v.vector[0]; \ + vy = v.vector[1]; \ + \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ + { \ + /* Clamp repeating positions inside the actual samples */ \ + max_vx = src_image->bits.width << 16; \ + max_vy = src_image->bits.height << 16; \ + \ + repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx); \ + repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \ + } \ + \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD || \ + PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ + { \ + pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x, \ + &width, &left_pad, &right_pad); \ + vx += left_pad * unit_x; \ + } \ + \ + while (--height >= 0) \ + { \ + dst = dst_line; \ + dst_line += dst_stride; \ + if (have_mask && !mask_is_solid) \ + { \ + mask = mask_line; \ + mask_line += mask_stride; \ + } \ + \ + y = vy >> 16; \ + vy += unit_y; \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ + repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD) \ + { \ + repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height); \ + src = src_first_line + src_stride * y; \ + if (left_pad > 0) \ + { \ + scanline_func (mask, dst, src, left_pad, 0, 0, 0, FALSE); \ + } \ + if (width > 0) \ + { \ + scanline_func (mask + (mask_is_solid ? 
0 : left_pad), \ + dst + left_pad, src, width, vx, unit_x, 0, FALSE); \ + } \ + if (right_pad > 0) \ + { \ + scanline_func (mask + (mask_is_solid ? 0 : left_pad + width), \ + dst + left_pad + width, src + src_image->bits.width - 1, \ + right_pad, 0, 0, 0, FALSE); \ + } \ + } \ + else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ + { \ + static const src_type_t zero[1] = { 0 }; \ + if (y < 0 || y >= src_image->bits.height) \ + { \ + scanline_func (mask, dst, zero, left_pad + width + right_pad, 0, 0, 0, TRUE); \ + continue; \ + } \ + src = src_first_line + src_stride * y; \ + if (left_pad > 0) \ + { \ + scanline_func (mask, dst, zero, left_pad, 0, 0, 0, TRUE); \ + } \ + if (width > 0) \ + { \ + scanline_func (mask + (mask_is_solid ? 0 : left_pad), \ + dst + left_pad, src, width, vx, unit_x, 0, FALSE); \ + } \ + if (right_pad > 0) \ + { \ + scanline_func (mask + (mask_is_solid ? 0 : left_pad + width), \ + dst + left_pad + width, zero, right_pad, 0, 0, 0, TRUE); \ + } \ + } \ + else \ + { \ + src = src_first_line + src_stride * y; \ + scanline_func (mask, dst, src, width, vx, unit_x, max_vx, FALSE); \ + } \ + } \ +} + +/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */ +#define FAST_NEAREST_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t, \ + dst_type_t, repeat_mode, have_mask, mask_is_solid) \ + FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t, \ + dst_type_t, repeat_mode, have_mask, mask_is_solid) + +#define FAST_NEAREST_MAINLOOP_NOMASK(scale_func_name, scanline_func, src_type_t, dst_type_t, \ + repeat_mode) \ + static force_inline void \ + scanline_func##scale_func_name##_wrapper ( \ + const uint8_t *mask, \ + dst_type_t *dst, \ + const src_type_t *src, \ + int32_t w, \ + pixman_fixed_t vx, \ + pixman_fixed_t unit_x, \ + pixman_fixed_t max_vx, \ + pixman_bool_t fully_transparent_src) \ + { \ + scanline_func (dst, src, w, vx, unit_x, max_vx, fully_transparent_src); \ + } \ + FAST_NEAREST_MAINLOOP_INT (scale_func_name, scanline_func##scale_func_name##_wrapper, \ + src_type_t, uint8_t, dst_type_t, repeat_mode, FALSE, FALSE) + +#define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func, src_type_t, dst_type_t, \ + repeat_mode) \ + FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name, scanline_func, src_type_t, \ + dst_type_t, repeat_mode) + +#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT, \ + src_type_t, dst_type_t, OP, repeat_mode) \ + FAST_NEAREST_SCANLINE(scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \ + SRC_FORMAT, DST_FORMAT, src_type_t, dst_type_t, \ + OP, repeat_mode) \ + FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name ## _ ## OP, \ + scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \ + src_type_t, dst_type_t, repeat_mode) + + +#define SCALED_NEAREST_FLAGS \ + (FAST_PATH_SCALE_TRANSFORM | \ + FAST_PATH_NO_ALPHA_MAP | \ + FAST_PATH_NEAREST_FILTER | \ + FAST_PATH_NO_ACCESSORS | \ + FAST_PATH_NARROW_FORMAT) + +#define SIMPLE_NEAREST_FAST_PATH_NORMAL(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_NORMAL_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op, \ + } + +#define SIMPLE_NEAREST_FAST_PATH_PAD(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_PAD_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, 
FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op, \ + } + +#define SIMPLE_NEAREST_FAST_PATH_NONE(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_NONE_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _none ## _ ## op, \ + } + +#define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op, \ + } + +#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_NORMAL_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op, \ + } + +#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_PAD_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op, \ + } + +#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_NONE_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _none ## _ ## op, \ + } + +#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \ + PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op, \ + } + +#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_NORMAL_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op, \ + } + +#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_PAD_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op, \ + } + +#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_NONE_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _none ## _ ## op, \ + } + +#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \ + PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ 
## func ## _cover ## _ ## op, \ + } + +/* Prefer the use of 'cover' variant, because it is faster */ +#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func) \ + SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func), \ + SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func), \ + SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func), \ + SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func) + +#define SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func) \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func), \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func), \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func) + +#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH(op,s,d,func) \ + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER (op,s,d,func), \ + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE (op,s,d,func), \ + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD (op,s,d,func) + +#endif diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c index 91adc0560..2e135e2fe 100644 --- a/pixman/pixman/pixman-sse2.c +++ b/pixman/pixman/pixman-sse2.c @@ -5800,7 +5800,8 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, int32_t w, pixman_fixed_t vx, pixman_fixed_t unit_x, - pixman_fixed_t max_vx) + pixman_fixed_t max_vx, + pixman_bool_t fully_transparent_src) { uint32_t s, d; const uint32_t* pm = NULL; @@ -5809,6 +5810,9 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_alpha_lo, xmm_alpha_hi; + if (fully_transparent_src) + return; + /* Align dst on a 16-byte boundary */ while (w && ((unsigned long)pd & 15)) { @@ -5894,6 +5898,119 @@ FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER, scaled_nearest_scanline_sse2_8888_8888_OVER, uint32_t, uint32_t, PAD) +static force_inline void +scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask, + uint32_t * dst, + const uint32_t * src, + int32_t w, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx, + pixman_bool_t zero_src) +{ + __m128i xmm_mask; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_lo, xmm_alpha_hi; + + if (zero_src || (*mask >> 24) == 0) + return; + + xmm_mask = create_mask_16_128 (*mask >> 24); + + while (w && (unsigned long)dst & 15) + { + uint32_t s = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + + if (s) + { + uint32_t d = *dst; + + __m64 ms = unpack_32_1x64 (s); + __m64 alpha = expand_alpha_1x64 (ms); + __m64 dest = _mm_movepi64_pi64 (xmm_mask); + __m64 alpha_dst = unpack_32_1x64 (d); + + *dst = pack_1x64_32 ( + in_over_1x64 (&ms, &alpha, &dest, &alpha_dst)); + } + dst++; + w--; + } + + while (w >= 4) + { + uint32_t tmp1, tmp2, tmp3, tmp4; + + tmp1 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp2 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp3 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp4 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + + xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); + + if (!is_zero (xmm_src)) + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask, &xmm_mask, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + dst += 4; + w -= 4; + } + + while (w) + { + uint32_t s = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + + if (s) + { + uint32_t d = *dst; + + __m64 ms = 
unpack_32_1x64 (s); + __m64 alpha = expand_alpha_1x64 (ms); + __m64 mask = _mm_movepi64_pi64 (xmm_mask); + __m64 dest = unpack_32_1x64 (d); + + *dst = pack_1x64_32 ( + in_over_1x64 (&ms, &alpha, &mask, &dest)); + } + + dst++; + w--; + } + + _mm_empty (); +} + +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER, + scaled_nearest_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE) +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER, + scaled_nearest_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE) +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, + scaled_nearest_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE) + static const pixman_fast_path_t sse2_fast_paths[] = { /* PIXMAN_OP_OVER */ @@ -5990,6 +6107,11 @@ static const pixman_fast_path_t sse2_fast_paths[] = SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), + { PIXMAN_OP_NONE }, }; diff --git a/pixman/test/Makefile.am b/pixman/test/Makefile.am index 8d8471d1c..3ce466eec 100644 --- a/pixman/test/Makefile.am +++ b/pixman/test/Makefile.am @@ -1,7 +1,6 @@ AM_CFLAGS = @OPENMP_CFLAGS@ -AM_LDFLAGS = @OPENMP_CFLAGS@ - -TEST_LDADD = $(top_builddir)/pixman/libpixman-1.la -lm +AM_LDFLAGS = @OPENMP_CFLAGS@ @TESTPROGS_EXTRA_LDFLAGS@ +LDADD = $(top_builddir)/pixman/libpixman-1.la -lm INCLUDES = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman TESTPROGRAMS = \ @@ -22,121 +21,24 @@ TESTPROGRAMS = \ affine-test \ composite -a1_trap_test_LDADD = $(TEST_LDADD) -a1_trap_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ - -fetch_test_LDADD = $(TEST_LDADD) -fetch_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ - -trap_crasher_LDADD = $(TEST_LDADD) -trap_crasher_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ - -oob_test_LDADD = $(TEST_LDADD) -oob_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ - -scaling_crash_test_LDADD = $(TEST_LDADD) -scaling_crash_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ - -region_translate_test_LDADD = $(TEST_LDADD) -region_translate_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ - -pdf_op_test_LDADD = $(TEST_LDADD) -pdf_op_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ pdf_op_test_SOURCES = pdf-op-test.c utils.c utils.h - -region_test_LDADD = $(TEST_LDADD) -region_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ region_test_SOURCES = region-test.c utils.c utils.h - -blitters_test_LDADD = $(TEST_LDADD) -blitters_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ blitters_test_SOURCES = blitters-test.c utils.c utils.h - -scaling_test_LDADD = $(TEST_LDADD) -scaling_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ scaling_test_SOURCES = scaling-test.c utils.c utils.h - -affine_test_LDADD = $(TEST_LDADD) -affine_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ affine_test_SOURCES = affine-test.c utils.c utils.h - -alphamap_LDADD = $(TEST_LDADD) -alphamap_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ alphamap_SOURCES = alphamap.c utils.c utils.h - -alpha_loop_LDADD = $(TEST_LDADD) -alpha_loop_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ alpha_loop_SOURCES = alpha-loop.c utils.c utils.h - -composite_LDADD = $(TEST_LDADD) -composite_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ composite_SOURCES = composite.c utils.c 
utils.h - -gradient_crash_test_LDADD = $(TEST_LDADD) -gradient_crash_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ gradient_crash_test_SOURCES = gradient-crash-test.c utils.c utils.h - -stress_test_LDADD = $(TEST_LDADD) -stress_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ stress_test_SOURCES = stress-test.c utils.c utils.h -# GTK using test programs - -if HAVE_GTK - -GTK_LDADD = $(TEST_LDADD) $(GTK_LIBS) -GTK_UTILS = gtk-utils.c gtk-utils.h - -TESTPROGRAMS_GTK = \ - clip-test \ - clip-in \ - composite-test \ - gradient-test \ - radial-test \ - alpha-test \ - screen-test \ - convolution-test \ - trap-test - -INCLUDES += $(GTK_CFLAGS) - -gradient_test_LDADD = $(GTK_LDADD) -gradient_test_SOURCES = gradient-test.c $(GTK_UTILS) - -radial_test_LDADD = $(GTK_LDADD) -radial_test_SOURCES = radial-test.c utils.c utils.h $(GTK_UTILS) - -alpha_test_LDADD = $(GTK_LDADD) -alpha_test_SOURCES = alpha-test.c $(GTK_UTILS) - -composite_test_LDADD = $(GTK_LDADD) -composite_test_SOURCES = composite-test.c $(GTK_UTILS) - -clip_test_LDADD = $(GTK_LDADD) -clip_test_SOURCES = clip-test.c $(GTK_UTILS) - -clip_in_LDADD = $(GTK_LDADD) -clip_in_SOURCES = clip-in.c $(GTK_UTILS) - -trap_test_LDADD = $(GTK_LDADD) -trap_test_SOURCES = trap-test.c $(GTK_UTILS) - -screen_test_LDADD = $(GTK_LDADD) -screen_test_SOURCES = screen-test.c $(GTK_UTILS) - -convolution_test_LDADD = $(GTK_LDADD) -convolution_test_SOURCES = convolution-test.c $(GTK_UTILS) - -endif - # Benchmarks BENCHMARKS = \ lowlevel-blt-bench lowlevel_blt_bench_SOURCES = lowlevel-blt-bench.c utils.c utils.h -lowlevel_blt_bench_LDADD = $(TEST_LDADD) -noinst_PROGRAMS = $(TESTPROGRAMS) $(TESTPROGRAMS_GTK) $(BENCHMARKS) +noinst_PROGRAMS = $(TESTPROGRAMS) $(BENCHMARKS) TESTS = $(TESTPROGRAMS) diff --git a/pixman/test/alpha-test.c b/pixman/test/alpha-test.c deleted file mode 100644 index 92c208142..000000000 --- a/pixman/test/alpha-test.c +++ /dev/null @@ -1,117 +0,0 @@ -#include -#include -#include "pixman.h" -#include "gtk-utils.h" - -int -main (int argc, char **argv) -{ -#define WIDTH 400 -#define HEIGHT 200 - - uint32_t *alpha = malloc (WIDTH * HEIGHT * 4); - uint32_t *dest = malloc (WIDTH * HEIGHT * 4); - uint32_t *src = malloc (WIDTH * HEIGHT * 4); - pixman_image_t *grad_img; - pixman_image_t *alpha_img; - pixman_image_t *dest_img; - pixman_image_t *src_img; - int i; - pixman_gradient_stop_t stops[2] = - { - { pixman_int_to_fixed (0), { 0x0000, 0x0000, 0x0000, 0x0000 } }, - { pixman_int_to_fixed (1), { 0xffff, 0x0000, 0x1111, 0xffff } } - }; - pixman_point_fixed_t p1 = { pixman_double_to_fixed (0), 0 }; - pixman_point_fixed_t p2 = { pixman_double_to_fixed (WIDTH), - pixman_int_to_fixed (0) }; -#if 0 - pixman_transform_t trans = { - { { pixman_double_to_fixed (2), pixman_double_to_fixed (0.5), pixman_double_to_fixed (-100), }, - { pixman_double_to_fixed (0), pixman_double_to_fixed (3), pixman_double_to_fixed (0), }, - { pixman_double_to_fixed (0), pixman_double_to_fixed (0.000), pixman_double_to_fixed (1.0) } - } - }; -#else - pixman_transform_t trans = { - { { pixman_fixed_1, 0, 0 }, - { 0, pixman_fixed_1, 0 }, - { 0, 0, pixman_fixed_1 } } - }; -#endif - - pixman_point_fixed_t c_inner; - pixman_point_fixed_t c_outer; - pixman_fixed_t r_inner; - pixman_fixed_t r_outer; - - for (i = 0; i < WIDTH * HEIGHT; ++i) - alpha[i] = 0x4f00004f; /* pale blue */ - - alpha_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, - WIDTH, HEIGHT, - alpha, - WIDTH * 4); - - for (i = 0; i < WIDTH * HEIGHT; ++i) - dest[i] = 0xffffff00; /* yellow */ - - dest_img = pixman_image_create_bits 
(PIXMAN_a8r8g8b8, - WIDTH, HEIGHT, - dest, - WIDTH * 4); - - for (i = 0; i < WIDTH * HEIGHT; ++i) - src[i] = 0xffff0000; - - src_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, - WIDTH, HEIGHT, - src, - WIDTH * 4); - - c_inner.x = pixman_double_to_fixed (50.0); - c_inner.y = pixman_double_to_fixed (50.0); - c_outer.x = pixman_double_to_fixed (50.0); - c_outer.y = pixman_double_to_fixed (50.0); - r_inner = 0; - r_outer = pixman_double_to_fixed (50.0); - -#if 0 - grad_img = pixman_image_create_conical_gradient (&c_inner, r_inner, - stops, 2); -#endif -#if 0 - grad_img = pixman_image_create_conical_gradient (&c_inner, r_inner, - stops, 2); - grad_img = pixman_image_create_linear_gradient (&c_inner, &c_outer, - r_inner, r_outer, - stops, 2); -#endif - - grad_img = pixman_image_create_linear_gradient (&p1, &p2, - stops, 2); - - pixman_image_set_transform (grad_img, &trans); - pixman_image_set_repeat (grad_img, PIXMAN_REPEAT_PAD); - - pixman_image_composite (PIXMAN_OP_OVER, grad_img, NULL, alpha_img, - 0, 0, 0, 0, 0, 0, 10 * WIDTH, HEIGHT); - - pixman_image_set_alpha_map (src_img, alpha_img, 10, 10); - - pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dest_img, - 0, 0, 0, 0, 0, 0, 10 * WIDTH, HEIGHT); - - printf ("0, 0: %x\n", dest[0]); - printf ("10, 10: %x\n", dest[10 * 10 + 10]); - printf ("w, h: %x\n", dest[(HEIGHT - 1) * 100 + (WIDTH - 1)]); - - show_image (dest_img); - - pixman_image_unref (src_img); - pixman_image_unref (grad_img); - pixman_image_unref (alpha_img); - free (dest); - - return 0; -} diff --git a/pixman/test/clip-in.c b/pixman/test/clip-in.c deleted file mode 100644 index 51579811f..000000000 --- a/pixman/test/clip-in.c +++ /dev/null @@ -1,50 +0,0 @@ -#include -#include -#include -#include "pixman.h" -#include "gtk-utils.h" - -/* This test demonstrates that clipping is done totally different depending - * on whether the source is transformed or not. 
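
Note: for anyone wanting to reproduce what this removed test demonstrates without GTK, the sketch below is a guessed minimal console version: the same images and IN composites as clip-in.c, with show_image replaced by printing one destination pixel before and after the near-identity source transform is set. The exact values printed are the behavior under test (they depend on how the pixman version at hand clips untransformed versus transformed sources), so treat this as a probe rather than an assertion:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "pixman.h"

int
main (void)
{
    uint32_t *sbits = calloc (25 * 25, 4);   /* fully transparent source */
    uint32_t *bits = malloc (200 * 200 * 4);
    /* Same near-identity transform as the test: a -0.1 pixel translation,
     * just enough to make pixman treat the source as transformed. */
    pixman_transform_t trans = {
        {
            { pixman_double_to_fixed (1.0), pixman_double_to_fixed (0), pixman_double_to_fixed (-0.1), },
            { pixman_double_to_fixed (0), pixman_double_to_fixed (1), pixman_double_to_fixed (-0.1), },
            { pixman_double_to_fixed (0), pixman_double_to_fixed (0), pixman_double_to_fixed (1.0) }
        } };
    pixman_image_t *src = pixman_image_create_bits (PIXMAN_a8r8g8b8, 25, 25, sbits, 4 * 25);
    pixman_image_t *dest = pixman_image_create_bits (PIXMAN_a8r8g8b8, 200, 200, bits, 4 * 200);

    memset (bits, 0xff, 200 * 200 * 4);

    pixman_image_composite (PIXMAN_OP_IN, src, NULL, dest,
                            0, 0, 0, 0, 0, 0, 200, 200);
    printf ("untransformed IN, pixel (100,100): %08x\n",
            (unsigned) bits[100 * 200 + 100]);

    pixman_image_set_transform (src, &trans);
    pixman_image_composite (PIXMAN_OP_IN, src, NULL, dest,
                            0, 0, 0, 0, 0, 0, 200, 200);
    printf ("transformed IN, pixel (100,100): %08x\n",
            (unsigned) bits[100 * 200 + 100]);

    pixman_image_unref (src);
    pixman_image_unref (dest);
    free (sbits);
    free (bits);
    return 0;
}
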
- */ -int -main (int argc, char **argv) -{ -#define WIDTH 200 -#define HEIGHT 200 - -#define SMALL 25 - - uint32_t *sbits = malloc (SMALL * SMALL * 4); - uint32_t *bits = malloc (WIDTH * HEIGHT * 4); - pixman_transform_t trans = { - { - { pixman_double_to_fixed (1.0), pixman_double_to_fixed (0), pixman_double_to_fixed (-0.1), }, - { pixman_double_to_fixed (0), pixman_double_to_fixed (1), pixman_double_to_fixed (-0.1), }, - { pixman_double_to_fixed (0), pixman_double_to_fixed (0), pixman_double_to_fixed (1.0) } - } }; - - pixman_image_t *src_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, SMALL, SMALL, sbits, 4 * SMALL); - pixman_image_t *dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, bits, 4 * WIDTH); - - memset (bits, 0xff, WIDTH * HEIGHT * 4); - memset (sbits, 0x00, SMALL * SMALL * 4); - - pixman_image_composite (PIXMAN_OP_IN, - src_img, NULL, dest_img, - 0, 0, 0, 0, SMALL, SMALL, 200, 200); - - pixman_image_set_transform (src_img, &trans); - - pixman_image_composite (PIXMAN_OP_IN, - src_img, NULL, dest_img, - 0, 0, 0, 0, SMALL * 2, SMALL * 2, 200, 200); - - show_image (dest_img); - - pixman_image_unref (src_img); - pixman_image_unref (dest_img); - free (bits); - - return 0; -} diff --git a/pixman/test/clip-test.c b/pixman/test/clip-test.c deleted file mode 100644 index aa0df4482..000000000 --- a/pixman/test/clip-test.c +++ /dev/null @@ -1,97 +0,0 @@ -#include -#include -#include "pixman.h" -#include "gtk-utils.h" - -#define WIDTH 200 -#define HEIGHT 200 - -static pixman_image_t * -create_solid_bits (uint32_t pixel) -{ - uint32_t *pixels = malloc (WIDTH * HEIGHT * 4); - int i; - - for (i = 0; i < WIDTH * HEIGHT; ++i) - pixels[i] = pixel; - - return pixman_image_create_bits (PIXMAN_a8r8g8b8, - WIDTH, HEIGHT, - pixels, - WIDTH * 4); -} - -int -main (int argc, char **argv) -{ - pixman_image_t *gradient_img; - pixman_image_t *src_img, *dst_img; - pixman_gradient_stop_t stops[2] = - { - { pixman_int_to_fixed (0), { 0xffff, 0x0000, 0x0000, 0xffff } }, - { pixman_int_to_fixed (1), { 0xffff, 0xffff, 0x0000, 0xffff } } - }; -#if 0 - pixman_point_fixed_t p1 = { 0, 0 }; - pixman_point_fixed_t p2 = { pixman_int_to_fixed (WIDTH), - pixman_int_to_fixed (HEIGHT) }; -#endif - pixman_point_fixed_t c_inner; - pixman_point_fixed_t c_outer; - pixman_fixed_t r_inner; - pixman_fixed_t r_outer; - pixman_region32_t clip_region; - pixman_transform_t trans = { - { { pixman_double_to_fixed (1.3), pixman_double_to_fixed (0), pixman_double_to_fixed (-0.5), }, - { pixman_double_to_fixed (0), pixman_double_to_fixed (1), pixman_double_to_fixed (-0.5), }, - { pixman_double_to_fixed (0), pixman_double_to_fixed (0), pixman_double_to_fixed (1.0) } - } - }; - - src_img = create_solid_bits (0xff0000ff); - - c_inner.x = pixman_double_to_fixed (100.0); - c_inner.y = pixman_double_to_fixed (100.0); - c_outer.x = pixman_double_to_fixed (100.0); - c_outer.y = pixman_double_to_fixed (100.0); - r_inner = 0; - r_outer = pixman_double_to_fixed (100.0); - - gradient_img = pixman_image_create_radial_gradient (&c_inner, &c_outer, - r_inner, r_outer, - stops, 2); - -#if 0 - gradient_img = pixman_image_create_linear_gradient (&p1, &p2, - stops, 2); - -#endif - - pixman_image_composite (PIXMAN_OP_OVER, gradient_img, NULL, src_img, - 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT); - - pixman_region32_init_rect (&clip_region, 50, 0, 100, 200); - pixman_image_set_clip_region32 (src_img, &clip_region); - pixman_image_set_source_clipping (src_img, TRUE); - pixman_image_set_has_client_clip (src_img, TRUE); - pixman_image_set_transform 
(src_img, &trans); - pixman_image_set_repeat (src_img, PIXMAN_REPEAT_NORMAL); - - dst_img = create_solid_bits (0xffff0000); - pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dst_img, - 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT); - - -#if 0 - printf ("0, 0: %x\n", src[0]); - printf ("10, 10: %x\n", src[10 * 10 + 10]); - printf ("w, h: %x\n", src[(HEIGHT - 1) * 100 + (WIDTH - 1)]); -#endif - - show_image (dst_img); - - pixman_image_unref (gradient_img); - pixman_image_unref (src_img); - - return 0; -} diff --git a/pixman/test/composite-test.c b/pixman/test/composite-test.c deleted file mode 100644 index 79d5d5eac..000000000 --- a/pixman/test/composite-test.c +++ /dev/null @@ -1,191 +0,0 @@ -#include -#include -#include -#include "pixman.h" -#include "gtk-utils.h" - -#define WIDTH 60 -#define HEIGHT 60 - -typedef struct { - const char *name; - pixman_op_t op; -} operator_t; - -static const operator_t operators[] = { - { "CLEAR", PIXMAN_OP_CLEAR }, - { "SRC", PIXMAN_OP_SRC }, - { "DST", PIXMAN_OP_DST }, - { "OVER", PIXMAN_OP_OVER }, - { "OVER_REVERSE", PIXMAN_OP_OVER_REVERSE }, - { "IN", PIXMAN_OP_IN }, - { "IN_REVERSE", PIXMAN_OP_IN_REVERSE }, - { "OUT", PIXMAN_OP_OUT }, - { "OUT_REVERSE", PIXMAN_OP_OUT_REVERSE }, - { "ATOP", PIXMAN_OP_ATOP }, - { "ATOP_REVERSE", PIXMAN_OP_ATOP_REVERSE }, - { "XOR", PIXMAN_OP_XOR }, - { "ADD", PIXMAN_OP_ADD }, - { "SATURATE", PIXMAN_OP_SATURATE }, - - { "MULTIPLY", PIXMAN_OP_MULTIPLY }, - { "SCREEN", PIXMAN_OP_SCREEN }, - { "OVERLAY", PIXMAN_OP_OVERLAY }, - { "DARKEN", PIXMAN_OP_DARKEN }, - { "LIGHTEN", PIXMAN_OP_LIGHTEN }, - { "COLOR_DODGE", PIXMAN_OP_COLOR_DODGE }, - { "COLOR_BURN", PIXMAN_OP_COLOR_BURN }, - { "HARD_LIGHT", PIXMAN_OP_HARD_LIGHT }, - { "SOFT_LIGHT", PIXMAN_OP_SOFT_LIGHT }, - { "DIFFERENCE", PIXMAN_OP_DIFFERENCE }, - { "EXCLUSION", PIXMAN_OP_EXCLUSION }, - { "HSL_HUE", PIXMAN_OP_HSL_HUE }, - { "HSL_SATURATION", PIXMAN_OP_HSL_SATURATION }, - { "HSL_COLOR", PIXMAN_OP_HSL_COLOR }, - { "HSL_LUMINOSITY", PIXMAN_OP_HSL_LUMINOSITY }, -}; - -static uint32_t -reader (const void *src, int size) -{ - switch (size) - { - case 1: - return *(uint8_t *)src; - case 2: - return *(uint16_t *)src; - case 4: - return *(uint32_t *)src; - default: - g_assert_not_reached(); - } -} - -static void -writer (void *src, uint32_t value, int size) -{ - switch (size) - { - case 1: - *(uint8_t *)src = value; - break; - - case 2: - *(uint16_t *)src = value; - break; - - case 4: - *(uint32_t *)src = value; - break; - - default: - break; - } -} - -int -main (int argc, char **argv) -{ -#define d2f pixman_double_to_fixed - - GtkWidget *window, *swindow; - GtkWidget *table; - uint32_t *dest = malloc (WIDTH * HEIGHT * 4); - uint32_t *src = malloc (WIDTH * HEIGHT * 4); - pixman_image_t *src_img; - pixman_image_t *dest_img; - pixman_point_fixed_t p1 = { -10 << 0, 0 }; - pixman_point_fixed_t p2 = { WIDTH << 16, (HEIGHT - 10) << 16 }; - uint16_t full = 0xcfff; - uint16_t low = 0x5000; - uint16_t alpha = 0xffff; - pixman_gradient_stop_t stops[6] = - { - { d2f (0.0), { full, low, low, alpha } }, - { d2f (0.25), { full, full, low, alpha } }, - { d2f (0.4), { low, full, low, alpha } }, - { d2f (0.6), { low, full, full, alpha } }, - { d2f (0.8), { low, low, full, alpha } }, - { d2f (1.0), { full, low, full, alpha } }, - }; - - int i; - - gtk_init (&argc, &argv); - - window = gtk_window_new (GTK_WINDOW_TOPLEVEL); - - gtk_window_set_default_size (GTK_WINDOW (window), 800, 600); - - g_signal_connect (window, "delete-event", - G_CALLBACK (gtk_main_quit), - NULL); - table = gtk_table_new 
(G_N_ELEMENTS (operators) / 6, 6, TRUE); - - src_img = pixman_image_create_linear_gradient (&p1, &p2, stops, - sizeof (stops) / sizeof (stops[0])); - - pixman_image_set_repeat (src_img, PIXMAN_REPEAT_PAD); - - dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, - WIDTH, HEIGHT, - dest, - WIDTH * 4); - pixman_image_set_accessors (dest_img, reader, writer); - - for (i = 0; i < G_N_ELEMENTS (operators); ++i) - { - GtkWidget *image; - GdkPixbuf *pixbuf; - GtkWidget *vbox; - GtkWidget *label; - int j, k; - - vbox = gtk_vbox_new (FALSE, 0); - - label = gtk_label_new (operators[i].name); - gtk_box_pack_start (GTK_BOX (vbox), label, FALSE, FALSE, 6); - gtk_widget_show (label); - - for (j = 0; j < HEIGHT; ++j) - { - for (k = 0; k < WIDTH; ++k) - dest[j * WIDTH + k] = 0x7f6f6f00; - } - pixman_image_composite (operators[i].op, src_img, NULL, dest_img, - 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT); - pixbuf = pixbuf_from_argb32 (pixman_image_get_data (dest_img), TRUE, - WIDTH, HEIGHT, WIDTH * 4); - image = gtk_image_new_from_pixbuf (pixbuf); - gtk_box_pack_start (GTK_BOX (vbox), image, FALSE, FALSE, 0); - gtk_widget_show (image); - - gtk_table_attach_defaults (GTK_TABLE (table), vbox, - i % 6, (i % 6) + 1, i / 6, (i / 6) + 1); - gtk_widget_show (vbox); - - g_object_unref (pixbuf); - } - - pixman_image_unref (src_img); - free (src); - pixman_image_unref (dest_img); - free (dest); - - swindow = gtk_scrolled_window_new (NULL, NULL); - gtk_scrolled_window_set_policy (GTK_SCROLLED_WINDOW (swindow), - GTK_POLICY_AUTOMATIC, - GTK_POLICY_AUTOMATIC); - - gtk_scrolled_window_add_with_viewport (GTK_SCROLLED_WINDOW (swindow), table); - gtk_widget_show (table); - - gtk_container_add (GTK_CONTAINER (window), swindow); - gtk_widget_show (swindow); - - gtk_widget_show (window); - - gtk_main (); - - return 0; -} diff --git a/pixman/test/convolution-test.c b/pixman/test/convolution-test.c deleted file mode 100644 index da284af7b..000000000 --- a/pixman/test/convolution-test.c +++ /dev/null @@ -1,47 +0,0 @@ -#include -#include -#include "pixman.h" -#include "gtk-utils.h" - -int -main (int argc, char **argv) -{ -#define WIDTH 200 -#define HEIGHT 200 - -#define d2f pixman_double_to_fixed - - uint32_t *src = malloc (WIDTH * HEIGHT * 4); - uint32_t *mask = malloc (WIDTH * HEIGHT * 4); - uint32_t *dest = malloc (WIDTH * HEIGHT * 4); - pixman_fixed_t convolution[] = - { - d2f (3), d2f (3), - d2f (0.5), d2f (0.5), d2f (0.5), - d2f (0.5), d2f (0.5), d2f (0.5), - d2f (0.5), d2f (0.5), d2f (0.5), - }; - pixman_image_t *simg, *mimg, *dimg; - - int i; - - for (i = 0; i < WIDTH * HEIGHT; ++i) - { - src[i] = 0x7f007f00; - mask[i] = (i % 256) * 0x01000000; - dest[i] = 0; - } - - simg = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src, WIDTH * 4); - mimg = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, mask, WIDTH * 4); - dimg = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, dest, WIDTH * 4); - - pixman_image_set_filter (mimg, PIXMAN_FILTER_CONVOLUTION, - convolution, 11); - - pixman_image_composite (PIXMAN_OP_OVER, simg, mimg, dimg, 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT); - - show_image (dimg); - - return 0; -} diff --git a/pixman/test/gradient-test.c b/pixman/test/gradient-test.c deleted file mode 100644 index fc84844b0..000000000 --- a/pixman/test/gradient-test.c +++ /dev/null @@ -1,89 +0,0 @@ -#include -#include -#include "pixman.h" -#include "gtk-utils.h" - -int -main (int argc, char **argv) -{ -#define WIDTH 400 -#define HEIGHT 200 - - uint32_t *dest = malloc (WIDTH * HEIGHT * 4); - pixman_image_t *src_img; - 
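
Note: a side remark on the convolution[] array in convolution-test.c above, since its layout is easy to misread. For PIXMAN_FILTER_CONVOLUTION the first two fixed-point values are the kernel width and height, and the remaining width * height values are the taps, which is why the test's 3x3 kernel passes 2 + 9 = 11 values to pixman_image_set_filter. A sketch building a normalized 3x3 box blur the same way (set_box_blur is a made-up helper name, not part of the patch):

#include "pixman.h"

/* Hypothetical helper: attach a normalized 3x3 box-blur kernel.
 * Parameter layout: { width, height, taps[width * height] },
 * all in 16.16 fixed point. */
static void
set_box_blur (pixman_image_t *image)
{
    pixman_fixed_t kernel[2 + 3 * 3];
    int i;

    kernel[0] = pixman_int_to_fixed (3);   /* kernel width */
    kernel[1] = pixman_int_to_fixed (3);   /* kernel height */

    for (i = 0; i < 3 * 3; ++i)
        kernel[2 + i] = pixman_double_to_fixed (1.0 / 9.0);

    pixman_image_set_filter (image, PIXMAN_FILTER_CONVOLUTION,
                             kernel, 2 + 3 * 3);
}
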
pixman_image_t *dest_img; - int i; - pixman_gradient_stop_t stops[2] = - { - { pixman_int_to_fixed (0), { 0xffff, 0xeeee, 0xeeee, 0xeeee } }, - { pixman_int_to_fixed (1), { 0xffff, 0x1111, 0x1111, 0x1111 } } - }; - pixman_point_fixed_t p1 = { pixman_double_to_fixed (0), 0 }; - pixman_point_fixed_t p2 = { pixman_double_to_fixed (WIDTH / 8.), - pixman_int_to_fixed (0) }; -#if 0 - pixman_transform_t trans = { - { { pixman_double_to_fixed (2), pixman_double_to_fixed (0.5), pixman_double_to_fixed (-100), }, - { pixman_double_to_fixed (0), pixman_double_to_fixed (3), pixman_double_to_fixed (0), }, - { pixman_double_to_fixed (0), pixman_double_to_fixed (0.000), pixman_double_to_fixed (1.0) } - } - }; -#else - pixman_transform_t trans = { - { { pixman_fixed_1, 0, 0 }, - { 0, pixman_fixed_1, 0 }, - { 0, 0, pixman_fixed_1 } } - }; -#endif - - pixman_point_fixed_t c_inner; - pixman_point_fixed_t c_outer; - pixman_fixed_t r_inner; - pixman_fixed_t r_outer; - - for (i = 0; i < WIDTH * HEIGHT; ++i) - dest[i] = 0x4f00004f; /* pale blue */ - - dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, - WIDTH, HEIGHT, - dest, - WIDTH * 4); - - c_inner.x = pixman_double_to_fixed (50.0); - c_inner.y = pixman_double_to_fixed (50.0); - c_outer.x = pixman_double_to_fixed (50.0); - c_outer.y = pixman_double_to_fixed (50.0); - r_inner = 0; - r_outer = pixman_double_to_fixed (50.0); - - src_img = pixman_image_create_conical_gradient (&c_inner, r_inner, - stops, 2); -#if 0 - src_img = pixman_image_create_conical_gradient (&c_inner, r_inner, - stops, 2); - src_img = pixman_image_create_linear_gradient (&c_inner, &c_outer, - r_inner, r_outer, - stops, 2); -#endif - - src_img = pixman_image_create_linear_gradient (&p1, &p2, - stops, 2); - - pixman_image_set_transform (src_img, &trans); - pixman_image_set_repeat (src_img, PIXMAN_REPEAT_PAD); - - pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dest_img, - 0, 0, 0, 0, 0, 0, 10 * WIDTH, HEIGHT); - - printf ("0, 0: %x\n", dest[0]); - printf ("10, 10: %x\n", dest[10 * 10 + 10]); - printf ("w, h: %x\n", dest[(HEIGHT - 1) * 100 + (WIDTH - 1)]); - - show_image (dest_img); - - pixman_image_unref (src_img); - pixman_image_unref (dest_img); - free (dest); - - return 0; -} diff --git a/pixman/test/gtk-utils.c b/pixman/test/gtk-utils.c deleted file mode 100644 index f45cdc912..000000000 --- a/pixman/test/gtk-utils.c +++ /dev/null @@ -1,115 +0,0 @@ -#include -#include -#include "pixman-private.h" /* For image->bits.format - * FIXME: there should probably be public API for this - */ -#include "gtk-utils.h" - -GdkPixbuf * -pixbuf_from_argb32 (uint32_t *bits, - gboolean has_alpha, - int width, - int height, - int stride) -{ - GdkPixbuf *pixbuf = gdk_pixbuf_new (GDK_COLORSPACE_RGB, TRUE, - 8, width, height); - int p_stride = gdk_pixbuf_get_rowstride (pixbuf); - guint32 *p_bits = (guint32 *)gdk_pixbuf_get_pixels (pixbuf); - int w, h; - - for (h = 0; h < height; ++h) - { - for (w = 0; w < width; ++w) - { - uint32_t argb = bits[h * (stride / 4) + w]; - guint r, g, b, a; - char *pb = (char *)p_bits; - - pb += h * p_stride + w * 4; - - r = (argb & 0x00ff0000) >> 16; - g = (argb & 0x0000ff00) >> 8; - b = (argb & 0x000000ff) >> 0; - a = has_alpha? 
(argb & 0xff000000) >> 24 : 0xff; - - if (a) - { - r = (r * 255) / a; - g = (g * 255) / a; - b = (b * 255) / a; - } - - if (r > 255) r = 255; - if (g > 255) g = 255; - if (b > 255) b = 255; - - pb[0] = r; - pb[1] = g; - pb[2] = b; - pb[3] = a; - } - } - - return pixbuf; -} - - -static gboolean -on_expose (GtkWidget *widget, GdkEventExpose *expose, gpointer data) -{ - GdkPixbuf *pixbuf = data; - - gdk_draw_pixbuf (widget->window, NULL, - pixbuf, 0, 0, 0, 0, - gdk_pixbuf_get_width (pixbuf), - gdk_pixbuf_get_height (pixbuf), - GDK_RGB_DITHER_NONE, - 0, 0); - - return TRUE; -} - -void -show_image (pixman_image_t *image) -{ - GtkWidget *window; - GdkPixbuf *pixbuf; - int width, height, stride; - int argc; - char **argv; - char *arg0 = g_strdup ("pixman-test-program"); - gboolean has_alpha; - pixman_format_code_t format; - - argc = 1; - argv = (char **)&arg0; - - gtk_init (&argc, &argv); - - window = gtk_window_new (GTK_WINDOW_TOPLEVEL); - width = pixman_image_get_width (image); - height = pixman_image_get_height (image); - stride = pixman_image_get_stride (image); - - gtk_window_set_default_size (GTK_WINDOW (window), width, height); - - format = image->bits.format; - - if (format == PIXMAN_a8r8g8b8) - has_alpha = TRUE; - else if (format == PIXMAN_x8r8g8b8) - has_alpha = FALSE; - else - g_error ("Can't deal with this format: %x\n", format); - - pixbuf = pixbuf_from_argb32 (pixman_image_get_data (image), has_alpha, - width, height, stride); - - g_signal_connect (window, "expose_event", G_CALLBACK (on_expose), pixbuf); - g_signal_connect (window, "delete_event", G_CALLBACK (gtk_main_quit), NULL); - - gtk_widget_show (window); - - gtk_main (); -} diff --git a/pixman/test/gtk-utils.h b/pixman/test/gtk-utils.h deleted file mode 100644 index 2cb13bcf0..000000000 --- a/pixman/test/gtk-utils.h +++ /dev/null @@ -1,13 +0,0 @@ -#include -#include -#include -#include -#include "pixman.h" - -void show_image (pixman_image_t *image); - -GdkPixbuf *pixbuf_from_argb32 (uint32_t *bits, - gboolean has_alpha, - int width, - int height, - int stride); diff --git a/pixman/test/radial-test.c b/pixman/test/radial-test.c deleted file mode 100644 index 5d716c339..000000000 --- a/pixman/test/radial-test.c +++ /dev/null @@ -1,198 +0,0 @@ -#include "utils.h" -#include "gtk-utils.h" - -#define NUM_GRADIENTS 7 -#define NUM_STOPS 3 -#define NUM_REPEAT 4 -#define SIZE 128 -#define WIDTH (SIZE * NUM_GRADIENTS) -#define HEIGHT (SIZE * NUM_REPEAT) - -/* - * We want to test all the possible relative positions of the start - * and end circle: - * - * - The start circle can be smaller/equal/bigger than the end - * circle. A radial gradient can be classified in one of these - * three cases depending on the sign of dr. - * - * - The smaller circle can be completely inside/internally - * tangent/outside (at least in part) of the bigger circle. This - * classification is the same as the one which can be computed by - * examining the sign of a = (dx^2 + dy^2 - dr^2). - * - * - If the two circles have the same size, neither can be inside or - * internally tangent - * - * This test draws radial gradients whose circles always have the same - * centers (0, 0) and (1, 0), but with different radiuses. 
From left - * to right: - * - * - Small start circle completely inside the end circle - * 0.25 -> 1.75; dr = 1.5 > 0; a = 1 - 1.50^2 < 0 - * - * - Small start circle internally tangent to the end circle - * 0.50 -> 1.50; dr = 1.0 > 0; a = 1 - 1.00^2 = 0 - * - * - Small start circle outside of the end circle - * 0.50 -> 1.00; dr = 0.5 > 0; a = 1 - 0.50^2 > 0 - * - * - Start circle with the same size as the end circle - * 1.00 -> 1.00; dr = 0.0 = 0; a = 1 - 0.00^2 > 0 - * - * - Small end circle outside of the start circle - * 1.00 -> 0.50; dr = -0.5 < 0; a = 1 - 0.50^2 > 0 - * - * - Small end circle internally tangent to the start circle - * 1.50 -> 0.50; dr = -1.0 < 0; a = 1 - 1.00^2 = 0 - * - * - Small end circle completely inside the start circle - * 1.75 -> 0.25; dr = -1.5 < 0; a = 1 - 1.50^2 < 0 - * - */ - -const static double radiuses[NUM_GRADIENTS] = { - 0.25, - 0.50, - 0.50, - 1.00, - 1.00, - 1.50, - 1.75 -}; - -#define double_to_color(x) \ - (((uint32_t) ((x)*65536)) - (((uint32_t) ((x)*65536)) >> 16)) - -#define PIXMAN_STOP(offset,r,g,b,a) \ - { pixman_double_to_fixed (offset), \ - { \ - double_to_color (r), \ - double_to_color (g), \ - double_to_color (b), \ - double_to_color (a) \ - } \ - } - -static const pixman_gradient_stop_t stops[NUM_STOPS] = { - PIXMAN_STOP (0.0, 1, 0, 0, 0.75), - PIXMAN_STOP (0.70710678, 0, 1, 0, 0), - PIXMAN_STOP (1.0, 0, 0, 1, 1) -}; - -static pixman_image_t * -create_radial (int index) -{ - pixman_point_fixed_t p0, p1; - pixman_fixed_t r0, r1; - double x0, x1, radius0, radius1, left, right, center; - - x0 = 0; - x1 = 1; - radius0 = radiuses[index]; - radius1 = radiuses[NUM_GRADIENTS - index - 1]; - - /* center the gradient */ - left = MIN (x0 - radius0, x1 - radius1); - right = MAX (x0 + radius0, x1 + radius1); - center = (left + right) * 0.5; - x0 -= center; - x1 -= center; - - /* scale to make it fit within a 1x1 rect centered in (0,0) */ - x0 *= 0.25; - x1 *= 0.25; - radius0 *= 0.25; - radius1 *= 0.25; - - p0.x = pixman_double_to_fixed (x0); - p0.y = pixman_double_to_fixed (0); - - p1.x = pixman_double_to_fixed (x1); - p1.y = pixman_double_to_fixed (0); - - r0 = pixman_double_to_fixed (radius0); - r1 = pixman_double_to_fixed (radius1); - - return pixman_image_create_radial_gradient (&p0, &p1, - r0, r1, - stops, NUM_STOPS); -} - -static const pixman_repeat_t repeat[NUM_REPEAT] = { - PIXMAN_REPEAT_NONE, - PIXMAN_REPEAT_NORMAL, - PIXMAN_REPEAT_REFLECT, - PIXMAN_REPEAT_PAD -}; - -int -main (int argc, char **argv) -{ - pixman_transform_t transform; - pixman_image_t *src_img, *dest_img; - int i, j; - - enable_fp_exceptions (); - - dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, - WIDTH, HEIGHT, - NULL, 0); - - pixman_transform_init_identity (&transform); - - /* - * The create_radial() function returns gradients centered in the - * origin and whose interesting part fits a 1x1 square. We want to - * paint these gradients on a SIZExSIZE square and to make things - * easier we want the origin in the top-left corner of the square - * we want to see. - */ - pixman_transform_translate (NULL, &transform, - pixman_double_to_fixed (0.5), - pixman_double_to_fixed (0.5)); - - pixman_transform_scale (NULL, &transform, - pixman_double_to_fixed (SIZE), - pixman_double_to_fixed (SIZE)); - - /* - * Gradients are evaluated at the center of each pixel, so we need - * to translate by half a pixel to trigger some interesting - * cornercases.
In particular, the original implementation of PDF - * radial gradients tried to divide by 0 when using this transform - * on the "tangent circles" cases. - */ - pixman_transform_translate (NULL, &transform, - pixman_double_to_fixed (0.5), - pixman_double_to_fixed (0.5)); - - for (i = 0; i < NUM_GRADIENTS; i++) - { - src_img = create_radial (i); - pixman_image_set_transform (src_img, &transform); - - for (j = 0; j < NUM_REPEAT; j++) - { - pixman_image_set_repeat (src_img, repeat[j]); - - pixman_image_composite32 (PIXMAN_OP_OVER, - src_img, - NULL, - dest_img, - 0, 0, - 0, 0, - i * SIZE, j * SIZE, - SIZE, SIZE); - - } - - pixman_image_unref (src_img); - } - - show_image (dest_img); - - pixman_image_unref (dest_img); - - return 0; -} diff --git a/pixman/test/scaling-test.c b/pixman/test/scaling-test.c index 7b78017a3..dbb9d39b0 100644 --- a/pixman/test/scaling-test.c +++ b/pixman/test/scaling-test.c @@ -1,250 +1,368 @@ -/* - * Test program, which can detect some problems with nearest neighbour - * and bilinear scaling in pixman. Testing is done by running lots - * of random SRC and OVER compositing operations a8r8g8b8, x8a8r8g8b8 - * and r5g6b5 color formats. - * - * Script 'fuzzer-find-diff.pl' can be used to narrow down the problem in - * the case of test failure. - */ -#include -#include -#include -#include "utils.h" - -#define MAX_SRC_WIDTH 16 -#define MAX_SRC_HEIGHT 16 -#define MAX_DST_WIDTH 16 -#define MAX_DST_HEIGHT 16 -#define MAX_STRIDE 4 - -/* - * Composite operation with pseudorandom images - */ -uint32_t -test_composite (int testnum, - int verbose) -{ - int i; - pixman_image_t * src_img; - pixman_image_t * dst_img; - pixman_transform_t transform; - pixman_region16_t clip; - int src_width, src_height; - int dst_width, dst_height; - int src_stride, dst_stride; - int src_x, src_y; - int dst_x, dst_y; - int src_bpp; - int dst_bpp; - int w, h; - pixman_fixed_t scale_x = 65536, scale_y = 65536; - pixman_fixed_t translate_x = 0, translate_y = 0; - pixman_op_t op; - pixman_repeat_t repeat = PIXMAN_REPEAT_NONE; - pixman_format_code_t src_fmt, dst_fmt; - uint32_t * srcbuf; - uint32_t * dstbuf; - uint32_t crc32; - FLOAT_REGS_CORRUPTION_DETECTOR_START (); - - lcg_srand (testnum); - - src_bpp = (lcg_rand_n (2) == 0) ? 2 : 4; - dst_bpp = (lcg_rand_n (2) == 0) ? 2 : 4; - op = (lcg_rand_n (2) == 0) ? PIXMAN_OP_SRC : PIXMAN_OP_OVER; - - src_width = lcg_rand_n (MAX_SRC_WIDTH) + 1; - src_height = lcg_rand_n (MAX_SRC_HEIGHT) + 1; - dst_width = lcg_rand_n (MAX_DST_WIDTH) + 1; - dst_height = lcg_rand_n (MAX_DST_HEIGHT) + 1; - src_stride = src_width * src_bpp + lcg_rand_n (MAX_STRIDE) * src_bpp; - dst_stride = dst_width * dst_bpp + lcg_rand_n (MAX_STRIDE) * dst_bpp; - - if (src_stride & 3) - src_stride += 2; - - if (dst_stride & 3) - dst_stride += 2; - - src_x = -(src_width / 4) + lcg_rand_n (src_width * 3 / 2); - src_y = -(src_height / 4) + lcg_rand_n (src_height * 3 / 2); - dst_x = -(dst_width / 4) + lcg_rand_n (dst_width * 3 / 2); - dst_y = -(dst_height / 4) + lcg_rand_n (dst_height * 3 / 2); - w = lcg_rand_n (dst_width * 3 / 2 - dst_x); - h = lcg_rand_n (dst_height * 3 / 2 - dst_y); - - srcbuf = (uint32_t *)malloc (src_stride * src_height); - dstbuf = (uint32_t *)malloc (dst_stride * dst_height); - - for (i = 0; i < src_stride * src_height; i++) - *((uint8_t *)srcbuf + i) = lcg_rand_n (256); - - for (i = 0; i < dst_stride * dst_height; i++) - *((uint8_t *)dstbuf + i) = lcg_rand_n (256); - - src_fmt = src_bpp == 4 ? (lcg_rand_n (2) == 0 ? 
- PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5; - - dst_fmt = dst_bpp == 4 ? (lcg_rand_n (2) == 0 ? - PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5; - - src_img = pixman_image_create_bits ( - src_fmt, src_width, src_height, srcbuf, src_stride); - - dst_img = pixman_image_create_bits ( - dst_fmt, dst_width, dst_height, dstbuf, dst_stride); - - image_endian_swap (src_img, src_bpp * 8); - image_endian_swap (dst_img, dst_bpp * 8); - - if (lcg_rand_n (8) > 0) - { - scale_x = -32768 * 3 + lcg_rand_N (65536 * 5); - scale_y = -32768 * 3 + lcg_rand_N (65536 * 5); - translate_x = lcg_rand_N (65536); - translate_y = lcg_rand_N (65536); - pixman_transform_init_scale (&transform, scale_x, scale_y); - pixman_transform_translate (&transform, NULL, translate_x, translate_y); - pixman_image_set_transform (src_img, &transform); - } - - switch (lcg_rand_n (4)) - { - case 0: - repeat = PIXMAN_REPEAT_NONE; - break; - - case 1: - repeat = PIXMAN_REPEAT_NORMAL; - break; - - case 2: - repeat = PIXMAN_REPEAT_PAD; - break; - - case 3: - repeat = PIXMAN_REPEAT_REFLECT; - break; - - default: - break; - } - pixman_image_set_repeat (src_img, repeat); - - if (lcg_rand_n (2)) - pixman_image_set_filter (src_img, PIXMAN_FILTER_NEAREST, NULL, 0); - else - pixman_image_set_filter (src_img, PIXMAN_FILTER_BILINEAR, NULL, 0); - - if (verbose) - { - printf ("src_fmt=%08X, dst_fmt=%08X\n", src_fmt, dst_fmt); - printf ("op=%d, scale_x=%d, scale_y=%d, repeat=%d\n", - op, scale_x, scale_y, repeat); - printf ("translate_x=%d, translate_y=%d\n", - translate_x, translate_y); - printf ("src_width=%d, src_height=%d, dst_width=%d, dst_height=%d\n", - src_width, src_height, dst_width, dst_height); - printf ("src_x=%d, src_y=%d, dst_x=%d, dst_y=%d\n", - src_x, src_y, dst_x, dst_y); - printf ("w=%d, h=%d\n", w, h); - } - - if (lcg_rand_n (8) == 0) - { - pixman_box16_t clip_boxes[2]; - int n = lcg_rand_n (2) + 1; - - for (i = 0; i < n; i++) - { - clip_boxes[i].x1 = lcg_rand_n (src_width); - clip_boxes[i].y1 = lcg_rand_n (src_height); - clip_boxes[i].x2 = - clip_boxes[i].x1 + lcg_rand_n (src_width - clip_boxes[i].x1); - clip_boxes[i].y2 = - clip_boxes[i].y1 + lcg_rand_n (src_height - clip_boxes[i].y1); - - if (verbose) - { - printf ("source clip box: [%d,%d-%d,%d]\n", - clip_boxes[i].x1, clip_boxes[i].y1, - clip_boxes[i].x2, clip_boxes[i].y2); - } - } - - pixman_region_init_rects (&clip, clip_boxes, n); - pixman_image_set_clip_region (src_img, &clip); - pixman_image_set_source_clipping (src_img, 1); - pixman_region_fini (&clip); - } - - if (lcg_rand_n (8) == 0) - { - pixman_box16_t clip_boxes[2]; - int n = lcg_rand_n (2) + 1; - for (i = 0; i < n; i++) - { - clip_boxes[i].x1 = lcg_rand_n (dst_width); - clip_boxes[i].y1 = lcg_rand_n (dst_height); - clip_boxes[i].x2 = - clip_boxes[i].x1 + lcg_rand_n (dst_width - clip_boxes[i].x1); - clip_boxes[i].y2 = - clip_boxes[i].y1 + lcg_rand_n (dst_height - clip_boxes[i].y1); - - if (verbose) - { - printf ("destination clip box: [%d,%d-%d,%d]\n", - clip_boxes[i].x1, clip_boxes[i].y1, - clip_boxes[i].x2, clip_boxes[i].y2); - } - } - pixman_region_init_rects (&clip, clip_boxes, n); - pixman_image_set_clip_region (dst_img, &clip); - pixman_region_fini (&clip); - } - - pixman_image_composite (op, src_img, NULL, dst_img, - src_x, src_y, 0, 0, dst_x, dst_y, w, h); - - if (dst_fmt == PIXMAN_x8r8g8b8) - { - /* ignore unused part */ - for (i = 0; i < dst_stride * dst_height / 4; i++) - dstbuf[i] &= 0xFFFFFF; - } - - image_endian_swap (dst_img, dst_bpp * 8); - - if (verbose) - { - int j; - - for (i = 0; i 
< dst_height; i++) - { - for (j = 0; j < dst_stride; j++) - printf ("%02X ", *((uint8_t *)dstbuf + i * dst_stride + j)); - - printf ("\n"); - } - } - - pixman_image_unref (src_img); - pixman_image_unref (dst_img); - - crc32 = compute_crc32 (0, dstbuf, dst_stride * dst_height); - free (srcbuf); - free (dstbuf); - - FLOAT_REGS_CORRUPTION_DETECTOR_FINISH (); - return crc32; -} - -int -main (int argc, const char *argv[]) -{ - pixman_disable_out_of_bounds_workaround (); - - return fuzzer_test_main("scaling", 8000000, 0x7F1AB59F, - test_composite, argc, argv); -} +/* + * Test program, which can detect some problems with nearest neighbour + * and bilinear scaling in pixman. Testing is done by running lots + * of random SRC and OVER compositing operations a8r8g8b8, x8a8r8g8b8 + * and r5g6b5 color formats. + * + * Script 'fuzzer-find-diff.pl' can be used to narrow down the problem in + * the case of test failure. + */ +#include +#include +#include +#include "utils.h" + +#define MAX_SRC_WIDTH 48 +#define MAX_SRC_HEIGHT 8 +#define MAX_DST_WIDTH 48 +#define MAX_DST_HEIGHT 8 +#define MAX_STRIDE 4 + +/* + * Composite operation with pseudorandom images + */ +uint32_t +test_composite (int testnum, + int verbose) +{ + int i; + pixman_image_t * src_img; + pixman_image_t * mask_img; + pixman_image_t * dst_img; + pixman_transform_t transform; + pixman_region16_t clip; + int src_width, src_height; + int mask_width, mask_height; + int dst_width, dst_height; + int src_stride, mask_stride, dst_stride; + int src_x, src_y; + int mask_x, mask_y; + int dst_x, dst_y; + int src_bpp; + int mask_bpp = 1; + int dst_bpp; + int w, h; + pixman_fixed_t scale_x = 65536, scale_y = 65536; + pixman_fixed_t translate_x = 0, translate_y = 0; + pixman_fixed_t mask_scale_x = 65536, mask_scale_y = 65536; + pixman_fixed_t mask_translate_x = 0, mask_translate_y = 0; + pixman_op_t op; + pixman_repeat_t repeat = PIXMAN_REPEAT_NONE; + pixman_repeat_t mask_repeat = PIXMAN_REPEAT_NONE; + pixman_format_code_t src_fmt, dst_fmt; + uint32_t * srcbuf; + uint32_t * dstbuf; + uint32_t * maskbuf; + uint32_t crc32; + FLOAT_REGS_CORRUPTION_DETECTOR_START (); + + lcg_srand (testnum); + + src_bpp = (lcg_rand_n (2) == 0) ? 2 : 4; + dst_bpp = (lcg_rand_n (2) == 0) ? 
2 : 4; + switch (lcg_rand_n (3)) + { + case 0: + op = PIXMAN_OP_SRC; + break; + case 1: + op = PIXMAN_OP_OVER; + break; + default: + op = PIXMAN_OP_ADD; + break; + } + + src_width = lcg_rand_n (MAX_SRC_WIDTH) + 1; + src_height = lcg_rand_n (MAX_SRC_HEIGHT) + 1; + + if (lcg_rand_n (2)) + { + mask_width = lcg_rand_n (MAX_SRC_WIDTH) + 1; + mask_height = lcg_rand_n (MAX_SRC_HEIGHT) + 1; + } + else + { + mask_width = mask_height = 1; + } + + dst_width = lcg_rand_n (MAX_DST_WIDTH) + 1; + dst_height = lcg_rand_n (MAX_DST_HEIGHT) + 1; + src_stride = src_width * src_bpp + lcg_rand_n (MAX_STRIDE) * src_bpp; + mask_stride = mask_width * mask_bpp + lcg_rand_n (MAX_STRIDE) * mask_bpp; + dst_stride = dst_width * dst_bpp + lcg_rand_n (MAX_STRIDE) * dst_bpp; + + if (src_stride & 3) + src_stride += 2; + + if (mask_stride & 1) + mask_stride += 1; + if (mask_stride & 2) + mask_stride += 2; + + if (dst_stride & 3) + dst_stride += 2; + + src_x = -(src_width / 4) + lcg_rand_n (src_width * 3 / 2); + src_y = -(src_height / 4) + lcg_rand_n (src_height * 3 / 2); + mask_x = -(mask_width / 4) + lcg_rand_n (mask_width * 3 / 2); + mask_y = -(mask_height / 4) + lcg_rand_n (mask_height * 3 / 2); + dst_x = -(dst_width / 4) + lcg_rand_n (dst_width * 3 / 2); + dst_y = -(dst_height / 4) + lcg_rand_n (dst_height * 3 / 2); + w = lcg_rand_n (dst_width * 3 / 2 - dst_x); + h = lcg_rand_n (dst_height * 3 / 2 - dst_y); + + srcbuf = (uint32_t *)malloc (src_stride * src_height); + maskbuf = (uint32_t *)malloc (mask_stride * mask_height); + dstbuf = (uint32_t *)malloc (dst_stride * dst_height); + + for (i = 0; i < src_stride * src_height; i++) + *((uint8_t *)srcbuf + i) = lcg_rand_n (256); + + for (i = 0; i < mask_stride * mask_height; i++) + *((uint8_t *)maskbuf + i) = lcg_rand_n (256); + + for (i = 0; i < dst_stride * dst_height; i++) + *((uint8_t *)dstbuf + i) = lcg_rand_n (256); + + src_fmt = src_bpp == 4 ? (lcg_rand_n (2) == 0 ? + PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5; + + dst_fmt = dst_bpp == 4 ? (lcg_rand_n (2) == 0 ? 
+ PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5; + + src_img = pixman_image_create_bits ( + src_fmt, src_width, src_height, srcbuf, src_stride); + + mask_img = pixman_image_create_bits ( + PIXMAN_a8, mask_width, mask_height, maskbuf, mask_stride); + + dst_img = pixman_image_create_bits ( + dst_fmt, dst_width, dst_height, dstbuf, dst_stride); + + image_endian_swap (src_img, src_bpp * 8); + image_endian_swap (dst_img, dst_bpp * 8); + + if (lcg_rand_n (4) > 0) + { + scale_x = -32768 * 3 + lcg_rand_N (65536 * 5); + scale_y = -32768 * 3 + lcg_rand_N (65536 * 5); + translate_x = lcg_rand_N (65536); + translate_y = lcg_rand_N (65536); + pixman_transform_init_scale (&transform, scale_x, scale_y); + pixman_transform_translate (&transform, NULL, translate_x, translate_y); + pixman_image_set_transform (src_img, &transform); + } + + if (lcg_rand_n (2) > 0) + { + mask_scale_x = -32768 * 3 + lcg_rand_N (65536 * 5); + mask_scale_y = -32768 * 3 + lcg_rand_N (65536 * 5); + mask_translate_x = lcg_rand_N (65536); + mask_translate_y = lcg_rand_N (65536); + pixman_transform_init_scale (&transform, mask_scale_x, mask_scale_y); + pixman_transform_translate (&transform, NULL, mask_translate_x, mask_translate_y); + pixman_image_set_transform (mask_img, &transform); + } + + switch (lcg_rand_n (4)) + { + case 0: + mask_repeat = PIXMAN_REPEAT_NONE; + break; + + case 1: + mask_repeat = PIXMAN_REPEAT_NORMAL; + break; + + case 2: + mask_repeat = PIXMAN_REPEAT_PAD; + break; + + case 3: + mask_repeat = PIXMAN_REPEAT_REFLECT; + break; + + default: + break; + } + pixman_image_set_repeat (mask_img, mask_repeat); + + switch (lcg_rand_n (4)) + { + case 0: + repeat = PIXMAN_REPEAT_NONE; + break; + + case 1: + repeat = PIXMAN_REPEAT_NORMAL; + break; + + case 2: + repeat = PIXMAN_REPEAT_PAD; + break; + + case 3: + repeat = PIXMAN_REPEAT_REFLECT; + break; + + default: + break; + } + pixman_image_set_repeat (src_img, repeat); + + if (lcg_rand_n (2)) + pixman_image_set_filter (src_img, PIXMAN_FILTER_NEAREST, NULL, 0); + else + pixman_image_set_filter (src_img, PIXMAN_FILTER_BILINEAR, NULL, 0); + + if (lcg_rand_n (2)) + pixman_image_set_filter (mask_img, PIXMAN_FILTER_NEAREST, NULL, 0); + else + pixman_image_set_filter (mask_img, PIXMAN_FILTER_BILINEAR, NULL, 0); + + if (verbose) + { + printf ("src_fmt=%08X, dst_fmt=%08X\n", src_fmt, dst_fmt); + printf ("op=%d, scale_x=%d, scale_y=%d, repeat=%d\n", + op, scale_x, scale_y, repeat); + printf ("translate_x=%d, translate_y=%d\n", + translate_x, translate_y); + printf ("src_width=%d, src_height=%d, dst_width=%d, dst_height=%d\n", + src_width, src_height, dst_width, dst_height); + printf ("src_x=%d, src_y=%d, dst_x=%d, dst_y=%d\n", + src_x, src_y, dst_x, dst_y); + printf ("w=%d, h=%d\n", w, h); + } + + if (lcg_rand_n (8) == 0) + { + pixman_box16_t clip_boxes[2]; + int n = lcg_rand_n (2) + 1; + + for (i = 0; i < n; i++) + { + clip_boxes[i].x1 = lcg_rand_n (src_width); + clip_boxes[i].y1 = lcg_rand_n (src_height); + clip_boxes[i].x2 = + clip_boxes[i].x1 + lcg_rand_n (src_width - clip_boxes[i].x1); + clip_boxes[i].y2 = + clip_boxes[i].y1 + lcg_rand_n (src_height - clip_boxes[i].y1); + + if (verbose) + { + printf ("source clip box: [%d,%d-%d,%d]\n", + clip_boxes[i].x1, clip_boxes[i].y1, + clip_boxes[i].x2, clip_boxes[i].y2); + } + } + + pixman_region_init_rects (&clip, clip_boxes, n); + pixman_image_set_clip_region (src_img, &clip); + pixman_image_set_source_clipping (src_img, 1); + pixman_region_fini (&clip); + } + + if (lcg_rand_n (8) == 0) + { + pixman_box16_t clip_boxes[2]; + int n = 
lcg_rand_n (2) + 1; + + for (i = 0; i < n; i++) + { + clip_boxes[i].x1 = lcg_rand_n (mask_width); + clip_boxes[i].y1 = lcg_rand_n (mask_height); + clip_boxes[i].x2 = + clip_boxes[i].x1 + lcg_rand_n (mask_width - clip_boxes[i].x1); + clip_boxes[i].y2 = + clip_boxes[i].y1 + lcg_rand_n (mask_height - clip_boxes[i].y1); + + if (verbose) + { + printf ("mask clip box: [%d,%d-%d,%d]\n", + clip_boxes[i].x1, clip_boxes[i].y1, + clip_boxes[i].x2, clip_boxes[i].y2); + } + } + + pixman_region_init_rects (&clip, clip_boxes, n); + pixman_image_set_clip_region (mask_img, &clip); + pixman_image_set_source_clipping (mask_img, 1); + pixman_region_fini (&clip); + } + + if (lcg_rand_n (8) == 0) + { + pixman_box16_t clip_boxes[2]; + int n = lcg_rand_n (2) + 1; + for (i = 0; i < n; i++) + { + clip_boxes[i].x1 = lcg_rand_n (dst_width); + clip_boxes[i].y1 = lcg_rand_n (dst_height); + clip_boxes[i].x2 = + clip_boxes[i].x1 + lcg_rand_n (dst_width - clip_boxes[i].x1); + clip_boxes[i].y2 = + clip_boxes[i].y1 + lcg_rand_n (dst_height - clip_boxes[i].y1); + + if (verbose) + { + printf ("destination clip box: [%d,%d-%d,%d]\n", + clip_boxes[i].x1, clip_boxes[i].y1, + clip_boxes[i].x2, clip_boxes[i].y2); + } + } + pixman_region_init_rects (&clip, clip_boxes, n); + pixman_image_set_clip_region (dst_img, &clip); + pixman_region_fini (&clip); + } + + if (lcg_rand_n (2) == 0) + pixman_image_composite (op, src_img, NULL, dst_img, + src_x, src_y, 0, 0, dst_x, dst_y, w, h); + else + pixman_image_composite (op, src_img, mask_img, dst_img, + src_x, src_y, mask_x, mask_y, dst_x, dst_y, w, h); + + if (dst_fmt == PIXMAN_x8r8g8b8) + { + /* ignore unused part */ + for (i = 0; i < dst_stride * dst_height / 4; i++) + dstbuf[i] &= 0xFFFFFF; + } + + image_endian_swap (dst_img, dst_bpp * 8); + + if (verbose) + { + int j; + + for (i = 0; i < dst_height; i++) + { + for (j = 0; j < dst_stride; j++) + printf ("%02X ", *((uint8_t *)dstbuf + i * dst_stride + j)); + + printf ("\n"); + } + } + + pixman_image_unref (src_img); + pixman_image_unref (mask_img); + pixman_image_unref (dst_img); + + crc32 = compute_crc32 (0, dstbuf, dst_stride * dst_height); + free (srcbuf); + free (maskbuf); + free (dstbuf); + + FLOAT_REGS_CORRUPTION_DETECTOR_FINISH (); + return crc32; +} + +int +main (int argc, const char *argv[]) +{ + pixman_disable_out_of_bounds_workaround (); + + return fuzzer_test_main("scaling", 8000000, 0x80DF1CB2, + test_composite, argc, argv); +} diff --git a/pixman/test/screen-test.c b/pixman/test/screen-test.c deleted file mode 100644 index e69dba3de..000000000 --- a/pixman/test/screen-test.c +++ /dev/null @@ -1,44 +0,0 @@ -#include -#include -#include "pixman.h" -#include "gtk-utils.h" - -int -main (int argc, char **argv) -{ -#define WIDTH 40 -#define HEIGHT 40 - - uint32_t *src1 = malloc (WIDTH * HEIGHT * 4); - uint32_t *src2 = malloc (WIDTH * HEIGHT * 4); - uint32_t *src3 = malloc (WIDTH * HEIGHT * 4); - uint32_t *dest = malloc (3 * WIDTH * 2 * HEIGHT * 4); - pixman_image_t *simg1, *simg2, *simg3, *dimg; - - int i; - - for (i = 0; i < WIDTH * HEIGHT; ++i) - { - src1[i] = 0x7ff00000; - src2[i] = 0x7f00ff00; - src3[i] = 0x7f0000ff; - } - - for (i = 0; i < 3 * WIDTH * 2 * HEIGHT; ++i) - { - dest[i] = 0x0; - } - - simg1 = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src1, WIDTH * 4); - simg2 = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src2, WIDTH * 4); - simg3 = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src3, WIDTH * 4); - dimg = pixman_image_create_bits (PIXMAN_a8r8g8b8, 3 * WIDTH, 2 * HEIGHT, dest, 
3 * WIDTH * 4); - - pixman_image_composite (PIXMAN_OP_SCREEN, simg1, NULL, dimg, 0, 0, 0, 0, WIDTH, HEIGHT / 4, WIDTH, HEIGHT); - pixman_image_composite (PIXMAN_OP_SCREEN, simg2, NULL, dimg, 0, 0, 0, 0, (WIDTH/2), HEIGHT / 4 + HEIGHT / 2, WIDTH, HEIGHT); - pixman_image_composite (PIXMAN_OP_SCREEN, simg3, NULL, dimg, 0, 0, 0, 0, (4 * WIDTH) / 3, HEIGHT, WIDTH, HEIGHT); - - show_image (dimg); - - return 0; -} diff --git a/pixman/test/trap-test.c b/pixman/test/trap-test.c deleted file mode 100644 index 19295e7a5..000000000 --- a/pixman/test/trap-test.c +++ /dev/null @@ -1,49 +0,0 @@ -#include -#include -#include -#include "pixman.h" -#include "gtk-utils.h" - -int -main (int argc, char **argv) -{ -#define WIDTH 200 -#define HEIGHT 200 - - pixman_image_t *src_img; - pixman_image_t *mask_img; - pixman_image_t *dest_img; - pixman_trap_t trap; - pixman_color_t white = { 0x0000, 0xffff, 0x0000, 0xffff }; - uint32_t *bits = malloc (WIDTH * HEIGHT * 4); - uint32_t *mbits = malloc (WIDTH * HEIGHT); - - memset (mbits, 0, WIDTH * HEIGHT); - memset (bits, 0xff, WIDTH * HEIGHT * 4); - - trap.top.l = pixman_int_to_fixed (50) + 0x8000; - trap.top.r = pixman_int_to_fixed (150) + 0x8000; - trap.top.y = pixman_int_to_fixed (30); - - trap.bot.l = pixman_int_to_fixed (50) + 0x8000; - trap.bot.r = pixman_int_to_fixed (150) + 0x8000; - trap.bot.y = pixman_int_to_fixed (150); - - mask_img = pixman_image_create_bits (PIXMAN_a8, WIDTH, HEIGHT, mbits, WIDTH); - src_img = pixman_image_create_solid_fill (&white); - dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, bits, WIDTH * 4); - - pixman_add_traps (mask_img, 0, 0, 1, &trap); - - pixman_image_composite (PIXMAN_OP_OVER, - src_img, mask_img, dest_img, - 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT); - - show_image (dest_img); - - pixman_image_unref (src_img); - pixman_image_unref (dest_img); - free (bits); - - return 0; -} -- cgit v1.2.3
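
The pixbuf_from_argb32() helper in the deleted gtk-utils.c bridges two alpha conventions: pixman stores premultiplied ARGB, while GdkPixbuf expects straight (non-premultiplied) RGBA, so each color channel is rescaled by 255/alpha and clamped. A minimal standalone sketch of that per-pixel conversion (illustrative only, not part of the patch; the no-alpha case where the helper substitutes 0xff is omitted here):

#include <stdint.h>

/* Convert one premultiplied ARGB pixel (pixman convention) to straight
 * RGBA bytes (GdkPixbuf convention).  Mirrors the per-pixel logic of
 * pixbuf_from_argb32(); 'out' receives R, G, B, A in that order. */
static void
unpremultiply_argb (uint32_t argb, uint8_t out[4])
{
    uint32_t a = (argb >> 24) & 0xff;
    uint32_t r = (argb >> 16) & 0xff;
    uint32_t g = (argb >>  8) & 0xff;
    uint32_t b = (argb >>  0) & 0xff;

    if (a)
    {
        /* undo the premultiplication; clamp in case of invalid input
         * where a channel exceeds alpha */
        r = r * 255 / a;  if (r > 255) r = 255;
        g = g * 255 / a;  if (g > 255) g = 255;
        b = b * 255 / a;  if (b > 255) b = 255;
    }

    out[0] = r; out[1] = g; out[2] = b; out[3] = a;
}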
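
The header comment of radial-test.c (deleted from test/, recreated under demos/) classifies each gradient by the sign of dr and of a = dx^2 + dy^2 - dr^2, with the circle centers fixed at (0, 0) and (1, 0). That classification can be checked mechanically; the following standalone sketch (illustrative, not part of the patch) evaluates both quantities for every radius pair in the test's radiuses[] table:

#include <stdio.h>

#define NUM_GRADIENTS 7

/* Same radius table as radial-test.c; gradient i runs from radius
 * radiuses[i] to radiuses[NUM_GRADIENTS - i - 1]. */
static const double radiuses[NUM_GRADIENTS] = {
    0.25, 0.50, 0.50, 1.00, 1.00, 1.50, 1.75
};

int
main (void)
{
    int i;

    for (i = 0; i < NUM_GRADIENTS; i++)
    {
        double r0 = radiuses[i];
        double r1 = radiuses[NUM_GRADIENTS - i - 1];
        double dx = 1.0, dy = 0.0;             /* (1,0) - (0,0) */
        double dr = r1 - r0;
        double a  = dx * dx + dy * dy - dr * dr;

        printf ("%.2f -> %.2f: dr = %+.2f, a = %+.2f -> %s\n",
                r0, r1, dr, a,
                a < 0.0  ? "smaller circle strictly inside the bigger one" :
                a == 0.0 ? "circles internally tangent" :
                           "smaller circle (at least partly) outside");
    }

    return 0;
}

Running it reproduces the seven cases listed in the comment, from "small start circle completely inside" (dr = 1.5, a < 0) through the equal-size case (dr = 0, a > 0) to "small end circle completely inside" (dr = -1.5, a < 0).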
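
The expected checksum passed to fuzzer_test_main() in scaling-test.c changes from 0x7F1AB59F to 0x80DF1CB2 because the rewritten test drives new behavior: an a8 mask image with its own transform, repeat, filter, and clip, a third operator (PIXMAN_OP_ADD), and different size limits (48x8 instead of 16x16). Since the reference value aggregates the CRC32 of every randomized iteration, any change to what the iterations do necessarily moves it. A toy sketch of that harness pattern follows; the PRNG constants and the folding step are illustrative stand-ins, and pixman's actual utils.c may fold per-test results differently:

#include <stdint.h>
#include <stdio.h>

/* Toy LCG standing in for pixman's lcg_srand()/lcg_rand_n(). */
static uint32_t lcg_state;

static void
lcg_srand (uint32_t seed)
{
    lcg_state = seed;
}

static uint32_t
lcg_rand (void)
{
    lcg_state = lcg_state * 1103515245u + 12345u;
    return (lcg_state >> 16) & 0x7fff;
}

/* Stand-in for test_composite(): everything is derived from the test
 * number, so any single failing test can be replayed in isolation
 * (which is what fuzzer-find-diff.pl relies on). */
static uint32_t
run_one_test (int testnum)
{
    lcg_srand (testnum);
    return lcg_rand () * 31u + lcg_rand ();  /* pretend this is a CRC32 */
}

int
main (void)
{
    uint32_t aggregate = 0;
    int i;

    /* Fold every per-test result into one aggregate value; if any single
     * test's output changes, the aggregate changes too. */
    for (i = 1; i <= 100000; ++i)
        aggregate ^= run_one_test (i);

    printf ("aggregate: 0x%08X\n", aggregate);
    return 0;
}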