Imported nx-X11-3.1.0-1.tar.gznx-X11/3.1.0-1

Summary: Imported nx-X11-3.1.0-1.tar.gz Keywords: Imported nx-X11-3.1.0-1.tar.gz into Git repository
author: Reinhard Tartler <siretart@tauware.de> 2011-10-10 17:43:39 +0200
committer: Reinhard Tartler <siretart@tauware.de> 2011-10-10 17:43:39 +0200
commit: f4092abdf94af6a99aff944d6264bc1284e8bdd4 (patch)
tree: 2ac1c9cc16ceb93edb2c4382c088dac5aeafdf0f /nx-X11/extras/Mesa/src/mesa/x86/read_rgba_span_x86.S
parent: a840692edc9c6d19cd7c057f68e39c7d95eb767d (diff)
download: nx-libs-f4092abdf94af6a99aff944d6264bc1284e8bdd4.tar.gz
nx-libs-f4092abdf94af6a99aff944d6264bc1284e8bdd4.tar.bz2
nx-libs-f4092abdf94af6a99aff944d6264bc1284e8bdd4.zip
1 files changed, 689 insertions, 0 deletions
diff --git a/nx-X11/extras/Mesa/src/mesa/x86/read_rgba_span_x86.S b/nx-X11/extras/Mesa/src/mesa/x86/read_rgba_span_x86.S
new file mode 100644
index 000000000..6b8036e5b
--- /dev/null
+++ b/nx-X11/extras/Mesa/src/mesa/x86/read_rgba_span_x86.S
@@ -0,0 +1,689 @@
+/*
+ * (C) Copyright IBM Corporation 2004
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+ 
+/**
+ * \file read_rgba_span_x86.S
+ * Optimized routines to transfer pixel data from the framebuffer to a
+ * buffer in main memory.
+ *
+ * \author Ian Romanick <idr@us.ibm.com>
+ */
+
+	.file	"read_rgba_span_x86.S"
+#if !defined(__DJGPP__) && !defined(__MINGW32__) /* this one cries for assyntax.h */
+	.section	.rodata
+	.align 16		/* the SSE2 routine loads this table with movdqa */
+	.type	mask, @object
+	.size	mask, 32	/* two rows of four 32-bit lanes */
+mask:
+	.long	0xff00ff00	/* row at mask+0: bytes 1 and 3 of each pixel, */
+	.long	0xff00ff00	/* which the BGRA8888_REV routines keep in place */
+	.long	0xff00ff00
+	.long	0xff00ff00
+	.long	0x00ff0000	/* row at mask+16: byte 2 of each pixel, used */
+	.long	0x00ff0000	/* around the 16-bit shifts that swap bytes 0, 2 */
+	.long	0x00ff0000
+	.long	0x00ff0000
+
+
+/* I implemented these as macros because they appear in quite a few places,
+ * and I've tweaked them a number of times.  I got tired of changing every
+ * place they appear. :)
+ */
+
+#define DO_ONE_PIXEL() /* convert one pixel; %eax is scratch, flags change */ \
+	movl	(%ebx), %eax  /* load one 32-bit pixel from source %ebx */ ; \
+	addl	$4, %ebx      /* advance the source pointer by one pixel */ ; \
+	bswap	%eax          /* ARGB -> BGRA */ ; \
+	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
+	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
+	addl	$4, %ecx      /* advance the destination pointer by one pixel */
+
+#define DO_ONE_LAST_PIXEL() /* like DO_ONE_PIXEL, but %ebx/%ecx stay put */ \
+	movl	(%ebx), %eax  /* load the final pixel from source %ebx */ ; \
+	bswap	%eax          /* ARGB -> BGRA */ ; \
+	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
+	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
+
+
+/**
+ * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
+ * Stack arguments, in order: source pointer, destination pointer, pixel
+ * count; a count <= 0 copies nothing.
+ * \warning The caller must issue EMMS unless USE_INNER_EMMS is defined.
+ */
+
+	.text			/* executable code belongs in .text, not .rodata */
+.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
+.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
+	.type	_generic_read_RGBA_span_BGRA8888_REV_MMX, @function
+_generic_read_RGBA_span_BGRA8888_REV_MMX:
+	pushl	%ebx		/* preserved for the caller; holds the source */
+
+#ifdef USE_INNER_EMMS
+	emms
+#endif
+	movq	mask, %mm1	/* bytes that stay in place */
+	movq	mask+16, %mm2	/* byte lane used for the byte 0 <-> 2 swap */
+
+	movl	8(%esp), %ebx	/* source pointer */
+	movl	16(%esp), %edx	/* number of pixels to copy */
+	movl	12(%esp), %ecx	/* destination pointer */
+
+	testl	%edx, %edx
+	jle	.L20		/* Bail if there's nothing to do. */
+
+	movl	%ebx, %eax
+
+	negl	%eax
+	sarl	$2, %eax
+	andl	$1, %eax	/* 1 if a 4-byte aligned source is not 8-byte aligned */
+	je	.L17
+
+	subl	%eax, %edx	/* that leading pixel is converted on its own */
+	DO_ONE_PIXEL()
+.L17:
+
+	/* Would it be faster to unroll this loop once and process 4 pixels
+	 * per pass, instead of just two?
+	 */
+
+	movl	%edx, %eax
+	shrl	%eax		/* %eax = number of pixel pairs */
+	jmp	.L18		/* enter at the loop test; ZF is from shrl */
+.L19:
+	movq	(%ebx), %mm0
+	addl	$8, %ebx
+
+	/* These 9 instructions do what PSHUFB (if there were such an
+	 * instruction) could do in 1. :(
+	 */
+
+	movq	%mm0, %mm3
+	movq	%mm0, %mm4
+
+	pand	%mm2, %mm3
+	psllq	$16, %mm4
+	psrlq	$16, %mm3
+	pand	%mm2, %mm4
+
+	pand	%mm1, %mm0
+	por	%mm4, %mm3
+	por	%mm3, %mm0
+
+	movq	%mm0, (%ecx)
+	addl	$8, %ecx
+	subl	$1, %eax
+.L18:
+	jne	.L19
+
+	/* At this point there are either 1 or 0 pixels remaining to be
+	 * converted.  Convert the last pixel, if needed.
+	 */
+
+	testl	$1, %edx
+	je	.L20
+
+	DO_ONE_LAST_PIXEL()
+
+.L20:
+	/* Common exit, so the early bail above also resets the MMX state. */
+#ifdef USE_INNER_EMMS
+	emms
+#endif
+	popl	%ebx
+	ret
+	.size	_generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX
+
+
+/**
+ * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
+ * instructions are only actually used to read data from the framebuffer.
+ * In practice, the speed-up is pretty small.  Stack arguments, in order:
+ * source pointer, destination pointer, pixel count (<= 0 copies nothing).
+ *
+ * \todo
+ * Do some more testing and determine if there's any reason to have this
+ * function in addition to the MMX version.
+ *
+ * \warning The caller must issue EMMS unless USE_INNER_EMMS is defined.
+ */
+
+	.text			/* executable code belongs in .text, not .rodata */
+.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
+.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
+	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE, @function
+_generic_read_RGBA_span_BGRA8888_REV_SSE:
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+
+#ifdef USE_INNER_EMMS
+	emms
+#endif
+	movq	mask, %mm1	/* bytes that stay in place */
+	movq	mask+16, %mm2	/* byte lane used for the byte 0 <-> 2 swap */
+
+	movl	16(%esp), %ebx	/* source pointer */
+	movl	24(%esp), %edx	/* number of pixels to copy */
+	movl	20(%esp), %ecx	/* destination pointer */
+
+	testl	%edx, %edx
+	jle	.L35		/* Bail if there's nothing to do. */
+
+	movl	%esp, %ebp	/* remember %esp; restored after the main loop */
+	subl	$16, %esp
+	andl	$0xfffffff0, %esp	/* 16-byte aligned scratch for movaps */
+
+	movl	%ebx, %eax
+	movl	%edx, %esi
+
+	negl	%eax
+	andl	$15, %eax
+	sarl	$2, %eax	/* pixels before the next 16-byte boundary */
+	cmpl	%edx, %eax
+	cmovle	%eax, %esi	/* %esi = min(that, pixel count) */
+
+	subl	%esi, %edx	/* %edx = pixels left after the lead-in */
+
+	testl	$1, %esi
+	je	.L32
+
+	DO_ONE_PIXEL()
+.L32:
+
+	testl	$2, %esi
+	je	.L31
+
+	movq	(%ebx), %mm0
+	addl	$8, %ebx
+
+	movq	%mm0, %mm3
+	movq	%mm0, %mm4
+
+	pand	%mm2, %mm3
+	psllq	$16, %mm4
+	psrlq	$16, %mm3
+	pand	%mm2, %mm4
+
+	pand	%mm1, %mm0
+	por	%mm4, %mm3
+	por	%mm3, %mm0
+
+	movq	%mm0, (%ecx)
+	addl	$8, %ecx
+.L31:
+
+	movl	%edx, %eax
+	shrl	$2, %eax	/* %eax = number of 4-pixel groups */
+	jmp	.L33		/* enter at the loop test; ZF is from shrl */
+.L34:
+	movaps	(%ebx), %xmm0
+	addl	$16, %ebx
+
+	/* This would be so much better if we could just move directly from
+	 * an SSE register to an MMX register.  Unfortunately, that
+	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
+	 * instruction.
+	 */
+
+	movaps	%xmm0, (%esp)
+	movq	(%esp), %mm0
+	movq	8(%esp), %mm5
+
+	movq	%mm0, %mm3
+	movq	%mm0, %mm4
+	movq	%mm5, %mm6
+	movq	%mm5, %mm7
+
+	pand	%mm2, %mm3
+	pand	%mm2, %mm6
+
+	psllq	$16, %mm4
+	psllq	$16, %mm7
+
+	psrlq	$16, %mm3
+	psrlq	$16, %mm6
+
+	pand	%mm2, %mm4
+	pand	%mm2, %mm7
+
+	pand	%mm1, %mm0
+	pand	%mm1, %mm5
+
+	por	%mm4, %mm3
+	por	%mm7, %mm6
+
+	por	%mm3, %mm0
+	por	%mm6, %mm5
+
+	movq	%mm0, (%ecx)
+	movq	%mm5, 8(%ecx)
+	addl	$16, %ecx
+
+	subl	$1, %eax
+.L33:
+	jne	.L34
+
+	movl	%ebp, %esp	/* release the aligned scratch area */
+
+	/* At this point there are either [0, 3] pixels remaining to be
+	 * converted.
+	 */
+
+	testl	$2, %edx
+	je	.L36
+
+	movq	(%ebx), %mm0
+	addl	$8, %ebx
+
+	movq	%mm0, %mm3
+	movq	%mm0, %mm4
+
+	pand	%mm2, %mm3
+	psllq	$16, %mm4
+	psrlq	$16, %mm3
+	pand	%mm2, %mm4
+
+	pand	%mm1, %mm0
+	por	%mm4, %mm3
+	por	%mm3, %mm0
+
+	movq	%mm0, (%ecx)
+	addl	$8, %ecx
+.L36:
+
+	testl	$1, %edx
+	je	.L35
+
+	DO_ONE_LAST_PIXEL()
+.L35:
+#ifdef USE_INNER_EMMS
+	emms			/* after the last MMX use, on every exit path */
+#endif
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	ret
+	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE
+
+
+/**
+ * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
+ * Stack arguments: source, destination, pixel count (<= 0 copies nothing).
+ */
+
+	.text
+.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
+.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
+	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
+_generic_read_RGBA_span_BGRA8888_REV_SSE2:
+	pushl	%esi
+	pushl	%ebx
+
+	movdqa	mask, %xmm1	/* bytes that stay in place */
+	movdqa	mask+16, %xmm2	/* byte lane used for the byte 0 <-> 2 swap */
+
+	movl	12(%esp), %ebx	/* source pointer */
+	movl	20(%esp), %edx	/* number of pixels to copy */
+	movl	16(%esp), %ecx	/* destination pointer */
+
+	movl	%ebx, %eax
+	movl	%edx, %esi
+
+	testl	%edx, %edx
+	jle	.L46		/* Bail if there's nothing to do. */
+
+	/* If the source pointer isn't a multiple of 16 we have to process
+	 * a few pixels the "slow" way to get the address aligned for
+	 * the SSE fetch instructions.
+	 */
+
+	negl	%eax
+	andl	$15, %eax
+	sarl	$2, %eax	/* pixels before the next 16-byte boundary */
+
+	cmpl	%edx, %eax
+	cmovbe	%eax, %esi	/* %esi = min(that, pixel count) */
+	subl	%esi, %edx	/* %edx = pixels left after the lead-in */
+
+	testl	$1, %esi
+	je	.L41
+
+	DO_ONE_PIXEL()
+.L41:
+	testl	$2, %esi
+	je	.L40
+
+	movq	(%ebx), %xmm0
+	addl	$8, %ebx
+
+	movdqa	%xmm0, %xmm3
+	movdqa	%xmm0, %xmm4
+	andps	%xmm1, %xmm0
+
+	andps	%xmm2, %xmm3
+	pslldq	$2, %xmm4
+	psrldq	$2, %xmm3
+	andps	%xmm2, %xmm4
+
+	orps	%xmm4, %xmm3
+	orps	%xmm3, %xmm0
+
+	movq	%xmm0, (%ecx)
+	addl	$8, %ecx
+.L40:
+
+	/* Would it be worth having a specialized version of this loop for
+	 * the case where the destination is 16-byte aligned?  It would be
+	 * identical except that it could store with movdqa, not movdqu.
+	 */
+
+	movl	%edx, %eax
+	shrl	$2, %eax	/* %eax = number of 4-pixel groups */
+	jmp	.L42		/* enter at the loop test; ZF is from shrl */
+.L43:
+	movdqa	(%ebx), %xmm0
+	addl	$16, %ebx
+
+	movdqa	%xmm0, %xmm3
+	movdqa	%xmm0, %xmm4
+	andps	%xmm1, %xmm0
+
+	andps	%xmm2, %xmm3
+	pslldq	$2, %xmm4
+	psrldq	$2, %xmm3
+	andps	%xmm2, %xmm4
+
+	orps	%xmm4, %xmm3
+	orps	%xmm3, %xmm0
+
+	movdqu	%xmm0, (%ecx)
+	addl	$16, %ecx
+	subl	$1, %eax
+.L42:
+	jne	.L43
+
+	/* There may be up to 3 pixels remaining to be copied.  Take care
+	 * of them now.  We do the 2 pixel case first because the data
+	 * will be aligned.
+	 */
+
+	testl	$2, %edx
+	je	.L47
+
+	movq	(%ebx), %xmm0
+	addl	$8, %ebx	/* step past the pair so a 3rd pixel is read next */
+
+	movdqa	%xmm0, %xmm3
+	movdqa	%xmm0, %xmm4
+	andps	%xmm1, %xmm0
+
+	andps	%xmm2, %xmm3
+	pslldq	$2, %xmm4
+	psrldq	$2, %xmm3
+	andps	%xmm2, %xmm4
+
+	orps	%xmm4, %xmm3
+	orps	%xmm3, %xmm0
+
+	movq	%xmm0, (%ecx)
+	addl	$8, %ecx	/* likewise, so the last store lands after the pair */
+.L47:
+
+	testl	$1, %edx
+	je	.L46
+
+	DO_ONE_LAST_PIXEL()
+.L46:
+	popl	%ebx
+	popl	%esi
+	ret
+	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2
+
+
+
+	.section	.rodata
+
+	.align	16
+mask_565:			/* one field mask per 16-bit word; loaded into %mm5 */
+	.word	0xf800		/* red:   bits 15-11 */
+	.word	0x07e0		/* green: bits 10-5 */
+	.word	0x001f		/* blue:  bits 4-0 */
+	.word	0x0000		/* fourth word selects nothing */
+
+/* Setting SCALE_ADJUST to 5 gives a perfect match with the classic C
+ * implementation in Mesa.  Setting SCALE_ADJUST to 0 is slightly faster but
+ * at a small cost to accuracy.
+ */
+
+#define SCALE_ADJUST	5
+#if SCALE_ADJUST == 5
+prescale:			/* pmullw factors; result is then shifted right by 5 */
+	.word	0x0001		/* red:   0xf800 >> 5 = 0x07c0 at most */
+	.word	0x0010		/* green: 0x7e00 >> 5 = 0x03f0 at most */
+	.word	0x0200		/* blue:  0x3e00 >> 5 = 0x01f0 at most */
+	.word	0x0000
+
+scale:				/* pmulhuw factors; high word of product is 0..0xff */
+	.word	0x20e8		/* (0x00ff0000 / 0x000007c0) + 1 */
+	.word	0x40c5		/* (0x00ff0000 / 0x000003f0) + 1; NOTE(review): that evaluates to 0x40c4 -- confirm */
+	.word	0x839d		/* (0x00ff0000 / 0x000001f0) + 1 */
+	.word	0x0000
+#elif SCALE_ADJUST == 0
+prescale:			/* pmullw factors; no shift follows in this variant */
+	.word	0x0001		/* red:   0xf800 at most */
+	.word	0x0020		/* green: 0xfc00 at most */
+	.word	0x0800		/* blue:  0xf800 at most */
+	.word	0x0000
+
+scale:				/* pmulhuw factors; high word of product is 0..0xff */
+	.word	0x0108		/* (0x00ff0000 / 0x0000f800) + 1 */
+	.word	0x0104		/* (0x00ff0000 / 0x0000fc00) + 1 */
+	.word	0x0108		/* (0x00ff0000 / 0x0000f800) + 1 */
+	.word	0x0000
+#else
+#error SCALE_ADJUST must either be 5 or 0.
+#endif
+
+
+alpha:	.long	0x00000000	/* words 0 and 1: leave red and green untouched */
+	.long	0x00ff0000	/* word 2 = 0 (blue), word 3 = 0x00ff (alpha) */
+
+/**
+ * MMX optimized version of the RGB565 to RGBA copy routine.  Despite the
+ * name it also uses pshufw and pmulhuw, which were introduced with SSE.
+ * Stack arguments, in order: source pointer, destination pointer, pixel
+ * count.  A negative count copies nothing.
+ */
+
+	.text
+	.globl	_generic_read_RGBA_span_RGB565_MMX
+        .hidden _generic_read_RGBA_span_RGB565_MMX
+	.type	_generic_read_RGBA_span_RGB565_MMX, @function
+
+_generic_read_RGBA_span_RGB565_MMX:
+
+#ifdef USE_INNER_EMMS
+	emms
+#endif
+
+	movl	4(%esp), %eax	/* source pointer */
+	movl	8(%esp), %edx	/* destination pointer */
+	movl	12(%esp), %ecx	/* number of pixels to copy */
+
+	movq	mask_565, %mm5	/* per-component field masks */
+	movq	prescale, %mm6	/* per-component pre-multipliers */
+	movq	scale, %mm7	/* per-component scale factors */
+
+	/* Only a negative count bails out; 1-3 pixels still need the tail.
+	 */
+	sarl	$2, %ecx	/* %ecx = number of full 4-pixel passes */
+	js	.L01		/* Bail early if the count is negative. */
+	jmp	.L02		/* Enter at the loop test; ZF is from sarl. */
+
+.L03:
+	/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
+	 * second pixels into the four words of %mm0 and %mm2.
+	 */
+
+	movq	(%eax), %mm4
+	addl	$8, %eax
+
+	pshufw	$0x00, %mm4, %mm0
+	pshufw	$0x55, %mm4, %mm2
+
+	/* Mask the pixels so that each word of each register contains only
+	 * one color component.
+	 */
+
+	pand	%mm5, %mm0
+	pand	%mm5, %mm2
+
+	/* Adjust the component values so that they are as small as possible,
+	 * but large enough so that we can multiply them by an unsigned 16-bit
+	 * number and get a value as large as 0x00ff0000.
+	 */
+
+	pmullw	%mm6, %mm0
+	pmullw	%mm6, %mm2
+#if SCALE_ADJUST > 0
+	psrlw	$SCALE_ADJUST, %mm0
+	psrlw	$SCALE_ADJUST, %mm2
+#endif
+
+	/* Scale the input component values to be on the range
+	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
+	 */
+
+	pmulhuw	%mm7, %mm0
+	pmulhuw	%mm7, %mm2
+
+	/* Always set the alpha value to 0xff.
+	 */
+
+	por	alpha, %mm0
+	por	alpha, %mm2
+
+	/* Pack the 16-bit values to 8-bit values and store the converted
+	 * pixel data.
+	 */
+
+	packuswb	%mm2, %mm0
+	movq	%mm0, (%edx)
+	addl	$8, %edx
+
+	/* Same conversion for the third and fourth pixels of %mm4. */
+	pshufw	$0xaa, %mm4, %mm0
+	pshufw	$0xff, %mm4, %mm2
+
+	pand	%mm5, %mm0
+	pand	%mm5, %mm2
+	pmullw	%mm6, %mm0
+	pmullw	%mm6, %mm2
+#if SCALE_ADJUST > 0
+	psrlw	$SCALE_ADJUST, %mm0
+	psrlw	$SCALE_ADJUST, %mm2
+#endif
+	pmulhuw	%mm7, %mm0
+	pmulhuw	%mm7, %mm2
+
+	por	alpha, %mm0
+	por	alpha, %mm2
+
+	packuswb	%mm2, %mm0
+
+	movq	%mm0, (%edx)
+	addl	$8, %edx
+
+	subl	$1, %ecx
+.L02:
+	jne	.L03
+
+	/* At this point there can be at most 3 pixels left to process.  If
+	 * there are either 2 or 3 left, process 2.
+	 */
+
+	movl	12(%esp), %ecx	/* reload the original pixel count */
+	testl	$0x02, %ecx
+	je	.L04
+
+	movd	(%eax), %mm4
+	addl	$4, %eax
+
+	pshufw	$0x00, %mm4, %mm0
+	pshufw	$0x55, %mm4, %mm2
+
+	pand	%mm5, %mm0
+	pand	%mm5, %mm2
+	pmullw	%mm6, %mm0
+	pmullw	%mm6, %mm2
+#if SCALE_ADJUST > 0
+	psrlw	$SCALE_ADJUST, %mm0
+	psrlw	$SCALE_ADJUST, %mm2
+#endif
+	pmulhuw	%mm7, %mm0
+	pmulhuw	%mm7, %mm2
+
+	por	alpha, %mm0
+	por	alpha, %mm2
+
+	packuswb	%mm2, %mm0
+
+	movq	%mm0, (%edx)
+	addl	$8, %edx
+
+.L04:
+	/* At this point there can be at most 1 pixel left to process.
+	 * Process it if needed.
+	 */
+
+	testl	$0x01, %ecx
+	je	.L01
+
+	movzxw	(%eax), %ecx	/* zero-extend the single 16-bit pixel */
+	movd	%ecx, %mm4
+
+	pshufw	$0x00, %mm4, %mm0
+
+	pand	%mm5, %mm0
+	pmullw	%mm6, %mm0
+#if SCALE_ADJUST > 0
+	psrlw	$SCALE_ADJUST, %mm0
+#endif
+	pmulhuw	%mm7, %mm0
+
+	por	alpha, %mm0
+
+	packuswb	%mm0, %mm0
+
+	movd	%mm0, (%edx)
+
+.L01:
+#ifdef USE_INNER_EMMS
+	emms
+#endif
+	ret
+	.size	_generic_read_RGBA_span_RGB565_MMX, .-_generic_read_RGBA_span_RGB565_MMX
+#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) */
author	Reinhard Tartler <siretart@tauware.de>	2011-10-10 17:43:39 +0200
committer	Reinhard Tartler <siretart@tauware.de>	2011-10-10 17:43:39 +0200
commit	f4092abdf94af6a99aff944d6264bc1284e8bdd4 (patch)
tree	2ac1c9cc16ceb93edb2c4382c088dac5aeafdf0f /nx-X11/extras/Mesa/src/mesa/x86/read_rgba_span_x86.S
parent	a840692edc9c6d19cd7c057f68e39c7d95eb767d (diff)
download	nx-libs-f4092abdf94af6a99aff944d6264bc1284e8bdd4.tar.gz nx-libs-f4092abdf94af6a99aff944d6264bc1284e8bdd4.tar.bz2 nx-libs-f4092abdf94af6a99aff944d6264bc1284e8bdd4.zip