aboutsummaryrefslogtreecommitdiff
path: root/nx-X11/extras/Mesa/src/mesa/x86/read_rgba_span_x86.S
diff options
context:
space:
mode:
Diffstat (limited to 'nx-X11/extras/Mesa/src/mesa/x86/read_rgba_span_x86.S')
-rw-r--r--nx-X11/extras/Mesa/src/mesa/x86/read_rgba_span_x86.S689
1 files changed, 0 insertions, 689 deletions
diff --git a/nx-X11/extras/Mesa/src/mesa/x86/read_rgba_span_x86.S b/nx-X11/extras/Mesa/src/mesa/x86/read_rgba_span_x86.S
deleted file mode 100644
index 6b8036e5b..000000000
--- a/nx-X11/extras/Mesa/src/mesa/x86/read_rgba_span_x86.S
+++ /dev/null
@@ -1,689 +0,0 @@
-/*
- * (C) Copyright IBM Corporation 2004
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * \file read_rgba_span_x86.S
- * Optimized routines to transfer pixel data from the framebuffer to a
- * buffer in main memory.
- *
- * \author Ian Romanick <idr@us.ibm.com>
- */
-
- .file "read_rgba_span_x86.S"
-#if !defined(__DJGPP__) && !defined(__MINGW32__) /* this one cries for assyntax.h */
- .section .rodata
- .align 16
- .type mask, @object
- .size mask, 32
-mask:
- .long 0xff00ff00
- .long 0xff00ff00
- .long 0xff00ff00
- .long 0xff00ff00
- .long 0x00ff0000
- .long 0x00ff0000
- .long 0x00ff0000
- .long 0x00ff0000
-
-
-/* I implemented these as macros because the appear in quite a few places,
- * and I've tweaked them a number of times. I got tired of changing every
- * place they appear. :)
- */
-
-#define DO_ONE_PIXEL() \
- movl (%ebx), %eax ; \
- addl $4, %ebx ; \
- bswap %eax /* ARGB -> BGRA */ ; \
- rorl $8, %eax /* BGRA -> ABGR */ ; \
- movl %eax, (%ecx) /* ABGR -> R, G, B, A */ ; \
- addl $4, %ecx
-
-#define DO_ONE_LAST_PIXEL() \
- movl (%ebx), %eax ; \
- bswap %eax /* ARGB -> BGRA */ ; \
- rorl $8, %eax /* BGRA -> ABGR */ ; \
- movl %eax, (%ecx) /* ABGR -> R, G, B, A */ ; \
-
-
-/**
- * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
- *
- * \warning
- * This function assumes that the caller will issue the EMMS instruction
- * at the correct places.
- */
-
-.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
-.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
- .type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
-_generic_read_RGBA_span_BGRA8888_REV_MMX:
- pushl %ebx
-
-#ifdef USE_INNER_EMMS
- emms
-#endif
- movq mask, %mm1
- movq mask+16, %mm2
-
- movl 8(%esp), %ebx /* source pointer */
- movl 16(%esp), %edx /* number of pixels to copy */
- movl 12(%esp), %ecx /* destination pointer */
-
- testl %edx, %edx
- jle .L20 /* Bail if there's nothing to do. */
-
- movl %ebx, %eax
-
- negl %eax
- sarl $2, %eax
- andl $1, %eax
- je .L17
-
- subl %eax, %edx
- DO_ONE_PIXEL()
-.L17:
-
- /* Would it be faster to unroll this loop once and process 4 pixels
- * per pass, instead of just two?
- */
-
- movl %edx, %eax
- shrl %eax
- jmp .L18
-.L19:
- movq (%ebx), %mm0
- addl $8, %ebx
-
- /* These 9 instructions do what PSHUFB (if there were such an
- * instruction) could do in 1. :(
- */
-
- movq %mm0, %mm3
- movq %mm0, %mm4
-
- pand %mm2, %mm3
- psllq $16, %mm4
- psrlq $16, %mm3
- pand %mm2, %mm4
-
- pand %mm1, %mm0
- por %mm4, %mm3
- por %mm3, %mm0
-
- movq %mm0, (%ecx)
- addl $8, %ecx
- subl $1, %eax
-.L18:
- jne .L19
-
-#ifdef USE_INNER_EMMS
- emms
-#endif
-
- /* At this point there are either 1 or 0 pixels remaining to be
- * converted. Convert the last pixel, if needed.
- */
-
- testl $1, %edx
- je .L20
-
- DO_ONE_LAST_PIXEL()
-
-.L20:
- popl %ebx
- ret
- .size _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX
-
-
-/**
- * SSE optimized version of the BGRA8888_REV to RGBA copy routine. SSE
- * instructions are only actually used to read data from the framebuffer.
- * In practice, the speed-up is pretty small.
- *
- * \todo
- * Do some more testing and determine if there's any reason to have this
- * function in addition to the MMX version.
- *
- * \warning
- * This function assumes that the caller will issue the EMMS instruction
- * at the correct places.
- */
-
-.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
-.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
- .type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
-_generic_read_RGBA_span_BGRA8888_REV_SSE:
- pushl %esi
- pushl %ebx
- pushl %ebp
-
-#ifdef USE_INNER_EMMS
- emms
-#endif
- movq mask, %mm1
- movq mask+16, %mm2
-
- movl 16(%esp), %ebx /* source pointer */
- movl 24(%esp), %edx /* number of pixels to copy */
- movl 20(%esp), %ecx /* destination pointer */
-
- testl %edx, %edx
- jle .L35 /* Bail if there's nothing to do. */
-
- movl %esp, %ebp
- subl $16, %esp
- andl $0xfffffff0, %esp
-
- movl %ebx, %eax
- movl %edx, %esi
-
- negl %eax
- andl $15, %eax
- sarl $2, %eax
- cmpl %edx, %eax
- cmovle %eax, %esi
-
- subl %esi, %edx
-
- testl $1, %esi
- je .L32
-
- DO_ONE_PIXEL()
-.L32:
-
- testl $2, %esi
- je .L31
-
- movq (%ebx), %mm0
- addl $8, %ebx
-
- movq %mm0, %mm3
- movq %mm0, %mm4
-
- pand %mm2, %mm3
- psllq $16, %mm4
- psrlq $16, %mm3
- pand %mm2, %mm4
-
- pand %mm1, %mm0
- por %mm4, %mm3
- por %mm3, %mm0
-
- movq %mm0, (%ecx)
- addl $8, %ecx
-.L31:
-
- movl %edx, %eax
- shrl $2, %eax
- jmp .L33
-.L34:
- movaps (%ebx), %xmm0
- addl $16, %ebx
-
- /* This would be so much better if we could just move directly from
- * an SSE register to an MMX register. Unfortunately, that
- * functionality wasn't introduced until SSE2 with the MOVDQ2Q
- * instruction.
- */
-
- movaps %xmm0, (%esp)
- movq (%esp), %mm0
- movq 8(%esp), %mm5
-
- movq %mm0, %mm3
- movq %mm0, %mm4
- movq %mm5, %mm6
- movq %mm5, %mm7
-
- pand %mm2, %mm3
- pand %mm2, %mm6
-
- psllq $16, %mm4
- psllq $16, %mm7
-
- psrlq $16, %mm3
- psrlq $16, %mm6
-
- pand %mm2, %mm4
- pand %mm2, %mm7
-
- pand %mm1, %mm0
- pand %mm1, %mm5
-
- por %mm4, %mm3
- por %mm7, %mm6
-
- por %mm3, %mm0
- por %mm6, %mm5
-
- movq %mm0, (%ecx)
- movq %mm5, 8(%ecx)
- addl $16, %ecx
-
- subl $1, %eax
-.L33:
- jne .L34
-
-#ifdef USE_INNER_EMMS
- emms
-#endif
- movl %ebp, %esp
-
- /* At this point there are either [0, 3] pixels remaining to be
- * converted.
- */
-
- testl $2, %edx
- je .L36
-
- movq (%ebx), %mm0
- addl $8, %ebx
-
- movq %mm0, %mm3
- movq %mm0, %mm4
-
- pand %mm2, %mm3
- psllq $16, %mm4
- psrlq $16, %mm3
- pand %mm2, %mm4
-
- pand %mm1, %mm0
- por %mm4, %mm3
- por %mm3, %mm0
-
- movq %mm0, (%ecx)
- addl $8, %ecx
-.L36:
-
- testl $1, %edx
- je .L35
-
- DO_ONE_LAST_PIXEL()
-.L35:
- popl %ebp
- popl %ebx
- popl %esi
- ret
- .size _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE
-
-
-/**
- * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
- */
-
- .text
-.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
-.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
- .type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
-_generic_read_RGBA_span_BGRA8888_REV_SSE2:
- pushl %esi
- pushl %ebx
-
- movdqa mask, %xmm1
- movdqa mask+16, %xmm2
-
- movl 12(%esp), %ebx /* source pointer */
- movl 20(%esp), %edx /* number of pixels to copy */
- movl 16(%esp), %ecx /* destination pointer */
-
- movl %ebx, %eax
- movl %edx, %esi
-
- testl %edx, %edx
- jle .L46 /* Bail if there's nothing to do. */
-
- /* If the source pointer isn't a multiple of 16 we have to process
- * a few pixels the "slow" way to get the address aligned for
- * the SSE fetch intsructions.
- */
-
- negl %eax
- andl $15, %eax
- sarl $2, %eax
-
- cmpl %edx, %eax
- cmovbe %eax, %esi
- subl %esi, %edx
-
- testl $1, %esi
- je .L41
-
- DO_ONE_PIXEL()
-.L41:
- testl $2, %esi
- je .L40
-
- movq (%ebx), %xmm0
- addl $8, %ebx
-
- movdqa %xmm0, %xmm3
- movdqa %xmm0, %xmm4
- andps %xmm1, %xmm0
-
- andps %xmm2, %xmm3
- pslldq $2, %xmm4
- psrldq $2, %xmm3
- andps %xmm2, %xmm4
-
- orps %xmm4, %xmm3
- orps %xmm3, %xmm0
-
- movq %xmm0, (%ecx)
- addl $8, %ecx
-.L40:
-
- /* Would it be worth having a specialized version of this loop for
- * the case where the destination is 16-byte aligned? That version
- * would be identical except that it could use movedqa instead of
- * movdqu.
- */
-
- movl %edx, %eax
- shrl $2, %eax
- jmp .L42
-.L43:
- movdqa (%ebx), %xmm0
- addl $16, %ebx
-
- movdqa %xmm0, %xmm3
- movdqa %xmm0, %xmm4
- andps %xmm1, %xmm0
-
- andps %xmm2, %xmm3
- pslldq $2, %xmm4
- psrldq $2, %xmm3
- andps %xmm2, %xmm4
-
- orps %xmm4, %xmm3
- orps %xmm3, %xmm0
-
- movdqu %xmm0, (%ecx)
- addl $16, %ecx
- subl $1, %eax
-.L42:
- jne .L43
-
-
- /* There may be upto 3 pixels remaining to be copied. Take care
- * of them now. We do the 2 pixel case first because the data
- * will be aligned.
- */
-
- testl $2, %edx
- je .L47
-
- movq (%ebx), %xmm0
-
- movdqa %xmm0, %xmm3
- movdqa %xmm0, %xmm4
- andps %xmm1, %xmm0
-
- andps %xmm2, %xmm3
- pslldq $2, %xmm4
- psrldq $2, %xmm3
- andps %xmm2, %xmm4
-
- orps %xmm4, %xmm3
- orps %xmm3, %xmm0
-
- movq %xmm0, (%ecx)
-.L47:
-
- testl $1, %edx
- je .L46
-
- DO_ONE_LAST_PIXEL()
-.L46:
-
- popl %ebx
- popl %esi
- ret
- .size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2
-
-
-
- .section .rodata
-
- .align 16
-mask_565:
- .word 0xf800
- .word 0x07e0
- .word 0x001f
- .word 0x0000
-
-/* Setting SCALE_ADJUST to 5 gives a perfect match with the classic C
- * implementation in Mesa. Setting SCALE_ADJUST to 0 is slightly faster but
- * at a small cost to accuracy.
- */
-
-#define SCALE_ADJUST 5
-#if SCALE_ADJUST == 5
-prescale:
- .word 0x0001
- .word 0x0010
- .word 0x0200
- .word 0x0000
-
-scale:
- .word 0x20e8 /* (0x00ff0000 / 0x000007c0) + 1 */
- .word 0x40c5 /* (0x00ff0000 / 0x000003f0) + 1 */
- .word 0x839d /* (0x00ff0000 / 0x000001f0) + 1 */
- .word 0x0000
-#elif SCALE_ADJUST == 0
-prescale:
- .word 0x0001
- .word 0x0020
- .word 0x0800
- .word 0x0000
-
-scale:
- .word 0x0108 /* (0x00ff0000 / 0x0000f800) + 1 */
- .word 0x0104 /* (0x00ff0000 / 0x0000fc00) + 1 */
- .word 0x0108 /* (0x00ff0000 / 0x0000f800) + 1 */
- .word 0x0000
-#else
-#error SCALE_ADJUST must either be 5 or 0.
-#endif
-
-
-alpha: .long 0x00000000
- .long 0x00ff0000
-
-/**
- * MMX optimized version of the RGB565 to RGBA copy routine.
- */
-
- .text
- .globl _generic_read_RGBA_span_RGB565_MMX
- .hidden _generic_read_RGBA_span_RGB565_MMX
- .type _generic_read_RGBA_span_RGB565_MMX, @function
-
-_generic_read_RGBA_span_RGB565_MMX:
-
-#ifdef USE_INNER_EMMS
- emms
-#endif
-
- movl 4(%esp), %eax /* source pointer */
- movl 8(%esp), %edx /* destination pointer */
- movl 12(%esp), %ecx /* number of pixels to copy */
-
- movq mask_565, %mm5
- movq prescale, %mm6
- movq scale, %mm7
-
- sarl $2, %ecx
- jle .L01 /* Bail early if the count is negative. */
- jmp .L02
-
-.L03:
- /* Fetch 4 RGB565 pixels into %mm4. Distribute the first and
- * second pixels into the four words of %mm0 and %mm2.
- */
-
- movq (%eax), %mm4
- addl $8, %eax
-
- pshufw $0x00, %mm4, %mm0
- pshufw $0x55, %mm4, %mm2
-
-
- /* Mask the pixels so that each word of each register contains only
- * one color component.
- */
-
- pand %mm5, %mm0
- pand %mm5, %mm2
-
-
- /* Adjust the component values so that they are as small as possible,
- * but large enough so that we can multiply them by an unsigned 16-bit
- * number and get a value as large as 0x00ff0000.
- */
-
- pmullw %mm6, %mm0
- pmullw %mm6, %mm2
-#if SCALE_ADJUST > 0
- psrlw $SCALE_ADJUST, %mm0
- psrlw $SCALE_ADJUST, %mm2
-#endif
-
- /* Scale the input component values to be on the range
- * [0, 0x00ff0000]. This it the real magic of the whole routine.
- */
-
- pmulhuw %mm7, %mm0
- pmulhuw %mm7, %mm2
-
-
- /* Always set the alpha value to 0xff.
- */
-
- por alpha, %mm0
- por alpha, %mm2
-
-
- /* Pack the 16-bit values to 8-bit values and store the converted
- * pixel data.
- */
-
- packuswb %mm2, %mm0
- movq %mm0, (%edx)
- addl $8, %edx
-
-
-
- pshufw $0xaa, %mm4, %mm0
- pshufw $0xff, %mm4, %mm2
-
- pand %mm5, %mm0
- pand %mm5, %mm2
- pmullw %mm6, %mm0
- pmullw %mm6, %mm2
-#if SCALE_ADJUST > 0
- psrlw $SCALE_ADJUST, %mm0
- psrlw $SCALE_ADJUST, %mm2
-#endif
- pmulhuw %mm7, %mm0
- pmulhuw %mm7, %mm2
-
- por alpha, %mm0
- por alpha, %mm2
-
- packuswb %mm2, %mm0
-
- movq %mm0, (%edx)
- addl $8, %edx
-
- subl $1, %ecx
-.L02:
- jne .L03
-
-
- /* At this point there can be at most 3 pixels left to process. If
- * there is either 2 or 3 left, process 2.
- */
-
- movl 12(%esp), %ecx
- testl $0x02, %ecx
- je .L04
-
- movd (%eax), %mm4
- addl $4, %eax
-
- pshufw $0x00, %mm4, %mm0
- pshufw $0x55, %mm4, %mm2
-
- pand %mm5, %mm0
- pand %mm5, %mm2
- pmullw %mm6, %mm0
- pmullw %mm6, %mm2
-#if SCALE_ADJUST > 0
- psrlw $SCALE_ADJUST, %mm0
- psrlw $SCALE_ADJUST, %mm2
-#endif
- pmulhuw %mm7, %mm0
- pmulhuw %mm7, %mm2
-
- por alpha, %mm0
- por alpha, %mm2
-
- packuswb %mm2, %mm0
-
- movq %mm0, (%edx)
- addl $8, %edx
-
-.L04:
- /* At this point there can be at most 1 pixel left to process.
- * Process it if needed.
- */
-
- testl $0x01, %ecx
- je .L01
-
- movzxw (%eax), %ecx
- movd %ecx, %mm4
-
- pshufw $0x00, %mm4, %mm0
-
- pand %mm5, %mm0
- pmullw %mm6, %mm0
-#if SCALE_ADJUST > 0
- psrlw $SCALE_ADJUST, %mm0
-#endif
- pmulhuw %mm7, %mm0
-
- por alpha, %mm0
-
- packuswb %mm0, %mm0
-
- movd %mm0, (%edx)
-
-.L01:
-#ifdef USE_INNER_EMMS
- emms
-#endif
- ret
-#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) */