diff options
Diffstat (limited to 'nx-X11/extras/Mesa/src/mesa/x86/read_rgba_span_x86.S')
-rw-r--r-- | nx-X11/extras/Mesa/src/mesa/x86/read_rgba_span_x86.S | 689 |
1 files changed, 0 insertions, 689 deletions
/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * Syntax: GNU as, AT&T operand order, run through the C preprocessor.
 * Target: 32-bit x86.  Every routine reads its three arguments from the
 * stack: source pointer, destination pointer, pixel count.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

	.file	"read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) /* this one cries for assyntax.h */
	.section	.rodata
	.align 16
	.type	mask, @object
	.size	mask, 32

/* Two 16-byte masks used by the BGRA8888_REV routines:
 *   mask    - selects the two bytes of each pixel that stay in place.
 *   mask+16 - selects the single byte lane that the two swapped bytes are
 *             masked into after being shifted by 16 bits.
 */
mask:
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0x00ff0000
	.long	0x00ff0000
	.long	0x00ff0000
	.long	0x00ff0000


/* I implemented these as macros because they appear in quite a few places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 *
 * Both macros convert the single pixel at (%ebx) and store it at (%ecx),
 * clobbering %eax.  DO_ONE_PIXEL additionally advances both pointers by one
 * pixel; DO_ONE_LAST_PIXEL leaves them untouched.
 */

#define DO_ONE_PIXEL() \
	movl	(%ebx), %eax ; \
	addl	$4, %ebx ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
	addl	$4, %ecx

#define DO_ONE_LAST_PIXEL() \
	movl	(%ebx), %eax ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */


/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * Stack arguments (in order): source pointer, destination pointer, number
 * of pixels to copy.  Nothing is written when the count is <= 0.
 *
 * Register roles:
 *   %ebx  source cursor (saved and restored)
 *   %ecx  destination cursor
 *   %edx  pixels remaining after the alignment step
 *   %eax  scratch / pixel-pair loop counter
 *   %mm1  mask, %mm2  mask+16
 * Clobbers %eax, %ecx, %edx, %mm0-%mm4 and flags.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

	/* The routines must be assembled into an executable section.  Without
	 * this directive they would be emitted into .rodata, which is still
	 * the current section at this point.
	 */
	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
	.type	_generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	pushl	%ebx

#ifdef USE_INNER_EMMS
	emms
#endif
	movq	mask, %mm1
	movq	mask+16, %mm2

	movl	8(%esp), %ebx	/* source pointer */
	movl	16(%esp), %edx	/* number of pixels to copy */
	movl	12(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L20		/* Bail if there's nothing to do. */

	/* %eax = 1 when bit 2 of the source address is set, i.e. when one
	 * pixel has to be converted on its own before the 8-byte reads start
	 * on an 8-byte boundary (assuming the source is 4-byte aligned).
	 */
	movl	%ebx, %eax

	negl	%eax
	sarl	$2, %eax
	andl	$1, %eax
	je	.L17

	subl	%eax, %edx
	DO_ONE_PIXEL()
.L17:

	/* Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */

	movl	%edx, %eax
	shrl	%eax		/* %eax = number of pixel pairs; sets ZF for .L18 */
	jmp	.L18
.L19:
	movq	(%ebx), %mm0
	addl	$8, %ebx

	/* These 9 instructions do what PSHUFB (if there were such an
	 * instruction) could do in 1. :(
	 */

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
	subl	$1, %eax
.L18:
	/* ZF comes from the shrl above or from the subl at the loop bottom;
	 * none of the MMX instructions in between modify the flags.
	 */
	jne	.L19

#ifdef USE_INNER_EMMS
	emms
#endif

	/* At this point there are either 1 or 0 pixels remaining to be
	 * converted.  Convert the last pixel, if needed.
	 */

	testl	$1, %edx
	je	.L20

	DO_ONE_LAST_PIXEL()

.L20:
	popl	%ebx
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * Stack arguments (in order): source pointer, destination pointer, number
 * of pixels to copy.  Nothing is written when the count is <= 0.
 *
 * Register roles:
 *   %ebx  source cursor, %ecx  destination cursor
 *   %esi  pixels converted before the 16-byte reads start
 *   %edx  pixels remaining after that
 *   %ebp  copy of %esp taken before the aligned scratch area is carved out
 *   %eax  scratch / loop counter
 * %esi, %ebx and %ebp are saved and restored.  Uses 16 bytes of 16-byte
 * aligned stack as a bounce buffer between %xmm0 and the MMX registers.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
	pushl	%esi
	pushl	%ebx
	pushl	%ebp

#ifdef USE_INNER_EMMS
	emms
#endif
	movq	mask, %mm1
	movq	mask+16, %mm2

	movl	16(%esp), %ebx	/* source pointer */
	movl	24(%esp), %edx	/* number of pixels to copy */
	movl	20(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L35		/* Bail if there's nothing to do. */

	/* Reserve a 16-byte aligned scratch slot; %ebp remembers the real
	 * stack pointer so it can be restored after the main loop.
	 */
	movl	%esp, %ebp
	subl	$16, %esp
	andl	$0xfffffff0, %esp

	/* %esi = min(pixels up to the next 16-byte source boundary, count) */
	movl	%ebx, %eax
	movl	%edx, %esi

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax
	cmpl	%edx, %eax
	cmovle	%eax, %esi

	subl	%esi, %edx

	testl	$1, %esi
	je	.L32

	DO_ONE_PIXEL()
.L32:

	testl	$2, %esi
	je	.L31

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L31:

	movl	%edx, %eax
	shrl	$2, %eax	/* %eax = groups of 4 pixels; sets ZF for .L33 */
	jmp	.L33
.L34:
	movaps	(%ebx), %xmm0
	addl	$16, %ebx

	/* This would be so much better if we could just move directly from
	 * an SSE register to an MMX register.  Unfortunately, that
	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
	 * instruction.
	 */

	movaps	%xmm0, (%esp)
	movq	(%esp), %mm0
	movq	8(%esp), %mm5

	movq	%mm0, %mm3
	movq	%mm0, %mm4
	movq	%mm5, %mm6
	movq	%mm5, %mm7

	pand	%mm2, %mm3
	pand	%mm2, %mm6

	psllq	$16, %mm4
	psllq	$16, %mm7

	psrlq	$16, %mm3
	psrlq	$16, %mm6

	pand	%mm2, %mm4
	pand	%mm2, %mm7

	pand	%mm1, %mm0
	pand	%mm1, %mm5

	por	%mm4, %mm3
	por	%mm7, %mm6

	por	%mm3, %mm0
	por	%mm6, %mm5

	movq	%mm0, (%ecx)
	movq	%mm5, 8(%ecx)
	addl	$16, %ecx

	subl	$1, %eax
.L33:
	jne	.L34

#ifdef USE_INNER_EMMS
	emms
#endif
	movl	%ebp, %esp	/* release the aligned scratch slot */

	/* At this point there are between 0 and 3 pixels remaining to be
	 * converted.
	 */

	testl	$2, %edx
	je	.L36

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L36:

	testl	$1, %edx
	je	.L35

	DO_ONE_LAST_PIXEL()
.L35:
	popl	%ebp
	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * Stack arguments (in order): source pointer, destination pointer, number
 * of pixels to copy.  Nothing is written when the count is <= 0.
 *
 * Register roles:
 *   %ebx  source cursor, %ecx  destination cursor
 *   %esi  pixels converted before the 16-byte reads start
 *   %edx  pixels remaining after that
 *   %eax  scratch / loop counter
 *   %xmm1 mask, %xmm2 mask+16
 * %esi and %ebx are saved and restored.
 */

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
	pushl	%esi
	pushl	%ebx

	movdqa	mask, %xmm1
	movdqa	mask+16, %xmm2

	movl	12(%esp), %ebx	/* source pointer */
	movl	20(%esp), %edx	/* number of pixels to copy */
	movl	16(%esp), %ecx	/* destination pointer */

	movl	%ebx, %eax
	movl	%edx, %esi

	testl	%edx, %edx
	jle	.L46		/* Bail if there's nothing to do. */

	/* If the source pointer isn't a multiple of 16 we have to process
	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch instructions.
	 */

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax

	/* %esi = min(pixels up to the next 16-byte boundary, count) */
	cmpl	%edx, %eax
	cmovbe	%eax, %esi
	subl	%esi, %edx

	testl	$1, %esi
	je	.L41

	DO_ONE_PIXEL()
.L41:
	testl	$2, %esi
	je	.L40

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L40:

	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movdqa instead of
	 * movdqu.
	 */

	movl	%edx, %eax
	shrl	$2, %eax	/* %eax = groups of 4 pixels; sets ZF for .L42 */
	jmp	.L42
.L43:
	movdqa	(%ebx), %xmm0
	addl	$16, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movdqu	%xmm0, (%ecx)
	addl	$16, %ecx
	subl	$1, %eax
.L42:
	jne	.L43


	/* There may be up to 3 pixels remaining to be copied.  Take care
	 * of them now.  We do the 2 pixel case first because the data
	 * will be aligned.
	 */

	testl	$2, %edx
	je	.L47

	movq	(%ebx), %xmm0
	/* Step past the two pixels just read so that the single-pixel tail
	 * below converts the third pixel rather than re-reading the first.
	 */
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx	/* likewise advance the destination */
.L47:

	testl	$1, %edx
	je	.L46

	DO_ONE_LAST_PIXEL()
.L46:

	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



	.section	.rodata

	.align	16
/* One RGB565 component mask per 16-bit lane: lane 0 = 0xf800,
 * lane 1 = 0x07e0, lane 2 = 0x001f, lane 3 unused.
 */
mask_565:
	.word	0xf800
	.word	0x07e0
	.word	0x001f
	.word	0x0000

/* Setting SCALE_ADJUST to 5 gives a perfect match with the classic C
 * implementation in Mesa.  Setting SCALE_ADJUST to 0 is slightly faster but
 * at a small cost to accuracy.
 */

#define SCALE_ADJUST	5
#if SCALE_ADJUST == 5
prescale:
	.word	0x0001
	.word	0x0010
	.word	0x0200
	.word	0x0000

scale:
	.word	0x20e8 /* (0x00ff0000 / 0x000007c0) + 1 */
	.word	0x40c5 /* (0x00ff0000 / 0x000003f0) + 1 */
	.word	0x839d /* (0x00ff0000 / 0x000001f0) + 1 */
	.word	0x0000
#elif SCALE_ADJUST == 0
prescale:
	.word	0x0001
	.word	0x0020
	.word	0x0800
	.word	0x0000

scale:
	.word	0x0108 /* (0x00ff0000 / 0x0000f800) + 1 */
	.word	0x0104 /* (0x00ff0000 / 0x0000fc00) + 1 */
	.word	0x0108 /* (0x00ff0000 / 0x0000f800) + 1 */
	.word	0x0000
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif


/* OR-ed into the four 16-bit lanes of a converted pixel: only the fourth
 * lane (alpha) is non-zero, and it is 0x00ff.
 */
alpha:	.long	0x00000000
	.long	0x00ff0000

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 *
 * Stack arguments (in order): source pointer, destination pointer, number
 * of pixels to copy.  Nothing is written when the count is <= 0.
 *
 * Register roles:
 *   %eax  source cursor
 *   %edx  destination cursor
 *   %ecx  count of 4-pixel groups, later the reloaded pixel count
 *   %mm5  mask_565, %mm6 prescale, %mm7 scale
 * Clobbers %eax, %ecx, %edx, %mm0, %mm2, %mm4-%mm7 and flags.
 */

	.text
	.globl  _generic_read_RGBA_span_RGB565_MMX
        .hidden _generic_read_RGBA_span_RGB565_MMX
	.type	_generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
	emms
#endif

	movl	4(%esp), %eax	/* source pointer */
	movl	8(%esp), %edx	/* destination pointer */
	movl	12(%esp), %ecx	/* number of pixels to copy */

	movq	mask_565, %mm5
	movq	prescale, %mm6
	movq	scale, %mm7

	sarl	$2, %ecx
	/* Bail early only if the count is negative.  A zero quotient (1 to 3
	 * pixels) must still fall through to the tail code below.  SF is
	 * tested directly because OF is undefined after a multi-bit shift.
	 */
	js	.L01
	jmp	.L02		/* ZF from the sarl decides whether to loop */

.L03:
	/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
	 * second pixels into the four words of %mm0 and %mm2.
	 */

	movq	(%eax), %mm4
	addl	$8, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2


	/* Mask the pixels so that each word of each register contains only
	 * one color component.
	 */

	pand	%mm5, %mm0
	pand	%mm5, %mm2


	/* Adjust the component values so that they are as small as possible,
	 * but large enough so that we can multiply them by an unsigned 16-bit
	 * number and get a value as large as 0x00ff0000.
	 */

	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif

	/* Scale the input component values to be on the range
	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
	 */

	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2


	/* Always set the alpha value to 0xff.
	 */

	por	alpha, %mm0
	por	alpha, %mm2


	/* Pack the 16-bit values to 8-bit values and store the converted
	 * pixel data.
	 */

	packuswb	%mm2, %mm0
	movq	%mm0, (%edx)
	addl	$8, %edx



	/* Same sequence for the third and fourth pixels of the group. */
	pshufw	$0xaa, %mm4, %mm0
	pshufw	$0xff, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	alpha, %mm0
	por	alpha, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

	subl	$1, %ecx
.L02:
	jne	.L03


	/* At this point there can be at most 3 pixels left to process.  If
	 * there is either 2 or 3 left, process 2.
	 */

	movl	12(%esp), %ecx	/* reload the original pixel count */
	testl	$0x02, %ecx
	je	.L04

	movd	(%eax), %mm4
	addl	$4, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	alpha, %mm0
	por	alpha, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

.L04:
	/* At this point there can be at most 1 pixel left to process.
	 * Process it if needed.
	 */

	testl	$0x01, %ecx
	je	.L01

	movzxw	(%eax), %ecx
	movd	%ecx, %mm4

	pshufw	$0x00, %mm4, %mm0

	pand	%mm5, %mm0
	pmullw	%mm6, %mm0
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
#endif
	pmulhuw	%mm7, %mm0

	por	alpha, %mm0

	packuswb	%mm0, %mm0

	movd	%mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
	emms
#endif
	ret
#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) */