diff options
Diffstat (limited to 'mesalib/src/mesa/x86')
36 files changed, 13049 insertions, 13049 deletions
diff --git a/mesalib/src/mesa/x86/3dnow.c b/mesalib/src/mesa/x86/3dnow.c index 549157c50..de2fb1e2a 100644 --- a/mesalib/src/mesa/x86/3dnow.c +++ b/mesalib/src/mesa/x86/3dnow.c @@ -1,91 +1,91 @@ -
-/*
- * Mesa 3-D graphics library
- * Version: 5.0.1
- *
- * Copyright (C) 1999-2003 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * 3DNow! optimizations contributed by
- * Holger Waechtler <holger@akaflieg.extern.tu-berlin.de>
- */
-
-#include "main/glheader.h"
-#include "main/context.h"
-#include "math/m_xform.h"
-#include "tnl/t_context.h"
-
-#include "3dnow.h"
-#include "x86_xform.h"
-
-#ifdef DEBUG_MATH
-#include "math/m_debug.h"
-#endif
-
-
-#ifdef USE_3DNOW_ASM
-DECLARE_XFORM_GROUP( 3dnow, 2 )
-DECLARE_XFORM_GROUP( 3dnow, 3 )
-DECLARE_XFORM_GROUP( 3dnow, 4 )
-
-DECLARE_NORM_GROUP( 3dnow )
-
-
-extern void _ASMAPI
-_mesa_v16_3dnow_general_xform( GLfloat *first_vert,
- const GLfloat *m,
- const GLfloat *src,
- GLuint src_stride,
- GLuint count );
-
-extern void _ASMAPI
-_mesa_3dnow_project_vertices( GLfloat *first,
- GLfloat *last,
- const GLfloat *m,
- GLuint stride );
-
-extern void _ASMAPI
-_mesa_3dnow_project_clipped_vertices( GLfloat *first,
- GLfloat *last,
- const GLfloat *m,
- GLuint stride,
- const GLubyte *clipmask );
-#endif
-
-
-void _mesa_init_3dnow_transform_asm( void )
-{
-#ifdef USE_3DNOW_ASM
- ASSIGN_XFORM_GROUP( 3dnow, 2 );
- ASSIGN_XFORM_GROUP( 3dnow, 3 );
- ASSIGN_XFORM_GROUP( 3dnow, 4 );
-
- /* There's a bug somewhere in the 3dnow_normal.S file that causes
- * bad shading. Disable for now.
- ASSIGN_NORM_GROUP( 3dnow );
- */
-
-#ifdef DEBUG_MATH
- _math_test_all_transform_functions( "3DNow!" );
- _math_test_all_normal_transform_functions( "3DNow!" );
-#endif
-#endif
-}
+ +/* + * Mesa 3-D graphics library + * Version: 5.0.1 + * + * Copyright (C) 1999-2003 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * 3DNow! optimizations contributed by + * Holger Waechtler <holger@akaflieg.extern.tu-berlin.de> + */ + +#include "main/glheader.h" +#include "main/context.h" +#include "math/m_xform.h" +#include "tnl/t_context.h" + +#include "3dnow.h" +#include "x86_xform.h" + +#ifdef DEBUG_MATH +#include "math/m_debug.h" +#endif + + +#ifdef USE_3DNOW_ASM +DECLARE_XFORM_GROUP( 3dnow, 2 ) +DECLARE_XFORM_GROUP( 3dnow, 3 ) +DECLARE_XFORM_GROUP( 3dnow, 4 ) + +DECLARE_NORM_GROUP( 3dnow ) + + +extern void _ASMAPI +_mesa_v16_3dnow_general_xform( GLfloat *first_vert, + const GLfloat *m, + const GLfloat *src, + GLuint src_stride, + GLuint count ); + +extern void _ASMAPI +_mesa_3dnow_project_vertices( GLfloat *first, + GLfloat *last, + const GLfloat *m, + GLuint stride ); + +extern void _ASMAPI +_mesa_3dnow_project_clipped_vertices( GLfloat *first, + GLfloat *last, + const GLfloat *m, + GLuint stride, + const GLubyte *clipmask ); +#endif + + +void _mesa_init_3dnow_transform_asm( void ) +{ +#ifdef USE_3DNOW_ASM + ASSIGN_XFORM_GROUP( 3dnow, 2 ); + ASSIGN_XFORM_GROUP( 3dnow, 3 ); + ASSIGN_XFORM_GROUP( 3dnow, 4 ); + + /* There's a bug somewhere in the 3dnow_normal.S file that causes + * bad shading. Disable for now. + ASSIGN_NORM_GROUP( 3dnow ); + */ + +#ifdef DEBUG_MATH + _math_test_all_transform_functions( "3DNow!" ); + _math_test_all_normal_transform_functions( "3DNow!" ); +#endif +#endif +} diff --git a/mesalib/src/mesa/x86/3dnow.h b/mesalib/src/mesa/x86/3dnow.h index 067f8f721..1c1fedcd4 100644 --- a/mesalib/src/mesa/x86/3dnow.h +++ b/mesalib/src/mesa/x86/3dnow.h @@ -1,36 +1,36 @@ -
-/*
- * Mesa 3-D graphics library
- * Version: 3.5
- *
- * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * 3DNow! optimizations contributed by
- * Holger Waechtler <holger@akaflieg.extern.tu-berlin.de>
- */
-
-#ifndef __3DNOW_H__
-#define __3DNOW_H__
-
-void _mesa_init_3dnow_transform_asm( void );
-
-#endif
+ +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * 3DNow! optimizations contributed by + * Holger Waechtler <holger@akaflieg.extern.tu-berlin.de> + */ + +#ifndef __3DNOW_H__ +#define __3DNOW_H__ + +void _mesa_init_3dnow_transform_asm( void ); + +#endif diff --git a/mesalib/src/mesa/x86/3dnow_normal.S b/mesalib/src/mesa/x86/3dnow_normal.S index bfa316588..7f5f6b357 100644 --- a/mesalib/src/mesa/x86/3dnow_normal.S +++ b/mesalib/src/mesa/x86/3dnow_normal.S @@ -1,852 +1,852 @@ -
-/*
- * Mesa 3-D graphics library
- * Version: 5.1
- *
- * Copyright (C) 1999-2003 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * 3Dnow assembly code by Holger Waechtler
- */
-
-#ifdef USE_3DNOW_ASM
-
-#include "assyntax.h"
-#include "matypes.h"
-#include "norm_args.h"
-
- SEG_TEXT
-
-#define M(i) REGOFF(i * 4, ECX)
-#define STRIDE REGOFF(12, ESI)
-
-
-ALIGNTEXT16
-GLOBL GLNAME(_mesa_3dnow_transform_normalize_normals)
-HIDDEN(_mesa_3dnow_transform_normalize_normals)
-GLNAME(_mesa_3dnow_transform_normalize_normals):
-
-#define FRAME_OFFSET 12
-
- PUSH_L ( EDI )
- PUSH_L ( ESI )
- PUSH_L ( EBP )
-
- MOV_L ( ARG_LENGTHS, EDI )
- MOV_L ( ARG_IN, ESI )
- MOV_L ( ARG_DEST, EAX )
- MOV_L ( REGOFF(V4F_COUNT, ESI), EBP ) /* dest->count = in->count */
- MOV_L ( EBP, REGOFF(V4F_COUNT, EAX) )
- MOV_L ( REGOFF(V4F_START, ESI), EDX ) /* in->start */
- MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */
- MOV_L ( ARG_MAT, ECX )
- MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */
-
- CMP_L ( CONST(0), EBP ) /* count > 0 ?? */
- JE ( LLBL (G3TN_end) )
-
- MOV_L ( REGOFF (V4F_COUNT, ESI), EBP )
- FEMMS
-
- PUSH_L ( EBP )
- PUSH_L ( EAX )
- PUSH_L ( EDX ) /* save counter & pointer for */
- /* the normalize pass */
-#undef FRAME_OFFSET
-#define FRAME_OFFSET 24
-
- MOVQ ( M(0), MM3 ) /* m1 | m0 */
- MOVQ ( M(4), MM4 ) /* m5 | m4 */
-
- MOVD ( M(2), MM5 ) /* | m2 */
- PUNPCKLDQ ( M(6), MM5 ) /* m6 | m2 */
-
- MOVQ ( M(8), MM6 ) /* m9 | m8 */
- MOVQ ( M(10), MM7 ) /* | m10 */
-
- CMP_L ( CONST(0), EDI ) /* lengths == 0 ? */
- JNE ( LLBL (G3TN_scale_end ) )
-
- MOVD ( ARG_SCALE, MM0 ) /* | scale */
- PUNPCKLDQ ( MM0, MM0 ) /* scale | scale */
-
- PFMUL ( MM0, MM3 ) /* scale * m1 | scale * m0 */
- PFMUL ( MM0, MM4 ) /* scale * m5 | scale * m4 */
- PFMUL ( MM0, MM5 ) /* scale * m6 | scale * m2 */
- PFMUL ( MM0, MM6 ) /* scale * m9 | scale * m8 */
- PFMUL ( MM0, MM7 ) /* | scale * m10 */
-
-ALIGNTEXT32
-LLBL (G3TN_scale_end):
-LLBL (G3TN_transform):
- MOVQ ( REGIND (EDX), MM0 ) /* x1 | x0 */
- MOVD ( REGOFF (8, EDX), MM2 ) /* | x2 */
-
- MOVQ ( MM0, MM1 ) /* x1 | x0 */
- PUNPCKLDQ ( MM2, MM2 ) /* x2 | x2 */
-
- PFMUL ( MM3, MM0 ) /* x1*m1 | x0*m0 */
- ADD_L ( CONST(16), EAX ) /* next r */
-
- PREFETCHW ( REGIND(EAX) )
-
- PFMUL ( MM4, MM1 ) /* x1*m5 | x0*m4 */
- PFACC ( MM1, MM0 ) /* x0*m4+x1*m5 | x0*m0+x1*m1 */
-
- PFMUL ( MM5, MM2 ) /* x2*m6 | x2*m2 */
- PFADD ( MM2, MM0 ) /* x0*m4+x1*m5+x2*m6| x0*m0+...+x2**/
-
- MOVQ ( REGIND (EDX), MM1 ) /* x1 | x0 */
- MOVQ ( MM0, REGOFF(-16, EAX) ) /* write r0, r1 */
-
- PFMUL ( MM6, MM1 ) /* x1*m9 | x0*m8 */
- MOVD ( REGOFF (8, EDX), MM2 ) /* | x2 */
-
- PFMUL ( MM7, MM2 ) /* | x2*m10 */
- PFACC ( MM1, MM1 ) /* *not used* | x0*m8+x1*m9 */
-
- PFADD ( MM2, MM1 ) /* *not used* | x0*m8+x1*m9+x2*m*/
- ADD_L ( STRIDE, EDX ) /* next normal */
-
- PREFETCH ( REGIND(EDX) )
-
- MOVD ( MM1, REGOFF(-8, EAX) ) /* write r2 */
- SUB_L ( CONST(1), EBP ) /* decrement normal counter */
- JNZ ( LLBL (G3TN_transform) )
-
-
- POP_L ( EDX ) /* end of transform --- */
- POP_L ( EAX ) /* now normalizing ... */
- POP_L ( EBP )
-
- CMP_L ( CONST(0), EDI ) /* lengths == 0 ? */
- JE ( LLBL (G3TN_norm ) ) /* calculate lengths */
-
-
-ALIGNTEXT32
-LLBL (G3TN_norm_w_lengths):
-
- PREFETCHW ( REGOFF(12,EAX) )
-
- MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
- MOVD ( REGOFF(8, EAX), MM1 ) /* | x2 */
-
- MOVD ( REGIND (EDI), MM3 ) /* | length (x) */
- PFMUL ( MM3, MM1 ) /* | x2 (normalize*/
-
- PUNPCKLDQ ( MM3, MM3 ) /* length (x) | length (x) */
- PFMUL ( MM3, MM0 ) /* x1 (normalized) | x0 (normalize*/
-
- ADD_L ( STRIDE, EDX ) /* next normal */
- ADD_L ( CONST(4), EDI ) /* next length */
-
- PREFETCH ( REGIND(EDI) )
-
- MOVQ ( MM0, REGIND(EAX) ) /* write new x0, x1 */
- MOVD ( MM1, REGOFF(8, EAX) ) /* write new x2 */
-
- ADD_L ( CONST(16), EAX ) /* next r */
- SUB_L ( CONST(1), EBP ) /* decrement normal counter */
-
- JNZ ( LLBL (G3TN_norm_w_lengths) )
- JMP ( LLBL (G3TN_exit_3dnow) )
-
-ALIGNTEXT32
-LLBL (G3TN_norm):
-
- PREFETCHW ( REGIND(EAX) )
-
- MOVQ ( REGIND (EAX), MM0 ) /* x1 | x0 */
- MOVD ( REGOFF(8, EAX), MM1 ) /* | x2 */
-
- MOVQ ( MM0, MM3 ) /* x1 | x0 */
- MOVQ ( MM1, MM4 ) /* | x2 */
-
- PFMUL ( MM0, MM3 ) /* x1*x1 | x0*x0 */
- ADD_L ( CONST(16), EAX ) /* next r */
-
- PFMUL ( MM1, MM4 ) /* | x2*x2 */
- PFADD ( MM4, MM3 ) /* | x0*x0+x2*x2 */
-
- PFACC ( MM3, MM3 ) /* **not used** | x0*x0+x1*x1+x2**/
- PFRSQRT ( MM3, MM5 ) /* 1/sqrt (x0*x0+x1*x1+x2*x2) */
-
- MOVQ ( MM5, MM4 )
- PUNPCKLDQ ( MM3, MM3 )
-
- SUB_L ( CONST(1), EBP ) /* decrement normal counter */
- PFMUL ( MM5, MM5 )
-
- PFRSQIT1 ( MM3, MM5 )
- PFRCPIT2 ( MM4, MM5 )
-
- PFMUL ( MM5, MM0 ) /* x1 (normalized) | x0 (normalize*/
-
- MOVQ ( MM0, REGOFF(-16, EAX) ) /* write new x0, x1 */
- PFMUL ( MM5, MM1 ) /* | x2 (normalize*/
-
- MOVD ( MM1, REGOFF(-8, EAX) ) /* write new x2 */
- JNZ ( LLBL (G3TN_norm) )
-
-LLBL (G3TN_exit_3dnow):
- FEMMS
-
-LLBL (G3TN_end):
- POP_L ( EBP )
- POP_L ( ESI )
- POP_L ( EDI )
- RET
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME(_mesa_3dnow_transform_normalize_normals_no_rot)
-HIDDEN(_mesa_3dnow_transform_normalize_normals_no_rot)
-GLNAME(_mesa_3dnow_transform_normalize_normals_no_rot):
-
-#undef FRAME_OFFSET
-#define FRAME_OFFSET 12
-
- PUSH_L ( EDI )
- PUSH_L ( ESI )
- PUSH_L ( EBP )
-
- MOV_L ( ARG_LENGTHS, EDI )
- MOV_L ( ARG_IN, ESI )
- MOV_L ( ARG_DEST, EAX )
- MOV_L ( REGOFF(V4F_COUNT, ESI), EBP ) /* dest->count = in->count */
- MOV_L ( EBP, REGOFF(V4F_COUNT, EAX) )
- MOV_L ( ARG_MAT, ECX )
- MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */
- MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */
- MOV_L ( REGOFF(V4F_START, ESI), EDX ) /* in->start */
-
- CMP_L ( CONST(0), EBP ) /* count > 0 ?? */
- JE ( LLBL (G3TNNR_end) )
-
- FEMMS
-
- MOVD ( M(0), MM0 ) /* | m0 */
- PUNPCKLDQ ( M(5), MM0 ) /* m5 | m0 */
-
- MOVD ( M(10), MM2 ) /* | m10 */
- PUNPCKLDQ ( MM2, MM2 ) /* m10 | m10 */
-
- CMP_L ( CONST(0), EDI ) /* lengths == 0 ? */
- JNE ( LLBL (G3TNNR_scale_end ) )
-
- MOVD ( ARG_SCALE, MM7 ) /* | scale */
- PUNPCKLDQ ( MM7, MM7 ) /* scale | scale */
-
- PFMUL ( MM7, MM0 ) /* scale * m5 | scale * m0 */
- PFMUL ( MM7, MM2 ) /* scale * m10 | scale * m10 */
-
-ALIGNTEXT32
-LLBL (G3TNNR_scale_end):
- CMP_L ( CONST(0), EDI ) /* lengths == 0 ? */
- JE ( LLBL (G3TNNR_norm) ) /* need to calculate lengths */
-
- MOVD ( REGIND(EDI), MM3 ) /* | length (x) */
-
-
-ALIGNTEXT32
-LLBL (G3TNNR_norm_w_lengths): /* use precalculated lengths */
-
- PREFETCHW ( REGIND(EAX) )
-
- MOVQ ( REGIND(EDX), MM6 ) /* x1 | x0 */
- MOVD ( REGOFF(8, EDX), MM7 ) /* | x2 */
-
- PFMUL ( MM0, MM6 ) /* x1*m5 | x0*m0 */
- ADD_L ( STRIDE, EDX ) /* next normal */
-
- PREFETCH ( REGIND(EDX) )
-
- PFMUL ( MM2, MM7 ) /* | x2*m10 */
- ADD_L ( CONST(16), EAX ) /* next r */
-
- PFMUL ( MM3, MM7 ) /* | x2 (normalized) */
- PUNPCKLDQ ( MM3, MM3 ) /* length (x) | length (x) */
-
- ADD_L ( CONST(4), EDI ) /* next length */
- PFMUL ( MM3, MM6 ) /* x1 (normalized) | x0 (normalized) */
-
- SUB_L ( CONST(1), EBP ) /* decrement normal counter */
- MOVQ ( MM6, REGOFF(-16, EAX) ) /* write r0, r1 */
-
- MOVD ( MM7, REGOFF(-8, EAX) ) /* write r2 */
- MOVD ( REGIND(EDI), MM3 ) /* | length (x) */
-
- JNZ ( LLBL (G3TNNR_norm_w_lengths) )
- JMP ( LLBL (G3TNNR_exit_3dnow) )
-
-ALIGNTEXT32
-LLBL (G3TNNR_norm): /* need to calculate lengths */
-
- PREFETCHW ( REGIND(EAX) )
-
- MOVQ ( REGIND(EDX), MM6 ) /* x1 | x0 */
- MOVD ( REGOFF(8, EDX), MM7 ) /* | x2 */
-
- PFMUL ( MM0, MM6 ) /* x1*m5 | x0*m0 */
- ADD_L ( CONST(16), EAX ) /* next r */
-
- PFMUL ( MM2, MM7 ) /* | x2*m10 */
- MOVQ ( MM6, MM3 ) /* x1 (transformed)| x0 (transformed) */
-
- MOVQ ( MM7, MM4 ) /* | x2 (transformed) */
- PFMUL ( MM6, MM3 ) /* x1*x1 | x0*x0 */
-
-
- PFMUL ( MM7, MM4 ) /* | x2*x2 */
- PFACC ( MM3, MM3 ) /* **not used** | x0*x0+x1*x1 */
-
- PFADD ( MM4, MM3 ) /* | x0*x0+x1*x1+x2*x2*/
- ADD_L ( STRIDE, EDX ) /* next normal */
-
- PREFETCH ( REGIND(EDX) )
-
- PFRSQRT ( MM3, MM5 ) /* 1/sqrt (x0*x0+x1*x1+x2*x2) */
- MOVQ ( MM5, MM4 )
-
- PUNPCKLDQ ( MM3, MM3 )
- PFMUL ( MM5, MM5 )
-
- PFRSQIT1 ( MM3, MM5 )
- SUB_L ( CONST(1), EBP ) /* decrement normal counter */
-
- PFRCPIT2 ( MM4, MM5 )
- PFMUL ( MM5, MM6 ) /* x1 (normalized) | x0 (normalized) */
-
- MOVQ ( MM6, REGOFF(-16, EAX) ) /* write r0, r1 */
- PFMUL ( MM5, MM7 ) /* | x2 (normalized) */
-
- MOVD ( MM7, REGOFF(-8, EAX) ) /* write r2 */
- JNZ ( LLBL (G3TNNR_norm) )
-
-
-LLBL (G3TNNR_exit_3dnow):
- FEMMS
-
-LLBL (G3TNNR_end):
- POP_L ( EBP )
- POP_L ( ESI )
- POP_L ( EDI )
- RET
-
-
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME(_mesa_3dnow_transform_rescale_normals_no_rot)
-HIDDEN(_mesa_3dnow_transform_rescale_normals_no_rot)
-GLNAME(_mesa_3dnow_transform_rescale_normals_no_rot):
-
-#undef FRAME_OFFSET
-#define FRAME_OFFSET 12
-
- PUSH_L ( EDI )
- PUSH_L ( ESI )
- PUSH_L ( EBP )
-
- MOV_L ( ARG_IN, EAX )
- MOV_L ( ARG_DEST, EDX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EBP ) /* dest->count = in->count */
- MOV_L ( EBP, REGOFF(V4F_COUNT, EDX) )
- MOV_L ( ARG_IN, ESI )
- MOV_L ( ARG_MAT, ECX )
- MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */
- MOV_L ( REGOFF(V4F_START, EDX), EAX ) /* dest->start */
- MOV_L ( REGOFF(V4F_START, ESI), EDX ) /* in->start */
-
- CMP_L ( CONST(0), EBP )
- JE ( LLBL (G3TRNR_end) )
-
- FEMMS
-
- MOVD ( ARG_SCALE, MM6 ) /* | scale */
- PUNPCKLDQ ( MM6, MM6 ) /* scale | scale */
-
- MOVD ( REGIND(ECX), MM0 ) /* | m0 */
- PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m5 | m0 */
-
- PFMUL ( MM6, MM0 ) /* scale*m5 | scale*m0 */
- MOVD ( REGOFF(40, ECX), MM2 ) /* | m10 */
-
- PFMUL ( MM6, MM2 ) /* | scale*m10 */
-
-ALIGNTEXT32
-LLBL (G3TRNR_rescale):
-
- PREFETCHW ( REGIND(EAX) )
-
- MOVQ ( REGIND(EDX), MM4 ) /* x1 | x0 */
- MOVD ( REGOFF(8, EDX), MM5 ) /* | x2 */
-
- PFMUL ( MM0, MM4 ) /* x1*m5 | x0*m0 */
- ADD_L ( STRIDE, EDX ) /* next normal */
-
- PREFETCH ( REGIND(EDX) )
-
- PFMUL ( MM2, MM5 ) /* | x2*m10 */
- ADD_L ( CONST(16), EAX ) /* next r */
-
- SUB_L ( CONST(1), EBP ) /* decrement normal counter */
- MOVQ ( MM4, REGOFF(-16, EAX) ) /* write r0, r1 */
-
- MOVD ( MM5, REGOFF(-8, EAX) ) /* write r2 */
- JNZ ( LLBL (G3TRNR_rescale) ) /* cnt > 0 ? -> process next normal */
-
- FEMMS
-
-LLBL (G3TRNR_end):
- POP_L ( EBP )
- POP_L ( ESI )
- POP_L ( EDI )
- RET
-
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME(_mesa_3dnow_transform_rescale_normals)
-HIDDEN(_mesa_3dnow_transform_rescale_normals)
-GLNAME(_mesa_3dnow_transform_rescale_normals):
-
-#undef FRAME_OFFSET
-#define FRAME_OFFSET 8
-
- PUSH_L ( EDI )
- PUSH_L ( ESI )
-
- MOV_L ( ARG_IN, ESI )
- MOV_L ( ARG_DEST, EAX )
- MOV_L ( ARG_MAT, ECX )
- MOV_L ( REGOFF(V4F_COUNT, ESI), EDI ) /* dest->count = in->count */
- MOV_L ( EDI, REGOFF(V4F_COUNT, EAX) )
- MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */
- MOV_L ( REGOFF(V4F_START, ESI), EDX ) /* in->start */
- MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */
-
- CMP_L ( CONST(0), EDI )
- JE ( LLBL (G3TR_end) )
-
- FEMMS
-
- MOVQ ( REGIND(ECX), MM3 ) /* m1 | m0 */
-
- MOVQ ( REGOFF(16,ECX), MM4 ) /* m5 | m4 */
- MOVD ( ARG_SCALE, MM0 ) /* scale */
-
- MOVD ( REGOFF(8,ECX), MM5 ) /* | m2 */
- PUNPCKLDQ ( MM0, MM0 ) /* scale | scale */
-
- PUNPCKLDQ ( REGOFF(24, ECX), MM5 )
- PFMUL ( MM0, MM3 ) /* scale*m1 | scale*m0 */
-
- MOVQ ( REGOFF(32, ECX), MM6 ) /* m9 | m8*/
- PFMUL ( MM0, MM4 ) /* scale*m5 | scale*m4 */
-
- MOVD ( REGOFF(40, ECX), MM7 ) /* | m10 */
- PFMUL ( MM0, MM5 ) /* scale*m6 | scale*m2 */
-
- PFMUL ( MM0, MM6 ) /* scale*m9 | scale*m8 */
-
- PFMUL ( MM0, MM7 ) /* | scale*m10 */
-
-ALIGNTEXT32
-LLBL (G3TR_rescale):
-
- PREFETCHW ( REGIND(EAX) )
-
- MOVQ ( REGIND(EDX), MM0 ) /* x1 | x0 */
- MOVD ( REGOFF(8, EDX), MM2 ) /* | x2 */
-
- MOVQ ( MM0, MM1 ) /* x1 | x0 */
- PUNPCKLDQ ( MM2, MM2 ) /* x2 | x2 */
-
- PFMUL ( MM3, MM0 ) /* x1*m1 | x0*m0 */
- ADD_L ( CONST(16), EAX ) /* next r */
-
- PFMUL ( MM4, MM1 ) /* x1*m5 | x0*m4 */
- PFACC ( MM1, MM0 ) /* x0*m4+x1*m5 | x0*m0+x1*m1 */
-
- MOVQ ( REGIND(EDX), MM1 ) /* x1 | x0 */
-
- PFMUL ( MM5, MM2 ) /* x2*m6 | x2*m2 */
- PFADD ( MM2, MM0 ) /* x0*m4...+x2*m6| x0*m0+x1*m1+x2*m2 */
-
- MOVD ( REGOFF(8, EDX), MM2 ) /* | x2 */
- ADD_L ( STRIDE, EDX ) /* next normal */
-
- PREFETCH ( REGIND(EDX) )
-
- MOVQ ( MM0, REGOFF(-16, EAX) ) /* write r0, r1 */
- PFMUL ( MM6, MM1 ) /* x1*m9 | x0*m8 */
-
- PFMUL ( MM7, MM2 ) /* | x2*m10 */
- PFACC ( MM1, MM1 ) /* *not used* | x0*m8+x1*m9 */
-
- PFADD ( MM2, MM1 ) /* *not used* | x0*m8+x1*m9+x2*m10 */
- MOVD ( MM1, REGOFF(-8, EAX) ) /* write r2 */
-
- SUB_L ( CONST(1), EDI ) /* decrement normal counter */
- JNZ ( LLBL (G3TR_rescale) )
-
- FEMMS
-
-LLBL (G3TR_end):
- POP_L ( ESI )
- POP_L ( EDI )
- RET
-
-
-
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME(_mesa_3dnow_transform_normals_no_rot)
-HIDDEN(_mesa_3dnow_transform_normals_no_rot)
-GLNAME(_mesa_3dnow_transform_normals_no_rot):
-
-#undef FRAME_OFFSET
-#define FRAME_OFFSET 8
-
- PUSH_L ( EDI )
- PUSH_L ( ESI )
-
- MOV_L ( ARG_IN, ESI )
- MOV_L ( ARG_DEST, EAX )
- MOV_L ( ARG_MAT, ECX )
- MOV_L ( REGOFF(V4F_COUNT, ESI), EDI ) /* dest->count = in->count */
- MOV_L ( EDI, REGOFF(V4F_COUNT, EAX) )
- MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */
- MOV_L ( REGOFF(V4F_START, ESI), EDX ) /* in->start */
- MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */
-
- CMP_L ( CONST(0), EDI )
- JE ( LLBL (G3TNR_end) )
-
- FEMMS
-
- MOVD ( REGIND(ECX), MM0 ) /* | m0 */
- PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m5 | m0 */
-
- MOVD ( REGOFF(40, ECX), MM2 ) /* | m10 */
- PUNPCKLDQ ( MM2, MM2 ) /* m10 | m10 */
-
-ALIGNTEXT32
-LLBL (G3TNR_transform):
-
- PREFETCHW ( REGIND(EAX) )
-
- MOVQ ( REGIND(EDX), MM4 ) /* x1 | x0 */
- MOVD ( REGOFF(8, EDX), MM5 ) /* | x2 */
-
- PFMUL ( MM0, MM4 ) /* x1*m5 | x0*m0 */
- ADD_L ( STRIDE, EDX) /* next normal */
-
- PREFETCH ( REGIND(EDX) )
-
- PFMUL ( MM2, MM5 ) /* | x2*m10 */
- ADD_L ( CONST(16), EAX ) /* next r */
-
- SUB_L ( CONST(1), EDI ) /* decrement normal counter */
- MOVQ ( MM4, REGOFF(-16, EAX) ) /* write r0, r1 */
-
- MOVD ( MM5, REGOFF(-8, EAX) ) /* write r2 */
- JNZ ( LLBL (G3TNR_transform) )
-
- FEMMS
-
-LLBL (G3TNR_end):
- POP_L ( ESI )
- POP_L ( EDI )
- RET
-
-
-
-
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME(_mesa_3dnow_transform_normals)
-HIDDEN(_mesa_3dnow_transform_normals)
-GLNAME(_mesa_3dnow_transform_normals):
-
-#undef FRAME_OFFSET
-#define FRAME_OFFSET 8
-
- PUSH_L ( EDI )
- PUSH_L ( ESI )
-
- MOV_L ( ARG_IN, ESI )
- MOV_L ( ARG_DEST, EAX )
- MOV_L ( ARG_MAT, ECX )
- MOV_L ( REGOFF(V4F_COUNT, ESI), EDI ) /* dest->count = in->count */
- MOV_L ( EDI, REGOFF(V4F_COUNT, EAX) )
- MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */
- MOV_L ( REGOFF(V4F_START, ESI), EDX ) /* in->start */
- MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */
-
- CMP_L ( CONST(0), EDI ) /* count > 0 ?? */
- JE ( LLBL (G3T_end) )
-
- FEMMS
-
- MOVQ ( REGIND(ECX), MM3 ) /* m1 | m0 */
- MOVQ ( REGOFF(16, ECX), MM4 ) /* m5 | m4 */
-
- MOVD ( REGOFF(8, ECX), MM5 ) /* | m2 */
- PUNPCKLDQ ( REGOFF(24, ECX), MM5 ) /* m6 | m2 */
-
- MOVQ ( REGOFF(32, ECX), MM6 ) /* m9 | m8 */
- MOVD ( REGOFF(40, ECX), MM7 ) /* | m10 */
-
-ALIGNTEXT32
-LLBL (G3T_transform):
-
- PREFETCHW ( REGIND(EAX) )
-
- MOVQ ( REGIND(EDX), MM0 ) /* x1 | x0 */
- MOVD ( REGOFF(8, EDX), MM2 ) /* | x2 */
-
- MOVQ ( MM0, MM1 ) /* x1 | x0 */
- PUNPCKLDQ ( MM2, MM2 ) /* x2 | x2 */
-
- PFMUL ( MM3, MM0 ) /* x1*m1 | x0*m0 */
- ADD_L ( CONST(16), EAX ) /* next r */
-
- PFMUL ( MM4, MM1 ) /* x1*m5 | x0*m4 */
- PFACC ( MM1, MM0 ) /* x0*m4+x1*m5 | x0*m0+x1*m1 */
-
- PFMUL ( MM5, MM2 ) /* x2*m6 | x2*m2 */
- PFADD ( MM2, MM0 ) /* x0*m4...+x2*m6| x0*m0+x1*m1+x2*m2 */
-
- MOVQ ( REGIND(EDX), MM1 ) /* x1 | x0 */
- MOVQ ( MM0, REGOFF(-16, EAX) ) /* write r0, r1 */
-
- PFMUL ( MM6, MM1 ) /* x1*m9 | x0*m8 */
- MOVD ( REGOFF(8, EDX), MM2 ) /* | x2 */
-
- PFMUL ( MM7, MM2 ) /* | x2*m10 */
- ADD_L ( STRIDE, EDX ) /* next normal */
-
- PREFETCH ( REGIND(EDX) )
-
- PFACC ( MM1, MM1 ) /* *not used* | x0*m8+x1*m9 */
- PFADD ( MM2, MM1 ) /* *not used* | x0*m8+x1*m9+x2*m10 */
-
- MOVD ( MM1, REGOFF(-8, EAX) ) /* write r2 */
- SUB_L ( CONST(1), EDI ) /* decrement normal counter */
-
- JNZ ( LLBL (G3T_transform) )
-
- FEMMS
-
-LLBL (G3T_end):
- POP_L ( ESI )
- POP_L ( EDI )
- RET
-
-
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME(_mesa_3dnow_normalize_normals)
-HIDDEN(_mesa_3dnow_normalize_normals)
-GLNAME(_mesa_3dnow_normalize_normals):
-
-#undef FRAME_OFFSET
-#define FRAME_OFFSET 12
-
- PUSH_L ( EDI )
- PUSH_L ( ESI )
- PUSH_L ( EBP )
-
- MOV_L ( ARG_IN, ESI )
- MOV_L ( ARG_DEST, EAX )
- MOV_L ( REGOFF(V4F_COUNT, ESI), EBP ) /* dest->count = in->count */
- MOV_L ( EBP, REGOFF(V4F_COUNT, EAX) )
- MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */
- MOV_L ( REGOFF(V4F_START, ESI), ECX ) /* in->start */
- MOV_L ( ARG_LENGTHS, EDX )
-
- CMP_L ( CONST(0), EBP ) /* count > 0 ?? */
- JE ( LLBL (G3N_end) )
-
- FEMMS
-
- CMP_L ( CONST(0), EDX ) /* lengths == 0 ? */
- JE ( LLBL (G3N_norm2) ) /* calculate lengths */
-
-ALIGNTEXT32
-LLBL (G3N_norm1): /* use precalculated lengths */
-
- PREFETCH ( REGIND(EAX) )
-
- MOVQ ( REGIND(ECX), MM0 ) /* x1 | x0 */
- MOVD ( REGOFF(8, ECX), MM1 ) /* | x2 */
-
- MOVD ( REGIND(EDX), MM3 ) /* | length (x) */
- PFMUL ( MM3, MM1 ) /* | x2 (normalized) */
-
- PUNPCKLDQ ( MM3, MM3 ) /* length (x) | length (x) */
- ADD_L ( STRIDE, ECX ) /* next normal */
-
- PREFETCH ( REGIND(ECX) )
-
- PFMUL ( MM3, MM0 ) /* x1 (normalized) | x0 (normalized) */
- MOVQ ( MM0, REGIND(EAX) ) /* write new x0, x1 */
-
- MOVD ( MM1, REGOFF(8, EAX) ) /* write new x2 */
- ADD_L ( CONST(16), EAX ) /* next r */
-
- ADD_L ( CONST(4), EDX ) /* next length */
- SUB_L ( CONST(1), EBP ) /* decrement normal counter */
-
- JNZ ( LLBL (G3N_norm1) )
-
- JMP ( LLBL (G3N_end1) )
-
-ALIGNTEXT32
-LLBL (G3N_norm2): /* need to calculate lengths */
-
- PREFETCHW ( REGIND(EAX) )
-
- PREFETCH ( REGIND(ECX) )
-
- MOVQ ( REGIND(ECX), MM0 ) /* x1 | x0 */
- MOVD ( REGOFF(8, ECX), MM1 ) /* | x2 */
-
- MOVQ ( MM0, MM3 ) /* x1 | x0 */
- ADD_L ( STRIDE, ECX ) /* next normal */
-
- PFMUL ( MM0, MM3 ) /* x1*x1 | x0*x0 */
- MOVQ ( MM1, MM4 ) /* | x2 */
-
- ADD_L ( CONST(16), EAX ) /* next r */
- PFMUL ( MM1, MM4 ) /* | x2*x2 */
-
- PFADD ( MM4, MM3 ) /* | x0*x0+x2*x2 */
- PFACC ( MM3, MM3 ) /* x0*x0+...+x2*x2 | x0*x0+x1*x1+x2*x2*/
-
- PFRSQRT ( MM3, MM5 ) /* 1/sqrt (x0*x0+x1*x1+x2*x2) */
- MOVQ ( MM5, MM4 )
-
- PUNPCKLDQ ( MM3, MM3 )
- PFMUL ( MM5, MM5 )
-
- PFRSQIT1 ( MM3, MM5 )
- SUB_L ( CONST(1), EBP ) /* decrement normal counter */
-
- PFRCPIT2 ( MM4, MM5 )
-
- PFMUL ( MM5, MM0 ) /* x1 (normalized) | x0 (normalized) */
- MOVQ ( MM0, REGOFF(-16, EAX) ) /* write new x0, x1 */
-
- PFMUL ( MM5, MM1 ) /* | x2 (normalized) */
- MOVD ( MM1, REGOFF(-8, EAX) ) /* write new x2 */
-
- JNZ ( LLBL (G3N_norm2) )
-
-LLBL (G3N_end1):
- FEMMS
-
-LLBL (G3N_end):
- POP_L ( EBP )
- POP_L ( ESI )
- POP_L ( EDI )
- RET
-
-
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME(_mesa_3dnow_rescale_normals)
-HIDDEN(_mesa_3dnow_rescale_normals)
-GLNAME(_mesa_3dnow_rescale_normals):
-
-#undef FRAME_OFFSET
-#define FRAME_OFFSET 8
- PUSH_L ( EDI )
- PUSH_L ( ESI )
-
- MOV_L ( ARG_IN, ESI )
- MOV_L ( ARG_DEST, EAX )
- MOV_L ( REGOFF(V4F_COUNT, ESI), EDX ) /* dest->count = in->count */
- MOV_L ( EDX, REGOFF(V4F_COUNT, EAX) )
- MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */
- MOV_L ( REGOFF(V4F_START, ESI), ECX ) /* in->start */
-
- CMP_L ( CONST(0), EDX )
- JE ( LLBL (G3R_end) )
-
- FEMMS
-
- MOVD ( ARG_SCALE, MM0 ) /* scale */
- PUNPCKLDQ ( MM0, MM0 )
-
-ALIGNTEXT32
-LLBL (G3R_rescale):
-
- PREFETCHW ( REGIND(EAX) )
-
- MOVQ ( REGIND(ECX), MM1 ) /* x1 | x0 */
- MOVD ( REGOFF(8, ECX), MM2 ) /* | x2 */
-
- PFMUL ( MM0, MM1 ) /* x1*scale | x0*scale */
- ADD_L ( STRIDE, ECX ) /* next normal */
-
- PREFETCH ( REGIND(ECX) )
-
- PFMUL ( MM0, MM2 ) /* | x2*scale */
- ADD_L ( CONST(16), EAX ) /* next r */
-
- MOVQ ( MM1, REGOFF(-16, EAX) ) /* write r0, r1 */
- MOVD ( MM2, REGOFF(-8, EAX) ) /* write r2 */
-
- SUB_L ( CONST(1), EDX ) /* decrement normal counter */
- JNZ ( LLBL (G3R_rescale) )
-
- FEMMS
-
-LLBL (G3R_end):
- POP_L ( ESI )
- POP_L ( EDI )
- RET
-
-#endif
-
-#if defined (__ELF__) && defined (__linux__)
- .section .note.GNU-stack,"",%progbits
-#endif
+ +/* + * Mesa 3-D graphics library + * Version: 5.1 + * + * Copyright (C) 1999-2003 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * 3Dnow assembly code by Holger Waechtler + */ + +#ifdef USE_3DNOW_ASM + +#include "assyntax.h" +#include "matypes.h" +#include "norm_args.h" + + SEG_TEXT + +#define M(i) REGOFF(i * 4, ECX) +#define STRIDE REGOFF(12, ESI) + + +ALIGNTEXT16 +GLOBL GLNAME(_mesa_3dnow_transform_normalize_normals) +HIDDEN(_mesa_3dnow_transform_normalize_normals) +GLNAME(_mesa_3dnow_transform_normalize_normals): + +#define FRAME_OFFSET 12 + + PUSH_L ( EDI ) + PUSH_L ( ESI ) + PUSH_L ( EBP ) + + MOV_L ( ARG_LENGTHS, EDI ) + MOV_L ( ARG_IN, ESI ) + MOV_L ( ARG_DEST, EAX ) + MOV_L ( REGOFF(V4F_COUNT, ESI), EBP ) /* dest->count = in->count */ + MOV_L ( EBP, REGOFF(V4F_COUNT, EAX) ) + MOV_L ( REGOFF(V4F_START, ESI), EDX ) /* in->start */ + MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */ + MOV_L ( ARG_MAT, ECX ) + MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */ + + CMP_L ( CONST(0), EBP ) /* count > 0 ?? */ + JE ( LLBL (G3TN_end) ) + + MOV_L ( REGOFF (V4F_COUNT, ESI), EBP ) + FEMMS + + PUSH_L ( EBP ) + PUSH_L ( EAX ) + PUSH_L ( EDX ) /* save counter & pointer for */ + /* the normalize pass */ +#undef FRAME_OFFSET +#define FRAME_OFFSET 24 + + MOVQ ( M(0), MM3 ) /* m1 | m0 */ + MOVQ ( M(4), MM4 ) /* m5 | m4 */ + + MOVD ( M(2), MM5 ) /* | m2 */ + PUNPCKLDQ ( M(6), MM5 ) /* m6 | m2 */ + + MOVQ ( M(8), MM6 ) /* m9 | m8 */ + MOVQ ( M(10), MM7 ) /* | m10 */ + + CMP_L ( CONST(0), EDI ) /* lengths == 0 ? */ + JNE ( LLBL (G3TN_scale_end ) ) + + MOVD ( ARG_SCALE, MM0 ) /* | scale */ + PUNPCKLDQ ( MM0, MM0 ) /* scale | scale */ + + PFMUL ( MM0, MM3 ) /* scale * m1 | scale * m0 */ + PFMUL ( MM0, MM4 ) /* scale * m5 | scale * m4 */ + PFMUL ( MM0, MM5 ) /* scale * m6 | scale * m2 */ + PFMUL ( MM0, MM6 ) /* scale * m9 | scale * m8 */ + PFMUL ( MM0, MM7 ) /* | scale * m10 */ + +ALIGNTEXT32 +LLBL (G3TN_scale_end): +LLBL (G3TN_transform): + MOVQ ( REGIND (EDX), MM0 ) /* x1 | x0 */ + MOVD ( REGOFF (8, EDX), MM2 ) /* | x2 */ + + MOVQ ( MM0, MM1 ) /* x1 | x0 */ + PUNPCKLDQ ( MM2, MM2 ) /* x2 | x2 */ + + PFMUL ( MM3, MM0 ) /* x1*m1 | x0*m0 */ + ADD_L ( CONST(16), EAX ) /* next r */ + + PREFETCHW ( REGIND(EAX) ) + + PFMUL ( MM4, MM1 ) /* x1*m5 | x0*m4 */ + PFACC ( MM1, MM0 ) /* x0*m4+x1*m5 | x0*m0+x1*m1 */ + + PFMUL ( MM5, MM2 ) /* x2*m6 | x2*m2 */ + PFADD ( MM2, MM0 ) /* x0*m4+x1*m5+x2*m6| x0*m0+...+x2**/ + + MOVQ ( REGIND (EDX), MM1 ) /* x1 | x0 */ + MOVQ ( MM0, REGOFF(-16, EAX) ) /* write r0, r1 */ + + PFMUL ( MM6, MM1 ) /* x1*m9 | x0*m8 */ + MOVD ( REGOFF (8, EDX), MM2 ) /* | x2 */ + + PFMUL ( MM7, MM2 ) /* | x2*m10 */ + PFACC ( MM1, MM1 ) /* *not used* | x0*m8+x1*m9 */ + + PFADD ( MM2, MM1 ) /* *not used* | x0*m8+x1*m9+x2*m*/ + ADD_L ( STRIDE, EDX ) /* next normal */ + + PREFETCH ( REGIND(EDX) ) + + MOVD ( MM1, REGOFF(-8, EAX) ) /* write r2 */ + SUB_L ( CONST(1), EBP ) /* decrement normal counter */ + JNZ ( LLBL (G3TN_transform) ) + + + POP_L ( EDX ) /* end of transform --- */ + POP_L ( EAX ) /* now normalizing ... */ + POP_L ( EBP ) + + CMP_L ( CONST(0), EDI ) /* lengths == 0 ? */ + JE ( LLBL (G3TN_norm ) ) /* calculate lengths */ + + +ALIGNTEXT32 +LLBL (G3TN_norm_w_lengths): + + PREFETCHW ( REGOFF(12,EAX) ) + + MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */ + MOVD ( REGOFF(8, EAX), MM1 ) /* | x2 */ + + MOVD ( REGIND (EDI), MM3 ) /* | length (x) */ + PFMUL ( MM3, MM1 ) /* | x2 (normalize*/ + + PUNPCKLDQ ( MM3, MM3 ) /* length (x) | length (x) */ + PFMUL ( MM3, MM0 ) /* x1 (normalized) | x0 (normalize*/ + + ADD_L ( STRIDE, EDX ) /* next normal */ + ADD_L ( CONST(4), EDI ) /* next length */ + + PREFETCH ( REGIND(EDI) ) + + MOVQ ( MM0, REGIND(EAX) ) /* write new x0, x1 */ + MOVD ( MM1, REGOFF(8, EAX) ) /* write new x2 */ + + ADD_L ( CONST(16), EAX ) /* next r */ + SUB_L ( CONST(1), EBP ) /* decrement normal counter */ + + JNZ ( LLBL (G3TN_norm_w_lengths) ) + JMP ( LLBL (G3TN_exit_3dnow) ) + +ALIGNTEXT32 +LLBL (G3TN_norm): + + PREFETCHW ( REGIND(EAX) ) + + MOVQ ( REGIND (EAX), MM0 ) /* x1 | x0 */ + MOVD ( REGOFF(8, EAX), MM1 ) /* | x2 */ + + MOVQ ( MM0, MM3 ) /* x1 | x0 */ + MOVQ ( MM1, MM4 ) /* | x2 */ + + PFMUL ( MM0, MM3 ) /* x1*x1 | x0*x0 */ + ADD_L ( CONST(16), EAX ) /* next r */ + + PFMUL ( MM1, MM4 ) /* | x2*x2 */ + PFADD ( MM4, MM3 ) /* | x0*x0+x2*x2 */ + + PFACC ( MM3, MM3 ) /* **not used** | x0*x0+x1*x1+x2**/ + PFRSQRT ( MM3, MM5 ) /* 1/sqrt (x0*x0+x1*x1+x2*x2) */ + + MOVQ ( MM5, MM4 ) + PUNPCKLDQ ( MM3, MM3 ) + + SUB_L ( CONST(1), EBP ) /* decrement normal counter */ + PFMUL ( MM5, MM5 ) + + PFRSQIT1 ( MM3, MM5 ) + PFRCPIT2 ( MM4, MM5 ) + + PFMUL ( MM5, MM0 ) /* x1 (normalized) | x0 (normalize*/ + + MOVQ ( MM0, REGOFF(-16, EAX) ) /* write new x0, x1 */ + PFMUL ( MM5, MM1 ) /* | x2 (normalize*/ + + MOVD ( MM1, REGOFF(-8, EAX) ) /* write new x2 */ + JNZ ( LLBL (G3TN_norm) ) + +LLBL (G3TN_exit_3dnow): + FEMMS + +LLBL (G3TN_end): + POP_L ( EBP ) + POP_L ( ESI ) + POP_L ( EDI ) + RET + + + +ALIGNTEXT16 +GLOBL GLNAME(_mesa_3dnow_transform_normalize_normals_no_rot) +HIDDEN(_mesa_3dnow_transform_normalize_normals_no_rot) +GLNAME(_mesa_3dnow_transform_normalize_normals_no_rot): + +#undef FRAME_OFFSET +#define FRAME_OFFSET 12 + + PUSH_L ( EDI ) + PUSH_L ( ESI ) + PUSH_L ( EBP ) + + MOV_L ( ARG_LENGTHS, EDI ) + MOV_L ( ARG_IN, ESI ) + MOV_L ( ARG_DEST, EAX ) + MOV_L ( REGOFF(V4F_COUNT, ESI), EBP ) /* dest->count = in->count */ + MOV_L ( EBP, REGOFF(V4F_COUNT, EAX) ) + MOV_L ( ARG_MAT, ECX ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */ + MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */ + MOV_L ( REGOFF(V4F_START, ESI), EDX ) /* in->start */ + + CMP_L ( CONST(0), EBP ) /* count > 0 ?? */ + JE ( LLBL (G3TNNR_end) ) + + FEMMS + + MOVD ( M(0), MM0 ) /* | m0 */ + PUNPCKLDQ ( M(5), MM0 ) /* m5 | m0 */ + + MOVD ( M(10), MM2 ) /* | m10 */ + PUNPCKLDQ ( MM2, MM2 ) /* m10 | m10 */ + + CMP_L ( CONST(0), EDI ) /* lengths == 0 ? */ + JNE ( LLBL (G3TNNR_scale_end ) ) + + MOVD ( ARG_SCALE, MM7 ) /* | scale */ + PUNPCKLDQ ( MM7, MM7 ) /* scale | scale */ + + PFMUL ( MM7, MM0 ) /* scale * m5 | scale * m0 */ + PFMUL ( MM7, MM2 ) /* scale * m10 | scale * m10 */ + +ALIGNTEXT32 +LLBL (G3TNNR_scale_end): + CMP_L ( CONST(0), EDI ) /* lengths == 0 ? */ + JE ( LLBL (G3TNNR_norm) ) /* need to calculate lengths */ + + MOVD ( REGIND(EDI), MM3 ) /* | length (x) */ + + +ALIGNTEXT32 +LLBL (G3TNNR_norm_w_lengths): /* use precalculated lengths */ + + PREFETCHW ( REGIND(EAX) ) + + MOVQ ( REGIND(EDX), MM6 ) /* x1 | x0 */ + MOVD ( REGOFF(8, EDX), MM7 ) /* | x2 */ + + PFMUL ( MM0, MM6 ) /* x1*m5 | x0*m0 */ + ADD_L ( STRIDE, EDX ) /* next normal */ + + PREFETCH ( REGIND(EDX) ) + + PFMUL ( MM2, MM7 ) /* | x2*m10 */ + ADD_L ( CONST(16), EAX ) /* next r */ + + PFMUL ( MM3, MM7 ) /* | x2 (normalized) */ + PUNPCKLDQ ( MM3, MM3 ) /* length (x) | length (x) */ + + ADD_L ( CONST(4), EDI ) /* next length */ + PFMUL ( MM3, MM6 ) /* x1 (normalized) | x0 (normalized) */ + + SUB_L ( CONST(1), EBP ) /* decrement normal counter */ + MOVQ ( MM6, REGOFF(-16, EAX) ) /* write r0, r1 */ + + MOVD ( MM7, REGOFF(-8, EAX) ) /* write r2 */ + MOVD ( REGIND(EDI), MM3 ) /* | length (x) */ + + JNZ ( LLBL (G3TNNR_norm_w_lengths) ) + JMP ( LLBL (G3TNNR_exit_3dnow) ) + +ALIGNTEXT32 +LLBL (G3TNNR_norm): /* need to calculate lengths */ + + PREFETCHW ( REGIND(EAX) ) + + MOVQ ( REGIND(EDX), MM6 ) /* x1 | x0 */ + MOVD ( REGOFF(8, EDX), MM7 ) /* | x2 */ + + PFMUL ( MM0, MM6 ) /* x1*m5 | x0*m0 */ + ADD_L ( CONST(16), EAX ) /* next r */ + + PFMUL ( MM2, MM7 ) /* | x2*m10 */ + MOVQ ( MM6, MM3 ) /* x1 (transformed)| x0 (transformed) */ + + MOVQ ( MM7, MM4 ) /* | x2 (transformed) */ + PFMUL ( MM6, MM3 ) /* x1*x1 | x0*x0 */ + + + PFMUL ( MM7, MM4 ) /* | x2*x2 */ + PFACC ( MM3, MM3 ) /* **not used** | x0*x0+x1*x1 */ + + PFADD ( MM4, MM3 ) /* | x0*x0+x1*x1+x2*x2*/ + ADD_L ( STRIDE, EDX ) /* next normal */ + + PREFETCH ( REGIND(EDX) ) + + PFRSQRT ( MM3, MM5 ) /* 1/sqrt (x0*x0+x1*x1+x2*x2) */ + MOVQ ( MM5, MM4 ) + + PUNPCKLDQ ( MM3, MM3 ) + PFMUL ( MM5, MM5 ) + + PFRSQIT1 ( MM3, MM5 ) + SUB_L ( CONST(1), EBP ) /* decrement normal counter */ + + PFRCPIT2 ( MM4, MM5 ) + PFMUL ( MM5, MM6 ) /* x1 (normalized) | x0 (normalized) */ + + MOVQ ( MM6, REGOFF(-16, EAX) ) /* write r0, r1 */ + PFMUL ( MM5, MM7 ) /* | x2 (normalized) */ + + MOVD ( MM7, REGOFF(-8, EAX) ) /* write r2 */ + JNZ ( LLBL (G3TNNR_norm) ) + + +LLBL (G3TNNR_exit_3dnow): + FEMMS + +LLBL (G3TNNR_end): + POP_L ( EBP ) + POP_L ( ESI ) + POP_L ( EDI ) + RET + + + + + + +ALIGNTEXT16 +GLOBL GLNAME(_mesa_3dnow_transform_rescale_normals_no_rot) +HIDDEN(_mesa_3dnow_transform_rescale_normals_no_rot) +GLNAME(_mesa_3dnow_transform_rescale_normals_no_rot): + +#undef FRAME_OFFSET +#define FRAME_OFFSET 12 + + PUSH_L ( EDI ) + PUSH_L ( ESI ) + PUSH_L ( EBP ) + + MOV_L ( ARG_IN, EAX ) + MOV_L ( ARG_DEST, EDX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EBP ) /* dest->count = in->count */ + MOV_L ( EBP, REGOFF(V4F_COUNT, EDX) ) + MOV_L ( ARG_IN, ESI ) + MOV_L ( ARG_MAT, ECX ) + MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */ + MOV_L ( REGOFF(V4F_START, EDX), EAX ) /* dest->start */ + MOV_L ( REGOFF(V4F_START, ESI), EDX ) /* in->start */ + + CMP_L ( CONST(0), EBP ) + JE ( LLBL (G3TRNR_end) ) + + FEMMS + + MOVD ( ARG_SCALE, MM6 ) /* | scale */ + PUNPCKLDQ ( MM6, MM6 ) /* scale | scale */ + + MOVD ( REGIND(ECX), MM0 ) /* | m0 */ + PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m5 | m0 */ + + PFMUL ( MM6, MM0 ) /* scale*m5 | scale*m0 */ + MOVD ( REGOFF(40, ECX), MM2 ) /* | m10 */ + + PFMUL ( MM6, MM2 ) /* | scale*m10 */ + +ALIGNTEXT32 +LLBL (G3TRNR_rescale): + + PREFETCHW ( REGIND(EAX) ) + + MOVQ ( REGIND(EDX), MM4 ) /* x1 | x0 */ + MOVD ( REGOFF(8, EDX), MM5 ) /* | x2 */ + + PFMUL ( MM0, MM4 ) /* x1*m5 | x0*m0 */ + ADD_L ( STRIDE, EDX ) /* next normal */ + + PREFETCH ( REGIND(EDX) ) + + PFMUL ( MM2, MM5 ) /* | x2*m10 */ + ADD_L ( CONST(16), EAX ) /* next r */ + + SUB_L ( CONST(1), EBP ) /* decrement normal counter */ + MOVQ ( MM4, REGOFF(-16, EAX) ) /* write r0, r1 */ + + MOVD ( MM5, REGOFF(-8, EAX) ) /* write r2 */ + JNZ ( LLBL (G3TRNR_rescale) ) /* cnt > 0 ? -> process next normal */ + + FEMMS + +LLBL (G3TRNR_end): + POP_L ( EBP ) + POP_L ( ESI ) + POP_L ( EDI ) + RET + + + + + +ALIGNTEXT16 +GLOBL GLNAME(_mesa_3dnow_transform_rescale_normals) +HIDDEN(_mesa_3dnow_transform_rescale_normals) +GLNAME(_mesa_3dnow_transform_rescale_normals): + +#undef FRAME_OFFSET +#define FRAME_OFFSET 8 + + PUSH_L ( EDI ) + PUSH_L ( ESI ) + + MOV_L ( ARG_IN, ESI ) + MOV_L ( ARG_DEST, EAX ) + MOV_L ( ARG_MAT, ECX ) + MOV_L ( REGOFF(V4F_COUNT, ESI), EDI ) /* dest->count = in->count */ + MOV_L ( EDI, REGOFF(V4F_COUNT, EAX) ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */ + MOV_L ( REGOFF(V4F_START, ESI), EDX ) /* in->start */ + MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */ + + CMP_L ( CONST(0), EDI ) + JE ( LLBL (G3TR_end) ) + + FEMMS + + MOVQ ( REGIND(ECX), MM3 ) /* m1 | m0 */ + + MOVQ ( REGOFF(16,ECX), MM4 ) /* m5 | m4 */ + MOVD ( ARG_SCALE, MM0 ) /* scale */ + + MOVD ( REGOFF(8,ECX), MM5 ) /* | m2 */ + PUNPCKLDQ ( MM0, MM0 ) /* scale | scale */ + + PUNPCKLDQ ( REGOFF(24, ECX), MM5 ) + PFMUL ( MM0, MM3 ) /* scale*m1 | scale*m0 */ + + MOVQ ( REGOFF(32, ECX), MM6 ) /* m9 | m8*/ + PFMUL ( MM0, MM4 ) /* scale*m5 | scale*m4 */ + + MOVD ( REGOFF(40, ECX), MM7 ) /* | m10 */ + PFMUL ( MM0, MM5 ) /* scale*m6 | scale*m2 */ + + PFMUL ( MM0, MM6 ) /* scale*m9 | scale*m8 */ + + PFMUL ( MM0, MM7 ) /* | scale*m10 */ + +ALIGNTEXT32 +LLBL (G3TR_rescale): + + PREFETCHW ( REGIND(EAX) ) + + MOVQ ( REGIND(EDX), MM0 ) /* x1 | x0 */ + MOVD ( REGOFF(8, EDX), MM2 ) /* | x2 */ + + MOVQ ( MM0, MM1 ) /* x1 | x0 */ + PUNPCKLDQ ( MM2, MM2 ) /* x2 | x2 */ + + PFMUL ( MM3, MM0 ) /* x1*m1 | x0*m0 */ + ADD_L ( CONST(16), EAX ) /* next r */ + + PFMUL ( MM4, MM1 ) /* x1*m5 | x0*m4 */ + PFACC ( MM1, MM0 ) /* x0*m4+x1*m5 | x0*m0+x1*m1 */ + + MOVQ ( REGIND(EDX), MM1 ) /* x1 | x0 */ + + PFMUL ( MM5, MM2 ) /* x2*m6 | x2*m2 */ + PFADD ( MM2, MM0 ) /* x0*m4...+x2*m6| x0*m0+x1*m1+x2*m2 */ + + MOVD ( REGOFF(8, EDX), MM2 ) /* | x2 */ + ADD_L ( STRIDE, EDX ) /* next normal */ + + PREFETCH ( REGIND(EDX) ) + + MOVQ ( MM0, REGOFF(-16, EAX) ) /* write r0, r1 */ + PFMUL ( MM6, MM1 ) /* x1*m9 | x0*m8 */ + + PFMUL ( MM7, MM2 ) /* | x2*m10 */ + PFACC ( MM1, MM1 ) /* *not used* | x0*m8+x1*m9 */ + + PFADD ( MM2, MM1 ) /* *not used* | x0*m8+x1*m9+x2*m10 */ + MOVD ( MM1, REGOFF(-8, EAX) ) /* write r2 */ + + SUB_L ( CONST(1), EDI ) /* decrement normal counter */ + JNZ ( LLBL (G3TR_rescale) ) + + FEMMS + +LLBL (G3TR_end): + POP_L ( ESI ) + POP_L ( EDI ) + RET + + + + + + + +ALIGNTEXT16 +GLOBL GLNAME(_mesa_3dnow_transform_normals_no_rot) +HIDDEN(_mesa_3dnow_transform_normals_no_rot) +GLNAME(_mesa_3dnow_transform_normals_no_rot): + +#undef FRAME_OFFSET +#define FRAME_OFFSET 8 + + PUSH_L ( EDI ) + PUSH_L ( ESI ) + + MOV_L ( ARG_IN, ESI ) + MOV_L ( ARG_DEST, EAX ) + MOV_L ( ARG_MAT, ECX ) + MOV_L ( REGOFF(V4F_COUNT, ESI), EDI ) /* dest->count = in->count */ + MOV_L ( EDI, REGOFF(V4F_COUNT, EAX) ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */ + MOV_L ( REGOFF(V4F_START, ESI), EDX ) /* in->start */ + MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */ + + CMP_L ( CONST(0), EDI ) + JE ( LLBL (G3TNR_end) ) + + FEMMS + + MOVD ( REGIND(ECX), MM0 ) /* | m0 */ + PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m5 | m0 */ + + MOVD ( REGOFF(40, ECX), MM2 ) /* | m10 */ + PUNPCKLDQ ( MM2, MM2 ) /* m10 | m10 */ + +ALIGNTEXT32 +LLBL (G3TNR_transform): + + PREFETCHW ( REGIND(EAX) ) + + MOVQ ( REGIND(EDX), MM4 ) /* x1 | x0 */ + MOVD ( REGOFF(8, EDX), MM5 ) /* | x2 */ + + PFMUL ( MM0, MM4 ) /* x1*m5 | x0*m0 */ + ADD_L ( STRIDE, EDX) /* next normal */ + + PREFETCH ( REGIND(EDX) ) + + PFMUL ( MM2, MM5 ) /* | x2*m10 */ + ADD_L ( CONST(16), EAX ) /* next r */ + + SUB_L ( CONST(1), EDI ) /* decrement normal counter */ + MOVQ ( MM4, REGOFF(-16, EAX) ) /* write r0, r1 */ + + MOVD ( MM5, REGOFF(-8, EAX) ) /* write r2 */ + JNZ ( LLBL (G3TNR_transform) ) + + FEMMS + +LLBL (G3TNR_end): + POP_L ( ESI ) + POP_L ( EDI ) + RET + + + + + + + + +ALIGNTEXT16 +GLOBL GLNAME(_mesa_3dnow_transform_normals) +HIDDEN(_mesa_3dnow_transform_normals) +GLNAME(_mesa_3dnow_transform_normals): + +#undef FRAME_OFFSET +#define FRAME_OFFSET 8 + + PUSH_L ( EDI ) + PUSH_L ( ESI ) + + MOV_L ( ARG_IN, ESI ) + MOV_L ( ARG_DEST, EAX ) + MOV_L ( ARG_MAT, ECX ) + MOV_L ( REGOFF(V4F_COUNT, ESI), EDI ) /* dest->count = in->count */ + MOV_L ( EDI, REGOFF(V4F_COUNT, EAX) ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */ + MOV_L ( REGOFF(V4F_START, ESI), EDX ) /* in->start */ + MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */ + + CMP_L ( CONST(0), EDI ) /* count > 0 ?? */ + JE ( LLBL (G3T_end) ) + + FEMMS + + MOVQ ( REGIND(ECX), MM3 ) /* m1 | m0 */ + MOVQ ( REGOFF(16, ECX), MM4 ) /* m5 | m4 */ + + MOVD ( REGOFF(8, ECX), MM5 ) /* | m2 */ + PUNPCKLDQ ( REGOFF(24, ECX), MM5 ) /* m6 | m2 */ + + MOVQ ( REGOFF(32, ECX), MM6 ) /* m9 | m8 */ + MOVD ( REGOFF(40, ECX), MM7 ) /* | m10 */ + +ALIGNTEXT32 +LLBL (G3T_transform): + + PREFETCHW ( REGIND(EAX) ) + + MOVQ ( REGIND(EDX), MM0 ) /* x1 | x0 */ + MOVD ( REGOFF(8, EDX), MM2 ) /* | x2 */ + + MOVQ ( MM0, MM1 ) /* x1 | x0 */ + PUNPCKLDQ ( MM2, MM2 ) /* x2 | x2 */ + + PFMUL ( MM3, MM0 ) /* x1*m1 | x0*m0 */ + ADD_L ( CONST(16), EAX ) /* next r */ + + PFMUL ( MM4, MM1 ) /* x1*m5 | x0*m4 */ + PFACC ( MM1, MM0 ) /* x0*m4+x1*m5 | x0*m0+x1*m1 */ + + PFMUL ( MM5, MM2 ) /* x2*m6 | x2*m2 */ + PFADD ( MM2, MM0 ) /* x0*m4...+x2*m6| x0*m0+x1*m1+x2*m2 */ + + MOVQ ( REGIND(EDX), MM1 ) /* x1 | x0 */ + MOVQ ( MM0, REGOFF(-16, EAX) ) /* write r0, r1 */ + + PFMUL ( MM6, MM1 ) /* x1*m9 | x0*m8 */ + MOVD ( REGOFF(8, EDX), MM2 ) /* | x2 */ + + PFMUL ( MM7, MM2 ) /* | x2*m10 */ + ADD_L ( STRIDE, EDX ) /* next normal */ + + PREFETCH ( REGIND(EDX) ) + + PFACC ( MM1, MM1 ) /* *not used* | x0*m8+x1*m9 */ + PFADD ( MM2, MM1 ) /* *not used* | x0*m8+x1*m9+x2*m10 */ + + MOVD ( MM1, REGOFF(-8, EAX) ) /* write r2 */ + SUB_L ( CONST(1), EDI ) /* decrement normal counter */ + + JNZ ( LLBL (G3T_transform) ) + + FEMMS + +LLBL (G3T_end): + POP_L ( ESI ) + POP_L ( EDI ) + RET + + + + + + +ALIGNTEXT16 +GLOBL GLNAME(_mesa_3dnow_normalize_normals) +HIDDEN(_mesa_3dnow_normalize_normals) +GLNAME(_mesa_3dnow_normalize_normals): + +#undef FRAME_OFFSET +#define FRAME_OFFSET 12 + + PUSH_L ( EDI ) + PUSH_L ( ESI ) + PUSH_L ( EBP ) + + MOV_L ( ARG_IN, ESI ) + MOV_L ( ARG_DEST, EAX ) + MOV_L ( REGOFF(V4F_COUNT, ESI), EBP ) /* dest->count = in->count */ + MOV_L ( EBP, REGOFF(V4F_COUNT, EAX) ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */ + MOV_L ( REGOFF(V4F_START, ESI), ECX ) /* in->start */ + MOV_L ( ARG_LENGTHS, EDX ) + + CMP_L ( CONST(0), EBP ) /* count > 0 ?? */ + JE ( LLBL (G3N_end) ) + + FEMMS + + CMP_L ( CONST(0), EDX ) /* lengths == 0 ? */ + JE ( LLBL (G3N_norm2) ) /* calculate lengths */ + +ALIGNTEXT32 +LLBL (G3N_norm1): /* use precalculated lengths */ + + PREFETCH ( REGIND(EAX) ) + + MOVQ ( REGIND(ECX), MM0 ) /* x1 | x0 */ + MOVD ( REGOFF(8, ECX), MM1 ) /* | x2 */ + + MOVD ( REGIND(EDX), MM3 ) /* | length (x) */ + PFMUL ( MM3, MM1 ) /* | x2 (normalized) */ + + PUNPCKLDQ ( MM3, MM3 ) /* length (x) | length (x) */ + ADD_L ( STRIDE, ECX ) /* next normal */ + + PREFETCH ( REGIND(ECX) ) + + PFMUL ( MM3, MM0 ) /* x1 (normalized) | x0 (normalized) */ + MOVQ ( MM0, REGIND(EAX) ) /* write new x0, x1 */ + + MOVD ( MM1, REGOFF(8, EAX) ) /* write new x2 */ + ADD_L ( CONST(16), EAX ) /* next r */ + + ADD_L ( CONST(4), EDX ) /* next length */ + SUB_L ( CONST(1), EBP ) /* decrement normal counter */ + + JNZ ( LLBL (G3N_norm1) ) + + JMP ( LLBL (G3N_end1) ) + +ALIGNTEXT32 +LLBL (G3N_norm2): /* need to calculate lengths */ + + PREFETCHW ( REGIND(EAX) ) + + PREFETCH ( REGIND(ECX) ) + + MOVQ ( REGIND(ECX), MM0 ) /* x1 | x0 */ + MOVD ( REGOFF(8, ECX), MM1 ) /* | x2 */ + + MOVQ ( MM0, MM3 ) /* x1 | x0 */ + ADD_L ( STRIDE, ECX ) /* next normal */ + + PFMUL ( MM0, MM3 ) /* x1*x1 | x0*x0 */ + MOVQ ( MM1, MM4 ) /* | x2 */ + + ADD_L ( CONST(16), EAX ) /* next r */ + PFMUL ( MM1, MM4 ) /* | x2*x2 */ + + PFADD ( MM4, MM3 ) /* | x0*x0+x2*x2 */ + PFACC ( MM3, MM3 ) /* x0*x0+...+x2*x2 | x0*x0+x1*x1+x2*x2*/ + + PFRSQRT ( MM3, MM5 ) /* 1/sqrt (x0*x0+x1*x1+x2*x2) */ + MOVQ ( MM5, MM4 ) + + PUNPCKLDQ ( MM3, MM3 ) + PFMUL ( MM5, MM5 ) + + PFRSQIT1 ( MM3, MM5 ) + SUB_L ( CONST(1), EBP ) /* decrement normal counter */ + + PFRCPIT2 ( MM4, MM5 ) + + PFMUL ( MM5, MM0 ) /* x1 (normalized) | x0 (normalized) */ + MOVQ ( MM0, REGOFF(-16, EAX) ) /* write new x0, x1 */ + + PFMUL ( MM5, MM1 ) /* | x2 (normalized) */ + MOVD ( MM1, REGOFF(-8, EAX) ) /* write new x2 */ + + JNZ ( LLBL (G3N_norm2) ) + +LLBL (G3N_end1): + FEMMS + +LLBL (G3N_end): + POP_L ( EBP ) + POP_L ( ESI ) + POP_L ( EDI ) + RET + + + + + + +ALIGNTEXT16 +GLOBL GLNAME(_mesa_3dnow_rescale_normals) +HIDDEN(_mesa_3dnow_rescale_normals) +GLNAME(_mesa_3dnow_rescale_normals): + +#undef FRAME_OFFSET +#define FRAME_OFFSET 8 + PUSH_L ( EDI ) + PUSH_L ( ESI ) + + MOV_L ( ARG_IN, ESI ) + MOV_L ( ARG_DEST, EAX ) + MOV_L ( REGOFF(V4F_COUNT, ESI), EDX ) /* dest->count = in->count */ + MOV_L ( EDX, REGOFF(V4F_COUNT, EAX) ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */ + MOV_L ( REGOFF(V4F_START, ESI), ECX ) /* in->start */ + + CMP_L ( CONST(0), EDX ) + JE ( LLBL (G3R_end) ) + + FEMMS + + MOVD ( ARG_SCALE, MM0 ) /* scale */ + PUNPCKLDQ ( MM0, MM0 ) + +ALIGNTEXT32 +LLBL (G3R_rescale): + + PREFETCHW ( REGIND(EAX) ) + + MOVQ ( REGIND(ECX), MM1 ) /* x1 | x0 */ + MOVD ( REGOFF(8, ECX), MM2 ) /* | x2 */ + + PFMUL ( MM0, MM1 ) /* x1*scale | x0*scale */ + ADD_L ( STRIDE, ECX ) /* next normal */ + + PREFETCH ( REGIND(ECX) ) + + PFMUL ( MM0, MM2 ) /* | x2*scale */ + ADD_L ( CONST(16), EAX ) /* next r */ + + MOVQ ( MM1, REGOFF(-16, EAX) ) /* write r0, r1 */ + MOVD ( MM2, REGOFF(-8, EAX) ) /* write r2 */ + + SUB_L ( CONST(1), EDX ) /* decrement normal counter */ + JNZ ( LLBL (G3R_rescale) ) + + FEMMS + +LLBL (G3R_end): + POP_L ( ESI ) + POP_L ( EDI ) + RET + +#endif + +#if defined (__ELF__) && defined (__linux__) + .section .note.GNU-stack,"",%progbits +#endif diff --git a/mesalib/src/mesa/x86/3dnow_xform1.S b/mesalib/src/mesa/x86/3dnow_xform1.S index 9719ba2c1..a73301a8d 100644 --- a/mesalib/src/mesa/x86/3dnow_xform1.S +++ b/mesalib/src/mesa/x86/3dnow_xform1.S @@ -1,437 +1,437 @@ -
-/*
- * Mesa 3-D graphics library
- * Version: 3.5
- *
- * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#ifdef USE_3DNOW_ASM
-#include "assyntax.h"
-#include "matypes.h"
-#include "xform_args.h"
-
- SEG_TEXT
-
-#define FRAME_OFFSET 4
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points1_general )
-HIDDEN(_mesa_3dnow_transform_points1_general)
-GLNAME( _mesa_3dnow_transform_points1_general ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(4, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TPGR_3 ) )
-
- MOVQ ( REGIND(ECX), MM0 ) /* m01 | m00 */
- MOVQ ( REGOFF(8, ECX), MM1 ) /* m03 | m02 */
-
- MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
- MOVQ ( REGOFF(56, ECX), MM3 ) /* m33 | m32 */
-
-ALIGNTEXT16
-LLBL( G3TPGR_2 ):
-
- MOVD ( REGIND(EAX), MM4 ) /* | x0 */
- PUNPCKLDQ ( MM4, MM4 ) /* x0 | x0 */
-
- MOVQ ( MM4, MM5 ) /* x0 | x0 */
- PFMUL ( MM0, MM4 ) /* x0*m01 | x0*m00 */
-
- PFMUL ( MM1, MM5 ) /* x0*m03 | x0*m02 */
- PFADD ( MM2, MM4 ) /* x0*m01+m31 | x0*m00+m30 */
-
- PFADD ( MM3, MM5 ) /* x0*m03+m33 | x0*m02+m32 */
- MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
-
- MOVQ ( MM5, REGOFF(8, EDX) ) /* write r3, r2 */
- ADD_L ( EDI, EAX ) /* next vertex */
-
- ADD_L ( CONST(16), EDX ) /* next r */
- DEC_L ( ESI ) /* decrement vertex counter */
-
- JNZ ( LLBL( G3TPGR_2 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TPGR_3 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points1_identity )
-HIDDEN(_mesa_3dnow_transform_points1_identity)
-GLNAME( _mesa_3dnow_transform_points1_identity ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(1), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_1), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(4, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TPIR_4) )
-
-ALIGNTEXT16
-LLBL( G3TPIR_3 ):
-
- MOVD ( REGIND(EAX), MM0 ) /* | x0 */
- ADD_L ( EDI, EAX ) /* next vertex */
-
- MOVD ( MM0, REGIND(EDX) ) /* | r0 */
- ADD_L ( CONST(16), EDX ) /* next r */
-
- DEC_L ( ESI ) /* decrement vertex counter */
- JNZ ( LLBL( G3TPIR_3 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TPIR_4 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points1_3d_no_rot )
-HIDDEN(_mesa_3dnow_transform_points1_3d_no_rot)
-GLNAME( _mesa_3dnow_transform_points1_3d_no_rot ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(4, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TP3NRR_3 ) )
-
- MOVD ( REGIND(ECX), MM0 ) /* | m00 */
- MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
-
- MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */
-
-ALIGNTEXT16
-LLBL( G3TP3NRR_2 ):
-
- MOVD ( REGIND(EAX), MM4 ) /* | x0 */
- PFMUL ( MM0, MM4 ) /* | x0*m00 */
-
- PFADD ( MM2, MM4 ) /* m31 | x0*m00+m30 */
- MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
-
- MOVD ( MM3, REGOFF(8, EDX) ) /* write r2 */
- ADD_L ( EDI, EAX ) /* next vertex */
-
- ADD_L ( CONST(16), EDX ) /* next r */
- DEC_L ( ESI ) /* decrement vertex counter */
-
- JNZ ( LLBL( G3TP3NRR_2 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TP3NRR_3 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points1_perspective )
-HIDDEN(_mesa_3dnow_transform_points1_perspective)
-GLNAME( _mesa_3dnow_transform_points1_perspective ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(4, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TPPR_3 ) )
-
- MOVD ( REGIND(ECX), MM0 ) /* | m00 */
- MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */
-
-ALIGNTEXT16
-LLBL( G3TPPR_2 ):
-
- MOVD ( REGIND(EAX), MM4 ) /* 0 | x0 */
- PFMUL ( MM0, MM4 ) /* 0 | x0*m00 */
-
- MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
- MOVQ ( MM3, REGOFF(8, EDX) ) /* write r2 (=m32), r3 (=0) */
-
- ADD_L ( EDI, EAX ) /* next vertex */
- ADD_L ( CONST(16), EDX ) /* next r */
-
- DEC_L ( ESI ) /* decrement vertex counter */
- JNZ ( LLBL( G3TPPR_2 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TPPR_3 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points1_2d )
-HIDDEN(_mesa_3dnow_transform_points1_2d)
-GLNAME( _mesa_3dnow_transform_points1_2d ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(2), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(4, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TP2R_3 ) )
-
- MOVQ ( REGIND(ECX), MM0 ) /* m01 | m00 */
- MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
-
-ALIGNTEXT16
-LLBL( G3TP2R_2 ):
-
- MOVD ( REGIND(EAX), MM4 ) /* | x0 */
- PUNPCKLDQ ( MM4, MM4 ) /* x0 | x0 */
-
- PFMUL ( MM0, MM4 ) /* x0*m01 | x0*m00 */
- PFADD ( MM2, MM4 ) /* x0*m01+m31 | x0*m00+m30 */
-
- MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
- ADD_L ( EDI, EAX ) /* next vertex */
-
- ADD_L ( CONST(16), EDX ) /* next r */
- DEC_L ( ESI ) /* decrement vertex counter */
-
- JNZ ( LLBL( G3TP2R_2 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TP2R_3 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points1_2d_no_rot )
-HIDDEN(_mesa_3dnow_transform_points1_2d_no_rot)
-GLNAME( _mesa_3dnow_transform_points1_2d_no_rot ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(2), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(4, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TP2NRR_3 ) )
-
- MOVD ( REGIND(ECX), MM0 ) /* | m00 */
- MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
-
-ALIGNTEXT16
-LLBL( G3TP2NRR_2 ):
-
- MOVD ( REGIND(EAX), MM4 ) /* | x0 */
- ADD_L ( EDI, EAX ) /* next vertex */
-
- PFMUL ( MM0, MM4 ) /* | x0*m00 */
- PFADD ( MM2, MM4 ) /* m31 | x0*m00+m30 */
-
- MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
- ADD_L ( CONST(16), EDX ) /* next r */
-
- DEC_L ( ESI ) /* decrement vertex counter */
- JNZ ( LLBL( G3TP2NRR_2 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TP2NRR_3 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points1_3d )
-HIDDEN(_mesa_3dnow_transform_points1_3d)
-GLNAME( _mesa_3dnow_transform_points1_3d ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(4, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TP3R_3 ) )
-
- MOVQ ( REGIND(ECX), MM0 ) /* m01 | m00 */
- MOVD ( REGOFF(8, ECX), MM1 ) /* | m02 */
-
- MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
- MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */
-
-ALIGNTEXT16
-LLBL( G3TP3R_2 ):
-
- MOVD ( REGIND(EAX), MM4 ) /* | x0 */
- PUNPCKLDQ ( MM4, MM4 ) /* x0 | x0 */
-
- MOVQ ( MM4, MM5 ) /* | x0 */
- PFMUL ( MM0, MM4 ) /* x0*m01 | x0*m00 */
-
- PFMUL ( MM1, MM5 ) /* | x0*m02 */
- PFADD ( MM2, MM4 ) /* x0*m01+m31 | x0*m00+m30 */
-
- PFADD ( MM3, MM5 ) /* | x0*m02+m32 */
- MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
-
- MOVD ( MM5, REGOFF(8, EDX) ) /* write r2 */
- ADD_L ( EDI, EAX ) /* next vertex */
-
- ADD_L ( CONST(16), EDX ) /* next r */
- DEC_L ( ESI ) /* decrement vertex counter */
-
- JNZ ( LLBL( G3TP3R_2 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TP3R_3 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-
-#endif
-
-#if defined (__ELF__) && defined (__linux__)
- .section .note.GNU-stack,"",%progbits
-#endif
+ +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifdef USE_3DNOW_ASM +#include "assyntax.h" +#include "matypes.h" +#include "xform_args.h" + + SEG_TEXT + +#define FRAME_OFFSET 4 + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points1_general ) +HIDDEN(_mesa_3dnow_transform_points1_general) +GLNAME( _mesa_3dnow_transform_points1_general ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(4, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TPGR_3 ) ) + + MOVQ ( REGIND(ECX), MM0 ) /* m01 | m00 */ + MOVQ ( REGOFF(8, ECX), MM1 ) /* m03 | m02 */ + + MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */ + MOVQ ( REGOFF(56, ECX), MM3 ) /* m33 | m32 */ + +ALIGNTEXT16 +LLBL( G3TPGR_2 ): + + MOVD ( REGIND(EAX), MM4 ) /* | x0 */ + PUNPCKLDQ ( MM4, MM4 ) /* x0 | x0 */ + + MOVQ ( MM4, MM5 ) /* x0 | x0 */ + PFMUL ( MM0, MM4 ) /* x0*m01 | x0*m00 */ + + PFMUL ( MM1, MM5 ) /* x0*m03 | x0*m02 */ + PFADD ( MM2, MM4 ) /* x0*m01+m31 | x0*m00+m30 */ + + PFADD ( MM3, MM5 ) /* x0*m03+m33 | x0*m02+m32 */ + MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */ + + MOVQ ( MM5, REGOFF(8, EDX) ) /* write r3, r2 */ + ADD_L ( EDI, EAX ) /* next vertex */ + + ADD_L ( CONST(16), EDX ) /* next r */ + DEC_L ( ESI ) /* decrement vertex counter */ + + JNZ ( LLBL( G3TPGR_2 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TPGR_3 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points1_identity ) +HIDDEN(_mesa_3dnow_transform_points1_identity) +GLNAME( _mesa_3dnow_transform_points1_identity ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(1), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_1), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(4, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TPIR_4) ) + +ALIGNTEXT16 +LLBL( G3TPIR_3 ): + + MOVD ( REGIND(EAX), MM0 ) /* | x0 */ + ADD_L ( EDI, EAX ) /* next vertex */ + + MOVD ( MM0, REGIND(EDX) ) /* | r0 */ + ADD_L ( CONST(16), EDX ) /* next r */ + + DEC_L ( ESI ) /* decrement vertex counter */ + JNZ ( LLBL( G3TPIR_3 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TPIR_4 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points1_3d_no_rot ) +HIDDEN(_mesa_3dnow_transform_points1_3d_no_rot) +GLNAME( _mesa_3dnow_transform_points1_3d_no_rot ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(4, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TP3NRR_3 ) ) + + MOVD ( REGIND(ECX), MM0 ) /* | m00 */ + MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */ + + MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */ + +ALIGNTEXT16 +LLBL( G3TP3NRR_2 ): + + MOVD ( REGIND(EAX), MM4 ) /* | x0 */ + PFMUL ( MM0, MM4 ) /* | x0*m00 */ + + PFADD ( MM2, MM4 ) /* m31 | x0*m00+m30 */ + MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */ + + MOVD ( MM3, REGOFF(8, EDX) ) /* write r2 */ + ADD_L ( EDI, EAX ) /* next vertex */ + + ADD_L ( CONST(16), EDX ) /* next r */ + DEC_L ( ESI ) /* decrement vertex counter */ + + JNZ ( LLBL( G3TP3NRR_2 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TP3NRR_3 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points1_perspective ) +HIDDEN(_mesa_3dnow_transform_points1_perspective) +GLNAME( _mesa_3dnow_transform_points1_perspective ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(4, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TPPR_3 ) ) + + MOVD ( REGIND(ECX), MM0 ) /* | m00 */ + MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */ + +ALIGNTEXT16 +LLBL( G3TPPR_2 ): + + MOVD ( REGIND(EAX), MM4 ) /* 0 | x0 */ + PFMUL ( MM0, MM4 ) /* 0 | x0*m00 */ + + MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */ + MOVQ ( MM3, REGOFF(8, EDX) ) /* write r2 (=m32), r3 (=0) */ + + ADD_L ( EDI, EAX ) /* next vertex */ + ADD_L ( CONST(16), EDX ) /* next r */ + + DEC_L ( ESI ) /* decrement vertex counter */ + JNZ ( LLBL( G3TPPR_2 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TPPR_3 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points1_2d ) +HIDDEN(_mesa_3dnow_transform_points1_2d) +GLNAME( _mesa_3dnow_transform_points1_2d ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(2), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(4, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TP2R_3 ) ) + + MOVQ ( REGIND(ECX), MM0 ) /* m01 | m00 */ + MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */ + +ALIGNTEXT16 +LLBL( G3TP2R_2 ): + + MOVD ( REGIND(EAX), MM4 ) /* | x0 */ + PUNPCKLDQ ( MM4, MM4 ) /* x0 | x0 */ + + PFMUL ( MM0, MM4 ) /* x0*m01 | x0*m00 */ + PFADD ( MM2, MM4 ) /* x0*m01+m31 | x0*m00+m30 */ + + MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */ + ADD_L ( EDI, EAX ) /* next vertex */ + + ADD_L ( CONST(16), EDX ) /* next r */ + DEC_L ( ESI ) /* decrement vertex counter */ + + JNZ ( LLBL( G3TP2R_2 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TP2R_3 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points1_2d_no_rot ) +HIDDEN(_mesa_3dnow_transform_points1_2d_no_rot) +GLNAME( _mesa_3dnow_transform_points1_2d_no_rot ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(2), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(4, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TP2NRR_3 ) ) + + MOVD ( REGIND(ECX), MM0 ) /* | m00 */ + MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */ + +ALIGNTEXT16 +LLBL( G3TP2NRR_2 ): + + MOVD ( REGIND(EAX), MM4 ) /* | x0 */ + ADD_L ( EDI, EAX ) /* next vertex */ + + PFMUL ( MM0, MM4 ) /* | x0*m00 */ + PFADD ( MM2, MM4 ) /* m31 | x0*m00+m30 */ + + MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */ + ADD_L ( CONST(16), EDX ) /* next r */ + + DEC_L ( ESI ) /* decrement vertex counter */ + JNZ ( LLBL( G3TP2NRR_2 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TP2NRR_3 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points1_3d ) +HIDDEN(_mesa_3dnow_transform_points1_3d) +GLNAME( _mesa_3dnow_transform_points1_3d ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(4, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TP3R_3 ) ) + + MOVQ ( REGIND(ECX), MM0 ) /* m01 | m00 */ + MOVD ( REGOFF(8, ECX), MM1 ) /* | m02 */ + + MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */ + MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */ + +ALIGNTEXT16 +LLBL( G3TP3R_2 ): + + MOVD ( REGIND(EAX), MM4 ) /* | x0 */ + PUNPCKLDQ ( MM4, MM4 ) /* x0 | x0 */ + + MOVQ ( MM4, MM5 ) /* | x0 */ + PFMUL ( MM0, MM4 ) /* x0*m01 | x0*m00 */ + + PFMUL ( MM1, MM5 ) /* | x0*m02 */ + PFADD ( MM2, MM4 ) /* x0*m01+m31 | x0*m00+m30 */ + + PFADD ( MM3, MM5 ) /* | x0*m02+m32 */ + MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */ + + MOVD ( MM5, REGOFF(8, EDX) ) /* write r2 */ + ADD_L ( EDI, EAX ) /* next vertex */ + + ADD_L ( CONST(16), EDX ) /* next r */ + DEC_L ( ESI ) /* decrement vertex counter */ + + JNZ ( LLBL( G3TP3R_2 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TP3R_3 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET + +#endif + +#if defined (__ELF__) && defined (__linux__) + .section .note.GNU-stack,"",%progbits +#endif diff --git a/mesalib/src/mesa/x86/3dnow_xform2.S b/mesalib/src/mesa/x86/3dnow_xform2.S index 8c706074a..2988fb7bf 100644 --- a/mesalib/src/mesa/x86/3dnow_xform2.S +++ b/mesalib/src/mesa/x86/3dnow_xform2.S @@ -1,477 +1,477 @@ -
-/*
- * Mesa 3-D graphics library
- * Version: 3.5
- *
- * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#ifdef USE_3DNOW_ASM
-#include "assyntax.h"
-#include "matypes.h"
-#include "xform_args.h"
-
- SEG_TEXT
-
-#define FRAME_OFFSET 4
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points2_general )
-HIDDEN(_mesa_3dnow_transform_points2_general)
-GLNAME( _mesa_3dnow_transform_points2_general ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(V4F_START, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TPGR_3 ) )
-
- MOVD ( REGIND(ECX), MM0 ) /* | m00 */
- PUNPCKLDQ ( REGOFF(16, ECX), MM0 ) /* m10 | m00 */
-
- MOVD ( REGOFF(4, ECX), MM1 ) /* | m01 */
- PUNPCKLDQ ( REGOFF(20, ECX), MM1 ) /* m11 | m01 */
-
- MOVD ( REGOFF(8, ECX), MM2 ) /* | m02 */
- PUNPCKLDQ ( REGOFF(24, ECX), MM2 ) /* m12 | m02 */
-
- MOVD ( REGOFF(12, ECX), MM3 ) /* | m03 */
- PUNPCKLDQ ( REGOFF(28, ECX), MM3 ) /* m13 | m03 */
-
- MOVQ ( REGOFF(48, ECX), MM4 ) /* m31 | m30 */
- MOVQ ( REGOFF(56, ECX), MM5 ) /* m33 | m32 */
-
-ALIGNTEXT16
-LLBL( G3TPGR_2 ):
-
- MOVQ ( REGIND(EAX), MM6 ) /* x1 | x0 */
- MOVQ ( MM6, MM7 ) /* x1 | x0 */
-
- PFMUL ( MM0, MM6 ) /* x1*m10 | x0*m00 */
- PFMUL ( MM1, MM7 ) /* x1*m11 | x0*m01 */
-
- PFACC ( MM7, MM6 ) /* x0*m01+x1*m11 | x0*x00+x1*m10 */
- PFADD ( MM4, MM6 ) /* x0*...*m11+m31 | x0*...*m10+m30 */
-
- MOVQ ( MM6, REGIND(EDX) ) /* write r1, r0 */
- MOVQ ( REGIND(EAX), MM6 ) /* x1 | x0 */
-
- MOVQ ( MM6, MM7 ) /* x1 | x0 */
- PFMUL ( MM2, MM6 ) /* x1*m12 | x0*m02 */
-
- PFMUL ( MM3, MM7 ) /* x1*m13 | x0*m03 */
- ADD_L ( EDI, EAX ) /* next vertex */
-
- PFACC ( MM7, MM6 ) /* x0*m03+x1*m13 | x0*x02+x1*m12 */
- PFADD ( MM5, MM6 ) /* x0*...*m13+m33 | x0*...*m12+m32 */
-
- MOVQ ( MM6, REGOFF(8, EDX) ) /* write r3, r2 */
- ADD_L ( CONST(16), EDX ) /* next r */
-
- DEC_L ( ESI ) /* decrement vertex counter */
- JNZ ( LLBL( G3TPGR_2 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TPGR_3 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points2_perspective )
-HIDDEN(_mesa_3dnow_transform_points2_perspective)
-GLNAME( _mesa_3dnow_transform_points2_perspective ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(V4F_START, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TPPR_3 ) )
-
- MOVD ( REGIND(ECX), MM0 ) /* | m00 */
- PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
-
- MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */
-
-ALIGNTEXT16
-LLBL( G3TPPR_2 ):
-
- MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
- PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
-
- MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
- MOVQ ( MM3, REGOFF(8, EDX) ) /* write r2 (=m32), r3 (=0) */
-
- ADD_L ( EDI, EAX ) /* next vertex */
- ADD_L ( CONST(16), EDX ) /* next r */
-
- DEC_L ( ESI ) /* decrement vertex counter */
- JNZ ( LLBL( G3TPPR_2 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TPPR_3 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points2_3d )
-HIDDEN(_mesa_3dnow_transform_points2_3d)
-GLNAME( _mesa_3dnow_transform_points2_3d ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_3 ), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(V4F_START, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TP3R_3 ) )
-
- MOVD ( REGIND(ECX), MM0 ) /* | m00 */
- PUNPCKLDQ ( REGOFF(16, ECX), MM0 ) /* m10 | m00 */
-
- MOVD ( REGOFF(4, ECX), MM1 ) /* | m01 */
- PUNPCKLDQ ( REGOFF(20, ECX), MM1 ) /* m11 | m01 */
-
- MOVD ( REGOFF(8, ECX), MM2 ) /* | m02 */
- PUNPCKLDQ ( REGOFF(24, ECX), MM2 ) /* m12 | m02 */
-
- MOVQ ( REGOFF(48, ECX), MM4 ) /* m31 | m30 */
- MOVD ( REGOFF(56, ECX), MM5 ) /* | m32 */
-
-ALIGNTEXT16
-LLBL( G3TP3R_2 ):
-
- MOVQ ( REGIND(EAX), MM6 ) /* x1 | x0 */
- MOVQ ( MM6, MM7 ) /* x1 | x0 */
-
- PFMUL ( MM0, MM6 ) /* x1*m10 | x0*m00 */
- PFMUL ( MM1, MM7 ) /* x1*m11 | x0*m01 */
-
- PFACC ( MM7, MM6 ) /* x0*m01+x1*m11 | x0*x00+x1*m10 */
- PFADD ( MM4, MM6 ) /* x0*...*m11+m31 | x0*...*m10+m30 */
-
- MOVQ ( MM6, REGIND(EDX) ) /* write r1, r0 */
- MOVQ ( REGIND(EAX), MM6 ) /* x1 | x0 */
-
- MOVQ ( MM6, MM7 ) /* x1 | x0 */
- PFMUL ( MM2, MM6 ) /* x1*m12 | x0*m02 */
-
- PFACC ( MM7, MM6 ) /* ***trash*** | x0*x02+x1*m12 */
- PFADD ( MM5, MM6 ) /* ***trash*** | x0*...*m12+m32 */
-
- MOVD ( MM6, REGOFF(8, EDX) ) /* write r2 */
- ADD_L ( EDI, EAX ) /* next vertex */
-
- ADD_L ( CONST(16), EDX ) /* next r */
- DEC_L ( ESI ) /* decrement vertex counter */
-
- JNZ ( LLBL( G3TP3R_2 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TP3R_3 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points2_3d_no_rot )
-HIDDEN(_mesa_3dnow_transform_points2_3d_no_rot)
-GLNAME( _mesa_3dnow_transform_points2_3d_no_rot ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_3 ), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(V4F_START, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TP3NRR_3 ) )
-
- MOVD ( REGIND(ECX), MM0 ) /* | m00 */
- PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
-
- MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
- MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */
-
-ALIGNTEXT16
-LLBL( G3TP3NRR_2 ):
-
- MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
- PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
-
- PFADD ( MM2, MM4 ) /* x1*m11+m31 | x0*m00+m30 */
- MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
-
- MOVD ( MM3, REGOFF(8, EDX) ) /* write r2 */
- ADD_L ( EDI, EAX ) /* next vertex */
-
- ADD_L ( CONST(16), EDX ) /* next r */
- DEC_L ( ESI ) /* decrement vertex counter */
-
- JNZ ( LLBL( G3TP3NRR_2 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TP3NRR_3 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points2_2d )
-HIDDEN(_mesa_3dnow_transform_points2_2d)
-GLNAME( _mesa_3dnow_transform_points2_2d ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(2), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(V4F_START, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TP2R_3 ) )
-
- MOVQ ( REGIND(ECX), MM0 ) /* m01 | m00 */
- MOVQ ( REGOFF(16, ECX), MM1 ) /* m11 | m10 */
-
- MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
-
-ALIGNTEXT16
-LLBL( G3TP2R_2 ):
-
- MOVD ( REGIND(EAX), MM4 ) /* | x0 */
- MOVD ( REGOFF(4, EAX), MM5 ) /* | x1 */
-
- PUNPCKLDQ ( MM4, MM4 ) /* x0 | x0 */
- ADD_L ( EDI, EAX ) /* next vertex */
-
- PFMUL ( MM0, MM4 ) /* x0*m01 | x0*m00 */
- PUNPCKLDQ ( MM5, MM5 ) /* x1 | x1 */
-
- PFMUL ( MM1, MM5 ) /* x1*m11 | x1*m10 */
- PFADD ( MM2, MM4 ) /* x...x1*m11+31 | x0*..*m10+m30 */
-
- PFADD ( MM5, MM4 ) /* x0*m01+x1*m11 | x0*m00+x1*m10 */
- MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
-
- ADD_L ( CONST(16), EDX ) /* next r */
- DEC_L ( ESI ) /* decrement vertex counter */
-
- JNZ ( LLBL( G3TP2R_2 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TP2R_3 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points2_2d_no_rot )
-HIDDEN(_mesa_3dnow_transform_points2_2d_no_rot)
-GLNAME( _mesa_3dnow_transform_points2_2d_no_rot ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(2), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(V4F_START, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TP2NRR_3 ) )
-
- MOVD ( REGIND(ECX), MM0 ) /* | m00 */
- PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
-
- MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
-
-ALIGNTEXT16
-LLBL( G3TP2NRR_2 ):
-
- MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
- ADD_L ( EDI, EAX ) /* next vertex */
-
- PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
- PFADD ( MM2, MM4 ) /* m31 | x0*m00+m30 */
-
- MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
- ADD_L ( CONST(16), EDX ) /* next r */
-
- DEC_L ( ESI ) /* decrement vertex counter */
- JNZ ( LLBL( G3TP2NRR_2 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TP2NRR_3 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points2_identity )
-HIDDEN(_mesa_3dnow_transform_points2_identity)
-GLNAME( _mesa_3dnow_transform_points2_identity ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(2), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(V4F_START, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TPIR_3 ) )
-
-ALIGNTEXT16
-LLBL( G3TPIR_3 ):
-
- MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
- ADD_L ( EDI, EAX ) /* next vertex */
-
- MOVQ ( MM0, REGIND(EDX) ) /* r1 | r0 */
- ADD_L ( CONST(16), EDX ) /* next r */
-
- DEC_L ( ESI ) /* decrement vertex counter */
- JNZ ( LLBL( G3TPIR_3 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TPIR_4 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-#endif
-
-#if defined (__ELF__) && defined (__linux__)
- .section .note.GNU-stack,"",%progbits
-#endif
+ +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifdef USE_3DNOW_ASM +#include "assyntax.h" +#include "matypes.h" +#include "xform_args.h" + + SEG_TEXT + +#define FRAME_OFFSET 4 + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points2_general ) +HIDDEN(_mesa_3dnow_transform_points2_general) +GLNAME( _mesa_3dnow_transform_points2_general ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(V4F_START, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TPGR_3 ) ) + + MOVD ( REGIND(ECX), MM0 ) /* | m00 */ + PUNPCKLDQ ( REGOFF(16, ECX), MM0 ) /* m10 | m00 */ + + MOVD ( REGOFF(4, ECX), MM1 ) /* | m01 */ + PUNPCKLDQ ( REGOFF(20, ECX), MM1 ) /* m11 | m01 */ + + MOVD ( REGOFF(8, ECX), MM2 ) /* | m02 */ + PUNPCKLDQ ( REGOFF(24, ECX), MM2 ) /* m12 | m02 */ + + MOVD ( REGOFF(12, ECX), MM3 ) /* | m03 */ + PUNPCKLDQ ( REGOFF(28, ECX), MM3 ) /* m13 | m03 */ + + MOVQ ( REGOFF(48, ECX), MM4 ) /* m31 | m30 */ + MOVQ ( REGOFF(56, ECX), MM5 ) /* m33 | m32 */ + +ALIGNTEXT16 +LLBL( G3TPGR_2 ): + + MOVQ ( REGIND(EAX), MM6 ) /* x1 | x0 */ + MOVQ ( MM6, MM7 ) /* x1 | x0 */ + + PFMUL ( MM0, MM6 ) /* x1*m10 | x0*m00 */ + PFMUL ( MM1, MM7 ) /* x1*m11 | x0*m01 */ + + PFACC ( MM7, MM6 ) /* x0*m01+x1*m11 | x0*x00+x1*m10 */ + PFADD ( MM4, MM6 ) /* x0*...*m11+m31 | x0*...*m10+m30 */ + + MOVQ ( MM6, REGIND(EDX) ) /* write r1, r0 */ + MOVQ ( REGIND(EAX), MM6 ) /* x1 | x0 */ + + MOVQ ( MM6, MM7 ) /* x1 | x0 */ + PFMUL ( MM2, MM6 ) /* x1*m12 | x0*m02 */ + + PFMUL ( MM3, MM7 ) /* x1*m13 | x0*m03 */ + ADD_L ( EDI, EAX ) /* next vertex */ + + PFACC ( MM7, MM6 ) /* x0*m03+x1*m13 | x0*x02+x1*m12 */ + PFADD ( MM5, MM6 ) /* x0*...*m13+m33 | x0*...*m12+m32 */ + + MOVQ ( MM6, REGOFF(8, EDX) ) /* write r3, r2 */ + ADD_L ( CONST(16), EDX ) /* next r */ + + DEC_L ( ESI ) /* decrement vertex counter */ + JNZ ( LLBL( G3TPGR_2 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TPGR_3 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points2_perspective ) +HIDDEN(_mesa_3dnow_transform_points2_perspective) +GLNAME( _mesa_3dnow_transform_points2_perspective ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(V4F_START, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TPPR_3 ) ) + + MOVD ( REGIND(ECX), MM0 ) /* | m00 */ + PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */ + + MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */ + +ALIGNTEXT16 +LLBL( G3TPPR_2 ): + + MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */ + PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */ + + MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */ + MOVQ ( MM3, REGOFF(8, EDX) ) /* write r2 (=m32), r3 (=0) */ + + ADD_L ( EDI, EAX ) /* next vertex */ + ADD_L ( CONST(16), EDX ) /* next r */ + + DEC_L ( ESI ) /* decrement vertex counter */ + JNZ ( LLBL( G3TPPR_2 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TPPR_3 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points2_3d ) +HIDDEN(_mesa_3dnow_transform_points2_3d) +GLNAME( _mesa_3dnow_transform_points2_3d ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_3 ), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(V4F_START, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TP3R_3 ) ) + + MOVD ( REGIND(ECX), MM0 ) /* | m00 */ + PUNPCKLDQ ( REGOFF(16, ECX), MM0 ) /* m10 | m00 */ + + MOVD ( REGOFF(4, ECX), MM1 ) /* | m01 */ + PUNPCKLDQ ( REGOFF(20, ECX), MM1 ) /* m11 | m01 */ + + MOVD ( REGOFF(8, ECX), MM2 ) /* | m02 */ + PUNPCKLDQ ( REGOFF(24, ECX), MM2 ) /* m12 | m02 */ + + MOVQ ( REGOFF(48, ECX), MM4 ) /* m31 | m30 */ + MOVD ( REGOFF(56, ECX), MM5 ) /* | m32 */ + +ALIGNTEXT16 +LLBL( G3TP3R_2 ): + + MOVQ ( REGIND(EAX), MM6 ) /* x1 | x0 */ + MOVQ ( MM6, MM7 ) /* x1 | x0 */ + + PFMUL ( MM0, MM6 ) /* x1*m10 | x0*m00 */ + PFMUL ( MM1, MM7 ) /* x1*m11 | x0*m01 */ + + PFACC ( MM7, MM6 ) /* x0*m01+x1*m11 | x0*x00+x1*m10 */ + PFADD ( MM4, MM6 ) /* x0*...*m11+m31 | x0*...*m10+m30 */ + + MOVQ ( MM6, REGIND(EDX) ) /* write r1, r0 */ + MOVQ ( REGIND(EAX), MM6 ) /* x1 | x0 */ + + MOVQ ( MM6, MM7 ) /* x1 | x0 */ + PFMUL ( MM2, MM6 ) /* x1*m12 | x0*m02 */ + + PFACC ( MM7, MM6 ) /* ***trash*** | x0*x02+x1*m12 */ + PFADD ( MM5, MM6 ) /* ***trash*** | x0*...*m12+m32 */ + + MOVD ( MM6, REGOFF(8, EDX) ) /* write r2 */ + ADD_L ( EDI, EAX ) /* next vertex */ + + ADD_L ( CONST(16), EDX ) /* next r */ + DEC_L ( ESI ) /* decrement vertex counter */ + + JNZ ( LLBL( G3TP3R_2 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TP3R_3 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points2_3d_no_rot ) +HIDDEN(_mesa_3dnow_transform_points2_3d_no_rot) +GLNAME( _mesa_3dnow_transform_points2_3d_no_rot ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_3 ), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(V4F_START, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TP3NRR_3 ) ) + + MOVD ( REGIND(ECX), MM0 ) /* | m00 */ + PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */ + + MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */ + MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */ + +ALIGNTEXT16 +LLBL( G3TP3NRR_2 ): + + MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */ + PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */ + + PFADD ( MM2, MM4 ) /* x1*m11+m31 | x0*m00+m30 */ + MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */ + + MOVD ( MM3, REGOFF(8, EDX) ) /* write r2 */ + ADD_L ( EDI, EAX ) /* next vertex */ + + ADD_L ( CONST(16), EDX ) /* next r */ + DEC_L ( ESI ) /* decrement vertex counter */ + + JNZ ( LLBL( G3TP3NRR_2 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TP3NRR_3 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points2_2d ) +HIDDEN(_mesa_3dnow_transform_points2_2d) +GLNAME( _mesa_3dnow_transform_points2_2d ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(2), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(V4F_START, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TP2R_3 ) ) + + MOVQ ( REGIND(ECX), MM0 ) /* m01 | m00 */ + MOVQ ( REGOFF(16, ECX), MM1 ) /* m11 | m10 */ + + MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */ + +ALIGNTEXT16 +LLBL( G3TP2R_2 ): + + MOVD ( REGIND(EAX), MM4 ) /* | x0 */ + MOVD ( REGOFF(4, EAX), MM5 ) /* | x1 */ + + PUNPCKLDQ ( MM4, MM4 ) /* x0 | x0 */ + ADD_L ( EDI, EAX ) /* next vertex */ + + PFMUL ( MM0, MM4 ) /* x0*m01 | x0*m00 */ + PUNPCKLDQ ( MM5, MM5 ) /* x1 | x1 */ + + PFMUL ( MM1, MM5 ) /* x1*m11 | x1*m10 */ + PFADD ( MM2, MM4 ) /* x...x1*m11+31 | x0*..*m10+m30 */ + + PFADD ( MM5, MM4 ) /* x0*m01+x1*m11 | x0*m00+x1*m10 */ + MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */ + + ADD_L ( CONST(16), EDX ) /* next r */ + DEC_L ( ESI ) /* decrement vertex counter */ + + JNZ ( LLBL( G3TP2R_2 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TP2R_3 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points2_2d_no_rot ) +HIDDEN(_mesa_3dnow_transform_points2_2d_no_rot) +GLNAME( _mesa_3dnow_transform_points2_2d_no_rot ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(2), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(V4F_START, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TP2NRR_3 ) ) + + MOVD ( REGIND(ECX), MM0 ) /* | m00 */ + PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */ + + MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */ + +ALIGNTEXT16 +LLBL( G3TP2NRR_2 ): + + MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */ + ADD_L ( EDI, EAX ) /* next vertex */ + + PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */ + PFADD ( MM2, MM4 ) /* m31 | x0*m00+m30 */ + + MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */ + ADD_L ( CONST(16), EDX ) /* next r */ + + DEC_L ( ESI ) /* decrement vertex counter */ + JNZ ( LLBL( G3TP2NRR_2 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TP2NRR_3 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points2_identity ) +HIDDEN(_mesa_3dnow_transform_points2_identity) +GLNAME( _mesa_3dnow_transform_points2_identity ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(2), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(V4F_START, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TPIR_3 ) ) + +ALIGNTEXT16 +LLBL( G3TPIR_3 ): + + MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */ + ADD_L ( EDI, EAX ) /* next vertex */ + + MOVQ ( MM0, REGIND(EDX) ) /* r1 | r0 */ + ADD_L ( CONST(16), EDX ) /* next r */ + + DEC_L ( ESI ) /* decrement vertex counter */ + JNZ ( LLBL( G3TPIR_3 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TPIR_4 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET +#endif + +#if defined (__ELF__) && defined (__linux__) + .section .note.GNU-stack,"",%progbits +#endif diff --git a/mesalib/src/mesa/x86/3dnow_xform3.S b/mesalib/src/mesa/x86/3dnow_xform3.S index d119fe821..a356aaee7 100644 --- a/mesalib/src/mesa/x86/3dnow_xform3.S +++ b/mesalib/src/mesa/x86/3dnow_xform3.S @@ -1,561 +1,561 @@ -
-/*
- * Mesa 3-D graphics library
- * Version: 3.5
- *
- * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#ifdef USE_3DNOW_ASM
-#include "assyntax.h"
-#include "matypes.h"
-#include "xform_args.h"
-
- SEG_TEXT
-
-#define FRAME_OFFSET 4
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points3_general )
-HIDDEN(_mesa_3dnow_transform_points3_general)
-GLNAME( _mesa_3dnow_transform_points3_general ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(V4F_START, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TPGR_2 ) )
-
- PREFETCHW ( REGIND(EDX) )
-
-ALIGNTEXT16
-LLBL( G3TPGR_1 ):
-
- PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
-
- MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
- MOVD ( REGOFF(8, EAX), MM2 ) /* | x2 */
-
- ADD_L ( EDI, EAX ) /* next vertex */
- PREFETCH ( REGIND(EAX) )
-
- MOVQ ( MM0, MM1 ) /* x1 | x0 */
- PUNPCKLDQ ( MM2, MM2 ) /* x2 | x2 */
-
- PUNPCKLDQ ( MM0, MM0 ) /* x0 | x0 */
- MOVQ ( MM2, MM5 ) /* x2 | x2 */
-
- PUNPCKHDQ ( MM1, MM1 ) /* x1 | x1 */
- PFMUL ( REGOFF(32, ECX), MM2 ) /* x2*m9 | x2*m8 */
-
- MOVQ ( MM0, MM3 ) /* x0 | x0 */
- PFMUL ( REGOFF(40, ECX), MM5 ) /* x2*m11 | x2*m10 */
-
- MOVQ ( MM1, MM4 ) /* x1 | x1 */
- PFMUL ( REGIND(ECX), MM0 ) /* x0*m1 | x0*m0 */
-
- PFADD ( REGOFF(48, ECX), MM2 ) /* x2*m9+m13 | x2*m8+m12 */
- PFMUL ( REGOFF(16, ECX), MM1 ) /* x1*m5 | x1*m4 */
-
- PFADD ( REGOFF(56, ECX), MM5 ) /* x2*m11+m15 | x2*m10+m14 */
- PFADD ( MM0, MM1 ) /* x0*m1+x1*m5 | x0*m0+x1*m4 */
-
- PFMUL ( REGOFF(8, ECX), MM3 ) /* x0*m3 | x0*m2 */
- PFADD ( MM1, MM2 ) /* r1 | r0 */
-
- PFMUL ( REGOFF(24, ECX), MM4 ) /* x1*m7 | x1*m6 */
- ADD_L ( CONST(16), EDX ) /* next output vertex */
-
- PFADD ( MM3, MM4 ) /* x0*m3+x1*m7 | x0*m2+x1*m6 */
- MOVQ ( MM2, REGOFF(-16, EDX) ) /* write r0, r1 */
-
- PFADD ( MM4, MM5 ) /* r3 | r2 */
- MOVQ ( MM5, REGOFF(-8, EDX) ) /* write r2, r3 */
-
- DEC_L ( ESI ) /* decrement vertex counter */
- JNZ ( LLBL( G3TPGR_1 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TPGR_2 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points3_perspective )
-HIDDEN(_mesa_3dnow_transform_points3_perspective)
-GLNAME( _mesa_3dnow_transform_points3_perspective ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(V4F_START, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TPPR_2 ) )
-
- PREFETCH ( REGIND(EAX) )
- PREFETCHW ( REGIND(EDX) )
-
- MOVD ( REGIND(ECX), MM0 ) /* | m00 */
- PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
-
- MOVQ ( REGOFF(32, ECX), MM1 ) /* m21 | m20 */
- MOVD ( REGOFF(40, ECX), MM2 ) /* | m22 */
-
- MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */
-
-ALIGNTEXT16
-LLBL( G3TPPR_1 ):
-
- PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
-
- MOVD ( REGOFF(8, EAX), MM5 ) /* | x2 */
- MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
-
- ADD_L ( EDI, EAX ) /* next vertex */
- PREFETCH ( REGIND(EAX) )
-
- PXOR ( MM7, MM7 ) /* 0 | 0 */
- MOVQ ( MM5, MM6 ) /* | x2 */
-
- PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
- PFSUB ( MM5, MM7 ) /* | -x2 */
-
- PFMUL ( MM2, MM6 ) /* | x2*m22 */
- PUNPCKLDQ ( MM5, MM5 ) /* x2 | x2 */
-
- ADD_L ( CONST(16), EDX ) /* next r */
- PFMUL ( MM1, MM5 ) /* x2*m21 | x2*m20 */
-
- PFADD ( MM3, MM6 ) /* | x2*m22+m32 */
- PFADD ( MM4, MM5 ) /* x1*m11+x2*m21 | x0*m00+x2*m20 */
-
- MOVQ ( MM5, REGOFF(-16, EDX) ) /* write r0, r1 */
- MOVD ( MM6, REGOFF(-8, EDX) ) /* write r2 */
-
- MOVD ( MM7, REGOFF(-4, EDX) ) /* write r3 */
-
- DEC_L ( ESI ) /* decrement vertex counter */
- JNZ ( LLBL( G3TPPR_1 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TPPR_2 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points3_3d )
-HIDDEN(_mesa_3dnow_transform_points3_3d)
-GLNAME( _mesa_3dnow_transform_points3_3d ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(V4F_START, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TP3R_2 ) )
-
- PREFETCH ( REGIND(EAX) )
- PREFETCH ( REGIND(EDX) )
-
- MOVD ( REGOFF(8, ECX), MM7 ) /* | m2 */
- PUNPCKLDQ ( REGOFF(24, ECX), MM7 ) /* m6 | m2 */
-
-
-ALIGNTEXT16
-LLBL( G3TP3R_1 ):
-
- PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
-
- MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
- MOVD ( REGOFF(8, EAX), MM1 ) /* | x2 */
-
- ADD_L ( EDI, EAX ) /* next vertex */
- PREFETCH ( REGIND(EAX) )
-
- MOVQ ( MM0, MM2 ) /* x1 | x0 */
- ADD_L ( CONST(16), EDX ) /* next r */
-
- PUNPCKLDQ ( MM2, MM2 ) /* x0 | x0 */
- MOVQ ( MM0, MM3 ) /* x1 | x0 */
-
- PFMUL ( REGIND(ECX), MM2 ) /* x0*m1 | x0*m0 */
- PUNPCKHDQ ( MM3, MM3 ) /* x1 | x1 */
-
- MOVQ ( MM1, MM4 ) /* | x2 */
- PFMUL ( REGOFF(16, ECX), MM3 ) /* x1*m5 | x1*m4 */
-
- PUNPCKLDQ ( MM4, MM4 ) /* x2 | x2 */
- PFADD ( MM2, MM3 ) /* x0*m1+x1*m5 | x0*m0+x1*m4 */
-
- PFMUL ( REGOFF(32, ECX), MM4 ) /* x2*m9 | x2*m8 */
- PFADD ( REGOFF(48, ECX), MM3 ) /* x0*m1+...+m11 | x0*m0+x1*m4+m12 */
-
- PFMUL ( MM7, MM0 ) /* x1*m6 | x0*m2 */
- PFADD ( MM4, MM3 ) /* r1 | r0 */
-
- PFMUL ( REGOFF(40, ECX), MM1 ) /* | x2*m10 */
- PUNPCKLDQ ( REGOFF(56, ECX), MM1 ) /* m14 | x2*m10 */
-
- PFACC ( MM0, MM1 )
-
- MOVQ ( MM3, REGOFF(-16, EDX) ) /* write r0, r1 */
- PFACC ( MM1, MM1 ) /* | r2 */
-
- MOVD ( MM1, REGOFF(-8, EDX) ) /* write r2 */
-
- DEC_L ( ESI ) /* decrement vertex counter */
- JNZ ( LLBL( G3TP3R_1 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TP3R_2 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points3_3d_no_rot )
-HIDDEN(_mesa_3dnow_transform_points3_3d_no_rot)
-GLNAME( _mesa_3dnow_transform_points3_3d_no_rot ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(V4F_START, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TP3NRR_2 ) )
-
- PREFETCH ( REGIND(EAX) )
- PREFETCHW ( REGIND(EDX) )
-
- MOVD ( REGIND(ECX), MM0 ) /* | m00 */
- PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
-
- MOVD ( REGOFF(40, ECX), MM2 ) /* | m22 */
- PUNPCKLDQ ( MM2, MM2 ) /* m22 | m22 */
-
- MOVQ ( REGOFF(48, ECX), MM1 ) /* m31 | m30 */
- MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */
-
- PUNPCKLDQ ( MM3, MM3 ) /* m32 | m32 */
-
-
-ALIGNTEXT16
-LLBL( G3TP3NRR_1 ):
-
- PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
-
- MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
- MOVD ( REGOFF(8, EAX), MM5 ) /* | x2 */
-
- ADD_L ( EDI, EAX ) /* next vertex */
- PREFETCHW ( REGIND(EAX) )
-
- PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
-
- PFADD ( MM1, MM4 ) /* x1*m11+m31 | x0*m00+m30 */
- PFMUL ( MM2, MM5 ) /* | x2*m22 */
-
- PFADD ( MM3, MM5 ) /* | x2*m22+m32 */
- MOVQ ( MM4, REGIND(EDX) ) /* write r0, r1 */
-
- ADD_L ( CONST(16), EDX ) /* next r */
- DEC_L ( ESI ) /* decrement vertex counter */
-
- MOVD ( MM5, REGOFF(-8, EDX) ) /* write r2 */
- JNZ ( LLBL( G3TP3NRR_1 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TP3NRR_2 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points3_2d )
-HIDDEN(_mesa_3dnow_transform_points3_2d)
-GLNAME( _mesa_3dnow_transform_points3_2d ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(V4F_START, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TP2R_3) )
-
- PREFETCH ( REGIND(EAX) )
- PREFETCHW ( REGIND(EDX) )
-
- MOVD ( REGIND(ECX), MM0 ) /* | m00 */
- PUNPCKLDQ ( REGOFF(16, ECX), MM0 ) /* m10 | m00 */
-
- MOVD ( REGOFF(4, ECX), MM1 ) /* | m01 */
- PUNPCKLDQ ( REGOFF(20, ECX), MM1 ) /* m11 | m01 */
-
- MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
-
-ALIGNTEXT16
-LLBL( G3TP2R_2 ):
-
- PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
-
- MOVQ ( REGIND(EAX), MM3 ) /* x1 | x0 */
- MOVD ( REGOFF(8, EAX), MM5 ) /* | x2 */
-
- ADD_L ( EDI, EAX ) /* next vertex */
- PREFETCH ( REGIND(EAX) )
-
- MOVQ ( MM3, MM4 ) /* x1 | x0 */
- PFMUL ( MM0, MM3 ) /* x1*m10 | x0*m00 */
-
- ADD_L ( CONST(16), EDX ) /* next r */
- PFMUL ( MM1, MM4 ) /* x1*m11 | x0*m01 */
-
- PFACC ( MM4, MM3 ) /* x0*m00+x1*m10 | x0*m01+x1*m11 */
- MOVD ( MM5, REGOFF(-8, EDX) ) /* write r2 (=x2) */
-
- PFADD ( MM2, MM3 ) /* x0*...*m10+m30 | x0*...*m11+m31 */
- MOVQ ( MM3, REGOFF(-16, EDX) ) /* write r0, r1 */
-
- DEC_L ( ESI ) /* decrement vertex counter */
- JNZ ( LLBL( G3TP2R_2 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TP2R_3 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points3_2d_no_rot )
-HIDDEN(_mesa_3dnow_transform_points3_2d_no_rot)
-GLNAME( _mesa_3dnow_transform_points3_2d_no_rot ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(V4F_START, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TP2NRR_2 ) )
-
- PREFETCH ( REGIND(EAX) )
- PREFETCHW ( REGIND(EDX) )
-
- MOVD ( REGIND(ECX), MM0 ) /* | m00 */
- PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
-
- MOVQ ( REGOFF(48, ECX), MM1 ) /* m31 | m30 */
-
-
-ALIGNTEXT16
-LLBL( G3TP2NRR_1 ):
-
- PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
-
- MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
- MOVD ( REGOFF(8, EAX), MM5 ) /* | x2 */
-
- ADD_L ( EDI, EAX ) /* next vertex */
- PREFETCH ( REGIND(EAX) )
-
- PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
- ADD_L ( CONST(16), EDX ) /* next r */
-
- PFADD ( MM1, MM4 ) /* x1*m11+m31 | x0*m00+m30 */
-
- MOVQ ( MM4, REGOFF(-16, EDX) ) /* write r0, r1 */
- MOVD ( MM5, REGOFF(-8, EDX) ) /* write r2 (=x2) */
-
- DEC_L ( ESI ) /* decrement vertex counter */
- JNZ ( LLBL( G3TP2NRR_1 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TP2NRR_2 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points3_identity )
-HIDDEN(_mesa_3dnow_transform_points3_identity)
-GLNAME( _mesa_3dnow_transform_points3_identity ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(V4F_START, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TPIR_2 ) )
-
- PREFETCHW ( REGIND(EDX) )
-
-ALIGNTEXT16
-LLBL( G3TPIR_1 ):
-
- PREFETCHW ( REGOFF(32, EDX) )
-
- MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
- MOVD ( REGOFF(8, EAX), MM1 ) /* | x2 */
-
- ADD_L ( EDI, EAX ) /* next vertex */
- ADD_L ( CONST(16), EDX ) /* next r */
-
- DEC_L ( ESI ) /* decrement vertex counter */
- MOVQ ( MM0, REGOFF(-16, EDX) ) /* r1 | r0 */
-
- MOVD ( MM1, REGOFF(-8, EDX) ) /* | r2 */
- JNZ ( LLBL( G3TPIR_1 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TPIR_2 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-#endif
-
-#if defined (__ELF__) && defined (__linux__)
- .section .note.GNU-stack,"",%progbits
-#endif
+ +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifdef USE_3DNOW_ASM +#include "assyntax.h" +#include "matypes.h" +#include "xform_args.h" + + SEG_TEXT + +#define FRAME_OFFSET 4 + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points3_general ) +HIDDEN(_mesa_3dnow_transform_points3_general) +GLNAME( _mesa_3dnow_transform_points3_general ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(V4F_START, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TPGR_2 ) ) + + PREFETCHW ( REGIND(EDX) ) + +ALIGNTEXT16 +LLBL( G3TPGR_1 ): + + PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */ + + MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */ + MOVD ( REGOFF(8, EAX), MM2 ) /* | x2 */ + + ADD_L ( EDI, EAX ) /* next vertex */ + PREFETCH ( REGIND(EAX) ) + + MOVQ ( MM0, MM1 ) /* x1 | x0 */ + PUNPCKLDQ ( MM2, MM2 ) /* x2 | x2 */ + + PUNPCKLDQ ( MM0, MM0 ) /* x0 | x0 */ + MOVQ ( MM2, MM5 ) /* x2 | x2 */ + + PUNPCKHDQ ( MM1, MM1 ) /* x1 | x1 */ + PFMUL ( REGOFF(32, ECX), MM2 ) /* x2*m9 | x2*m8 */ + + MOVQ ( MM0, MM3 ) /* x0 | x0 */ + PFMUL ( REGOFF(40, ECX), MM5 ) /* x2*m11 | x2*m10 */ + + MOVQ ( MM1, MM4 ) /* x1 | x1 */ + PFMUL ( REGIND(ECX), MM0 ) /* x0*m1 | x0*m0 */ + + PFADD ( REGOFF(48, ECX), MM2 ) /* x2*m9+m13 | x2*m8+m12 */ + PFMUL ( REGOFF(16, ECX), MM1 ) /* x1*m5 | x1*m4 */ + + PFADD ( REGOFF(56, ECX), MM5 ) /* x2*m11+m15 | x2*m10+m14 */ + PFADD ( MM0, MM1 ) /* x0*m1+x1*m5 | x0*m0+x1*m4 */ + + PFMUL ( REGOFF(8, ECX), MM3 ) /* x0*m3 | x0*m2 */ + PFADD ( MM1, MM2 ) /* r1 | r0 */ + + PFMUL ( REGOFF(24, ECX), MM4 ) /* x1*m7 | x1*m6 */ + ADD_L ( CONST(16), EDX ) /* next output vertex */ + + PFADD ( MM3, MM4 ) /* x0*m3+x1*m7 | x0*m2+x1*m6 */ + MOVQ ( MM2, REGOFF(-16, EDX) ) /* write r0, r1 */ + + PFADD ( MM4, MM5 ) /* r3 | r2 */ + MOVQ ( MM5, REGOFF(-8, EDX) ) /* write r2, r3 */ + + DEC_L ( ESI ) /* decrement vertex counter */ + JNZ ( LLBL( G3TPGR_1 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TPGR_2 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points3_perspective ) +HIDDEN(_mesa_3dnow_transform_points3_perspective) +GLNAME( _mesa_3dnow_transform_points3_perspective ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(V4F_START, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TPPR_2 ) ) + + PREFETCH ( REGIND(EAX) ) + PREFETCHW ( REGIND(EDX) ) + + MOVD ( REGIND(ECX), MM0 ) /* | m00 */ + PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */ + + MOVQ ( REGOFF(32, ECX), MM1 ) /* m21 | m20 */ + MOVD ( REGOFF(40, ECX), MM2 ) /* | m22 */ + + MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */ + +ALIGNTEXT16 +LLBL( G3TPPR_1 ): + + PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */ + + MOVD ( REGOFF(8, EAX), MM5 ) /* | x2 */ + MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */ + + ADD_L ( EDI, EAX ) /* next vertex */ + PREFETCH ( REGIND(EAX) ) + + PXOR ( MM7, MM7 ) /* 0 | 0 */ + MOVQ ( MM5, MM6 ) /* | x2 */ + + PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */ + PFSUB ( MM5, MM7 ) /* | -x2 */ + + PFMUL ( MM2, MM6 ) /* | x2*m22 */ + PUNPCKLDQ ( MM5, MM5 ) /* x2 | x2 */ + + ADD_L ( CONST(16), EDX ) /* next r */ + PFMUL ( MM1, MM5 ) /* x2*m21 | x2*m20 */ + + PFADD ( MM3, MM6 ) /* | x2*m22+m32 */ + PFADD ( MM4, MM5 ) /* x1*m11+x2*m21 | x0*m00+x2*m20 */ + + MOVQ ( MM5, REGOFF(-16, EDX) ) /* write r0, r1 */ + MOVD ( MM6, REGOFF(-8, EDX) ) /* write r2 */ + + MOVD ( MM7, REGOFF(-4, EDX) ) /* write r3 */ + + DEC_L ( ESI ) /* decrement vertex counter */ + JNZ ( LLBL( G3TPPR_1 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TPPR_2 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points3_3d ) +HIDDEN(_mesa_3dnow_transform_points3_3d) +GLNAME( _mesa_3dnow_transform_points3_3d ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(V4F_START, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TP3R_2 ) ) + + PREFETCH ( REGIND(EAX) ) + PREFETCH ( REGIND(EDX) ) + + MOVD ( REGOFF(8, ECX), MM7 ) /* | m2 */ + PUNPCKLDQ ( REGOFF(24, ECX), MM7 ) /* m6 | m2 */ + + +ALIGNTEXT16 +LLBL( G3TP3R_1 ): + + PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */ + + MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */ + MOVD ( REGOFF(8, EAX), MM1 ) /* | x2 */ + + ADD_L ( EDI, EAX ) /* next vertex */ + PREFETCH ( REGIND(EAX) ) + + MOVQ ( MM0, MM2 ) /* x1 | x0 */ + ADD_L ( CONST(16), EDX ) /* next r */ + + PUNPCKLDQ ( MM2, MM2 ) /* x0 | x0 */ + MOVQ ( MM0, MM3 ) /* x1 | x0 */ + + PFMUL ( REGIND(ECX), MM2 ) /* x0*m1 | x0*m0 */ + PUNPCKHDQ ( MM3, MM3 ) /* x1 | x1 */ + + MOVQ ( MM1, MM4 ) /* | x2 */ + PFMUL ( REGOFF(16, ECX), MM3 ) /* x1*m5 | x1*m4 */ + + PUNPCKLDQ ( MM4, MM4 ) /* x2 | x2 */ + PFADD ( MM2, MM3 ) /* x0*m1+x1*m5 | x0*m0+x1*m4 */ + + PFMUL ( REGOFF(32, ECX), MM4 ) /* x2*m9 | x2*m8 */ + PFADD ( REGOFF(48, ECX), MM3 ) /* x0*m1+...+m11 | x0*m0+x1*m4+m12 */ + + PFMUL ( MM7, MM0 ) /* x1*m6 | x0*m2 */ + PFADD ( MM4, MM3 ) /* r1 | r0 */ + + PFMUL ( REGOFF(40, ECX), MM1 ) /* | x2*m10 */ + PUNPCKLDQ ( REGOFF(56, ECX), MM1 ) /* m14 | x2*m10 */ + + PFACC ( MM0, MM1 ) + + MOVQ ( MM3, REGOFF(-16, EDX) ) /* write r0, r1 */ + PFACC ( MM1, MM1 ) /* | r2 */ + + MOVD ( MM1, REGOFF(-8, EDX) ) /* write r2 */ + + DEC_L ( ESI ) /* decrement vertex counter */ + JNZ ( LLBL( G3TP3R_1 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TP3R_2 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points3_3d_no_rot ) +HIDDEN(_mesa_3dnow_transform_points3_3d_no_rot) +GLNAME( _mesa_3dnow_transform_points3_3d_no_rot ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(V4F_START, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TP3NRR_2 ) ) + + PREFETCH ( REGIND(EAX) ) + PREFETCHW ( REGIND(EDX) ) + + MOVD ( REGIND(ECX), MM0 ) /* | m00 */ + PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */ + + MOVD ( REGOFF(40, ECX), MM2 ) /* | m22 */ + PUNPCKLDQ ( MM2, MM2 ) /* m22 | m22 */ + + MOVQ ( REGOFF(48, ECX), MM1 ) /* m31 | m30 */ + MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */ + + PUNPCKLDQ ( MM3, MM3 ) /* m32 | m32 */ + + +ALIGNTEXT16 +LLBL( G3TP3NRR_1 ): + + PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */ + + MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */ + MOVD ( REGOFF(8, EAX), MM5 ) /* | x2 */ + + ADD_L ( EDI, EAX ) /* next vertex */ + PREFETCHW ( REGIND(EAX) ) + + PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */ + + PFADD ( MM1, MM4 ) /* x1*m11+m31 | x0*m00+m30 */ + PFMUL ( MM2, MM5 ) /* | x2*m22 */ + + PFADD ( MM3, MM5 ) /* | x2*m22+m32 */ + MOVQ ( MM4, REGIND(EDX) ) /* write r0, r1 */ + + ADD_L ( CONST(16), EDX ) /* next r */ + DEC_L ( ESI ) /* decrement vertex counter */ + + MOVD ( MM5, REGOFF(-8, EDX) ) /* write r2 */ + JNZ ( LLBL( G3TP3NRR_1 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TP3NRR_2 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points3_2d ) +HIDDEN(_mesa_3dnow_transform_points3_2d) +GLNAME( _mesa_3dnow_transform_points3_2d ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(V4F_START, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TP2R_3) ) + + PREFETCH ( REGIND(EAX) ) + PREFETCHW ( REGIND(EDX) ) + + MOVD ( REGIND(ECX), MM0 ) /* | m00 */ + PUNPCKLDQ ( REGOFF(16, ECX), MM0 ) /* m10 | m00 */ + + MOVD ( REGOFF(4, ECX), MM1 ) /* | m01 */ + PUNPCKLDQ ( REGOFF(20, ECX), MM1 ) /* m11 | m01 */ + + MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */ + +ALIGNTEXT16 +LLBL( G3TP2R_2 ): + + PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */ + + MOVQ ( REGIND(EAX), MM3 ) /* x1 | x0 */ + MOVD ( REGOFF(8, EAX), MM5 ) /* | x2 */ + + ADD_L ( EDI, EAX ) /* next vertex */ + PREFETCH ( REGIND(EAX) ) + + MOVQ ( MM3, MM4 ) /* x1 | x0 */ + PFMUL ( MM0, MM3 ) /* x1*m10 | x0*m00 */ + + ADD_L ( CONST(16), EDX ) /* next r */ + PFMUL ( MM1, MM4 ) /* x1*m11 | x0*m01 */ + + PFACC ( MM4, MM3 ) /* x0*m00+x1*m10 | x0*m01+x1*m11 */ + MOVD ( MM5, REGOFF(-8, EDX) ) /* write r2 (=x2) */ + + PFADD ( MM2, MM3 ) /* x0*...*m10+m30 | x0*...*m11+m31 */ + MOVQ ( MM3, REGOFF(-16, EDX) ) /* write r0, r1 */ + + DEC_L ( ESI ) /* decrement vertex counter */ + JNZ ( LLBL( G3TP2R_2 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TP2R_3 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points3_2d_no_rot ) +HIDDEN(_mesa_3dnow_transform_points3_2d_no_rot) +GLNAME( _mesa_3dnow_transform_points3_2d_no_rot ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(V4F_START, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TP2NRR_2 ) ) + + PREFETCH ( REGIND(EAX) ) + PREFETCHW ( REGIND(EDX) ) + + MOVD ( REGIND(ECX), MM0 ) /* | m00 */ + PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */ + + MOVQ ( REGOFF(48, ECX), MM1 ) /* m31 | m30 */ + + +ALIGNTEXT16 +LLBL( G3TP2NRR_1 ): + + PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */ + + MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */ + MOVD ( REGOFF(8, EAX), MM5 ) /* | x2 */ + + ADD_L ( EDI, EAX ) /* next vertex */ + PREFETCH ( REGIND(EAX) ) + + PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */ + ADD_L ( CONST(16), EDX ) /* next r */ + + PFADD ( MM1, MM4 ) /* x1*m11+m31 | x0*m00+m30 */ + + MOVQ ( MM4, REGOFF(-16, EDX) ) /* write r0, r1 */ + MOVD ( MM5, REGOFF(-8, EDX) ) /* write r2 (=x2) */ + + DEC_L ( ESI ) /* decrement vertex counter */ + JNZ ( LLBL( G3TP2NRR_1 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TP2NRR_2 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points3_identity ) +HIDDEN(_mesa_3dnow_transform_points3_identity) +GLNAME( _mesa_3dnow_transform_points3_identity ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(V4F_START, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TPIR_2 ) ) + + PREFETCHW ( REGIND(EDX) ) + +ALIGNTEXT16 +LLBL( G3TPIR_1 ): + + PREFETCHW ( REGOFF(32, EDX) ) + + MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */ + MOVD ( REGOFF(8, EAX), MM1 ) /* | x2 */ + + ADD_L ( EDI, EAX ) /* next vertex */ + ADD_L ( CONST(16), EDX ) /* next r */ + + DEC_L ( ESI ) /* decrement vertex counter */ + MOVQ ( MM0, REGOFF(-16, EDX) ) /* r1 | r0 */ + + MOVD ( MM1, REGOFF(-8, EDX) ) /* | r2 */ + JNZ ( LLBL( G3TPIR_1 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TPIR_2 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET +#endif + +#if defined (__ELF__) && defined (__linux__) + .section .note.GNU-stack,"",%progbits +#endif diff --git a/mesalib/src/mesa/x86/3dnow_xform4.S b/mesalib/src/mesa/x86/3dnow_xform4.S index 3a9b97160..b2b7c64f2 100644 --- a/mesalib/src/mesa/x86/3dnow_xform4.S +++ b/mesalib/src/mesa/x86/3dnow_xform4.S @@ -1,570 +1,570 @@ -
-/*
- * Mesa 3-D graphics library
- * Version: 3.5
- *
- * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#ifdef USE_3DNOW_ASM
-#include "assyntax.h"
-#include "matypes.h"
-#include "xform_args.h"
-
- SEG_TEXT
-
-#define FRAME_OFFSET 4
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points4_general )
-HIDDEN(_mesa_3dnow_transform_points4_general)
-GLNAME( _mesa_3dnow_transform_points4_general ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(V4F_START, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TPGR_2 ) )
-
- PREFETCHW ( REGIND(EDX) )
-
-ALIGNTEXT16
-LLBL( G3TPGR_1 ):
-
- PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
-
- MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
- MOVQ ( REGOFF(8, EAX), MM4 ) /* x3 | x2 */
-
- ADD_L ( EDI, EAX ) /* next vertex */
- PREFETCH ( REGIND(EAX) )
-
- MOVQ ( MM0, MM2 ) /* x1 | x0 */
- MOVQ ( MM4, MM6 ) /* x3 | x2 */
-
- PUNPCKLDQ ( MM0, MM0 ) /* x0 | x0 */
- PUNPCKHDQ ( MM2, MM2 ) /* x1 | x1 */
-
- MOVQ ( MM0, MM1 ) /* x0 | x0 */
- ADD_L ( CONST(16), EDX ) /* next r */
-
- PFMUL ( REGIND(ECX), MM0 ) /* x0*m1 | x0*m0 */
- MOVQ ( MM2, MM3 ) /* x1 | x1 */
-
- PFMUL ( REGOFF(8, ECX), MM1 ) /* x0*m3 | x0*m2 */
- PUNPCKLDQ ( MM4, MM4 ) /* x2 | x2 */
-
- PFMUL ( REGOFF(16, ECX), MM2 ) /* x1*m5 | x1*m4 */
- MOVQ ( MM4, MM5 ) /* x2 | x2 */
-
- PFMUL ( REGOFF(24, ECX), MM3 ) /* x1*m7 | x1*m6 */
- PUNPCKHDQ ( MM6, MM6 ) /* x3 | x3 */
-
- PFMUL ( REGOFF(32, ECX), MM4 ) /* x2*m9 | x2*m8 */
- MOVQ ( MM6, MM7 ) /* x3 | x3 */
-
- PFMUL ( REGOFF(40, ECX), MM5 ) /* x2*m11 | x2*m10 */
- PFADD ( MM0, MM2 )
-
- PFMUL ( REGOFF(48, ECX), MM6 ) /* x3*m13 | x3*m12 */
- PFADD ( MM1, MM3 )
-
- PFMUL ( REGOFF(56, ECX), MM7 ) /* x3*m15 | x3*m14 */
- PFADD ( MM4, MM6 )
-
- PFADD ( MM5, MM7 )
- PFADD ( MM2, MM6 )
-
- PFADD ( MM3, MM7 )
- MOVQ ( MM6, REGOFF(-16, EDX) )
-
- MOVQ ( MM7, REGOFF(-8, EDX) )
-
- DEC_L ( ESI ) /* decrement vertex counter */
- JNZ ( LLBL( G3TPGR_1 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TPGR_2 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points4_perspective )
-HIDDEN(_mesa_3dnow_transform_points4_perspective)
-GLNAME( _mesa_3dnow_transform_points4_perspective ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(V4F_START, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TPPR_2 ) )
-
- PREFETCH ( REGIND(EAX) )
- PREFETCHW ( REGIND(EDX) )
-
- MOVD ( REGIND(ECX), MM0 ) /* | m00 */
- PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
-
- MOVD ( REGOFF(40, ECX), MM1 ) /* | m22 */
- PUNPCKLDQ ( REGOFF(56, ECX), MM1 ) /* m32 | m22 */
-
- MOVQ ( REGOFF(32, ECX), MM2 ) /* m21 | m20 */
- PXOR ( MM7, MM7 ) /* 0 | 0 */
-
-ALIGNTEXT16
-LLBL( G3TPPR_1 ):
-
- PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
-
- MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
- MOVQ ( REGOFF(8, EAX), MM5 ) /* x3 | x2 */
- MOVD ( REGOFF(8, EAX), MM3 ) /* | x2 */
-
- ADD_L ( EDI, EAX ) /* next vertex */
- PREFETCH ( REGOFF(32, EAX) ) /* hopefully stride is zero */
-
- MOVQ ( MM5, MM6 ) /* x3 | x2 */
- PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
-
- PUNPCKLDQ ( MM5, MM5 ) /* x2 | x2 */
- ADD_L ( CONST(16), EDX ) /* next r */
-
- PFMUL ( MM2, MM5 ) /* x2*m21 | x2*m20 */
- PFSUBR ( MM7, MM3 ) /* | -x2 */
-
- PFMUL ( MM1, MM6 ) /* x3*m32 | x2*m22 */
- PFADD ( MM4, MM5 ) /* x1*m11+x2*m21 | x0*m00+x2*m20 */
-
- PFACC ( MM3, MM6 ) /* -x2 | x2*m22+x3*m32 */
- MOVQ ( MM5, REGOFF(-16, EDX) ) /* write r0, r1 */
-
- MOVQ ( MM6, REGOFF(-8, EDX) ) /* write r2, r3 */
- DEC_L ( ESI ) /* decrement vertex counter */
-
- JNZ ( LLBL( G3TPPR_1 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TPPR_2 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points4_3d )
-HIDDEN(_mesa_3dnow_transform_points4_3d)
-GLNAME( _mesa_3dnow_transform_points4_3d ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(V4F_START, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TP3R_2 ) )
-
- MOVD ( REGOFF(8, ECX), MM6 ) /* | m2 */
- PUNPCKLDQ ( REGOFF(24, ECX), MM6 ) /* m6 | m2 */
-
- MOVD ( REGOFF(40, ECX), MM7 ) /* | m10 */
- PUNPCKLDQ ( REGOFF(56, ECX), MM7 ) /* m14 | m10 */
-
-ALIGNTEXT16
-LLBL( G3TP3R_1 ):
-
- PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
- PREFETCH ( REGOFF(32, EAX) ) /* hopefully array is tightly packed */
-
- MOVQ ( REGIND(EAX), MM2 ) /* x1 | x0 */
- MOVQ ( REGOFF(8, EAX), MM3 ) /* x3 | x2 */
-
- MOVQ ( MM2, MM0 ) /* x1 | x0 */
- MOVQ ( MM3, MM4 ) /* x3 | x2 */
-
- MOVQ ( MM0, MM1 ) /* x1 | x0 */
- MOVQ ( MM4, MM5 ) /* x3 | x2 */
-
- PUNPCKLDQ ( MM0, MM0 ) /* x0 | x0 */
- PUNPCKHDQ ( MM1, MM1 ) /* x1 | x1 */
-
- PFMUL ( REGIND(ECX), MM0 ) /* x0*m1 | x0*m0 */
- PUNPCKLDQ ( MM3, MM3 ) /* x2 | x2 */
-
- PFMUL ( REGOFF(16, ECX), MM1 ) /* x1*m5 | x1*m4 */
- PUNPCKHDQ ( MM4, MM4 ) /* x3 | x3 */
-
- PFMUL ( MM6, MM2 ) /* x1*m6 | x0*m2 */
- PFADD ( MM0, MM1 ) /* x0*m1+x1*m5 | x0*m0+x1*m4 */
-
- PFMUL ( REGOFF(32, ECX), MM3 ) /* x2*m9 | x2*m8 */
- ADD_L ( CONST(16), EDX ) /* next r */
-
- PFMUL ( REGOFF(48, ECX), MM4 ) /* x3*m13 | x3*m12 */
- PFADD ( MM1, MM3 ) /* x0*m1+..+x2*m9 | x0*m0+...+x2*m8 */
-
- PFMUL ( MM7, MM5 ) /* x3*m14 | x2*m10 */
- PFADD ( MM3, MM4 ) /* r1 | r0 */
-
- PFACC ( MM2, MM5 ) /* x0*m2+x1*m6 | x2*m10+x3*m14 */
- MOVD ( REGOFF(12, EAX), MM0 ) /* | x3 */
-
- ADD_L ( EDI, EAX ) /* next vertex */
- PFACC ( MM0, MM5 ) /* r3 | r2 */
-
- MOVQ ( MM4, REGOFF(-16, EDX) ) /* write r0, r1 */
- MOVQ ( MM5, REGOFF(-8, EDX) ) /* write r2, r3 */
-
- DEC_L ( ESI ) /* decrement vertex counter */
- JNZ ( LLBL( G3TP3R_1 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TP3R_2 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points4_3d_no_rot )
-HIDDEN(_mesa_3dnow_transform_points4_3d_no_rot)
-GLNAME( _mesa_3dnow_transform_points4_3d_no_rot ):
-
- PUSH_L ( ESI )
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(V4F_START, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TP3NRR_2 ) )
-
- MOVD ( REGIND(ECX), MM0 ) /* | m00 */
- PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
-
- MOVD ( REGOFF(40, ECX), MM2 ) /* | m22 */
- PUNPCKLDQ ( REGOFF(56, ECX), MM2 ) /* m32 | m22 */
-
- MOVQ ( REGOFF(48, ECX), MM1 ) /* m31 | m30 */
-
-ALIGNTEXT16
-LLBL( G3TP3NRR_1 ):
-
- PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
-
- MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
- MOVQ ( REGOFF(8, EAX), MM5 ) /* x3 | x2 */
- MOVD ( REGOFF(12, EAX), MM7 ) /* | x3 */
-
- ADD_L ( EDI, EAX ) /* next vertex */
- PREFETCH ( REGOFF(32, EAX) ) /* hopefully stride is zero */
-
- MOVQ ( MM5, MM6 ) /* x3 | x2 */
- PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
-
- PUNPCKHDQ ( MM6, MM6 ) /* x3 | x3 */
- PFMUL ( MM2, MM5 ) /* x3*m32 | x2*m22 */
-
- PFMUL ( MM1, MM6 ) /* x3*m31 | x3*m30 */
- PFACC ( MM7, MM5 ) /* x3 | x2*m22+x3*m32 */
-
- PFADD ( MM6, MM4 ) /* x1*m11+x3*m31 | x0*m00+x3*m30 */
- ADD_L ( CONST(16), EDX ) /* next r */
-
- MOVQ ( MM4, REGOFF(-16, EDX) ) /* write r0, r1 */
- MOVQ ( MM5, REGOFF(-8, EDX) ) /* write r2, r3 */
-
- DEC_L ( ESI ) /* decrement vertex counter */
- JNZ ( LLBL( G3TP3NRR_1 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TP3NRR_2 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points4_2d )
-HIDDEN(_mesa_3dnow_transform_points4_2d)
-GLNAME( _mesa_3dnow_transform_points4_2d ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(V4F_START, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TP2R_2 ) )
-
- MOVD ( REGIND(ECX), MM0 ) /* | m00 */
- PUNPCKLDQ ( REGOFF(16, ECX), MM0 ) /* m10 | m00 */
-
- MOVD ( REGOFF(4, ECX), MM1 ) /* | m01 */
- PUNPCKLDQ ( REGOFF(20, ECX), MM1 ) /* m11 | m01 */
-
- MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
-
-ALIGNTEXT16
-LLBL( G3TP2R_1 ):
-
- PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
-
- MOVQ ( REGIND(EAX), MM3 ) /* x1 | x0 */
- MOVQ ( REGOFF(8, EAX), MM5 ) /* x3 | x2 */
-
- ADD_L ( EDI, EAX ) /* next vertex */
- PREFETCH ( REGIND(EAX) )
-
- MOVQ ( MM3, MM4 ) /* x1 | x0 */
- MOVQ ( MM5, MM6 ) /* x3 | x2 */
-
- PFMUL ( MM1, MM4 ) /* x1*m11 | x0*m01 */
- PUNPCKHDQ ( MM6, MM6 ) /* x3 | x3 */
-
- PFMUL ( MM0, MM3 ) /* x1*m10 | x0*m00 */
- ADD_L ( CONST(16), EDX ) /* next r */
-
- PFACC ( MM4, MM3 ) /* x0*m01+x1*m11 | x0*m00+x1*m10 */
- PFMUL ( MM2, MM6 ) /* x3*m31 | x3*m30 */
-
- PFADD ( MM6, MM3 ) /* r1 | r0 */
- MOVQ ( MM5, REGOFF(-8, EDX) ) /* write r2, r3 */
-
- MOVQ ( MM3, REGOFF(-16, EDX) ) /* write r0, r1 */
-
- DEC_L ( ESI ) /* decrement vertex counter */
- JNZ ( LLBL( G3TP2R_1 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TP2R_2 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points4_2d_no_rot )
-HIDDEN(_mesa_3dnow_transform_points4_2d_no_rot)
-GLNAME( _mesa_3dnow_transform_points4_2d_no_rot ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(V4F_START, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TP2NRR_3 ) )
-
- MOVD ( REGIND(ECX), MM0 ) /* | m00 */
- PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
-
- MOVQ ( REGOFF(48, ECX), MM1 ) /* m31 | m30 */
-
-ALIGNTEXT16
-LLBL( G3TP2NRR_2 ):
-
- PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
-
- MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
- MOVQ ( REGOFF(8, EAX), MM5 ) /* x3 | x2 */
-
- ADD_L ( EDI, EAX ) /* next vertex */
- PREFETCH ( REGIND(EAX) )
-
- PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
- MOVQ ( MM5, MM6 ) /* x3 | x2 */
-
- ADD_L ( CONST(16), EDX ) /* next r */
- PUNPCKHDQ ( MM6, MM6 ) /* x3 | x3 */
-
- PFMUL ( MM1, MM6 ) /* x3*m31 | x3*m30 */
- PFADD ( MM4, MM6 ) /* x1*m11+x3*m31 | x0*m00+x3*m30 */
-
- MOVQ ( MM6, REGOFF(-16, EDX) ) /* write r0, r1 */
- MOVQ ( MM5, REGOFF(-8, EDX) ) /* write r2, r3 */
-
- DEC_L ( ESI ) /* decrement vertex counter */
-
- JNZ ( LLBL( G3TP2NRR_2 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TP2NRR_3 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_3dnow_transform_points4_identity )
-HIDDEN(_mesa_3dnow_transform_points4_identity)
-GLNAME( _mesa_3dnow_transform_points4_identity ):
-
- PUSH_L ( ESI )
-
- MOV_L ( ARG_DEST, ECX )
- MOV_L ( ARG_MATRIX, ESI )
- MOV_L ( ARG_SOURCE, EAX )
- MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
- OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
- MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
- MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
-
- PUSH_L ( EDI )
-
- MOV_L ( REGOFF(V4F_START, ECX), EDX )
- MOV_L ( ESI, ECX )
- MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
- MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
- MOV_L ( REGOFF(V4F_START, EAX), EAX )
-
- TEST_L ( ESI, ESI )
- JZ ( LLBL( G3TPIR_2 ) )
-
-ALIGNTEXT16
-LLBL( G3TPIR_1 ):
-
- PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
-
- MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
- MOVQ ( REGOFF(8, EAX), MM1 ) /* x3 | x2 */
-
- ADD_L ( EDI, EAX ) /* next vertex */
- PREFETCH ( REGIND(EAX) )
-
- ADD_L ( CONST(16), EDX ) /* next r */
- MOVQ ( MM0, REGOFF(-16, EDX) ) /* r1 | r0 */
-
- MOVQ ( MM1, REGOFF(-8, EDX) ) /* r3 | r2 */
-
- DEC_L ( ESI ) /* decrement vertex counter */
- JNZ ( LLBL( G3TPIR_1 ) ) /* cnt > 0 ? -> process next vertex */
-
-LLBL( G3TPIR_2 ):
-
- FEMMS
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-#endif
-
-#if defined (__ELF__) && defined (__linux__)
- .section .note.GNU-stack,"",%progbits
-#endif
+ +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifdef USE_3DNOW_ASM +#include "assyntax.h" +#include "matypes.h" +#include "xform_args.h" + + SEG_TEXT + +#define FRAME_OFFSET 4 + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points4_general ) +HIDDEN(_mesa_3dnow_transform_points4_general) +GLNAME( _mesa_3dnow_transform_points4_general ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(V4F_START, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TPGR_2 ) ) + + PREFETCHW ( REGIND(EDX) ) + +ALIGNTEXT16 +LLBL( G3TPGR_1 ): + + PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */ + + MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */ + MOVQ ( REGOFF(8, EAX), MM4 ) /* x3 | x2 */ + + ADD_L ( EDI, EAX ) /* next vertex */ + PREFETCH ( REGIND(EAX) ) + + MOVQ ( MM0, MM2 ) /* x1 | x0 */ + MOVQ ( MM4, MM6 ) /* x3 | x2 */ + + PUNPCKLDQ ( MM0, MM0 ) /* x0 | x0 */ + PUNPCKHDQ ( MM2, MM2 ) /* x1 | x1 */ + + MOVQ ( MM0, MM1 ) /* x0 | x0 */ + ADD_L ( CONST(16), EDX ) /* next r */ + + PFMUL ( REGIND(ECX), MM0 ) /* x0*m1 | x0*m0 */ + MOVQ ( MM2, MM3 ) /* x1 | x1 */ + + PFMUL ( REGOFF(8, ECX), MM1 ) /* x0*m3 | x0*m2 */ + PUNPCKLDQ ( MM4, MM4 ) /* x2 | x2 */ + + PFMUL ( REGOFF(16, ECX), MM2 ) /* x1*m5 | x1*m4 */ + MOVQ ( MM4, MM5 ) /* x2 | x2 */ + + PFMUL ( REGOFF(24, ECX), MM3 ) /* x1*m7 | x1*m6 */ + PUNPCKHDQ ( MM6, MM6 ) /* x3 | x3 */ + + PFMUL ( REGOFF(32, ECX), MM4 ) /* x2*m9 | x2*m8 */ + MOVQ ( MM6, MM7 ) /* x3 | x3 */ + + PFMUL ( REGOFF(40, ECX), MM5 ) /* x2*m11 | x2*m10 */ + PFADD ( MM0, MM2 ) + + PFMUL ( REGOFF(48, ECX), MM6 ) /* x3*m13 | x3*m12 */ + PFADD ( MM1, MM3 ) + + PFMUL ( REGOFF(56, ECX), MM7 ) /* x3*m15 | x3*m14 */ + PFADD ( MM4, MM6 ) + + PFADD ( MM5, MM7 ) + PFADD ( MM2, MM6 ) + + PFADD ( MM3, MM7 ) + MOVQ ( MM6, REGOFF(-16, EDX) ) + + MOVQ ( MM7, REGOFF(-8, EDX) ) + + DEC_L ( ESI ) /* decrement vertex counter */ + JNZ ( LLBL( G3TPGR_1 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TPGR_2 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points4_perspective ) +HIDDEN(_mesa_3dnow_transform_points4_perspective) +GLNAME( _mesa_3dnow_transform_points4_perspective ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(V4F_START, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TPPR_2 ) ) + + PREFETCH ( REGIND(EAX) ) + PREFETCHW ( REGIND(EDX) ) + + MOVD ( REGIND(ECX), MM0 ) /* | m00 */ + PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */ + + MOVD ( REGOFF(40, ECX), MM1 ) /* | m22 */ + PUNPCKLDQ ( REGOFF(56, ECX), MM1 ) /* m32 | m22 */ + + MOVQ ( REGOFF(32, ECX), MM2 ) /* m21 | m20 */ + PXOR ( MM7, MM7 ) /* 0 | 0 */ + +ALIGNTEXT16 +LLBL( G3TPPR_1 ): + + PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */ + + MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */ + MOVQ ( REGOFF(8, EAX), MM5 ) /* x3 | x2 */ + MOVD ( REGOFF(8, EAX), MM3 ) /* | x2 */ + + ADD_L ( EDI, EAX ) /* next vertex */ + PREFETCH ( REGOFF(32, EAX) ) /* hopefully stride is zero */ + + MOVQ ( MM5, MM6 ) /* x3 | x2 */ + PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */ + + PUNPCKLDQ ( MM5, MM5 ) /* x2 | x2 */ + ADD_L ( CONST(16), EDX ) /* next r */ + + PFMUL ( MM2, MM5 ) /* x2*m21 | x2*m20 */ + PFSUBR ( MM7, MM3 ) /* | -x2 */ + + PFMUL ( MM1, MM6 ) /* x3*m32 | x2*m22 */ + PFADD ( MM4, MM5 ) /* x1*m11+x2*m21 | x0*m00+x2*m20 */ + + PFACC ( MM3, MM6 ) /* -x2 | x2*m22+x3*m32 */ + MOVQ ( MM5, REGOFF(-16, EDX) ) /* write r0, r1 */ + + MOVQ ( MM6, REGOFF(-8, EDX) ) /* write r2, r3 */ + DEC_L ( ESI ) /* decrement vertex counter */ + + JNZ ( LLBL( G3TPPR_1 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TPPR_2 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points4_3d ) +HIDDEN(_mesa_3dnow_transform_points4_3d) +GLNAME( _mesa_3dnow_transform_points4_3d ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(V4F_START, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TP3R_2 ) ) + + MOVD ( REGOFF(8, ECX), MM6 ) /* | m2 */ + PUNPCKLDQ ( REGOFF(24, ECX), MM6 ) /* m6 | m2 */ + + MOVD ( REGOFF(40, ECX), MM7 ) /* | m10 */ + PUNPCKLDQ ( REGOFF(56, ECX), MM7 ) /* m14 | m10 */ + +ALIGNTEXT16 +LLBL( G3TP3R_1 ): + + PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */ + PREFETCH ( REGOFF(32, EAX) ) /* hopefully array is tightly packed */ + + MOVQ ( REGIND(EAX), MM2 ) /* x1 | x0 */ + MOVQ ( REGOFF(8, EAX), MM3 ) /* x3 | x2 */ + + MOVQ ( MM2, MM0 ) /* x1 | x0 */ + MOVQ ( MM3, MM4 ) /* x3 | x2 */ + + MOVQ ( MM0, MM1 ) /* x1 | x0 */ + MOVQ ( MM4, MM5 ) /* x3 | x2 */ + + PUNPCKLDQ ( MM0, MM0 ) /* x0 | x0 */ + PUNPCKHDQ ( MM1, MM1 ) /* x1 | x1 */ + + PFMUL ( REGIND(ECX), MM0 ) /* x0*m1 | x0*m0 */ + PUNPCKLDQ ( MM3, MM3 ) /* x2 | x2 */ + + PFMUL ( REGOFF(16, ECX), MM1 ) /* x1*m5 | x1*m4 */ + PUNPCKHDQ ( MM4, MM4 ) /* x3 | x3 */ + + PFMUL ( MM6, MM2 ) /* x1*m6 | x0*m2 */ + PFADD ( MM0, MM1 ) /* x0*m1+x1*m5 | x0*m0+x1*m4 */ + + PFMUL ( REGOFF(32, ECX), MM3 ) /* x2*m9 | x2*m8 */ + ADD_L ( CONST(16), EDX ) /* next r */ + + PFMUL ( REGOFF(48, ECX), MM4 ) /* x3*m13 | x3*m12 */ + PFADD ( MM1, MM3 ) /* x0*m1+..+x2*m9 | x0*m0+...+x2*m8 */ + + PFMUL ( MM7, MM5 ) /* x3*m14 | x2*m10 */ + PFADD ( MM3, MM4 ) /* r1 | r0 */ + + PFACC ( MM2, MM5 ) /* x0*m2+x1*m6 | x2*m10+x3*m14 */ + MOVD ( REGOFF(12, EAX), MM0 ) /* | x3 */ + + ADD_L ( EDI, EAX ) /* next vertex */ + PFACC ( MM0, MM5 ) /* r3 | r2 */ + + MOVQ ( MM4, REGOFF(-16, EDX) ) /* write r0, r1 */ + MOVQ ( MM5, REGOFF(-8, EDX) ) /* write r2, r3 */ + + DEC_L ( ESI ) /* decrement vertex counter */ + JNZ ( LLBL( G3TP3R_1 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TP3R_2 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points4_3d_no_rot ) +HIDDEN(_mesa_3dnow_transform_points4_3d_no_rot) +GLNAME( _mesa_3dnow_transform_points4_3d_no_rot ): + + PUSH_L ( ESI ) + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(V4F_START, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TP3NRR_2 ) ) + + MOVD ( REGIND(ECX), MM0 ) /* | m00 */ + PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */ + + MOVD ( REGOFF(40, ECX), MM2 ) /* | m22 */ + PUNPCKLDQ ( REGOFF(56, ECX), MM2 ) /* m32 | m22 */ + + MOVQ ( REGOFF(48, ECX), MM1 ) /* m31 | m30 */ + +ALIGNTEXT16 +LLBL( G3TP3NRR_1 ): + + PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */ + + MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */ + MOVQ ( REGOFF(8, EAX), MM5 ) /* x3 | x2 */ + MOVD ( REGOFF(12, EAX), MM7 ) /* | x3 */ + + ADD_L ( EDI, EAX ) /* next vertex */ + PREFETCH ( REGOFF(32, EAX) ) /* hopefully stride is zero */ + + MOVQ ( MM5, MM6 ) /* x3 | x2 */ + PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */ + + PUNPCKHDQ ( MM6, MM6 ) /* x3 | x3 */ + PFMUL ( MM2, MM5 ) /* x3*m32 | x2*m22 */ + + PFMUL ( MM1, MM6 ) /* x3*m31 | x3*m30 */ + PFACC ( MM7, MM5 ) /* x3 | x2*m22+x3*m32 */ + + PFADD ( MM6, MM4 ) /* x1*m11+x3*m31 | x0*m00+x3*m30 */ + ADD_L ( CONST(16), EDX ) /* next r */ + + MOVQ ( MM4, REGOFF(-16, EDX) ) /* write r0, r1 */ + MOVQ ( MM5, REGOFF(-8, EDX) ) /* write r2, r3 */ + + DEC_L ( ESI ) /* decrement vertex counter */ + JNZ ( LLBL( G3TP3NRR_1 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TP3NRR_2 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points4_2d ) +HIDDEN(_mesa_3dnow_transform_points4_2d) +GLNAME( _mesa_3dnow_transform_points4_2d ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(V4F_START, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TP2R_2 ) ) + + MOVD ( REGIND(ECX), MM0 ) /* | m00 */ + PUNPCKLDQ ( REGOFF(16, ECX), MM0 ) /* m10 | m00 */ + + MOVD ( REGOFF(4, ECX), MM1 ) /* | m01 */ + PUNPCKLDQ ( REGOFF(20, ECX), MM1 ) /* m11 | m01 */ + + MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */ + +ALIGNTEXT16 +LLBL( G3TP2R_1 ): + + PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */ + + MOVQ ( REGIND(EAX), MM3 ) /* x1 | x0 */ + MOVQ ( REGOFF(8, EAX), MM5 ) /* x3 | x2 */ + + ADD_L ( EDI, EAX ) /* next vertex */ + PREFETCH ( REGIND(EAX) ) + + MOVQ ( MM3, MM4 ) /* x1 | x0 */ + MOVQ ( MM5, MM6 ) /* x3 | x2 */ + + PFMUL ( MM1, MM4 ) /* x1*m11 | x0*m01 */ + PUNPCKHDQ ( MM6, MM6 ) /* x3 | x3 */ + + PFMUL ( MM0, MM3 ) /* x1*m10 | x0*m00 */ + ADD_L ( CONST(16), EDX ) /* next r */ + + PFACC ( MM4, MM3 ) /* x0*m01+x1*m11 | x0*m00+x1*m10 */ + PFMUL ( MM2, MM6 ) /* x3*m31 | x3*m30 */ + + PFADD ( MM6, MM3 ) /* r1 | r0 */ + MOVQ ( MM5, REGOFF(-8, EDX) ) /* write r2, r3 */ + + MOVQ ( MM3, REGOFF(-16, EDX) ) /* write r0, r1 */ + + DEC_L ( ESI ) /* decrement vertex counter */ + JNZ ( LLBL( G3TP2R_1 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TP2R_2 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points4_2d_no_rot ) +HIDDEN(_mesa_3dnow_transform_points4_2d_no_rot) +GLNAME( _mesa_3dnow_transform_points4_2d_no_rot ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(V4F_START, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TP2NRR_3 ) ) + + MOVD ( REGIND(ECX), MM0 ) /* | m00 */ + PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */ + + MOVQ ( REGOFF(48, ECX), MM1 ) /* m31 | m30 */ + +ALIGNTEXT16 +LLBL( G3TP2NRR_2 ): + + PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */ + + MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */ + MOVQ ( REGOFF(8, EAX), MM5 ) /* x3 | x2 */ + + ADD_L ( EDI, EAX ) /* next vertex */ + PREFETCH ( REGIND(EAX) ) + + PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */ + MOVQ ( MM5, MM6 ) /* x3 | x2 */ + + ADD_L ( CONST(16), EDX ) /* next r */ + PUNPCKHDQ ( MM6, MM6 ) /* x3 | x3 */ + + PFMUL ( MM1, MM6 ) /* x3*m31 | x3*m30 */ + PFADD ( MM4, MM6 ) /* x1*m11+x3*m31 | x0*m00+x3*m30 */ + + MOVQ ( MM6, REGOFF(-16, EDX) ) /* write r0, r1 */ + MOVQ ( MM5, REGOFF(-8, EDX) ) /* write r2, r3 */ + + DEC_L ( ESI ) /* decrement vertex counter */ + + JNZ ( LLBL( G3TP2NRR_2 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TP2NRR_3 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_3dnow_transform_points4_identity ) +HIDDEN(_mesa_3dnow_transform_points4_identity) +GLNAME( _mesa_3dnow_transform_points4_identity ): + + PUSH_L ( ESI ) + + MOV_L ( ARG_DEST, ECX ) + MOV_L ( ARG_MATRIX, ESI ) + MOV_L ( ARG_SOURCE, EAX ) + MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) ) + OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) ) + MOV_L ( REGOFF(V4F_COUNT, EAX), EDX ) + MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) ) + + PUSH_L ( EDI ) + + MOV_L ( REGOFF(V4F_START, ECX), EDX ) + MOV_L ( ESI, ECX ) + MOV_L ( REGOFF(V4F_COUNT, EAX), ESI ) + MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI ) + MOV_L ( REGOFF(V4F_START, EAX), EAX ) + + TEST_L ( ESI, ESI ) + JZ ( LLBL( G3TPIR_2 ) ) + +ALIGNTEXT16 +LLBL( G3TPIR_1 ): + + PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */ + + MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */ + MOVQ ( REGOFF(8, EAX), MM1 ) /* x3 | x2 */ + + ADD_L ( EDI, EAX ) /* next vertex */ + PREFETCH ( REGIND(EAX) ) + + ADD_L ( CONST(16), EDX ) /* next r */ + MOVQ ( MM0, REGOFF(-16, EDX) ) /* r1 | r0 */ + + MOVQ ( MM1, REGOFF(-8, EDX) ) /* r3 | r2 */ + + DEC_L ( ESI ) /* decrement vertex counter */ + JNZ ( LLBL( G3TPIR_1 ) ) /* cnt > 0 ? -> process next vertex */ + +LLBL( G3TPIR_2 ): + + FEMMS + POP_L ( EDI ) + POP_L ( ESI ) + RET +#endif + +#if defined (__ELF__) && defined (__linux__) + .section .note.GNU-stack,"",%progbits +#endif diff --git a/mesalib/src/mesa/x86/Makefile b/mesalib/src/mesa/x86/Makefile index 8cbe26015..9716dc27f 100644 --- a/mesalib/src/mesa/x86/Makefile +++ b/mesalib/src/mesa/x86/Makefile @@ -1,46 +1,46 @@ -# src/mesa/x86/Makefile
-
-TOP = ../../..
-include $(TOP)/configs/current
-
-
-INCLUDE_DIRS = \
- -I$(TOP)/include/GL \
- -I$(TOP)/include \
- -I$(TOP)/src/mapi \
- -I.. \
- -I../main \
- -I../math \
- -I../tnl
-
-
-default: gen_matypes matypes.h
-
-clean:
- -rm -f matypes.h gen_matypes
-
-
-gen_matypes: gen_matypes.c
- $(HOST_CC) $(ARCH_FLAGS) $(INCLUDE_DIRS) $(HOST_CFLAGS) gen_matypes.c -o gen_matypes
-
-# need some special rules here, unfortunately
-matypes.h: ../main/mtypes.h ../tnl/t_context.h gen_matypes
- ./gen_matypes > matypes.h
-
-common_x86_asm.o: matypes.h
-3dnow_normal.o: matypes.h
-3dnow_xform1.o: matypes.h
-3dnow_xform2.o: matypes.h
-3dnow_xform3.o: matypes.h
-3dnow_xform4.o: matypes.h
-mmx_blend.o: matypes.h
-sse_normal.o: matypes.h
-sse_xform1.o: matypes.h
-sse_xform2.o: matypes.h
-sse_xform3.o: matypes.h
-sse_xform4.o: matypes.h
-x86_cliptest.o: matypes.h
-x86_xform2.o: matypes.h
-x86_xform3.o: matypes.h
-x86_xform4.o: matypes.h
-
+# src/mesa/x86/Makefile + +TOP = ../../.. +include $(TOP)/configs/current + + +INCLUDE_DIRS = \ + -I$(TOP)/include/GL \ + -I$(TOP)/include \ + -I$(TOP)/src/mapi \ + -I.. \ + -I../main \ + -I../math \ + -I../tnl + + +default: gen_matypes matypes.h + +clean: + -rm -f matypes.h gen_matypes + + +gen_matypes: gen_matypes.c + $(HOST_CC) $(ARCH_FLAGS) $(INCLUDE_DIRS) $(HOST_CFLAGS) gen_matypes.c -o gen_matypes + +# need some special rules here, unfortunately +matypes.h: ../main/mtypes.h ../tnl/t_context.h gen_matypes + ./gen_matypes > matypes.h + +common_x86_asm.o: matypes.h +3dnow_normal.o: matypes.h +3dnow_xform1.o: matypes.h +3dnow_xform2.o: matypes.h +3dnow_xform3.o: matypes.h +3dnow_xform4.o: matypes.h +mmx_blend.o: matypes.h +sse_normal.o: matypes.h +sse_xform1.o: matypes.h +sse_xform2.o: matypes.h +sse_xform3.o: matypes.h +sse_xform4.o: matypes.h +x86_cliptest.o: matypes.h +x86_xform2.o: matypes.h +x86_xform3.o: matypes.h +x86_xform4.o: matypes.h + diff --git a/mesalib/src/mesa/x86/assyntax.h b/mesalib/src/mesa/x86/assyntax.h index 1e3ea66e2..4a41812f6 100644 --- a/mesalib/src/mesa/x86/assyntax.h +++ b/mesalib/src/mesa/x86/assyntax.h @@ -1,1747 +1,1747 @@ -
-#ifndef __ASSYNTAX_H__
-#define __ASSYNTAX_H__
-
-/*
- * Copyright 1992 Vrije Universiteit, The Netherlands
- *
- * Permission to use, copy, modify, and distribute this software and its
- * documentation for any purpose and without fee is hereby granted, provided
- * that the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of the Vrije Universiteit not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission. The Vrije Universiteit makes no
- * representations about the suitability of this software for any purpose.
- * It is provided "as is" without express or implied warranty.
- *
- * The Vrije Universiteit DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS,
- * IN NO EVENT SHALL The Vrije Universiteit BE LIABLE FOR ANY SPECIAL,
- * INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
- * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
- * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
- * PERFORMANCE OF THIS SOFTWARE.
- */
-
-/*
- * assyntax.h
- *
- * Select the syntax appropriate to the 386 assembler being used
- * To add support for more assemblers add more columns to the CHOICE
- * macro. Note that register names must also have uppercase names
- * to avoid macro recursion. e.g., #define ah %ah recurses!
- *
- * NB 1. Some of the macros for certain assemblers imply that the code is to
- * run in protected mode!! Caveat emptor.
- *
- * NB 2. 486 specific instructions are not included. This is to discourage
- * their accidental use in code that is intended to run on 386 and 486
- * systems.
- *
- * Supported assemblers:
- *
- * (a) AT&T SysVr4 as(1): define ATT_ASSEMBLER
- * (b) GNU Assembler gas: define GNU_ASSEMBLER (default)
- * (c) Amsterdam Compiler kit: define ACK_ASSEMBLER
- * (d) The Netwide Assembler: define NASM_ASSEMBLER
- * (e) Microsoft Assembler: define MASM_ASSEMBLER (UNTESTED!)
- *
- * The following naming conventions have been used to identify the various
- * data types:
- * _SR = segment register version
- * Integer:
- * _Q = quadword = 64 bits
- * _L = long = 32 bits
- * _W = short = 16 bits
- * _B = byte = 8 bits
- * Floating-point:
- * _X = m80real = 80 bits
- * _D = double = 64 bits
- * _S = single = 32 bits
- *
- * Author: Gregory J. Sharp, Sept 1992
- * Vrije Universiteit, Amsterdam, The Netherlands
- *
- * [support for Intel syntax added by Josh Vanderhoof, 1999]
- */
-
-#if !(defined(NASM_ASSEMBLER) || defined(MASM_ASSEMBLER))
-
-/* Default to ATT_ASSEMBLER when SVR4 or SYSV are defined */
-#if (defined(SVR4) || defined(SYSV)) && !defined(GNU_ASSEMBLER)
-#define ATT_ASSEMBLER
-#endif
-
-#if !defined(ATT_ASSEMBLER) && !defined(GNU_ASSEMBLER) && !defined(ACK_ASSEMBLER)
-#define GNU_ASSEMBLER
-#endif
-
-#if (defined(__STDC__) && !defined(UNIXCPP)) || (defined (sun) && defined (i386) && defined (SVR4) && defined (__STDC__) && !defined (__GNUC__))
-#define CONCAT(x, y) x ## y
-#define CONCAT3(x, y, z) x ## y ## z
-#else
-#define CONCAT(x, y) x/**/y
-#define CONCAT3(x, y, z) x/**/y/**/z
-#endif
-
-#ifdef ACK_ASSEMBLER
-
-/* Assume we write code for 32-bit protected mode! */
-
-/* Redefine register names for GAS & AT&T assemblers */
-#define AL al
-#define AH ah
-#define AX ax
-#define EAX ax
-#define BL bl
-#define BH bh
-#define BX bx
-#define EBX bx
-#define CL cl
-#define CH ch
-#define CX cx
-#define ECX cx
-#define DL dl
-#define DH dh
-#define DX dx
-#define EDX dx
-#define BP bp
-#define EBP bp
-#define SI si
-#define ESI si
-#define DI di
-#define EDI di
-#define SP sp
-#define ESP sp
-#define CS cs
-#define SS ss
-#define DS ds
-#define ES es
-#define FS fs
-#define GS gs
-/* Control Registers */
-#define CR0 cr0
-#define CR1 cr1
-#define CR2 cr2
-#define CR3 cr3
-/* Debug Registers */
-#define DR0 dr0
-#define DR1 dr1
-#define DR2 dr2
-#define DR3 dr3
-#define DR4 dr4
-#define DR5 dr5
-#define DR6 dr6
-#define DR7 dr7
-/* Floating-point Stack */
-#define ST st
-
-#define AS_BEGIN .sect .text; .sect .rom; .sect .data; .sect .bss; .sect .text
-
-
-#define _WTOG o16 /* word toggle for _W instructions */
-#define _LTOG /* long toggle for _L instructions */
-#define ADDR_TOGGLE a16
-#define OPSZ_TOGGLE o16
-#define USE16 .use16
-#define USE32 .use32
-
-#define CHOICE(a,b,c) c
-
-#else /* AT&T or GAS */
-
-/* Redefine register names for GAS & AT&T assemblers */
-#define AL %al
-#define AH %ah
-#define AX %ax
-#define EAX %eax
-#define BL %bl
-#define BH %bh
-#define BX %bx
-#define EBX %ebx
-#define CL %cl
-#define CH %ch
-#define CX %cx
-#define ECX %ecx
-#define DL %dl
-#define DH %dh
-#define DX %dx
-#define EDX %edx
-#define BP %bp
-#define EBP %ebp
-#define SI %si
-#define ESI %esi
-#define DI %di
-#define EDI %edi
-#define SP %sp
-#define ESP %esp
-#define CS %cs
-#define SS %ss
-#define DS %ds
-#define ES %es
-#define FS %fs
-#define GS %gs
-/* Control Registers */
-#define CR0 %cr0
-#define CR1 %cr1
-#define CR2 %cr2
-#define CR3 %cr3
-/* Debug Registers */
-#define DR0 %db0
-#define DR1 %db1
-#define DR2 %db2
-#define DR3 %db3
-#define DR4 %db4
-#define DR5 %db5
-#define DR6 %db6
-#define DR7 %db7
-/* Floating-point Stack */
-#define _STX0 %st(0)
-#define _STX1 %st(1)
-#define _STX2 %st(2)
-#define _STX3 %st(3)
-#define _STX4 %st(4)
-#define _STX5 %st(5)
-#define _STX6 %st(6)
-#define _STX7 %st(7)
-#define ST(x) CONCAT(_STX,x)
-#ifdef GNU_ASSEMBLER
-#define ST0 %st(0)
-#else
-#define ST0 %st
-#endif
-/* MMX Registers */
-#define MM0 %mm0
-#define MM1 %mm1
-#define MM2 %mm2
-#define MM3 %mm3
-#define MM4 %mm4
-#define MM5 %mm5
-#define MM6 %mm6
-#define MM7 %mm7
-/* SSE Registers */
-#define XMM0 %xmm0
-#define XMM1 %xmm1
-#define XMM2 %xmm2
-#define XMM3 %xmm3
-#define XMM4 %xmm4
-#define XMM5 %xmm5
-#define XMM6 %xmm6
-#define XMM7 %xmm7
-
-#define AS_BEGIN
-#define USE16
-#define USE32
-
-#ifdef GNU_ASSEMBLER
-
-#define ADDR_TOGGLE aword
-#define OPSZ_TOGGLE word
-
-#define CHOICE(a,b,c) b
-
-#else
-/*
- * AT&T ASSEMBLER SYNTAX
- * *********************
- */
-#define CHOICE(a,b,c) a
-
-#define ADDR_TOGGLE addr16
-#define OPSZ_TOGGLE data16
-
-#endif /* GNU_ASSEMBLER */
-#endif /* ACK_ASSEMBLER */
-
-
-#if defined(__QNX__) || defined(Lynx) || (defined(SYSV) || defined(SVR4)) && !defined(ACK_ASSEMBLER) || defined(__ELF__) || defined(__GNU__) || defined(__GNUC__) && !defined(__DJGPP__) && !defined(__MINGW32__)
-#define GLNAME(a) a
-#else
-#define GLNAME(a) CONCAT(_,a)
-#endif
-
-
- /****************************************/
- /* */
- /* Select the various choices */
- /* */
- /****************************************/
-
-
-/* Redefine assembler directives */
-/*********************************/
-#define GLOBL CHOICE(.globl, .globl, .extern)
-#define GLOBAL GLOBL
-#define EXTERN GLOBL
-#ifndef __AOUT__
-#define ALIGNTEXT32 CHOICE(.align 32, .balign 32, .align 32)
-#define ALIGNTEXT16 CHOICE(.align 16, .balign 16, .align 16)
-#define ALIGNTEXT8 CHOICE(.align 8, .balign 8, .align 8)
-#define ALIGNTEXT4 CHOICE(.align 4, .balign 4, .align 4)
-#define ALIGNTEXT2 CHOICE(.align 2, .balign 2, .align 2)
-/* ALIGNTEXT4ifNOP is the same as ALIGNTEXT4, but only if the space is
- * guaranteed to be filled with NOPs. Otherwise it does nothing.
- */
-#define ALIGNTEXT32ifNOP CHOICE(.align 32, .balign ARG2(32,0x90), /*can't do it*/)
-#define ALIGNTEXT16ifNOP CHOICE(.align 16, .balign ARG2(16,0x90), /*can't do it*/)
-#define ALIGNTEXT8ifNOP CHOICE(.align 8, .balign ARG2(8,0x90), /*can't do it*/)
-#define ALIGNTEXT4ifNOP CHOICE(.align 4, .balign ARG2(4,0x90), /*can't do it*/)
-#define ALIGNDATA32 CHOICE(.align 32, .balign ARG2(32,0x0), .align 32)
-#define ALIGNDATA16 CHOICE(.align 16, .balign ARG2(16,0x0), .align 16)
-#define ALIGNDATA8 CHOICE(.align 8, .balign ARG2(8,0x0), .align 8)
-#define ALIGNDATA4 CHOICE(.align 4, .balign ARG2(4,0x0), .align 4)
-#define ALIGNDATA2 CHOICE(.align 2, .balign ARG2(2,0x0), .align 2)
-#else
-/* 'as -aout' on FreeBSD doesn't have .balign */
-#define ALIGNTEXT32 CHOICE(.align 32, .align ARG2(5,0x90), .align 32)
-#define ALIGNTEXT16 CHOICE(.align 16, .align ARG2(4,0x90), .align 16)
-#define ALIGNTEXT8 CHOICE(.align 8, .align ARG2(3,0x90), .align 8)
-#define ALIGNTEXT4 CHOICE(.align 4, .align ARG2(2,0x90), .align 4)
-#define ALIGNTEXT2 CHOICE(.align 2, .align ARG2(1,0x90), .align 2)
-/* ALIGNTEXT4ifNOP is the same as ALIGNTEXT4, but only if the space is
- * guaranteed to be filled with NOPs. Otherwise it does nothing.
- */
-#define ALIGNTEXT32ifNOP CHOICE(.align 32, .align ARG2(5,0x90), /*can't do it*/)
-#define ALIGNTEXT16ifNOP CHOICE(.align 16, .align ARG2(4,0x90), /*can't do it*/)
-#define ALIGNTEXT8ifNOP CHOICE(.align 8, .align ARG2(3,0x90), /*can't do it*/)
-#define ALIGNTEXT4ifNOP CHOICE(.align 4, .align ARG2(2,0x90), /*can't do it*/)
-#define ALIGNDATA32 CHOICE(.align 32, .align ARG2(5,0x0), .align 32)
-#define ALIGNDATA16 CHOICE(.align 16, .align ARG2(4,0x0), .align 16)
-#define ALIGNDATA8 CHOICE(.align 8, .align ARG2(3,0x0), .align 8)
-#define ALIGNDATA4 CHOICE(.align 4, .align ARG2(2,0x0), .align 4)
-#define ALIGNDATA2 CHOICE(.align 2, .align ARG2(1,0x0), .align 2)
-#endif /* __AOUT__ */
-#define FILE(s) CHOICE(.file s, .file s, .file s)
-#define STRING(s) CHOICE(.string s, .asciz s, .asciz s)
-#define D_LONG CHOICE(.long, .long, .data4)
-#define D_WORD CHOICE(.value, .short, .data2)
-#define D_BYTE CHOICE(.byte, .byte, .data1)
-#define SPACE CHOICE(.comm, .space, .space)
-#define COMM CHOICE(.comm, .comm, .comm)
-#define SEG_DATA CHOICE(.data, .data, .sect .data)
-#define SEG_TEXT CHOICE(.text, .text, .sect .text)
-#define SEG_BSS CHOICE(.bss, .bss, .sect .bss)
-
-#ifdef GNU_ASSEMBLER
-#define D_SPACE(n) . = . + n
-#else
-#define D_SPACE(n) .space n
-#endif
-
-/* Addressing Modes */
-/* Immediate Mode */
-#define ADDR(a) CHOICE(CONCAT($,a), $a, a)
-#define CONST(a) CHOICE(CONCAT($,a), $a, a)
-
-/* Indirect Mode */
-#define CONTENT(a) CHOICE(a, a, (a)) /* take contents of variable */
-#define REGIND(a) CHOICE((a), (a), (a)) /* Register a indirect */
-/* Register b indirect plus displacement a */
-#define REGOFF(a, b) CHOICE(a(b), a(b), a(b))
-/* Reg indirect Base + Index + Displacement - this is mainly for 16-bit mode
- * which has no scaling
- */
-#define REGBID(b,i,d) CHOICE(d(b,i), d(b,i), d(b)(i))
-/* Reg indirect Base + (Index * Scale) */
-#define REGBIS(b,i,s) CHOICE((b,i,s), (b,i,s), (b)(i*s))
-/* Reg indirect Base + (Index * Scale) + Displacement */
-#define REGBISD(b,i,s,d) CHOICE(d(b,i,s), d(b,i,s), d(b)(i*s))
-/* Displaced Scaled Index: */
-#define REGDIS(d,i,s) CHOICE(d(,i,s), d(,i,s), d(i * s))
-/* Indexed Base: */
-#define REGBI(b,i) CHOICE((b,i), (b,i), (b)(i))
-/* Displaced Base: */
-#define REGDB(d,b) CHOICE(d(b), d(b), d(b))
-/* Variable indirect: */
-#define VARINDIRECT(var) CHOICE(*var, *var, (var))
-/* Use register contents as jump/call target: */
-#define CODEPTR(reg) CHOICE(*reg, *reg, reg)
-
-/* For expressions requiring bracketing
- * eg. (CRT0_PM | CRT_EM)
- */
-
-#define EXPR(a) CHOICE([a], (a), [a])
-#define ENOT(a) CHOICE(0!a, ~a, ~a)
-#define EMUL(a,b) CHOICE(a\*b, a*b, a*b)
-#define EDIV(a,b) CHOICE(a\/b, a/b, a/b)
-
-/*
- * We have to beat the problem of commas within arguments to choice.
- * eg. choice (add a,b, add b,a) will get argument mismatch. Luckily ANSI
- * and other known cpp definitions evaluate arguments before substitution
- * so the following works.
- */
-#define ARG2(a, b) a,b
-#define ARG3(a,b,c) a,b,c
-
-/* Redefine assembler commands */
-#define AAA CHOICE(aaa, aaa, aaa)
-#define AAD CHOICE(aad, aad, aad)
-#define AAM CHOICE(aam, aam, aam)
-#define AAS CHOICE(aas, aas, aas)
-#define ADC_L(a, b) CHOICE(adcl ARG2(a,b), adcl ARG2(a,b), _LTOG adc ARG2(b,a))
-#define ADC_W(a, b) CHOICE(adcw ARG2(a,b), adcw ARG2(a,b), _WTOG adc ARG2(b,a))
-#define ADC_B(a, b) CHOICE(adcb ARG2(a,b), adcb ARG2(a,b), adcb ARG2(b,a))
-#define ADD_L(a, b) CHOICE(addl ARG2(a,b), addl ARG2(a,b), _LTOG add ARG2(b,a))
-#define ADD_W(a, b) CHOICE(addw ARG2(a,b), addw ARG2(a,b), _WTOG add ARG2(b,a))
-#define ADD_B(a, b) CHOICE(addb ARG2(a,b), addb ARG2(a,b), addb ARG2(b,a))
-#define AND_L(a, b) CHOICE(andl ARG2(a,b), andl ARG2(a,b), _LTOG and ARG2(b,a))
-#define AND_W(a, b) CHOICE(andw ARG2(a,b), andw ARG2(a,b), _WTOG and ARG2(b,a))
-#define AND_B(a, b) CHOICE(andb ARG2(a,b), andb ARG2(a,b), andb ARG2(b,a))
-#define ARPL(a,b) CHOICE(arpl ARG2(a,b), arpl ARG2(a,b), arpl ARG2(b,a))
-#define BOUND_L(a, b) CHOICE(boundl ARG2(a,b), boundl ARG2(b,a), _LTOG bound ARG2(b,a))
-#define BOUND_W(a, b) CHOICE(boundw ARG2(a,b), boundw ARG2(b,a), _WTOG bound ARG2(b,a))
-#define BSF_L(a, b) CHOICE(bsfl ARG2(a,b), bsfl ARG2(a,b), _LTOG bsf ARG2(b,a))
-#define BSF_W(a, b) CHOICE(bsfw ARG2(a,b), bsfw ARG2(a,b), _WTOG bsf ARG2(b,a))
-#define BSR_L(a, b) CHOICE(bsrl ARG2(a,b), bsrl ARG2(a,b), _LTOG bsr ARG2(b,a))
-#define BSR_W(a, b) CHOICE(bsrw ARG2(a,b), bsrw ARG2(a,b), _WTOG bsr ARG2(b,a))
-#define BT_L(a, b) CHOICE(btl ARG2(a,b), btl ARG2(a,b), _LTOG bt ARG2(b,a))
-#define BT_W(a, b) CHOICE(btw ARG2(a,b), btw ARG2(a,b), _WTOG bt ARG2(b,a))
-#define BTC_L(a, b) CHOICE(btcl ARG2(a,b), btcl ARG2(a,b), _LTOG btc ARG2(b,a))
-#define BTC_W(a, b) CHOICE(btcw ARG2(a,b), btcw ARG2(a,b), _WTOG btc ARG2(b,a))
-#define BTR_L(a, b) CHOICE(btrl ARG2(a,b), btrl ARG2(a,b), _LTOG btr ARG2(b,a))
-#define BTR_W(a, b) CHOICE(btrw ARG2(a,b), btrw ARG2(a,b), _WTOG btr ARG2(b,a))
-#define BTS_L(a, b) CHOICE(btsl ARG2(a,b), btsl ARG2(a,b), _LTOG bts ARG2(b,a))
-#define BTS_W(a, b) CHOICE(btsw ARG2(a,b), btsw ARG2(a,b), _WTOG bts ARG2(b,a))
-#define CALL(a) CHOICE(call a, call a, call a)
-#define CALLF(s,a) CHOICE(lcall ARG2(s,a), lcall ARG2(s,a), callf s:a)
-#define CBW CHOICE(cbtw, cbw, cbw)
-#define CWDE CHOICE(cwtd, cwde, cwde)
-#define CLC CHOICE(clc, clc, clc)
-#define CLD CHOICE(cld, cld, cld)
-#define CLI CHOICE(cli, cli, cli)
-#define CLTS CHOICE(clts, clts, clts)
-#define CMC CHOICE(cmc, cmc, cmc)
-#define CMP_L(a, b) CHOICE(cmpl ARG2(a,b), cmpl ARG2(a,b), _LTOG cmp ARG2(b,a))
-#define CMP_W(a, b) CHOICE(cmpw ARG2(a,b), cmpw ARG2(a,b), _WTOG cmp ARG2(b,a))
-#define CMP_B(a, b) CHOICE(cmpb ARG2(a,b), cmpb ARG2(a,b), cmpb ARG2(b,a))
-#define CMPS_L CHOICE(cmpsl, cmpsl, _LTOG cmps)
-#define CMPS_W CHOICE(cmpsw, cmpsw, _WTOG cmps)
-#define CMPS_B CHOICE(cmpsb, cmpsb, cmpsb)
-#define CWD CHOICE(cwtl, cwd, cwd)
-#define CDQ CHOICE(cltd, cdq, cdq)
-#define DAA CHOICE(daa, daa, daa)
-#define DAS CHOICE(das, das, das)
-#define DEC_L(a) CHOICE(decl a, decl a, _LTOG dec a)
-#define DEC_W(a) CHOICE(decw a, decw a, _WTOG dec a)
-#define DEC_B(a) CHOICE(decb a, decb a, decb a)
-#define DIV_L(a) CHOICE(divl a, divl a, div a)
-#define DIV_W(a) CHOICE(divw a, divw a, div a)
-#define DIV_B(a) CHOICE(divb a, divb a, divb a)
-#define ENTER(a,b) CHOICE(enter ARG2(a,b), enter ARG2(a,b), enter ARG2(b,a))
-#define HLT CHOICE(hlt, hlt, hlt)
-#define IDIV_L(a) CHOICE(idivl a, idivl a, _LTOG idiv a)
-#define IDIV_W(a) CHOICE(idivw a, idivw a, _WTOG idiv a)
-#define IDIV_B(a) CHOICE(idivb a, idivb a, idivb a)
-/* More forms than this for imul!! */
-#define IMUL_L(a, b) CHOICE(imull ARG2(a,b), imull ARG2(a,b), _LTOG imul ARG2(b,a))
-#define IMUL_W(a, b) CHOICE(imulw ARG2(a,b), imulw ARG2(a,b), _WTOG imul ARG2(b,a))
-#define IMUL_B(a) CHOICE(imulb a, imulb a, imulb a)
-#define IN_L CHOICE(inl (DX), inl ARG2(DX,EAX), _LTOG in DX)
-#define IN_W CHOICE(inw (DX), inw ARG2(DX,AX), _WTOG in DX)
-#define IN_B CHOICE(inb (DX), inb ARG2(DX,AL), inb DX)
-/* Please AS code writer: use the following ONLY, if you refer to ports<256
- * directly, but not in IN1_W(DX), for instance, even if IN1_ looks nicer
- */
-#if defined (sun)
-#define IN1_L(a) CHOICE(inl (a), inl ARG2(a,EAX), _LTOG in a)
-#define IN1_W(a) CHOICE(inw (a), inw ARG2(a,AX), _WTOG in a)
-#define IN1_B(a) CHOICE(inb (a), inb ARG2(a,AL), inb a)
-#else
-#define IN1_L(a) CHOICE(inl a, inl ARG2(a,EAX), _LTOG in a)
-#define IN1_W(a) CHOICE(inw a, inw ARG2(a,AX), _WTOG in a)
-#define IN1_B(a) CHOICE(inb a, inb ARG2(a,AL), inb a)
-#endif
-#define INC_L(a) CHOICE(incl a, incl a, _LTOG inc a)
-#define INC_W(a) CHOICE(incw a, incw a, _WTOG inc a)
-#define INC_B(a) CHOICE(incb a, incb a, incb a)
-#define INS_L CHOICE(insl, insl, _LTOG ins)
-#define INS_W CHOICE(insw, insw, _WTOG ins)
-#define INS_B CHOICE(insb, insb, insb)
-#define INT(a) CHOICE(int a, int a, int a)
-#define INT3 CHOICE(int CONST(3), int3, int CONST(3))
-#define INTO CHOICE(into, into, into)
-#define IRET CHOICE(iret, iret, iret)
-#define IRETD CHOICE(iret, iret, iretd)
-#define JA(a) CHOICE(ja a, ja a, ja a)
-#define JAE(a) CHOICE(jae a, jae a, jae a)
-#define JB(a) CHOICE(jb a, jb a, jb a)
-#define JBE(a) CHOICE(jbe a, jbe a, jbe a)
-#define JC(a) CHOICE(jc a, jc a, jc a)
-#define JE(a) CHOICE(je a, je a, je a)
-#define JG(a) CHOICE(jg a, jg a, jg a)
-#define JGE(a) CHOICE(jge a, jge a, jge a)
-#define JL(a) CHOICE(jl a, jl a, jl a)
-#define JLE(a) CHOICE(jle a, jle a, jle a)
-#define JNA(a) CHOICE(jna a, jna a, jna a)
-#define JNAE(a) CHOICE(jnae a, jnae a, jnae a)
-#define JNB(a) CHOICE(jnb a, jnb a, jnb a)
-#define JNBE(a) CHOICE(jnbe a, jnbe a, jnbe a)
-#define JNC(a) CHOICE(jnc a, jnc a, jnc a)
-#define JNE(a) CHOICE(jne a, jne a, jne a)
-#define JNG(a) CHOICE(jng a, jng a, jng a)
-#define JNGE(a) CHOICE(jnge a, jnge a, jnge a)
-#define JNL(a) CHOICE(jnl a, jnl a, jnl a)
-#define JNLE(a) CHOICE(jnle a, jnle a, jnle a)
-#define JNO(a) CHOICE(jno a, jno a, jno a)
-#define JNP(a) CHOICE(jnp a, jnp a, jnp a)
-#define JNS(a) CHOICE(jns a, jns a, jns a)
-#define JNZ(a) CHOICE(jnz a, jnz a, jnz a)
-#define JO(a) CHOICE(jo a, jo a, jo a)
-#define JP(a) CHOICE(jp a, jp a, jp a)
-#define JPE(a) CHOICE(jpe a, jpe a, jpe a)
-#define JPO(a) CHOICE(jpo a, jpo a, jpo a)
-#define JS(a) CHOICE(js a, js a, js a)
-#define JZ(a) CHOICE(jz a, jz a, jz a)
-#define JMP(a) CHOICE(jmp a, jmp a, jmp a)
-#define JMPF(s,a) CHOICE(ljmp ARG2(s,a), ljmp ARG2(s,a), jmpf s:a)
-#define LAHF CHOICE(lahf, lahf, lahf)
-#if !defined(_REAL_MODE) && !defined(_V86_MODE)
-#define LAR(a, b) CHOICE(lar ARG2(a, b), lar ARG2(a, b), lar ARG2(b, a))
-#endif
-#define LEA_L(a, b) CHOICE(leal ARG2(a,b), leal ARG2(a,b), _LTOG lea ARG2(b,a))
-#define LEA_W(a, b) CHOICE(leaw ARG2(a,b), leaw ARG2(a,b), _WTOG lea ARG2(b,a))
-#define LEAVE CHOICE(leave, leave, leave)
-#define LGDT(a) CHOICE(lgdt a, lgdt a, lgdt a)
-#define LIDT(a) CHOICE(lidt a, lidt a, lidt a)
-#define LDS(a, b) CHOICE(ldsl ARG2(a,b), lds ARG2(a,b), lds ARG2(b,a))
-#define LES(a, b) CHOICE(lesl ARG2(a,b), les ARG2(a,b), les ARG2(b,a))
-#define LFS(a, b) CHOICE(lfsl ARG2(a,b), lfs ARG2(a,b), lfs ARG2(b,a))
-#define LGS(a, b) CHOICE(lgsl ARG2(a,b), lgs ARG2(a,b), lgs ARG2(b,a))
-#define LSS(a, b) CHOICE(lssl ARG2(a,b), lss ARG2(a,b), lss ARG2(b,a))
-#define LLDT(a) CHOICE(lldt a, lldt a, lldt a)
-#define LMSW(a) CHOICE(lmsw a, lmsw a, lmsw a)
-#define LOCK CHOICE(lock, lock, lock)
-#define LODS_L CHOICE(lodsl, lodsl, _LTOG lods)
-#define LODS_W CHOICE(lodsw, lodsw, _WTOG lods)
-#define LODS_B CHOICE(lodsb, lodsb, lodsb)
-#define LOOP(a) CHOICE(loop a, loop a, loop a)
-#define LOOPE(a) CHOICE(loope a, loope a, loope a)
-#define LOOPZ(a) CHOICE(loopz a, loopz a, loopz a)
-#define LOOPNE(a) CHOICE(loopne a, loopne a, loopne a)
-#define LOOPNZ(a) CHOICE(loopnz a, loopnz a, loopnz a)
-#if !defined(_REAL_MODE) && !defined(_V86_MODE)
-#define LSL(a, b) CHOICE(lsl ARG2(a,b), lsl ARG2(a,b), lsl ARG2(b,a))
-#endif
-#define LTR(a) CHOICE(ltr a, ltr a, ltr a)
-#define MOV_SR(a, b) CHOICE(movw ARG2(a,b), mov ARG2(a,b), mov ARG2(b,a))
-#define MOV_L(a, b) CHOICE(movl ARG2(a,b), movl ARG2(a,b), _LTOG mov ARG2(b,a))
-#define MOV_W(a, b) CHOICE(movw ARG2(a,b), movw ARG2(a,b), _WTOG mov ARG2(b,a))
-#define MOV_B(a, b) CHOICE(movb ARG2(a,b), movb ARG2(a,b), movb ARG2(b,a))
-#define MOVS_L CHOICE(movsl, movsl, _LTOG movs)
-#define MOVS_W CHOICE(movsw, movsw, _WTOG movs)
-#define MOVS_B CHOICE(movsb, movsb, movsb)
-#define MOVSX_BL(a, b) CHOICE(movsbl ARG2(a,b), movsbl ARG2(a,b), movsx ARG2(b,a))
-#define MOVSX_BW(a, b) CHOICE(movsbw ARG2(a,b), movsbw ARG2(a,b), movsx ARG2(b,a))
-#define MOVSX_WL(a, b) CHOICE(movswl ARG2(a,b), movswl ARG2(a,b), movsx ARG2(b,a))
-#define MOVZX_BL(a, b) CHOICE(movzbl ARG2(a,b), movzbl ARG2(a,b), movzx ARG2(b,a))
-#define MOVZX_BW(a, b) CHOICE(movzbw ARG2(a,b), movzbw ARG2(a,b), movzx ARG2(b,a))
-#define MOVZX_WL(a, b) CHOICE(movzwl ARG2(a,b), movzwl ARG2(a,b), movzx ARG2(b,a))
-#define MUL_L(a) CHOICE(mull a, mull a, _LTOG mul a)
-#define MUL_W(a) CHOICE(mulw a, mulw a, _WTOG mul a)
-#define MUL_B(a) CHOICE(mulb a, mulb a, mulb a)
-#define NEG_L(a) CHOICE(negl a, negl a, _LTOG neg a)
-#define NEG_W(a) CHOICE(negw a, negw a, _WTOG neg a)
-#define NEG_B(a) CHOICE(negb a, negb a, negb a)
-#define NOP CHOICE(nop, nop, nop)
-#define NOT_L(a) CHOICE(notl a, notl a, _LTOG not a)
-#define NOT_W(a) CHOICE(notw a, notw a, _WTOG not a)
-#define NOT_B(a) CHOICE(notb a, notb a, notb a)
-#define OR_L(a,b) CHOICE(orl ARG2(a,b), orl ARG2(a,b), _LTOG or ARG2(b,a))
-#define OR_W(a,b) CHOICE(orw ARG2(a,b), orw ARG2(a,b), _WTOG or ARG2(b,a))
-#define OR_B(a,b) CHOICE(orb ARG2(a,b), orb ARG2(a,b), orb ARG2(b,a))
-#define OUT_L CHOICE(outl (DX), outl ARG2(EAX,DX), _LTOG out DX)
-#define OUT_W CHOICE(outw (DX), outw ARG2(AX,DX), _WTOG out DX)
-#define OUT_B CHOICE(outb (DX), outb ARG2(AL,DX), outb DX)
-/* Please AS code writer: use the following ONLY, if you refer to ports<256
- * directly, but not in OUT1_W(DX), for instance, even if OUT1_ looks nicer
- */
-#define OUT1_L(a) CHOICE(outl (a), outl ARG2(EAX,a), _LTOG out a)
-#define OUT1_W(a) CHOICE(outw (a), outw ARG2(AX,a), _WTOG out a)
-#define OUT1_B(a) CHOICE(outb (a), outb ARG2(AL,a), outb a)
-#define OUTS_L CHOICE(outsl, outsl, _LTOG outs)
-#define OUTS_W CHOICE(outsw, outsw, _WTOG outs)
-#define OUTS_B CHOICE(outsb, outsb, outsb)
-#define POP_SR(a) CHOICE(pop a, pop a, pop a)
-#define POP_L(a) CHOICE(popl a, popl a, _LTOG pop a)
-#define POP_W(a) CHOICE(popw a, popw a, _WTOG pop a)
-#define POPA_L CHOICE(popal, popal, _LTOG popa)
-#define POPA_W CHOICE(popaw, popaw, _WTOG popa)
-#define POPF_L CHOICE(popfl, popfl, _LTOG popf)
-#define POPF_W CHOICE(popfw, popfw, _WTOG popf)
-#define PUSH_SR(a) CHOICE(push a, push a, push a)
-#define PUSH_L(a) CHOICE(pushl a, pushl a, _LTOG push a)
-#define PUSH_W(a) CHOICE(pushw a, pushw a, _WTOG push a)
-#define PUSH_B(a) CHOICE(push a, pushb a, push a)
-#define PUSHA_L CHOICE(pushal, pushal, _LTOG pusha)
-#define PUSHA_W CHOICE(pushaw, pushaw, _WTOG pusha)
-#define PUSHF_L CHOICE(pushfl, pushfl, _LTOG pushf)
-#define PUSHF_W CHOICE(pushfw, pushfw, _WTOG pushf)
-#define RCL_L(a, b) CHOICE(rcll ARG2(a,b), rcll ARG2(a,b), _LTOG rcl ARG2(b,a))
-#define RCL_W(a, b) CHOICE(rclw ARG2(a,b), rclw ARG2(a,b), _WTOG rcl ARG2(b,a))
-#define RCL_B(a, b) CHOICE(rclb ARG2(a,b), rclb ARG2(a,b), rclb ARG2(b,a))
-#define RCR_L(a, b) CHOICE(rcrl ARG2(a,b), rcrl ARG2(a,b), _LTOG rcr ARG2(b,a))
-#define RCR_W(a, b) CHOICE(rcrw ARG2(a,b), rcrw ARG2(a,b), _WTOG rcr ARG2(b,a))
-#define RCR_B(a, b) CHOICE(rcrb ARG2(a,b), rcrb ARG2(a,b), rcrb ARG2(b,a))
-#define ROL_L(a, b) CHOICE(roll ARG2(a,b), roll ARG2(a,b), _LTOG rol ARG2(b,a))
-#define ROL_W(a, b) CHOICE(rolw ARG2(a,b), rolw ARG2(a,b), _WTOG rol ARG2(b,a))
-#define ROL_B(a, b) CHOICE(rolb ARG2(a,b), rolb ARG2(a,b), rolb ARG2(b,a))
-#define ROR_L(a, b) CHOICE(rorl ARG2(a,b), rorl ARG2(a,b), _LTOG ror ARG2(b,a))
-#define ROR_W(a, b) CHOICE(rorw ARG2(a,b), rorw ARG2(a,b), _WTOG ror ARG2(b,a))
-#define ROR_B(a, b) CHOICE(rorb ARG2(a,b), rorb ARG2(a,b), rorb ARG2(b,a))
-#define REP CHOICE(rep ;, rep ;, repe)
-#define REPE CHOICE(repz ;, repe ;, repe)
-#define REPNE CHOICE(repnz ;, repne ;, repne)
-#define REPNZ REPNE
-#define REPZ REPE
-#define RET CHOICE(ret, ret, ret)
-#define SAHF CHOICE(sahf, sahf, sahf)
-#define SAL_L(a, b) CHOICE(sall ARG2(a,b), sall ARG2(a,b), _LTOG sal ARG2(b,a))
-#define SAL_W(a, b) CHOICE(salw ARG2(a,b), salw ARG2(a,b), _WTOG sal ARG2(b,a))
-#define SAL_B(a, b) CHOICE(salb ARG2(a,b), salb ARG2(a,b), salb ARG2(b,a))
-#define SAR_L(a, b) CHOICE(sarl ARG2(a,b), sarl ARG2(a,b), _LTOG sar ARG2(b,a))
-#define SAR_W(a, b) CHOICE(sarw ARG2(a,b), sarw ARG2(a,b), _WTOG sar ARG2(b,a))
-#define SAR_B(a, b) CHOICE(sarb ARG2(a,b), sarb ARG2(a,b), sarb ARG2(b,a))
-#define SBB_L(a, b) CHOICE(sbbl ARG2(a,b), sbbl ARG2(a,b), _LTOG sbb ARG2(b,a))
-#define SBB_W(a, b) CHOICE(sbbw ARG2(a,b), sbbw ARG2(a,b), _WTOG sbb ARG2(b,a))
-#define SBB_B(a, b) CHOICE(sbbb ARG2(a,b), sbbb ARG2(a,b), sbbb ARG2(b,a))
-#define SCAS_L CHOICE(scasl, scasl, _LTOG scas)
-#define SCAS_W CHOICE(scasw, scasw, _WTOG scas)
-#define SCAS_B CHOICE(scasb, scasb, scasb)
-#define SETA(a) CHOICE(seta a, seta a, seta a)
-#define SETAE(a) CHOICE(setae a, setae a, setae a)
-#define SETB(a) CHOICE(setb a, setb a, setb a)
-#define SETBE(a) CHOICE(setbe a, setbe a, setbe a)
-#define SETC(a) CHOICE(setc a, setb a, setb a)
-#define SETE(a) CHOICE(sete a, sete a, sete a)
-#define SETG(a) CHOICE(setg a, setg a, setg a)
-#define SETGE(a) CHOICE(setge a, setge a, setge a)
-#define SETL(a) CHOICE(setl a, setl a, setl a)
-#define SETLE(a) CHOICE(setle a, setle a, setle a)
-#define SETNA(a) CHOICE(setna a, setna a, setna a)
-#define SETNAE(a) CHOICE(setnae a, setnae a, setnae a)
-#define SETNB(a) CHOICE(setnb a, setnb a, setnb a)
-#define SETNBE(a) CHOICE(setnbe a, setnbe a, setnbe a)
-#define SETNC(a) CHOICE(setnc a, setnb a, setnb a)
-#define SETNE(a) CHOICE(setne a, setne a, setne a)
-#define SETNG(a) CHOICE(setng a, setng a, setng a)
-#define SETNGE(a) CHOICE(setnge a, setnge a, setnge a)
-#define SETNL(a) CHOICE(setnl a, setnl a, setnl a)
-#define SETNLE(a) CHOICE(setnle a, setnle a, setnle a)
-#define SETNO(a) CHOICE(setno a, setno a, setno a)
-#define SETNP(a) CHOICE(setnp a, setnp a, setnp a)
-#define SETNS(a) CHOICE(setns a, setns a, setna a)
-#define SETNZ(a) CHOICE(setnz a, setnz a, setnz a)
-#define SETO(a) CHOICE(seto a, seto a, seto a)
-#define SETP(a) CHOICE(setp a, setp a, setp a)
-#define SETPE(a) CHOICE(setpe a, setpe a, setpe a)
-#define SETPO(a) CHOICE(setpo a, setpo a, setpo a)
-#define SETS(a) CHOICE(sets a, sets a, seta a)
-#define SETZ(a) CHOICE(setz a, setz a, setz a)
-#define SGDT(a) CHOICE(sgdt a, sgdt a, sgdt a)
-#define SIDT(a) CHOICE(sidt a, sidt a, sidt a)
-#define SHL_L(a, b) CHOICE(shll ARG2(a,b), shll ARG2(a,b), _LTOG shl ARG2(b,a))
-#define SHL_W(a, b) CHOICE(shlw ARG2(a,b), shlw ARG2(a,b), _WTOG shl ARG2(b,a))
-#define SHL_B(a, b) CHOICE(shlb ARG2(a,b), shlb ARG2(a,b), shlb ARG2(b,a))
-#define SHLD_L(a,b,c) CHOICE(shldl ARG3(a,b,c), shldl ARG3(a,b,c), _LTOG shld ARG3(c,b,a))
-#define SHLD2_L(a,b) CHOICE(shldl ARG2(a,b), shldl ARG3(CL,a,b), _LTOG shld ARG3(b,a,CL))
-#define SHLD_W(a,b,c) CHOICE(shldw ARG3(a,b,c), shldw ARG3(a,b,c), _WTOG shld ARG3(c,b,a))
-#define SHLD2_W(a,b) CHOICE(shldw ARG2(a,b), shldw ARG3(CL,a,b), _WTOG shld ARG3(b,a,CL))
-#define SHR_L(a, b) CHOICE(shrl ARG2(a,b), shrl ARG2(a,b), _LTOG shr ARG2(b,a))
-#define SHR_W(a, b) CHOICE(shrw ARG2(a,b), shrw ARG2(a,b), _WTOG shr ARG2(b,a))
-#define SHR_B(a, b) CHOICE(shrb ARG2(a,b), shrb ARG2(a,b), shrb ARG2(b,a))
-#define SHRD_L(a,b,c) CHOICE(shrdl ARG3(a,b,c), shrdl ARG3(a,b,c), _LTOG shrd ARG3(c,b,a))
-#define SHRD2_L(a,b) CHOICE(shrdl ARG2(a,b), shrdl ARG3(CL,a,b), _LTOG shrd ARG3(b,a,CL))
-#define SHRD_W(a,b,c) CHOICE(shrdw ARG3(a,b,c), shrdw ARG3(a,b,c), _WTOG shrd ARG3(c,b,a))
-#define SHRD2_W(a,b) CHOICE(shrdw ARG2(a,b), shrdw ARG3(CL,a,b), _WTOG shrd ARG3(b,a,CL))
-#define SLDT(a) CHOICE(sldt a, sldt a, sldt a)
-#define SMSW(a) CHOICE(smsw a, smsw a, smsw a)
-#define STC CHOICE(stc, stc, stc)
-#define STD CHOICE(std, std, std)
-#define STI CHOICE(sti, sti, sti)
-#define STOS_L CHOICE(stosl, stosl, _LTOG stos)
-#define STOS_W CHOICE(stosw, stosw, _WTOG stos)
-#define STOS_B CHOICE(stosb, stosb, stosb)
-#define STR(a) CHOICE(str a, str a, str a)
-#define SUB_L(a, b) CHOICE(subl ARG2(a,b), subl ARG2(a,b), _LTOG sub ARG2(b,a))
-#define SUB_W(a, b) CHOICE(subw ARG2(a,b), subw ARG2(a,b), _WTOG sub ARG2(b,a))
-#define SUB_B(a, b) CHOICE(subb ARG2(a,b), subb ARG2(a,b), subb ARG2(b,a))
-#define TEST_L(a, b) CHOICE(testl ARG2(a,b), testl ARG2(a,b), _LTOG test ARG2(b,a))
-#define TEST_W(a, b) CHOICE(testw ARG2(a,b), testw ARG2(a,b), _WTOG test ARG2(b,a))
-#define TEST_B(a, b) CHOICE(testb ARG2(a,b), testb ARG2(a,b), testb ARG2(b,a))
-#define VERR(a) CHOICE(verr a, verr a, verr a)
-#define VERW(a) CHOICE(verw a, verw a, verw a)
-#define WAIT CHOICE(wait, wait, wait)
-#define XCHG_L(a, b) CHOICE(xchgl ARG2(a,b), xchgl ARG2(a,b), _LTOG xchg ARG2(b,a))
-#define XCHG_W(a, b) CHOICE(xchgw ARG2(a,b), xchgw ARG2(a,b), _WTOG xchg ARG2(b,a))
-#define XCHG_B(a, b) CHOICE(xchgb ARG2(a,b), xchgb ARG2(a,b), xchgb ARG2(b,a))
-#define XLAT CHOICE(xlat, xlat, xlat)
-#define XOR_L(a, b) CHOICE(xorl ARG2(a,b), xorl ARG2(a,b), _LTOG xor ARG2(b,a))
-#define XOR_W(a, b) CHOICE(xorw ARG2(a,b), xorw ARG2(a,b), _WTOG xor ARG2(b,a))
-#define XOR_B(a, b) CHOICE(xorb ARG2(a,b), xorb ARG2(a,b), xorb ARG2(b,a))
-
-
-/* Floating Point Instructions */
-#define F2XM1 CHOICE(f2xm1, f2xm1, f2xm1)
-#define FABS CHOICE(fabs, fabs, fabs)
-#define FADD_D(a) CHOICE(faddl a, faddl a, faddd a)
-#define FADD_S(a) CHOICE(fadds a, fadds a, fadds a)
-#define FADD2(a, b) CHOICE(fadd ARG2(a,b), fadd ARG2(a,b), fadd ARG2(b,a))
-#define FADDP(a, b) CHOICE(faddp ARG2(a,b), faddp ARG2(a,b), faddp ARG2(b,a))
-#define FIADD_L(a) CHOICE(fiaddl a, fiaddl a, fiaddl a)
-#define FIADD_W(a) CHOICE(fiadd a, fiadds a, fiadds a)
-#define FBLD(a) CHOICE(fbld a, fbld a, fbld a)
-#define FBSTP(a) CHOICE(fbstp a, fbstp a, fbstp a)
-#define FCHS CHOICE(fchs, fchs, fchs)
-#define FCLEX CHOICE(fclex, wait; fnclex, wait; fclex)
-#define FNCLEX CHOICE(fnclex, fnclex, fclex)
-#define FCOM(a) CHOICE(fcom a, fcom a, fcom a)
-#define FCOM_D(a) CHOICE(fcoml a, fcoml a, fcomd a)
-#define FCOM_S(a) CHOICE(fcoms a, fcoms a, fcoms a)
-#define FCOMP(a) CHOICE(fcomp a, fcomp a, fcomp a)
-#define FCOMP_D(a) CHOICE(fcompl a, fcompl a, fcompd a)
-#define FCOMP_S(a) CHOICE(fcomps a, fcomps a, fcomps a)
-#define FCOMPP CHOICE(fcompp, fcompp, fcompp)
-#define FCOS CHOICE(fcos, fcos, fcos)
-#define FDECSTP CHOICE(fdecstp, fdecstp, fdecstp)
-#define FDIV_D(a) CHOICE(fdivl a, fdivl a, fdivd a)
-#define FDIV_S(a) CHOICE(fdivs a, fdivs a, fdivs a)
-#define FDIV2(a, b) CHOICE(fdiv ARG2(a,b), fdiv ARG2(a,b), fdiv ARG2(b,a))
-#define FDIVP(a, b) CHOICE(fdivp ARG2(a,b), fdivp ARG2(a,b), fdivp ARG2(b,a))
-#define FIDIV_L(a) CHOICE(fidivl a, fidivl a, fidivl a)
-#define FIDIV_W(a) CHOICE(fidiv a, fidivs a, fidivs a)
-#define FDIVR_D(a) CHOICE(fdivrl a, fdivrl a, fdivrd a)
-#define FDIVR_S(a) CHOICE(fdivrs a, fdivrs a, fdivrs a)
-#define FDIVR2(a, b) CHOICE(fdivr ARG2(a,b), fdivr ARG2(a,b), fdivr ARG2(b,a))
-#define FDIVRP(a, b) CHOICE(fdivrp ARG2(a,b), fdivrp ARG2(a,b), fdivrp ARG2(b,a))
-#define FIDIVR_L(a) CHOICE(fidivrl a, fidivrl a, fidivrl a)
-#define FIDIVR_W(a) CHOICE(fidivr a, fidivrs a, fidivrs a)
-#define FFREE(a) CHOICE(ffree a, ffree a, ffree a)
-#define FICOM_L(a) CHOICE(ficoml a, ficoml a, ficoml a)
-#define FICOM_W(a) CHOICE(ficom a, ficoms a, ficoms a)
-#define FICOMP_L(a) CHOICE(ficompl a, ficompl a, ficompl a)
-#define FICOMP_W(a) CHOICE(ficomp a, ficomps a, ficomps a)
-#define FILD_Q(a) CHOICE(fildll a, fildq a, fildq a)
-#define FILD_L(a) CHOICE(fildl a, fildl a, fildl a)
-#define FILD_W(a) CHOICE(fild a, filds a, filds a)
-#define FINCSTP CHOICE(fincstp, fincstp, fincstp)
-#define FINIT CHOICE(finit, wait; fninit, wait; finit)
-#define FNINIT CHOICE(fninit, fninit, finit)
-#define FIST_L(a) CHOICE(fistl a, fistl a, fistl a)
-#define FIST_W(a) CHOICE(fist a, fists a, fists a)
-#define FISTP_Q(a) CHOICE(fistpll a, fistpq a, fistpq a)
-#define FISTP_L(a) CHOICE(fistpl a, fistpl a, fistpl a)
-#define FISTP_W(a) CHOICE(fistp a, fistps a, fistps a)
-#define FLD_X(a) CHOICE(fldt a, fldt a, fldx a) /* 80 bit data type! */
-#define FLD_D(a) CHOICE(fldl a, fldl a, fldd a)
-#define FLD_S(a) CHOICE(flds a, flds a, flds a)
-#define FLD1 CHOICE(fld1, fld1, fld1)
-#define FLDL2T CHOICE(fldl2t, fldl2t, fldl2t)
-#define FLDL2E CHOICE(fldl2e, fldl2e, fldl2e)
-#define FLDPI CHOICE(fldpi, fldpi, fldpi)
-#define FLDLG2 CHOICE(fldlg2, fldlg2, fldlg2)
-#define FLDLN2 CHOICE(fldln2, fldln2, fldln2)
-#define FLDZ CHOICE(fldz, fldz, fldz)
-#define FLDCW(a) CHOICE(fldcw a, fldcw a, fldcw a)
-#define FLDENV(a) CHOICE(fldenv a, fldenv a, fldenv a)
-#define FMUL_S(a) CHOICE(fmuls a, fmuls a, fmuls a)
-#define FMUL_D(a) CHOICE(fmull a, fmull a, fmuld a)
-#define FMUL2(a, b) CHOICE(fmul ARG2(a,b), fmul ARG2(a,b), fmul ARG2(b,a))
-#define FMULP(a, b) CHOICE(fmulp ARG2(a,b), fmulp ARG2(a,b), fmulp ARG2(b,a))
-#define FIMUL_L(a) CHOICE(fimull a, fimull a, fimull a)
-#define FIMUL_W(a) CHOICE(fimul a, fimuls a, fimuls a)
-#define FNOP CHOICE(fnop, fnop, fnop)
-#define FPATAN CHOICE(fpatan, fpatan, fpatan)
-#define FPREM CHOICE(fprem, fprem, fprem)
-#define FPREM1 CHOICE(fprem1, fprem1, fprem1)
-#define FPTAN CHOICE(fptan, fptan, fptan)
-#define FRNDINT CHOICE(frndint, frndint, frndint)
-#define FRSTOR(a) CHOICE(frstor a, frstor a, frstor a)
-#define FSAVE(a) CHOICE(fsave a, wait; fnsave a, wait; fsave a)
-#define FNSAVE(a) CHOICE(fnsave a, fnsave a, fsave a)
-#define FSCALE CHOICE(fscale, fscale, fscale)
-#define FSIN CHOICE(fsin, fsin, fsin)
-#define FSINCOS CHOICE(fsincos, fsincos, fsincos)
-#define FSQRT CHOICE(fsqrt, fsqrt, fsqrt)
-#define FST_D(a) CHOICE(fstl a, fstl a, fstd a)
-#define FST_S(a) CHOICE(fsts a, fsts a, fsts a)
-#define FSTP_X(a) CHOICE(fstpt a, fstpt a, fstpx a)
-#define FSTP_D(a) CHOICE(fstpl a, fstpl a, fstpd a)
-#define FSTP_S(a) CHOICE(fstps a, fstps a, fstps a)
-#define FSTP(a) CHOICE(fstp a, fstp a, fstp a)
-#define FSTCW(a) CHOICE(fstcw a, wait; fnstcw a, wait; fstcw a)
-#define FNSTCW(a) CHOICE(fnstcw a, fnstcw a, fstcw a)
-#define FSTENV(a) CHOICE(fstenv a, wait; fnstenv a, fstenv a)
-#define FNSTENV(a) CHOICE(fnstenv a, fnstenv a, fstenv a)
-#define FSTSW(a) CHOICE(fstsw a, wait; fnstsw a, wait; fstsw a)
-#define FNSTSW(a) CHOICE(fnstsw a, fnstsw a, fstsw a)
-#define FSUB_S(a) CHOICE(fsubs a, fsubs a, fsubs a)
-#define FSUB_D(a) CHOICE(fsubl a, fsubl a, fsubd a)
-#define FSUB2(a, b) CHOICE(fsub ARG2(a,b), fsub ARG2(a,b), fsub ARG2(b,a))
-#define FSUBP(a, b) CHOICE(fsubp ARG2(a,b), fsubp ARG2(a,b), fsubp ARG2(b,a))
-#define FISUB_L(a) CHOICE(fisubl a, fisubl a, fisubl a)
-#define FISUB_W(a) CHOICE(fisub a, fisubs a, fisubs a)
-#define FSUBR_S(a) CHOICE(fsubrs a, fsubrs a, fsubrs a)
-#define FSUBR_D(a) CHOICE(fsubrl a, fsubrl a, fsubrd a)
-#define FSUBR2(a, b) CHOICE(fsubr ARG2(a,b), fsubr ARG2(a,b), fsubr ARG2(b,a))
-#define FSUBRP(a, b) CHOICE(fsubrp ARG2(a,b), fsubrp ARG2(a,b), fsubrp ARG2(b,a))
-#define FISUBR_L(a) CHOICE(fisubrl a, fisubrl a, fisubrl a)
-#define FISUBR_W(a) CHOICE(fisubr a, fisubrs a, fisubrs a)
-#define FTST CHOICE(ftst, ftst, ftst)
-#define FUCOM(a) CHOICE(fucom a, fucom a, fucom a)
-#define FUCOMP(a) CHOICE(fucomp a, fucomp a, fucomp a)
-#define FUCOMPP CHOICE(fucompp, fucompp, fucompp)
-#define FWAIT CHOICE(wait, wait, wait)
-#define FXAM CHOICE(fxam, fxam, fxam)
-#define FXCH(a) CHOICE(fxch a, fxch a, fxch a)
-#define FXTRACT CHOICE(fxtract, fxtract, fxtract)
-#define FYL2X CHOICE(fyl2x, fyl2x, fyl2x)
-#define FYL2XP1 CHOICE(fyl2xp1, fyl2xp1, fyl2xp1)
-
-/* New instructions */
-#define CPUID CHOICE(D_BYTE ARG2(15, 162), cpuid, D_BYTE ARG2(15, 162))
-#define RDTSC CHOICE(D_BYTE ARG2(15, 49), rdtsc, D_BYTE ARG2(15, 49))
-
-#else /* NASM_ASSEMBLER || MASM_ASSEMBLER is defined */
-
- /****************************************/
- /* */
- /* Intel style assemblers. */
- /* (NASM and MASM) */
- /* */
- /****************************************/
-
-#define P_EAX EAX
-#define L_EAX EAX
-#define W_AX AX
-#define B_AH AH
-#define B_AL AL
-
-#define P_EBX EBX
-#define L_EBX EBX
-#define W_BX BX
-#define B_BH BH
-#define B_BL BL
-
-#define P_ECX ECX
-#define L_ECX ECX
-#define W_CX CX
-#define B_CH CH
-#define B_CL CL
-
-#define P_EDX EDX
-#define L_EDX EDX
-#define W_DX DX
-#define B_DH DH
-#define B_DL DL
-
-#define P_EBP EBP
-#define L_EBP EBP
-#define W_BP BP
-
-#define P_ESI ESI
-#define L_ESI ESI
-#define W_SI SI
-
-#define P_EDI EDI
-#define L_EDI EDI
-#define W_DI DI
-
-#define P_ESP ESP
-#define L_ESP ESP
-#define W_SP SP
-
-#define W_CS CS
-#define W_SS SS
-#define W_DS DS
-#define W_ES ES
-#define W_FS FS
-#define W_GS GS
-
-#define X_ST ST
-#define D_ST ST
-#define L_ST ST
-
-#define P_MM0 mm0
-#define P_MM1 mm1
-#define P_MM2 mm2
-#define P_MM3 mm3
-#define P_MM4 mm4
-#define P_MM5 mm5
-#define P_MM6 mm6
-#define P_MM7 mm7
-
-#define P_XMM0 xmm0
-#define P_XMM1 xmm1
-#define P_XMM2 xmm2
-#define P_XMM3 xmm3
-#define P_XMM4 xmm4
-#define P_XMM5 xmm5
-#define P_XMM6 xmm6
-#define P_XMM7 xmm7
-
-#define CONCAT(x, y) x ## y
-#define CONCAT3(x, y, z) x ## y ## z
-
-#if defined(NASM_ASSEMBLER)
-
-#define ST(n) st ## n
-#define ST0 st0
-
-#define TBYTE_PTR tword
-#define QWORD_PTR qword
-#define DWORD_PTR dword
-#define WORD_PTR word
-#define BYTE_PTR byte
-
-#define OFFSET
-
-#define GLOBL GLOBAL
-#define ALIGNTEXT32 ALIGN 32
-#define ALIGNTEXT16 ALIGN 16
-#define ALIGNTEXT8 ALIGN 8
-#define ALIGNTEXT4 ALIGN 4
-#define ALIGNTEXT2 ALIGN 2
-#define ALIGNTEXT32ifNOP ALIGN 32
-#define ALIGNTEXT16ifNOP ALIGN 16
-#define ALIGNTEXT8ifNOP ALIGN 8
-#define ALIGNTEXT4ifNOP ALIGN 4
-#define ALIGNDATA32 ALIGN 32
-#define ALIGNDATA16 ALIGN 16
-#define ALIGNDATA8 ALIGN 8
-#define ALIGNDATA4 ALIGN 4
-#define ALIGNDATA2 ALIGN 2
-#define FILE(s)
-#define STRING(s) db s
-#define D_LONG dd
-#define D_WORD dw
-#define D_BYTE db
-/* #define SPACE */
-/* #define COMM */
-#if defined(__WATCOMC__)
-SECTION _TEXT public align=16 class=CODE use32 flat
-SECTION _DATA public align=16 class=DATA use32 flat
-#define SEG_TEXT SECTION _TEXT
-#define SEG_DATA SECTION _DATA
-#define SEG_BSS SECTION .bss
-#else
-#define SEG_DATA SECTION .data
-#define SEG_TEXT SECTION .text
-#define SEG_BSS SECTION .bss
-#endif
-
-#define D_SPACE(n) db n REP 0
-
-#define AS_BEGIN
-
-/* Jcc's should be handled better than this... */
-#define NEAR near
-
-#else /* MASM */
-
-#define TBYTE_PTR tbyte ptr
-#define QWORD_PTR qword ptr
-#define DWORD_PTR dword ptr
-#define WORD_PTR word ptr
-#define BYTE_PTR byte ptr
-
-#define OFFSET offset
-
-#define GLOBL GLOBAL
-#define ALIGNTEXT32 ALIGN 32
-#define ALIGNTEXT16 ALIGN 16
-#define ALIGNTEXT8 ALIGN 8
-#define ALIGNTEXT4 ALIGN 4
-#define ALIGNTEXT2 ALIGN 2
-#define ALIGNTEXT32ifNOP ALIGN 32
-#define ALIGNTEXT16ifNOP ALIGN 16
-#define ALIGNTEXT8ifNOP ALIGN 8
-#define ALIGNTEXT4ifNOP ALIGN 4
-#define ALIGNDATA32 ALIGN 32
-#define ALIGNDATA16 ALIGN 16
-#define ALIGNDATA8 ALIGN 8
-#define ALIGNDATA4 ALIGN 4
-#define ALIGNDATA2 ALIGN 2
-#define FILE(s)
-#define STRING(s) db s
-#define D_LONG dd
-#define D_WORD dw
-#define D_BYTE db
-/* #define SPACE */
-/* #define COMM */
-#define SEG_DATA .DATA
-#define SEG_TEXT .CODE
-#define SEG_BSS .DATA
-
-#define D_SPACE(n) db n REP 0
-
-#define AS_BEGIN
-
-#define NEAR
-
-#endif
-
-#if defined(Lynx) || (defined(SYSV) || defined(SVR4)) \
- || (defined(__linux__) || defined(__OS2ELF__)) && defined(__ELF__) \
- || (defined(__FreeBSD__) && __FreeBSD__ >= 3) \
- || (defined(__NetBSD__) && defined(__ELF__))
-#define GLNAME(a) a
-#else
-#define GLNAME(a) CONCAT(_, a)
-#endif
-
-/*
- * Addressing Modes
- */
-
-/* Immediate Mode */
-#define P_ADDR(a) OFFSET a
-#define X_ADDR(a) OFFSET a
-#define D_ADDR(a) OFFSET a
-#define L_ADDR(a) OFFSET a
-#define W_ADDR(a) OFFSET a
-#define B_ADDR(a) OFFSET a
-
-#define P_CONST(a) a
-#define X_CONST(a) a
-#define D_CONST(a) a
-#define L_CONST(a) a
-#define W_CONST(a) a
-#define B_CONST(a) a
-
-/* Indirect Mode */
-#ifdef NASM_ASSEMBLER
-#define P_CONTENT(a) [a]
-#define X_CONTENT(a) TBYTE_PTR [a]
-#define D_CONTENT(a) QWORD_PTR [a]
-#define L_CONTENT(a) DWORD_PTR [a]
-#define W_CONTENT(a) WORD_PTR [a]
-#define B_CONTENT(a) BYTE_PTR [a]
-#else
-#define P_CONTENT(a) a
-#define X_CONTENT(a) TBYTE_PTR a
-#define D_CONTENT(a) QWORD_PTR a
-#define L_CONTENT(a) DWORD_PTR a
-#define W_CONTENT(a) WORD_PTR a
-#define B_CONTENT(a) BYTE_PTR a
-#endif
-
-/* Register a indirect */
-#define P_REGIND(a) [a]
-#define X_REGIND(a) TBYTE_PTR [a]
-#define D_REGIND(a) QWORD_PTR [a]
-#define L_REGIND(a) DWORD_PTR [a]
-#define W_REGIND(a) WORD_PTR [a]
-#define B_REGIND(a) BYTE_PTR [a]
-
-/* Register b indirect plus displacement a */
-#define P_REGOFF(a, b) [b + a]
-#define X_REGOFF(a, b) TBYTE_PTR [b + a]
-#define D_REGOFF(a, b) QWORD_PTR [b + a]
-#define L_REGOFF(a, b) DWORD_PTR [b + a]
-#define W_REGOFF(a, b) WORD_PTR [b + a]
-#define B_REGOFF(a, b) BYTE_PTR [b + a]
-
-/* Reg indirect Base + Index + Displacement - this is mainly for 16-bit mode
- * which has no scaling
- */
-#define P_REGBID(b, i, d) [b + i + d]
-#define X_REGBID(b, i, d) TBYTE_PTR [b + i + d]
-#define D_REGBID(b, i, d) QWORD_PTR [b + i + d]
-#define L_REGBID(b, i, d) DWORD_PTR [b + i + d]
-#define W_REGBID(b, i, d) WORD_PTR [b + i + d]
-#define B_REGBID(b, i, d) BYTE_PTR [b + i + d]
-
-/* Reg indirect Base + (Index * Scale) */
-#define P_REGBIS(b, i, s) [b + i * s]
-#define X_REGBIS(b, i, s) TBYTE_PTR [b + i * s]
-#define D_REGBIS(b, i, s) QWORD_PTR [b + i * s]
-#define L_REGBIS(b, i, s) DWORD_PTR [b + i * s]
-#define W_REGBIS(b, i, s) WORD_PTR [b + i * s]
-#define B_REGBIS(b, i, s) BYTE_PTR [b + i * s]
-
-/* Reg indirect Base + (Index * Scale) + Displacement */
-#define P_REGBISD(b, i, s, d) [b + i * s + d]
-#define X_REGBISD(b, i, s, d) TBYTE_PTR [b + i * s + d]
-#define D_REGBISD(b, i, s, d) QWORD_PTR [b + i * s + d]
-#define L_REGBISD(b, i, s, d) DWORD_PTR [b + i * s + d]
-#define W_REGBISD(b, i, s, d) WORD_PTR [b + i * s + d]
-#define B_REGBISD(b, i, s, d) BYTE_PTR [b + i * s + d]
-
-/* Displaced Scaled Index: */
-#define P_REGDIS(d, i, s) [i * s + d]
-#define X_REGDIS(d, i, s) TBYTE_PTR [i * s + d]
-#define D_REGDIS(d, i, s) QWORD_PTR [i * s + d]
-#define L_REGDIS(d, i, s) DWORD_PTR [i * s + d]
-#define W_REGDIS(d, i, s) WORD_PTR [i * s + d]
-#define B_REGDIS(d, i, s) BYTE_PTR [i * s + d]
-
-/* Indexed Base: */
-#define P_REGBI(b, i) [b + i]
-#define X_REGBI(b, i) TBYTE_PTR [b + i]
-#define D_REGBI(b, i) QWORD_PTR [b + i]
-#define L_REGBI(b, i) DWORD_PTR [b + i]
-#define W_REGBI(b, i) WORD_PTR [b + i]
-#define B_REGBI(b, i) BYTE_PTR [b + i]
-
-/* Displaced Base: */
-#define P_REGDB(d, b) [b + d]
-#define X_REGDB(d, b) TBYTE_PTR [b + d]
-#define D_REGDB(d, b) QWORD_PTR [b + d]
-#define L_REGDB(d, b) DWORD_PTR [b + d]
-#define W_REGDB(d, b) WORD_PTR [b + d]
-#define B_REGDB(d, b) BYTE_PTR [b + d]
-
-/* Variable indirect: */
-#define VARINDIRECT(var) [var]
-
-/* Use register contents as jump/call target: */
-#define CODEPTR(reg) P_(reg)
-
-/*
- * Redefine assembler commands
- */
-
-#define P_(a) P_ ## a
-#define X_(a) X_ ## a
-#define D_(a) D_ ## a
-#define SR_(a) W_ ## a
-#define S_(a) L_ ## a
-#define L_(a) L_ ## a
-#define W_(a) W_ ## a
-#define B_(a) B_ ## a
-
-#define AAA aaa
-#define AAD aad
-#define AAM aam
-#define AAS aas
-#define ADC_L(a, b) adc L_(b), L_(a)
-#define ADC_W(a, b) adc W_(b), W_(a)
-#define ADC_B(a, b) adc B_(b), B_(a)
-#define ADD_L(a, b) add L_(b), L_(a)
-#define ADD_W(a, b) add W_(b), W_(a)
-#define ADD_B(a, b) add B_(b), B_(a)
-#define AND_L(a, b) and L_(b), L_(a)
-#define AND_W(a, b) and W_(b), W_(a)
-#define AND_B(a, b) and B_(b), B_(a)
-#define ARPL(a,b) arpl W_(b), a
-#define BOUND_L(a, b) bound L_(b), L_(a)
-#define BOUND_W(a, b) bound W_(b), W_(a)
-#define BSF_L(a, b) bsf L_(b), L_(a)
-#define BSF_W(a, b) bsf W_(b), W_(a)
-#define BSR_L(a, b) bsr L_(b), L_(a)
-#define BSR_W(a, b) bsr W_(b), W_(a)
-#define BT_L(a, b) bt L_(b), L_(a)
-#define BT_W(a, b) bt W_(b), W_(a)
-#define BTC_L(a, b) btc L_(b), L_(a)
-#define BTC_W(a, b) btc W_(b), W_(a)
-#define BTR_L(a, b) btr L_(b), L_(a)
-#define BTR_W(a, b) btr W_(b), W_(a)
-#define BTS_L(a, b) bts L_(b), L_(a)
-#define BTS_W(a, b) bts W_(b), W_(a)
-#define CALL(a) call a
-#define CALLF(s,a) call far s:a
-#define CBW cbw
-#define CWDE cwde
-#define CLC clc
-#define CLD cld
-#define CLI cli
-#define CLTS clts
-#define CMC cmc
-#define CMP_L(a, b) cmp L_(b), L_(a)
-#define CMP_W(a, b) cmp W_(b), W_(a)
-#define CMP_B(a, b) cmp B_(b), B_(a)
-#define CMPS_L cmpsd
-#define CMPS_W cmpsw
-#define CMPS_B cmpsb
-#define CPUID cpuid
-#define CWD cwd
-#define CDQ cdq
-#define DAA daa
-#define DAS das
-#define DEC_L(a) dec L_(a)
-#define DEC_W(a) dec W_(a)
-#define DEC_B(a) dec B_(a)
-#define DIV_L(a) div L_(a)
-#define DIV_W(a) div W_(a)
-#define DIV_B(a) div B_(a)
-#define ENTER(a,b) enter b, a
-#define HLT hlt
-#define IDIV_L(a) idiv L_(a)
-#define IDIV_W(a) idiv W_(a)
-#define IDIV_B(a) idiv B_(a)
-#define IMUL_L(a, b) imul L_(b), L_(a)
-#define IMUL_W(a, b) imul W_(b), W_(a)
-#define IMUL_B(a) imul B_(a)
-#define IN_L in EAX, DX
-#define IN_W in AX, DX
-#define IN_B in AL, DX
-#define IN1_L(a) in1 L_(a)
-#define IN1_W(a) in1 W_(a)
-#define IN1_B(a) in1 B_(a)
-#define INC_L(a) inc L_(a)
-#define INC_W(a) inc W_(a)
-#define INC_B(a) inc B_(a)
-#define INS_L ins
-#define INS_W ins
-#define INS_B ins
-#define INT(a) int B_(a)
-#define INT3 int3
-#define INTO into
-#define IRET iret
-#define IRETD iretd
-#define JA(a) ja NEAR a
-#define JAE(a) jae NEAR a
-#define JB(a) jb NEAR a
-#define JBE(a) jbe NEAR a
-#define JC(a) jc NEAR a
-#define JE(a) je NEAR a
-#define JG(a) jg NEAR a
-#define JGE(a) jge NEAR a
-#define JL(a) jl NEAR a
-#define JLE(a) jle NEAR a
-#define JNA(a) jna NEAR a
-#define JNAE(a) jnae NEAR a
-#define JNB(a) jnb NEAR a
-#define JNBE(a) jnbe NEAR a
-#define JNC(a) jnc NEAR a
-#define JNE(a) jne NEAR a
-#define JNG(a) jng NEAR a
-#define JNGE(a) jnge NEAR a
-#define JNL(a) jnl NEAR a
-#define JNLE(a) jnle NEAR a
-#define JNO(a) jno NEAR a
-#define JNP(a) jnp NEAR a
-#define JNS(a) jns NEAR a
-#define JNZ(a) jnz NEAR a
-#define JO(a) jo NEAR a
-#define JP(a) jp NEAR a
-#define JPE(a) jpe NEAR a
-#define JPO(a) jpo NEAR a
-#define JS(a) js NEAR a
-#define JZ(a) jz NEAR a
-#define JMP(a) jmp a
-#define JMPF(s,a) jmp far s:a
-#define LAHF lahf
-#define LAR(a, b) lar b, a
-#define LEA_L(a, b) lea P_(b), P_(a)
-#define LEA_W(a, b) lea P_(b), P_(a)
-#define LEAVE leave
-#define LGDT(a) lgdt a
-#define LIDT(a) lidt a
-#define LDS(a, b) lds b, P_(a)
-#define LES(a, b) les b, P_(a)
-#define LFS(a, b) lfs b, P_(a)
-#define LGS(a, b) lgs b, P_(a)
-#define LSS(a, b) lss b, P_(a)
-#define LLDT(a) lldt a
-#define LMSW(a) lmsw a
-#define LOCK lock
-#define LODS_L lodsd
-#define LODS_W lodsw
-#define LODS_B lodsb
-#define LOOP(a) loop a
-#define LOOPE(a) loope a
-#define LOOPZ(a) loopz a
-#define LOOPNE(a) loopne a
-#define LOOPNZ(a) loopnz a
-#define LSL(a, b) lsl b, a
-#define LTR(a) ltr a
-#define MOV_SR(a, b) mov SR_(b), SR_(a)
-#define MOV_L(a, b) mov L_(b), L_(a)
-#define MOV_W(a, b) mov W_(b), W_(a)
-#define MOV_B(a, b) mov B_(b), B_(a)
-#define MOVS_L movsd
-#define MOVS_W movsw
-#define MOVS_B movsb
-#define MOVSX_BL(a, b) movsx B_(b), B_(a)
-#define MOVSX_BW(a, b) movsx B_(b), B_(a)
-#define MOVSX_WL(a, b) movsx W_(b), W_(a)
-#define MOVZX_BL(a, b) movzx B_(b), B_(a)
-#define MOVZX_BW(a, b) movzx B_(b), B_(a)
-#define MOVZX_WL(a, b) movzx W_(b), W_(a)
-#define MUL_L(a) mul L_(a)
-#define MUL_W(a) mul W_(a)
-#define MUL_B(a) mul B_(a)
-#define NEG_L(a) neg L_(a)
-#define NEG_W(a) neg W_(a)
-#define NEG_B(a) neg B_(a)
-#define NOP nop
-#define NOT_L(a) not L_(a)
-#define NOT_W(a) not W_(a)
-#define NOT_B(a) not B_(a)
-#define OR_L(a,b) or L_(b), L_(a)
-#define OR_W(a,b) or W_(b), W_(a)
-#define OR_B(a,b) or B_(b), B_(a)
-#define OUT_L out DX, EAX
-#define OUT_W out DX, AX
-#define OUT_B out DX, AL
-#define OUT1_L(a) out1 L_(a)
-#define OUT1_W(a) out1 W_(a)
-#define OUT1_B(a) out1 B_(a)
-#define OUTS_L outsd
-#define OUTS_W outsw
-#define OUTS_B outsb
-#define POP_SR(a) pop SR_(a)
-#define POP_L(a) pop L_(a)
-#define POP_W(a) pop W_(a)
-#define POPA_L popad
-#define POPA_W popa
-#define POPF_L popfd
-#define POPF_W popf
-#define PUSH_SR(a) push SR_(a)
-#define PUSH_L(a) push L_(a)
-#define PUSH_W(a) push W_(a)
-#define PUSH_B(a) push B_(a)
-#define PUSHA_L pushad
-#define PUSHA_W pusha
-#define PUSHF_L pushfd
-#define PUSHF_W pushf
-#define RCL_L(a, b) rcl L_(b), L_(a)
-#define RCL_W(a, b) rcl W_(b), W_(a)
-#define RCL_B(a, b) rcl B_(b), B_(a)
-#define RCR_L(a, b) rcr L_(b), L_(a)
-#define RCR_W(a, b) rcr W_(b), W_(a)
-#define RCR_B(a, b) rcr B_(b), B_(a)
-#define RDTSC rdtsc
-#define ROL_L(a, b) rol L_(b), L_(a)
-#define ROL_W(a, b) rol W_(b), W_(a)
-#define ROL_B(a, b) rol B_(b), B_(a)
-#define ROR_L(a, b) ror L_(b), L_(a)
-#define ROR_W(a, b) ror W_(b), W_(a)
-#define ROR_B(a, b) ror B_(b), B_(a)
-#define REP rep
-#define REPE repe
-#define REPNE repne
-#define REPNZ REPNE
-#define REPZ REPE
-#define RET ret
-#define SAHF sahf
-#define SAL_L(a, b) sal L_(b), B_(a)
-#define SAL_W(a, b) sal W_(b), B_(a)
-#define SAL_B(a, b) sal B_(b), B_(a)
-#define SAR_L(a, b) sar L_(b), B_(a)
-#define SAR_W(a, b) sar W_(b), B_(a)
-#define SAR_B(a, b) sar B_(b), B_(a)
-#define SBB_L(a, b) sbb L_(b), L_(a)
-#define SBB_W(a, b) sbb W_(b), W_(a)
-#define SBB_B(a, b) sbb B_(b), B_(a)
-#define SCAS_L scas
-#define SCAS_W scas
-#define SCAS_B scas
-#define SETA(a) seta a
-#define SETAE(a) setae a
-#define SETB(a) setb a
-#define SETBE(a) setbe a
-#define SETC(a) setc a
-#define SETE(a) sete a
-#define SETG(a) setg a
-#define SETGE(a) setge a
-#define SETL(a) setl a
-#define SETLE(a) setle a
-#define SETNA(a) setna a
-#define SETNAE(a) setnae a
-#define SETNB(a) setnb a
-#define SETNBE(a) setnbe a
-#define SETNC(a) setnc a
-#define SETNE(a) setne a
-#define SETNG(a) setng a
-#define SETNGE(a) setnge a
-#define SETNL(a) setnl a
-#define SETNLE(a) setnle a
-#define SETNO(a) setno a
-#define SETNP(a) setnp a
-#define SETNS(a) setns a
-#define SETNZ(a) setnz a
-#define SETO(a) seto a
-#define SETP(a) setp a
-#define SETPE(a) setpe a
-#define SETPO(a) setpo a
-#define SETS(a) sets a
-#define SETZ(a) setz a
-#define SGDT(a) sgdt a
-#define SIDT(a) sidt a
-#define SHL_L(a, b) shl L_(b), B_(a)
-#define SHL_W(a, b) shl W_(b), B_(a)
-#define SHL_B(a, b) shl B_(b), B_(a)
-#define SHLD_L(a,b,c) shld
-#define SHLD2_L(a,b) shld L_(b), L_(a)
-#define SHLD_W(a,b,c) shld
-#define SHLD2_W(a,b) shld W_(b), W_(a)
-#define SHR_L(a, b) shr L_(b), B_(a)
-#define SHR_W(a, b) shr W_(b), B_(a)
-#define SHR_B(a, b) shr B_(b), B_(a)
-#define SHRD_L(a,b,c) shrd
-#define SHRD2_L(a,b) shrd L_(b), L_(a)
-#define SHRD_W(a,b,c) shrd
-#define SHRD2_W(a,b) shrd W_(b), W_(a)
-#define SLDT(a) sldt a
-#define SMSW(a) smsw a
-#define STC stc
-#define STD std
-#define STI sti
-#define STOS_L stosd
-#define STOS_W stosw
-#define STOS_B stosb
-#define STR(a) str a
-#define SUB_L(a, b) sub L_(b), L_(a)
-#define SUB_W(a, b) sub W_(b), W_(a)
-#define SUB_B(a, b) sub B_(b), B_(a)
-#define TEST_L(a, b) test L_(b), L_(a)
-#define TEST_W(a, b) test W_(b), W_(a)
-#define TEST_B(a, b) test B_(b), B_(a)
-#define VERR(a) verr a
-#define VERW(a) verw a
-#define WAIT wait
-#define XCHG_L(a, b) xchg L_(b), L_(a)
-#define XCHG_W(a, b) xchg W_(b), W_(a)
-#define XCHG_B(a, b) xchg B_(b), B_(a)
-#define XLAT xlat
-#define XOR_L(a, b) xor L_(b), L_(a)
-#define XOR_W(a, b) xor W_(b), W_(a)
-#define XOR_B(a, b) xor B_(b), B_(a)
-
-
-/* Floating Point Instructions */
-#define F2XM1 f2xm1
-#define FABS fabs
-#define FADD_D(a) fadd D_(a)
-#define FADD_S(a) fadd S_(a)
-#define FADD2(a, b) fadd b, a
-#define FADDP(a, b) faddp b, a
-#define FIADD_L(a) fiadd L_(a)
-#define FIADD_W(a) fiadd W_(a)
-#define FBLD(a) fbld a
-#define FBSTP(a) fbstp a
-#define FCHS fchs
-#define FCLEX fclex
-#define FNCLEX fnclex
-#define FCOM(a) fcom a
-#define FCOM_D(a) fcom D_(a)
-#define FCOM_S(a) fcom S_(a)
-#define FCOMP(a) fcomp a
-#define FCOMP_D(a) fcomp D_(a)
-#define FCOMP_S(a) fcomp S_(a)
-#define FCOMPP fcompp
-#define FCOS fcos
-#define FDECSTP fdecstp
-#define FDIV_D(a) fdiv D_(a)
-#define FDIV_S(a) fdiv S_(a)
-#define FDIV2(a, b) fdiv b, a
-#define FDIVP(a, b) fdivp b, a
-#define FIDIV_L(a) fidiv L_(a)
-#define FIDIV_W(a) fidiv W_(a)
-#define FDIVR_D(a) fdivr D_(a)
-#define FDIVR_S(a) fdivr S_(a)
-#define FDIVR2(a, b) fdivr b, a
-#define FDIVRP(a, b) fdivrp b, a
-#define FIDIVR_L(a) fidivr L_(a)
-#define FIDIVR_W(a) fidivr W_(a)
-#define FFREE(a) ffree a
-#define FICOM_L(a) ficom L_(a)
-#define FICOM_W(a) ficom W_(a)
-#define FICOMP_L(a) ficomp L_(a)
-#define FICOMP_W(a) ficomp W_(a)
-#define FILD_Q(a) fild D_(a)
-#define FILD_L(a) fild L_(a)
-#define FILD_W(a) fild W_(a)
-#define FINCSTP fincstp
-#define FINIT finit
-#define FNINIT fninit
-#define FIST_L(a) fist L_(a)
-#define FIST_W(a) fist W_(a)
-#define FISTP_Q(a) fistp D_(a)
-#define FISTP_L(a) fistp L_(a)
-#define FISTP_W(a) fistp W_(a)
-#define FLD_X(a) fld X_(a)
-#define FLD_D(a) fld D_(a)
-#define FLD_S(a) fld S_(a)
-#define FLD1 fld1
-#define FLDL2T fldl2t
-#define FLDL2E fldl2e
-#define FLDPI fldpi
-#define FLDLG2 fldlg2
-#define FLDLN2 fldln2
-#define FLDZ fldz
-#define FLDCW(a) fldcw a
-#define FLDENV(a) fldenv a
-#define FMUL_S(a) fmul S_(a)
-#define FMUL_D(a) fmul D_(a)
-#define FMUL2(a, b) fmul b, a
-#define FMULP(a, b) fmulp b, a
-#define FIMUL_L(a) fimul L_(a)
-#define FIMUL_W(a) fimul W_(a)
-#define FNOP fnop
-#define FPATAN fpatan
-#define FPREM fprem
-#define FPREM1 fprem1
-#define FPTAN fptan
-#define FRNDINT frndint
-#define FRSTOR(a) frstor a
-#define FSAVE(a) fsave a
-#define FNSAVE(a) fnsave a
-#define FSCALE fscale
-#define FSIN fsin
-#define FSINCOS fsincos
-#define FSQRT fsqrt
-#define FST_D(a) fst D_(a)
-#define FST_S(a) fst S_(a)
-#define FSTP_X(a) fstp X_(a)
-#define FSTP_D(a) fstp D_(a)
-#define FSTP_S(a) fstp S_(a)
-#define FSTP(a) fstp a
-#define FSTCW(a) fstcw a
-#define FNSTCW(a) fnstcw a
-#define FSTENV(a) fstenv a
-#define FNSTENV(a) fnstenv a
-#define FSTSW(a) fstsw a
-#define FNSTSW(a) fnstsw a
-#define FSUB_S(a) fsub S_(a)
-#define FSUB_D(a) fsub D_(a)
-#define FSUB2(a, b) fsub b, a
-#define FSUBP(a, b) fsubp b, a
-#define FISUB_L(a) fisub L_(a)
-#define FISUB_W(a) fisub W_(a)
-#define FSUBR_S(a) fsubr S_(a)
-#define FSUBR_D(a) fsubr D_(a)
-#define FSUBR2(a, b) fsubr b, a
-#define FSUBRP(a, b) fsubrp b, a
-#define FISUBR_L(a) fisubr L_(a)
-#define FISUBR_W(a) fisubr W_(a)
-#define FTST ftst
-#define FUCOM(a) fucom a
-#define FUCOMP(a) fucomp a
-#define FUCOMPP fucompp
-#define FWAIT fwait
-#define FXAM fxam
-#define FXCH(a) fxch a
-#define FXTRACT fxtract
-#define FYL2X fyl2x
-#define FYL2XP1 fyl2xp1
-
-#endif /* NASM_ASSEMBLER, MASM_ASSEMBLER */
-
- /****************************************/
- /* */
- /* Extensions to x86 insn set - */
- /* MMX, 3DNow! */
- /* */
- /****************************************/
-
-#if defined(NASM_ASSEMBLER) || defined(MASM_ASSEMBLER)
-#define P_ARG1(a) P_ ## a
-#define P_ARG2(a, b) P_ ## b, P_ ## a
-#define P_ARG3(a, b, c) P_ ## c, P_ ## b, P_ ## a
-#else
-#define P_ARG1(a) a
-#define P_ARG2(a, b) a, b
-#define P_ARG3(a, b, c) a, b, c
-#endif
-
-/* MMX */
-#define MOVD(a, b) movd P_ARG2(a, b)
-#define MOVQ(a, b) movq P_ARG2(a, b)
-
-#define PADDB(a, b) paddb P_ARG2(a, b)
-#define PADDW(a, b) paddw P_ARG2(a, b)
-#define PADDD(a, b) paddd P_ARG2(a, b)
-
-#define PADDSB(a, b) paddsb P_ARG2(a, b)
-#define PADDSW(a, b) paddsw P_ARG2(a, b)
-
-#define PADDUSB(a, b) paddusb P_ARG2(a, b)
-#define PADDUSW(a, b) paddusw P_ARG2(a, b)
-
-#define PSUBB(a, b) psubb P_ARG2(a, b)
-#define PSUBW(a, b) psubw P_ARG2(a, b)
-#define PSUBD(a, b) psubd P_ARG2(a, b)
-
-#define PSUBSB(a, b) psubsb P_ARG2(a, b)
-#define PSUBSW(a, b) psubsw P_ARG2(a, b)
-
-#define PSUBUSB(a, b) psubusb P_ARG2(a, b)
-#define PSUBUSW(a, b) psubusw P_ARG2(a, b)
-
-#define PCMPEQB(a, b) pcmpeqb P_ARG2(a, b)
-#define PCMPEQW(a, b) pcmpeqw P_ARG2(a, b)
-#define PCMPEQD(a, b) pcmpeqd P_ARG2(a, b)
-
-#define PCMPGTB(a, b) pcmpgtb P_ARG2(a, b)
-#define PCMPGTW(a, b) pcmpgtw P_ARG2(a, b)
-#define PCMPGTD(a, b) pcmpgtd P_ARG2(a, b)
-
-#define PMULHW(a, b) pmulhw P_ARG2(a, b)
-#define PMULLW(a, b) pmullw P_ARG2(a, b)
-
-#define PMADDWD(a, b) pmaddwd P_ARG2(a, b)
-
-#define PAND(a, b) pand P_ARG2(a, b)
-
-#define PANDN(a, b) pandn P_ARG2(a, b)
-
-#define POR(a, b) por P_ARG2(a, b)
-
-#define PXOR(a, b) pxor P_ARG2(a, b)
-
-#define PSRAW(a, b) psraw P_ARG2(a, b)
-#define PSRAD(a, b) psrad P_ARG2(a, b)
-
-#define PSRLW(a, b) psrlw P_ARG2(a, b)
-#define PSRLD(a, b) psrld P_ARG2(a, b)
-#define PSRLQ(a, b) psrlq P_ARG2(a, b)
-
-#define PSLLW(a, b) psllw P_ARG2(a, b)
-#define PSLLD(a, b) pslld P_ARG2(a, b)
-#define PSLLQ(a, b) psllq P_ARG2(a, b)
-
-#define PACKSSWB(a, b) packsswb P_ARG2(a, b)
-#define PACKSSDW(a, b) packssdw P_ARG2(a, b)
-#define PACKUSWB(a, b) packuswb P_ARG2(a, b)
-
-#define PUNPCKHBW(a, b) punpckhbw P_ARG2(a, b)
-#define PUNPCKHWD(a, b) punpckhwd P_ARG2(a, b)
-#define PUNPCKHDQ(a, b) punpckhdq P_ARG2(a, b)
-#define PUNPCKLBW(a, b) punpcklbw P_ARG2(a, b)
-#define PUNPCKLWD(a, b) punpcklwd P_ARG2(a, b)
-#define PUNPCKLDQ(a, b) punpckldq P_ARG2(a, b)
-
-#define EMMS emms
-
-/* AMD 3DNow! */
-#define PAVGUSB(a, b) pavgusb P_ARG2(a, b)
-#define PFADD(a, b) pfadd P_ARG2(a, b)
-#define PFSUB(a, b) pfsub P_ARG2(a, b)
-#define PFSUBR(a, b) pfsubr P_ARG2(a, b)
-#define PFACC(a, b) pfacc P_ARG2(a, b)
-#define PFCMPGE(a, b) pfcmpge P_ARG2(a, b)
-#define PFCMPGT(a, b) pfcmpgt P_ARG2(a, b)
-#define PFCMPEQ(a, b) pfcmpeq P_ARG2(a, b)
-#define PFMIN(a, b) pfmin P_ARG2(a, b)
-#define PFMAX(a, b) pfmax P_ARG2(a, b)
-#define PI2FD(a, b) pi2fd P_ARG2(a, b)
-#define PF2ID(a, b) pf2id P_ARG2(a, b)
-#define PFRCP(a, b) pfrcp P_ARG2(a, b)
-#define PFRSQRT(a, b) pfrsqrt P_ARG2(a, b)
-#define PFMUL(a, b) pfmul P_ARG2(a, b)
-#define PFRCPIT1(a, b) pfrcpit1 P_ARG2(a, b)
-#define PFRSQIT1(a, b) pfrsqit1 P_ARG2(a, b)
-#define PFRCPIT2(a, b) pfrcpit2 P_ARG2(a, b)
-#define PMULHRW(a, b) pmulhrw P_ARG2(a, b)
-
-#define FEMMS femms
-#define PREFETCH(a) prefetch P_ARG1(a)
-#define PREFETCHW(a) prefetchw P_ARG1(a)
-
-/* Intel SSE */
-#define ADDPS(a, b) addps P_ARG2(a, b)
-#define ADDSS(a, b) addss P_ARG2(a, b)
-#define ANDNPS(a, b) andnps P_ARG2(a, b)
-#define ANDPS(a, b) andps P_ARG2(a, b)
-/* NASM only knows the pseudo ops for these.
-#define CMPPS(a, b, c) cmpps P_ARG3(a, b, c)
-#define CMPSS(a, b, c) cmpss P_ARG3(a, b, c)
-*/
-#define CMPEQPS(a, b) cmpeqps P_ARG2(a, b)
-#define CMPLTPS(a, b) cmpltps P_ARG2(a, b)
-#define CMPLEPS(a, b) cmpleps P_ARG2(a, b)
-#define CMPUNORDPS(a, b) cmpunordps P_ARG2(a, b)
-#define CMPNEQPS(a, b) cmpneqps P_ARG2(a, b)
-#define CMPNLTPS(a, b) cmpnltps P_ARG2(a, b)
-#define CMPNLEPS(a, b) cmpnleps P_ARG2(a, b)
-#define CMPORDPS(a, b) cmpordps P_ARG2(a, b)
-#define CMPEQSS(a, b) cmpeqss P_ARG2(a, b)
-#define CMPLTSS(a, b) cmpltss P_ARG2(a, b)
-#define CMPLESS(a, b) cmpless P_ARG2(a, b)
-#define CMPUNORDSS(a, b) cmpunordss P_ARG2(a, b)
-#define CMPNEQSS(a, b) cmpneqss P_ARG2(a, b)
-#define CMPNLTSS(a, b) cmpnltss P_ARG2(a, b)
-#define CMPNLESS(a, b) cmpnless P_ARG2(a, b)
-#define CMPORDSS(a, b) cmpordss P_ARG2(a, b)
-#define COMISS(a, b) comiss P_ARG2(a, b)
-#define CVTPI2PS(a, b) cvtpi2ps P_ARG2(a, b)
-#define CVTPS2PI(a, b) cvtps2pi P_ARG2(a, b)
-#define CVTSI2SS(a, b) cvtsi2ss P_ARG2(a, b)
-#define CVTSS2SI(a, b) cvtss2si P_ARG2(a, b)
-#define CVTTPS2PI(a, b) cvttps2pi P_ARG2(a, b)
-#define CVTTSS2SI(a, b) cvttss2si P_ARG2(a, b)
-#define DIVPS(a, b) divps P_ARG2(a, b)
-#define DIVSS(a, b) divss P_ARG2(a, b)
-#define FXRSTOR(a) fxrstor P_ARG1(a)
-#define FXSAVE(a) fxsave P_ARG1(a)
-#define LDMXCSR(a) ldmxcsr P_ARG1(a)
-#define MAXPS(a, b) maxps P_ARG2(a, b)
-#define MAXSS(a, b) maxss P_ARG2(a, b)
-#define MINPS(a, b) minps P_ARG2(a, b)
-#define MINSS(a, b) minss P_ARG2(a, b)
-#define MOVAPS(a, b) movaps P_ARG2(a, b)
-#define MOVHLPS(a, b) movhlps P_ARG2(a, b)
-#define MOVHPS(a, b) movhps P_ARG2(a, b)
-#define MOVLHPS(a, b) movlhps P_ARG2(a, b)
-#define MOVLPS(a, b) movlps P_ARG2(a, b)
-#define MOVMSKPS(a, b) movmskps P_ARG2(a, b)
-#define MOVNTPS(a, b) movntps P_ARG2(a, b)
-#define MOVNTQ(a, b) movntq P_ARG2(a, b)
-#define MOVSS(a, b) movss P_ARG2(a, b)
-#define MOVUPS(a, b) movups P_ARG2(a, b)
-#define MULPS(a, b) mulps P_ARG2(a, b)
-#define MULSS(a, b) mulss P_ARG2(a, b)
-#define ORPS(a, b) orps P_ARG2(a, b)
-#define RCPPS(a, b) rcpps P_ARG2(a, b)
-#define RCPSS(a, b) rcpss P_ARG2(a, b)
-#define RSQRTPS(a, b) rsqrtps P_ARG2(a, b)
-#define RSQRTSS(a, b) rsqrtss P_ARG2(a, b)
-#define SHUFPS(a, b, c) shufps P_ARG3(a, b, c)
-#define SQRTPS(a, b) sqrtps P_ARG2(a, b)
-#define SQRTSS(a, b) sqrtss P_ARG2(a, b)
-#define STMXCSR(a) stmxcsr P_ARG1(a)
-#define SUBPS(a, b) subps P_ARG2(a, b)
-#define UCOMISS(a, b) ucomiss P_ARG2(a, b)
-#define UNPCKHPS(a, b) unpckhps P_ARG2(a, b)
-#define UNPCKLPS(a, b) unpcklps P_ARG2(a, b)
-#define XORPS(a, b) xorps P_ARG2(a, b)
-
-#define PREFETCHNTA(a) prefetchnta P_ARG1(a)
-#define PREFETCHT0(a) prefetcht0 P_ARG1(a)
-#define PREFETCHT1(a) prefetcht1 P_ARG1(a)
-#define PREFETCHT2(a) prefetcht2 P_ARG1(a)
-#define SFENCE sfence
-
-/* Added by BrianP for FreeBSD (per David Dawes) */
-#if !defined(NASM_ASSEMBLER) && !defined(MASM_ASSEMBLER) && !defined(__bsdi__)
-#define LLBL(a) CONCAT(.L,a)
-#define LLBL2(a,b) CONCAT3(.L,a,b)
-#else
-#define LLBL(a) a
-#define LLBL2(a,b) CONCAT(a,b)
-#endif
-
-/* Segment overrides */
-#define SEGCS D_BYTE 46
-#define SEGDS D_BYTE 62
-#define SEGES D_BYTE 38
-#define SEGFS D_BYTE 100
-#define SEGGS D_BYTE 101
-
-/* Temporary labels: valid until next non-local label */
-#ifdef NASM_ASSEMBLER
-#define TLBL(a) CONCAT(.,a)
-#else
-#define TLBL(a) CONCAT(a,$)
-#endif
-
-/* Hidden symbol visibility support.
- * If we build with gcc's -fvisibility=hidden flag, we'll need to change
- * the symbol visibility mode to 'default'.
- */
-#if defined(GNU_ASSEMBLER) && !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__)
-# define HIDDEN(x) .hidden x
-#elif defined(__GNUC__) && !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__)
-# pragma GCC visibility push(default)
-# define HIDDEN(x) .hidden x
-#else
-# define HIDDEN(x)
-#endif
-
-#endif /* __ASSYNTAX_H__ */
+ +#ifndef __ASSYNTAX_H__ +#define __ASSYNTAX_H__ + +/* + * Copyright 1992 Vrije Universiteit, The Netherlands + * + * Permission to use, copy, modify, and distribute this software and its + * documentation for any purpose and without fee is hereby granted, provided + * that the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of the Vrije Universiteit not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. The Vrije Universiteit makes no + * representations about the suitability of this software for any purpose. + * It is provided "as is" without express or implied warranty. + * + * The Vrije Universiteit DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, + * IN NO EVENT SHALL The Vrije Universiteit BE LIABLE FOR ANY SPECIAL, + * INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * assyntax.h + * + * Select the syntax appropriate to the 386 assembler being used + * To add support for more assemblers add more columns to the CHOICE + * macro. Note that register names must also have uppercase names + * to avoid macro recursion. e.g., #define ah %ah recurses! + * + * NB 1. Some of the macros for certain assemblers imply that the code is to + * run in protected mode!! Caveat emptor. + * + * NB 2. 486 specific instructions are not included. This is to discourage + * their accidental use in code that is intended to run on 386 and 486 + * systems. + * + * Supported assemblers: + * + * (a) AT&T SysVr4 as(1): define ATT_ASSEMBLER + * (b) GNU Assembler gas: define GNU_ASSEMBLER (default) + * (c) Amsterdam Compiler kit: define ACK_ASSEMBLER + * (d) The Netwide Assembler: define NASM_ASSEMBLER + * (e) Microsoft Assembler: define MASM_ASSEMBLER (UNTESTED!) + * + * The following naming conventions have been used to identify the various + * data types: + * _SR = segment register version + * Integer: + * _Q = quadword = 64 bits + * _L = long = 32 bits + * _W = short = 16 bits + * _B = byte = 8 bits + * Floating-point: + * _X = m80real = 80 bits + * _D = double = 64 bits + * _S = single = 32 bits + * + * Author: Gregory J. Sharp, Sept 1992 + * Vrije Universiteit, Amsterdam, The Netherlands + * + * [support for Intel syntax added by Josh Vanderhoof, 1999] + */ + +#if !(defined(NASM_ASSEMBLER) || defined(MASM_ASSEMBLER)) + +/* Default to ATT_ASSEMBLER when SVR4 or SYSV are defined */ +#if (defined(SVR4) || defined(SYSV)) && !defined(GNU_ASSEMBLER) +#define ATT_ASSEMBLER +#endif + +#if !defined(ATT_ASSEMBLER) && !defined(GNU_ASSEMBLER) && !defined(ACK_ASSEMBLER) +#define GNU_ASSEMBLER +#endif + +#if (defined(__STDC__) && !defined(UNIXCPP)) || (defined (sun) && defined (i386) && defined (SVR4) && defined (__STDC__) && !defined (__GNUC__)) +#define CONCAT(x, y) x ## y +#define CONCAT3(x, y, z) x ## y ## z +#else +#define CONCAT(x, y) x/**/y +#define CONCAT3(x, y, z) x/**/y/**/z +#endif + +#ifdef ACK_ASSEMBLER + +/* Assume we write code for 32-bit protected mode! */ + +/* Redefine register names for GAS & AT&T assemblers */ +#define AL al +#define AH ah +#define AX ax +#define EAX ax +#define BL bl +#define BH bh +#define BX bx +#define EBX bx +#define CL cl +#define CH ch +#define CX cx +#define ECX cx +#define DL dl +#define DH dh +#define DX dx +#define EDX dx +#define BP bp +#define EBP bp +#define SI si +#define ESI si +#define DI di +#define EDI di +#define SP sp +#define ESP sp +#define CS cs +#define SS ss +#define DS ds +#define ES es +#define FS fs +#define GS gs +/* Control Registers */ +#define CR0 cr0 +#define CR1 cr1 +#define CR2 cr2 +#define CR3 cr3 +/* Debug Registers */ +#define DR0 dr0 +#define DR1 dr1 +#define DR2 dr2 +#define DR3 dr3 +#define DR4 dr4 +#define DR5 dr5 +#define DR6 dr6 +#define DR7 dr7 +/* Floating-point Stack */ +#define ST st + +#define AS_BEGIN .sect .text; .sect .rom; .sect .data; .sect .bss; .sect .text + + +#define _WTOG o16 /* word toggle for _W instructions */ +#define _LTOG /* long toggle for _L instructions */ +#define ADDR_TOGGLE a16 +#define OPSZ_TOGGLE o16 +#define USE16 .use16 +#define USE32 .use32 + +#define CHOICE(a,b,c) c + +#else /* AT&T or GAS */ + +/* Redefine register names for GAS & AT&T assemblers */ +#define AL %al +#define AH %ah +#define AX %ax +#define EAX %eax +#define BL %bl +#define BH %bh +#define BX %bx +#define EBX %ebx +#define CL %cl +#define CH %ch +#define CX %cx +#define ECX %ecx +#define DL %dl +#define DH %dh +#define DX %dx +#define EDX %edx +#define BP %bp +#define EBP %ebp +#define SI %si +#define ESI %esi +#define DI %di +#define EDI %edi +#define SP %sp +#define ESP %esp +#define CS %cs +#define SS %ss +#define DS %ds +#define ES %es +#define FS %fs +#define GS %gs +/* Control Registers */ +#define CR0 %cr0 +#define CR1 %cr1 +#define CR2 %cr2 +#define CR3 %cr3 +/* Debug Registers */ +#define DR0 %db0 +#define DR1 %db1 +#define DR2 %db2 +#define DR3 %db3 +#define DR4 %db4 +#define DR5 %db5 +#define DR6 %db6 +#define DR7 %db7 +/* Floating-point Stack */ +#define _STX0 %st(0) +#define _STX1 %st(1) +#define _STX2 %st(2) +#define _STX3 %st(3) +#define _STX4 %st(4) +#define _STX5 %st(5) +#define _STX6 %st(6) +#define _STX7 %st(7) +#define ST(x) CONCAT(_STX,x) +#ifdef GNU_ASSEMBLER +#define ST0 %st(0) +#else +#define ST0 %st +#endif +/* MMX Registers */ +#define MM0 %mm0 +#define MM1 %mm1 +#define MM2 %mm2 +#define MM3 %mm3 +#define MM4 %mm4 +#define MM5 %mm5 +#define MM6 %mm6 +#define MM7 %mm7 +/* SSE Registers */ +#define XMM0 %xmm0 +#define XMM1 %xmm1 +#define XMM2 %xmm2 +#define XMM3 %xmm3 +#define XMM4 %xmm4 +#define XMM5 %xmm5 +#define XMM6 %xmm6 +#define XMM7 %xmm7 + +#define AS_BEGIN +#define USE16 +#define USE32 + +#ifdef GNU_ASSEMBLER + +#define ADDR_TOGGLE aword +#define OPSZ_TOGGLE word + +#define CHOICE(a,b,c) b + +#else +/* + * AT&T ASSEMBLER SYNTAX + * ********************* + */ +#define CHOICE(a,b,c) a + +#define ADDR_TOGGLE addr16 +#define OPSZ_TOGGLE data16 + +#endif /* GNU_ASSEMBLER */ +#endif /* ACK_ASSEMBLER */ + + +#if defined(__QNX__) || defined(Lynx) || (defined(SYSV) || defined(SVR4)) && !defined(ACK_ASSEMBLER) || defined(__ELF__) || defined(__GNU__) || defined(__GNUC__) && !defined(__DJGPP__) && !defined(__MINGW32__) +#define GLNAME(a) a +#else +#define GLNAME(a) CONCAT(_,a) +#endif + + + /****************************************/ + /* */ + /* Select the various choices */ + /* */ + /****************************************/ + + +/* Redefine assembler directives */ +/*********************************/ +#define GLOBL CHOICE(.globl, .globl, .extern) +#define GLOBAL GLOBL +#define EXTERN GLOBL +#ifndef __AOUT__ +#define ALIGNTEXT32 CHOICE(.align 32, .balign 32, .align 32) +#define ALIGNTEXT16 CHOICE(.align 16, .balign 16, .align 16) +#define ALIGNTEXT8 CHOICE(.align 8, .balign 8, .align 8) +#define ALIGNTEXT4 CHOICE(.align 4, .balign 4, .align 4) +#define ALIGNTEXT2 CHOICE(.align 2, .balign 2, .align 2) +/* ALIGNTEXT4ifNOP is the same as ALIGNTEXT4, but only if the space is + * guaranteed to be filled with NOPs. Otherwise it does nothing. + */ +#define ALIGNTEXT32ifNOP CHOICE(.align 32, .balign ARG2(32,0x90), /*can't do it*/) +#define ALIGNTEXT16ifNOP CHOICE(.align 16, .balign ARG2(16,0x90), /*can't do it*/) +#define ALIGNTEXT8ifNOP CHOICE(.align 8, .balign ARG2(8,0x90), /*can't do it*/) +#define ALIGNTEXT4ifNOP CHOICE(.align 4, .balign ARG2(4,0x90), /*can't do it*/) +#define ALIGNDATA32 CHOICE(.align 32, .balign ARG2(32,0x0), .align 32) +#define ALIGNDATA16 CHOICE(.align 16, .balign ARG2(16,0x0), .align 16) +#define ALIGNDATA8 CHOICE(.align 8, .balign ARG2(8,0x0), .align 8) +#define ALIGNDATA4 CHOICE(.align 4, .balign ARG2(4,0x0), .align 4) +#define ALIGNDATA2 CHOICE(.align 2, .balign ARG2(2,0x0), .align 2) +#else +/* 'as -aout' on FreeBSD doesn't have .balign */ +#define ALIGNTEXT32 CHOICE(.align 32, .align ARG2(5,0x90), .align 32) +#define ALIGNTEXT16 CHOICE(.align 16, .align ARG2(4,0x90), .align 16) +#define ALIGNTEXT8 CHOICE(.align 8, .align ARG2(3,0x90), .align 8) +#define ALIGNTEXT4 CHOICE(.align 4, .align ARG2(2,0x90), .align 4) +#define ALIGNTEXT2 CHOICE(.align 2, .align ARG2(1,0x90), .align 2) +/* ALIGNTEXT4ifNOP is the same as ALIGNTEXT4, but only if the space is + * guaranteed to be filled with NOPs. Otherwise it does nothing. + */ +#define ALIGNTEXT32ifNOP CHOICE(.align 32, .align ARG2(5,0x90), /*can't do it*/) +#define ALIGNTEXT16ifNOP CHOICE(.align 16, .align ARG2(4,0x90), /*can't do it*/) +#define ALIGNTEXT8ifNOP CHOICE(.align 8, .align ARG2(3,0x90), /*can't do it*/) +#define ALIGNTEXT4ifNOP CHOICE(.align 4, .align ARG2(2,0x90), /*can't do it*/) +#define ALIGNDATA32 CHOICE(.align 32, .align ARG2(5,0x0), .align 32) +#define ALIGNDATA16 CHOICE(.align 16, .align ARG2(4,0x0), .align 16) +#define ALIGNDATA8 CHOICE(.align 8, .align ARG2(3,0x0), .align 8) +#define ALIGNDATA4 CHOICE(.align 4, .align ARG2(2,0x0), .align 4) +#define ALIGNDATA2 CHOICE(.align 2, .align ARG2(1,0x0), .align 2) +#endif /* __AOUT__ */ +#define FILE(s) CHOICE(.file s, .file s, .file s) +#define STRING(s) CHOICE(.string s, .asciz s, .asciz s) +#define D_LONG CHOICE(.long, .long, .data4) +#define D_WORD CHOICE(.value, .short, .data2) +#define D_BYTE CHOICE(.byte, .byte, .data1) +#define SPACE CHOICE(.comm, .space, .space) +#define COMM CHOICE(.comm, .comm, .comm) +#define SEG_DATA CHOICE(.data, .data, .sect .data) +#define SEG_TEXT CHOICE(.text, .text, .sect .text) +#define SEG_BSS CHOICE(.bss, .bss, .sect .bss) + +#ifdef GNU_ASSEMBLER +#define D_SPACE(n) . = . + n +#else +#define D_SPACE(n) .space n +#endif + +/* Addressing Modes */ +/* Immediate Mode */ +#define ADDR(a) CHOICE(CONCAT($,a), $a, a) +#define CONST(a) CHOICE(CONCAT($,a), $a, a) + +/* Indirect Mode */ +#define CONTENT(a) CHOICE(a, a, (a)) /* take contents of variable */ +#define REGIND(a) CHOICE((a), (a), (a)) /* Register a indirect */ +/* Register b indirect plus displacement a */ +#define REGOFF(a, b) CHOICE(a(b), a(b), a(b)) +/* Reg indirect Base + Index + Displacement - this is mainly for 16-bit mode + * which has no scaling + */ +#define REGBID(b,i,d) CHOICE(d(b,i), d(b,i), d(b)(i)) +/* Reg indirect Base + (Index * Scale) */ +#define REGBIS(b,i,s) CHOICE((b,i,s), (b,i,s), (b)(i*s)) +/* Reg indirect Base + (Index * Scale) + Displacement */ +#define REGBISD(b,i,s,d) CHOICE(d(b,i,s), d(b,i,s), d(b)(i*s)) +/* Displaced Scaled Index: */ +#define REGDIS(d,i,s) CHOICE(d(,i,s), d(,i,s), d(i * s)) +/* Indexed Base: */ +#define REGBI(b,i) CHOICE((b,i), (b,i), (b)(i)) +/* Displaced Base: */ +#define REGDB(d,b) CHOICE(d(b), d(b), d(b)) +/* Variable indirect: */ +#define VARINDIRECT(var) CHOICE(*var, *var, (var)) +/* Use register contents as jump/call target: */ +#define CODEPTR(reg) CHOICE(*reg, *reg, reg) + +/* For expressions requiring bracketing + * eg. (CRT0_PM | CRT_EM) + */ + +#define EXPR(a) CHOICE([a], (a), [a]) +#define ENOT(a) CHOICE(0!a, ~a, ~a) +#define EMUL(a,b) CHOICE(a\*b, a*b, a*b) +#define EDIV(a,b) CHOICE(a\/b, a/b, a/b) + +/* + * We have to beat the problem of commas within arguments to choice. + * eg. choice (add a,b, add b,a) will get argument mismatch. Luckily ANSI + * and other known cpp definitions evaluate arguments before substitution + * so the following works. + */ +#define ARG2(a, b) a,b +#define ARG3(a,b,c) a,b,c + +/* Redefine assembler commands */ +#define AAA CHOICE(aaa, aaa, aaa) +#define AAD CHOICE(aad, aad, aad) +#define AAM CHOICE(aam, aam, aam) +#define AAS CHOICE(aas, aas, aas) +#define ADC_L(a, b) CHOICE(adcl ARG2(a,b), adcl ARG2(a,b), _LTOG adc ARG2(b,a)) +#define ADC_W(a, b) CHOICE(adcw ARG2(a,b), adcw ARG2(a,b), _WTOG adc ARG2(b,a)) +#define ADC_B(a, b) CHOICE(adcb ARG2(a,b), adcb ARG2(a,b), adcb ARG2(b,a)) +#define ADD_L(a, b) CHOICE(addl ARG2(a,b), addl ARG2(a,b), _LTOG add ARG2(b,a)) +#define ADD_W(a, b) CHOICE(addw ARG2(a,b), addw ARG2(a,b), _WTOG add ARG2(b,a)) +#define ADD_B(a, b) CHOICE(addb ARG2(a,b), addb ARG2(a,b), addb ARG2(b,a)) +#define AND_L(a, b) CHOICE(andl ARG2(a,b), andl ARG2(a,b), _LTOG and ARG2(b,a)) +#define AND_W(a, b) CHOICE(andw ARG2(a,b), andw ARG2(a,b), _WTOG and ARG2(b,a)) +#define AND_B(a, b) CHOICE(andb ARG2(a,b), andb ARG2(a,b), andb ARG2(b,a)) +#define ARPL(a,b) CHOICE(arpl ARG2(a,b), arpl ARG2(a,b), arpl ARG2(b,a)) +#define BOUND_L(a, b) CHOICE(boundl ARG2(a,b), boundl ARG2(b,a), _LTOG bound ARG2(b,a)) +#define BOUND_W(a, b) CHOICE(boundw ARG2(a,b), boundw ARG2(b,a), _WTOG bound ARG2(b,a)) +#define BSF_L(a, b) CHOICE(bsfl ARG2(a,b), bsfl ARG2(a,b), _LTOG bsf ARG2(b,a)) +#define BSF_W(a, b) CHOICE(bsfw ARG2(a,b), bsfw ARG2(a,b), _WTOG bsf ARG2(b,a)) +#define BSR_L(a, b) CHOICE(bsrl ARG2(a,b), bsrl ARG2(a,b), _LTOG bsr ARG2(b,a)) +#define BSR_W(a, b) CHOICE(bsrw ARG2(a,b), bsrw ARG2(a,b), _WTOG bsr ARG2(b,a)) +#define BT_L(a, b) CHOICE(btl ARG2(a,b), btl ARG2(a,b), _LTOG bt ARG2(b,a)) +#define BT_W(a, b) CHOICE(btw ARG2(a,b), btw ARG2(a,b), _WTOG bt ARG2(b,a)) +#define BTC_L(a, b) CHOICE(btcl ARG2(a,b), btcl ARG2(a,b), _LTOG btc ARG2(b,a)) +#define BTC_W(a, b) CHOICE(btcw ARG2(a,b), btcw ARG2(a,b), _WTOG btc ARG2(b,a)) +#define BTR_L(a, b) CHOICE(btrl ARG2(a,b), btrl ARG2(a,b), _LTOG btr ARG2(b,a)) +#define BTR_W(a, b) CHOICE(btrw ARG2(a,b), btrw ARG2(a,b), _WTOG btr ARG2(b,a)) +#define BTS_L(a, b) CHOICE(btsl ARG2(a,b), btsl ARG2(a,b), _LTOG bts ARG2(b,a)) +#define BTS_W(a, b) CHOICE(btsw ARG2(a,b), btsw ARG2(a,b), _WTOG bts ARG2(b,a)) +#define CALL(a) CHOICE(call a, call a, call a) +#define CALLF(s,a) CHOICE(lcall ARG2(s,a), lcall ARG2(s,a), callf s:a) +#define CBW CHOICE(cbtw, cbw, cbw) +#define CWDE CHOICE(cwtd, cwde, cwde) +#define CLC CHOICE(clc, clc, clc) +#define CLD CHOICE(cld, cld, cld) +#define CLI CHOICE(cli, cli, cli) +#define CLTS CHOICE(clts, clts, clts) +#define CMC CHOICE(cmc, cmc, cmc) +#define CMP_L(a, b) CHOICE(cmpl ARG2(a,b), cmpl ARG2(a,b), _LTOG cmp ARG2(b,a)) +#define CMP_W(a, b) CHOICE(cmpw ARG2(a,b), cmpw ARG2(a,b), _WTOG cmp ARG2(b,a)) +#define CMP_B(a, b) CHOICE(cmpb ARG2(a,b), cmpb ARG2(a,b), cmpb ARG2(b,a)) +#define CMPS_L CHOICE(cmpsl, cmpsl, _LTOG cmps) +#define CMPS_W CHOICE(cmpsw, cmpsw, _WTOG cmps) +#define CMPS_B CHOICE(cmpsb, cmpsb, cmpsb) +#define CWD CHOICE(cwtl, cwd, cwd) +#define CDQ CHOICE(cltd, cdq, cdq) +#define DAA CHOICE(daa, daa, daa) +#define DAS CHOICE(das, das, das) +#define DEC_L(a) CHOICE(decl a, decl a, _LTOG dec a) +#define DEC_W(a) CHOICE(decw a, decw a, _WTOG dec a) +#define DEC_B(a) CHOICE(decb a, decb a, decb a) +#define DIV_L(a) CHOICE(divl a, divl a, div a) +#define DIV_W(a) CHOICE(divw a, divw a, div a) +#define DIV_B(a) CHOICE(divb a, divb a, divb a) +#define ENTER(a,b) CHOICE(enter ARG2(a,b), enter ARG2(a,b), enter ARG2(b,a)) +#define HLT CHOICE(hlt, hlt, hlt) +#define IDIV_L(a) CHOICE(idivl a, idivl a, _LTOG idiv a) +#define IDIV_W(a) CHOICE(idivw a, idivw a, _WTOG idiv a) +#define IDIV_B(a) CHOICE(idivb a, idivb a, idivb a) +/* More forms than this for imul!! */ +#define IMUL_L(a, b) CHOICE(imull ARG2(a,b), imull ARG2(a,b), _LTOG imul ARG2(b,a)) +#define IMUL_W(a, b) CHOICE(imulw ARG2(a,b), imulw ARG2(a,b), _WTOG imul ARG2(b,a)) +#define IMUL_B(a) CHOICE(imulb a, imulb a, imulb a) +#define IN_L CHOICE(inl (DX), inl ARG2(DX,EAX), _LTOG in DX) +#define IN_W CHOICE(inw (DX), inw ARG2(DX,AX), _WTOG in DX) +#define IN_B CHOICE(inb (DX), inb ARG2(DX,AL), inb DX) +/* Please AS code writer: use the following ONLY, if you refer to ports<256 + * directly, but not in IN1_W(DX), for instance, even if IN1_ looks nicer + */ +#if defined (sun) +#define IN1_L(a) CHOICE(inl (a), inl ARG2(a,EAX), _LTOG in a) +#define IN1_W(a) CHOICE(inw (a), inw ARG2(a,AX), _WTOG in a) +#define IN1_B(a) CHOICE(inb (a), inb ARG2(a,AL), inb a) +#else +#define IN1_L(a) CHOICE(inl a, inl ARG2(a,EAX), _LTOG in a) +#define IN1_W(a) CHOICE(inw a, inw ARG2(a,AX), _WTOG in a) +#define IN1_B(a) CHOICE(inb a, inb ARG2(a,AL), inb a) +#endif +#define INC_L(a) CHOICE(incl a, incl a, _LTOG inc a) +#define INC_W(a) CHOICE(incw a, incw a, _WTOG inc a) +#define INC_B(a) CHOICE(incb a, incb a, incb a) +#define INS_L CHOICE(insl, insl, _LTOG ins) +#define INS_W CHOICE(insw, insw, _WTOG ins) +#define INS_B CHOICE(insb, insb, insb) +#define INT(a) CHOICE(int a, int a, int a) +#define INT3 CHOICE(int CONST(3), int3, int CONST(3)) +#define INTO CHOICE(into, into, into) +#define IRET CHOICE(iret, iret, iret) +#define IRETD CHOICE(iret, iret, iretd) +#define JA(a) CHOICE(ja a, ja a, ja a) +#define JAE(a) CHOICE(jae a, jae a, jae a) +#define JB(a) CHOICE(jb a, jb a, jb a) +#define JBE(a) CHOICE(jbe a, jbe a, jbe a) +#define JC(a) CHOICE(jc a, jc a, jc a) +#define JE(a) CHOICE(je a, je a, je a) +#define JG(a) CHOICE(jg a, jg a, jg a) +#define JGE(a) CHOICE(jge a, jge a, jge a) +#define JL(a) CHOICE(jl a, jl a, jl a) +#define JLE(a) CHOICE(jle a, jle a, jle a) +#define JNA(a) CHOICE(jna a, jna a, jna a) +#define JNAE(a) CHOICE(jnae a, jnae a, jnae a) +#define JNB(a) CHOICE(jnb a, jnb a, jnb a) +#define JNBE(a) CHOICE(jnbe a, jnbe a, jnbe a) +#define JNC(a) CHOICE(jnc a, jnc a, jnc a) +#define JNE(a) CHOICE(jne a, jne a, jne a) +#define JNG(a) CHOICE(jng a, jng a, jng a) +#define JNGE(a) CHOICE(jnge a, jnge a, jnge a) +#define JNL(a) CHOICE(jnl a, jnl a, jnl a) +#define JNLE(a) CHOICE(jnle a, jnle a, jnle a) +#define JNO(a) CHOICE(jno a, jno a, jno a) +#define JNP(a) CHOICE(jnp a, jnp a, jnp a) +#define JNS(a) CHOICE(jns a, jns a, jns a) +#define JNZ(a) CHOICE(jnz a, jnz a, jnz a) +#define JO(a) CHOICE(jo a, jo a, jo a) +#define JP(a) CHOICE(jp a, jp a, jp a) +#define JPE(a) CHOICE(jpe a, jpe a, jpe a) +#define JPO(a) CHOICE(jpo a, jpo a, jpo a) +#define JS(a) CHOICE(js a, js a, js a) +#define JZ(a) CHOICE(jz a, jz a, jz a) +#define JMP(a) CHOICE(jmp a, jmp a, jmp a) +#define JMPF(s,a) CHOICE(ljmp ARG2(s,a), ljmp ARG2(s,a), jmpf s:a) +#define LAHF CHOICE(lahf, lahf, lahf) +#if !defined(_REAL_MODE) && !defined(_V86_MODE) +#define LAR(a, b) CHOICE(lar ARG2(a, b), lar ARG2(a, b), lar ARG2(b, a)) +#endif +#define LEA_L(a, b) CHOICE(leal ARG2(a,b), leal ARG2(a,b), _LTOG lea ARG2(b,a)) +#define LEA_W(a, b) CHOICE(leaw ARG2(a,b), leaw ARG2(a,b), _WTOG lea ARG2(b,a)) +#define LEAVE CHOICE(leave, leave, leave) +#define LGDT(a) CHOICE(lgdt a, lgdt a, lgdt a) +#define LIDT(a) CHOICE(lidt a, lidt a, lidt a) +#define LDS(a, b) CHOICE(ldsl ARG2(a,b), lds ARG2(a,b), lds ARG2(b,a)) +#define LES(a, b) CHOICE(lesl ARG2(a,b), les ARG2(a,b), les ARG2(b,a)) +#define LFS(a, b) CHOICE(lfsl ARG2(a,b), lfs ARG2(a,b), lfs ARG2(b,a)) +#define LGS(a, b) CHOICE(lgsl ARG2(a,b), lgs ARG2(a,b), lgs ARG2(b,a)) +#define LSS(a, b) CHOICE(lssl ARG2(a,b), lss ARG2(a,b), lss ARG2(b,a)) +#define LLDT(a) CHOICE(lldt a, lldt a, lldt a) +#define LMSW(a) CHOICE(lmsw a, lmsw a, lmsw a) +#define LOCK CHOICE(lock, lock, lock) +#define LODS_L CHOICE(lodsl, lodsl, _LTOG lods) +#define LODS_W CHOICE(lodsw, lodsw, _WTOG lods) +#define LODS_B CHOICE(lodsb, lodsb, lodsb) +#define LOOP(a) CHOICE(loop a, loop a, loop a) +#define LOOPE(a) CHOICE(loope a, loope a, loope a) +#define LOOPZ(a) CHOICE(loopz a, loopz a, loopz a) +#define LOOPNE(a) CHOICE(loopne a, loopne a, loopne a) +#define LOOPNZ(a) CHOICE(loopnz a, loopnz a, loopnz a) +#if !defined(_REAL_MODE) && !defined(_V86_MODE) +#define LSL(a, b) CHOICE(lsl ARG2(a,b), lsl ARG2(a,b), lsl ARG2(b,a)) +#endif +#define LTR(a) CHOICE(ltr a, ltr a, ltr a) +#define MOV_SR(a, b) CHOICE(movw ARG2(a,b), mov ARG2(a,b), mov ARG2(b,a)) +#define MOV_L(a, b) CHOICE(movl ARG2(a,b), movl ARG2(a,b), _LTOG mov ARG2(b,a)) +#define MOV_W(a, b) CHOICE(movw ARG2(a,b), movw ARG2(a,b), _WTOG mov ARG2(b,a)) +#define MOV_B(a, b) CHOICE(movb ARG2(a,b), movb ARG2(a,b), movb ARG2(b,a)) +#define MOVS_L CHOICE(movsl, movsl, _LTOG movs) +#define MOVS_W CHOICE(movsw, movsw, _WTOG movs) +#define MOVS_B CHOICE(movsb, movsb, movsb) +#define MOVSX_BL(a, b) CHOICE(movsbl ARG2(a,b), movsbl ARG2(a,b), movsx ARG2(b,a)) +#define MOVSX_BW(a, b) CHOICE(movsbw ARG2(a,b), movsbw ARG2(a,b), movsx ARG2(b,a)) +#define MOVSX_WL(a, b) CHOICE(movswl ARG2(a,b), movswl ARG2(a,b), movsx ARG2(b,a)) +#define MOVZX_BL(a, b) CHOICE(movzbl ARG2(a,b), movzbl ARG2(a,b), movzx ARG2(b,a)) +#define MOVZX_BW(a, b) CHOICE(movzbw ARG2(a,b), movzbw ARG2(a,b), movzx ARG2(b,a)) +#define MOVZX_WL(a, b) CHOICE(movzwl ARG2(a,b), movzwl ARG2(a,b), movzx ARG2(b,a)) +#define MUL_L(a) CHOICE(mull a, mull a, _LTOG mul a) +#define MUL_W(a) CHOICE(mulw a, mulw a, _WTOG mul a) +#define MUL_B(a) CHOICE(mulb a, mulb a, mulb a) +#define NEG_L(a) CHOICE(negl a, negl a, _LTOG neg a) +#define NEG_W(a) CHOICE(negw a, negw a, _WTOG neg a) +#define NEG_B(a) CHOICE(negb a, negb a, negb a) +#define NOP CHOICE(nop, nop, nop) +#define NOT_L(a) CHOICE(notl a, notl a, _LTOG not a) +#define NOT_W(a) CHOICE(notw a, notw a, _WTOG not a) +#define NOT_B(a) CHOICE(notb a, notb a, notb a) +#define OR_L(a,b) CHOICE(orl ARG2(a,b), orl ARG2(a,b), _LTOG or ARG2(b,a)) +#define OR_W(a,b) CHOICE(orw ARG2(a,b), orw ARG2(a,b), _WTOG or ARG2(b,a)) +#define OR_B(a,b) CHOICE(orb ARG2(a,b), orb ARG2(a,b), orb ARG2(b,a)) +#define OUT_L CHOICE(outl (DX), outl ARG2(EAX,DX), _LTOG out DX) +#define OUT_W CHOICE(outw (DX), outw ARG2(AX,DX), _WTOG out DX) +#define OUT_B CHOICE(outb (DX), outb ARG2(AL,DX), outb DX) +/* Please AS code writer: use the following ONLY, if you refer to ports<256 + * directly, but not in OUT1_W(DX), for instance, even if OUT1_ looks nicer + */ +#define OUT1_L(a) CHOICE(outl (a), outl ARG2(EAX,a), _LTOG out a) +#define OUT1_W(a) CHOICE(outw (a), outw ARG2(AX,a), _WTOG out a) +#define OUT1_B(a) CHOICE(outb (a), outb ARG2(AL,a), outb a) +#define OUTS_L CHOICE(outsl, outsl, _LTOG outs) +#define OUTS_W CHOICE(outsw, outsw, _WTOG outs) +#define OUTS_B CHOICE(outsb, outsb, outsb) +#define POP_SR(a) CHOICE(pop a, pop a, pop a) +#define POP_L(a) CHOICE(popl a, popl a, _LTOG pop a) +#define POP_W(a) CHOICE(popw a, popw a, _WTOG pop a) +#define POPA_L CHOICE(popal, popal, _LTOG popa) +#define POPA_W CHOICE(popaw, popaw, _WTOG popa) +#define POPF_L CHOICE(popfl, popfl, _LTOG popf) +#define POPF_W CHOICE(popfw, popfw, _WTOG popf) +#define PUSH_SR(a) CHOICE(push a, push a, push a) +#define PUSH_L(a) CHOICE(pushl a, pushl a, _LTOG push a) +#define PUSH_W(a) CHOICE(pushw a, pushw a, _WTOG push a) +#define PUSH_B(a) CHOICE(push a, pushb a, push a) +#define PUSHA_L CHOICE(pushal, pushal, _LTOG pusha) +#define PUSHA_W CHOICE(pushaw, pushaw, _WTOG pusha) +#define PUSHF_L CHOICE(pushfl, pushfl, _LTOG pushf) +#define PUSHF_W CHOICE(pushfw, pushfw, _WTOG pushf) +#define RCL_L(a, b) CHOICE(rcll ARG2(a,b), rcll ARG2(a,b), _LTOG rcl ARG2(b,a)) +#define RCL_W(a, b) CHOICE(rclw ARG2(a,b), rclw ARG2(a,b), _WTOG rcl ARG2(b,a)) +#define RCL_B(a, b) CHOICE(rclb ARG2(a,b), rclb ARG2(a,b), rclb ARG2(b,a)) +#define RCR_L(a, b) CHOICE(rcrl ARG2(a,b), rcrl ARG2(a,b), _LTOG rcr ARG2(b,a)) +#define RCR_W(a, b) CHOICE(rcrw ARG2(a,b), rcrw ARG2(a,b), _WTOG rcr ARG2(b,a)) +#define RCR_B(a, b) CHOICE(rcrb ARG2(a,b), rcrb ARG2(a,b), rcrb ARG2(b,a)) +#define ROL_L(a, b) CHOICE(roll ARG2(a,b), roll ARG2(a,b), _LTOG rol ARG2(b,a)) +#define ROL_W(a, b) CHOICE(rolw ARG2(a,b), rolw ARG2(a,b), _WTOG rol ARG2(b,a)) +#define ROL_B(a, b) CHOICE(rolb ARG2(a,b), rolb ARG2(a,b), rolb ARG2(b,a)) +#define ROR_L(a, b) CHOICE(rorl ARG2(a,b), rorl ARG2(a,b), _LTOG ror ARG2(b,a)) +#define ROR_W(a, b) CHOICE(rorw ARG2(a,b), rorw ARG2(a,b), _WTOG ror ARG2(b,a)) +#define ROR_B(a, b) CHOICE(rorb ARG2(a,b), rorb ARG2(a,b), rorb ARG2(b,a)) +#define REP CHOICE(rep ;, rep ;, repe) +#define REPE CHOICE(repz ;, repe ;, repe) +#define REPNE CHOICE(repnz ;, repne ;, repne) +#define REPNZ REPNE +#define REPZ REPE +#define RET CHOICE(ret, ret, ret) +#define SAHF CHOICE(sahf, sahf, sahf) +#define SAL_L(a, b) CHOICE(sall ARG2(a,b), sall ARG2(a,b), _LTOG sal ARG2(b,a)) +#define SAL_W(a, b) CHOICE(salw ARG2(a,b), salw ARG2(a,b), _WTOG sal ARG2(b,a)) +#define SAL_B(a, b) CHOICE(salb ARG2(a,b), salb ARG2(a,b), salb ARG2(b,a)) +#define SAR_L(a, b) CHOICE(sarl ARG2(a,b), sarl ARG2(a,b), _LTOG sar ARG2(b,a)) +#define SAR_W(a, b) CHOICE(sarw ARG2(a,b), sarw ARG2(a,b), _WTOG sar ARG2(b,a)) +#define SAR_B(a, b) CHOICE(sarb ARG2(a,b), sarb ARG2(a,b), sarb ARG2(b,a)) +#define SBB_L(a, b) CHOICE(sbbl ARG2(a,b), sbbl ARG2(a,b), _LTOG sbb ARG2(b,a)) +#define SBB_W(a, b) CHOICE(sbbw ARG2(a,b), sbbw ARG2(a,b), _WTOG sbb ARG2(b,a)) +#define SBB_B(a, b) CHOICE(sbbb ARG2(a,b), sbbb ARG2(a,b), sbbb ARG2(b,a)) +#define SCAS_L CHOICE(scasl, scasl, _LTOG scas) +#define SCAS_W CHOICE(scasw, scasw, _WTOG scas) +#define SCAS_B CHOICE(scasb, scasb, scasb) +#define SETA(a) CHOICE(seta a, seta a, seta a) +#define SETAE(a) CHOICE(setae a, setae a, setae a) +#define SETB(a) CHOICE(setb a, setb a, setb a) +#define SETBE(a) CHOICE(setbe a, setbe a, setbe a) +#define SETC(a) CHOICE(setc a, setb a, setb a) +#define SETE(a) CHOICE(sete a, sete a, sete a) +#define SETG(a) CHOICE(setg a, setg a, setg a) +#define SETGE(a) CHOICE(setge a, setge a, setge a) +#define SETL(a) CHOICE(setl a, setl a, setl a) +#define SETLE(a) CHOICE(setle a, setle a, setle a) +#define SETNA(a) CHOICE(setna a, setna a, setna a) +#define SETNAE(a) CHOICE(setnae a, setnae a, setnae a) +#define SETNB(a) CHOICE(setnb a, setnb a, setnb a) +#define SETNBE(a) CHOICE(setnbe a, setnbe a, setnbe a) +#define SETNC(a) CHOICE(setnc a, setnb a, setnb a) +#define SETNE(a) CHOICE(setne a, setne a, setne a) +#define SETNG(a) CHOICE(setng a, setng a, setng a) +#define SETNGE(a) CHOICE(setnge a, setnge a, setnge a) +#define SETNL(a) CHOICE(setnl a, setnl a, setnl a) +#define SETNLE(a) CHOICE(setnle a, setnle a, setnle a) +#define SETNO(a) CHOICE(setno a, setno a, setno a) +#define SETNP(a) CHOICE(setnp a, setnp a, setnp a) +#define SETNS(a) CHOICE(setns a, setns a, setna a) +#define SETNZ(a) CHOICE(setnz a, setnz a, setnz a) +#define SETO(a) CHOICE(seto a, seto a, seto a) +#define SETP(a) CHOICE(setp a, setp a, setp a) +#define SETPE(a) CHOICE(setpe a, setpe a, setpe a) +#define SETPO(a) CHOICE(setpo a, setpo a, setpo a) +#define SETS(a) CHOICE(sets a, sets a, seta a) +#define SETZ(a) CHOICE(setz a, setz a, setz a) +#define SGDT(a) CHOICE(sgdt a, sgdt a, sgdt a) +#define SIDT(a) CHOICE(sidt a, sidt a, sidt a) +#define SHL_L(a, b) CHOICE(shll ARG2(a,b), shll ARG2(a,b), _LTOG shl ARG2(b,a)) +#define SHL_W(a, b) CHOICE(shlw ARG2(a,b), shlw ARG2(a,b), _WTOG shl ARG2(b,a)) +#define SHL_B(a, b) CHOICE(shlb ARG2(a,b), shlb ARG2(a,b), shlb ARG2(b,a)) +#define SHLD_L(a,b,c) CHOICE(shldl ARG3(a,b,c), shldl ARG3(a,b,c), _LTOG shld ARG3(c,b,a)) +#define SHLD2_L(a,b) CHOICE(shldl ARG2(a,b), shldl ARG3(CL,a,b), _LTOG shld ARG3(b,a,CL)) +#define SHLD_W(a,b,c) CHOICE(shldw ARG3(a,b,c), shldw ARG3(a,b,c), _WTOG shld ARG3(c,b,a)) +#define SHLD2_W(a,b) CHOICE(shldw ARG2(a,b), shldw ARG3(CL,a,b), _WTOG shld ARG3(b,a,CL)) +#define SHR_L(a, b) CHOICE(shrl ARG2(a,b), shrl ARG2(a,b), _LTOG shr ARG2(b,a)) +#define SHR_W(a, b) CHOICE(shrw ARG2(a,b), shrw ARG2(a,b), _WTOG shr ARG2(b,a)) +#define SHR_B(a, b) CHOICE(shrb ARG2(a,b), shrb ARG2(a,b), shrb ARG2(b,a)) +#define SHRD_L(a,b,c) CHOICE(shrdl ARG3(a,b,c), shrdl ARG3(a,b,c), _LTOG shrd ARG3(c,b,a)) +#define SHRD2_L(a,b) CHOICE(shrdl ARG2(a,b), shrdl ARG3(CL,a,b), _LTOG shrd ARG3(b,a,CL)) +#define SHRD_W(a,b,c) CHOICE(shrdw ARG3(a,b,c), shrdw ARG3(a,b,c), _WTOG shrd ARG3(c,b,a)) +#define SHRD2_W(a,b) CHOICE(shrdw ARG2(a,b), shrdw ARG3(CL,a,b), _WTOG shrd ARG3(b,a,CL)) +#define SLDT(a) CHOICE(sldt a, sldt a, sldt a) +#define SMSW(a) CHOICE(smsw a, smsw a, smsw a) +#define STC CHOICE(stc, stc, stc) +#define STD CHOICE(std, std, std) +#define STI CHOICE(sti, sti, sti) +#define STOS_L CHOICE(stosl, stosl, _LTOG stos) +#define STOS_W CHOICE(stosw, stosw, _WTOG stos) +#define STOS_B CHOICE(stosb, stosb, stosb) +#define STR(a) CHOICE(str a, str a, str a) +#define SUB_L(a, b) CHOICE(subl ARG2(a,b), subl ARG2(a,b), _LTOG sub ARG2(b,a)) +#define SUB_W(a, b) CHOICE(subw ARG2(a,b), subw ARG2(a,b), _WTOG sub ARG2(b,a)) +#define SUB_B(a, b) CHOICE(subb ARG2(a,b), subb ARG2(a,b), subb ARG2(b,a)) +#define TEST_L(a, b) CHOICE(testl ARG2(a,b), testl ARG2(a,b), _LTOG test ARG2(b,a)) +#define TEST_W(a, b) CHOICE(testw ARG2(a,b), testw ARG2(a,b), _WTOG test ARG2(b,a)) +#define TEST_B(a, b) CHOICE(testb ARG2(a,b), testb ARG2(a,b), testb ARG2(b,a)) +#define VERR(a) CHOICE(verr a, verr a, verr a) +#define VERW(a) CHOICE(verw a, verw a, verw a) +#define WAIT CHOICE(wait, wait, wait) +#define XCHG_L(a, b) CHOICE(xchgl ARG2(a,b), xchgl ARG2(a,b), _LTOG xchg ARG2(b,a)) +#define XCHG_W(a, b) CHOICE(xchgw ARG2(a,b), xchgw ARG2(a,b), _WTOG xchg ARG2(b,a)) +#define XCHG_B(a, b) CHOICE(xchgb ARG2(a,b), xchgb ARG2(a,b), xchgb ARG2(b,a)) +#define XLAT CHOICE(xlat, xlat, xlat) +#define XOR_L(a, b) CHOICE(xorl ARG2(a,b), xorl ARG2(a,b), _LTOG xor ARG2(b,a)) +#define XOR_W(a, b) CHOICE(xorw ARG2(a,b), xorw ARG2(a,b), _WTOG xor ARG2(b,a)) +#define XOR_B(a, b) CHOICE(xorb ARG2(a,b), xorb ARG2(a,b), xorb ARG2(b,a)) + + +/* Floating Point Instructions */ +#define F2XM1 CHOICE(f2xm1, f2xm1, f2xm1) +#define FABS CHOICE(fabs, fabs, fabs) +#define FADD_D(a) CHOICE(faddl a, faddl a, faddd a) +#define FADD_S(a) CHOICE(fadds a, fadds a, fadds a) +#define FADD2(a, b) CHOICE(fadd ARG2(a,b), fadd ARG2(a,b), fadd ARG2(b,a)) +#define FADDP(a, b) CHOICE(faddp ARG2(a,b), faddp ARG2(a,b), faddp ARG2(b,a)) +#define FIADD_L(a) CHOICE(fiaddl a, fiaddl a, fiaddl a) +#define FIADD_W(a) CHOICE(fiadd a, fiadds a, fiadds a) +#define FBLD(a) CHOICE(fbld a, fbld a, fbld a) +#define FBSTP(a) CHOICE(fbstp a, fbstp a, fbstp a) +#define FCHS CHOICE(fchs, fchs, fchs) +#define FCLEX CHOICE(fclex, wait; fnclex, wait; fclex) +#define FNCLEX CHOICE(fnclex, fnclex, fclex) +#define FCOM(a) CHOICE(fcom a, fcom a, fcom a) +#define FCOM_D(a) CHOICE(fcoml a, fcoml a, fcomd a) +#define FCOM_S(a) CHOICE(fcoms a, fcoms a, fcoms a) +#define FCOMP(a) CHOICE(fcomp a, fcomp a, fcomp a) +#define FCOMP_D(a) CHOICE(fcompl a, fcompl a, fcompd a) +#define FCOMP_S(a) CHOICE(fcomps a, fcomps a, fcomps a) +#define FCOMPP CHOICE(fcompp, fcompp, fcompp) +#define FCOS CHOICE(fcos, fcos, fcos) +#define FDECSTP CHOICE(fdecstp, fdecstp, fdecstp) +#define FDIV_D(a) CHOICE(fdivl a, fdivl a, fdivd a) +#define FDIV_S(a) CHOICE(fdivs a, fdivs a, fdivs a) +#define FDIV2(a, b) CHOICE(fdiv ARG2(a,b), fdiv ARG2(a,b), fdiv ARG2(b,a)) +#define FDIVP(a, b) CHOICE(fdivp ARG2(a,b), fdivp ARG2(a,b), fdivp ARG2(b,a)) +#define FIDIV_L(a) CHOICE(fidivl a, fidivl a, fidivl a) +#define FIDIV_W(a) CHOICE(fidiv a, fidivs a, fidivs a) +#define FDIVR_D(a) CHOICE(fdivrl a, fdivrl a, fdivrd a) +#define FDIVR_S(a) CHOICE(fdivrs a, fdivrs a, fdivrs a) +#define FDIVR2(a, b) CHOICE(fdivr ARG2(a,b), fdivr ARG2(a,b), fdivr ARG2(b,a)) +#define FDIVRP(a, b) CHOICE(fdivrp ARG2(a,b), fdivrp ARG2(a,b), fdivrp ARG2(b,a)) +#define FIDIVR_L(a) CHOICE(fidivrl a, fidivrl a, fidivrl a) +#define FIDIVR_W(a) CHOICE(fidivr a, fidivrs a, fidivrs a) +#define FFREE(a) CHOICE(ffree a, ffree a, ffree a) +#define FICOM_L(a) CHOICE(ficoml a, ficoml a, ficoml a) +#define FICOM_W(a) CHOICE(ficom a, ficoms a, ficoms a) +#define FICOMP_L(a) CHOICE(ficompl a, ficompl a, ficompl a) +#define FICOMP_W(a) CHOICE(ficomp a, ficomps a, ficomps a) +#define FILD_Q(a) CHOICE(fildll a, fildq a, fildq a) +#define FILD_L(a) CHOICE(fildl a, fildl a, fildl a) +#define FILD_W(a) CHOICE(fild a, filds a, filds a) +#define FINCSTP CHOICE(fincstp, fincstp, fincstp) +#define FINIT CHOICE(finit, wait; fninit, wait; finit) +#define FNINIT CHOICE(fninit, fninit, finit) +#define FIST_L(a) CHOICE(fistl a, fistl a, fistl a) +#define FIST_W(a) CHOICE(fist a, fists a, fists a) +#define FISTP_Q(a) CHOICE(fistpll a, fistpq a, fistpq a) +#define FISTP_L(a) CHOICE(fistpl a, fistpl a, fistpl a) +#define FISTP_W(a) CHOICE(fistp a, fistps a, fistps a) +#define FLD_X(a) CHOICE(fldt a, fldt a, fldx a) /* 80 bit data type! */ +#define FLD_D(a) CHOICE(fldl a, fldl a, fldd a) +#define FLD_S(a) CHOICE(flds a, flds a, flds a) +#define FLD1 CHOICE(fld1, fld1, fld1) +#define FLDL2T CHOICE(fldl2t, fldl2t, fldl2t) +#define FLDL2E CHOICE(fldl2e, fldl2e, fldl2e) +#define FLDPI CHOICE(fldpi, fldpi, fldpi) +#define FLDLG2 CHOICE(fldlg2, fldlg2, fldlg2) +#define FLDLN2 CHOICE(fldln2, fldln2, fldln2) +#define FLDZ CHOICE(fldz, fldz, fldz) +#define FLDCW(a) CHOICE(fldcw a, fldcw a, fldcw a) +#define FLDENV(a) CHOICE(fldenv a, fldenv a, fldenv a) +#define FMUL_S(a) CHOICE(fmuls a, fmuls a, fmuls a) +#define FMUL_D(a) CHOICE(fmull a, fmull a, fmuld a) +#define FMUL2(a, b) CHOICE(fmul ARG2(a,b), fmul ARG2(a,b), fmul ARG2(b,a)) +#define FMULP(a, b) CHOICE(fmulp ARG2(a,b), fmulp ARG2(a,b), fmulp ARG2(b,a)) +#define FIMUL_L(a) CHOICE(fimull a, fimull a, fimull a) +#define FIMUL_W(a) CHOICE(fimul a, fimuls a, fimuls a) +#define FNOP CHOICE(fnop, fnop, fnop) +#define FPATAN CHOICE(fpatan, fpatan, fpatan) +#define FPREM CHOICE(fprem, fprem, fprem) +#define FPREM1 CHOICE(fprem1, fprem1, fprem1) +#define FPTAN CHOICE(fptan, fptan, fptan) +#define FRNDINT CHOICE(frndint, frndint, frndint) +#define FRSTOR(a) CHOICE(frstor a, frstor a, frstor a) +#define FSAVE(a) CHOICE(fsave a, wait; fnsave a, wait; fsave a) +#define FNSAVE(a) CHOICE(fnsave a, fnsave a, fsave a) +#define FSCALE CHOICE(fscale, fscale, fscale) +#define FSIN CHOICE(fsin, fsin, fsin) +#define FSINCOS CHOICE(fsincos, fsincos, fsincos) +#define FSQRT CHOICE(fsqrt, fsqrt, fsqrt) +#define FST_D(a) CHOICE(fstl a, fstl a, fstd a) +#define FST_S(a) CHOICE(fsts a, fsts a, fsts a) +#define FSTP_X(a) CHOICE(fstpt a, fstpt a, fstpx a) +#define FSTP_D(a) CHOICE(fstpl a, fstpl a, fstpd a) +#define FSTP_S(a) CHOICE(fstps a, fstps a, fstps a) +#define FSTP(a) CHOICE(fstp a, fstp a, fstp a) +#define FSTCW(a) CHOICE(fstcw a, wait; fnstcw a, wait; fstcw a) +#define FNSTCW(a) CHOICE(fnstcw a, fnstcw a, fstcw a) +#define FSTENV(a) CHOICE(fstenv a, wait; fnstenv a, fstenv a) +#define FNSTENV(a) CHOICE(fnstenv a, fnstenv a, fstenv a) +#define FSTSW(a) CHOICE(fstsw a, wait; fnstsw a, wait; fstsw a) +#define FNSTSW(a) CHOICE(fnstsw a, fnstsw a, fstsw a) +#define FSUB_S(a) CHOICE(fsubs a, fsubs a, fsubs a) +#define FSUB_D(a) CHOICE(fsubl a, fsubl a, fsubd a) +#define FSUB2(a, b) CHOICE(fsub ARG2(a,b), fsub ARG2(a,b), fsub ARG2(b,a)) +#define FSUBP(a, b) CHOICE(fsubp ARG2(a,b), fsubp ARG2(a,b), fsubp ARG2(b,a)) +#define FISUB_L(a) CHOICE(fisubl a, fisubl a, fisubl a) +#define FISUB_W(a) CHOICE(fisub a, fisubs a, fisubs a) +#define FSUBR_S(a) CHOICE(fsubrs a, fsubrs a, fsubrs a) +#define FSUBR_D(a) CHOICE(fsubrl a, fsubrl a, fsubrd a) +#define FSUBR2(a, b) CHOICE(fsubr ARG2(a,b), fsubr ARG2(a,b), fsubr ARG2(b,a)) +#define FSUBRP(a, b) CHOICE(fsubrp ARG2(a,b), fsubrp ARG2(a,b), fsubrp ARG2(b,a)) +#define FISUBR_L(a) CHOICE(fisubrl a, fisubrl a, fisubrl a) +#define FISUBR_W(a) CHOICE(fisubr a, fisubrs a, fisubrs a) +#define FTST CHOICE(ftst, ftst, ftst) +#define FUCOM(a) CHOICE(fucom a, fucom a, fucom a) +#define FUCOMP(a) CHOICE(fucomp a, fucomp a, fucomp a) +#define FUCOMPP CHOICE(fucompp, fucompp, fucompp) +#define FWAIT CHOICE(wait, wait, wait) +#define FXAM CHOICE(fxam, fxam, fxam) +#define FXCH(a) CHOICE(fxch a, fxch a, fxch a) +#define FXTRACT CHOICE(fxtract, fxtract, fxtract) +#define FYL2X CHOICE(fyl2x, fyl2x, fyl2x) +#define FYL2XP1 CHOICE(fyl2xp1, fyl2xp1, fyl2xp1) + +/* New instructions */ +#define CPUID CHOICE(D_BYTE ARG2(15, 162), cpuid, D_BYTE ARG2(15, 162)) +#define RDTSC CHOICE(D_BYTE ARG2(15, 49), rdtsc, D_BYTE ARG2(15, 49)) + +#else /* NASM_ASSEMBLER || MASM_ASSEMBLER is defined */ + + /****************************************/ + /* */ + /* Intel style assemblers. */ + /* (NASM and MASM) */ + /* */ + /****************************************/ + +#define P_EAX EAX +#define L_EAX EAX +#define W_AX AX +#define B_AH AH +#define B_AL AL + +#define P_EBX EBX +#define L_EBX EBX +#define W_BX BX +#define B_BH BH +#define B_BL BL + +#define P_ECX ECX +#define L_ECX ECX +#define W_CX CX +#define B_CH CH +#define B_CL CL + +#define P_EDX EDX +#define L_EDX EDX +#define W_DX DX +#define B_DH DH +#define B_DL DL + +#define P_EBP EBP +#define L_EBP EBP +#define W_BP BP + +#define P_ESI ESI +#define L_ESI ESI +#define W_SI SI + +#define P_EDI EDI +#define L_EDI EDI +#define W_DI DI + +#define P_ESP ESP +#define L_ESP ESP +#define W_SP SP + +#define W_CS CS +#define W_SS SS +#define W_DS DS +#define W_ES ES +#define W_FS FS +#define W_GS GS + +#define X_ST ST +#define D_ST ST +#define L_ST ST + +#define P_MM0 mm0 +#define P_MM1 mm1 +#define P_MM2 mm2 +#define P_MM3 mm3 +#define P_MM4 mm4 +#define P_MM5 mm5 +#define P_MM6 mm6 +#define P_MM7 mm7 + +#define P_XMM0 xmm0 +#define P_XMM1 xmm1 +#define P_XMM2 xmm2 +#define P_XMM3 xmm3 +#define P_XMM4 xmm4 +#define P_XMM5 xmm5 +#define P_XMM6 xmm6 +#define P_XMM7 xmm7 + +#define CONCAT(x, y) x ## y +#define CONCAT3(x, y, z) x ## y ## z + +#if defined(NASM_ASSEMBLER) + +#define ST(n) st ## n +#define ST0 st0 + +#define TBYTE_PTR tword +#define QWORD_PTR qword +#define DWORD_PTR dword +#define WORD_PTR word +#define BYTE_PTR byte + +#define OFFSET + +#define GLOBL GLOBAL +#define ALIGNTEXT32 ALIGN 32 +#define ALIGNTEXT16 ALIGN 16 +#define ALIGNTEXT8 ALIGN 8 +#define ALIGNTEXT4 ALIGN 4 +#define ALIGNTEXT2 ALIGN 2 +#define ALIGNTEXT32ifNOP ALIGN 32 +#define ALIGNTEXT16ifNOP ALIGN 16 +#define ALIGNTEXT8ifNOP ALIGN 8 +#define ALIGNTEXT4ifNOP ALIGN 4 +#define ALIGNDATA32 ALIGN 32 +#define ALIGNDATA16 ALIGN 16 +#define ALIGNDATA8 ALIGN 8 +#define ALIGNDATA4 ALIGN 4 +#define ALIGNDATA2 ALIGN 2 +#define FILE(s) +#define STRING(s) db s +#define D_LONG dd +#define D_WORD dw +#define D_BYTE db +/* #define SPACE */ +/* #define COMM */ +#if defined(__WATCOMC__) +SECTION _TEXT public align=16 class=CODE use32 flat +SECTION _DATA public align=16 class=DATA use32 flat +#define SEG_TEXT SECTION _TEXT +#define SEG_DATA SECTION _DATA +#define SEG_BSS SECTION .bss +#else +#define SEG_DATA SECTION .data +#define SEG_TEXT SECTION .text +#define SEG_BSS SECTION .bss +#endif + +#define D_SPACE(n) db n REP 0 + +#define AS_BEGIN + +/* Jcc's should be handled better than this... */ +#define NEAR near + +#else /* MASM */ + +#define TBYTE_PTR tbyte ptr +#define QWORD_PTR qword ptr +#define DWORD_PTR dword ptr +#define WORD_PTR word ptr +#define BYTE_PTR byte ptr + +#define OFFSET offset + +#define GLOBL GLOBAL +#define ALIGNTEXT32 ALIGN 32 +#define ALIGNTEXT16 ALIGN 16 +#define ALIGNTEXT8 ALIGN 8 +#define ALIGNTEXT4 ALIGN 4 +#define ALIGNTEXT2 ALIGN 2 +#define ALIGNTEXT32ifNOP ALIGN 32 +#define ALIGNTEXT16ifNOP ALIGN 16 +#define ALIGNTEXT8ifNOP ALIGN 8 +#define ALIGNTEXT4ifNOP ALIGN 4 +#define ALIGNDATA32 ALIGN 32 +#define ALIGNDATA16 ALIGN 16 +#define ALIGNDATA8 ALIGN 8 +#define ALIGNDATA4 ALIGN 4 +#define ALIGNDATA2 ALIGN 2 +#define FILE(s) +#define STRING(s) db s +#define D_LONG dd +#define D_WORD dw +#define D_BYTE db +/* #define SPACE */ +/* #define COMM */ +#define SEG_DATA .DATA +#define SEG_TEXT .CODE +#define SEG_BSS .DATA + +#define D_SPACE(n) db n REP 0 + +#define AS_BEGIN + +#define NEAR + +#endif + +#if defined(Lynx) || (defined(SYSV) || defined(SVR4)) \ + || (defined(__linux__) || defined(__OS2ELF__)) && defined(__ELF__) \ + || (defined(__FreeBSD__) && __FreeBSD__ >= 3) \ + || (defined(__NetBSD__) && defined(__ELF__)) +#define GLNAME(a) a +#else +#define GLNAME(a) CONCAT(_, a) +#endif + +/* + * Addressing Modes + */ + +/* Immediate Mode */ +#define P_ADDR(a) OFFSET a +#define X_ADDR(a) OFFSET a +#define D_ADDR(a) OFFSET a +#define L_ADDR(a) OFFSET a +#define W_ADDR(a) OFFSET a +#define B_ADDR(a) OFFSET a + +#define P_CONST(a) a +#define X_CONST(a) a +#define D_CONST(a) a +#define L_CONST(a) a +#define W_CONST(a) a +#define B_CONST(a) a + +/* Indirect Mode */ +#ifdef NASM_ASSEMBLER +#define P_CONTENT(a) [a] +#define X_CONTENT(a) TBYTE_PTR [a] +#define D_CONTENT(a) QWORD_PTR [a] +#define L_CONTENT(a) DWORD_PTR [a] +#define W_CONTENT(a) WORD_PTR [a] +#define B_CONTENT(a) BYTE_PTR [a] +#else +#define P_CONTENT(a) a +#define X_CONTENT(a) TBYTE_PTR a +#define D_CONTENT(a) QWORD_PTR a +#define L_CONTENT(a) DWORD_PTR a +#define W_CONTENT(a) WORD_PTR a +#define B_CONTENT(a) BYTE_PTR a +#endif + +/* Register a indirect */ +#define P_REGIND(a) [a] +#define X_REGIND(a) TBYTE_PTR [a] +#define D_REGIND(a) QWORD_PTR [a] +#define L_REGIND(a) DWORD_PTR [a] +#define W_REGIND(a) WORD_PTR [a] +#define B_REGIND(a) BYTE_PTR [a] + +/* Register b indirect plus displacement a */ +#define P_REGOFF(a, b) [b + a] +#define X_REGOFF(a, b) TBYTE_PTR [b + a] +#define D_REGOFF(a, b) QWORD_PTR [b + a] +#define L_REGOFF(a, b) DWORD_PTR [b + a] +#define W_REGOFF(a, b) WORD_PTR [b + a] +#define B_REGOFF(a, b) BYTE_PTR [b + a] + +/* Reg indirect Base + Index + Displacement - this is mainly for 16-bit mode + * which has no scaling + */ +#define P_REGBID(b, i, d) [b + i + d] +#define X_REGBID(b, i, d) TBYTE_PTR [b + i + d] +#define D_REGBID(b, i, d) QWORD_PTR [b + i + d] +#define L_REGBID(b, i, d) DWORD_PTR [b + i + d] +#define W_REGBID(b, i, d) WORD_PTR [b + i + d] +#define B_REGBID(b, i, d) BYTE_PTR [b + i + d] + +/* Reg indirect Base + (Index * Scale) */ +#define P_REGBIS(b, i, s) [b + i * s] +#define X_REGBIS(b, i, s) TBYTE_PTR [b + i * s] +#define D_REGBIS(b, i, s) QWORD_PTR [b + i * s] +#define L_REGBIS(b, i, s) DWORD_PTR [b + i * s] +#define W_REGBIS(b, i, s) WORD_PTR [b + i * s] +#define B_REGBIS(b, i, s) BYTE_PTR [b + i * s] + +/* Reg indirect Base + (Index * Scale) + Displacement */ +#define P_REGBISD(b, i, s, d) [b + i * s + d] +#define X_REGBISD(b, i, s, d) TBYTE_PTR [b + i * s + d] +#define D_REGBISD(b, i, s, d) QWORD_PTR [b + i * s + d] +#define L_REGBISD(b, i, s, d) DWORD_PTR [b + i * s + d] +#define W_REGBISD(b, i, s, d) WORD_PTR [b + i * s + d] +#define B_REGBISD(b, i, s, d) BYTE_PTR [b + i * s + d] + +/* Displaced Scaled Index: */ +#define P_REGDIS(d, i, s) [i * s + d] +#define X_REGDIS(d, i, s) TBYTE_PTR [i * s + d] +#define D_REGDIS(d, i, s) QWORD_PTR [i * s + d] +#define L_REGDIS(d, i, s) DWORD_PTR [i * s + d] +#define W_REGDIS(d, i, s) WORD_PTR [i * s + d] +#define B_REGDIS(d, i, s) BYTE_PTR [i * s + d] + +/* Indexed Base: */ +#define P_REGBI(b, i) [b + i] +#define X_REGBI(b, i) TBYTE_PTR [b + i] +#define D_REGBI(b, i) QWORD_PTR [b + i] +#define L_REGBI(b, i) DWORD_PTR [b + i] +#define W_REGBI(b, i) WORD_PTR [b + i] +#define B_REGBI(b, i) BYTE_PTR [b + i] + +/* Displaced Base: */ +#define P_REGDB(d, b) [b + d] +#define X_REGDB(d, b) TBYTE_PTR [b + d] +#define D_REGDB(d, b) QWORD_PTR [b + d] +#define L_REGDB(d, b) DWORD_PTR [b + d] +#define W_REGDB(d, b) WORD_PTR [b + d] +#define B_REGDB(d, b) BYTE_PTR [b + d] + +/* Variable indirect: */ +#define VARINDIRECT(var) [var] + +/* Use register contents as jump/call target: */ +#define CODEPTR(reg) P_(reg) + +/* + * Redefine assembler commands + */ + +#define P_(a) P_ ## a +#define X_(a) X_ ## a +#define D_(a) D_ ## a +#define SR_(a) W_ ## a +#define S_(a) L_ ## a +#define L_(a) L_ ## a +#define W_(a) W_ ## a +#define B_(a) B_ ## a + +#define AAA aaa +#define AAD aad +#define AAM aam +#define AAS aas +#define ADC_L(a, b) adc L_(b), L_(a) +#define ADC_W(a, b) adc W_(b), W_(a) +#define ADC_B(a, b) adc B_(b), B_(a) +#define ADD_L(a, b) add L_(b), L_(a) +#define ADD_W(a, b) add W_(b), W_(a) +#define ADD_B(a, b) add B_(b), B_(a) +#define AND_L(a, b) and L_(b), L_(a) +#define AND_W(a, b) and W_(b), W_(a) +#define AND_B(a, b) and B_(b), B_(a) +#define ARPL(a,b) arpl W_(b), a +#define BOUND_L(a, b) bound L_(b), L_(a) +#define BOUND_W(a, b) bound W_(b), W_(a) +#define BSF_L(a, b) bsf L_(b), L_(a) +#define BSF_W(a, b) bsf W_(b), W_(a) +#define BSR_L(a, b) bsr L_(b), L_(a) +#define BSR_W(a, b) bsr W_(b), W_(a) +#define BT_L(a, b) bt L_(b), L_(a) +#define BT_W(a, b) bt W_(b), W_(a) +#define BTC_L(a, b) btc L_(b), L_(a) +#define BTC_W(a, b) btc W_(b), W_(a) +#define BTR_L(a, b) btr L_(b), L_(a) +#define BTR_W(a, b) btr W_(b), W_(a) +#define BTS_L(a, b) bts L_(b), L_(a) +#define BTS_W(a, b) bts W_(b), W_(a) +#define CALL(a) call a +#define CALLF(s,a) call far s:a +#define CBW cbw +#define CWDE cwde +#define CLC clc +#define CLD cld +#define CLI cli +#define CLTS clts +#define CMC cmc +#define CMP_L(a, b) cmp L_(b), L_(a) +#define CMP_W(a, b) cmp W_(b), W_(a) +#define CMP_B(a, b) cmp B_(b), B_(a) +#define CMPS_L cmpsd +#define CMPS_W cmpsw +#define CMPS_B cmpsb +#define CPUID cpuid +#define CWD cwd +#define CDQ cdq +#define DAA daa +#define DAS das +#define DEC_L(a) dec L_(a) +#define DEC_W(a) dec W_(a) +#define DEC_B(a) dec B_(a) +#define DIV_L(a) div L_(a) +#define DIV_W(a) div W_(a) +#define DIV_B(a) div B_(a) +#define ENTER(a,b) enter b, a +#define HLT hlt +#define IDIV_L(a) idiv L_(a) +#define IDIV_W(a) idiv W_(a) +#define IDIV_B(a) idiv B_(a) +#define IMUL_L(a, b) imul L_(b), L_(a) +#define IMUL_W(a, b) imul W_(b), W_(a) +#define IMUL_B(a) imul B_(a) +#define IN_L in EAX, DX +#define IN_W in AX, DX +#define IN_B in AL, DX +#define IN1_L(a) in1 L_(a) +#define IN1_W(a) in1 W_(a) +#define IN1_B(a) in1 B_(a) +#define INC_L(a) inc L_(a) +#define INC_W(a) inc W_(a) +#define INC_B(a) inc B_(a) +#define INS_L ins +#define INS_W ins +#define INS_B ins +#define INT(a) int B_(a) +#define INT3 int3 +#define INTO into +#define IRET iret +#define IRETD iretd +#define JA(a) ja NEAR a +#define JAE(a) jae NEAR a +#define JB(a) jb NEAR a +#define JBE(a) jbe NEAR a +#define JC(a) jc NEAR a +#define JE(a) je NEAR a +#define JG(a) jg NEAR a +#define JGE(a) jge NEAR a +#define JL(a) jl NEAR a +#define JLE(a) jle NEAR a +#define JNA(a) jna NEAR a +#define JNAE(a) jnae NEAR a +#define JNB(a) jnb NEAR a +#define JNBE(a) jnbe NEAR a +#define JNC(a) jnc NEAR a +#define JNE(a) jne NEAR a +#define JNG(a) jng NEAR a +#define JNGE(a) jnge NEAR a +#define JNL(a) jnl NEAR a +#define JNLE(a) jnle NEAR a +#define JNO(a) jno NEAR a +#define JNP(a) jnp NEAR a +#define JNS(a) jns NEAR a +#define JNZ(a) jnz NEAR a +#define JO(a) jo NEAR a +#define JP(a) jp NEAR a +#define JPE(a) jpe NEAR a +#define JPO(a) jpo NEAR a +#define JS(a) js NEAR a +#define JZ(a) jz NEAR a +#define JMP(a) jmp a +#define JMPF(s,a) jmp far s:a +#define LAHF lahf +#define LAR(a, b) lar b, a +#define LEA_L(a, b) lea P_(b), P_(a) +#define LEA_W(a, b) lea P_(b), P_(a) +#define LEAVE leave +#define LGDT(a) lgdt a +#define LIDT(a) lidt a +#define LDS(a, b) lds b, P_(a) +#define LES(a, b) les b, P_(a) +#define LFS(a, b) lfs b, P_(a) +#define LGS(a, b) lgs b, P_(a) +#define LSS(a, b) lss b, P_(a) +#define LLDT(a) lldt a +#define LMSW(a) lmsw a +#define LOCK lock +#define LODS_L lodsd +#define LODS_W lodsw +#define LODS_B lodsb +#define LOOP(a) loop a +#define LOOPE(a) loope a +#define LOOPZ(a) loopz a +#define LOOPNE(a) loopne a +#define LOOPNZ(a) loopnz a +#define LSL(a, b) lsl b, a +#define LTR(a) ltr a +#define MOV_SR(a, b) mov SR_(b), SR_(a) +#define MOV_L(a, b) mov L_(b), L_(a) +#define MOV_W(a, b) mov W_(b), W_(a) +#define MOV_B(a, b) mov B_(b), B_(a) +#define MOVS_L movsd +#define MOVS_W movsw +#define MOVS_B movsb +#define MOVSX_BL(a, b) movsx B_(b), B_(a) +#define MOVSX_BW(a, b) movsx B_(b), B_(a) +#define MOVSX_WL(a, b) movsx W_(b), W_(a) +#define MOVZX_BL(a, b) movzx B_(b), B_(a) +#define MOVZX_BW(a, b) movzx B_(b), B_(a) +#define MOVZX_WL(a, b) movzx W_(b), W_(a) +#define MUL_L(a) mul L_(a) +#define MUL_W(a) mul W_(a) +#define MUL_B(a) mul B_(a) +#define NEG_L(a) neg L_(a) +#define NEG_W(a) neg W_(a) +#define NEG_B(a) neg B_(a) +#define NOP nop +#define NOT_L(a) not L_(a) +#define NOT_W(a) not W_(a) +#define NOT_B(a) not B_(a) +#define OR_L(a,b) or L_(b), L_(a) +#define OR_W(a,b) or W_(b), W_(a) +#define OR_B(a,b) or B_(b), B_(a) +#define OUT_L out DX, EAX +#define OUT_W out DX, AX +#define OUT_B out DX, AL +#define OUT1_L(a) out1 L_(a) +#define OUT1_W(a) out1 W_(a) +#define OUT1_B(a) out1 B_(a) +#define OUTS_L outsd +#define OUTS_W outsw +#define OUTS_B outsb +#define POP_SR(a) pop SR_(a) +#define POP_L(a) pop L_(a) +#define POP_W(a) pop W_(a) +#define POPA_L popad +#define POPA_W popa +#define POPF_L popfd +#define POPF_W popf +#define PUSH_SR(a) push SR_(a) +#define PUSH_L(a) push L_(a) +#define PUSH_W(a) push W_(a) +#define PUSH_B(a) push B_(a) +#define PUSHA_L pushad +#define PUSHA_W pusha +#define PUSHF_L pushfd +#define PUSHF_W pushf +#define RCL_L(a, b) rcl L_(b), L_(a) +#define RCL_W(a, b) rcl W_(b), W_(a) +#define RCL_B(a, b) rcl B_(b), B_(a) +#define RCR_L(a, b) rcr L_(b), L_(a) +#define RCR_W(a, b) rcr W_(b), W_(a) +#define RCR_B(a, b) rcr B_(b), B_(a) +#define RDTSC rdtsc +#define ROL_L(a, b) rol L_(b), L_(a) +#define ROL_W(a, b) rol W_(b), W_(a) +#define ROL_B(a, b) rol B_(b), B_(a) +#define ROR_L(a, b) ror L_(b), L_(a) +#define ROR_W(a, b) ror W_(b), W_(a) +#define ROR_B(a, b) ror B_(b), B_(a) +#define REP rep +#define REPE repe +#define REPNE repne +#define REPNZ REPNE +#define REPZ REPE +#define RET ret +#define SAHF sahf +#define SAL_L(a, b) sal L_(b), B_(a) +#define SAL_W(a, b) sal W_(b), B_(a) +#define SAL_B(a, b) sal B_(b), B_(a) +#define SAR_L(a, b) sar L_(b), B_(a) +#define SAR_W(a, b) sar W_(b), B_(a) +#define SAR_B(a, b) sar B_(b), B_(a) +#define SBB_L(a, b) sbb L_(b), L_(a) +#define SBB_W(a, b) sbb W_(b), W_(a) +#define SBB_B(a, b) sbb B_(b), B_(a) +#define SCAS_L scas +#define SCAS_W scas +#define SCAS_B scas +#define SETA(a) seta a +#define SETAE(a) setae a +#define SETB(a) setb a +#define SETBE(a) setbe a +#define SETC(a) setc a +#define SETE(a) sete a +#define SETG(a) setg a +#define SETGE(a) setge a +#define SETL(a) setl a +#define SETLE(a) setle a +#define SETNA(a) setna a +#define SETNAE(a) setnae a +#define SETNB(a) setnb a +#define SETNBE(a) setnbe a +#define SETNC(a) setnc a +#define SETNE(a) setne a +#define SETNG(a) setng a +#define SETNGE(a) setnge a +#define SETNL(a) setnl a +#define SETNLE(a) setnle a +#define SETNO(a) setno a +#define SETNP(a) setnp a +#define SETNS(a) setns a +#define SETNZ(a) setnz a +#define SETO(a) seto a +#define SETP(a) setp a +#define SETPE(a) setpe a +#define SETPO(a) setpo a +#define SETS(a) sets a +#define SETZ(a) setz a +#define SGDT(a) sgdt a +#define SIDT(a) sidt a +#define SHL_L(a, b) shl L_(b), B_(a) +#define SHL_W(a, b) shl W_(b), B_(a) +#define SHL_B(a, b) shl B_(b), B_(a) +#define SHLD_L(a,b,c) shld +#define SHLD2_L(a,b) shld L_(b), L_(a) +#define SHLD_W(a,b,c) shld +#define SHLD2_W(a,b) shld W_(b), W_(a) +#define SHR_L(a, b) shr L_(b), B_(a) +#define SHR_W(a, b) shr W_(b), B_(a) +#define SHR_B(a, b) shr B_(b), B_(a) +#define SHRD_L(a,b,c) shrd +#define SHRD2_L(a,b) shrd L_(b), L_(a) +#define SHRD_W(a,b,c) shrd +#define SHRD2_W(a,b) shrd W_(b), W_(a) +#define SLDT(a) sldt a +#define SMSW(a) smsw a +#define STC stc +#define STD std +#define STI sti +#define STOS_L stosd +#define STOS_W stosw +#define STOS_B stosb +#define STR(a) str a +#define SUB_L(a, b) sub L_(b), L_(a) +#define SUB_W(a, b) sub W_(b), W_(a) +#define SUB_B(a, b) sub B_(b), B_(a) +#define TEST_L(a, b) test L_(b), L_(a) +#define TEST_W(a, b) test W_(b), W_(a) +#define TEST_B(a, b) test B_(b), B_(a) +#define VERR(a) verr a +#define VERW(a) verw a +#define WAIT wait +#define XCHG_L(a, b) xchg L_(b), L_(a) +#define XCHG_W(a, b) xchg W_(b), W_(a) +#define XCHG_B(a, b) xchg B_(b), B_(a) +#define XLAT xlat +#define XOR_L(a, b) xor L_(b), L_(a) +#define XOR_W(a, b) xor W_(b), W_(a) +#define XOR_B(a, b) xor B_(b), B_(a) + + +/* Floating Point Instructions */ +#define F2XM1 f2xm1 +#define FABS fabs +#define FADD_D(a) fadd D_(a) +#define FADD_S(a) fadd S_(a) +#define FADD2(a, b) fadd b, a +#define FADDP(a, b) faddp b, a +#define FIADD_L(a) fiadd L_(a) +#define FIADD_W(a) fiadd W_(a) +#define FBLD(a) fbld a +#define FBSTP(a) fbstp a +#define FCHS fchs +#define FCLEX fclex +#define FNCLEX fnclex +#define FCOM(a) fcom a +#define FCOM_D(a) fcom D_(a) +#define FCOM_S(a) fcom S_(a) +#define FCOMP(a) fcomp a +#define FCOMP_D(a) fcomp D_(a) +#define FCOMP_S(a) fcomp S_(a) +#define FCOMPP fcompp +#define FCOS fcos +#define FDECSTP fdecstp +#define FDIV_D(a) fdiv D_(a) +#define FDIV_S(a) fdiv S_(a) +#define FDIV2(a, b) fdiv b, a +#define FDIVP(a, b) fdivp b, a +#define FIDIV_L(a) fidiv L_(a) +#define FIDIV_W(a) fidiv W_(a) +#define FDIVR_D(a) fdivr D_(a) +#define FDIVR_S(a) fdivr S_(a) +#define FDIVR2(a, b) fdivr b, a +#define FDIVRP(a, b) fdivrp b, a +#define FIDIVR_L(a) fidivr L_(a) +#define FIDIVR_W(a) fidivr W_(a) +#define FFREE(a) ffree a +#define FICOM_L(a) ficom L_(a) +#define FICOM_W(a) ficom W_(a) +#define FICOMP_L(a) ficomp L_(a) +#define FICOMP_W(a) ficomp W_(a) +#define FILD_Q(a) fild D_(a) +#define FILD_L(a) fild L_(a) +#define FILD_W(a) fild W_(a) +#define FINCSTP fincstp +#define FINIT finit +#define FNINIT fninit +#define FIST_L(a) fist L_(a) +#define FIST_W(a) fist W_(a) +#define FISTP_Q(a) fistp D_(a) +#define FISTP_L(a) fistp L_(a) +#define FISTP_W(a) fistp W_(a) +#define FLD_X(a) fld X_(a) +#define FLD_D(a) fld D_(a) +#define FLD_S(a) fld S_(a) +#define FLD1 fld1 +#define FLDL2T fldl2t +#define FLDL2E fldl2e +#define FLDPI fldpi +#define FLDLG2 fldlg2 +#define FLDLN2 fldln2 +#define FLDZ fldz +#define FLDCW(a) fldcw a +#define FLDENV(a) fldenv a +#define FMUL_S(a) fmul S_(a) +#define FMUL_D(a) fmul D_(a) +#define FMUL2(a, b) fmul b, a +#define FMULP(a, b) fmulp b, a +#define FIMUL_L(a) fimul L_(a) +#define FIMUL_W(a) fimul W_(a) +#define FNOP fnop +#define FPATAN fpatan +#define FPREM fprem +#define FPREM1 fprem1 +#define FPTAN fptan +#define FRNDINT frndint +#define FRSTOR(a) frstor a +#define FSAVE(a) fsave a +#define FNSAVE(a) fnsave a +#define FSCALE fscale +#define FSIN fsin +#define FSINCOS fsincos +#define FSQRT fsqrt +#define FST_D(a) fst D_(a) +#define FST_S(a) fst S_(a) +#define FSTP_X(a) fstp X_(a) +#define FSTP_D(a) fstp D_(a) +#define FSTP_S(a) fstp S_(a) +#define FSTP(a) fstp a +#define FSTCW(a) fstcw a +#define FNSTCW(a) fnstcw a +#define FSTENV(a) fstenv a +#define FNSTENV(a) fnstenv a +#define FSTSW(a) fstsw a +#define FNSTSW(a) fnstsw a +#define FSUB_S(a) fsub S_(a) +#define FSUB_D(a) fsub D_(a) +#define FSUB2(a, b) fsub b, a +#define FSUBP(a, b) fsubp b, a +#define FISUB_L(a) fisub L_(a) +#define FISUB_W(a) fisub W_(a) +#define FSUBR_S(a) fsubr S_(a) +#define FSUBR_D(a) fsubr D_(a) +#define FSUBR2(a, b) fsubr b, a +#define FSUBRP(a, b) fsubrp b, a +#define FISUBR_L(a) fisubr L_(a) +#define FISUBR_W(a) fisubr W_(a) +#define FTST ftst +#define FUCOM(a) fucom a +#define FUCOMP(a) fucomp a +#define FUCOMPP fucompp +#define FWAIT fwait +#define FXAM fxam +#define FXCH(a) fxch a +#define FXTRACT fxtract +#define FYL2X fyl2x +#define FYL2XP1 fyl2xp1 + +#endif /* NASM_ASSEMBLER, MASM_ASSEMBLER */ + + /****************************************/ + /* */ + /* Extensions to x86 insn set - */ + /* MMX, 3DNow! */ + /* */ + /****************************************/ + +#if defined(NASM_ASSEMBLER) || defined(MASM_ASSEMBLER) +#define P_ARG1(a) P_ ## a +#define P_ARG2(a, b) P_ ## b, P_ ## a +#define P_ARG3(a, b, c) P_ ## c, P_ ## b, P_ ## a +#else +#define P_ARG1(a) a +#define P_ARG2(a, b) a, b +#define P_ARG3(a, b, c) a, b, c +#endif + +/* MMX */ +#define MOVD(a, b) movd P_ARG2(a, b) +#define MOVQ(a, b) movq P_ARG2(a, b) + +#define PADDB(a, b) paddb P_ARG2(a, b) +#define PADDW(a, b) paddw P_ARG2(a, b) +#define PADDD(a, b) paddd P_ARG2(a, b) + +#define PADDSB(a, b) paddsb P_ARG2(a, b) +#define PADDSW(a, b) paddsw P_ARG2(a, b) + +#define PADDUSB(a, b) paddusb P_ARG2(a, b) +#define PADDUSW(a, b) paddusw P_ARG2(a, b) + +#define PSUBB(a, b) psubb P_ARG2(a, b) +#define PSUBW(a, b) psubw P_ARG2(a, b) +#define PSUBD(a, b) psubd P_ARG2(a, b) + +#define PSUBSB(a, b) psubsb P_ARG2(a, b) +#define PSUBSW(a, b) psubsw P_ARG2(a, b) + +#define PSUBUSB(a, b) psubusb P_ARG2(a, b) +#define PSUBUSW(a, b) psubusw P_ARG2(a, b) + +#define PCMPEQB(a, b) pcmpeqb P_ARG2(a, b) +#define PCMPEQW(a, b) pcmpeqw P_ARG2(a, b) +#define PCMPEQD(a, b) pcmpeqd P_ARG2(a, b) + +#define PCMPGTB(a, b) pcmpgtb P_ARG2(a, b) +#define PCMPGTW(a, b) pcmpgtw P_ARG2(a, b) +#define PCMPGTD(a, b) pcmpgtd P_ARG2(a, b) + +#define PMULHW(a, b) pmulhw P_ARG2(a, b) +#define PMULLW(a, b) pmullw P_ARG2(a, b) + +#define PMADDWD(a, b) pmaddwd P_ARG2(a, b) + +#define PAND(a, b) pand P_ARG2(a, b) + +#define PANDN(a, b) pandn P_ARG2(a, b) + +#define POR(a, b) por P_ARG2(a, b) + +#define PXOR(a, b) pxor P_ARG2(a, b) + +#define PSRAW(a, b) psraw P_ARG2(a, b) +#define PSRAD(a, b) psrad P_ARG2(a, b) + +#define PSRLW(a, b) psrlw P_ARG2(a, b) +#define PSRLD(a, b) psrld P_ARG2(a, b) +#define PSRLQ(a, b) psrlq P_ARG2(a, b) + +#define PSLLW(a, b) psllw P_ARG2(a, b) +#define PSLLD(a, b) pslld P_ARG2(a, b) +#define PSLLQ(a, b) psllq P_ARG2(a, b) + +#define PACKSSWB(a, b) packsswb P_ARG2(a, b) +#define PACKSSDW(a, b) packssdw P_ARG2(a, b) +#define PACKUSWB(a, b) packuswb P_ARG2(a, b) + +#define PUNPCKHBW(a, b) punpckhbw P_ARG2(a, b) +#define PUNPCKHWD(a, b) punpckhwd P_ARG2(a, b) +#define PUNPCKHDQ(a, b) punpckhdq P_ARG2(a, b) +#define PUNPCKLBW(a, b) punpcklbw P_ARG2(a, b) +#define PUNPCKLWD(a, b) punpcklwd P_ARG2(a, b) +#define PUNPCKLDQ(a, b) punpckldq P_ARG2(a, b) + +#define EMMS emms + +/* AMD 3DNow! */ +#define PAVGUSB(a, b) pavgusb P_ARG2(a, b) +#define PFADD(a, b) pfadd P_ARG2(a, b) +#define PFSUB(a, b) pfsub P_ARG2(a, b) +#define PFSUBR(a, b) pfsubr P_ARG2(a, b) +#define PFACC(a, b) pfacc P_ARG2(a, b) +#define PFCMPGE(a, b) pfcmpge P_ARG2(a, b) +#define PFCMPGT(a, b) pfcmpgt P_ARG2(a, b) +#define PFCMPEQ(a, b) pfcmpeq P_ARG2(a, b) +#define PFMIN(a, b) pfmin P_ARG2(a, b) +#define PFMAX(a, b) pfmax P_ARG2(a, b) +#define PI2FD(a, b) pi2fd P_ARG2(a, b) +#define PF2ID(a, b) pf2id P_ARG2(a, b) +#define PFRCP(a, b) pfrcp P_ARG2(a, b) +#define PFRSQRT(a, b) pfrsqrt P_ARG2(a, b) +#define PFMUL(a, b) pfmul P_ARG2(a, b) +#define PFRCPIT1(a, b) pfrcpit1 P_ARG2(a, b) +#define PFRSQIT1(a, b) pfrsqit1 P_ARG2(a, b) +#define PFRCPIT2(a, b) pfrcpit2 P_ARG2(a, b) +#define PMULHRW(a, b) pmulhrw P_ARG2(a, b) + +#define FEMMS femms +#define PREFETCH(a) prefetch P_ARG1(a) +#define PREFETCHW(a) prefetchw P_ARG1(a) + +/* Intel SSE */ +#define ADDPS(a, b) addps P_ARG2(a, b) +#define ADDSS(a, b) addss P_ARG2(a, b) +#define ANDNPS(a, b) andnps P_ARG2(a, b) +#define ANDPS(a, b) andps P_ARG2(a, b) +/* NASM only knows the pseudo ops for these. +#define CMPPS(a, b, c) cmpps P_ARG3(a, b, c) +#define CMPSS(a, b, c) cmpss P_ARG3(a, b, c) +*/ +#define CMPEQPS(a, b) cmpeqps P_ARG2(a, b) +#define CMPLTPS(a, b) cmpltps P_ARG2(a, b) +#define CMPLEPS(a, b) cmpleps P_ARG2(a, b) +#define CMPUNORDPS(a, b) cmpunordps P_ARG2(a, b) +#define CMPNEQPS(a, b) cmpneqps P_ARG2(a, b) +#define CMPNLTPS(a, b) cmpnltps P_ARG2(a, b) +#define CMPNLEPS(a, b) cmpnleps P_ARG2(a, b) +#define CMPORDPS(a, b) cmpordps P_ARG2(a, b) +#define CMPEQSS(a, b) cmpeqss P_ARG2(a, b) +#define CMPLTSS(a, b) cmpltss P_ARG2(a, b) +#define CMPLESS(a, b) cmpless P_ARG2(a, b) +#define CMPUNORDSS(a, b) cmpunordss P_ARG2(a, b) +#define CMPNEQSS(a, b) cmpneqss P_ARG2(a, b) +#define CMPNLTSS(a, b) cmpnltss P_ARG2(a, b) +#define CMPNLESS(a, b) cmpnless P_ARG2(a, b) +#define CMPORDSS(a, b) cmpordss P_ARG2(a, b) +#define COMISS(a, b) comiss P_ARG2(a, b) +#define CVTPI2PS(a, b) cvtpi2ps P_ARG2(a, b) +#define CVTPS2PI(a, b) cvtps2pi P_ARG2(a, b) +#define CVTSI2SS(a, b) cvtsi2ss P_ARG2(a, b) +#define CVTSS2SI(a, b) cvtss2si P_ARG2(a, b) +#define CVTTPS2PI(a, b) cvttps2pi P_ARG2(a, b) +#define CVTTSS2SI(a, b) cvttss2si P_ARG2(a, b) +#define DIVPS(a, b) divps P_ARG2(a, b) +#define DIVSS(a, b) divss P_ARG2(a, b) +#define FXRSTOR(a) fxrstor P_ARG1(a) +#define FXSAVE(a) fxsave P_ARG1(a) +#define LDMXCSR(a) ldmxcsr P_ARG1(a) +#define MAXPS(a, b) maxps P_ARG2(a, b) +#define MAXSS(a, b) maxss P_ARG2(a, b) +#define MINPS(a, b) minps P_ARG2(a, b) +#define MINSS(a, b) minss P_ARG2(a, b) +#define MOVAPS(a, b) movaps P_ARG2(a, b) +#define MOVHLPS(a, b) movhlps P_ARG2(a, b) +#define MOVHPS(a, b) movhps P_ARG2(a, b) +#define MOVLHPS(a, b) movlhps P_ARG2(a, b) +#define MOVLPS(a, b) movlps P_ARG2(a, b) +#define MOVMSKPS(a, b) movmskps P_ARG2(a, b) +#define MOVNTPS(a, b) movntps P_ARG2(a, b) +#define MOVNTQ(a, b) movntq P_ARG2(a, b) +#define MOVSS(a, b) movss P_ARG2(a, b) +#define MOVUPS(a, b) movups P_ARG2(a, b) +#define MULPS(a, b) mulps P_ARG2(a, b) +#define MULSS(a, b) mulss P_ARG2(a, b) +#define ORPS(a, b) orps P_ARG2(a, b) +#define RCPPS(a, b) rcpps P_ARG2(a, b) +#define RCPSS(a, b) rcpss P_ARG2(a, b) +#define RSQRTPS(a, b) rsqrtps P_ARG2(a, b) +#define RSQRTSS(a, b) rsqrtss P_ARG2(a, b) +#define SHUFPS(a, b, c) shufps P_ARG3(a, b, c) +#define SQRTPS(a, b) sqrtps P_ARG2(a, b) +#define SQRTSS(a, b) sqrtss P_ARG2(a, b) +#define STMXCSR(a) stmxcsr P_ARG1(a) +#define SUBPS(a, b) subps P_ARG2(a, b) +#define UCOMISS(a, b) ucomiss P_ARG2(a, b) +#define UNPCKHPS(a, b) unpckhps P_ARG2(a, b) +#define UNPCKLPS(a, b) unpcklps P_ARG2(a, b) +#define XORPS(a, b) xorps P_ARG2(a, b) + +#define PREFETCHNTA(a) prefetchnta P_ARG1(a) +#define PREFETCHT0(a) prefetcht0 P_ARG1(a) +#define PREFETCHT1(a) prefetcht1 P_ARG1(a) +#define PREFETCHT2(a) prefetcht2 P_ARG1(a) +#define SFENCE sfence + +/* Added by BrianP for FreeBSD (per David Dawes) */ +#if !defined(NASM_ASSEMBLER) && !defined(MASM_ASSEMBLER) && !defined(__bsdi__) +#define LLBL(a) CONCAT(.L,a) +#define LLBL2(a,b) CONCAT3(.L,a,b) +#else +#define LLBL(a) a +#define LLBL2(a,b) CONCAT(a,b) +#endif + +/* Segment overrides */ +#define SEGCS D_BYTE 46 +#define SEGDS D_BYTE 62 +#define SEGES D_BYTE 38 +#define SEGFS D_BYTE 100 +#define SEGGS D_BYTE 101 + +/* Temporary labels: valid until next non-local label */ +#ifdef NASM_ASSEMBLER +#define TLBL(a) CONCAT(.,a) +#else +#define TLBL(a) CONCAT(a,$) +#endif + +/* Hidden symbol visibility support. + * If we build with gcc's -fvisibility=hidden flag, we'll need to change + * the symbol visibility mode to 'default'. + */ +#if defined(GNU_ASSEMBLER) && !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) +# define HIDDEN(x) .hidden x +#elif defined(__GNUC__) && !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) +# pragma GCC visibility push(default) +# define HIDDEN(x) .hidden x +#else +# define HIDDEN(x) +#endif + +#endif /* __ASSYNTAX_H__ */ diff --git a/mesalib/src/mesa/x86/clip_args.h b/mesalib/src/mesa/x86/clip_args.h index faa3c6f87..796611fbf 100644 --- a/mesalib/src/mesa/x86/clip_args.h +++ b/mesalib/src/mesa/x86/clip_args.h @@ -1,59 +1,59 @@ -
-/*
- * Mesa 3-D graphics library
- * Version: 3.5
- *
- * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * Clip test function interface for assembly code. Simply define
- * FRAME_OFFSET to the number of bytes pushed onto the stack before
- * using the ARG_* argument macros.
- *
- * Gareth Hughes
- */
-
-#ifndef __CLIP_ARGS_H__
-#define __CLIP_ARGS_H__
-
-/*
- * Offsets for clip_func arguments
- *
- * typedef GLvector4f *(*clip_func)( GLvector4f *clip_vec,
- * GLvector4f *proj_vec,
- * GLubyte clipMask[],
- * GLubyte *orMask,
- * GLubyte *andMask );
- */
-
-#define OFFSET_SOURCE 4
-#define OFFSET_DEST 8
-#define OFFSET_CLIP 12
-#define OFFSET_OR 16
-#define OFFSET_AND 20
-
-#define ARG_SOURCE REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP)
-#define ARG_DEST REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP)
-#define ARG_CLIP REGOFF(FRAME_OFFSET+OFFSET_CLIP, ESP)
-#define ARG_OR REGOFF(FRAME_OFFSET+OFFSET_OR, ESP)
-#define ARG_AND REGOFF(FRAME_OFFSET+OFFSET_AND, ESP)
-
-#endif
+ +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * Clip test function interface for assembly code. Simply define + * FRAME_OFFSET to the number of bytes pushed onto the stack before + * using the ARG_* argument macros. + * + * Gareth Hughes + */ + +#ifndef __CLIP_ARGS_H__ +#define __CLIP_ARGS_H__ + +/* + * Offsets for clip_func arguments + * + * typedef GLvector4f *(*clip_func)( GLvector4f *clip_vec, + * GLvector4f *proj_vec, + * GLubyte clipMask[], + * GLubyte *orMask, + * GLubyte *andMask ); + */ + +#define OFFSET_SOURCE 4 +#define OFFSET_DEST 8 +#define OFFSET_CLIP 12 +#define OFFSET_OR 16 +#define OFFSET_AND 20 + +#define ARG_SOURCE REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP) +#define ARG_DEST REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP) +#define ARG_CLIP REGOFF(FRAME_OFFSET+OFFSET_CLIP, ESP) +#define ARG_OR REGOFF(FRAME_OFFSET+OFFSET_OR, ESP) +#define ARG_AND REGOFF(FRAME_OFFSET+OFFSET_AND, ESP) + +#endif diff --git a/mesalib/src/mesa/x86/common_x86.c b/mesalib/src/mesa/x86/common_x86.c index f5b9357cc..b70ee5084 100644 --- a/mesalib/src/mesa/x86/common_x86.c +++ b/mesalib/src/mesa/x86/common_x86.c @@ -1,336 +1,336 @@ -/*
- * Mesa 3-D graphics library
- * Version: 6.5.1
- *
- * Copyright (C) 1999-2006 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * \file common_x86.c
- *
- * Check CPU capabilities & initialize optimized funtions for this particular
- * processor.
- *
- * Changed by Andre Werthmann for using the new SSE functions.
- *
- * \author Holger Waechtler <holger@akaflieg.extern.tu-berlin.de>
- * \author Andre Werthmann <wertmann@cs.uni-potsdam.de>
- */
-
-/* XXX these includes should probably go into imports.h or glheader.h */
-#if defined(USE_SSE_ASM) && defined(__linux__)
-#include <linux/version.h>
-#endif
-#if defined(USE_SSE_ASM) && defined(__FreeBSD__)
-#include <sys/types.h>
-#include <sys/sysctl.h>
-#endif
-#if defined(USE_SSE_ASM) && defined(__OpenBSD__)
-#include <sys/param.h>
-#include <sys/sysctl.h>
-#include <machine/cpu.h>
-#endif
-
-#include "main/imports.h"
-#include "common_x86_asm.h"
-
-
-/** Bitmask of X86_FEATURE_x bits */
-int _mesa_x86_cpu_features = 0x0;
-
-static int detection_debug = GL_FALSE;
-
-/* No reason for this to be public.
- */
-extern GLuint _ASMAPI _mesa_x86_has_cpuid(void);
-extern void _ASMAPI _mesa_x86_cpuid(GLuint op, GLuint *reg_eax, GLuint *reg_ebx, GLuint *reg_ecx, GLuint *reg_edx);
-extern GLuint _ASMAPI _mesa_x86_cpuid_eax(GLuint op);
-extern GLuint _ASMAPI _mesa_x86_cpuid_ebx(GLuint op);
-extern GLuint _ASMAPI _mesa_x86_cpuid_ecx(GLuint op);
-extern GLuint _ASMAPI _mesa_x86_cpuid_edx(GLuint op);
-
-
-#if defined(USE_SSE_ASM)
-/*
- * We must verify that the Streaming SIMD Extensions are truly supported
- * on this processor before we go ahead and hook out the optimized code.
- *
- * However, I have been told by Alan Cox that all 2.4 (and later) Linux
- * kernels provide full SSE support on all processors that expose SSE via
- * the CPUID mechanism.
- */
-
-/* These are assembly functions: */
-extern void _mesa_test_os_sse_support( void );
-extern void _mesa_test_os_sse_exception_support( void );
-
-
-#if defined(WIN32)
-#ifndef STATUS_FLOAT_MULTIPLE_TRAPS
-# define STATUS_FLOAT_MULTIPLE_TRAPS (0xC00002B5L)
-#endif
-static LONG WINAPI ExceptionFilter(LPEXCEPTION_POINTERS exp)
-{
- PEXCEPTION_RECORD rec = exp->ExceptionRecord;
- PCONTEXT ctx = exp->ContextRecord;
-
- if ( rec->ExceptionCode == EXCEPTION_ILLEGAL_INSTRUCTION ) {
- _mesa_debug(NULL, "EXCEPTION_ILLEGAL_INSTRUCTION\n" );
- _mesa_x86_cpu_features &= ~(X86_FEATURE_XMM);
- } else if ( rec->ExceptionCode == STATUS_FLOAT_MULTIPLE_TRAPS ) {
- _mesa_debug(NULL, "STATUS_FLOAT_MULTIPLE_TRAPS\n");
- /* Windows seems to clear the exception flag itself, we just have to increment Eip */
- } else {
- _mesa_debug(NULL, "UNEXPECTED EXCEPTION (0x%08x), terminating!\n" );
- return EXCEPTION_EXECUTE_HANDLER;
- }
-
- if ( (ctx->ContextFlags & CONTEXT_CONTROL) != CONTEXT_CONTROL ) {
- _mesa_debug(NULL, "Context does not contain control registers, terminating!\n");
- return EXCEPTION_EXECUTE_HANDLER;
- }
- ctx->Eip += 3;
-
- return EXCEPTION_CONTINUE_EXECUTION;
-}
-#endif /* WIN32 */
-
-
-/**
- * Check if SSE is supported.
- * If not, turn off the X86_FEATURE_XMM flag in _mesa_x86_cpu_features.
- */
-void _mesa_check_os_sse_support( void )
-{
-#if defined(__FreeBSD__)
- {
- int ret, enabled;
- unsigned int len;
- len = sizeof(enabled);
- ret = sysctlbyname("hw.instruction_sse", &enabled, &len, NULL, 0);
- if (ret || !enabled)
- _mesa_x86_cpu_features &= ~(X86_FEATURE_XMM);
- }
-#elif defined (__NetBSD__)
- {
- int ret, enabled;
- size_t len = sizeof(enabled);
- ret = sysctlbyname("machdep.sse", &enabled, &len, (void *)NULL, 0);
- if (ret || !enabled)
- _mesa_x86_cpu_features &= ~(X86_FEATURE_XMM);
- }
-#elif defined(__OpenBSD__)
- {
- int mib[2];
- int ret, enabled;
- size_t len = sizeof(enabled);
-
- mib[0] = CTL_MACHDEP;
- mib[1] = CPU_SSE;
-
- ret = sysctl(mib, 2, &enabled, &len, NULL, 0);
- if (ret || !enabled)
- _mesa_x86_cpu_features &= ~(X86_FEATURE_XMM);
- }
-#elif defined(WIN32)
- LPTOP_LEVEL_EXCEPTION_FILTER oldFilter;
-
- /* Install our ExceptionFilter */
- oldFilter = SetUnhandledExceptionFilter( ExceptionFilter );
-
- if ( cpu_has_xmm ) {
- _mesa_debug(NULL, "Testing OS support for SSE...\n");
-
- _mesa_test_os_sse_support();
-
- if ( cpu_has_xmm ) {
- _mesa_debug(NULL, "Yes.\n");
- } else {
- _mesa_debug(NULL, "No!\n");
- }
- }
-
- if ( cpu_has_xmm ) {
- _mesa_debug(NULL, "Testing OS support for SSE unmasked exceptions...\n");
-
- _mesa_test_os_sse_exception_support();
-
- if ( cpu_has_xmm ) {
- _mesa_debug(NULL, "Yes.\n");
- } else {
- _mesa_debug(NULL, "No!\n");
- }
- }
-
- /* Restore previous exception filter */
- SetUnhandledExceptionFilter( oldFilter );
-
- if ( cpu_has_xmm ) {
- _mesa_debug(NULL, "Tests of OS support for SSE passed.\n");
- } else {
- _mesa_debug(NULL, "Tests of OS support for SSE failed!\n");
- }
-#else
- /* Do nothing on other platforms for now.
- */
- if (detection_debug)
- _mesa_debug(NULL, "Not testing OS support for SSE, leaving enabled.\n");
-#endif /* __FreeBSD__ */
-}
-
-#endif /* USE_SSE_ASM */
-
-
-/**
- * Initialize the _mesa_x86_cpu_features bitfield.
- * This is a no-op if called more than once.
- */
-void
-_mesa_get_x86_features(void)
-{
- static int called = 0;
-
- if (called)
- return;
-
- called = 1;
-
-#ifdef USE_X86_ASM
- _mesa_x86_cpu_features = 0x0;
-
- if (_mesa_getenv( "MESA_NO_ASM")) {
- return;
- }
-
- if (!_mesa_x86_has_cpuid()) {
- _mesa_debug(NULL, "CPUID not detected\n");
- }
- else {
- GLuint cpu_features;
- GLuint cpu_ext_features;
- GLuint cpu_ext_info;
- char cpu_vendor[13];
- GLuint result;
-
- /* get vendor name */
- _mesa_x86_cpuid(0, &result, (GLuint *)(cpu_vendor + 0), (GLuint *)(cpu_vendor + 8), (GLuint *)(cpu_vendor + 4));
- cpu_vendor[12] = '\0';
-
- if (detection_debug)
- _mesa_debug(NULL, "CPU vendor: %s\n", cpu_vendor);
-
- /* get cpu features */
- cpu_features = _mesa_x86_cpuid_edx(1);
-
- if (cpu_features & X86_CPU_FPU)
- _mesa_x86_cpu_features |= X86_FEATURE_FPU;
- if (cpu_features & X86_CPU_CMOV)
- _mesa_x86_cpu_features |= X86_FEATURE_CMOV;
-
-#ifdef USE_MMX_ASM
- if (cpu_features & X86_CPU_MMX)
- _mesa_x86_cpu_features |= X86_FEATURE_MMX;
-#endif
-
-#ifdef USE_SSE_ASM
- if (cpu_features & X86_CPU_XMM)
- _mesa_x86_cpu_features |= X86_FEATURE_XMM;
- if (cpu_features & X86_CPU_XMM2)
- _mesa_x86_cpu_features |= X86_FEATURE_XMM2;
-#endif
-
- /* query extended cpu features */
- if ((cpu_ext_info = _mesa_x86_cpuid_eax(0x80000000)) > 0x80000000) {
- if (cpu_ext_info >= 0x80000001) {
-
- cpu_ext_features = _mesa_x86_cpuid_edx(0x80000001);
-
- if (cpu_features & X86_CPU_MMX) {
-
-#ifdef USE_3DNOW_ASM
- if (cpu_ext_features & X86_CPUEXT_3DNOW)
- _mesa_x86_cpu_features |= X86_FEATURE_3DNOW;
- if (cpu_ext_features & X86_CPUEXT_3DNOW_EXT)
- _mesa_x86_cpu_features |= X86_FEATURE_3DNOWEXT;
-#endif
-
-#ifdef USE_MMX_ASM
- if (cpu_ext_features & X86_CPUEXT_MMX_EXT)
- _mesa_x86_cpu_features |= X86_FEATURE_MMXEXT;
-#endif
- }
- }
-
- /* query cpu name */
- if (cpu_ext_info >= 0x80000002) {
- GLuint ofs;
- char cpu_name[49];
- for (ofs = 0; ofs < 3; ofs++)
- _mesa_x86_cpuid(0x80000002+ofs, (GLuint *)(cpu_name + (16*ofs)+0), (GLuint *)(cpu_name + (16*ofs)+4), (GLuint *)(cpu_name + (16*ofs)+8), (GLuint *)(cpu_name + (16*ofs)+12));
- cpu_name[48] = '\0'; /* the name should be NULL terminated, but just to be sure */
-
- if (detection_debug)
- _mesa_debug(NULL, "CPU name: %s\n", cpu_name);
- }
- }
-
- }
-
-#ifdef USE_MMX_ASM
- if ( cpu_has_mmx ) {
- if ( _mesa_getenv( "MESA_NO_MMX" ) == 0 ) {
- if (detection_debug)
- _mesa_debug(NULL, "MMX cpu detected.\n");
- } else {
- _mesa_x86_cpu_features &= ~(X86_FEATURE_MMX);
- }
- }
-#endif
-
-#ifdef USE_3DNOW_ASM
- if ( cpu_has_3dnow ) {
- if ( _mesa_getenv( "MESA_NO_3DNOW" ) == 0 ) {
- if (detection_debug)
- _mesa_debug(NULL, "3DNow! cpu detected.\n");
- } else {
- _mesa_x86_cpu_features &= ~(X86_FEATURE_3DNOW);
- }
- }
-#endif
-
-#ifdef USE_SSE_ASM
- if ( cpu_has_xmm ) {
- if ( _mesa_getenv( "MESA_NO_SSE" ) == 0 ) {
- if (detection_debug)
- _mesa_debug(NULL, "SSE cpu detected.\n");
- if ( _mesa_getenv( "MESA_FORCE_SSE" ) == 0 ) {
- _mesa_check_os_sse_support();
- }
- } else {
- _mesa_debug(NULL, "SSE cpu detected, but switched off by user.\n");
- _mesa_x86_cpu_features &= ~(X86_FEATURE_XMM);
- }
- }
-#endif
-
-#endif /* USE_X86_ASM */
-
- (void) detection_debug;
-}
+/* + * Mesa 3-D graphics library + * Version: 6.5.1 + * + * Copyright (C) 1999-2006 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * \file common_x86.c + * + * Check CPU capabilities & initialize optimized funtions for this particular + * processor. + * + * Changed by Andre Werthmann for using the new SSE functions. + * + * \author Holger Waechtler <holger@akaflieg.extern.tu-berlin.de> + * \author Andre Werthmann <wertmann@cs.uni-potsdam.de> + */ + +/* XXX these includes should probably go into imports.h or glheader.h */ +#if defined(USE_SSE_ASM) && defined(__linux__) +#include <linux/version.h> +#endif +#if defined(USE_SSE_ASM) && defined(__FreeBSD__) +#include <sys/types.h> +#include <sys/sysctl.h> +#endif +#if defined(USE_SSE_ASM) && defined(__OpenBSD__) +#include <sys/param.h> +#include <sys/sysctl.h> +#include <machine/cpu.h> +#endif + +#include "main/imports.h" +#include "common_x86_asm.h" + + +/** Bitmask of X86_FEATURE_x bits */ +int _mesa_x86_cpu_features = 0x0; + +static int detection_debug = GL_FALSE; + +/* No reason for this to be public. + */ +extern GLuint _ASMAPI _mesa_x86_has_cpuid(void); +extern void _ASMAPI _mesa_x86_cpuid(GLuint op, GLuint *reg_eax, GLuint *reg_ebx, GLuint *reg_ecx, GLuint *reg_edx); +extern GLuint _ASMAPI _mesa_x86_cpuid_eax(GLuint op); +extern GLuint _ASMAPI _mesa_x86_cpuid_ebx(GLuint op); +extern GLuint _ASMAPI _mesa_x86_cpuid_ecx(GLuint op); +extern GLuint _ASMAPI _mesa_x86_cpuid_edx(GLuint op); + + +#if defined(USE_SSE_ASM) +/* + * We must verify that the Streaming SIMD Extensions are truly supported + * on this processor before we go ahead and hook out the optimized code. + * + * However, I have been told by Alan Cox that all 2.4 (and later) Linux + * kernels provide full SSE support on all processors that expose SSE via + * the CPUID mechanism. + */ + +/* These are assembly functions: */ +extern void _mesa_test_os_sse_support( void ); +extern void _mesa_test_os_sse_exception_support( void ); + + +#if defined(WIN32) +#ifndef STATUS_FLOAT_MULTIPLE_TRAPS +# define STATUS_FLOAT_MULTIPLE_TRAPS (0xC00002B5L) +#endif +static LONG WINAPI ExceptionFilter(LPEXCEPTION_POINTERS exp) +{ + PEXCEPTION_RECORD rec = exp->ExceptionRecord; + PCONTEXT ctx = exp->ContextRecord; + + if ( rec->ExceptionCode == EXCEPTION_ILLEGAL_INSTRUCTION ) { + _mesa_debug(NULL, "EXCEPTION_ILLEGAL_INSTRUCTION\n" ); + _mesa_x86_cpu_features &= ~(X86_FEATURE_XMM); + } else if ( rec->ExceptionCode == STATUS_FLOAT_MULTIPLE_TRAPS ) { + _mesa_debug(NULL, "STATUS_FLOAT_MULTIPLE_TRAPS\n"); + /* Windows seems to clear the exception flag itself, we just have to increment Eip */ + } else { + _mesa_debug(NULL, "UNEXPECTED EXCEPTION (0x%08x), terminating!\n" ); + return EXCEPTION_EXECUTE_HANDLER; + } + + if ( (ctx->ContextFlags & CONTEXT_CONTROL) != CONTEXT_CONTROL ) { + _mesa_debug(NULL, "Context does not contain control registers, terminating!\n"); + return EXCEPTION_EXECUTE_HANDLER; + } + ctx->Eip += 3; + + return EXCEPTION_CONTINUE_EXECUTION; +} +#endif /* WIN32 */ + + +/** + * Check if SSE is supported. + * If not, turn off the X86_FEATURE_XMM flag in _mesa_x86_cpu_features. + */ +void _mesa_check_os_sse_support( void ) +{ +#if defined(__FreeBSD__) + { + int ret, enabled; + unsigned int len; + len = sizeof(enabled); + ret = sysctlbyname("hw.instruction_sse", &enabled, &len, NULL, 0); + if (ret || !enabled) + _mesa_x86_cpu_features &= ~(X86_FEATURE_XMM); + } +#elif defined (__NetBSD__) + { + int ret, enabled; + size_t len = sizeof(enabled); + ret = sysctlbyname("machdep.sse", &enabled, &len, (void *)NULL, 0); + if (ret || !enabled) + _mesa_x86_cpu_features &= ~(X86_FEATURE_XMM); + } +#elif defined(__OpenBSD__) + { + int mib[2]; + int ret, enabled; + size_t len = sizeof(enabled); + + mib[0] = CTL_MACHDEP; + mib[1] = CPU_SSE; + + ret = sysctl(mib, 2, &enabled, &len, NULL, 0); + if (ret || !enabled) + _mesa_x86_cpu_features &= ~(X86_FEATURE_XMM); + } +#elif defined(WIN32) + LPTOP_LEVEL_EXCEPTION_FILTER oldFilter; + + /* Install our ExceptionFilter */ + oldFilter = SetUnhandledExceptionFilter( ExceptionFilter ); + + if ( cpu_has_xmm ) { + _mesa_debug(NULL, "Testing OS support for SSE...\n"); + + _mesa_test_os_sse_support(); + + if ( cpu_has_xmm ) { + _mesa_debug(NULL, "Yes.\n"); + } else { + _mesa_debug(NULL, "No!\n"); + } + } + + if ( cpu_has_xmm ) { + _mesa_debug(NULL, "Testing OS support for SSE unmasked exceptions...\n"); + + _mesa_test_os_sse_exception_support(); + + if ( cpu_has_xmm ) { + _mesa_debug(NULL, "Yes.\n"); + } else { + _mesa_debug(NULL, "No!\n"); + } + } + + /* Restore previous exception filter */ + SetUnhandledExceptionFilter( oldFilter ); + + if ( cpu_has_xmm ) { + _mesa_debug(NULL, "Tests of OS support for SSE passed.\n"); + } else { + _mesa_debug(NULL, "Tests of OS support for SSE failed!\n"); + } +#else + /* Do nothing on other platforms for now. + */ + if (detection_debug) + _mesa_debug(NULL, "Not testing OS support for SSE, leaving enabled.\n"); +#endif /* __FreeBSD__ */ +} + +#endif /* USE_SSE_ASM */ + + +/** + * Initialize the _mesa_x86_cpu_features bitfield. + * This is a no-op if called more than once. + */ +void +_mesa_get_x86_features(void) +{ + static int called = 0; + + if (called) + return; + + called = 1; + +#ifdef USE_X86_ASM + _mesa_x86_cpu_features = 0x0; + + if (_mesa_getenv( "MESA_NO_ASM")) { + return; + } + + if (!_mesa_x86_has_cpuid()) { + _mesa_debug(NULL, "CPUID not detected\n"); + } + else { + GLuint cpu_features; + GLuint cpu_ext_features; + GLuint cpu_ext_info; + char cpu_vendor[13]; + GLuint result; + + /* get vendor name */ + _mesa_x86_cpuid(0, &result, (GLuint *)(cpu_vendor + 0), (GLuint *)(cpu_vendor + 8), (GLuint *)(cpu_vendor + 4)); + cpu_vendor[12] = '\0'; + + if (detection_debug) + _mesa_debug(NULL, "CPU vendor: %s\n", cpu_vendor); + + /* get cpu features */ + cpu_features = _mesa_x86_cpuid_edx(1); + + if (cpu_features & X86_CPU_FPU) + _mesa_x86_cpu_features |= X86_FEATURE_FPU; + if (cpu_features & X86_CPU_CMOV) + _mesa_x86_cpu_features |= X86_FEATURE_CMOV; + +#ifdef USE_MMX_ASM + if (cpu_features & X86_CPU_MMX) + _mesa_x86_cpu_features |= X86_FEATURE_MMX; +#endif + +#ifdef USE_SSE_ASM + if (cpu_features & X86_CPU_XMM) + _mesa_x86_cpu_features |= X86_FEATURE_XMM; + if (cpu_features & X86_CPU_XMM2) + _mesa_x86_cpu_features |= X86_FEATURE_XMM2; +#endif + + /* query extended cpu features */ + if ((cpu_ext_info = _mesa_x86_cpuid_eax(0x80000000)) > 0x80000000) { + if (cpu_ext_info >= 0x80000001) { + + cpu_ext_features = _mesa_x86_cpuid_edx(0x80000001); + + if (cpu_features & X86_CPU_MMX) { + +#ifdef USE_3DNOW_ASM + if (cpu_ext_features & X86_CPUEXT_3DNOW) + _mesa_x86_cpu_features |= X86_FEATURE_3DNOW; + if (cpu_ext_features & X86_CPUEXT_3DNOW_EXT) + _mesa_x86_cpu_features |= X86_FEATURE_3DNOWEXT; +#endif + +#ifdef USE_MMX_ASM + if (cpu_ext_features & X86_CPUEXT_MMX_EXT) + _mesa_x86_cpu_features |= X86_FEATURE_MMXEXT; +#endif + } + } + + /* query cpu name */ + if (cpu_ext_info >= 0x80000002) { + GLuint ofs; + char cpu_name[49]; + for (ofs = 0; ofs < 3; ofs++) + _mesa_x86_cpuid(0x80000002+ofs, (GLuint *)(cpu_name + (16*ofs)+0), (GLuint *)(cpu_name + (16*ofs)+4), (GLuint *)(cpu_name + (16*ofs)+8), (GLuint *)(cpu_name + (16*ofs)+12)); + cpu_name[48] = '\0'; /* the name should be NULL terminated, but just to be sure */ + + if (detection_debug) + _mesa_debug(NULL, "CPU name: %s\n", cpu_name); + } + } + + } + +#ifdef USE_MMX_ASM + if ( cpu_has_mmx ) { + if ( _mesa_getenv( "MESA_NO_MMX" ) == 0 ) { + if (detection_debug) + _mesa_debug(NULL, "MMX cpu detected.\n"); + } else { + _mesa_x86_cpu_features &= ~(X86_FEATURE_MMX); + } + } +#endif + +#ifdef USE_3DNOW_ASM + if ( cpu_has_3dnow ) { + if ( _mesa_getenv( "MESA_NO_3DNOW" ) == 0 ) { + if (detection_debug) + _mesa_debug(NULL, "3DNow! cpu detected.\n"); + } else { + _mesa_x86_cpu_features &= ~(X86_FEATURE_3DNOW); + } + } +#endif + +#ifdef USE_SSE_ASM + if ( cpu_has_xmm ) { + if ( _mesa_getenv( "MESA_NO_SSE" ) == 0 ) { + if (detection_debug) + _mesa_debug(NULL, "SSE cpu detected.\n"); + if ( _mesa_getenv( "MESA_FORCE_SSE" ) == 0 ) { + _mesa_check_os_sse_support(); + } + } else { + _mesa_debug(NULL, "SSE cpu detected, but switched off by user.\n"); + _mesa_x86_cpu_features &= ~(X86_FEATURE_XMM); + } + } +#endif + +#endif /* USE_X86_ASM */ + + (void) detection_debug; +} diff --git a/mesalib/src/mesa/x86/common_x86_asm.S b/mesalib/src/mesa/x86/common_x86_asm.S index 1d61bde58..ea4047a0e 100644 --- a/mesalib/src/mesa/x86/common_x86_asm.S +++ b/mesalib/src/mesa/x86/common_x86_asm.S @@ -1,220 +1,220 @@ -/*
- * Mesa 3-D graphics library
- * Version: 6.3
- *
- * Copyright (C) 1999-2004 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * Check extended CPU capabilities. Now justs returns the raw CPUID
- * feature information, allowing the higher level code to interpret the
- * results.
- *
- * Written by Holger Waechtler <holger@akaflieg.extern.tu-berlin.de>
- *
- * Cleaned up and simplified by Gareth Hughes <gareth@valinux.com>
- *
- */
-
-/*
- * NOTE: Avoid using spaces in between '(' ')' and arguments, especially
- * with macros like CONST, LLBL that expand to CONCAT(...). Putting spaces
- * in there will break the build on some platforms.
- */
-
-#include "matypes.h"
-#include "assyntax.h"
-#include "common_x86_features.h"
-
- SEG_TEXT
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_x86_has_cpuid)
-HIDDEN(_mesa_x86_has_cpuid)
-GLNAME(_mesa_x86_has_cpuid):
-
- /* Test for the CPUID command. If the ID Flag bit in EFLAGS
- * (bit 21) is writable, the CPUID command is present */
- PUSHF_L
- POP_L (EAX)
- MOV_L (EAX, ECX)
- XOR_L (CONST(0x00200000), EAX)
- PUSH_L (EAX)
- POPF_L
- PUSHF_L
- POP_L (EAX)
-
- /* Verify the ID Flag bit has been written. */
- CMP_L (ECX, EAX)
- SETNE (AL)
- XOR_L (CONST(0xff), EAX)
-
- RET
-
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_x86_cpuid)
-HIDDEN(_mesa_x86_cpuid)
-GLNAME(_mesa_x86_cpuid):
-
- MOV_L (REGOFF(4, ESP), EAX) /* cpuid op */
- PUSH_L (EDI)
- PUSH_L (EBX)
-
- CPUID
-
- MOV_L (REGOFF(16, ESP), EDI) /* *eax */
- MOV_L (EAX, REGIND(EDI))
- MOV_L (REGOFF(20, ESP), EDI) /* *ebx */
- MOV_L (EBX, REGIND(EDI))
- MOV_L (REGOFF(24, ESP), EDI) /* *ecx */
- MOV_L (ECX, REGIND(EDI))
- MOV_L (REGOFF(28, ESP), EDI) /* *edx */
- MOV_L (EDX, REGIND(EDI))
-
- POP_L (EBX)
- POP_L (EDI)
- RET
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_x86_cpuid_eax)
-HIDDEN(_mesa_x86_cpuid_eax)
-GLNAME(_mesa_x86_cpuid_eax):
-
- MOV_L (REGOFF(4, ESP), EAX) /* cpuid op */
- PUSH_L (EBX)
-
- CPUID
-
- POP_L (EBX)
- RET
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_x86_cpuid_ebx)
-HIDDEN(_mesa_x86_cpuid_ebx)
-GLNAME(_mesa_x86_cpuid_ebx):
-
- MOV_L (REGOFF(4, ESP), EAX) /* cpuid op */
- PUSH_L (EBX)
-
- CPUID
- MOV_L (EBX, EAX) /* return EBX */
-
- POP_L (EBX)
- RET
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_x86_cpuid_ecx)
-HIDDEN(_mesa_x86_cpuid_ecx)
-GLNAME(_mesa_x86_cpuid_ecx):
-
- MOV_L (REGOFF(4, ESP), EAX) /* cpuid op */
- PUSH_L (EBX)
-
- CPUID
- MOV_L (ECX, EAX) /* return ECX */
-
- POP_L (EBX)
- RET
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_x86_cpuid_edx)
-HIDDEN(_mesa_x86_cpuid_edx)
-GLNAME(_mesa_x86_cpuid_edx):
-
- MOV_L (REGOFF(4, ESP), EAX) /* cpuid op */
- PUSH_L (EBX)
-
- CPUID
- MOV_L (EDX, EAX) /* return EDX */
-
- POP_L (EBX)
- RET
-
-#ifdef USE_SSE_ASM
-/* Execute an SSE instruction to see if the operating system correctly
- * supports SSE. A signal handler for SIGILL should have been set
- * before calling this function, otherwise this could kill the client
- * application.
- *
- * -----> !!!! ATTENTION DEVELOPERS !!!! <-----
- *
- * If you're debugging with gdb and you get stopped in this function,
- * just type 'continue'! Execution will proceed normally.
- * See freedesktop.org bug #1709 for more info.
- */
-ALIGNTEXT4
-GLOBL GLNAME( _mesa_test_os_sse_support )
-HIDDEN(_mesa_test_os_sse_support)
-GLNAME( _mesa_test_os_sse_support ):
-
- XORPS ( XMM0, XMM0 )
-
- RET
-
-
-/* Perform an SSE divide-by-zero to see if the operating system
- * correctly supports unmasked SIMD FPU exceptions. Signal handlers for
- * SIGILL and SIGFPE should have been set before calling this function,
- * otherwise this could kill the client application.
- */
-ALIGNTEXT4
-GLOBL GLNAME( _mesa_test_os_sse_exception_support )
-HIDDEN(_mesa_test_os_sse_exception_support)
-GLNAME( _mesa_test_os_sse_exception_support ):
-
- PUSH_L ( EBP )
- MOV_L ( ESP, EBP )
- SUB_L ( CONST( 8 ), ESP )
-
- /* Save the original MXCSR register value.
- */
- STMXCSR ( REGOFF( -4, EBP ) )
-
- /* Unmask the divide-by-zero exception and perform one.
- */
- STMXCSR ( REGOFF( -8, EBP ) )
- AND_L ( CONST( 0xfffffdff ), REGOFF( -8, EBP ) )
- LDMXCSR ( REGOFF( -8, EBP ) )
-
- XORPS ( XMM0, XMM0 )
-
- PUSH_L ( CONST( 0x3f800000 ) )
- PUSH_L ( CONST( 0x3f800000 ) )
- PUSH_L ( CONST( 0x3f800000 ) )
- PUSH_L ( CONST( 0x3f800000 ) )
-
- MOVUPS ( REGIND( ESP ), XMM1 )
-
- DIVPS ( XMM0, XMM1 )
-
- /* Restore the original MXCSR register value.
- */
- LDMXCSR ( REGOFF( -4, EBP ) )
-
- LEAVE
- RET
-
-#endif
-
-
-#if defined (__ELF__) && defined (__linux__)
- .section .note.GNU-stack,"",%progbits
-#endif
+/* + * Mesa 3-D graphics library + * Version: 6.3 + * + * Copyright (C) 1999-2004 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * Check extended CPU capabilities. Now justs returns the raw CPUID + * feature information, allowing the higher level code to interpret the + * results. + * + * Written by Holger Waechtler <holger@akaflieg.extern.tu-berlin.de> + * + * Cleaned up and simplified by Gareth Hughes <gareth@valinux.com> + * + */ + +/* + * NOTE: Avoid using spaces in between '(' ')' and arguments, especially + * with macros like CONST, LLBL that expand to CONCAT(...). Putting spaces + * in there will break the build on some platforms. + */ + +#include "matypes.h" +#include "assyntax.h" +#include "common_x86_features.h" + + SEG_TEXT + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_x86_has_cpuid) +HIDDEN(_mesa_x86_has_cpuid) +GLNAME(_mesa_x86_has_cpuid): + + /* Test for the CPUID command. If the ID Flag bit in EFLAGS + * (bit 21) is writable, the CPUID command is present */ + PUSHF_L + POP_L (EAX) + MOV_L (EAX, ECX) + XOR_L (CONST(0x00200000), EAX) + PUSH_L (EAX) + POPF_L + PUSHF_L + POP_L (EAX) + + /* Verify the ID Flag bit has been written. */ + CMP_L (ECX, EAX) + SETNE (AL) + XOR_L (CONST(0xff), EAX) + + RET + + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_x86_cpuid) +HIDDEN(_mesa_x86_cpuid) +GLNAME(_mesa_x86_cpuid): + + MOV_L (REGOFF(4, ESP), EAX) /* cpuid op */ + PUSH_L (EDI) + PUSH_L (EBX) + + CPUID + + MOV_L (REGOFF(16, ESP), EDI) /* *eax */ + MOV_L (EAX, REGIND(EDI)) + MOV_L (REGOFF(20, ESP), EDI) /* *ebx */ + MOV_L (EBX, REGIND(EDI)) + MOV_L (REGOFF(24, ESP), EDI) /* *ecx */ + MOV_L (ECX, REGIND(EDI)) + MOV_L (REGOFF(28, ESP), EDI) /* *edx */ + MOV_L (EDX, REGIND(EDI)) + + POP_L (EBX) + POP_L (EDI) + RET + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_x86_cpuid_eax) +HIDDEN(_mesa_x86_cpuid_eax) +GLNAME(_mesa_x86_cpuid_eax): + + MOV_L (REGOFF(4, ESP), EAX) /* cpuid op */ + PUSH_L (EBX) + + CPUID + + POP_L (EBX) + RET + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_x86_cpuid_ebx) +HIDDEN(_mesa_x86_cpuid_ebx) +GLNAME(_mesa_x86_cpuid_ebx): + + MOV_L (REGOFF(4, ESP), EAX) /* cpuid op */ + PUSH_L (EBX) + + CPUID + MOV_L (EBX, EAX) /* return EBX */ + + POP_L (EBX) + RET + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_x86_cpuid_ecx) +HIDDEN(_mesa_x86_cpuid_ecx) +GLNAME(_mesa_x86_cpuid_ecx): + + MOV_L (REGOFF(4, ESP), EAX) /* cpuid op */ + PUSH_L (EBX) + + CPUID + MOV_L (ECX, EAX) /* return ECX */ + + POP_L (EBX) + RET + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_x86_cpuid_edx) +HIDDEN(_mesa_x86_cpuid_edx) +GLNAME(_mesa_x86_cpuid_edx): + + MOV_L (REGOFF(4, ESP), EAX) /* cpuid op */ + PUSH_L (EBX) + + CPUID + MOV_L (EDX, EAX) /* return EDX */ + + POP_L (EBX) + RET + +#ifdef USE_SSE_ASM +/* Execute an SSE instruction to see if the operating system correctly + * supports SSE. A signal handler for SIGILL should have been set + * before calling this function, otherwise this could kill the client + * application. + * + * -----> !!!! ATTENTION DEVELOPERS !!!! <----- + * + * If you're debugging with gdb and you get stopped in this function, + * just type 'continue'! Execution will proceed normally. + * See freedesktop.org bug #1709 for more info. + */ +ALIGNTEXT4 +GLOBL GLNAME( _mesa_test_os_sse_support ) +HIDDEN(_mesa_test_os_sse_support) +GLNAME( _mesa_test_os_sse_support ): + + XORPS ( XMM0, XMM0 ) + + RET + + +/* Perform an SSE divide-by-zero to see if the operating system + * correctly supports unmasked SIMD FPU exceptions. Signal handlers for + * SIGILL and SIGFPE should have been set before calling this function, + * otherwise this could kill the client application. + */ +ALIGNTEXT4 +GLOBL GLNAME( _mesa_test_os_sse_exception_support ) +HIDDEN(_mesa_test_os_sse_exception_support) +GLNAME( _mesa_test_os_sse_exception_support ): + + PUSH_L ( EBP ) + MOV_L ( ESP, EBP ) + SUB_L ( CONST( 8 ), ESP ) + + /* Save the original MXCSR register value. + */ + STMXCSR ( REGOFF( -4, EBP ) ) + + /* Unmask the divide-by-zero exception and perform one. + */ + STMXCSR ( REGOFF( -8, EBP ) ) + AND_L ( CONST( 0xfffffdff ), REGOFF( -8, EBP ) ) + LDMXCSR ( REGOFF( -8, EBP ) ) + + XORPS ( XMM0, XMM0 ) + + PUSH_L ( CONST( 0x3f800000 ) ) + PUSH_L ( CONST( 0x3f800000 ) ) + PUSH_L ( CONST( 0x3f800000 ) ) + PUSH_L ( CONST( 0x3f800000 ) ) + + MOVUPS ( REGIND( ESP ), XMM1 ) + + DIVPS ( XMM0, XMM1 ) + + /* Restore the original MXCSR register value. + */ + LDMXCSR ( REGOFF( -4, EBP ) ) + + LEAVE + RET + +#endif + + +#if defined (__ELF__) && defined (__linux__) + .section .note.GNU-stack,"",%progbits +#endif diff --git a/mesalib/src/mesa/x86/common_x86_asm.h b/mesalib/src/mesa/x86/common_x86_asm.h index efda4a0c8..0d39e3d23 100644 --- a/mesalib/src/mesa/x86/common_x86_asm.h +++ b/mesalib/src/mesa/x86/common_x86_asm.h @@ -1,53 +1,53 @@ -
-/*
- * Mesa 3-D graphics library
- * Version: 3.5
- *
- * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * Check CPU capabilities & initialize optimized funtions for this particular
- * processor.
- *
- * Written by Holger Waechtler <holger@akaflieg.extern.tu-berlin.de>
- * Changed by Andre Werthmann <wertmann@cs.uni-potsdam.de> for using the
- * new SSE functions
- *
- * Reimplemented by Gareth Hughes in a more
- * future-proof manner, based on code in the Linux kernel.
- */
-
-#ifndef __COMMON_X86_ASM_H__
-#define __COMMON_X86_ASM_H__
-
-/* Do not reference mtypes.h from this file.
- */
-#include "common_x86_features.h"
-
-extern int _mesa_x86_cpu_features;
-
-extern void _mesa_get_x86_features(void);
-
-extern void _mesa_check_os_sse_support(void);
-
-extern void _mesa_init_all_x86_transform_asm( void );
-
-#endif
+ +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * Check CPU capabilities & initialize optimized funtions for this particular + * processor. + * + * Written by Holger Waechtler <holger@akaflieg.extern.tu-berlin.de> + * Changed by Andre Werthmann <wertmann@cs.uni-potsdam.de> for using the + * new SSE functions + * + * Reimplemented by Gareth Hughes in a more + * future-proof manner, based on code in the Linux kernel. + */ + +#ifndef __COMMON_X86_ASM_H__ +#define __COMMON_X86_ASM_H__ + +/* Do not reference mtypes.h from this file. + */ +#include "common_x86_features.h" + +extern int _mesa_x86_cpu_features; + +extern void _mesa_get_x86_features(void); + +extern void _mesa_check_os_sse_support(void); + +extern void _mesa_init_all_x86_transform_asm( void ); + +#endif diff --git a/mesalib/src/mesa/x86/common_x86_features.h b/mesalib/src/mesa/x86/common_x86_features.h index 2482d00ab..676af8c1f 100644 --- a/mesalib/src/mesa/x86/common_x86_features.h +++ b/mesalib/src/mesa/x86/common_x86_features.h @@ -1,67 +1,67 @@ -
-/*
- * Mesa 3-D graphics library
- * Version: 5.1
- *
- * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * x86 CPUID feature information. The raw data is returned by
- * _mesa_identify_x86_cpu_features() and interpreted with the cpu_has_*
- * helper macros.
- *
- * Gareth Hughes
- */
-
-#ifndef __COMMON_X86_FEATURES_H__
-#define __COMMON_X86_FEATURES_H__
-
-#define X86_FEATURE_FPU (1<<0)
-#define X86_FEATURE_CMOV (1<<1)
-#define X86_FEATURE_MMXEXT (1<<2)
-#define X86_FEATURE_MMX (1<<3)
-#define X86_FEATURE_FXSR (1<<4)
-#define X86_FEATURE_XMM (1<<5)
-#define X86_FEATURE_XMM2 (1<<6)
-#define X86_FEATURE_3DNOWEXT (1<<7)
-#define X86_FEATURE_3DNOW (1<<8)
-
-/* standard X86 CPU features */
-#define X86_CPU_FPU (1<<0)
-#define X86_CPU_CMOV (1<<15)
-#define X86_CPU_MMX (1<<23)
-#define X86_CPU_XMM (1<<25)
-#define X86_CPU_XMM2 (1<<26)
-
-/* extended X86 CPU features */
-#define X86_CPUEXT_MMX_EXT (1<<22)
-#define X86_CPUEXT_3DNOW_EXT (1<<30)
-#define X86_CPUEXT_3DNOW (1<<31)
-
-#define cpu_has_mmx (_mesa_x86_cpu_features & X86_FEATURE_MMX)
-#define cpu_has_mmxext (_mesa_x86_cpu_features & X86_FEATURE_MMXEXT)
-#define cpu_has_xmm (_mesa_x86_cpu_features & X86_FEATURE_XMM)
-#define cpu_has_xmm2 (_mesa_x86_cpu_features & X86_FEATURE_XMM2)
-#define cpu_has_3dnow (_mesa_x86_cpu_features & X86_FEATURE_3DNOW)
-#define cpu_has_3dnowext (_mesa_x86_cpu_features & X86_FEATURE_3DNOWEXT)
-
-#endif
-
+ +/* + * Mesa 3-D graphics library + * Version: 5.1 + * + * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * x86 CPUID feature information. The raw data is returned by + * _mesa_identify_x86_cpu_features() and interpreted with the cpu_has_* + * helper macros. + * + * Gareth Hughes + */ + +#ifndef __COMMON_X86_FEATURES_H__ +#define __COMMON_X86_FEATURES_H__ + +#define X86_FEATURE_FPU (1<<0) +#define X86_FEATURE_CMOV (1<<1) +#define X86_FEATURE_MMXEXT (1<<2) +#define X86_FEATURE_MMX (1<<3) +#define X86_FEATURE_FXSR (1<<4) +#define X86_FEATURE_XMM (1<<5) +#define X86_FEATURE_XMM2 (1<<6) +#define X86_FEATURE_3DNOWEXT (1<<7) +#define X86_FEATURE_3DNOW (1<<8) + +/* standard X86 CPU features */ +#define X86_CPU_FPU (1<<0) +#define X86_CPU_CMOV (1<<15) +#define X86_CPU_MMX (1<<23) +#define X86_CPU_XMM (1<<25) +#define X86_CPU_XMM2 (1<<26) + +/* extended X86 CPU features */ +#define X86_CPUEXT_MMX_EXT (1<<22) +#define X86_CPUEXT_3DNOW_EXT (1<<30) +#define X86_CPUEXT_3DNOW (1<<31) + +#define cpu_has_mmx (_mesa_x86_cpu_features & X86_FEATURE_MMX) +#define cpu_has_mmxext (_mesa_x86_cpu_features & X86_FEATURE_MMXEXT) +#define cpu_has_xmm (_mesa_x86_cpu_features & X86_FEATURE_XMM) +#define cpu_has_xmm2 (_mesa_x86_cpu_features & X86_FEATURE_XMM2) +#define cpu_has_3dnow (_mesa_x86_cpu_features & X86_FEATURE_3DNOW) +#define cpu_has_3dnowext (_mesa_x86_cpu_features & X86_FEATURE_3DNOWEXT) + +#endif + diff --git a/mesalib/src/mesa/x86/mmx.h b/mesalib/src/mesa/x86/mmx.h index 28a43d18d..74e9979d3 100644 --- a/mesalib/src/mesa/x86/mmx.h +++ b/mesalib/src/mesa/x86/mmx.h @@ -1,59 +1,59 @@ -/*
- * Mesa 3-D graphics library
- * Version: 6.5.2
- *
- * Copyright (C) 1999-2006 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-
-#ifndef ASM_MMX_H
-#define ASM_MMX_H
-
-#include "main/compiler.h"
-#include "main/glheader.h"
-
-struct gl_context;
-
-extern void _ASMAPI
-_mesa_mmx_blend_transparency( struct gl_context *ctx, GLuint n, const GLubyte mask[],
- GLvoid *rgba, const GLvoid *dest,
- GLenum chanType );
-
-extern void _ASMAPI
-_mesa_mmx_blend_add( struct gl_context *ctx, GLuint n, const GLubyte mask[],
- GLvoid *rgba, const GLvoid *dest,
- GLenum chanType );
-
-extern void _ASMAPI
-_mesa_mmx_blend_min( struct gl_context *ctx, GLuint n, const GLubyte mask[],
- GLvoid *rgba, const GLvoid *dest,
- GLenum chanType );
-
-extern void _ASMAPI
-_mesa_mmx_blend_max( struct gl_context *ctx, GLuint n, const GLubyte mask[],
- GLvoid *rgba, const GLvoid *dest,
- GLenum chanType );
-
-extern void _ASMAPI
-_mesa_mmx_blend_modulate( struct gl_context *ctx, GLuint n, const GLubyte mask[],
- GLvoid *rgba, const GLvoid *dest,
- GLenum chanType );
-
-#endif
+/* + * Mesa 3-D graphics library + * Version: 6.5.2 + * + * Copyright (C) 1999-2006 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + +#ifndef ASM_MMX_H +#define ASM_MMX_H + +#include "main/compiler.h" +#include "main/glheader.h" + +struct gl_context; + +extern void _ASMAPI +_mesa_mmx_blend_transparency( struct gl_context *ctx, GLuint n, const GLubyte mask[], + GLvoid *rgba, const GLvoid *dest, + GLenum chanType ); + +extern void _ASMAPI +_mesa_mmx_blend_add( struct gl_context *ctx, GLuint n, const GLubyte mask[], + GLvoid *rgba, const GLvoid *dest, + GLenum chanType ); + +extern void _ASMAPI +_mesa_mmx_blend_min( struct gl_context *ctx, GLuint n, const GLubyte mask[], + GLvoid *rgba, const GLvoid *dest, + GLenum chanType ); + +extern void _ASMAPI +_mesa_mmx_blend_max( struct gl_context *ctx, GLuint n, const GLubyte mask[], + GLvoid *rgba, const GLvoid *dest, + GLenum chanType ); + +extern void _ASMAPI +_mesa_mmx_blend_modulate( struct gl_context *ctx, GLuint n, const GLubyte mask[], + GLvoid *rgba, const GLvoid *dest, + GLenum chanType ); + +#endif diff --git a/mesalib/src/mesa/x86/mmx_blend.S b/mesalib/src/mesa/x86/mmx_blend.S index 1e3954730..eeaf43ea9 100644 --- a/mesalib/src/mesa/x86/mmx_blend.S +++ b/mesalib/src/mesa/x86/mmx_blend.S @@ -1,402 +1,402 @@ - ;
-/*
- * Written by Jos� Fonseca <j_r_fonseca@yahoo.co.uk>
- */
-
-
-#ifdef USE_MMX_ASM
-#include "assyntax.h"
-#include "matypes.h"
-
-/* integer multiplication - alpha plus one
- *
- * makes the following approximation to the division (Sree)
- *
- * rgb*a/255 ~= (rgb*(a+1)) >> 256
- *
- * which is the fastest method that satisfies the following OpenGL criteria
- *
- * 0*0 = 0 and 255*255 = 255
- *
- * note that MX1 is a register with 0xffffffffffffffff constant which can be easily obtained making
- *
- * PCMPEQW ( MX1, MX1 )
- */
-#define GMB_MULT_AP1( MP1, MA1, MP2, MA2, MX1 ) \
- PSUBW ( MX1, MA1 ) /* a1 + 1 | a1 + 1 | a1 + 1 | a1 + 1 */ ;\
- PMULLW ( MP1, MA1 ) /* t1 = p1*a1 */ ;\
- ;\
-TWO(PSUBW ( MX1, MA2 )) /* a2 + 1 | a2 + 1 | a2 + 1 | a2 + 1 */ ;\
-TWO(PMULLW ( MP2, MA2 )) /* t2 = p2*a2 */ ;\
- ;\
- PSRLW ( CONST(8), MA1 ) /* t1 >> 8 ~= t1/255 */ ;\
-TWO(PSRLW ( CONST(8), MA2 )) /* t2 >> 8 ~= t2/255 */
-
-
-/* integer multiplication - geometric series
- *
- * takes the geometric series approximation to the division
- *
- * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
- *
- * in this case just the first two terms to fit in 16bit arithmetic
- *
- * t/255 ~= (t + (t >> 8)) >> 8
- *
- * note that just by itself it doesn't satisfies the OpenGL criteria, as 255*255 = 254,
- * so the special case a = 255 must be accounted or roundoff must be used
- */
-#define GMB_MULT_GS( MP1, MA1, MP2, MA2 ) \
- PMULLW ( MP1, MA1 ) /* t1 = p1*a1 */ ;\
-TWO(PMULLW ( MP2, MA2 )) /* t2 = p2*a2 */ ;\
- ;\
- MOVQ ( MA1, MP1 ) ;\
- PSRLW ( CONST(8), MA1 ) /* t1 >> 8 */ ;\
- ;\
-TWO(MOVQ ( MA2, MP2 )) ;\
-TWO(PSRLW ( CONST(8), MA2 )) /* t2 >> 8 */ ;\
- ;\
- PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
- PSRLW ( CONST(8), MA1 ) /* sa1 | sb1 | sg1 | sr1 */ ;\
- ;\
-TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
-TWO(PSRLW ( CONST(8), MA2 )) /* sa2 | sb2 | sg2 | sr2 */
-
-
-/* integer multiplication - geometric series plus rounding
- *
- * when using a geometric series division instead of truncating the result
- * use roundoff in the approximation (Jim Blinn)
- *
- * t = rgb*a + 0x80
- *
- * achieving the exact results
- *
- * note that M80 is register with the 0x0080008000800080 constant
- */
-#define GMB_MULT_GSR( MP1, MA1, MP2, MA2, M80 ) \
- PMULLW ( MP1, MA1 ) /* t1 = p1*a1 */ ;\
- PADDW ( M80, MA1 ) /* t1 += 0x80 */ ;\
- ;\
-TWO(PMULLW ( MP2, MA2 )) /* t2 = p2*a2 */ ;\
-TWO(PADDW ( M80, MA2 )) /* t2 += 0x80 */ ;\
- ;\
- MOVQ ( MA1, MP1 ) ;\
- PSRLW ( CONST(8), MA1 ) /* t1 >> 8 */ ;\
- ;\
-TWO(MOVQ ( MA2, MP2 )) ;\
-TWO(PSRLW ( CONST(8), MA2 )) /* t2 >> 8 */ ;\
- ;\
- PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
- PSRLW ( CONST(8), MA1 ) /* sa1 | sb1 | sg1 | sr1 */ ;\
- ;\
-TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
-TWO(PSRLW ( CONST(8), MA2 )) /* sa2 | sb2 | sg2 | sr2 */
-
-
-/* linear interpolation - geometric series
- */
-#define GMB_LERP_GS( MP1, MQ1, MA1, MP2, MQ2, MA2) \
- PSUBW ( MQ1, MP1 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
- PSLLW ( CONST(8), MQ1 ) /* q1 << 8 */ ;\
- PMULLW ( MP1, MA1 ) /* t1 = (q1 - p1)*pa1 */ ;\
- ;\
-TWO(PSUBW ( MQ2, MP2 )) /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
-TWO(PSLLW ( CONST(8), MQ2 )) /* q2 << 8 */ ;\
-TWO(PMULLW ( MP2, MA2 )) /* t2 = (q2 - p2)*pa2 */ ;\
- ;\
- MOVQ ( MA1, MP1 ) ;\
- PSRLW ( CONST(8), MA1 ) /* t1 >> 8 */ ;\
- ;\
-TWO(MOVQ ( MA2, MP2 )) ;\
-TWO(PSRLW ( CONST(8), MA2 )) /* t2 >> 8 */ ;\
- ;\
- PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
-TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
- ;\
- PADDW ( MQ1, MA1 ) /* (t1/255 + q1) << 8 */ ;\
-TWO(PADDW ( MQ2, MA2 )) /* (t2/255 + q2) << 8 */ ;\
- ;\
- PSRLW ( CONST(8), MA1 ) /* sa1 | sb1 | sg1 | sr1 */ ;\
-TWO(PSRLW ( CONST(8), MA2 )) /* sa2 | sb2 | sg2 | sr2 */
-
-
-/* linear interpolation - geometric series with roundoff
- *
- * this is a generalization of Blinn's formula to signed arithmetic
- *
- * note that M80 is a register with the 0x0080008000800080 constant
- */
-#define GMB_LERP_GSR( MP1, MQ1, MA1, MP2, MQ2, MA2, M80) \
- PSUBW ( MQ1, MP1 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
- PSLLW ( CONST(8), MQ1 ) /* q1 << 8 */ ;\
- PMULLW ( MP1, MA1 ) /* t1 = (q1 - p1)*pa1 */ ;\
- ;\
-TWO(PSUBW ( MQ2, MP2 )) /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
-TWO(PSLLW ( CONST(8), MQ2 )) /* q2 << 8 */ ;\
-TWO(PMULLW ( MP2, MA2 )) /* t2 = (q2 - p2)*pa2 */ ;\
- ;\
- PSRLW ( CONST(15), MP1 ) /* q1 > p1 ? 1 : 0 */ ;\
-TWO(PSRLW ( CONST(15), MP2 )) /* q2 > q2 ? 1 : 0 */ ;\
- ;\
- PSLLW ( CONST(8), MP1 ) /* q1 > p1 ? 0x100 : 0 */ ;\
-TWO(PSLLW ( CONST(8), MP2 )) /* q2 > q2 ? 0x100 : 0 */ ;\
- ;\
- PSUBW ( MP1, MA1 ) /* t1 -=? 0x100 */ ;\
-TWO(PSUBW ( MP2, MA2 )) /* t2 -=? 0x100 */ ;\
- ;\
- PADDW ( M80, MA1 ) /* t1 += 0x80 */ ;\
-TWO(PADDW ( M80, MA2 )) /* t2 += 0x80 */ ;\
- ;\
- MOVQ ( MA1, MP1 ) ;\
- PSRLW ( CONST(8), MA1 ) /* t1 >> 8 */ ;\
- ;\
-TWO(MOVQ ( MA2, MP2 )) ;\
-TWO(PSRLW ( CONST(8), MA2 )) /* t2 >> 8 */ ;\
- ;\
- PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
-TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
- ;\
- PADDW ( MQ1, MA1 ) /* (t1/255 + q1) << 8 */ ;\
-TWO(PADDW ( MQ2, MA2 )) /* (t2/255 + q2) << 8 */ ;\
- ;\
- PSRLW ( CONST(8), MA1 ) /* sa1 | sb1 | sg1 | sr1 */ ;\
-TWO(PSRLW ( CONST(8), MA2 )) /* sa2 | sb2 | sg2 | sr2 */
-
-
-/* linear interpolation - geometric series with correction
- *
- * instead of the roundoff this adds a small correction to satisfy the OpenGL criteria
- *
- * t/255 ~= (t + (t >> 8) + (t >> 15)) >> 8
- *
- * note that although is faster than rounding off it doesn't give always the exact results
- */
-#define GMB_LERP_GSC( MP1, MQ1, MA1, MP2, MQ2, MA2) \
- PSUBW ( MQ1, MP1 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
- PSLLW ( CONST(8), MQ1 ) /* q1 << 8 */ ;\
- PMULLW ( MP1, MA1 ) /* t1 = (q1 - p1)*pa1 */ ;\
- ;\
-TWO(PSUBW ( MQ2, MP2 )) /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
-TWO(PSLLW ( CONST(8), MQ2 )) /* q2 << 8 */ ;\
-TWO(PMULLW ( MP2, MA2 )) /* t2 = (q2 - p2)*pa2 */ ;\
- ;\
- MOVQ ( MA1, MP1 ) ;\
- PSRLW ( CONST(8), MA1 ) /* t1 >> 8 */ ;\
- ;\
-TWO(MOVQ ( MA2, MP2 )) ;\
-TWO(PSRLW ( CONST(8), MA2 )) /* t2 >> 8 */ ;\
- ;\
- PADDW ( MA1, MP1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
- PSRLW ( CONST(7), MA1 ) /* t1 >> 15 */ ;\
- ;\
-TWO(PADDW ( MA2, MP2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
-TWO(PSRLW ( CONST(7), MA2 )) /* t2 >> 15 */ ;\
- ;\
- PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8 */ ;\
-TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) + (t2 >>15) ~= (t2/255) << 8 */ ;\
- ;\
- PADDW ( MQ1, MA1 ) /* (t1/255 + q1) << 8 */ ;\
-TWO(PADDW ( MQ2, MA2 )) /* (t2/255 + q2) << 8 */ ;\
- ;\
- PSRLW ( CONST(8), MA1 ) /* sa1 | sb1 | sg1 | sr1 */ ;\
-TWO(PSRLW ( CONST(8), MA2 )) /* sa2 | sb2 | sg2 | sr2 */
-
-
-/* common blending setup code
- *
- * note that M00 is a register with 0x0000000000000000 constant which can be easily obtained making
- *
- * PXOR ( M00, M00 )
- */
-#define GMB_LOAD(rgba, dest, MPP, MQQ) \
-ONE(MOVD ( REGIND(rgba), MPP )) /* | | | | qa1 | qb1 | qg1 | qr1 */ ;\
-ONE(MOVD ( REGIND(dest), MQQ )) /* | | | | pa1 | pb1 | pg1 | pr1 */ ;\
- ;\
-TWO(MOVQ ( REGIND(rgba), MPP )) /* qa2 | qb2 | qg2 | qr2 | qa1 | qb1 | qg1 | qr1 */ ;\
-TWO(MOVQ ( REGIND(dest), MQQ )) /* pa2 | pb2 | pg2 | pr2 | pa1 | pb1 | pg1 | pr1 */
-
-#define GMB_UNPACK(MP1, MQ1, MP2, MQ2, M00) \
-TWO(MOVQ ( MP1, MP2 )) ;\
-TWO(MOVQ ( MQ1, MQ2 )) ;\
- ;\
- PUNPCKLBW ( M00, MQ1 ) /* qa1 | qb1 | qg1 | qr1 */ ;\
-TWO(PUNPCKHBW ( M00, MQ2 )) /* qa2 | qb2 | qg2 | qr2 */ ;\
- PUNPCKLBW ( M00, MP1 ) /* pa1 | pb1 | pg1 | pr1 */ ;\
-TWO(PUNPCKHBW ( M00, MP2 )) /* pa2 | pb2 | pg2 | pr2 */
-
-#define GMB_ALPHA(MP1, MA1, MP2, MA2) \
- MOVQ ( MP1, MA1 ) ;\
-TWO(MOVQ ( MP2, MA2 )) ;\
- ;\
- PUNPCKHWD ( MA1, MA1 ) /* pa1 | pa1 | | */ ;\
-TWO(PUNPCKHWD ( MA2, MA2 )) /* pa2 | pa2 | | */ ;\
- PUNPCKHDQ ( MA1, MA1 ) /* pa1 | pa1 | pa1 | pa1 */ ;\
-TWO(PUNPCKHDQ ( MA2, MA2 )) /* pa2 | pa2 | pa2 | pa2 */
-
-#define GMB_PACK( MS1, MS2 ) \
- PACKUSWB ( MS2, MS1 ) /* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */ ;\
-
-#define GMB_STORE(rgba, MSS ) \
-ONE(MOVD ( MSS, REGIND(rgba) )) /* | | | | sa1 | sb1 | sg1 | sr1 */ ;\
-TWO(MOVQ ( MSS, REGIND(rgba) )) /* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */
-
-/* Kevin F. Quinn <kevquinn@gentoo.org> 2 July 2006
- * Replace data segment constants with text-segment
- * constants (via pushl/movq)
- SEG_DATA
-
-ALIGNDATA8
-const_0080:
- D_LONG 0x00800080, 0x00800080
-
-const_80:
- D_LONG 0x80808080, 0x80808080
-*/
-#define const_0080_l 0x00800080
-#define const_0080_h 0x00800080
-#define const_80_l 0x80808080
-#define const_80_h 0x80808080
-
- SEG_TEXT
-
-
-/* Blend transparency function
- */
-
-#define TAG(x) CONCAT(x,_transparency)
-#define LLTAG(x) LLBL2(x,_transparency)
-
-#define INIT \
- PXOR ( MM0, MM0 ) /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */
-
-#define MAIN( rgba, dest ) \
- GMB_LOAD( rgba, dest, MM1, MM2 ) ;\
- GMB_UNPACK( MM1, MM2, MM4, MM5, MM0 ) ;\
- GMB_ALPHA( MM1, MM3, MM4, MM6 ) ;\
- GMB_LERP_GSC( MM1, MM2, MM3, MM4, MM5, MM6 ) ;\
- GMB_PACK( MM3, MM6 ) ;\
- GMB_STORE( rgba, MM3 )
-
-#include "mmx_blendtmp.h"
-
-
-/* Blend add function
- *
- * FIXME: Add some loop unrolling here...
- */
-
-#define TAG(x) CONCAT(x,_add)
-#define LLTAG(x) LLBL2(x,_add)
-
-#define INIT
-
-#define MAIN( rgba, dest ) \
-ONE(MOVD ( REGIND(rgba), MM1 )) /* | | | | qa1 | qb1 | qg1 | qr1 */ ;\
-ONE(MOVD ( REGIND(dest), MM2 )) /* | | | | pa1 | pb1 | pg1 | pr1 */ ;\
-ONE(PADDUSB ( MM2, MM1 )) ;\
-ONE(MOVD ( MM1, REGIND(rgba) )) /* | | | | sa1 | sb1 | sg1 | sr1 */ ;\
- ;\
-TWO(MOVQ ( REGIND(rgba), MM1 )) /* qa2 | qb2 | qg2 | qr2 | qa1 | qb1 | qg1 | qr1 */ ;\
-TWO(PADDUSB ( REGIND(dest), MM1 )) /* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */ ;\
-TWO(MOVQ ( MM1, REGIND(rgba) ))
-
-#include "mmx_blendtmp.h"
-
-
-/* Blend min function
- */
-
-#define TAG(x) CONCAT(x,_min)
-#define LLTAG(x) LLBL2(x,_min)
-
-/* Kevin F. Quinn 2nd July 2006
- * Replace data segment constants with text-segment instructions
-#define INIT \
- MOVQ ( CONTENT(const_80), MM7 )
- */
-#define INIT \
- PUSH_L ( CONST(const_80_h) ) /* 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80*/ ;\
- PUSH_L ( CONST(const_80_l) ) ;\
- MOVQ ( REGIND(ESP), MM7 ) ;\
- ADD_L ( CONST(8), ESP)
-
-#define MAIN( rgba, dest ) \
- GMB_LOAD( rgba, dest, MM1, MM2 ) ;\
- MOVQ ( MM1, MM3 ) ;\
- MOVQ ( MM2, MM4 ) ;\
- PXOR ( MM7, MM3 ) /* unsigned -> signed */ ;\
- PXOR ( MM7, MM4 ) /* unsigned -> signed */ ;\
- PCMPGTB ( MM3, MM4 ) /* q > p ? 0xff : 0x00 */ ;\
- PAND ( MM4, MM1 ) /* q > p ? p : 0 */ ;\
- PANDN ( MM2, MM4 ) /* q > p ? 0 : q */ ;\
- POR ( MM1, MM4 ) /* q > p ? p : q */ ;\
- GMB_STORE( rgba, MM4 )
-
-#include "mmx_blendtmp.h"
-
-
-/* Blend max function
- */
-
-#define TAG(x) CONCAT(x,_max)
-#define LLTAG(x) LLBL2(x,_max)
-
-/* Kevin F. Quinn 2nd July 2006
- * Replace data segment constants with text-segment instructions
-#define INIT \
- MOVQ ( CONTENT(const_80), MM7 )
- */
-#define INIT \
- PUSH_L ( CONST(const_80_l) ) /* 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80*/ ;\
- PUSH_L ( CONST(const_80_h) ) ;\
- MOVQ ( REGIND(ESP), MM7 ) ;\
- ADD_L ( CONST(8), ESP)
-
-#define MAIN( rgba, dest ) \
- GMB_LOAD( rgba, dest, MM1, MM2 ) ;\
- MOVQ ( MM1, MM3 ) ;\
- MOVQ ( MM2, MM4 ) ;\
- PXOR ( MM7, MM3 ) /* unsigned -> signed */ ;\
- PXOR ( MM7, MM4 ) /* unsigned -> signed */ ;\
- PCMPGTB ( MM3, MM4 ) /* q > p ? 0xff : 0x00 */ ;\
- PAND ( MM4, MM2 ) /* q > p ? q : 0 */ ;\
- PANDN ( MM1, MM4 ) /* q > p ? 0 : p */ ;\
- POR ( MM2, MM4 ) /* q > p ? p : q */ ;\
- GMB_STORE( rgba, MM4 )
-
-#include "mmx_blendtmp.h"
-
-
-/* Blend modulate function
- */
-
-#define TAG(x) CONCAT(x,_modulate)
-#define LLTAG(x) LLBL2(x,_modulate)
-
-/* Kevin F. Quinn 2nd July 2006
- * Replace data segment constants with text-segment instructions
-#define INIT \
- MOVQ ( CONTENT(const_0080), MM7 )
- */
-#define INIT \
- PXOR ( MM0, MM0 ) /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */ ;\
- PUSH_L ( CONST(const_0080_l) ) /* 0x0080 | 0x0080 | 0x0080 | 0x0080 */ ;\
- PUSH_L ( CONST(const_0080_h) ) ;\
- MOVQ ( REGIND(ESP), MM7 ) ;\
- ADD_L ( CONST(8), ESP)
-
-#define MAIN( rgba, dest ) \
- GMB_LOAD( rgba, dest, MM1, MM2 ) ;\
- GMB_UNPACK( MM1, MM2, MM4, MM5, MM0 ) ;\
- GMB_MULT_GSR( MM1, MM2, MM4, MM5, MM7 ) ;\
- GMB_PACK( MM2, MM5 ) ;\
- GMB_STORE( rgba, MM2 )
-
-#include "mmx_blendtmp.h"
-
-#endif
-
-#if defined (__ELF__) && defined (__linux__)
- .section .note.GNU-stack,"",%progbits
-#endif
+ ; +/* + * Written by Jos� Fonseca <j_r_fonseca@yahoo.co.uk> + */ + + +#ifdef USE_MMX_ASM +#include "assyntax.h" +#include "matypes.h" + +/* integer multiplication - alpha plus one + * + * makes the following approximation to the division (Sree) + * + * rgb*a/255 ~= (rgb*(a+1)) >> 256 + * + * which is the fastest method that satisfies the following OpenGL criteria + * + * 0*0 = 0 and 255*255 = 255 + * + * note that MX1 is a register with 0xffffffffffffffff constant which can be easily obtained making + * + * PCMPEQW ( MX1, MX1 ) + */ +#define GMB_MULT_AP1( MP1, MA1, MP2, MA2, MX1 ) \ + PSUBW ( MX1, MA1 ) /* a1 + 1 | a1 + 1 | a1 + 1 | a1 + 1 */ ;\ + PMULLW ( MP1, MA1 ) /* t1 = p1*a1 */ ;\ + ;\ +TWO(PSUBW ( MX1, MA2 )) /* a2 + 1 | a2 + 1 | a2 + 1 | a2 + 1 */ ;\ +TWO(PMULLW ( MP2, MA2 )) /* t2 = p2*a2 */ ;\ + ;\ + PSRLW ( CONST(8), MA1 ) /* t1 >> 8 ~= t1/255 */ ;\ +TWO(PSRLW ( CONST(8), MA2 )) /* t2 >> 8 ~= t2/255 */ + + +/* integer multiplication - geometric series + * + * takes the geometric series approximation to the division + * + * t/255 = (t >> 8) + (t >> 16) + (t >> 24) .. + * + * in this case just the first two terms to fit in 16bit arithmetic + * + * t/255 ~= (t + (t >> 8)) >> 8 + * + * note that just by itself it doesn't satisfies the OpenGL criteria, as 255*255 = 254, + * so the special case a = 255 must be accounted or roundoff must be used + */ +#define GMB_MULT_GS( MP1, MA1, MP2, MA2 ) \ + PMULLW ( MP1, MA1 ) /* t1 = p1*a1 */ ;\ +TWO(PMULLW ( MP2, MA2 )) /* t2 = p2*a2 */ ;\ + ;\ + MOVQ ( MA1, MP1 ) ;\ + PSRLW ( CONST(8), MA1 ) /* t1 >> 8 */ ;\ + ;\ +TWO(MOVQ ( MA2, MP2 )) ;\ +TWO(PSRLW ( CONST(8), MA2 )) /* t2 >> 8 */ ;\ + ;\ + PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\ + PSRLW ( CONST(8), MA1 ) /* sa1 | sb1 | sg1 | sr1 */ ;\ + ;\ +TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\ +TWO(PSRLW ( CONST(8), MA2 )) /* sa2 | sb2 | sg2 | sr2 */ + + +/* integer multiplication - geometric series plus rounding + * + * when using a geometric series division instead of truncating the result + * use roundoff in the approximation (Jim Blinn) + * + * t = rgb*a + 0x80 + * + * achieving the exact results + * + * note that M80 is register with the 0x0080008000800080 constant + */ +#define GMB_MULT_GSR( MP1, MA1, MP2, MA2, M80 ) \ + PMULLW ( MP1, MA1 ) /* t1 = p1*a1 */ ;\ + PADDW ( M80, MA1 ) /* t1 += 0x80 */ ;\ + ;\ +TWO(PMULLW ( MP2, MA2 )) /* t2 = p2*a2 */ ;\ +TWO(PADDW ( M80, MA2 )) /* t2 += 0x80 */ ;\ + ;\ + MOVQ ( MA1, MP1 ) ;\ + PSRLW ( CONST(8), MA1 ) /* t1 >> 8 */ ;\ + ;\ +TWO(MOVQ ( MA2, MP2 )) ;\ +TWO(PSRLW ( CONST(8), MA2 )) /* t2 >> 8 */ ;\ + ;\ + PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\ + PSRLW ( CONST(8), MA1 ) /* sa1 | sb1 | sg1 | sr1 */ ;\ + ;\ +TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\ +TWO(PSRLW ( CONST(8), MA2 )) /* sa2 | sb2 | sg2 | sr2 */ + + +/* linear interpolation - geometric series + */ +#define GMB_LERP_GS( MP1, MQ1, MA1, MP2, MQ2, MA2) \ + PSUBW ( MQ1, MP1 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\ + PSLLW ( CONST(8), MQ1 ) /* q1 << 8 */ ;\ + PMULLW ( MP1, MA1 ) /* t1 = (q1 - p1)*pa1 */ ;\ + ;\ +TWO(PSUBW ( MQ2, MP2 )) /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\ +TWO(PSLLW ( CONST(8), MQ2 )) /* q2 << 8 */ ;\ +TWO(PMULLW ( MP2, MA2 )) /* t2 = (q2 - p2)*pa2 */ ;\ + ;\ + MOVQ ( MA1, MP1 ) ;\ + PSRLW ( CONST(8), MA1 ) /* t1 >> 8 */ ;\ + ;\ +TWO(MOVQ ( MA2, MP2 )) ;\ +TWO(PSRLW ( CONST(8), MA2 )) /* t2 >> 8 */ ;\ + ;\ + PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\ +TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\ + ;\ + PADDW ( MQ1, MA1 ) /* (t1/255 + q1) << 8 */ ;\ +TWO(PADDW ( MQ2, MA2 )) /* (t2/255 + q2) << 8 */ ;\ + ;\ + PSRLW ( CONST(8), MA1 ) /* sa1 | sb1 | sg1 | sr1 */ ;\ +TWO(PSRLW ( CONST(8), MA2 )) /* sa2 | sb2 | sg2 | sr2 */ + + +/* linear interpolation - geometric series with roundoff + * + * this is a generalization of Blinn's formula to signed arithmetic + * + * note that M80 is a register with the 0x0080008000800080 constant + */ +#define GMB_LERP_GSR( MP1, MQ1, MA1, MP2, MQ2, MA2, M80) \ + PSUBW ( MQ1, MP1 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\ + PSLLW ( CONST(8), MQ1 ) /* q1 << 8 */ ;\ + PMULLW ( MP1, MA1 ) /* t1 = (q1 - p1)*pa1 */ ;\ + ;\ +TWO(PSUBW ( MQ2, MP2 )) /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\ +TWO(PSLLW ( CONST(8), MQ2 )) /* q2 << 8 */ ;\ +TWO(PMULLW ( MP2, MA2 )) /* t2 = (q2 - p2)*pa2 */ ;\ + ;\ + PSRLW ( CONST(15), MP1 ) /* q1 > p1 ? 1 : 0 */ ;\ +TWO(PSRLW ( CONST(15), MP2 )) /* q2 > q2 ? 1 : 0 */ ;\ + ;\ + PSLLW ( CONST(8), MP1 ) /* q1 > p1 ? 0x100 : 0 */ ;\ +TWO(PSLLW ( CONST(8), MP2 )) /* q2 > q2 ? 0x100 : 0 */ ;\ + ;\ + PSUBW ( MP1, MA1 ) /* t1 -=? 0x100 */ ;\ +TWO(PSUBW ( MP2, MA2 )) /* t2 -=? 0x100 */ ;\ + ;\ + PADDW ( M80, MA1 ) /* t1 += 0x80 */ ;\ +TWO(PADDW ( M80, MA2 )) /* t2 += 0x80 */ ;\ + ;\ + MOVQ ( MA1, MP1 ) ;\ + PSRLW ( CONST(8), MA1 ) /* t1 >> 8 */ ;\ + ;\ +TWO(MOVQ ( MA2, MP2 )) ;\ +TWO(PSRLW ( CONST(8), MA2 )) /* t2 >> 8 */ ;\ + ;\ + PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\ +TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\ + ;\ + PADDW ( MQ1, MA1 ) /* (t1/255 + q1) << 8 */ ;\ +TWO(PADDW ( MQ2, MA2 )) /* (t2/255 + q2) << 8 */ ;\ + ;\ + PSRLW ( CONST(8), MA1 ) /* sa1 | sb1 | sg1 | sr1 */ ;\ +TWO(PSRLW ( CONST(8), MA2 )) /* sa2 | sb2 | sg2 | sr2 */ + + +/* linear interpolation - geometric series with correction + * + * instead of the roundoff this adds a small correction to satisfy the OpenGL criteria + * + * t/255 ~= (t + (t >> 8) + (t >> 15)) >> 8 + * + * note that although is faster than rounding off it doesn't give always the exact results + */ +#define GMB_LERP_GSC( MP1, MQ1, MA1, MP2, MQ2, MA2) \ + PSUBW ( MQ1, MP1 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\ + PSLLW ( CONST(8), MQ1 ) /* q1 << 8 */ ;\ + PMULLW ( MP1, MA1 ) /* t1 = (q1 - p1)*pa1 */ ;\ + ;\ +TWO(PSUBW ( MQ2, MP2 )) /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\ +TWO(PSLLW ( CONST(8), MQ2 )) /* q2 << 8 */ ;\ +TWO(PMULLW ( MP2, MA2 )) /* t2 = (q2 - p2)*pa2 */ ;\ + ;\ + MOVQ ( MA1, MP1 ) ;\ + PSRLW ( CONST(8), MA1 ) /* t1 >> 8 */ ;\ + ;\ +TWO(MOVQ ( MA2, MP2 )) ;\ +TWO(PSRLW ( CONST(8), MA2 )) /* t2 >> 8 */ ;\ + ;\ + PADDW ( MA1, MP1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\ + PSRLW ( CONST(7), MA1 ) /* t1 >> 15 */ ;\ + ;\ +TWO(PADDW ( MA2, MP2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\ +TWO(PSRLW ( CONST(7), MA2 )) /* t2 >> 15 */ ;\ + ;\ + PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8 */ ;\ +TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) + (t2 >>15) ~= (t2/255) << 8 */ ;\ + ;\ + PADDW ( MQ1, MA1 ) /* (t1/255 + q1) << 8 */ ;\ +TWO(PADDW ( MQ2, MA2 )) /* (t2/255 + q2) << 8 */ ;\ + ;\ + PSRLW ( CONST(8), MA1 ) /* sa1 | sb1 | sg1 | sr1 */ ;\ +TWO(PSRLW ( CONST(8), MA2 )) /* sa2 | sb2 | sg2 | sr2 */ + + +/* common blending setup code + * + * note that M00 is a register with 0x0000000000000000 constant which can be easily obtained making + * + * PXOR ( M00, M00 ) + */ +#define GMB_LOAD(rgba, dest, MPP, MQQ) \ +ONE(MOVD ( REGIND(rgba), MPP )) /* | | | | qa1 | qb1 | qg1 | qr1 */ ;\ +ONE(MOVD ( REGIND(dest), MQQ )) /* | | | | pa1 | pb1 | pg1 | pr1 */ ;\ + ;\ +TWO(MOVQ ( REGIND(rgba), MPP )) /* qa2 | qb2 | qg2 | qr2 | qa1 | qb1 | qg1 | qr1 */ ;\ +TWO(MOVQ ( REGIND(dest), MQQ )) /* pa2 | pb2 | pg2 | pr2 | pa1 | pb1 | pg1 | pr1 */ + +#define GMB_UNPACK(MP1, MQ1, MP2, MQ2, M00) \ +TWO(MOVQ ( MP1, MP2 )) ;\ +TWO(MOVQ ( MQ1, MQ2 )) ;\ + ;\ + PUNPCKLBW ( M00, MQ1 ) /* qa1 | qb1 | qg1 | qr1 */ ;\ +TWO(PUNPCKHBW ( M00, MQ2 )) /* qa2 | qb2 | qg2 | qr2 */ ;\ + PUNPCKLBW ( M00, MP1 ) /* pa1 | pb1 | pg1 | pr1 */ ;\ +TWO(PUNPCKHBW ( M00, MP2 )) /* pa2 | pb2 | pg2 | pr2 */ + +#define GMB_ALPHA(MP1, MA1, MP2, MA2) \ + MOVQ ( MP1, MA1 ) ;\ +TWO(MOVQ ( MP2, MA2 )) ;\ + ;\ + PUNPCKHWD ( MA1, MA1 ) /* pa1 | pa1 | | */ ;\ +TWO(PUNPCKHWD ( MA2, MA2 )) /* pa2 | pa2 | | */ ;\ + PUNPCKHDQ ( MA1, MA1 ) /* pa1 | pa1 | pa1 | pa1 */ ;\ +TWO(PUNPCKHDQ ( MA2, MA2 )) /* pa2 | pa2 | pa2 | pa2 */ + +#define GMB_PACK( MS1, MS2 ) \ + PACKUSWB ( MS2, MS1 ) /* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */ ;\ + +#define GMB_STORE(rgba, MSS ) \ +ONE(MOVD ( MSS, REGIND(rgba) )) /* | | | | sa1 | sb1 | sg1 | sr1 */ ;\ +TWO(MOVQ ( MSS, REGIND(rgba) )) /* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */ + +/* Kevin F. Quinn <kevquinn@gentoo.org> 2 July 2006 + * Replace data segment constants with text-segment + * constants (via pushl/movq) + SEG_DATA + +ALIGNDATA8 +const_0080: + D_LONG 0x00800080, 0x00800080 + +const_80: + D_LONG 0x80808080, 0x80808080 +*/ +#define const_0080_l 0x00800080 +#define const_0080_h 0x00800080 +#define const_80_l 0x80808080 +#define const_80_h 0x80808080 + + SEG_TEXT + + +/* Blend transparency function + */ + +#define TAG(x) CONCAT(x,_transparency) +#define LLTAG(x) LLBL2(x,_transparency) + +#define INIT \ + PXOR ( MM0, MM0 ) /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */ + +#define MAIN( rgba, dest ) \ + GMB_LOAD( rgba, dest, MM1, MM2 ) ;\ + GMB_UNPACK( MM1, MM2, MM4, MM5, MM0 ) ;\ + GMB_ALPHA( MM1, MM3, MM4, MM6 ) ;\ + GMB_LERP_GSC( MM1, MM2, MM3, MM4, MM5, MM6 ) ;\ + GMB_PACK( MM3, MM6 ) ;\ + GMB_STORE( rgba, MM3 ) + +#include "mmx_blendtmp.h" + + +/* Blend add function + * + * FIXME: Add some loop unrolling here... + */ + +#define TAG(x) CONCAT(x,_add) +#define LLTAG(x) LLBL2(x,_add) + +#define INIT + +#define MAIN( rgba, dest ) \ +ONE(MOVD ( REGIND(rgba), MM1 )) /* | | | | qa1 | qb1 | qg1 | qr1 */ ;\ +ONE(MOVD ( REGIND(dest), MM2 )) /* | | | | pa1 | pb1 | pg1 | pr1 */ ;\ +ONE(PADDUSB ( MM2, MM1 )) ;\ +ONE(MOVD ( MM1, REGIND(rgba) )) /* | | | | sa1 | sb1 | sg1 | sr1 */ ;\ + ;\ +TWO(MOVQ ( REGIND(rgba), MM1 )) /* qa2 | qb2 | qg2 | qr2 | qa1 | qb1 | qg1 | qr1 */ ;\ +TWO(PADDUSB ( REGIND(dest), MM1 )) /* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */ ;\ +TWO(MOVQ ( MM1, REGIND(rgba) )) + +#include "mmx_blendtmp.h" + + +/* Blend min function + */ + +#define TAG(x) CONCAT(x,_min) +#define LLTAG(x) LLBL2(x,_min) + +/* Kevin F. Quinn 2nd July 2006 + * Replace data segment constants with text-segment instructions +#define INIT \ + MOVQ ( CONTENT(const_80), MM7 ) + */ +#define INIT \ + PUSH_L ( CONST(const_80_h) ) /* 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80*/ ;\ + PUSH_L ( CONST(const_80_l) ) ;\ + MOVQ ( REGIND(ESP), MM7 ) ;\ + ADD_L ( CONST(8), ESP) + +#define MAIN( rgba, dest ) \ + GMB_LOAD( rgba, dest, MM1, MM2 ) ;\ + MOVQ ( MM1, MM3 ) ;\ + MOVQ ( MM2, MM4 ) ;\ + PXOR ( MM7, MM3 ) /* unsigned -> signed */ ;\ + PXOR ( MM7, MM4 ) /* unsigned -> signed */ ;\ + PCMPGTB ( MM3, MM4 ) /* q > p ? 0xff : 0x00 */ ;\ + PAND ( MM4, MM1 ) /* q > p ? p : 0 */ ;\ + PANDN ( MM2, MM4 ) /* q > p ? 0 : q */ ;\ + POR ( MM1, MM4 ) /* q > p ? p : q */ ;\ + GMB_STORE( rgba, MM4 ) + +#include "mmx_blendtmp.h" + + +/* Blend max function + */ + +#define TAG(x) CONCAT(x,_max) +#define LLTAG(x) LLBL2(x,_max) + +/* Kevin F. Quinn 2nd July 2006 + * Replace data segment constants with text-segment instructions +#define INIT \ + MOVQ ( CONTENT(const_80), MM7 ) + */ +#define INIT \ + PUSH_L ( CONST(const_80_l) ) /* 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80*/ ;\ + PUSH_L ( CONST(const_80_h) ) ;\ + MOVQ ( REGIND(ESP), MM7 ) ;\ + ADD_L ( CONST(8), ESP) + +#define MAIN( rgba, dest ) \ + GMB_LOAD( rgba, dest, MM1, MM2 ) ;\ + MOVQ ( MM1, MM3 ) ;\ + MOVQ ( MM2, MM4 ) ;\ + PXOR ( MM7, MM3 ) /* unsigned -> signed */ ;\ + PXOR ( MM7, MM4 ) /* unsigned -> signed */ ;\ + PCMPGTB ( MM3, MM4 ) /* q > p ? 0xff : 0x00 */ ;\ + PAND ( MM4, MM2 ) /* q > p ? q : 0 */ ;\ + PANDN ( MM1, MM4 ) /* q > p ? 0 : p */ ;\ + POR ( MM2, MM4 ) /* q > p ? p : q */ ;\ + GMB_STORE( rgba, MM4 ) + +#include "mmx_blendtmp.h" + + +/* Blend modulate function + */ + +#define TAG(x) CONCAT(x,_modulate) +#define LLTAG(x) LLBL2(x,_modulate) + +/* Kevin F. Quinn 2nd July 2006 + * Replace data segment constants with text-segment instructions +#define INIT \ + MOVQ ( CONTENT(const_0080), MM7 ) + */ +#define INIT \ + PXOR ( MM0, MM0 ) /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */ ;\ + PUSH_L ( CONST(const_0080_l) ) /* 0x0080 | 0x0080 | 0x0080 | 0x0080 */ ;\ + PUSH_L ( CONST(const_0080_h) ) ;\ + MOVQ ( REGIND(ESP), MM7 ) ;\ + ADD_L ( CONST(8), ESP) + +#define MAIN( rgba, dest ) \ + GMB_LOAD( rgba, dest, MM1, MM2 ) ;\ + GMB_UNPACK( MM1, MM2, MM4, MM5, MM0 ) ;\ + GMB_MULT_GSR( MM1, MM2, MM4, MM5, MM7 ) ;\ + GMB_PACK( MM2, MM5 ) ;\ + GMB_STORE( rgba, MM2 ) + +#include "mmx_blendtmp.h" + +#endif + +#if defined (__ELF__) && defined (__linux__) + .section .note.GNU-stack,"",%progbits +#endif diff --git a/mesalib/src/mesa/x86/mmx_blendtmp.h b/mesalib/src/mesa/x86/mmx_blendtmp.h index f282c0c21..8534792e2 100644 --- a/mesalib/src/mesa/x86/mmx_blendtmp.h +++ b/mesalib/src/mesa/x86/mmx_blendtmp.h @@ -1,114 +1,114 @@ -/*
- * Written by José Fonseca <j_r_fonseca@yahoo.co.uk>
- */
-
-
-/*
- * void _mesa_mmx_blend( struct gl_context *ctx,
- * GLuint n,
- * const GLubyte mask[],
- * GLchan rgba[][4],
- * CONST GLchan dest[][4] )
- *
- */
-ALIGNTEXT16
-GLOBL GLNAME( TAG(_mesa_mmx_blend) )
-HIDDEN( TAG(_mesa_mmx_blend) )
-GLNAME( TAG(_mesa_mmx_blend) ):
-
- PUSH_L ( EBP )
- MOV_L ( ESP, EBP )
- PUSH_L ( ESI )
- PUSH_L ( EDI )
- PUSH_L ( EBX )
-
- MOV_L ( REGOFF(12, EBP), ECX ) /* n */
- CMP_L ( CONST(0), ECX)
- JE ( LLTAG(GMB_return) )
-
- MOV_L ( REGOFF(16, EBP), EBX ) /* mask */
- MOV_L ( REGOFF(20, EBP), EDI ) /* rgba */
- MOV_L ( REGOFF(24, EBP), ESI ) /* dest */
-
- INIT
-
- TEST_L ( CONST(4), EDI ) /* align rgba on an 8-byte boundary */
- JZ ( LLTAG(GMB_align_end) )
-
- CMP_B ( CONST(0), REGIND(EBX) ) /* *mask == 0 */
- JE ( LLTAG(GMB_align_continue) )
-
- /* runin */
-#define ONE(x) x
-#define TWO(x)
- MAIN ( EDI, ESI )
-#undef ONE
-#undef TWO
-
-LLTAG(GMB_align_continue):
-
- DEC_L ( ECX ) /* n -= 1 */
- INC_L ( EBX ) /* mask += 1 */
- ADD_L ( CONST(4), EDI ) /* rgba += 1 */
- ADD_L ( CONST(4), ESI ) /* dest += 1 */
-
-LLTAG(GMB_align_end):
-
- CMP_L ( CONST(2), ECX)
- JB ( LLTAG(GMB_loop_end) )
-
-ALIGNTEXT16
-LLTAG(GMB_loop_begin):
-
- CMP_W ( CONST(0), REGIND(EBX) ) /* *mask == 0 && *(mask + 1) == 0 */
- JE ( LLTAG(GMB_loop_continue) )
-
- /* main loop */
-#define ONE(x)
-#define TWO(x) x
- MAIN ( EDI, ESI )
-#undef ONE
-#undef TWO
-
-LLTAG(GMB_loop_continue):
-
- DEC_L ( ECX )
- DEC_L ( ECX ) /* n -= 2 */
- ADD_L ( CONST(2), EBX ) /* mask += 2 */
- ADD_L ( CONST(8), EDI ) /* rgba += 2 */
- ADD_L ( CONST(8), ESI ) /* dest += 2 */
- CMP_L ( CONST(2), ECX )
- JAE ( LLTAG(GMB_loop_begin) )
-
-LLTAG(GMB_loop_end):
-
- CMP_L ( CONST(1), ECX )
- JB ( LLTAG(GMB_done) )
-
- CMP_B ( CONST(0), REGIND(EBX) ) /* *mask == 0 */
- JE ( LLTAG(GMB_done) )
-
- /* runout */
-#define ONE(x) x
-#define TWO(x)
- MAIN ( EDI, ESI )
-#undef ONE
-#undef TWO
-
-LLTAG(GMB_done):
-
- EMMS
-
-LLTAG(GMB_return):
-
- POP_L ( EBX )
- POP_L ( EDI )
- POP_L ( ESI )
- MOV_L ( EBP, ESP )
- POP_L ( EBP )
- RET
-
-#undef TAG
-#undef LLTAG
-#undef INIT
-#undef MAIN
+/* + * Written by José Fonseca <j_r_fonseca@yahoo.co.uk> + */ + + +/* + * void _mesa_mmx_blend( struct gl_context *ctx, + * GLuint n, + * const GLubyte mask[], + * GLchan rgba[][4], + * CONST GLchan dest[][4] ) + * + */ +ALIGNTEXT16 +GLOBL GLNAME( TAG(_mesa_mmx_blend) ) +HIDDEN( TAG(_mesa_mmx_blend) ) +GLNAME( TAG(_mesa_mmx_blend) ): + + PUSH_L ( EBP ) + MOV_L ( ESP, EBP ) + PUSH_L ( ESI ) + PUSH_L ( EDI ) + PUSH_L ( EBX ) + + MOV_L ( REGOFF(12, EBP), ECX ) /* n */ + CMP_L ( CONST(0), ECX) + JE ( LLTAG(GMB_return) ) + + MOV_L ( REGOFF(16, EBP), EBX ) /* mask */ + MOV_L ( REGOFF(20, EBP), EDI ) /* rgba */ + MOV_L ( REGOFF(24, EBP), ESI ) /* dest */ + + INIT + + TEST_L ( CONST(4), EDI ) /* align rgba on an 8-byte boundary */ + JZ ( LLTAG(GMB_align_end) ) + + CMP_B ( CONST(0), REGIND(EBX) ) /* *mask == 0 */ + JE ( LLTAG(GMB_align_continue) ) + + /* runin */ +#define ONE(x) x +#define TWO(x) + MAIN ( EDI, ESI ) +#undef ONE +#undef TWO + +LLTAG(GMB_align_continue): + + DEC_L ( ECX ) /* n -= 1 */ + INC_L ( EBX ) /* mask += 1 */ + ADD_L ( CONST(4), EDI ) /* rgba += 1 */ + ADD_L ( CONST(4), ESI ) /* dest += 1 */ + +LLTAG(GMB_align_end): + + CMP_L ( CONST(2), ECX) + JB ( LLTAG(GMB_loop_end) ) + +ALIGNTEXT16 +LLTAG(GMB_loop_begin): + + CMP_W ( CONST(0), REGIND(EBX) ) /* *mask == 0 && *(mask + 1) == 0 */ + JE ( LLTAG(GMB_loop_continue) ) + + /* main loop */ +#define ONE(x) +#define TWO(x) x + MAIN ( EDI, ESI ) +#undef ONE +#undef TWO + +LLTAG(GMB_loop_continue): + + DEC_L ( ECX ) + DEC_L ( ECX ) /* n -= 2 */ + ADD_L ( CONST(2), EBX ) /* mask += 2 */ + ADD_L ( CONST(8), EDI ) /* rgba += 2 */ + ADD_L ( CONST(8), ESI ) /* dest += 2 */ + CMP_L ( CONST(2), ECX ) + JAE ( LLTAG(GMB_loop_begin) ) + +LLTAG(GMB_loop_end): + + CMP_L ( CONST(1), ECX ) + JB ( LLTAG(GMB_done) ) + + CMP_B ( CONST(0), REGIND(EBX) ) /* *mask == 0 */ + JE ( LLTAG(GMB_done) ) + + /* runout */ +#define ONE(x) x +#define TWO(x) + MAIN ( EDI, ESI ) +#undef ONE +#undef TWO + +LLTAG(GMB_done): + + EMMS + +LLTAG(GMB_return): + + POP_L ( EBX ) + POP_L ( EDI ) + POP_L ( ESI ) + MOV_L ( EBP, ESP ) + POP_L ( EBP ) + RET + +#undef TAG +#undef LLTAG +#undef INIT +#undef MAIN diff --git a/mesalib/src/mesa/x86/norm_args.h b/mesalib/src/mesa/x86/norm_args.h index bad1a1975..5d352838b 100644 --- a/mesalib/src/mesa/x86/norm_args.h +++ b/mesalib/src/mesa/x86/norm_args.h @@ -1,57 +1,57 @@ -
-/*
- * Mesa 3-D graphics library
- * Version: 3.5
- *
- * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * Normal transform function interface for assembly code. Simply define
- * FRAME_OFFSET to the number of bytes pushed onto the stack before
- * using the ARG_* argument macros.
- *
- * Gareth Hughes
- */
-
-#ifndef __NORM_ARGS_H__
-#define __NORM_ARGS_H__
-
-/* Offsets for normal_func arguments
- *
- * typedef void (*normal_func)( CONST GLmatrix *mat,
- * GLfloat scale,
- * CONST GLvector4f *in,
- * CONST GLfloat lengths[],
- * GLvector4f *dest );
- */
-#define OFFSET_MAT 4
-#define OFFSET_SCALE 8
-#define OFFSET_IN 12
-#define OFFSET_LENGTHS 16
-#define OFFSET_DEST 20
-
-#define ARG_MAT REGOFF(FRAME_OFFSET+OFFSET_MAT, ESP)
-#define ARG_SCALE REGOFF(FRAME_OFFSET+OFFSET_SCALE, ESP)
-#define ARG_IN REGOFF(FRAME_OFFSET+OFFSET_IN, ESP)
-#define ARG_LENGTHS REGOFF(FRAME_OFFSET+OFFSET_LENGTHS, ESP)
-#define ARG_DEST REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP)
-
-#endif
+ +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * Normal transform function interface for assembly code. Simply define + * FRAME_OFFSET to the number of bytes pushed onto the stack before + * using the ARG_* argument macros. + * + * Gareth Hughes + */ + +#ifndef __NORM_ARGS_H__ +#define __NORM_ARGS_H__ + +/* Offsets for normal_func arguments + * + * typedef void (*normal_func)( CONST GLmatrix *mat, + * GLfloat scale, + * CONST GLvector4f *in, + * CONST GLfloat lengths[], + * GLvector4f *dest ); + */ +#define OFFSET_MAT 4 +#define OFFSET_SCALE 8 +#define OFFSET_IN 12 +#define OFFSET_LENGTHS 16 +#define OFFSET_DEST 20 + +#define ARG_MAT REGOFF(FRAME_OFFSET+OFFSET_MAT, ESP) +#define ARG_SCALE REGOFF(FRAME_OFFSET+OFFSET_SCALE, ESP) +#define ARG_IN REGOFF(FRAME_OFFSET+OFFSET_IN, ESP) +#define ARG_LENGTHS REGOFF(FRAME_OFFSET+OFFSET_LENGTHS, ESP) +#define ARG_DEST REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP) + +#endif diff --git a/mesalib/src/mesa/x86/read_rgba_span_x86.S b/mesalib/src/mesa/x86/read_rgba_span_x86.S index 04571afb7..3be4515b1 100644 --- a/mesalib/src/mesa/x86/read_rgba_span_x86.S +++ b/mesalib/src/mesa/x86/read_rgba_span_x86.S @@ -1,686 +1,686 @@ -/*
- * (C) Copyright IBM Corporation 2004
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * \file read_rgba_span_x86.S
- * Optimized routines to transfer pixel data from the framebuffer to a
- * buffer in main memory.
- *
- * \author Ian Romanick <idr@us.ibm.com>
- */
-
- .file "read_rgba_span_x86.S"
-#if !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
-/* Kevin F. Quinn 2nd July 2006
- * Replaced data segment constants with text-segment instructions.
- */
-#define LOAD_MASK(mvins,m1,m2) \
- pushl $0xff00ff00 ;\
- pushl $0xff00ff00 ;\
- pushl $0xff00ff00 ;\
- pushl $0xff00ff00 ;\
- mvins (%esp), m1 ;\
- pushl $0x00ff0000 ;\
- pushl $0x00ff0000 ;\
- pushl $0x00ff0000 ;\
- pushl $0x00ff0000 ;\
- mvins (%esp), m2 ;\
- addl $32, %esp
-
-/* I implemented these as macros because they appear in several places,
- * and I've tweaked them a number of times. I got tired of changing every
- * place they appear. :)
- */
-
-#define DO_ONE_PIXEL() \
- movl (%ebx), %eax ; \
- addl $4, %ebx ; \
- bswap %eax /* ARGB -> BGRA */ ; \
- rorl $8, %eax /* BGRA -> ABGR */ ; \
- movl %eax, (%ecx) /* ABGR -> R, G, B, A */ ; \
- addl $4, %ecx
-
-#define DO_ONE_LAST_PIXEL() \
- movl (%ebx), %eax ; \
- bswap %eax /* ARGB -> BGRA */ ; \
- rorl $8, %eax /* BGRA -> ABGR */ ; \
- movl %eax, (%ecx) /* ABGR -> R, G, B, A */ ; \
-
-
-/**
- * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
- *
- * \warning
- * This function assumes that the caller will issue the EMMS instruction
- * at the correct places.
- */
-
-.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
-#ifndef USE_DRICORE
-.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
-#endif
- .type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
-_generic_read_RGBA_span_BGRA8888_REV_MMX:
- pushl %ebx
-
-#ifdef USE_INNER_EMMS
- emms
-#endif
- LOAD_MASK(movq,%mm1,%mm2)
-
- movl 8(%esp), %ebx /* source pointer */
- movl 16(%esp), %edx /* number of pixels to copy */
- movl 12(%esp), %ecx /* destination pointer */
-
- testl %edx, %edx
- jle .L20 /* Bail if there's nothing to do. */
-
- movl %ebx, %eax
-
- negl %eax
- sarl $2, %eax
- andl $1, %eax
- je .L17
-
- subl %eax, %edx
- DO_ONE_PIXEL()
-.L17:
-
- /* Would it be faster to unroll this loop once and process 4 pixels
- * per pass, instead of just two?
- */
-
- movl %edx, %eax
- shrl %eax
- jmp .L18
-.L19:
- movq (%ebx), %mm0
- addl $8, %ebx
-
- /* These 9 instructions do what PSHUFB (if there were such an
- * instruction) could do in 1. :(
- */
-
- movq %mm0, %mm3
- movq %mm0, %mm4
-
- pand %mm2, %mm3
- psllq $16, %mm4
- psrlq $16, %mm3
- pand %mm2, %mm4
-
- pand %mm1, %mm0
- por %mm4, %mm3
- por %mm3, %mm0
-
- movq %mm0, (%ecx)
- addl $8, %ecx
- subl $1, %eax
-.L18:
- jne .L19
-
-#ifdef USE_INNER_EMMS
- emms
-#endif
-
- /* At this point there are either 1 or 0 pixels remaining to be
- * converted. Convert the last pixel, if needed.
- */
-
- testl $1, %edx
- je .L20
-
- DO_ONE_LAST_PIXEL()
-
-.L20:
- popl %ebx
- ret
- .size _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX
-
-
-/**
- * SSE optimized version of the BGRA8888_REV to RGBA copy routine. SSE
- * instructions are only actually used to read data from the framebuffer.
- * In practice, the speed-up is pretty small.
- *
- * \todo
- * Do some more testing and determine if there's any reason to have this
- * function in addition to the MMX version.
- *
- * \warning
- * This function assumes that the caller will issue the EMMS instruction
- * at the correct places.
- */
-
-.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
-#ifndef USE_DRICORE
-.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
-#endif
- .type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
-_generic_read_RGBA_span_BGRA8888_REV_SSE:
- pushl %esi
- pushl %ebx
- pushl %ebp
-
-#ifdef USE_INNER_EMMS
- emms
-#endif
-
- LOAD_MASK(movq,%mm1,%mm2)
-
- movl 16(%esp), %ebx /* source pointer */
- movl 24(%esp), %edx /* number of pixels to copy */
- movl 20(%esp), %ecx /* destination pointer */
-
- testl %edx, %edx
- jle .L35 /* Bail if there's nothing to do. */
-
- movl %esp, %ebp
- subl $16, %esp
- andl $0xfffffff0, %esp
-
- movl %ebx, %eax
- movl %edx, %esi
-
- negl %eax
- andl $15, %eax
- sarl $2, %eax
- cmpl %edx, %eax
- cmovle %eax, %esi
-
- subl %esi, %edx
-
- testl $1, %esi
- je .L32
-
- DO_ONE_PIXEL()
-.L32:
-
- testl $2, %esi
- je .L31
-
- movq (%ebx), %mm0
- addl $8, %ebx
-
- movq %mm0, %mm3
- movq %mm0, %mm4
-
- pand %mm2, %mm3
- psllq $16, %mm4
- psrlq $16, %mm3
- pand %mm2, %mm4
-
- pand %mm1, %mm0
- por %mm4, %mm3
- por %mm3, %mm0
-
- movq %mm0, (%ecx)
- addl $8, %ecx
-.L31:
-
- movl %edx, %eax
- shrl $2, %eax
- jmp .L33
-.L34:
- movaps (%ebx), %xmm0
- addl $16, %ebx
-
- /* This would be so much better if we could just move directly from
- * an SSE register to an MMX register. Unfortunately, that
- * functionality wasn't introduced until SSE2 with the MOVDQ2Q
- * instruction.
- */
-
- movaps %xmm0, (%esp)
- movq (%esp), %mm0
- movq 8(%esp), %mm5
-
- movq %mm0, %mm3
- movq %mm0, %mm4
- movq %mm5, %mm6
- movq %mm5, %mm7
-
- pand %mm2, %mm3
- pand %mm2, %mm6
-
- psllq $16, %mm4
- psllq $16, %mm7
-
- psrlq $16, %mm3
- psrlq $16, %mm6
-
- pand %mm2, %mm4
- pand %mm2, %mm7
-
- pand %mm1, %mm0
- pand %mm1, %mm5
-
- por %mm4, %mm3
- por %mm7, %mm6
-
- por %mm3, %mm0
- por %mm6, %mm5
-
- movq %mm0, (%ecx)
- movq %mm5, 8(%ecx)
- addl $16, %ecx
-
- subl $1, %eax
-.L33:
- jne .L34
-
-#ifdef USE_INNER_EMMS
- emms
-#endif
- movl %ebp, %esp
-
- /* At this point there are either [0, 3] pixels remaining to be
- * converted.
- */
-
- testl $2, %edx
- je .L36
-
- movq (%ebx), %mm0
- addl $8, %ebx
-
- movq %mm0, %mm3
- movq %mm0, %mm4
-
- pand %mm2, %mm3
- psllq $16, %mm4
- psrlq $16, %mm3
- pand %mm2, %mm4
-
- pand %mm1, %mm0
- por %mm4, %mm3
- por %mm3, %mm0
-
- movq %mm0, (%ecx)
- addl $8, %ecx
-.L36:
-
- testl $1, %edx
- je .L35
-
- DO_ONE_LAST_PIXEL()
-.L35:
- popl %ebp
- popl %ebx
- popl %esi
- ret
- .size _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE
-
-
-/**
- * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
- */
-
- .text
-.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
-#ifndef USE_DRICORE
-.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
-#endif
- .type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
-_generic_read_RGBA_span_BGRA8888_REV_SSE2:
- pushl %esi
- pushl %ebx
-
- LOAD_MASK(movdqu,%xmm1,%xmm2)
-
- movl 12(%esp), %ebx /* source pointer */
- movl 20(%esp), %edx /* number of pixels to copy */
- movl 16(%esp), %ecx /* destination pointer */
-
- movl %ebx, %eax
- movl %edx, %esi
-
- testl %edx, %edx
- jle .L46 /* Bail if there's nothing to do. */
-
- /* If the source pointer isn't a multiple of 16 we have to process
- * a few pixels the "slow" way to get the address aligned for
- * the SSE fetch intsructions.
- */
-
- negl %eax
- andl $15, %eax
- sarl $2, %eax
-
- cmpl %edx, %eax
- cmovbe %eax, %esi
- subl %esi, %edx
-
- testl $1, %esi
- je .L41
-
- DO_ONE_PIXEL()
-.L41:
- testl $2, %esi
- je .L40
-
- movq (%ebx), %xmm0
- addl $8, %ebx
-
- movdqa %xmm0, %xmm3
- movdqa %xmm0, %xmm4
- andps %xmm1, %xmm0
-
- andps %xmm2, %xmm3
- pslldq $2, %xmm4
- psrldq $2, %xmm3
- andps %xmm2, %xmm4
-
- orps %xmm4, %xmm3
- orps %xmm3, %xmm0
-
- movq %xmm0, (%ecx)
- addl $8, %ecx
-.L40:
-
- /* Would it be worth having a specialized version of this loop for
- * the case where the destination is 16-byte aligned? That version
- * would be identical except that it could use movedqa instead of
- * movdqu.
- */
-
- movl %edx, %eax
- shrl $2, %eax
- jmp .L42
-.L43:
- movdqa (%ebx), %xmm0
- addl $16, %ebx
-
- movdqa %xmm0, %xmm3
- movdqa %xmm0, %xmm4
- andps %xmm1, %xmm0
-
- andps %xmm2, %xmm3
- pslldq $2, %xmm4
- psrldq $2, %xmm3
- andps %xmm2, %xmm4
-
- orps %xmm4, %xmm3
- orps %xmm3, %xmm0
-
- movdqu %xmm0, (%ecx)
- addl $16, %ecx
- subl $1, %eax
-.L42:
- jne .L43
-
-
- /* There may be upto 3 pixels remaining to be copied. Take care
- * of them now. We do the 2 pixel case first because the data
- * will be aligned.
- */
-
- testl $2, %edx
- je .L47
-
- movq (%ebx), %xmm0
- addl $8, %ebx
-
- movdqa %xmm0, %xmm3
- movdqa %xmm0, %xmm4
- andps %xmm1, %xmm0
-
- andps %xmm2, %xmm3
- pslldq $2, %xmm4
- psrldq $2, %xmm3
- andps %xmm2, %xmm4
-
- orps %xmm4, %xmm3
- orps %xmm3, %xmm0
-
- movq %xmm0, (%ecx)
- addl $8, %ecx
-.L47:
-
- testl $1, %edx
- je .L46
-
- DO_ONE_LAST_PIXEL()
-.L46:
-
- popl %ebx
- popl %esi
- ret
- .size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2
-
-
-
-#define MASK_565_L 0x07e0f800
-#define MASK_565_H 0x0000001f
-/* Setting SCALE_ADJUST to 5 gives a perfect match with the
- * classic C implementation in Mesa. Setting SCALE_ADJUST
- * to 0 is slightly faster but at a small cost to accuracy.
- */
-#define SCALE_ADJUST 5
-#if SCALE_ADJUST == 5
-#define PRESCALE_L 0x00100001
-#define PRESCALE_H 0x00000200
-#define SCALE_L 0x40C620E8
-#define SCALE_H 0x0000839d
-#elif SCALE_ADJUST == 0
-#define PRESCALE_L 0x00200001
-#define PRESCALE_H 0x00000800
-#define SCALE_L 0x01040108
-#define SCALE_H 0x00000108
-#else
-#error SCALE_ADJUST must either be 5 or 0.
-#endif
-#define ALPHA_L 0x00000000
-#define ALPHA_H 0x00ff0000
-
-/**
- * MMX optimized version of the RGB565 to RGBA copy routine.
- */
-
- .text
- .globl _generic_read_RGBA_span_RGB565_MMX
-#ifndef USE_DRICORE
- .hidden _generic_read_RGBA_span_RGB565_MMX
-#endif
- .type _generic_read_RGBA_span_RGB565_MMX, @function
-
-_generic_read_RGBA_span_RGB565_MMX:
-
-#ifdef USE_INNER_EMMS
- emms
-#endif
-
- movl 4(%esp), %eax /* source pointer */
- movl 8(%esp), %edx /* destination pointer */
- movl 12(%esp), %ecx /* number of pixels to copy */
-
- pushl $MASK_565_H
- pushl $MASK_565_L
- movq (%esp), %mm5
- pushl $PRESCALE_H
- pushl $PRESCALE_L
- movq (%esp), %mm6
- pushl $SCALE_H
- pushl $SCALE_L
- movq (%esp), %mm7
- pushl $ALPHA_H
- pushl $ALPHA_L
- movq (%esp), %mm3
- addl $32,%esp
-
- sarl $2, %ecx
- jl .L01 /* Bail early if the count is negative. */
- jmp .L02
-
-.L03:
- /* Fetch 4 RGB565 pixels into %mm4. Distribute the first and
- * second pixels into the four words of %mm0 and %mm2.
- */
-
- movq (%eax), %mm4
- addl $8, %eax
-
- pshufw $0x00, %mm4, %mm0
- pshufw $0x55, %mm4, %mm2
-
-
- /* Mask the pixels so that each word of each register contains only
- * one color component.
- */
-
- pand %mm5, %mm0
- pand %mm5, %mm2
-
-
- /* Adjust the component values so that they are as small as possible,
- * but large enough so that we can multiply them by an unsigned 16-bit
- * number and get a value as large as 0x00ff0000.
- */
-
- pmullw %mm6, %mm0
- pmullw %mm6, %mm2
-#if SCALE_ADJUST > 0
- psrlw $SCALE_ADJUST, %mm0
- psrlw $SCALE_ADJUST, %mm2
-#endif
-
- /* Scale the input component values to be on the range
- * [0, 0x00ff0000]. This it the real magic of the whole routine.
- */
-
- pmulhuw %mm7, %mm0
- pmulhuw %mm7, %mm2
-
-
- /* Always set the alpha value to 0xff.
- */
-
- por %mm3, %mm0
- por %mm3, %mm2
-
-
- /* Pack the 16-bit values to 8-bit values and store the converted
- * pixel data.
- */
-
- packuswb %mm2, %mm0
- movq %mm0, (%edx)
- addl $8, %edx
-
- pshufw $0xaa, %mm4, %mm0
- pshufw $0xff, %mm4, %mm2
-
- pand %mm5, %mm0
- pand %mm5, %mm2
- pmullw %mm6, %mm0
- pmullw %mm6, %mm2
-#if SCALE_ADJUST > 0
- psrlw $SCALE_ADJUST, %mm0
- psrlw $SCALE_ADJUST, %mm2
-#endif
- pmulhuw %mm7, %mm0
- pmulhuw %mm7, %mm2
-
- por %mm3, %mm0
- por %mm3, %mm2
-
- packuswb %mm2, %mm0
-
- movq %mm0, (%edx)
- addl $8, %edx
-
- subl $1, %ecx
-.L02:
- jne .L03
-
-
- /* At this point there can be at most 3 pixels left to process. If
- * there is either 2 or 3 left, process 2.
- */
-
- movl 12(%esp), %ecx
- testl $0x02, %ecx
- je .L04
-
- movd (%eax), %mm4
- addl $4, %eax
-
- pshufw $0x00, %mm4, %mm0
- pshufw $0x55, %mm4, %mm2
-
- pand %mm5, %mm0
- pand %mm5, %mm2
- pmullw %mm6, %mm0
- pmullw %mm6, %mm2
-#if SCALE_ADJUST > 0
- psrlw $SCALE_ADJUST, %mm0
- psrlw $SCALE_ADJUST, %mm2
-#endif
- pmulhuw %mm7, %mm0
- pmulhuw %mm7, %mm2
-
- por %mm3, %mm0
- por %mm3, %mm2
-
- packuswb %mm2, %mm0
-
- movq %mm0, (%edx)
- addl $8, %edx
-
-.L04:
- /* At this point there can be at most 1 pixel left to process.
- * Process it if needed.
- */
-
- testl $0x01, %ecx
- je .L01
-
- movzwl (%eax), %ecx
- movd %ecx, %mm4
-
- pshufw $0x00, %mm4, %mm0
-
- pand %mm5, %mm0
- pmullw %mm6, %mm0
-#if SCALE_ADJUST > 0
- psrlw $SCALE_ADJUST, %mm0
-#endif
- pmulhuw %mm7, %mm0
-
- por %mm3, %mm0
-
- packuswb %mm0, %mm0
-
- movd %mm0, (%edx)
-
-.L01:
-#ifdef USE_INNER_EMMS
- emms
-#endif
- ret
-#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) */
-
-#if defined (__ELF__) && defined (__linux__)
- .section .note.GNU-stack,"",%progbits
-#endif
+/* + * (C) Copyright IBM Corporation 2004 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * \file read_rgba_span_x86.S + * Optimized routines to transfer pixel data from the framebuffer to a + * buffer in main memory. + * + * \author Ian Romanick <idr@us.ibm.com> + */ + + .file "read_rgba_span_x86.S" +#if !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */ +/* Kevin F. Quinn 2nd July 2006 + * Replaced data segment constants with text-segment instructions. + */ +#define LOAD_MASK(mvins,m1,m2) \ + pushl $0xff00ff00 ;\ + pushl $0xff00ff00 ;\ + pushl $0xff00ff00 ;\ + pushl $0xff00ff00 ;\ + mvins (%esp), m1 ;\ + pushl $0x00ff0000 ;\ + pushl $0x00ff0000 ;\ + pushl $0x00ff0000 ;\ + pushl $0x00ff0000 ;\ + mvins (%esp), m2 ;\ + addl $32, %esp + +/* I implemented these as macros because they appear in several places, + * and I've tweaked them a number of times. I got tired of changing every + * place they appear. :) + */ + +#define DO_ONE_PIXEL() \ + movl (%ebx), %eax ; \ + addl $4, %ebx ; \ + bswap %eax /* ARGB -> BGRA */ ; \ + rorl $8, %eax /* BGRA -> ABGR */ ; \ + movl %eax, (%ecx) /* ABGR -> R, G, B, A */ ; \ + addl $4, %ecx + +#define DO_ONE_LAST_PIXEL() \ + movl (%ebx), %eax ; \ + bswap %eax /* ARGB -> BGRA */ ; \ + rorl $8, %eax /* BGRA -> ABGR */ ; \ + movl %eax, (%ecx) /* ABGR -> R, G, B, A */ ; \ + + +/** + * MMX optimized version of the BGRA8888_REV to RGBA copy routine. + * + * \warning + * This function assumes that the caller will issue the EMMS instruction + * at the correct places. + */ + +.globl _generic_read_RGBA_span_BGRA8888_REV_MMX +#ifndef USE_DRICORE +.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX +#endif + .type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function +_generic_read_RGBA_span_BGRA8888_REV_MMX: + pushl %ebx + +#ifdef USE_INNER_EMMS + emms +#endif + LOAD_MASK(movq,%mm1,%mm2) + + movl 8(%esp), %ebx /* source pointer */ + movl 16(%esp), %edx /* number of pixels to copy */ + movl 12(%esp), %ecx /* destination pointer */ + + testl %edx, %edx + jle .L20 /* Bail if there's nothing to do. */ + + movl %ebx, %eax + + negl %eax + sarl $2, %eax + andl $1, %eax + je .L17 + + subl %eax, %edx + DO_ONE_PIXEL() +.L17: + + /* Would it be faster to unroll this loop once and process 4 pixels + * per pass, instead of just two? + */ + + movl %edx, %eax + shrl %eax + jmp .L18 +.L19: + movq (%ebx), %mm0 + addl $8, %ebx + + /* These 9 instructions do what PSHUFB (if there were such an + * instruction) could do in 1. :( + */ + + movq %mm0, %mm3 + movq %mm0, %mm4 + + pand %mm2, %mm3 + psllq $16, %mm4 + psrlq $16, %mm3 + pand %mm2, %mm4 + + pand %mm1, %mm0 + por %mm4, %mm3 + por %mm3, %mm0 + + movq %mm0, (%ecx) + addl $8, %ecx + subl $1, %eax +.L18: + jne .L19 + +#ifdef USE_INNER_EMMS + emms +#endif + + /* At this point there are either 1 or 0 pixels remaining to be + * converted. Convert the last pixel, if needed. + */ + + testl $1, %edx + je .L20 + + DO_ONE_LAST_PIXEL() + +.L20: + popl %ebx + ret + .size _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX + + +/** + * SSE optimized version of the BGRA8888_REV to RGBA copy routine. SSE + * instructions are only actually used to read data from the framebuffer. + * In practice, the speed-up is pretty small. + * + * \todo + * Do some more testing and determine if there's any reason to have this + * function in addition to the MMX version. + * + * \warning + * This function assumes that the caller will issue the EMMS instruction + * at the correct places. + */ + +.globl _generic_read_RGBA_span_BGRA8888_REV_SSE +#ifndef USE_DRICORE +.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE +#endif + .type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function +_generic_read_RGBA_span_BGRA8888_REV_SSE: + pushl %esi + pushl %ebx + pushl %ebp + +#ifdef USE_INNER_EMMS + emms +#endif + + LOAD_MASK(movq,%mm1,%mm2) + + movl 16(%esp), %ebx /* source pointer */ + movl 24(%esp), %edx /* number of pixels to copy */ + movl 20(%esp), %ecx /* destination pointer */ + + testl %edx, %edx + jle .L35 /* Bail if there's nothing to do. */ + + movl %esp, %ebp + subl $16, %esp + andl $0xfffffff0, %esp + + movl %ebx, %eax + movl %edx, %esi + + negl %eax + andl $15, %eax + sarl $2, %eax + cmpl %edx, %eax + cmovle %eax, %esi + + subl %esi, %edx + + testl $1, %esi + je .L32 + + DO_ONE_PIXEL() +.L32: + + testl $2, %esi + je .L31 + + movq (%ebx), %mm0 + addl $8, %ebx + + movq %mm0, %mm3 + movq %mm0, %mm4 + + pand %mm2, %mm3 + psllq $16, %mm4 + psrlq $16, %mm3 + pand %mm2, %mm4 + + pand %mm1, %mm0 + por %mm4, %mm3 + por %mm3, %mm0 + + movq %mm0, (%ecx) + addl $8, %ecx +.L31: + + movl %edx, %eax + shrl $2, %eax + jmp .L33 +.L34: + movaps (%ebx), %xmm0 + addl $16, %ebx + + /* This would be so much better if we could just move directly from + * an SSE register to an MMX register. Unfortunately, that + * functionality wasn't introduced until SSE2 with the MOVDQ2Q + * instruction. + */ + + movaps %xmm0, (%esp) + movq (%esp), %mm0 + movq 8(%esp), %mm5 + + movq %mm0, %mm3 + movq %mm0, %mm4 + movq %mm5, %mm6 + movq %mm5, %mm7 + + pand %mm2, %mm3 + pand %mm2, %mm6 + + psllq $16, %mm4 + psllq $16, %mm7 + + psrlq $16, %mm3 + psrlq $16, %mm6 + + pand %mm2, %mm4 + pand %mm2, %mm7 + + pand %mm1, %mm0 + pand %mm1, %mm5 + + por %mm4, %mm3 + por %mm7, %mm6 + + por %mm3, %mm0 + por %mm6, %mm5 + + movq %mm0, (%ecx) + movq %mm5, 8(%ecx) + addl $16, %ecx + + subl $1, %eax +.L33: + jne .L34 + +#ifdef USE_INNER_EMMS + emms +#endif + movl %ebp, %esp + + /* At this point there are either [0, 3] pixels remaining to be + * converted. + */ + + testl $2, %edx + je .L36 + + movq (%ebx), %mm0 + addl $8, %ebx + + movq %mm0, %mm3 + movq %mm0, %mm4 + + pand %mm2, %mm3 + psllq $16, %mm4 + psrlq $16, %mm3 + pand %mm2, %mm4 + + pand %mm1, %mm0 + por %mm4, %mm3 + por %mm3, %mm0 + + movq %mm0, (%ecx) + addl $8, %ecx +.L36: + + testl $1, %edx + je .L35 + + DO_ONE_LAST_PIXEL() +.L35: + popl %ebp + popl %ebx + popl %esi + ret + .size _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE + + +/** + * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine. + */ + + .text +.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2 +#ifndef USE_DRICORE +.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2 +#endif + .type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function +_generic_read_RGBA_span_BGRA8888_REV_SSE2: + pushl %esi + pushl %ebx + + LOAD_MASK(movdqu,%xmm1,%xmm2) + + movl 12(%esp), %ebx /* source pointer */ + movl 20(%esp), %edx /* number of pixels to copy */ + movl 16(%esp), %ecx /* destination pointer */ + + movl %ebx, %eax + movl %edx, %esi + + testl %edx, %edx + jle .L46 /* Bail if there's nothing to do. */ + + /* If the source pointer isn't a multiple of 16 we have to process + * a few pixels the "slow" way to get the address aligned for + * the SSE fetch intsructions. + */ + + negl %eax + andl $15, %eax + sarl $2, %eax + + cmpl %edx, %eax + cmovbe %eax, %esi + subl %esi, %edx + + testl $1, %esi + je .L41 + + DO_ONE_PIXEL() +.L41: + testl $2, %esi + je .L40 + + movq (%ebx), %xmm0 + addl $8, %ebx + + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm4 + andps %xmm1, %xmm0 + + andps %xmm2, %xmm3 + pslldq $2, %xmm4 + psrldq $2, %xmm3 + andps %xmm2, %xmm4 + + orps %xmm4, %xmm3 + orps %xmm3, %xmm0 + + movq %xmm0, (%ecx) + addl $8, %ecx +.L40: + + /* Would it be worth having a specialized version of this loop for + * the case where the destination is 16-byte aligned? That version + * would be identical except that it could use movedqa instead of + * movdqu. + */ + + movl %edx, %eax + shrl $2, %eax + jmp .L42 +.L43: + movdqa (%ebx), %xmm0 + addl $16, %ebx + + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm4 + andps %xmm1, %xmm0 + + andps %xmm2, %xmm3 + pslldq $2, %xmm4 + psrldq $2, %xmm3 + andps %xmm2, %xmm4 + + orps %xmm4, %xmm3 + orps %xmm3, %xmm0 + + movdqu %xmm0, (%ecx) + addl $16, %ecx + subl $1, %eax +.L42: + jne .L43 + + + /* There may be upto 3 pixels remaining to be copied. Take care + * of them now. We do the 2 pixel case first because the data + * will be aligned. + */ + + testl $2, %edx + je .L47 + + movq (%ebx), %xmm0 + addl $8, %ebx + + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm4 + andps %xmm1, %xmm0 + + andps %xmm2, %xmm3 + pslldq $2, %xmm4 + psrldq $2, %xmm3 + andps %xmm2, %xmm4 + + orps %xmm4, %xmm3 + orps %xmm3, %xmm0 + + movq %xmm0, (%ecx) + addl $8, %ecx +.L47: + + testl $1, %edx + je .L46 + + DO_ONE_LAST_PIXEL() +.L46: + + popl %ebx + popl %esi + ret + .size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2 + + + +#define MASK_565_L 0x07e0f800 +#define MASK_565_H 0x0000001f +/* Setting SCALE_ADJUST to 5 gives a perfect match with the + * classic C implementation in Mesa. Setting SCALE_ADJUST + * to 0 is slightly faster but at a small cost to accuracy. + */ +#define SCALE_ADJUST 5 +#if SCALE_ADJUST == 5 +#define PRESCALE_L 0x00100001 +#define PRESCALE_H 0x00000200 +#define SCALE_L 0x40C620E8 +#define SCALE_H 0x0000839d +#elif SCALE_ADJUST == 0 +#define PRESCALE_L 0x00200001 +#define PRESCALE_H 0x00000800 +#define SCALE_L 0x01040108 +#define SCALE_H 0x00000108 +#else +#error SCALE_ADJUST must either be 5 or 0. +#endif +#define ALPHA_L 0x00000000 +#define ALPHA_H 0x00ff0000 + +/** + * MMX optimized version of the RGB565 to RGBA copy routine. + */ + + .text + .globl _generic_read_RGBA_span_RGB565_MMX +#ifndef USE_DRICORE + .hidden _generic_read_RGBA_span_RGB565_MMX +#endif + .type _generic_read_RGBA_span_RGB565_MMX, @function + +_generic_read_RGBA_span_RGB565_MMX: + +#ifdef USE_INNER_EMMS + emms +#endif + + movl 4(%esp), %eax /* source pointer */ + movl 8(%esp), %edx /* destination pointer */ + movl 12(%esp), %ecx /* number of pixels to copy */ + + pushl $MASK_565_H + pushl $MASK_565_L + movq (%esp), %mm5 + pushl $PRESCALE_H + pushl $PRESCALE_L + movq (%esp), %mm6 + pushl $SCALE_H + pushl $SCALE_L + movq (%esp), %mm7 + pushl $ALPHA_H + pushl $ALPHA_L + movq (%esp), %mm3 + addl $32,%esp + + sarl $2, %ecx + jl .L01 /* Bail early if the count is negative. */ + jmp .L02 + +.L03: + /* Fetch 4 RGB565 pixels into %mm4. Distribute the first and + * second pixels into the four words of %mm0 and %mm2. + */ + + movq (%eax), %mm4 + addl $8, %eax + + pshufw $0x00, %mm4, %mm0 + pshufw $0x55, %mm4, %mm2 + + + /* Mask the pixels so that each word of each register contains only + * one color component. + */ + + pand %mm5, %mm0 + pand %mm5, %mm2 + + + /* Adjust the component values so that they are as small as possible, + * but large enough so that we can multiply them by an unsigned 16-bit + * number and get a value as large as 0x00ff0000. + */ + + pmullw %mm6, %mm0 + pmullw %mm6, %mm2 +#if SCALE_ADJUST > 0 + psrlw $SCALE_ADJUST, %mm0 + psrlw $SCALE_ADJUST, %mm2 +#endif + + /* Scale the input component values to be on the range + * [0, 0x00ff0000]. This it the real magic of the whole routine. + */ + + pmulhuw %mm7, %mm0 + pmulhuw %mm7, %mm2 + + + /* Always set the alpha value to 0xff. + */ + + por %mm3, %mm0 + por %mm3, %mm2 + + + /* Pack the 16-bit values to 8-bit values and store the converted + * pixel data. + */ + + packuswb %mm2, %mm0 + movq %mm0, (%edx) + addl $8, %edx + + pshufw $0xaa, %mm4, %mm0 + pshufw $0xff, %mm4, %mm2 + + pand %mm5, %mm0 + pand %mm5, %mm2 + pmullw %mm6, %mm0 + pmullw %mm6, %mm2 +#if SCALE_ADJUST > 0 + psrlw $SCALE_ADJUST, %mm0 + psrlw $SCALE_ADJUST, %mm2 +#endif + pmulhuw %mm7, %mm0 + pmulhuw %mm7, %mm2 + + por %mm3, %mm0 + por %mm3, %mm2 + + packuswb %mm2, %mm0 + + movq %mm0, (%edx) + addl $8, %edx + + subl $1, %ecx +.L02: + jne .L03 + + + /* At this point there can be at most 3 pixels left to process. If + * there is either 2 or 3 left, process 2. + */ + + movl 12(%esp), %ecx + testl $0x02, %ecx + je .L04 + + movd (%eax), %mm4 + addl $4, %eax + + pshufw $0x00, %mm4, %mm0 + pshufw $0x55, %mm4, %mm2 + + pand %mm5, %mm0 + pand %mm5, %mm2 + pmullw %mm6, %mm0 + pmullw %mm6, %mm2 +#if SCALE_ADJUST > 0 + psrlw $SCALE_ADJUST, %mm0 + psrlw $SCALE_ADJUST, %mm2 +#endif + pmulhuw %mm7, %mm0 + pmulhuw %mm7, %mm2 + + por %mm3, %mm0 + por %mm3, %mm2 + + packuswb %mm2, %mm0 + + movq %mm0, (%edx) + addl $8, %edx + +.L04: + /* At this point there can be at most 1 pixel left to process. + * Process it if needed. + */ + + testl $0x01, %ecx + je .L01 + + movzwl (%eax), %ecx + movd %ecx, %mm4 + + pshufw $0x00, %mm4, %mm0 + + pand %mm5, %mm0 + pmullw %mm6, %mm0 +#if SCALE_ADJUST > 0 + psrlw $SCALE_ADJUST, %mm0 +#endif + pmulhuw %mm7, %mm0 + + por %mm3, %mm0 + + packuswb %mm0, %mm0 + + movd %mm0, (%edx) + +.L01: +#ifdef USE_INNER_EMMS + emms +#endif + ret +#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) */ + +#if defined (__ELF__) && defined (__linux__) + .section .note.GNU-stack,"",%progbits +#endif diff --git a/mesalib/src/mesa/x86/read_rgba_span_x86.h b/mesalib/src/mesa/x86/read_rgba_span_x86.h index d255f6dcb..564b1bb0f 100644 --- a/mesalib/src/mesa/x86/read_rgba_span_x86.h +++ b/mesalib/src/mesa/x86/read_rgba_span_x86.h @@ -1,56 +1,56 @@ -/*
- * (C) Copyright IBM Corporation 2004
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * \file read_rgba_span_x86.h
- *
- * \author Ian Romanick <idr@us.ibm.com>
- */
-
-#ifndef READ_RGBA_SPAN_X86_H
-#define READ_RGBA_SPAN_X86_H
-
-#if defined(USE_SSE_ASM) || defined(USE_MMX_ASM)
-#include "x86/common_x86_asm.h"
-#endif
-
-#if defined(USE_SSE_ASM)
-extern void _generic_read_RGBA_span_BGRA8888_REV_SSE2( const unsigned char *,
- unsigned char *, unsigned );
-#endif
-
-#if defined(USE_SSE_ASM)
-extern void _generic_read_RGBA_span_BGRA8888_REV_SSE( const unsigned char *,
- unsigned char *, unsigned );
-#endif
-
-#if defined(USE_MMX_ASM)
-extern void _generic_read_RGBA_span_BGRA8888_REV_MMX( const unsigned char *,
- unsigned char *, unsigned );
-
-extern void _generic_read_RGBA_span_RGB565_MMX( const unsigned char *,
- unsigned char *, unsigned );
-#endif
-
-#endif /* READ_RGBA_SPAN_X86_H */
+/* + * (C) Copyright IBM Corporation 2004 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * \file read_rgba_span_x86.h + * + * \author Ian Romanick <idr@us.ibm.com> + */ + +#ifndef READ_RGBA_SPAN_X86_H +#define READ_RGBA_SPAN_X86_H + +#if defined(USE_SSE_ASM) || defined(USE_MMX_ASM) +#include "x86/common_x86_asm.h" +#endif + +#if defined(USE_SSE_ASM) +extern void _generic_read_RGBA_span_BGRA8888_REV_SSE2( const unsigned char *, + unsigned char *, unsigned ); +#endif + +#if defined(USE_SSE_ASM) +extern void _generic_read_RGBA_span_BGRA8888_REV_SSE( const unsigned char *, + unsigned char *, unsigned ); +#endif + +#if defined(USE_MMX_ASM) +extern void _generic_read_RGBA_span_BGRA8888_REV_MMX( const unsigned char *, + unsigned char *, unsigned ); + +extern void _generic_read_RGBA_span_RGB565_MMX( const unsigned char *, + unsigned char *, unsigned ); +#endif + +#endif /* READ_RGBA_SPAN_X86_H */ diff --git a/mesalib/src/mesa/x86/rtasm/x86sse.c b/mesalib/src/mesa/x86/rtasm/x86sse.c index 08e3969b2..c93faba79 100644 --- a/mesalib/src/mesa/x86/rtasm/x86sse.c +++ b/mesalib/src/mesa/x86/rtasm/x86sse.c @@ -1,1203 +1,1203 @@ -#ifdef USE_X86_ASM
-#if defined(__i386__) || defined(__386__)
-
-#include "main/imports.h"
-#include "x86sse.h"
-
-#define DISASSEM 0
-#define X86_TWOB 0x0f
-
-#if 0
-static unsigned char *cptr( void (*label)() )
-{
- return (unsigned char *)(unsigned long)label;
-}
-#endif
-
-
-static void do_realloc( struct x86_function *p )
-{
- if (p->size == 0) {
- p->size = 1024;
- p->store = _mesa_exec_malloc(p->size);
- p->csr = p->store;
- }
- else {
- unsigned used = p->csr - p->store;
- unsigned char *tmp = p->store;
- p->size *= 2;
- p->store = _mesa_exec_malloc(p->size);
- memcpy(p->store, tmp, used);
- p->csr = p->store + used;
- _mesa_exec_free(tmp);
- }
-}
-
-/* Emit bytes to the instruction stream:
- */
-static unsigned char *reserve( struct x86_function *p, int bytes )
-{
- if (p->csr + bytes - p->store > p->size)
- do_realloc(p);
-
- {
- unsigned char *csr = p->csr;
- p->csr += bytes;
- return csr;
- }
-}
-
-
-
-static void emit_1b( struct x86_function *p, char b0 )
-{
- char *csr = (char *)reserve(p, 1);
- *csr = b0;
-}
-
-static void emit_1i( struct x86_function *p, int i0 )
-{
- int *icsr = (int *)reserve(p, sizeof(i0));
- *icsr = i0;
-}
-
-static void emit_1ub( struct x86_function *p, unsigned char b0 )
-{
- unsigned char *csr = reserve(p, 1);
- *csr++ = b0;
-}
-
-static void emit_2ub( struct x86_function *p, unsigned char b0, unsigned char b1 )
-{
- unsigned char *csr = reserve(p, 2);
- *csr++ = b0;
- *csr++ = b1;
-}
-
-static void emit_3ub( struct x86_function *p, unsigned char b0, unsigned char b1, unsigned char b2 )
-{
- unsigned char *csr = reserve(p, 3);
- *csr++ = b0;
- *csr++ = b1;
- *csr++ = b2;
-}
-
-
-/* Build a modRM byte + possible displacement. No treatment of SIB
- * indexing. BZZT - no way to encode an absolute address.
- */
-static void emit_modrm( struct x86_function *p,
- struct x86_reg reg,
- struct x86_reg regmem )
-{
- unsigned char val = 0;
-
- assert(reg.mod == mod_REG);
-
- val |= regmem.mod << 6; /* mod field */
- val |= reg.idx << 3; /* reg field */
- val |= regmem.idx; /* r/m field */
-
- emit_1ub(p, val);
-
- /* Oh-oh we've stumbled into the SIB thing.
- */
- if (regmem.file == file_REG32 &&
- regmem.idx == reg_SP) {
- emit_1ub(p, 0x24); /* simplistic! */
- }
-
- switch (regmem.mod) {
- case mod_REG:
- case mod_INDIRECT:
- break;
- case mod_DISP8:
- emit_1b(p, regmem.disp);
- break;
- case mod_DISP32:
- emit_1i(p, regmem.disp);
- break;
- default:
- assert(0);
- break;
- }
-}
-
-
-static void emit_modrm_noreg( struct x86_function *p,
- unsigned op,
- struct x86_reg regmem )
-{
- struct x86_reg dummy = x86_make_reg(file_REG32, op);
- emit_modrm(p, dummy, regmem);
-}
-
-/* Many x86 instructions have two opcodes to cope with the situations
- * where the destination is a register or memory reference
- * respectively. This function selects the correct opcode based on
- * the arguments presented.
- */
-static void emit_op_modrm( struct x86_function *p,
- unsigned char op_dst_is_reg,
- unsigned char op_dst_is_mem,
- struct x86_reg dst,
- struct x86_reg src )
-{
- switch (dst.mod) {
- case mod_REG:
- emit_1ub(p, op_dst_is_reg);
- emit_modrm(p, dst, src);
- break;
- case mod_INDIRECT:
- case mod_DISP32:
- case mod_DISP8:
- assert(src.mod == mod_REG);
- emit_1ub(p, op_dst_is_mem);
- emit_modrm(p, src, dst);
- break;
- default:
- assert(0);
- break;
- }
-}
-
-
-
-
-
-
-
-/* Create and manipulate registers and regmem values:
- */
-struct x86_reg x86_make_reg( enum x86_reg_file file,
- enum x86_reg_name idx )
-{
- struct x86_reg reg;
-
- reg.file = file;
- reg.idx = idx;
- reg.mod = mod_REG;
- reg.disp = 0;
-
- return reg;
-}
-
-struct x86_reg x86_make_disp( struct x86_reg reg,
- int disp )
-{
- assert(reg.file == file_REG32);
-
- if (reg.mod == mod_REG)
- reg.disp = disp;
- else
- reg.disp += disp;
-
- if (reg.disp == 0)
- reg.mod = mod_INDIRECT;
- else if (reg.disp <= 127 && reg.disp >= -128)
- reg.mod = mod_DISP8;
- else
- reg.mod = mod_DISP32;
-
- return reg;
-}
-
-struct x86_reg x86_deref( struct x86_reg reg )
-{
- return x86_make_disp(reg, 0);
-}
-
-struct x86_reg x86_get_base_reg( struct x86_reg reg )
-{
- return x86_make_reg( reg.file, reg.idx );
-}
-
-unsigned char *x86_get_label( struct x86_function *p )
-{
- return p->csr;
-}
-
-
-
-/***********************************************************************
- * x86 instructions
- */
-
-
-void x86_jcc( struct x86_function *p,
- enum x86_cc cc,
- unsigned char *label )
-{
- int offset = label - (x86_get_label(p) + 2);
-
- if (offset <= 127 && offset >= -128) {
- emit_1ub(p, 0x70 + cc);
- emit_1b(p, (char) offset);
- }
- else {
- offset = label - (x86_get_label(p) + 6);
- emit_2ub(p, 0x0f, 0x80 + cc);
- emit_1i(p, offset);
- }
-}
-
-/* Always use a 32bit offset for forward jumps:
- */
-unsigned char *x86_jcc_forward( struct x86_function *p,
- enum x86_cc cc )
-{
- emit_2ub(p, 0x0f, 0x80 + cc);
- emit_1i(p, 0);
- return x86_get_label(p);
-}
-
-unsigned char *x86_jmp_forward( struct x86_function *p)
-{
- emit_1ub(p, 0xe9);
- emit_1i(p, 0);
- return x86_get_label(p);
-}
-
-unsigned char *x86_call_forward( struct x86_function *p)
-{
- emit_1ub(p, 0xe8);
- emit_1i(p, 0);
- return x86_get_label(p);
-}
-
-/* Fixup offset from forward jump:
- */
-void x86_fixup_fwd_jump( struct x86_function *p,
- unsigned char *fixup )
-{
- *(int *)(fixup - 4) = x86_get_label(p) - fixup;
-}
-
-void x86_jmp( struct x86_function *p, unsigned char *label)
-{
- emit_1ub(p, 0xe9);
- emit_1i(p, label - x86_get_label(p) - 4);
-}
-
-#if 0
-/* This doesn't work once we start reallocating & copying the
- * generated code on buffer fills, because the call is relative to the
- * current pc.
- */
-void x86_call( struct x86_function *p, void (*label)())
-{
- emit_1ub(p, 0xe8);
- emit_1i(p, cptr(label) - x86_get_label(p) - 4);
-}
-#else
-void x86_call( struct x86_function *p, struct x86_reg reg)
-{
- emit_1ub(p, 0xff);
- emit_modrm_noreg(p, 2, reg);
-}
-#endif
-
-
-/* michal:
- * Temporary. As I need immediate operands, and dont want to mess with the codegen,
- * I load the immediate into general purpose register and use it.
- */
-void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm )
-{
- assert(dst.mod == mod_REG);
- emit_1ub(p, 0xb8 + dst.idx);
- emit_1i(p, imm);
-}
-
-void x86_push( struct x86_function *p,
- struct x86_reg reg )
-{
- assert(reg.mod == mod_REG);
- emit_1ub(p, 0x50 + reg.idx);
- p->stack_offset += 4;
-}
-
-void x86_pop( struct x86_function *p,
- struct x86_reg reg )
-{
- assert(reg.mod == mod_REG);
- emit_1ub(p, 0x58 + reg.idx);
- p->stack_offset -= 4;
-}
-
-void x86_inc( struct x86_function *p,
- struct x86_reg reg )
-{
- assert(reg.mod == mod_REG);
- emit_1ub(p, 0x40 + reg.idx);
-}
-
-void x86_dec( struct x86_function *p,
- struct x86_reg reg )
-{
- assert(reg.mod == mod_REG);
- emit_1ub(p, 0x48 + reg.idx);
-}
-
-void x86_ret( struct x86_function *p )
-{
- emit_1ub(p, 0xc3);
-}
-
-void x86_sahf( struct x86_function *p )
-{
- emit_1ub(p, 0x9e);
-}
-
-void x86_mov( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_op_modrm( p, 0x8b, 0x89, dst, src );
-}
-
-void x86_xor( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_op_modrm( p, 0x33, 0x31, dst, src );
-}
-
-void x86_cmp( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_op_modrm( p, 0x3b, 0x39, dst, src );
-}
-
-void x86_lea( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_1ub(p, 0x8d);
- emit_modrm( p, dst, src );
-}
-
-void x86_test( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_1ub(p, 0x85);
- emit_modrm( p, dst, src );
-}
-
-void x86_add( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_op_modrm(p, 0x03, 0x01, dst, src );
-}
-
-void x86_mul( struct x86_function *p,
- struct x86_reg src )
-{
- assert (src.file == file_REG32 && src.mod == mod_REG);
- emit_op_modrm(p, 0xf7, 0, x86_make_reg (file_REG32, reg_SP), src );
-}
-
-void x86_sub( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_op_modrm(p, 0x2b, 0x29, dst, src );
-}
-
-void x86_or( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_op_modrm( p, 0x0b, 0x09, dst, src );
-}
-
-void x86_and( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_op_modrm( p, 0x23, 0x21, dst, src );
-}
-
-
-
-/***********************************************************************
- * SSE instructions
- */
-
-
-void sse_movss( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_2ub(p, 0xF3, X86_TWOB);
- emit_op_modrm( p, 0x10, 0x11, dst, src );
-}
-
-void sse_movaps( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_1ub(p, X86_TWOB);
- emit_op_modrm( p, 0x28, 0x29, dst, src );
-}
-
-void sse_movups( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_1ub(p, X86_TWOB);
- emit_op_modrm( p, 0x10, 0x11, dst, src );
-}
-
-void sse_movhps( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- assert(dst.mod != mod_REG || src.mod != mod_REG);
- emit_1ub(p, X86_TWOB);
- emit_op_modrm( p, 0x16, 0x17, dst, src ); /* cf movlhps */
-}
-
-void sse_movlps( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- assert(dst.mod != mod_REG || src.mod != mod_REG);
- emit_1ub(p, X86_TWOB);
- emit_op_modrm( p, 0x12, 0x13, dst, src ); /* cf movhlps */
-}
-
-void sse_maxps( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_2ub(p, X86_TWOB, 0x5F);
- emit_modrm( p, dst, src );
-}
-
-void sse_maxss( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_3ub(p, 0xF3, X86_TWOB, 0x5F);
- emit_modrm( p, dst, src );
-}
-
-void sse_divss( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_3ub(p, 0xF3, X86_TWOB, 0x5E);
- emit_modrm( p, dst, src );
-}
-
-void sse_minps( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_2ub(p, X86_TWOB, 0x5D);
- emit_modrm( p, dst, src );
-}
-
-void sse_subps( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_2ub(p, X86_TWOB, 0x5C);
- emit_modrm( p, dst, src );
-}
-
-void sse_mulps( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_2ub(p, X86_TWOB, 0x59);
- emit_modrm( p, dst, src );
-}
-
-void sse_mulss( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_3ub(p, 0xF3, X86_TWOB, 0x59);
- emit_modrm( p, dst, src );
-}
-
-void sse_addps( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_2ub(p, X86_TWOB, 0x58);
- emit_modrm( p, dst, src );
-}
-
-void sse_addss( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_3ub(p, 0xF3, X86_TWOB, 0x58);
- emit_modrm( p, dst, src );
-}
-
-void sse_andnps( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_2ub(p, X86_TWOB, 0x55);
- emit_modrm( p, dst, src );
-}
-
-void sse_andps( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_2ub(p, X86_TWOB, 0x54);
- emit_modrm( p, dst, src );
-}
-
-void sse_rsqrtps( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_2ub(p, X86_TWOB, 0x52);
- emit_modrm( p, dst, src );
-}
-
-void sse_rsqrtss( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_3ub(p, 0xF3, X86_TWOB, 0x52);
- emit_modrm( p, dst, src );
-
-}
-
-void sse_movhlps( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- assert(dst.mod == mod_REG && src.mod == mod_REG);
- emit_2ub(p, X86_TWOB, 0x12);
- emit_modrm( p, dst, src );
-}
-
-void sse_movlhps( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- assert(dst.mod == mod_REG && src.mod == mod_REG);
- emit_2ub(p, X86_TWOB, 0x16);
- emit_modrm( p, dst, src );
-}
-
-void sse_orps( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_2ub(p, X86_TWOB, 0x56);
- emit_modrm( p, dst, src );
-}
-
-void sse_xorps( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_2ub(p, X86_TWOB, 0x57);
- emit_modrm( p, dst, src );
-}
-
-void sse_cvtps2pi( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- assert(dst.file == file_MMX &&
- (src.file == file_XMM || src.mod != mod_REG));
-
- p->need_emms = 1;
-
- emit_2ub(p, X86_TWOB, 0x2d);
- emit_modrm( p, dst, src );
-}
-
-
-/* Shufps can also be used to implement a reduced swizzle when dest ==
- * arg0.
- */
-void sse_shufps( struct x86_function *p,
- struct x86_reg dest,
- struct x86_reg arg0,
- unsigned char shuf)
-{
- emit_2ub(p, X86_TWOB, 0xC6);
- emit_modrm(p, dest, arg0);
- emit_1ub(p, shuf);
-}
-
-void sse_cmpps( struct x86_function *p,
- struct x86_reg dest,
- struct x86_reg arg0,
- unsigned char cc)
-{
- emit_2ub(p, X86_TWOB, 0xC2);
- emit_modrm(p, dest, arg0);
- emit_1ub(p, cc);
-}
-
-void sse_pmovmskb( struct x86_function *p,
- struct x86_reg dest,
- struct x86_reg src)
-{
- emit_3ub(p, 0x66, X86_TWOB, 0xD7);
- emit_modrm(p, dest, src);
-}
-
-/***********************************************************************
- * SSE2 instructions
- */
-
-/**
- * Perform a reduced swizzle:
- */
-void sse2_pshufd( struct x86_function *p,
- struct x86_reg dest,
- struct x86_reg arg0,
- unsigned char shuf)
-{
- emit_3ub(p, 0x66, X86_TWOB, 0x70);
- emit_modrm(p, dest, arg0);
- emit_1ub(p, shuf);
-}
-
-void sse2_cvttps2dq( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_3ub( p, 0xF3, X86_TWOB, 0x5B );
- emit_modrm( p, dst, src );
-}
-
-void sse2_cvtps2dq( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_3ub(p, 0x66, X86_TWOB, 0x5B);
- emit_modrm( p, dst, src );
-}
-
-void sse2_packssdw( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_3ub(p, 0x66, X86_TWOB, 0x6B);
- emit_modrm( p, dst, src );
-}
-
-void sse2_packsswb( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_3ub(p, 0x66, X86_TWOB, 0x63);
- emit_modrm( p, dst, src );
-}
-
-void sse2_packuswb( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_3ub(p, 0x66, X86_TWOB, 0x67);
- emit_modrm( p, dst, src );
-}
-
-void sse2_rcpps( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_2ub(p, X86_TWOB, 0x53);
- emit_modrm( p, dst, src );
-}
-
-void sse2_rcpss( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_3ub(p, 0xF3, X86_TWOB, 0x53);
- emit_modrm( p, dst, src );
-}
-
-void sse2_movd( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- emit_2ub(p, 0x66, X86_TWOB);
- emit_op_modrm( p, 0x6e, 0x7e, dst, src );
-}
-
-
-
-
-/***********************************************************************
- * x87 instructions
- */
-void x87_fist( struct x86_function *p, struct x86_reg dst )
-{
- emit_1ub(p, 0xdb);
- emit_modrm_noreg(p, 2, dst);
-}
-
-void x87_fistp( struct x86_function *p, struct x86_reg dst )
-{
- emit_1ub(p, 0xdb);
- emit_modrm_noreg(p, 3, dst);
-}
-
-void x87_fild( struct x86_function *p, struct x86_reg arg )
-{
- emit_1ub(p, 0xdf);
- emit_modrm_noreg(p, 0, arg);
-}
-
-void x87_fldz( struct x86_function *p )
-{
- emit_2ub(p, 0xd9, 0xee);
-}
-
-
-void x87_fldcw( struct x86_function *p, struct x86_reg arg )
-{
- assert(arg.file == file_REG32);
- assert(arg.mod != mod_REG);
- emit_1ub(p, 0xd9);
- emit_modrm_noreg(p, 5, arg);
-}
-
-void x87_fld1( struct x86_function *p )
-{
- emit_2ub(p, 0xd9, 0xe8);
-}
-
-void x87_fldl2e( struct x86_function *p )
-{
- emit_2ub(p, 0xd9, 0xea);
-}
-
-void x87_fldln2( struct x86_function *p )
-{
- emit_2ub(p, 0xd9, 0xed);
-}
-
-void x87_fwait( struct x86_function *p )
-{
- emit_1ub(p, 0x9b);
-}
-
-void x87_fnclex( struct x86_function *p )
-{
- emit_2ub(p, 0xdb, 0xe2);
-}
-
-void x87_fclex( struct x86_function *p )
-{
- x87_fwait(p);
- x87_fnclex(p);
-}
-
-
-static void x87_arith_op( struct x86_function *p, struct x86_reg dst, struct x86_reg arg,
- unsigned char dst0ub0,
- unsigned char dst0ub1,
- unsigned char arg0ub0,
- unsigned char arg0ub1,
- unsigned char argmem_noreg)
-{
- assert(dst.file == file_x87);
-
- if (arg.file == file_x87) {
- if (dst.idx == 0)
- emit_2ub(p, dst0ub0, dst0ub1+arg.idx);
- else if (arg.idx == 0)
- emit_2ub(p, arg0ub0, arg0ub1+arg.idx);
- else
- assert(0);
- }
- else if (dst.idx == 0) {
- assert(arg.file == file_REG32);
- emit_1ub(p, 0xd8);
- emit_modrm_noreg(p, argmem_noreg, arg);
- }
- else
- assert(0);
-}
-
-void x87_fmul( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
-{
- x87_arith_op(p, dst, arg,
- 0xd8, 0xc8,
- 0xdc, 0xc8,
- 4);
-}
-
-void x87_fsub( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
-{
- x87_arith_op(p, dst, arg,
- 0xd8, 0xe0,
- 0xdc, 0xe8,
- 4);
-}
-
-void x87_fsubr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
-{
- x87_arith_op(p, dst, arg,
- 0xd8, 0xe8,
- 0xdc, 0xe0,
- 5);
-}
-
-void x87_fadd( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
-{
- x87_arith_op(p, dst, arg,
- 0xd8, 0xc0,
- 0xdc, 0xc0,
- 0);
-}
-
-void x87_fdiv( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
-{
- x87_arith_op(p, dst, arg,
- 0xd8, 0xf0,
- 0xdc, 0xf8,
- 6);
-}
-
-void x87_fdivr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
-{
- x87_arith_op(p, dst, arg,
- 0xd8, 0xf8,
- 0xdc, 0xf0,
- 7);
-}
-
-void x87_fmulp( struct x86_function *p, struct x86_reg dst )
-{
- assert(dst.file == file_x87);
- assert(dst.idx >= 1);
- emit_2ub(p, 0xde, 0xc8+dst.idx);
-}
-
-void x87_fsubp( struct x86_function *p, struct x86_reg dst )
-{
- assert(dst.file == file_x87);
- assert(dst.idx >= 1);
- emit_2ub(p, 0xde, 0xe8+dst.idx);
-}
-
-void x87_fsubrp( struct x86_function *p, struct x86_reg dst )
-{
- assert(dst.file == file_x87);
- assert(dst.idx >= 1);
- emit_2ub(p, 0xde, 0xe0+dst.idx);
-}
-
-void x87_faddp( struct x86_function *p, struct x86_reg dst )
-{
- assert(dst.file == file_x87);
- assert(dst.idx >= 1);
- emit_2ub(p, 0xde, 0xc0+dst.idx);
-}
-
-void x87_fdivp( struct x86_function *p, struct x86_reg dst )
-{
- assert(dst.file == file_x87);
- assert(dst.idx >= 1);
- emit_2ub(p, 0xde, 0xf8+dst.idx);
-}
-
-void x87_fdivrp( struct x86_function *p, struct x86_reg dst )
-{
- assert(dst.file == file_x87);
- assert(dst.idx >= 1);
- emit_2ub(p, 0xde, 0xf0+dst.idx);
-}
-
-void x87_fucom( struct x86_function *p, struct x86_reg arg )
-{
- assert(arg.file == file_x87);
- emit_2ub(p, 0xdd, 0xe0+arg.idx);
-}
-
-void x87_fucomp( struct x86_function *p, struct x86_reg arg )
-{
- assert(arg.file == file_x87);
- emit_2ub(p, 0xdd, 0xe8+arg.idx);
-}
-
-void x87_fucompp( struct x86_function *p )
-{
- emit_2ub(p, 0xda, 0xe9);
-}
-
-void x87_fxch( struct x86_function *p, struct x86_reg arg )
-{
- assert(arg.file == file_x87);
- emit_2ub(p, 0xd9, 0xc8+arg.idx);
-}
-
-void x87_fabs( struct x86_function *p )
-{
- emit_2ub(p, 0xd9, 0xe1);
-}
-
-void x87_fchs( struct x86_function *p )
-{
- emit_2ub(p, 0xd9, 0xe0);
-}
-
-void x87_fcos( struct x86_function *p )
-{
- emit_2ub(p, 0xd9, 0xff);
-}
-
-
-void x87_fprndint( struct x86_function *p )
-{
- emit_2ub(p, 0xd9, 0xfc);
-}
-
-void x87_fscale( struct x86_function *p )
-{
- emit_2ub(p, 0xd9, 0xfd);
-}
-
-void x87_fsin( struct x86_function *p )
-{
- emit_2ub(p, 0xd9, 0xfe);
-}
-
-void x87_fsincos( struct x86_function *p )
-{
- emit_2ub(p, 0xd9, 0xfb);
-}
-
-void x87_fsqrt( struct x86_function *p )
-{
- emit_2ub(p, 0xd9, 0xfa);
-}
-
-void x87_fxtract( struct x86_function *p )
-{
- emit_2ub(p, 0xd9, 0xf4);
-}
-
-/* st0 = (2^st0)-1
- *
- * Restrictions: -1.0 <= st0 <= 1.0
- */
-void x87_f2xm1( struct x86_function *p )
-{
- emit_2ub(p, 0xd9, 0xf0);
-}
-
-/* st1 = st1 * log2(st0);
- * pop_stack;
- */
-void x87_fyl2x( struct x86_function *p )
-{
- emit_2ub(p, 0xd9, 0xf1);
-}
-
-/* st1 = st1 * log2(st0 + 1.0);
- * pop_stack;
- *
- * A fast operation, with restrictions: -.29 < st0 < .29
- */
-void x87_fyl2xp1( struct x86_function *p )
-{
- emit_2ub(p, 0xd9, 0xf9);
-}
-
-
-void x87_fld( struct x86_function *p, struct x86_reg arg )
-{
- if (arg.file == file_x87)
- emit_2ub(p, 0xd9, 0xc0 + arg.idx);
- else {
- emit_1ub(p, 0xd9);
- emit_modrm_noreg(p, 0, arg);
- }
-}
-
-void x87_fst( struct x86_function *p, struct x86_reg dst )
-{
- if (dst.file == file_x87)
- emit_2ub(p, 0xdd, 0xd0 + dst.idx);
- else {
- emit_1ub(p, 0xd9);
- emit_modrm_noreg(p, 2, dst);
- }
-}
-
-void x87_fstp( struct x86_function *p, struct x86_reg dst )
-{
- if (dst.file == file_x87)
- emit_2ub(p, 0xdd, 0xd8 + dst.idx);
- else {
- emit_1ub(p, 0xd9);
- emit_modrm_noreg(p, 3, dst);
- }
-}
-
-void x87_fcom( struct x86_function *p, struct x86_reg dst )
-{
- if (dst.file == file_x87)
- emit_2ub(p, 0xd8, 0xd0 + dst.idx);
- else {
- emit_1ub(p, 0xd8);
- emit_modrm_noreg(p, 2, dst);
- }
-}
-
-void x87_fcomp( struct x86_function *p, struct x86_reg dst )
-{
- if (dst.file == file_x87)
- emit_2ub(p, 0xd8, 0xd8 + dst.idx);
- else {
- emit_1ub(p, 0xd8);
- emit_modrm_noreg(p, 3, dst);
- }
-}
-
-
-void x87_fnstsw( struct x86_function *p, struct x86_reg dst )
-{
- assert(dst.file == file_REG32);
-
- if (dst.idx == reg_AX &&
- dst.mod == mod_REG)
- emit_2ub(p, 0xdf, 0xe0);
- else {
- emit_1ub(p, 0xdd);
- emit_modrm_noreg(p, 7, dst);
- }
-}
-
-
-
-
-/***********************************************************************
- * MMX instructions
- */
-
-void mmx_emms( struct x86_function *p )
-{
- assert(p->need_emms);
- emit_2ub(p, 0x0f, 0x77);
- p->need_emms = 0;
-}
-
-void mmx_packssdw( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- assert(dst.file == file_MMX &&
- (src.file == file_MMX || src.mod != mod_REG));
-
- p->need_emms = 1;
-
- emit_2ub(p, X86_TWOB, 0x6b);
- emit_modrm( p, dst, src );
-}
-
-void mmx_packuswb( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- assert(dst.file == file_MMX &&
- (src.file == file_MMX || src.mod != mod_REG));
-
- p->need_emms = 1;
-
- emit_2ub(p, X86_TWOB, 0x67);
- emit_modrm( p, dst, src );
-}
-
-void mmx_movd( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- p->need_emms = 1;
- emit_1ub(p, X86_TWOB);
- emit_op_modrm( p, 0x6e, 0x7e, dst, src );
-}
-
-void mmx_movq( struct x86_function *p,
- struct x86_reg dst,
- struct x86_reg src )
-{
- p->need_emms = 1;
- emit_1ub(p, X86_TWOB);
- emit_op_modrm( p, 0x6f, 0x7f, dst, src );
-}
-
-
-/***********************************************************************
- * Helper functions
- */
-
-
-/* Retreive a reference to one of the function arguments, taking into
- * account any push/pop activity:
- */
-struct x86_reg x86_fn_arg( struct x86_function *p,
- unsigned arg )
-{
- return x86_make_disp(x86_make_reg(file_REG32, reg_SP),
- p->stack_offset + arg * 4); /* ??? */
-}
-
-
-void x86_init_func( struct x86_function *p )
-{
- p->size = 0;
- p->store = NULL;
- p->csr = p->store;
-}
-
-int x86_init_func_size( struct x86_function *p, unsigned code_size )
-{
- p->size = code_size;
- p->store = _mesa_exec_malloc(code_size);
- p->csr = p->store;
- return p->store != NULL;
-}
-
-void x86_release_func( struct x86_function *p )
-{
- _mesa_exec_free(p->store);
- p->store = NULL;
- p->csr = NULL;
- p->size = 0;
-}
-
-
-void (*x86_get_func( struct x86_function *p ))(void)
-{
- if (DISASSEM && p->store)
- printf("disassemble %p %p\n", p->store, p->csr);
- return (void (*)(void)) (unsigned long) p->store;
-}
-
-#else
-
-void x86sse_dummy( void )
-{
-}
-
-#endif
-
-#else /* USE_X86_ASM */
-
-int x86sse_c_dummy_var; /* silence warning */
-
-#endif /* USE_X86_ASM */
+#ifdef USE_X86_ASM +#if defined(__i386__) || defined(__386__) + +#include "main/imports.h" +#include "x86sse.h" + +#define DISASSEM 0 +#define X86_TWOB 0x0f + +#if 0 +static unsigned char *cptr( void (*label)() ) +{ + return (unsigned char *)(unsigned long)label; +} +#endif + + +static void do_realloc( struct x86_function *p ) +{ + if (p->size == 0) { + p->size = 1024; + p->store = _mesa_exec_malloc(p->size); + p->csr = p->store; + } + else { + unsigned used = p->csr - p->store; + unsigned char *tmp = p->store; + p->size *= 2; + p->store = _mesa_exec_malloc(p->size); + memcpy(p->store, tmp, used); + p->csr = p->store + used; + _mesa_exec_free(tmp); + } +} + +/* Emit bytes to the instruction stream: + */ +static unsigned char *reserve( struct x86_function *p, int bytes ) +{ + if (p->csr + bytes - p->store > p->size) + do_realloc(p); + + { + unsigned char *csr = p->csr; + p->csr += bytes; + return csr; + } +} + + + +static void emit_1b( struct x86_function *p, char b0 ) +{ + char *csr = (char *)reserve(p, 1); + *csr = b0; +} + +static void emit_1i( struct x86_function *p, int i0 ) +{ + int *icsr = (int *)reserve(p, sizeof(i0)); + *icsr = i0; +} + +static void emit_1ub( struct x86_function *p, unsigned char b0 ) +{ + unsigned char *csr = reserve(p, 1); + *csr++ = b0; +} + +static void emit_2ub( struct x86_function *p, unsigned char b0, unsigned char b1 ) +{ + unsigned char *csr = reserve(p, 2); + *csr++ = b0; + *csr++ = b1; +} + +static void emit_3ub( struct x86_function *p, unsigned char b0, unsigned char b1, unsigned char b2 ) +{ + unsigned char *csr = reserve(p, 3); + *csr++ = b0; + *csr++ = b1; + *csr++ = b2; +} + + +/* Build a modRM byte + possible displacement. No treatment of SIB + * indexing. BZZT - no way to encode an absolute address. + */ +static void emit_modrm( struct x86_function *p, + struct x86_reg reg, + struct x86_reg regmem ) +{ + unsigned char val = 0; + + assert(reg.mod == mod_REG); + + val |= regmem.mod << 6; /* mod field */ + val |= reg.idx << 3; /* reg field */ + val |= regmem.idx; /* r/m field */ + + emit_1ub(p, val); + + /* Oh-oh we've stumbled into the SIB thing. + */ + if (regmem.file == file_REG32 && + regmem.idx == reg_SP) { + emit_1ub(p, 0x24); /* simplistic! */ + } + + switch (regmem.mod) { + case mod_REG: + case mod_INDIRECT: + break; + case mod_DISP8: + emit_1b(p, regmem.disp); + break; + case mod_DISP32: + emit_1i(p, regmem.disp); + break; + default: + assert(0); + break; + } +} + + +static void emit_modrm_noreg( struct x86_function *p, + unsigned op, + struct x86_reg regmem ) +{ + struct x86_reg dummy = x86_make_reg(file_REG32, op); + emit_modrm(p, dummy, regmem); +} + +/* Many x86 instructions have two opcodes to cope with the situations + * where the destination is a register or memory reference + * respectively. This function selects the correct opcode based on + * the arguments presented. + */ +static void emit_op_modrm( struct x86_function *p, + unsigned char op_dst_is_reg, + unsigned char op_dst_is_mem, + struct x86_reg dst, + struct x86_reg src ) +{ + switch (dst.mod) { + case mod_REG: + emit_1ub(p, op_dst_is_reg); + emit_modrm(p, dst, src); + break; + case mod_INDIRECT: + case mod_DISP32: + case mod_DISP8: + assert(src.mod == mod_REG); + emit_1ub(p, op_dst_is_mem); + emit_modrm(p, src, dst); + break; + default: + assert(0); + break; + } +} + + + + + + + +/* Create and manipulate registers and regmem values: + */ +struct x86_reg x86_make_reg( enum x86_reg_file file, + enum x86_reg_name idx ) +{ + struct x86_reg reg; + + reg.file = file; + reg.idx = idx; + reg.mod = mod_REG; + reg.disp = 0; + + return reg; +} + +struct x86_reg x86_make_disp( struct x86_reg reg, + int disp ) +{ + assert(reg.file == file_REG32); + + if (reg.mod == mod_REG) + reg.disp = disp; + else + reg.disp += disp; + + if (reg.disp == 0) + reg.mod = mod_INDIRECT; + else if (reg.disp <= 127 && reg.disp >= -128) + reg.mod = mod_DISP8; + else + reg.mod = mod_DISP32; + + return reg; +} + +struct x86_reg x86_deref( struct x86_reg reg ) +{ + return x86_make_disp(reg, 0); +} + +struct x86_reg x86_get_base_reg( struct x86_reg reg ) +{ + return x86_make_reg( reg.file, reg.idx ); +} + +unsigned char *x86_get_label( struct x86_function *p ) +{ + return p->csr; +} + + + +/*********************************************************************** + * x86 instructions + */ + + +void x86_jcc( struct x86_function *p, + enum x86_cc cc, + unsigned char *label ) +{ + int offset = label - (x86_get_label(p) + 2); + + if (offset <= 127 && offset >= -128) { + emit_1ub(p, 0x70 + cc); + emit_1b(p, (char) offset); + } + else { + offset = label - (x86_get_label(p) + 6); + emit_2ub(p, 0x0f, 0x80 + cc); + emit_1i(p, offset); + } +} + +/* Always use a 32bit offset for forward jumps: + */ +unsigned char *x86_jcc_forward( struct x86_function *p, + enum x86_cc cc ) +{ + emit_2ub(p, 0x0f, 0x80 + cc); + emit_1i(p, 0); + return x86_get_label(p); +} + +unsigned char *x86_jmp_forward( struct x86_function *p) +{ + emit_1ub(p, 0xe9); + emit_1i(p, 0); + return x86_get_label(p); +} + +unsigned char *x86_call_forward( struct x86_function *p) +{ + emit_1ub(p, 0xe8); + emit_1i(p, 0); + return x86_get_label(p); +} + +/* Fixup offset from forward jump: + */ +void x86_fixup_fwd_jump( struct x86_function *p, + unsigned char *fixup ) +{ + *(int *)(fixup - 4) = x86_get_label(p) - fixup; +} + +void x86_jmp( struct x86_function *p, unsigned char *label) +{ + emit_1ub(p, 0xe9); + emit_1i(p, label - x86_get_label(p) - 4); +} + +#if 0 +/* This doesn't work once we start reallocating & copying the + * generated code on buffer fills, because the call is relative to the + * current pc. + */ +void x86_call( struct x86_function *p, void (*label)()) +{ + emit_1ub(p, 0xe8); + emit_1i(p, cptr(label) - x86_get_label(p) - 4); +} +#else +void x86_call( struct x86_function *p, struct x86_reg reg) +{ + emit_1ub(p, 0xff); + emit_modrm_noreg(p, 2, reg); +} +#endif + + +/* michal: + * Temporary. As I need immediate operands, and dont want to mess with the codegen, + * I load the immediate into general purpose register and use it. + */ +void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm ) +{ + assert(dst.mod == mod_REG); + emit_1ub(p, 0xb8 + dst.idx); + emit_1i(p, imm); +} + +void x86_push( struct x86_function *p, + struct x86_reg reg ) +{ + assert(reg.mod == mod_REG); + emit_1ub(p, 0x50 + reg.idx); + p->stack_offset += 4; +} + +void x86_pop( struct x86_function *p, + struct x86_reg reg ) +{ + assert(reg.mod == mod_REG); + emit_1ub(p, 0x58 + reg.idx); + p->stack_offset -= 4; +} + +void x86_inc( struct x86_function *p, + struct x86_reg reg ) +{ + assert(reg.mod == mod_REG); + emit_1ub(p, 0x40 + reg.idx); +} + +void x86_dec( struct x86_function *p, + struct x86_reg reg ) +{ + assert(reg.mod == mod_REG); + emit_1ub(p, 0x48 + reg.idx); +} + +void x86_ret( struct x86_function *p ) +{ + emit_1ub(p, 0xc3); +} + +void x86_sahf( struct x86_function *p ) +{ + emit_1ub(p, 0x9e); +} + +void x86_mov( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_op_modrm( p, 0x8b, 0x89, dst, src ); +} + +void x86_xor( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_op_modrm( p, 0x33, 0x31, dst, src ); +} + +void x86_cmp( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_op_modrm( p, 0x3b, 0x39, dst, src ); +} + +void x86_lea( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_1ub(p, 0x8d); + emit_modrm( p, dst, src ); +} + +void x86_test( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_1ub(p, 0x85); + emit_modrm( p, dst, src ); +} + +void x86_add( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_op_modrm(p, 0x03, 0x01, dst, src ); +} + +void x86_mul( struct x86_function *p, + struct x86_reg src ) +{ + assert (src.file == file_REG32 && src.mod == mod_REG); + emit_op_modrm(p, 0xf7, 0, x86_make_reg (file_REG32, reg_SP), src ); +} + +void x86_sub( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_op_modrm(p, 0x2b, 0x29, dst, src ); +} + +void x86_or( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_op_modrm( p, 0x0b, 0x09, dst, src ); +} + +void x86_and( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_op_modrm( p, 0x23, 0x21, dst, src ); +} + + + +/*********************************************************************** + * SSE instructions + */ + + +void sse_movss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_2ub(p, 0xF3, X86_TWOB); + emit_op_modrm( p, 0x10, 0x11, dst, src ); +} + +void sse_movaps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_1ub(p, X86_TWOB); + emit_op_modrm( p, 0x28, 0x29, dst, src ); +} + +void sse_movups( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_1ub(p, X86_TWOB); + emit_op_modrm( p, 0x10, 0x11, dst, src ); +} + +void sse_movhps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + assert(dst.mod != mod_REG || src.mod != mod_REG); + emit_1ub(p, X86_TWOB); + emit_op_modrm( p, 0x16, 0x17, dst, src ); /* cf movlhps */ +} + +void sse_movlps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + assert(dst.mod != mod_REG || src.mod != mod_REG); + emit_1ub(p, X86_TWOB); + emit_op_modrm( p, 0x12, 0x13, dst, src ); /* cf movhlps */ +} + +void sse_maxps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_2ub(p, X86_TWOB, 0x5F); + emit_modrm( p, dst, src ); +} + +void sse_maxss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_3ub(p, 0xF3, X86_TWOB, 0x5F); + emit_modrm( p, dst, src ); +} + +void sse_divss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_3ub(p, 0xF3, X86_TWOB, 0x5E); + emit_modrm( p, dst, src ); +} + +void sse_minps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_2ub(p, X86_TWOB, 0x5D); + emit_modrm( p, dst, src ); +} + +void sse_subps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_2ub(p, X86_TWOB, 0x5C); + emit_modrm( p, dst, src ); +} + +void sse_mulps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_2ub(p, X86_TWOB, 0x59); + emit_modrm( p, dst, src ); +} + +void sse_mulss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_3ub(p, 0xF3, X86_TWOB, 0x59); + emit_modrm( p, dst, src ); +} + +void sse_addps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_2ub(p, X86_TWOB, 0x58); + emit_modrm( p, dst, src ); +} + +void sse_addss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_3ub(p, 0xF3, X86_TWOB, 0x58); + emit_modrm( p, dst, src ); +} + +void sse_andnps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_2ub(p, X86_TWOB, 0x55); + emit_modrm( p, dst, src ); +} + +void sse_andps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_2ub(p, X86_TWOB, 0x54); + emit_modrm( p, dst, src ); +} + +void sse_rsqrtps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_2ub(p, X86_TWOB, 0x52); + emit_modrm( p, dst, src ); +} + +void sse_rsqrtss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_3ub(p, 0xF3, X86_TWOB, 0x52); + emit_modrm( p, dst, src ); + +} + +void sse_movhlps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + assert(dst.mod == mod_REG && src.mod == mod_REG); + emit_2ub(p, X86_TWOB, 0x12); + emit_modrm( p, dst, src ); +} + +void sse_movlhps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + assert(dst.mod == mod_REG && src.mod == mod_REG); + emit_2ub(p, X86_TWOB, 0x16); + emit_modrm( p, dst, src ); +} + +void sse_orps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_2ub(p, X86_TWOB, 0x56); + emit_modrm( p, dst, src ); +} + +void sse_xorps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_2ub(p, X86_TWOB, 0x57); + emit_modrm( p, dst, src ); +} + +void sse_cvtps2pi( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + assert(dst.file == file_MMX && + (src.file == file_XMM || src.mod != mod_REG)); + + p->need_emms = 1; + + emit_2ub(p, X86_TWOB, 0x2d); + emit_modrm( p, dst, src ); +} + + +/* Shufps can also be used to implement a reduced swizzle when dest == + * arg0. + */ +void sse_shufps( struct x86_function *p, + struct x86_reg dest, + struct x86_reg arg0, + unsigned char shuf) +{ + emit_2ub(p, X86_TWOB, 0xC6); + emit_modrm(p, dest, arg0); + emit_1ub(p, shuf); +} + +void sse_cmpps( struct x86_function *p, + struct x86_reg dest, + struct x86_reg arg0, + unsigned char cc) +{ + emit_2ub(p, X86_TWOB, 0xC2); + emit_modrm(p, dest, arg0); + emit_1ub(p, cc); +} + +void sse_pmovmskb( struct x86_function *p, + struct x86_reg dest, + struct x86_reg src) +{ + emit_3ub(p, 0x66, X86_TWOB, 0xD7); + emit_modrm(p, dest, src); +} + +/*********************************************************************** + * SSE2 instructions + */ + +/** + * Perform a reduced swizzle: + */ +void sse2_pshufd( struct x86_function *p, + struct x86_reg dest, + struct x86_reg arg0, + unsigned char shuf) +{ + emit_3ub(p, 0x66, X86_TWOB, 0x70); + emit_modrm(p, dest, arg0); + emit_1ub(p, shuf); +} + +void sse2_cvttps2dq( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_3ub( p, 0xF3, X86_TWOB, 0x5B ); + emit_modrm( p, dst, src ); +} + +void sse2_cvtps2dq( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_3ub(p, 0x66, X86_TWOB, 0x5B); + emit_modrm( p, dst, src ); +} + +void sse2_packssdw( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_3ub(p, 0x66, X86_TWOB, 0x6B); + emit_modrm( p, dst, src ); +} + +void sse2_packsswb( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_3ub(p, 0x66, X86_TWOB, 0x63); + emit_modrm( p, dst, src ); +} + +void sse2_packuswb( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_3ub(p, 0x66, X86_TWOB, 0x67); + emit_modrm( p, dst, src ); +} + +void sse2_rcpps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_2ub(p, X86_TWOB, 0x53); + emit_modrm( p, dst, src ); +} + +void sse2_rcpss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_3ub(p, 0xF3, X86_TWOB, 0x53); + emit_modrm( p, dst, src ); +} + +void sse2_movd( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + emit_2ub(p, 0x66, X86_TWOB); + emit_op_modrm( p, 0x6e, 0x7e, dst, src ); +} + + + + +/*********************************************************************** + * x87 instructions + */ +void x87_fist( struct x86_function *p, struct x86_reg dst ) +{ + emit_1ub(p, 0xdb); + emit_modrm_noreg(p, 2, dst); +} + +void x87_fistp( struct x86_function *p, struct x86_reg dst ) +{ + emit_1ub(p, 0xdb); + emit_modrm_noreg(p, 3, dst); +} + +void x87_fild( struct x86_function *p, struct x86_reg arg ) +{ + emit_1ub(p, 0xdf); + emit_modrm_noreg(p, 0, arg); +} + +void x87_fldz( struct x86_function *p ) +{ + emit_2ub(p, 0xd9, 0xee); +} + + +void x87_fldcw( struct x86_function *p, struct x86_reg arg ) +{ + assert(arg.file == file_REG32); + assert(arg.mod != mod_REG); + emit_1ub(p, 0xd9); + emit_modrm_noreg(p, 5, arg); +} + +void x87_fld1( struct x86_function *p ) +{ + emit_2ub(p, 0xd9, 0xe8); +} + +void x87_fldl2e( struct x86_function *p ) +{ + emit_2ub(p, 0xd9, 0xea); +} + +void x87_fldln2( struct x86_function *p ) +{ + emit_2ub(p, 0xd9, 0xed); +} + +void x87_fwait( struct x86_function *p ) +{ + emit_1ub(p, 0x9b); +} + +void x87_fnclex( struct x86_function *p ) +{ + emit_2ub(p, 0xdb, 0xe2); +} + +void x87_fclex( struct x86_function *p ) +{ + x87_fwait(p); + x87_fnclex(p); +} + + +static void x87_arith_op( struct x86_function *p, struct x86_reg dst, struct x86_reg arg, + unsigned char dst0ub0, + unsigned char dst0ub1, + unsigned char arg0ub0, + unsigned char arg0ub1, + unsigned char argmem_noreg) +{ + assert(dst.file == file_x87); + + if (arg.file == file_x87) { + if (dst.idx == 0) + emit_2ub(p, dst0ub0, dst0ub1+arg.idx); + else if (arg.idx == 0) + emit_2ub(p, arg0ub0, arg0ub1+arg.idx); + else + assert(0); + } + else if (dst.idx == 0) { + assert(arg.file == file_REG32); + emit_1ub(p, 0xd8); + emit_modrm_noreg(p, argmem_noreg, arg); + } + else + assert(0); +} + +void x87_fmul( struct x86_function *p, struct x86_reg dst, struct x86_reg arg ) +{ + x87_arith_op(p, dst, arg, + 0xd8, 0xc8, + 0xdc, 0xc8, + 4); +} + +void x87_fsub( struct x86_function *p, struct x86_reg dst, struct x86_reg arg ) +{ + x87_arith_op(p, dst, arg, + 0xd8, 0xe0, + 0xdc, 0xe8, + 4); +} + +void x87_fsubr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg ) +{ + x87_arith_op(p, dst, arg, + 0xd8, 0xe8, + 0xdc, 0xe0, + 5); +} + +void x87_fadd( struct x86_function *p, struct x86_reg dst, struct x86_reg arg ) +{ + x87_arith_op(p, dst, arg, + 0xd8, 0xc0, + 0xdc, 0xc0, + 0); +} + +void x87_fdiv( struct x86_function *p, struct x86_reg dst, struct x86_reg arg ) +{ + x87_arith_op(p, dst, arg, + 0xd8, 0xf0, + 0xdc, 0xf8, + 6); +} + +void x87_fdivr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg ) +{ + x87_arith_op(p, dst, arg, + 0xd8, 0xf8, + 0xdc, 0xf0, + 7); +} + +void x87_fmulp( struct x86_function *p, struct x86_reg dst ) +{ + assert(dst.file == file_x87); + assert(dst.idx >= 1); + emit_2ub(p, 0xde, 0xc8+dst.idx); +} + +void x87_fsubp( struct x86_function *p, struct x86_reg dst ) +{ + assert(dst.file == file_x87); + assert(dst.idx >= 1); + emit_2ub(p, 0xde, 0xe8+dst.idx); +} + +void x87_fsubrp( struct x86_function *p, struct x86_reg dst ) +{ + assert(dst.file == file_x87); + assert(dst.idx >= 1); + emit_2ub(p, 0xde, 0xe0+dst.idx); +} + +void x87_faddp( struct x86_function *p, struct x86_reg dst ) +{ + assert(dst.file == file_x87); + assert(dst.idx >= 1); + emit_2ub(p, 0xde, 0xc0+dst.idx); +} + +void x87_fdivp( struct x86_function *p, struct x86_reg dst ) +{ + assert(dst.file == file_x87); + assert(dst.idx >= 1); + emit_2ub(p, 0xde, 0xf8+dst.idx); +} + +void x87_fdivrp( struct x86_function *p, struct x86_reg dst ) +{ + assert(dst.file == file_x87); + assert(dst.idx >= 1); + emit_2ub(p, 0xde, 0xf0+dst.idx); +} + +void x87_fucom( struct x86_function *p, struct x86_reg arg ) +{ + assert(arg.file == file_x87); + emit_2ub(p, 0xdd, 0xe0+arg.idx); +} + +void x87_fucomp( struct x86_function *p, struct x86_reg arg ) +{ + assert(arg.file == file_x87); + emit_2ub(p, 0xdd, 0xe8+arg.idx); +} + +void x87_fucompp( struct x86_function *p ) +{ + emit_2ub(p, 0xda, 0xe9); +} + +void x87_fxch( struct x86_function *p, struct x86_reg arg ) +{ + assert(arg.file == file_x87); + emit_2ub(p, 0xd9, 0xc8+arg.idx); +} + +void x87_fabs( struct x86_function *p ) +{ + emit_2ub(p, 0xd9, 0xe1); +} + +void x87_fchs( struct x86_function *p ) +{ + emit_2ub(p, 0xd9, 0xe0); +} + +void x87_fcos( struct x86_function *p ) +{ + emit_2ub(p, 0xd9, 0xff); +} + + +void x87_fprndint( struct x86_function *p ) +{ + emit_2ub(p, 0xd9, 0xfc); +} + +void x87_fscale( struct x86_function *p ) +{ + emit_2ub(p, 0xd9, 0xfd); +} + +void x87_fsin( struct x86_function *p ) +{ + emit_2ub(p, 0xd9, 0xfe); +} + +void x87_fsincos( struct x86_function *p ) +{ + emit_2ub(p, 0xd9, 0xfb); +} + +void x87_fsqrt( struct x86_function *p ) +{ + emit_2ub(p, 0xd9, 0xfa); +} + +void x87_fxtract( struct x86_function *p ) +{ + emit_2ub(p, 0xd9, 0xf4); +} + +/* st0 = (2^st0)-1 + * + * Restrictions: -1.0 <= st0 <= 1.0 + */ +void x87_f2xm1( struct x86_function *p ) +{ + emit_2ub(p, 0xd9, 0xf0); +} + +/* st1 = st1 * log2(st0); + * pop_stack; + */ +void x87_fyl2x( struct x86_function *p ) +{ + emit_2ub(p, 0xd9, 0xf1); +} + +/* st1 = st1 * log2(st0 + 1.0); + * pop_stack; + * + * A fast operation, with restrictions: -.29 < st0 < .29 + */ +void x87_fyl2xp1( struct x86_function *p ) +{ + emit_2ub(p, 0xd9, 0xf9); +} + + +void x87_fld( struct x86_function *p, struct x86_reg arg ) +{ + if (arg.file == file_x87) + emit_2ub(p, 0xd9, 0xc0 + arg.idx); + else { + emit_1ub(p, 0xd9); + emit_modrm_noreg(p, 0, arg); + } +} + +void x87_fst( struct x86_function *p, struct x86_reg dst ) +{ + if (dst.file == file_x87) + emit_2ub(p, 0xdd, 0xd0 + dst.idx); + else { + emit_1ub(p, 0xd9); + emit_modrm_noreg(p, 2, dst); + } +} + +void x87_fstp( struct x86_function *p, struct x86_reg dst ) +{ + if (dst.file == file_x87) + emit_2ub(p, 0xdd, 0xd8 + dst.idx); + else { + emit_1ub(p, 0xd9); + emit_modrm_noreg(p, 3, dst); + } +} + +void x87_fcom( struct x86_function *p, struct x86_reg dst ) +{ + if (dst.file == file_x87) + emit_2ub(p, 0xd8, 0xd0 + dst.idx); + else { + emit_1ub(p, 0xd8); + emit_modrm_noreg(p, 2, dst); + } +} + +void x87_fcomp( struct x86_function *p, struct x86_reg dst ) +{ + if (dst.file == file_x87) + emit_2ub(p, 0xd8, 0xd8 + dst.idx); + else { + emit_1ub(p, 0xd8); + emit_modrm_noreg(p, 3, dst); + } +} + + +void x87_fnstsw( struct x86_function *p, struct x86_reg dst ) +{ + assert(dst.file == file_REG32); + + if (dst.idx == reg_AX && + dst.mod == mod_REG) + emit_2ub(p, 0xdf, 0xe0); + else { + emit_1ub(p, 0xdd); + emit_modrm_noreg(p, 7, dst); + } +} + + + + +/*********************************************************************** + * MMX instructions + */ + +void mmx_emms( struct x86_function *p ) +{ + assert(p->need_emms); + emit_2ub(p, 0x0f, 0x77); + p->need_emms = 0; +} + +void mmx_packssdw( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + assert(dst.file == file_MMX && + (src.file == file_MMX || src.mod != mod_REG)); + + p->need_emms = 1; + + emit_2ub(p, X86_TWOB, 0x6b); + emit_modrm( p, dst, src ); +} + +void mmx_packuswb( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + assert(dst.file == file_MMX && + (src.file == file_MMX || src.mod != mod_REG)); + + p->need_emms = 1; + + emit_2ub(p, X86_TWOB, 0x67); + emit_modrm( p, dst, src ); +} + +void mmx_movd( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + p->need_emms = 1; + emit_1ub(p, X86_TWOB); + emit_op_modrm( p, 0x6e, 0x7e, dst, src ); +} + +void mmx_movq( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + p->need_emms = 1; + emit_1ub(p, X86_TWOB); + emit_op_modrm( p, 0x6f, 0x7f, dst, src ); +} + + +/*********************************************************************** + * Helper functions + */ + + +/* Retreive a reference to one of the function arguments, taking into + * account any push/pop activity: + */ +struct x86_reg x86_fn_arg( struct x86_function *p, + unsigned arg ) +{ + return x86_make_disp(x86_make_reg(file_REG32, reg_SP), + p->stack_offset + arg * 4); /* ??? */ +} + + +void x86_init_func( struct x86_function *p ) +{ + p->size = 0; + p->store = NULL; + p->csr = p->store; +} + +int x86_init_func_size( struct x86_function *p, unsigned code_size ) +{ + p->size = code_size; + p->store = _mesa_exec_malloc(code_size); + p->csr = p->store; + return p->store != NULL; +} + +void x86_release_func( struct x86_function *p ) +{ + _mesa_exec_free(p->store); + p->store = NULL; + p->csr = NULL; + p->size = 0; +} + + +void (*x86_get_func( struct x86_function *p ))(void) +{ + if (DISASSEM && p->store) + printf("disassemble %p %p\n", p->store, p->csr); + return (void (*)(void)) (unsigned long) p->store; +} + +#else + +void x86sse_dummy( void ) +{ +} + +#endif + +#else /* USE_X86_ASM */ + +int x86sse_c_dummy_var; /* silence warning */ + +#endif /* USE_X86_ASM */ diff --git a/mesalib/src/mesa/x86/rtasm/x86sse.h b/mesalib/src/mesa/x86/rtasm/x86sse.h index 399a694bc..f6282f5bd 100644 --- a/mesalib/src/mesa/x86/rtasm/x86sse.h +++ b/mesalib/src/mesa/x86/rtasm/x86sse.h @@ -1,256 +1,256 @@ -
-#ifndef _X86SSE_H_
-#define _X86SSE_H_
-
-#if defined(__i386__) || defined(__386__)
-
-/* It is up to the caller to ensure that instructions issued are
- * suitable for the host cpu. There are no checks made in this module
- * for mmx/sse/sse2 support on the cpu.
- */
-struct x86_reg {
- unsigned file:3;
- unsigned idx:3;
- unsigned mod:2; /* mod_REG if this is just a register */
- int disp:24; /* only +/- 23bits of offset - should be enough... */
-};
-
-struct x86_function {
- unsigned size;
- unsigned char *store;
- unsigned char *csr;
- unsigned stack_offset;
- int need_emms;
- const char *fn;
-};
-
-enum x86_reg_file {
- file_REG32,
- file_MMX,
- file_XMM,
- file_x87
-};
-
-/* Values for mod field of modr/m byte
- */
-enum x86_reg_mod {
- mod_INDIRECT,
- mod_DISP8,
- mod_DISP32,
- mod_REG
-};
-
-enum x86_reg_name {
- reg_AX,
- reg_CX,
- reg_DX,
- reg_BX,
- reg_SP,
- reg_BP,
- reg_SI,
- reg_DI
-};
-
-
-enum x86_cc {
- cc_O, /* overflow */
- cc_NO, /* not overflow */
- cc_NAE, /* not above or equal / carry */
- cc_AE, /* above or equal / not carry */
- cc_E, /* equal / zero */
- cc_NE /* not equal / not zero */
-};
-
-enum sse_cc {
- cc_Equal,
- cc_LessThan,
- cc_LessThanEqual,
- cc_Unordered,
- cc_NotEqual,
- cc_NotLessThan,
- cc_NotLessThanEqual,
- cc_Ordered
-};
-
-#define cc_Z cc_E
-#define cc_NZ cc_NE
-
-/* Begin/end/retreive function creation:
- */
-
-
-void x86_init_func( struct x86_function *p );
-int x86_init_func_size( struct x86_function *p, unsigned code_size );
-void x86_release_func( struct x86_function *p );
-void (*x86_get_func( struct x86_function *p ))( void );
-
-
-
-/* Create and manipulate registers and regmem values:
- */
-struct x86_reg x86_make_reg( enum x86_reg_file file,
- enum x86_reg_name idx );
-
-struct x86_reg x86_make_disp( struct x86_reg reg,
- int disp );
-
-struct x86_reg x86_deref( struct x86_reg reg );
-
-struct x86_reg x86_get_base_reg( struct x86_reg reg );
-
-
-/* Labels, jumps and fixup:
- */
-unsigned char *x86_get_label( struct x86_function *p );
-
-void x86_jcc( struct x86_function *p,
- enum x86_cc cc,
- unsigned char *label );
-
-unsigned char *x86_jcc_forward( struct x86_function *p,
- enum x86_cc cc );
-
-unsigned char *x86_jmp_forward( struct x86_function *p);
-
-unsigned char *x86_call_forward( struct x86_function *p);
-
-void x86_fixup_fwd_jump( struct x86_function *p,
- unsigned char *fixup );
-
-void x86_jmp( struct x86_function *p, unsigned char *label );
-
-/* void x86_call( struct x86_function *p, void (*label)() ); */
-void x86_call( struct x86_function *p, struct x86_reg reg);
-
-/* michal:
- * Temporary. As I need immediate operands, and dont want to mess with the codegen,
- * I load the immediate into general purpose register and use it.
- */
-void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm );
-
-
-/* Macro for sse_shufps() and sse2_pshufd():
- */
-#define SHUF(_x,_y,_z,_w) (((_x)<<0) | ((_y)<<2) | ((_z)<<4) | ((_w)<<6))
-#define SHUF_NOOP RSW(0,1,2,3)
-#define GET_SHUF(swz, idx) (((swz) >> ((idx)*2)) & 0x3)
-
-void mmx_emms( struct x86_function *p );
-void mmx_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void mmx_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void mmx_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void mmx_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-
-void sse2_cvtps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse2_cvttps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse2_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse2_packsswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse2_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse2_pshufd( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
- unsigned char shuf );
-void sse2_rcpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse2_rcpss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-
-void sse_addps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_addss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_cvtps2pi( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_divss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_andnps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_andps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_cmpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src,
- unsigned char cc );
-void sse_maxps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_maxss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_minps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_movaps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_movhlps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_movhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_movlhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_movlps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_movss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_movups( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_mulps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_mulss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_orps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_xorps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_subps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_rsqrtps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_rsqrtss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_shufps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
- unsigned char shuf );
-void sse_pmovmskb( struct x86_function *p, struct x86_reg dest, struct x86_reg src );
-
-void x86_add( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void x86_and( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void x86_cmp( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void x86_dec( struct x86_function *p, struct x86_reg reg );
-void x86_inc( struct x86_function *p, struct x86_reg reg );
-void x86_lea( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void x86_mov( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void x86_mul( struct x86_function *p, struct x86_reg src );
-void x86_or( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void x86_pop( struct x86_function *p, struct x86_reg reg );
-void x86_push( struct x86_function *p, struct x86_reg reg );
-void x86_ret( struct x86_function *p );
-void x86_sub( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void x86_test( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void x86_sahf( struct x86_function *p );
-
-void x87_f2xm1( struct x86_function *p );
-void x87_fabs( struct x86_function *p );
-void x87_fadd( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
-void x87_faddp( struct x86_function *p, struct x86_reg dst );
-void x87_fchs( struct x86_function *p );
-void x87_fclex( struct x86_function *p );
-void x87_fcom( struct x86_function *p, struct x86_reg dst );
-void x87_fcomp( struct x86_function *p, struct x86_reg dst );
-void x87_fcos( struct x86_function *p );
-void x87_fdiv( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
-void x87_fdivp( struct x86_function *p, struct x86_reg dst );
-void x87_fdivr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
-void x87_fdivrp( struct x86_function *p, struct x86_reg dst );
-void x87_fild( struct x86_function *p, struct x86_reg arg );
-void x87_fist( struct x86_function *p, struct x86_reg dst );
-void x87_fistp( struct x86_function *p, struct x86_reg dst );
-void x87_fld( struct x86_function *p, struct x86_reg arg );
-void x87_fld1( struct x86_function *p );
-void x87_fldcw( struct x86_function *p, struct x86_reg arg );
-void x87_fldl2e( struct x86_function *p );
-void x87_fldln2( struct x86_function *p );
-void x87_fldz( struct x86_function *p );
-void x87_fmul( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
-void x87_fmulp( struct x86_function *p, struct x86_reg dst );
-void x87_fnclex( struct x86_function *p );
-void x87_fprndint( struct x86_function *p );
-void x87_fscale( struct x86_function *p );
-void x87_fsin( struct x86_function *p );
-void x87_fsincos( struct x86_function *p );
-void x87_fsqrt( struct x86_function *p );
-void x87_fst( struct x86_function *p, struct x86_reg dst );
-void x87_fstp( struct x86_function *p, struct x86_reg dst );
-void x87_fsub( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
-void x87_fsubp( struct x86_function *p, struct x86_reg dst );
-void x87_fsubr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
-void x87_fsubrp( struct x86_function *p, struct x86_reg dst );
-void x87_fxch( struct x86_function *p, struct x86_reg dst );
-void x87_fxtract( struct x86_function *p );
-void x87_fyl2x( struct x86_function *p );
-void x87_fyl2xp1( struct x86_function *p );
-void x87_fwait( struct x86_function *p );
-void x87_fnstsw( struct x86_function *p, struct x86_reg dst );
-void x87_fucompp( struct x86_function *p );
-void x87_fucomp( struct x86_function *p, struct x86_reg arg );
-void x87_fucom( struct x86_function *p, struct x86_reg arg );
-
-
-
-/* Retreive a reference to one of the function arguments, taking into
- * account any push/pop activity. Note - doesn't track explict
- * manipulation of ESP by other instructions.
- */
-struct x86_reg x86_fn_arg( struct x86_function *p, unsigned arg );
-
-#endif
-#endif
+ +#ifndef _X86SSE_H_ +#define _X86SSE_H_ + +#if defined(__i386__) || defined(__386__) + +/* It is up to the caller to ensure that instructions issued are + * suitable for the host cpu. There are no checks made in this module + * for mmx/sse/sse2 support on the cpu. + */ +struct x86_reg { + unsigned file:3; + unsigned idx:3; + unsigned mod:2; /* mod_REG if this is just a register */ + int disp:24; /* only +/- 23bits of offset - should be enough... */ +}; + +struct x86_function { + unsigned size; + unsigned char *store; + unsigned char *csr; + unsigned stack_offset; + int need_emms; + const char *fn; +}; + +enum x86_reg_file { + file_REG32, + file_MMX, + file_XMM, + file_x87 +}; + +/* Values for mod field of modr/m byte + */ +enum x86_reg_mod { + mod_INDIRECT, + mod_DISP8, + mod_DISP32, + mod_REG +}; + +enum x86_reg_name { + reg_AX, + reg_CX, + reg_DX, + reg_BX, + reg_SP, + reg_BP, + reg_SI, + reg_DI +}; + + +enum x86_cc { + cc_O, /* overflow */ + cc_NO, /* not overflow */ + cc_NAE, /* not above or equal / carry */ + cc_AE, /* above or equal / not carry */ + cc_E, /* equal / zero */ + cc_NE /* not equal / not zero */ +}; + +enum sse_cc { + cc_Equal, + cc_LessThan, + cc_LessThanEqual, + cc_Unordered, + cc_NotEqual, + cc_NotLessThan, + cc_NotLessThanEqual, + cc_Ordered +}; + +#define cc_Z cc_E +#define cc_NZ cc_NE + +/* Begin/end/retreive function creation: + */ + + +void x86_init_func( struct x86_function *p ); +int x86_init_func_size( struct x86_function *p, unsigned code_size ); +void x86_release_func( struct x86_function *p ); +void (*x86_get_func( struct x86_function *p ))( void ); + + + +/* Create and manipulate registers and regmem values: + */ +struct x86_reg x86_make_reg( enum x86_reg_file file, + enum x86_reg_name idx ); + +struct x86_reg x86_make_disp( struct x86_reg reg, + int disp ); + +struct x86_reg x86_deref( struct x86_reg reg ); + +struct x86_reg x86_get_base_reg( struct x86_reg reg ); + + +/* Labels, jumps and fixup: + */ +unsigned char *x86_get_label( struct x86_function *p ); + +void x86_jcc( struct x86_function *p, + enum x86_cc cc, + unsigned char *label ); + +unsigned char *x86_jcc_forward( struct x86_function *p, + enum x86_cc cc ); + +unsigned char *x86_jmp_forward( struct x86_function *p); + +unsigned char *x86_call_forward( struct x86_function *p); + +void x86_fixup_fwd_jump( struct x86_function *p, + unsigned char *fixup ); + +void x86_jmp( struct x86_function *p, unsigned char *label ); + +/* void x86_call( struct x86_function *p, void (*label)() ); */ +void x86_call( struct x86_function *p, struct x86_reg reg); + +/* michal: + * Temporary. As I need immediate operands, and dont want to mess with the codegen, + * I load the immediate into general purpose register and use it. + */ +void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm ); + + +/* Macro for sse_shufps() and sse2_pshufd(): + */ +#define SHUF(_x,_y,_z,_w) (((_x)<<0) | ((_y)<<2) | ((_z)<<4) | ((_w)<<6)) +#define SHUF_NOOP RSW(0,1,2,3) +#define GET_SHUF(swz, idx) (((swz) >> ((idx)*2)) & 0x3) + +void mmx_emms( struct x86_function *p ); +void mmx_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void mmx_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void mmx_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void mmx_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + +void sse2_cvtps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_cvttps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_packsswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_pshufd( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0, + unsigned char shuf ); +void sse2_rcpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_rcpss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + +void sse_addps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_addss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_cvtps2pi( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_divss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_andnps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_andps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_cmpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src, + unsigned char cc ); +void sse_maxps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_maxss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_minps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_movaps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_movhlps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_movhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_movlhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_movlps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_movss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_movups( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_mulps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_mulss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_orps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_xorps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_subps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_rsqrtps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_rsqrtss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_shufps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0, + unsigned char shuf ); +void sse_pmovmskb( struct x86_function *p, struct x86_reg dest, struct x86_reg src ); + +void x86_add( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_and( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_cmp( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_dec( struct x86_function *p, struct x86_reg reg ); +void x86_inc( struct x86_function *p, struct x86_reg reg ); +void x86_lea( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_mov( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_mul( struct x86_function *p, struct x86_reg src ); +void x86_or( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_pop( struct x86_function *p, struct x86_reg reg ); +void x86_push( struct x86_function *p, struct x86_reg reg ); +void x86_ret( struct x86_function *p ); +void x86_sub( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_test( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_sahf( struct x86_function *p ); + +void x87_f2xm1( struct x86_function *p ); +void x87_fabs( struct x86_function *p ); +void x87_fadd( struct x86_function *p, struct x86_reg dst, struct x86_reg arg ); +void x87_faddp( struct x86_function *p, struct x86_reg dst ); +void x87_fchs( struct x86_function *p ); +void x87_fclex( struct x86_function *p ); +void x87_fcom( struct x86_function *p, struct x86_reg dst ); +void x87_fcomp( struct x86_function *p, struct x86_reg dst ); +void x87_fcos( struct x86_function *p ); +void x87_fdiv( struct x86_function *p, struct x86_reg dst, struct x86_reg arg ); +void x87_fdivp( struct x86_function *p, struct x86_reg dst ); +void x87_fdivr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg ); +void x87_fdivrp( struct x86_function *p, struct x86_reg dst ); +void x87_fild( struct x86_function *p, struct x86_reg arg ); +void x87_fist( struct x86_function *p, struct x86_reg dst ); +void x87_fistp( struct x86_function *p, struct x86_reg dst ); +void x87_fld( struct x86_function *p, struct x86_reg arg ); +void x87_fld1( struct x86_function *p ); +void x87_fldcw( struct x86_function *p, struct x86_reg arg ); +void x87_fldl2e( struct x86_function *p ); +void x87_fldln2( struct x86_function *p ); +void x87_fldz( struct x86_function *p ); +void x87_fmul( struct x86_function *p, struct x86_reg dst, struct x86_reg arg ); +void x87_fmulp( struct x86_function *p, struct x86_reg dst ); +void x87_fnclex( struct x86_function *p ); +void x87_fprndint( struct x86_function *p ); +void x87_fscale( struct x86_function *p ); +void x87_fsin( struct x86_function *p ); +void x87_fsincos( struct x86_function *p ); +void x87_fsqrt( struct x86_function *p ); +void x87_fst( struct x86_function *p, struct x86_reg dst ); +void x87_fstp( struct x86_function *p, struct x86_reg dst ); +void x87_fsub( struct x86_function *p, struct x86_reg dst, struct x86_reg arg ); +void x87_fsubp( struct x86_function *p, struct x86_reg dst ); +void x87_fsubr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg ); +void x87_fsubrp( struct x86_function *p, struct x86_reg dst ); +void x87_fxch( struct x86_function *p, struct x86_reg dst ); +void x87_fxtract( struct x86_function *p ); +void x87_fyl2x( struct x86_function *p ); +void x87_fyl2xp1( struct x86_function *p ); +void x87_fwait( struct x86_function *p ); +void x87_fnstsw( struct x86_function *p, struct x86_reg dst ); +void x87_fucompp( struct x86_function *p ); +void x87_fucomp( struct x86_function *p, struct x86_reg arg ); +void x87_fucom( struct x86_function *p, struct x86_reg arg ); + + + +/* Retreive a reference to one of the function arguments, taking into + * account any push/pop activity. Note - doesn't track explict + * manipulation of ESP by other instructions. + */ +struct x86_reg x86_fn_arg( struct x86_function *p, unsigned arg ); + +#endif +#endif diff --git a/mesalib/src/mesa/x86/sse.c b/mesalib/src/mesa/x86/sse.c index b249811c7..aef15b531 100644 --- a/mesalib/src/mesa/x86/sse.c +++ b/mesalib/src/mesa/x86/sse.c @@ -1,123 +1,123 @@ -/*
- * Mesa 3-D graphics library
- * Version: 6.0
- *
- * Copyright (C) 1999-2004 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * PentiumIII-SIMD (SSE) optimizations contributed by
- * Andre Werthmann <wertmann@cs.uni-potsdam.de>
- */
-
-#include "main/glheader.h"
-#include "main/context.h"
-#include "math/m_xform.h"
-#include "tnl/t_context.h"
-
-#include "sse.h"
-#include "x86_xform.h"
-
-#ifdef DEBUG_MATH
-#include "math/m_debug.h"
-#endif
-
-
-#ifdef USE_SSE_ASM
-DECLARE_XFORM_GROUP( sse, 2 )
-DECLARE_XFORM_GROUP( sse, 3 )
-
-#if 1
-/* Some functions are not written in SSE-assembly, because the fpu ones are faster */
-extern void _ASMAPI _mesa_sse_transform_normals_no_rot( NORM_ARGS );
-extern void _ASMAPI _mesa_sse_transform_rescale_normals( NORM_ARGS );
-extern void _ASMAPI _mesa_sse_transform_rescale_normals_no_rot( NORM_ARGS );
-
-extern void _ASMAPI _mesa_sse_transform_points4_general( XFORM_ARGS );
-extern void _ASMAPI _mesa_sse_transform_points4_3d( XFORM_ARGS );
-/* XXX this function segfaults, see below */
-extern void _ASMAPI _mesa_sse_transform_points4_identity( XFORM_ARGS );
-/* XXX this one works, see below */
-extern void _ASMAPI _mesa_x86_transform_points4_identity( XFORM_ARGS );
-#else
-DECLARE_NORM_GROUP( sse )
-#endif
-
-
-extern void _ASMAPI
-_mesa_v16_sse_general_xform( GLfloat *first_vert,
- const GLfloat *m,
- const GLfloat *src,
- GLuint src_stride,
- GLuint count );
-
-extern void _ASMAPI
-_mesa_sse_project_vertices( GLfloat *first,
- GLfloat *last,
- const GLfloat *m,
- GLuint stride );
-
-extern void _ASMAPI
-_mesa_sse_project_clipped_vertices( GLfloat *first,
- GLfloat *last,
- const GLfloat *m,
- GLuint stride,
- const GLubyte *clipmask );
-#endif
-
-
-void _mesa_init_sse_transform_asm( void )
-{
-#ifdef USE_SSE_ASM
- ASSIGN_XFORM_GROUP( sse, 2 );
- ASSIGN_XFORM_GROUP( sse, 3 );
-
-#if 1
- /* TODO: Finish these off.
- */
- _mesa_transform_tab[4][MATRIX_GENERAL] =
- _mesa_sse_transform_points4_general;
- _mesa_transform_tab[4][MATRIX_3D] =
- _mesa_sse_transform_points4_3d;
- /* XXX NOTE: _mesa_sse_transform_points4_identity segfaults with the
- conformance tests, so use the x86 version.
- */
- _mesa_transform_tab[4][MATRIX_IDENTITY] =
- _mesa_x86_transform_points4_identity;/*_mesa_sse_transform_points4_identity;*/
-
- _mesa_normal_tab[NORM_TRANSFORM_NO_ROT] =
- _mesa_sse_transform_normals_no_rot;
- _mesa_normal_tab[NORM_TRANSFORM | NORM_RESCALE] =
- _mesa_sse_transform_rescale_normals;
- _mesa_normal_tab[NORM_TRANSFORM_NO_ROT | NORM_RESCALE] =
- _mesa_sse_transform_rescale_normals_no_rot;
-#else
- ASSIGN_XFORM_GROUP( sse, 4 );
-
- ASSIGN_NORM_GROUP( sse );
-#endif
-
-#ifdef DEBUG_MATH
- _math_test_all_transform_functions( "SSE" );
- _math_test_all_normal_transform_functions( "SSE" );
-#endif
-#endif
-}
-
+/* + * Mesa 3-D graphics library + * Version: 6.0 + * + * Copyright (C) 1999-2004 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * PentiumIII-SIMD (SSE) optimizations contributed by + * Andre Werthmann <wertmann@cs.uni-potsdam.de> + */ + +#include "main/glheader.h" +#include "main/context.h" +#include "math/m_xform.h" +#include "tnl/t_context.h" + +#include "sse.h" +#include "x86_xform.h" + +#ifdef DEBUG_MATH +#include "math/m_debug.h" +#endif + + +#ifdef USE_SSE_ASM +DECLARE_XFORM_GROUP( sse, 2 ) +DECLARE_XFORM_GROUP( sse, 3 ) + +#if 1 +/* Some functions are not written in SSE-assembly, because the fpu ones are faster */ +extern void _ASMAPI _mesa_sse_transform_normals_no_rot( NORM_ARGS ); +extern void _ASMAPI _mesa_sse_transform_rescale_normals( NORM_ARGS ); +extern void _ASMAPI _mesa_sse_transform_rescale_normals_no_rot( NORM_ARGS ); + +extern void _ASMAPI _mesa_sse_transform_points4_general( XFORM_ARGS ); +extern void _ASMAPI _mesa_sse_transform_points4_3d( XFORM_ARGS ); +/* XXX this function segfaults, see below */ +extern void _ASMAPI _mesa_sse_transform_points4_identity( XFORM_ARGS ); +/* XXX this one works, see below */ +extern void _ASMAPI _mesa_x86_transform_points4_identity( XFORM_ARGS ); +#else +DECLARE_NORM_GROUP( sse ) +#endif + + +extern void _ASMAPI +_mesa_v16_sse_general_xform( GLfloat *first_vert, + const GLfloat *m, + const GLfloat *src, + GLuint src_stride, + GLuint count ); + +extern void _ASMAPI +_mesa_sse_project_vertices( GLfloat *first, + GLfloat *last, + const GLfloat *m, + GLuint stride ); + +extern void _ASMAPI +_mesa_sse_project_clipped_vertices( GLfloat *first, + GLfloat *last, + const GLfloat *m, + GLuint stride, + const GLubyte *clipmask ); +#endif + + +void _mesa_init_sse_transform_asm( void ) +{ +#ifdef USE_SSE_ASM + ASSIGN_XFORM_GROUP( sse, 2 ); + ASSIGN_XFORM_GROUP( sse, 3 ); + +#if 1 + /* TODO: Finish these off. + */ + _mesa_transform_tab[4][MATRIX_GENERAL] = + _mesa_sse_transform_points4_general; + _mesa_transform_tab[4][MATRIX_3D] = + _mesa_sse_transform_points4_3d; + /* XXX NOTE: _mesa_sse_transform_points4_identity segfaults with the + conformance tests, so use the x86 version. + */ + _mesa_transform_tab[4][MATRIX_IDENTITY] = + _mesa_x86_transform_points4_identity;/*_mesa_sse_transform_points4_identity;*/ + + _mesa_normal_tab[NORM_TRANSFORM_NO_ROT] = + _mesa_sse_transform_normals_no_rot; + _mesa_normal_tab[NORM_TRANSFORM | NORM_RESCALE] = + _mesa_sse_transform_rescale_normals; + _mesa_normal_tab[NORM_TRANSFORM_NO_ROT | NORM_RESCALE] = + _mesa_sse_transform_rescale_normals_no_rot; +#else + ASSIGN_XFORM_GROUP( sse, 4 ); + + ASSIGN_NORM_GROUP( sse ); +#endif + +#ifdef DEBUG_MATH + _math_test_all_transform_functions( "SSE" ); + _math_test_all_normal_transform_functions( "SSE" ); +#endif +#endif +} + diff --git a/mesalib/src/mesa/x86/sse.h b/mesalib/src/mesa/x86/sse.h index d240bf38b..e92ddc139 100644 --- a/mesalib/src/mesa/x86/sse.h +++ b/mesalib/src/mesa/x86/sse.h @@ -1,36 +1,36 @@ -
-/*
- * Mesa 3-D graphics library
- * Version: 3.5
- *
- * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * PentiumIII-SIMD (SSE) optimizations contributed by
- * Andre Werthmann <wertmann@cs.uni-potsdam.de>
- */
-
-#ifndef __SSE_H__
-#define __SSE_H__
-
-void _mesa_init_sse_transform_asm( void );
-
-#endif
+ +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * PentiumIII-SIMD (SSE) optimizations contributed by + * Andre Werthmann <wertmann@cs.uni-potsdam.de> + */ + +#ifndef __SSE_H__ +#define __SSE_H__ + +void _mesa_init_sse_transform_asm( void ); + +#endif diff --git a/mesalib/src/mesa/x86/sse_normal.S b/mesalib/src/mesa/x86/sse_normal.S index b6d5ef965..a8c0d38c7 100644 --- a/mesalib/src/mesa/x86/sse_normal.S +++ b/mesalib/src/mesa/x86/sse_normal.S @@ -1,261 +1,261 @@ -
-/*
- * Mesa 3-D graphics library
- * Version: 3.5
- *
- * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/** TODO:
- * - insert PREFETCH instructions to avoid cache-misses !
- * - some more optimizations are possible...
- * - for 40-50% more performance in the SSE-functions, the
- * data (trans-matrix, src_vert, dst_vert) needs to be 16byte aligned !
- */
-
-#ifdef USE_SSE_ASM
-#include "assyntax.h"
-#include "matypes.h"
-#include "norm_args.h"
-
- SEG_TEXT
-
-#define M(i) REGOFF(i * 4, EDX)
-#define S(i) REGOFF(i * 4, ESI)
-#define D(i) REGOFF(i * 4, EDI)
-#define STRIDE REGOFF(12, ESI)
-
-
-ALIGNTEXT16
-GLOBL GLNAME(_mesa_sse_transform_rescale_normals_no_rot)
-HIDDEN(_mesa_sse_transform_rescale_normals_no_rot)
-GLNAME(_mesa_sse_transform_rescale_normals_no_rot):
-
-#define FRAME_OFFSET 8
- PUSH_L ( ESI )
- PUSH_L ( EDI )
-
- MOV_L ( ARG_IN, ESI ) /* ptr to source GLvector3f */
- MOV_L ( ARG_DEST, EDI ) /* ptr to dest GLvector3f */
-
- MOV_L ( ARG_MAT, EDX ) /* ptr to matrix */
- MOV_L ( REGOFF(MATRIX_INV, EDX), EDX) /* matrix->inv */
-
- MOV_L ( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
-
- TEST_L ( ECX, ECX )
- JZ( LLBL(K_G3TRNNRR_finish) ) /* count was zero; go to finish */
-
- MOV_L ( STRIDE, EAX ) /* stride */
- MOV_L ( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest-count */
-
- IMUL_L( CONST(16), ECX ) /* count *= 16 */
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
-
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
- ADD_L( EDI, ECX ) /* count += dest ptr */
-
-ALIGNTEXT32
- MOVSS ( M(0), XMM1 ) /* m0 */
- MOVSS ( M(5), XMM2 ) /* m5 */
- UNPCKLPS( XMM2, XMM1 ) /* m5 | m0 */
- MOVSS ( ARG_SCALE, XMM0 ) /* scale */
- SHUFPS ( CONST(0x0), XMM0, XMM0 ) /* scale | scale */
- MULPS ( XMM0, XMM1 ) /* m5*scale | m0*scale */
- MULSS ( M(10), XMM0 ) /* m10*scale */
-
-ALIGNTEXT32
-LLBL(K_G3TRNNRR_top):
- MOVLPS ( S(0), XMM2 ) /* uy | ux */
- MULPS ( XMM1, XMM2 ) /* uy*m5*scale | ux*m0*scale */
- MOVLPS ( XMM2, D(0) ) /* ->D(1) | D(0) */
-
- MOVSS ( S(2), XMM2 ) /* uz */
- MULSS ( XMM0, XMM2 ) /* uz*m10*scale */
- MOVSS ( XMM2, D(2) ) /* ->D(2) */
-
-LLBL(K_G3TRNNRR_skip):
- ADD_L ( CONST(16), EDI )
- ADD_L ( EAX, ESI )
- CMP_L ( ECX, EDI )
- JNE ( LLBL(K_G3TRNNRR_top) )
-
-LLBL(K_G3TRNNRR_finish):
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME(_mesa_sse_transform_rescale_normals)
-HIDDEN(_mesa_sse_transform_rescale_normals)
-GLNAME(_mesa_sse_transform_rescale_normals):
-
-#define FRAME_OFFSET 8
- PUSH_L ( ESI )
- PUSH_L ( EDI )
-
- MOV_L ( ARG_IN, ESI ) /* ptr to source GLvector3f */
- MOV_L ( ARG_DEST, EDI ) /* ptr to dest GLvector3f */
-
- MOV_L ( ARG_MAT, EDX ) /* ptr to matrix */
- MOV_L ( REGOFF(MATRIX_INV, EDX), EDX) /* matrix->inv */
-
- MOV_L ( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
-
- TEST_L ( ECX, ECX )
- JZ( LLBL(K_G3TRNR_finish) ) /* count was zero; go to finish */
-
- MOV_L ( STRIDE, EAX ) /* stride */
- MOV_L ( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest-count */
-
- IMUL_L( CONST(16), ECX ) /* count *= 16 */
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
-
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
- ADD_L( EDI, ECX ) /* count += dest ptr */
-
-ALIGNTEXT32
- MOVSS ( M(0), XMM0 ) /* m0 */
- MOVSS ( M(4), XMM1 ) /* m4 */
- UNPCKLPS( XMM1, XMM0 ) /* m4 | m0 */
-
- MOVSS ( ARG_SCALE, XMM4 ) /* scale */
- SHUFPS ( CONST(0x0), XMM4, XMM4 ) /* scale | scale */
-
- MULPS ( XMM4, XMM0 ) /* m4*scale | m0*scale */
- MOVSS ( M(1), XMM1 ) /* m1 */
- MOVSS ( M(5), XMM2 ) /* m5 */
- UNPCKLPS( XMM2, XMM1 ) /* m5 | m1 */
- MULPS ( XMM4, XMM1 ) /* m5*scale | m1*scale */
- MOVSS ( M(2), XMM2 ) /* m2 */
- MOVSS ( M(6), XMM3 ) /* m6 */
- UNPCKLPS( XMM3, XMM2 ) /* m6 | m2 */
- MULPS ( XMM4, XMM2 ) /* m6*scale | m2*scale */
-
- MOVSS ( M(8), XMM6 ) /* m8 */
- MULSS ( ARG_SCALE, XMM6 ) /* m8*scale */
- MOVSS ( M(9), XMM7 ) /* m9 */
- MULSS ( ARG_SCALE, XMM7 ) /* m9*scale */
-
-ALIGNTEXT32
-LLBL(K_G3TRNR_top):
- MOVSS ( S(0), XMM3 ) /* ux */
- SHUFPS ( CONST(0x0), XMM3, XMM3 ) /* ux | ux */
- MULPS ( XMM0, XMM3 ) /* ux*m4 | ux*m0 */
- MOVSS ( S(1), XMM4 ) /* uy */
- SHUFPS ( CONST(0x0), XMM4, XMM4 ) /* uy | uy */
- MULPS ( XMM1, XMM4 ) /* uy*m5 | uy*m1 */
- MOVSS ( S(2), XMM5 ) /* uz */
- SHUFPS ( CONST(0x0), XMM5, XMM5 ) /* uz | uz */
- MULPS ( XMM2, XMM5 ) /* uz*m6 | uz*m2 */
-
- ADDPS ( XMM4, XMM3 )
- ADDPS ( XMM5, XMM3 )
- MOVLPS ( XMM3, D(0) )
-
- MOVSS ( M(10), XMM3 ) /* m10 */
- MULSS ( ARG_SCALE, XMM3 ) /* m10*scale */
- MULSS ( S(2), XMM3 ) /* m10*scale*uz */
- MOVSS ( S(1), XMM4 ) /* uy */
- MULSS ( XMM7, XMM4 ) /* uy*m9*scale */
- MOVSS ( S(0), XMM5 ) /* ux */
- MULSS ( XMM6, XMM5 ) /* ux*m8*scale */
-
- ADDSS ( XMM4, XMM3 )
- ADDSS ( XMM5, XMM3 )
- MOVSS ( XMM3, D(2) )
-
-LLBL(K_G3TRNR_skip):
- ADD_L ( CONST(16), EDI )
- ADD_L ( EAX, ESI )
- CMP_L ( ECX, EDI )
- JNE ( LLBL(K_G3TRNR_top) )
-
-LLBL(K_G3TRNR_finish):
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-ALIGNTEXT16
-GLOBL GLNAME(_mesa_sse_transform_normals_no_rot)
-HIDDEN(_mesa_sse_transform_normals_no_rot)
-GLNAME(_mesa_sse_transform_normals_no_rot):
-
-#define FRAME_OFFSET 8
- PUSH_L ( ESI )
- PUSH_L ( EDI )
-
- MOV_L ( ARG_IN, ESI ) /* ptr to source GLvector3f */
- MOV_L ( ARG_DEST, EDI ) /* ptr to dest GLvector3f */
-
- MOV_L ( ARG_MAT, EDX ) /* ptr to matrix */
- MOV_L ( REGOFF(MATRIX_INV, EDX), EDX) /* matrix->inv */
-
- MOV_L ( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
-
- TEST_L ( ECX, ECX )
- JZ( LLBL(K_G3TNNRR_finish) ) /* count was zero; go to finish */
-
- MOV_L ( STRIDE, EAX ) /* stride */
- MOV_L ( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest-count */
-
- IMUL_L( CONST(16), ECX ) /* count *= 16 */
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
-
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
- ADD_L( EDI, ECX ) /* count += dest ptr */
-
-ALIGNTEXT32
- MOVSS( M(0), XMM0 ) /* m0 */
- MOVSS( M(5), XMM1 ) /* m5 */
- UNPCKLPS( XMM1, XMM0 ) /* m5 | m0 */
- MOVSS( M(10), XMM1 ) /* m10 */
-
-ALIGNTEXT32
-LLBL(K_G3TNNRR_top):
- MOVLPS( S(0), XMM2 ) /* uy | ux */
- MULPS( XMM0, XMM2 ) /* uy*m5 | ux*m0 */
- MOVLPS( XMM2, D(0) )
-
- MOVSS( S(2), XMM2 ) /* uz */
- MULSS( XMM1, XMM2 ) /* uz*m10 */
- MOVSS( XMM2, D(2) )
-
-LLBL(K_G3TNNRR_skip):
- ADD_L ( CONST(16), EDI )
- ADD_L ( EAX, ESI )
- CMP_L ( ECX, EDI )
- JNE ( LLBL(K_G3TNNRR_top) )
-
-LLBL(K_G3TNNRR_finish):
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-#undef FRAME_OFFSET
-#endif
-
-#if defined (__ELF__) && defined (__linux__)
- .section .note.GNU-stack,"",%progbits
-#endif
+ +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** TODO: + * - insert PREFETCH instructions to avoid cache-misses ! + * - some more optimizations are possible... + * - for 40-50% more performance in the SSE-functions, the + * data (trans-matrix, src_vert, dst_vert) needs to be 16byte aligned ! + */ + +#ifdef USE_SSE_ASM +#include "assyntax.h" +#include "matypes.h" +#include "norm_args.h" + + SEG_TEXT + +#define M(i) REGOFF(i * 4, EDX) +#define S(i) REGOFF(i * 4, ESI) +#define D(i) REGOFF(i * 4, EDI) +#define STRIDE REGOFF(12, ESI) + + +ALIGNTEXT16 +GLOBL GLNAME(_mesa_sse_transform_rescale_normals_no_rot) +HIDDEN(_mesa_sse_transform_rescale_normals_no_rot) +GLNAME(_mesa_sse_transform_rescale_normals_no_rot): + +#define FRAME_OFFSET 8 + PUSH_L ( ESI ) + PUSH_L ( EDI ) + + MOV_L ( ARG_IN, ESI ) /* ptr to source GLvector3f */ + MOV_L ( ARG_DEST, EDI ) /* ptr to dest GLvector3f */ + + MOV_L ( ARG_MAT, EDX ) /* ptr to matrix */ + MOV_L ( REGOFF(MATRIX_INV, EDX), EDX) /* matrix->inv */ + + MOV_L ( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */ + + TEST_L ( ECX, ECX ) + JZ( LLBL(K_G3TRNNRR_finish) ) /* count was zero; go to finish */ + + MOV_L ( STRIDE, EAX ) /* stride */ + MOV_L ( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest-count */ + + IMUL_L( CONST(16), ECX ) /* count *= 16 */ + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + ADD_L( EDI, ECX ) /* count += dest ptr */ + +ALIGNTEXT32 + MOVSS ( M(0), XMM1 ) /* m0 */ + MOVSS ( M(5), XMM2 ) /* m5 */ + UNPCKLPS( XMM2, XMM1 ) /* m5 | m0 */ + MOVSS ( ARG_SCALE, XMM0 ) /* scale */ + SHUFPS ( CONST(0x0), XMM0, XMM0 ) /* scale | scale */ + MULPS ( XMM0, XMM1 ) /* m5*scale | m0*scale */ + MULSS ( M(10), XMM0 ) /* m10*scale */ + +ALIGNTEXT32 +LLBL(K_G3TRNNRR_top): + MOVLPS ( S(0), XMM2 ) /* uy | ux */ + MULPS ( XMM1, XMM2 ) /* uy*m5*scale | ux*m0*scale */ + MOVLPS ( XMM2, D(0) ) /* ->D(1) | D(0) */ + + MOVSS ( S(2), XMM2 ) /* uz */ + MULSS ( XMM0, XMM2 ) /* uz*m10*scale */ + MOVSS ( XMM2, D(2) ) /* ->D(2) */ + +LLBL(K_G3TRNNRR_skip): + ADD_L ( CONST(16), EDI ) + ADD_L ( EAX, ESI ) + CMP_L ( ECX, EDI ) + JNE ( LLBL(K_G3TRNNRR_top) ) + +LLBL(K_G3TRNNRR_finish): + POP_L ( EDI ) + POP_L ( ESI ) + RET +#undef FRAME_OFFSET + + + +ALIGNTEXT16 +GLOBL GLNAME(_mesa_sse_transform_rescale_normals) +HIDDEN(_mesa_sse_transform_rescale_normals) +GLNAME(_mesa_sse_transform_rescale_normals): + +#define FRAME_OFFSET 8 + PUSH_L ( ESI ) + PUSH_L ( EDI ) + + MOV_L ( ARG_IN, ESI ) /* ptr to source GLvector3f */ + MOV_L ( ARG_DEST, EDI ) /* ptr to dest GLvector3f */ + + MOV_L ( ARG_MAT, EDX ) /* ptr to matrix */ + MOV_L ( REGOFF(MATRIX_INV, EDX), EDX) /* matrix->inv */ + + MOV_L ( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */ + + TEST_L ( ECX, ECX ) + JZ( LLBL(K_G3TRNR_finish) ) /* count was zero; go to finish */ + + MOV_L ( STRIDE, EAX ) /* stride */ + MOV_L ( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest-count */ + + IMUL_L( CONST(16), ECX ) /* count *= 16 */ + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + ADD_L( EDI, ECX ) /* count += dest ptr */ + +ALIGNTEXT32 + MOVSS ( M(0), XMM0 ) /* m0 */ + MOVSS ( M(4), XMM1 ) /* m4 */ + UNPCKLPS( XMM1, XMM0 ) /* m4 | m0 */ + + MOVSS ( ARG_SCALE, XMM4 ) /* scale */ + SHUFPS ( CONST(0x0), XMM4, XMM4 ) /* scale | scale */ + + MULPS ( XMM4, XMM0 ) /* m4*scale | m0*scale */ + MOVSS ( M(1), XMM1 ) /* m1 */ + MOVSS ( M(5), XMM2 ) /* m5 */ + UNPCKLPS( XMM2, XMM1 ) /* m5 | m1 */ + MULPS ( XMM4, XMM1 ) /* m5*scale | m1*scale */ + MOVSS ( M(2), XMM2 ) /* m2 */ + MOVSS ( M(6), XMM3 ) /* m6 */ + UNPCKLPS( XMM3, XMM2 ) /* m6 | m2 */ + MULPS ( XMM4, XMM2 ) /* m6*scale | m2*scale */ + + MOVSS ( M(8), XMM6 ) /* m8 */ + MULSS ( ARG_SCALE, XMM6 ) /* m8*scale */ + MOVSS ( M(9), XMM7 ) /* m9 */ + MULSS ( ARG_SCALE, XMM7 ) /* m9*scale */ + +ALIGNTEXT32 +LLBL(K_G3TRNR_top): + MOVSS ( S(0), XMM3 ) /* ux */ + SHUFPS ( CONST(0x0), XMM3, XMM3 ) /* ux | ux */ + MULPS ( XMM0, XMM3 ) /* ux*m4 | ux*m0 */ + MOVSS ( S(1), XMM4 ) /* uy */ + SHUFPS ( CONST(0x0), XMM4, XMM4 ) /* uy | uy */ + MULPS ( XMM1, XMM4 ) /* uy*m5 | uy*m1 */ + MOVSS ( S(2), XMM5 ) /* uz */ + SHUFPS ( CONST(0x0), XMM5, XMM5 ) /* uz | uz */ + MULPS ( XMM2, XMM5 ) /* uz*m6 | uz*m2 */ + + ADDPS ( XMM4, XMM3 ) + ADDPS ( XMM5, XMM3 ) + MOVLPS ( XMM3, D(0) ) + + MOVSS ( M(10), XMM3 ) /* m10 */ + MULSS ( ARG_SCALE, XMM3 ) /* m10*scale */ + MULSS ( S(2), XMM3 ) /* m10*scale*uz */ + MOVSS ( S(1), XMM4 ) /* uy */ + MULSS ( XMM7, XMM4 ) /* uy*m9*scale */ + MOVSS ( S(0), XMM5 ) /* ux */ + MULSS ( XMM6, XMM5 ) /* ux*m8*scale */ + + ADDSS ( XMM4, XMM3 ) + ADDSS ( XMM5, XMM3 ) + MOVSS ( XMM3, D(2) ) + +LLBL(K_G3TRNR_skip): + ADD_L ( CONST(16), EDI ) + ADD_L ( EAX, ESI ) + CMP_L ( ECX, EDI ) + JNE ( LLBL(K_G3TRNR_top) ) + +LLBL(K_G3TRNR_finish): + POP_L ( EDI ) + POP_L ( ESI ) + RET +#undef FRAME_OFFSET + + +ALIGNTEXT16 +GLOBL GLNAME(_mesa_sse_transform_normals_no_rot) +HIDDEN(_mesa_sse_transform_normals_no_rot) +GLNAME(_mesa_sse_transform_normals_no_rot): + +#define FRAME_OFFSET 8 + PUSH_L ( ESI ) + PUSH_L ( EDI ) + + MOV_L ( ARG_IN, ESI ) /* ptr to source GLvector3f */ + MOV_L ( ARG_DEST, EDI ) /* ptr to dest GLvector3f */ + + MOV_L ( ARG_MAT, EDX ) /* ptr to matrix */ + MOV_L ( REGOFF(MATRIX_INV, EDX), EDX) /* matrix->inv */ + + MOV_L ( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */ + + TEST_L ( ECX, ECX ) + JZ( LLBL(K_G3TNNRR_finish) ) /* count was zero; go to finish */ + + MOV_L ( STRIDE, EAX ) /* stride */ + MOV_L ( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest-count */ + + IMUL_L( CONST(16), ECX ) /* count *= 16 */ + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + ADD_L( EDI, ECX ) /* count += dest ptr */ + +ALIGNTEXT32 + MOVSS( M(0), XMM0 ) /* m0 */ + MOVSS( M(5), XMM1 ) /* m5 */ + UNPCKLPS( XMM1, XMM0 ) /* m5 | m0 */ + MOVSS( M(10), XMM1 ) /* m10 */ + +ALIGNTEXT32 +LLBL(K_G3TNNRR_top): + MOVLPS( S(0), XMM2 ) /* uy | ux */ + MULPS( XMM0, XMM2 ) /* uy*m5 | ux*m0 */ + MOVLPS( XMM2, D(0) ) + + MOVSS( S(2), XMM2 ) /* uz */ + MULSS( XMM1, XMM2 ) /* uz*m10 */ + MOVSS( XMM2, D(2) ) + +LLBL(K_G3TNNRR_skip): + ADD_L ( CONST(16), EDI ) + ADD_L ( EAX, ESI ) + CMP_L ( ECX, EDI ) + JNE ( LLBL(K_G3TNNRR_top) ) + +LLBL(K_G3TNNRR_finish): + POP_L ( EDI ) + POP_L ( ESI ) + RET +#undef FRAME_OFFSET +#endif + +#if defined (__ELF__) && defined (__linux__) + .section .note.GNU-stack,"",%progbits +#endif diff --git a/mesalib/src/mesa/x86/sse_xform1.S b/mesalib/src/mesa/x86/sse_xform1.S index 45a6dd7ec..4aa9de607 100644 --- a/mesalib/src/mesa/x86/sse_xform1.S +++ b/mesalib/src/mesa/x86/sse_xform1.S @@ -1,446 +1,446 @@ -
-/*
- * Mesa 3-D graphics library
- * Version: 3.5
- *
- * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/** TODO:
- * - insert PREFETCH instructions to avoid cache-misses !
- * - some more optimizations are possible...
- * - for 40-50% more performance in the SSE-functions, the
- * data (trans-matrix, src_vert, dst_vert) needs to be 16byte aligned !
- */
-
-#ifdef USE_SSE_ASM
-#include "assyntax.h"
-#include "matypes.h"
-#include "xform_args.h"
-
- SEG_TEXT
-
-#define S(i) REGOFF(i * 4, ESI)
-#define D(i) REGOFF(i * 4, EDI)
-#define M(i) REGOFF(i * 4, EDX)
-
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_sse_transform_points1_general)
-HIDDEN( _mesa_sse_transform_points1_general )
-GLNAME( _mesa_sse_transform_points1_general ):
-
-#define FRAME_OFFSET 8
- PUSH_L ( ESI )
- PUSH_L ( EDI )
-
- MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
- MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
-
- MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
-
- CMP_L( CONST(0), ECX ) /* count == 0 ? */
- JE( LLBL(K_GTP1GR_finish) ) /* yes -> nothing to do. */
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
- OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
- MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
-
- SHL_L( CONST(4), ECX ) /* count *= 16 */
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
-
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
- ADD_L( EDI, ECX ) /* count += dest ptr */
-
-
-ALIGNTEXT32
- MOVAPS( M(0), XMM0 ) /* m3 | m2 | m1 | m0 */
- MOVAPS( M(12), XMM1 ) /* m15 | m14 | m13 | m12 */
-
-ALIGNTEXT32
-LLBL(K_GTP1GR_top):
- MOVSS( S(0), XMM2 ) /* ox */
- SHUFPS( CONST(0x0), XMM2, XMM2 ) /* ox | ox | ox | ox */
- MULPS( XMM0, XMM2 ) /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
- ADDPS( XMM1, XMM2 ) /* + | + | + | + */
- MOVUPS( XMM2, D(0) )
-
-LLBL(K_GTP1GR_skip):
- ADD_L ( CONST(16), EDI )
- ADD_L ( EAX, ESI )
- CMP_L ( ECX, EDI )
- JNE ( LLBL(K_GTP1GR_top) )
-
-LLBL(K_GTP1GR_finish):
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_sse_transform_points1_identity)
-HIDDEN(_mesa_sse_transform_points1_identity)
-GLNAME( _mesa_sse_transform_points1_identity ):
-
-#define FRAME_OFFSET 8
- PUSH_L ( ESI )
- PUSH_L ( EDI )
-
- MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
- MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
-
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
-
- TEST_L( ECX, ECX)
- JZ( LLBL(K_GTP1IR_finish) ) /* count was zero; go to finish */
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
- OR_L( CONST(VEC_SIZE_1), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
- MOV_L( CONST(1), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
-
- SHL_L( CONST(4), ECX ) /* count *= 16 */
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
-
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
- ADD_L( EDI, ECX ) /* count += dest ptr */
-
- CMP_L( ESI, EDI )
- JE( LLBL(K_GTP1IR_finish) )
-
-
-ALIGNTEXT32
-LLBL(K_GTP1IR_top):
- MOV_L( S(0), EDX )
- MOV_L( EDX, D(0) )
-
-LLBL(K_GTP1IR_skip):
- ADD_L ( CONST(16), EDI )
- ADD_L ( EAX, ESI )
- CMP_L ( ECX, EDI )
- JNE ( LLBL(K_GTP1IR_top) )
-
-LLBL(K_GTP1IR_finish):
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_sse_transform_points1_3d_no_rot)
-HIDDEN(_mesa_sse_transform_points1_3d_no_rot)
-GLNAME(_mesa_sse_transform_points1_3d_no_rot):
-
-#define FRAME_OFFSET 8
- PUSH_L( ESI )
- PUSH_L( EDI )
-
- MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
- MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
-
- MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
-
- TEST_L( ECX, ECX)
- JZ( LLBL(K_GTP13DNRR_finish) ) /* count was zero; go to finish */
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
- OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
- MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
-
- SHL_L( CONST(4), ECX ) /* count *= 16 */
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
-
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
- ADD_L( EDI, ECX ) /* count += dest ptr */
-
-
-ALIGNTEXT32
- MOVSS( M(0), XMM0 ) /* m0 */
- MOVSS( M(12), XMM1 ) /* m12 */
- MOVSS( M(13), XMM2 ) /* m13 */
- MOVSS( M(14), XMM3 ) /* m14 */
-
-ALIGNTEXT32
-LLBL(K_GTP13DNRR_top):
- MOVSS( S(0), XMM4 ) /* ox */
- MULSS( XMM0, XMM4 ) /* ox*m0 */
- ADDSS( XMM1, XMM4 ) /* ox*m0+m12 */
- MOVSS( XMM4, D(0) )
-
- MOVSS( XMM2, D(1) )
- MOVSS( XMM3, D(2) )
-
-LLBL(K_GTP13DNRR_skip):
- ADD_L ( CONST(16), EDI )
- ADD_L ( EAX, ESI )
- CMP_L ( ECX, EDI )
- JNE ( LLBL(K_GTP13DNRR_top) )
-
-LLBL(K_GTP13DNRR_finish):
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_sse_transform_points1_perspective)
-HIDDEN(_mesa_sse_transform_points1_perspective)
-GLNAME(_mesa_sse_transform_points1_perspective):
-
-#define FRAME_OFFSET 8
- PUSH_L ( ESI )
- PUSH_L ( EDI )
-
- MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
- MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
-
- MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
-
- TEST_L( ECX, ECX)
- JZ( LLBL(K_GTP13PR_finish) ) /* count was zero; go to finish */
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
- OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
- MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
-
- SHL_L( CONST(4), ECX ) /* count *= 16 */
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
-
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
- ADD_L( EDI, ECX ) /* count += dest ptr */
-
-
-ALIGNTEXT32
- XORPS( XMM0, XMM0 ) /* 0 | 0 | 0 | 0 */
- MOVSS( M(0), XMM1 ) /* m0 */
- MOVSS( M(14), XMM2 ) /* m14 */
-
-ALIGNTEXT32
-LLBL(K_GTP13PR_top):
- MOVSS( S(0), XMM3 ) /* ox */
- MULSS( XMM1, XMM3 ) /* ox*m0 */
- MOVSS( XMM3, D(0) ) /* ox*m0->D(0) */
- MOVSS( XMM2, D(2) ) /* m14->D(2) */
-
- MOVSS( XMM0, D(1) )
- MOVSS( XMM0, D(3) )
-
-LLBL(K_GTP13PR_skip):
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(K_GTP13PR_top) )
-
-LLBL(K_GTP13PR_finish):
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_sse_transform_points1_2d)
-HIDDEN(_mesa_sse_transform_points1_2d)
-GLNAME(_mesa_sse_transform_points1_2d):
-
-#define FRAME_OFFSET 8
- PUSH_L( ESI )
- PUSH_L( EDI )
-
- MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
- MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
-
- MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
-
- TEST_L( ECX, ECX)
- JZ( LLBL(K_GTP13P2DR_finish) ) /* count was zero; go to finish */
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
- OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
- MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
-
- SHL_L( CONST(4), ECX ) /* count *= 16 */
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
-
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
- ADD_L( EDI, ECX ) /* count += dest ptr */
-
-ALIGNTEXT32
- MOVLPS( M(0), XMM0 ) /* m1 | m0 */
- MOVLPS( M(12), XMM1 ) /* m13 | m12 */
-
-ALIGNTEXT32
-LLBL(K_GTP13P2DR_top):
- MOVSS( S(0), XMM2 ) /* ox */
- SHUFPS( CONST(0x0), XMM2, XMM2 ) /* ox | ox | ox | ox */
- MULPS( XMM0, XMM2 ) /* - | - | ox*m1 | ox*m0 */
- ADDPS( XMM1, XMM2 ) /* - | - | ox*m1+m13 | ox*m0+m12 */
- MOVLPS( XMM2, D(0) )
-
-LLBL(K_GTP13P2DR_skip):
- ADD_L ( CONST(16), EDI )
- ADD_L ( EAX, ESI )
- CMP_L ( ECX, EDI )
- JNE ( LLBL(K_GTP13P2DR_top) )
-
-LLBL(K_GTP13P2DR_finish):
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_sse_transform_points1_2d_no_rot)
-HIDDEN(_mesa_sse_transform_points1_2d_no_rot)
-GLNAME(_mesa_sse_transform_points1_2d_no_rot):
-
-#define FRAME_OFFSET 8
- PUSH_L( ESI )
- PUSH_L( EDI )
-
- MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
- MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
-
- MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
-
- TEST_L( ECX, ECX)
- JZ( LLBL(K_GTP13P2DNRR_finish) ) /* count was zero; go to finish */
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
- OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
- MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
-
- SHL_L( CONST(4), ECX ) /* count *= 16 */
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
-
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
- ADD_L( EDI, ECX ) /* count += dest ptr */
-
-ALIGNTEXT32
- MOVSS( M(0), XMM0 ) /* m0 */
- MOVSS( M(12), XMM1 ) /* m12 */
- MOVSS( M(13), XMM2 ) /* m13 */
-
-ALIGNTEXT32
-LLBL(K_GTP13P2DNRR_top):
- MOVSS( S(0), XMM3 ) /* ox */
- MULSS( XMM0, XMM3 ) /* ox*m0 */
- ADDSS( XMM1, XMM3 ) /* ox*m0+m12 */
- MOVSS( XMM3, D(0) )
- MOVSS( XMM2, D(1) )
-
-LLBL(K_GTP13P2DNRR_skip):
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(K_GTP13P2DNRR_top) )
-
-LLBL(K_GTP13P2DNRR_finish):
- POP_L( EDI )
- POP_L( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_sse_transform_points1_3d)
-HIDDEN(_mesa_sse_transform_points1_3d)
-GLNAME(_mesa_sse_transform_points1_3d):
-
-#define FRAME_OFFSET 8
- PUSH_L( ESI )
- PUSH_L( EDI )
-
- MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
- MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
-
- MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
-
- TEST_L( ECX, ECX)
- JZ( LLBL(K_GTP13P3DR_finish) ) /* count was zero; go to finish */
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
- OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
- MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
-
- SHL_L( CONST(4), ECX ) /* count *= 16 */
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
-
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
- ADD_L( EDI, ECX ) /* count += dest ptr */
-
-
-ALIGNTEXT32
- MOVAPS( M(0), XMM0 ) /* m3 | m2 | m1 | m0 */
- MOVAPS( M(12), XMM1 ) /* m15 | m14 | m13 | m12 */
-
-ALIGNTEXT32
-LLBL(K_GTP13P3DR_top):
- MOVSS( S(0), XMM2 ) /* ox */
- SHUFPS( CONST(0x0), XMM2, XMM2 ) /* ox | ox | ox | ox */
- MULPS( XMM0, XMM2 ) /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
- ADDPS( XMM1, XMM2 ) /* +m15 | +m14 | +m13 | +m12 */
- MOVLPS( XMM2, D(0) ) /* - | - | ->D(1)| ->D(0)*/
- UNPCKHPS( XMM2, XMM2 ) /* ox*m3+m15 | ox*m3+m15 | ox*m2+m14 | ox*m2+m14 */
- MOVSS( XMM2, D(2) )
-
-LLBL(K_GTP13P3DR_skip):
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(K_GTP13P3DR_top) )
-
-LLBL(K_GTP13P3DR_finish):
- POP_L( EDI )
- POP_L( ESI )
- RET
-#undef FRAME_OFFSET
-#endif
-
-#if defined (__ELF__) && defined (__linux__)
- .section .note.GNU-stack,"",%progbits
-#endif
+ +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** TODO: + * - insert PREFETCH instructions to avoid cache-misses ! + * - some more optimizations are possible... + * - for 40-50% more performance in the SSE-functions, the + * data (trans-matrix, src_vert, dst_vert) needs to be 16byte aligned ! + */ + +#ifdef USE_SSE_ASM +#include "assyntax.h" +#include "matypes.h" +#include "xform_args.h" + + SEG_TEXT + +#define S(i) REGOFF(i * 4, ESI) +#define D(i) REGOFF(i * 4, EDI) +#define M(i) REGOFF(i * 4, EDX) + + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_sse_transform_points1_general) +HIDDEN( _mesa_sse_transform_points1_general ) +GLNAME( _mesa_sse_transform_points1_general ): + +#define FRAME_OFFSET 8 + PUSH_L ( ESI ) + PUSH_L ( EDI ) + + MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */ + MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */ + + MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */ + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */ + + CMP_L( CONST(0), ECX ) /* count == 0 ? */ + JE( LLBL(K_GTP1GR_finish) ) /* yes -> nothing to do. */ + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */ + OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */ + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */ + MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) /* set dest size */ + + SHL_L( CONST(4), ECX ) /* count *= 16 */ + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + ADD_L( EDI, ECX ) /* count += dest ptr */ + + +ALIGNTEXT32 + MOVAPS( M(0), XMM0 ) /* m3 | m2 | m1 | m0 */ + MOVAPS( M(12), XMM1 ) /* m15 | m14 | m13 | m12 */ + +ALIGNTEXT32 +LLBL(K_GTP1GR_top): + MOVSS( S(0), XMM2 ) /* ox */ + SHUFPS( CONST(0x0), XMM2, XMM2 ) /* ox | ox | ox | ox */ + MULPS( XMM0, XMM2 ) /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */ + ADDPS( XMM1, XMM2 ) /* + | + | + | + */ + MOVUPS( XMM2, D(0) ) + +LLBL(K_GTP1GR_skip): + ADD_L ( CONST(16), EDI ) + ADD_L ( EAX, ESI ) + CMP_L ( ECX, EDI ) + JNE ( LLBL(K_GTP1GR_top) ) + +LLBL(K_GTP1GR_finish): + POP_L ( EDI ) + POP_L ( ESI ) + RET +#undef FRAME_OFFSET + + + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_sse_transform_points1_identity) +HIDDEN(_mesa_sse_transform_points1_identity) +GLNAME( _mesa_sse_transform_points1_identity ): + +#define FRAME_OFFSET 8 + PUSH_L ( ESI ) + PUSH_L ( EDI ) + + MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */ + MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */ + + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */ + + TEST_L( ECX, ECX) + JZ( LLBL(K_GTP1IR_finish) ) /* count was zero; go to finish */ + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */ + OR_L( CONST(VEC_SIZE_1), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */ + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */ + MOV_L( CONST(1), REGOFF(V4F_SIZE, EDI) ) /* set dest size */ + + SHL_L( CONST(4), ECX ) /* count *= 16 */ + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + ADD_L( EDI, ECX ) /* count += dest ptr */ + + CMP_L( ESI, EDI ) + JE( LLBL(K_GTP1IR_finish) ) + + +ALIGNTEXT32 +LLBL(K_GTP1IR_top): + MOV_L( S(0), EDX ) + MOV_L( EDX, D(0) ) + +LLBL(K_GTP1IR_skip): + ADD_L ( CONST(16), EDI ) + ADD_L ( EAX, ESI ) + CMP_L ( ECX, EDI ) + JNE ( LLBL(K_GTP1IR_top) ) + +LLBL(K_GTP1IR_finish): + POP_L ( EDI ) + POP_L ( ESI ) + RET +#undef FRAME_OFFSET + + + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_sse_transform_points1_3d_no_rot) +HIDDEN(_mesa_sse_transform_points1_3d_no_rot) +GLNAME(_mesa_sse_transform_points1_3d_no_rot): + +#define FRAME_OFFSET 8 + PUSH_L( ESI ) + PUSH_L( EDI ) + + MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */ + MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */ + + MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */ + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */ + + TEST_L( ECX, ECX) + JZ( LLBL(K_GTP13DNRR_finish) ) /* count was zero; go to finish */ + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */ + OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */ + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */ + MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */ + + SHL_L( CONST(4), ECX ) /* count *= 16 */ + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + ADD_L( EDI, ECX ) /* count += dest ptr */ + + +ALIGNTEXT32 + MOVSS( M(0), XMM0 ) /* m0 */ + MOVSS( M(12), XMM1 ) /* m12 */ + MOVSS( M(13), XMM2 ) /* m13 */ + MOVSS( M(14), XMM3 ) /* m14 */ + +ALIGNTEXT32 +LLBL(K_GTP13DNRR_top): + MOVSS( S(0), XMM4 ) /* ox */ + MULSS( XMM0, XMM4 ) /* ox*m0 */ + ADDSS( XMM1, XMM4 ) /* ox*m0+m12 */ + MOVSS( XMM4, D(0) ) + + MOVSS( XMM2, D(1) ) + MOVSS( XMM3, D(2) ) + +LLBL(K_GTP13DNRR_skip): + ADD_L ( CONST(16), EDI ) + ADD_L ( EAX, ESI ) + CMP_L ( ECX, EDI ) + JNE ( LLBL(K_GTP13DNRR_top) ) + +LLBL(K_GTP13DNRR_finish): + POP_L ( EDI ) + POP_L ( ESI ) + RET +#undef FRAME_OFFSET + + + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_sse_transform_points1_perspective) +HIDDEN(_mesa_sse_transform_points1_perspective) +GLNAME(_mesa_sse_transform_points1_perspective): + +#define FRAME_OFFSET 8 + PUSH_L ( ESI ) + PUSH_L ( EDI ) + + MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */ + MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */ + + MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */ + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */ + + TEST_L( ECX, ECX) + JZ( LLBL(K_GTP13PR_finish) ) /* count was zero; go to finish */ + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */ + OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */ + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */ + MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) /* set dest size */ + + SHL_L( CONST(4), ECX ) /* count *= 16 */ + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + ADD_L( EDI, ECX ) /* count += dest ptr */ + + +ALIGNTEXT32 + XORPS( XMM0, XMM0 ) /* 0 | 0 | 0 | 0 */ + MOVSS( M(0), XMM1 ) /* m0 */ + MOVSS( M(14), XMM2 ) /* m14 */ + +ALIGNTEXT32 +LLBL(K_GTP13PR_top): + MOVSS( S(0), XMM3 ) /* ox */ + MULSS( XMM1, XMM3 ) /* ox*m0 */ + MOVSS( XMM3, D(0) ) /* ox*m0->D(0) */ + MOVSS( XMM2, D(2) ) /* m14->D(2) */ + + MOVSS( XMM0, D(1) ) + MOVSS( XMM0, D(3) ) + +LLBL(K_GTP13PR_skip): + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(K_GTP13PR_top) ) + +LLBL(K_GTP13PR_finish): + POP_L ( EDI ) + POP_L ( ESI ) + RET +#undef FRAME_OFFSET + + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_sse_transform_points1_2d) +HIDDEN(_mesa_sse_transform_points1_2d) +GLNAME(_mesa_sse_transform_points1_2d): + +#define FRAME_OFFSET 8 + PUSH_L( ESI ) + PUSH_L( EDI ) + + MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */ + MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */ + + MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */ + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */ + + TEST_L( ECX, ECX) + JZ( LLBL(K_GTP13P2DR_finish) ) /* count was zero; go to finish */ + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */ + OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */ + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */ + MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) /* set dest size */ + + SHL_L( CONST(4), ECX ) /* count *= 16 */ + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + ADD_L( EDI, ECX ) /* count += dest ptr */ + +ALIGNTEXT32 + MOVLPS( M(0), XMM0 ) /* m1 | m0 */ + MOVLPS( M(12), XMM1 ) /* m13 | m12 */ + +ALIGNTEXT32 +LLBL(K_GTP13P2DR_top): + MOVSS( S(0), XMM2 ) /* ox */ + SHUFPS( CONST(0x0), XMM2, XMM2 ) /* ox | ox | ox | ox */ + MULPS( XMM0, XMM2 ) /* - | - | ox*m1 | ox*m0 */ + ADDPS( XMM1, XMM2 ) /* - | - | ox*m1+m13 | ox*m0+m12 */ + MOVLPS( XMM2, D(0) ) + +LLBL(K_GTP13P2DR_skip): + ADD_L ( CONST(16), EDI ) + ADD_L ( EAX, ESI ) + CMP_L ( ECX, EDI ) + JNE ( LLBL(K_GTP13P2DR_top) ) + +LLBL(K_GTP13P2DR_finish): + POP_L ( EDI ) + POP_L ( ESI ) + RET +#undef FRAME_OFFSET + + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_sse_transform_points1_2d_no_rot) +HIDDEN(_mesa_sse_transform_points1_2d_no_rot) +GLNAME(_mesa_sse_transform_points1_2d_no_rot): + +#define FRAME_OFFSET 8 + PUSH_L( ESI ) + PUSH_L( EDI ) + + MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */ + MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */ + + MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */ + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */ + + TEST_L( ECX, ECX) + JZ( LLBL(K_GTP13P2DNRR_finish) ) /* count was zero; go to finish */ + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */ + OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */ + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */ + MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) /* set dest size */ + + SHL_L( CONST(4), ECX ) /* count *= 16 */ + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + ADD_L( EDI, ECX ) /* count += dest ptr */ + +ALIGNTEXT32 + MOVSS( M(0), XMM0 ) /* m0 */ + MOVSS( M(12), XMM1 ) /* m12 */ + MOVSS( M(13), XMM2 ) /* m13 */ + +ALIGNTEXT32 +LLBL(K_GTP13P2DNRR_top): + MOVSS( S(0), XMM3 ) /* ox */ + MULSS( XMM0, XMM3 ) /* ox*m0 */ + ADDSS( XMM1, XMM3 ) /* ox*m0+m12 */ + MOVSS( XMM3, D(0) ) + MOVSS( XMM2, D(1) ) + +LLBL(K_GTP13P2DNRR_skip): + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(K_GTP13P2DNRR_top) ) + +LLBL(K_GTP13P2DNRR_finish): + POP_L( EDI ) + POP_L( ESI ) + RET +#undef FRAME_OFFSET + + + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_sse_transform_points1_3d) +HIDDEN(_mesa_sse_transform_points1_3d) +GLNAME(_mesa_sse_transform_points1_3d): + +#define FRAME_OFFSET 8 + PUSH_L( ESI ) + PUSH_L( EDI ) + + MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */ + MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */ + + MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */ + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */ + + TEST_L( ECX, ECX) + JZ( LLBL(K_GTP13P3DR_finish) ) /* count was zero; go to finish */ + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */ + OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */ + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */ + MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */ + + SHL_L( CONST(4), ECX ) /* count *= 16 */ + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + ADD_L( EDI, ECX ) /* count += dest ptr */ + + +ALIGNTEXT32 + MOVAPS( M(0), XMM0 ) /* m3 | m2 | m1 | m0 */ + MOVAPS( M(12), XMM1 ) /* m15 | m14 | m13 | m12 */ + +ALIGNTEXT32 +LLBL(K_GTP13P3DR_top): + MOVSS( S(0), XMM2 ) /* ox */ + SHUFPS( CONST(0x0), XMM2, XMM2 ) /* ox | ox | ox | ox */ + MULPS( XMM0, XMM2 ) /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */ + ADDPS( XMM1, XMM2 ) /* +m15 | +m14 | +m13 | +m12 */ + MOVLPS( XMM2, D(0) ) /* - | - | ->D(1)| ->D(0)*/ + UNPCKHPS( XMM2, XMM2 ) /* ox*m3+m15 | ox*m3+m15 | ox*m2+m14 | ox*m2+m14 */ + MOVSS( XMM2, D(2) ) + +LLBL(K_GTP13P3DR_skip): + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(K_GTP13P3DR_top) ) + +LLBL(K_GTP13P3DR_finish): + POP_L( EDI ) + POP_L( ESI ) + RET +#undef FRAME_OFFSET +#endif + +#if defined (__ELF__) && defined (__linux__) + .section .note.GNU-stack,"",%progbits +#endif diff --git a/mesalib/src/mesa/x86/sse_xform2.S b/mesalib/src/mesa/x86/sse_xform2.S index 7fb52e2e7..a443dad35 100644 --- a/mesalib/src/mesa/x86/sse_xform2.S +++ b/mesalib/src/mesa/x86/sse_xform2.S @@ -1,466 +1,466 @@ -
-/*
- * Mesa 3-D graphics library
- * Version: 3.5
- *
- * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/** TODO:
- * - insert PREFETCH instructions to avoid cache-misses !
- * - some more optimizations are possible...
- * - for 40-50% more performance in the SSE-functions, the
- * data (trans-matrix, src_vert, dst_vert) needs to be 16byte aligned !
- */
-
-#ifdef USE_SSE_ASM
-#include "assyntax.h"
-#include "matypes.h"
-#include "xform_args.h"
-
- SEG_TEXT
-
-#define S(i) REGOFF(i * 4, ESI)
-#define D(i) REGOFF(i * 4, EDI)
-#define M(i) REGOFF(i * 4, EDX)
-
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_sse_transform_points2_general)
-HIDDEN (_mesa_sse_transform_points2_general)
-GLNAME( _mesa_sse_transform_points2_general ):
-
-#define FRAME_OFFSET 8
- PUSH_L ( ESI )
- PUSH_L ( EDI )
-
- MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
- MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
-
- MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
-
- TEST_L( ECX, ECX )
- JZ( LLBL(K_GTP2GR_finish) ) /* count was zero; go to finish */
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
- OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
- MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
-
- SHL_L( CONST(4), ECX ) /* count *= 16 */
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
-
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
- ADD_L( EDI, ECX ) /* count += dest ptr */
-
-ALIGNTEXT32
- MOVAPS( M(0), XMM0 ) /* m3 | m2 | m1 | m0 */
- MOVAPS( M(4), XMM1 ) /* m7 | m6 | m5 | m4 */
- MOVAPS( M(12), XMM2 ) /* m15 | m14 | m13 | m12 */
-
-ALIGNTEXT32
-LLBL(K_GTP2GR_top):
- MOVSS( S(0), XMM3 ) /* ox */
- SHUFPS( CONST(0x0), XMM3, XMM3 ) /* ox | ox | ox | ox */
- MULPS( XMM0, XMM3 ) /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
- MOVSS( S(1), XMM4 ) /* oy */
- SHUFPS( CONST(0x0), XMM4, XMM4 ) /* oy | oy | oy | oy */
- MULPS( XMM1, XMM4 ) /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
-
- ADDPS( XMM4, XMM3 )
- ADDPS( XMM2, XMM3 )
- MOVAPS( XMM3, D(0) )
-
-LLBL(K_GTP2GR_skip):
- ADD_L ( CONST(16), EDI )
- ADD_L ( EAX, ESI )
- CMP_L ( ECX, EDI )
- JNE ( LLBL(K_GTP2GR_top) )
-
-LLBL(K_GTP2GR_finish):
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_sse_transform_points2_identity)
-HIDDEN(_mesa_sse_transform_points2_identity)
-GLNAME( _mesa_sse_transform_points2_identity ):
-
-#define FRAME_OFFSET 8
- PUSH_L ( ESI )
- PUSH_L ( EDI )
-
- MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
- MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
-
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
-
- TEST_L( ECX, ECX)
- JZ( LLBL(K_GTP2IR_finish) ) /* count was zero; go to finish */
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
- OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
- MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
-
- SHL_L( CONST(4), ECX ) /* count *= 16 */
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
-
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
- ADD_L( EDI, ECX ) /* count += dest ptr */
-
- CMP_L( ESI, EDI )
- JE( LLBL(K_GTP2IR_finish) )
-
-
-ALIGNTEXT32
-LLBL(K_GTP2IR_top):
- MOV_L ( S(0), EDX )
- MOV_L ( EDX, D(0) )
- MOV_L ( S(1), EDX )
- MOV_L ( EDX, D(1) )
-
-LLBL(K_GTP2IR_skip):
- ADD_L ( CONST(16), EDI )
- ADD_L ( EAX, ESI )
- CMP_L ( ECX, EDI )
- JNE ( LLBL(K_GTP2IR_top) )
-
-LLBL(K_GTP2IR_finish):
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_sse_transform_points2_3d_no_rot)
-HIDDEN(_mesa_sse_transform_points2_3d_no_rot)
-GLNAME(_mesa_sse_transform_points2_3d_no_rot):
-
-#define FRAME_OFFSET 8
- PUSH_L( ESI )
- PUSH_L( EDI )
-
- MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
- MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
-
- MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
-
- TEST_L( ECX, ECX)
- JZ( LLBL(K_GTP23DNRR_finish) ) /* count was zero; go to finish */
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
- OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
- MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
-
- SHL_L( CONST(4), ECX ) /* count *= 16 */
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
-
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
- ADD_L( EDI, ECX ) /* count += dest ptr */
-
- XORPS( XMM0, XMM0 ) /* clean the working register */
-
-ALIGNTEXT32
- MOVSS ( M(0), XMM1 ) /* - | - | - | m0 */
- MOVSS ( M(5), XMM2 ) /* - | - | - | m5 */
- UNPCKLPS ( XMM2, XMM1 ) /* - | - | m5 | m0 */
- MOVLPS ( M(12), XMM2 ) /* - | - | m13 | m12 */
- MOVSS ( M(14), XMM3 ) /* - | - | - | m14 */
-
-ALIGNTEXT32
-LLBL(K_GTP23DNRR_top):
- MOVLPS ( S(0), XMM0 ) /* - | - | oy | ox */
- MULPS ( XMM1, XMM0 ) /* - | - | oy*m5 | ox*m0 */
- ADDPS ( XMM2, XMM0 ) /* - | - | +m13 | +m12 */
- MOVLPS ( XMM0, D(0) ) /* -> D(1) | -> D(0) */
-
- MOVSS ( XMM3, D(2) ) /* -> D(2) */
-
-LLBL(K_GTP23DNRR_skip):
- ADD_L ( CONST(16), EDI )
- ADD_L ( EAX, ESI )
- CMP_L ( ECX, EDI )
- JNE ( LLBL(K_GTP23DNRR_top) )
-
-LLBL(K_GTP23DNRR_finish):
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_sse_transform_points2_perspective)
-HIDDEN(_mesa_sse_transform_points2_perspective)
-GLNAME(_mesa_sse_transform_points2_perspective):
-
-#define FRAME_OFFSET 8
- PUSH_L ( ESI )
- PUSH_L ( EDI )
-
- MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
- MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
-
- MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
-
- TEST_L( ECX, ECX)
- JZ( LLBL(K_GTP23PR_finish) ) /* count was zero; go to finish */
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
- OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
- MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
-
- SHL_L( CONST(4), ECX ) /* count *= 16 */
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
-
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
- ADD_L( EDI, ECX ) /* count += dest ptr */
-
-ALIGNTEXT32
- MOVSS ( M(0), XMM1 ) /* - | - | - | m0 */
- MOVSS ( M(5), XMM2 ) /* - | - | - | m5 */
- UNPCKLPS ( XMM2, XMM1 ) /* - | - | m5 | m0 */
- MOVSS ( M(14), XMM3 ) /* m14 */
- XORPS ( XMM0, XMM0 ) /* 0 | 0 | 0 | 0 */
-
-ALIGNTEXT32
-LLBL(K_GTP23PR_top):
- MOVLPS( S(0), XMM4 ) /* oy | ox */
- MULPS( XMM1, XMM4 ) /* oy*m5 | ox*m0 */
- MOVLPS( XMM4, D(0) ) /* ->D(1) | ->D(0) */
- MOVSS( XMM3, D(2) ) /* ->D(2) */
- MOVSS( XMM0, D(3) ) /* ->D(3) */
-
-LLBL(K_GTP23PR_skip):
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(K_GTP23PR_top) )
-
-LLBL(K_GTP23PR_finish):
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_sse_transform_points2_2d)
-HIDDEN(_mesa_sse_transform_points2_2d)
-GLNAME(_mesa_sse_transform_points2_2d):
-
-#define FRAME_OFFSET 8
- PUSH_L( ESI )
- PUSH_L( EDI )
-
- MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
- MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
-
- MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
-
- TEST_L( ECX, ECX)
- JZ( LLBL(K_GTP23P2DR_finish) ) /* count was zero; go to finish */
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
- OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
- MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
-
- SHL_L( CONST(4), ECX ) /* count *= 16 */
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
-
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
- ADD_L( EDI, ECX ) /* count += dest ptr */
-
-ALIGNTEXT32
- MOVLPS( M(0), XMM0 ) /* m1 | m0 */
- MOVLPS( M(4), XMM1 ) /* m5 | m4 */
- MOVLPS( M(12), XMM2 ) /* m13 | m12 */
-
-ALIGNTEXT32
-LLBL(K_GTP23P2DR_top):
- MOVSS( S(0), XMM3 ) /* ox */
- SHUFPS( CONST(0x0), XMM3, XMM3 ) /* ox | ox */
- MULPS( XMM0, XMM3 ) /* ox*m1 | ox*m0 */
-
- MOVSS( S(1), XMM4 ) /* oy */
- SHUFPS( CONST(0x0), XMM4, XMM4 ) /* oy | oy */
- MULPS( XMM1, XMM4 ) /* oy*m5 | oy*m4 */
-
- ADDPS( XMM4, XMM3 )
- ADDPS( XMM2, XMM3 )
- MOVLPS( XMM3, D(0) ) /* ->D(1) | ->D(0) */
-
-LLBL(K_GTP23P2DR_skip):
- ADD_L ( CONST(16), EDI )
- ADD_L ( EAX, ESI )
- CMP_L ( ECX, EDI )
- JNE ( LLBL(K_GTP23P2DR_top) )
-
-LLBL(K_GTP23P2DR_finish):
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_sse_transform_points2_2d_no_rot)
-HIDDEN(_mesa_sse_transform_points2_2d_no_rot)
-GLNAME(_mesa_sse_transform_points2_2d_no_rot):
-
-#define FRAME_OFFSET 8
- PUSH_L( ESI )
- PUSH_L( EDI )
-
- MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
- MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
-
- MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
-
- TEST_L( ECX, ECX)
- JZ( LLBL(K_GTP23P2DNRR_finish) ) /* count was zero; go to finish */
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
- OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
- MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
-
- SHL_L( CONST(4), ECX ) /* count *= 16 */
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
-
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
- ADD_L( EDI, ECX ) /* count += dest ptr */
-
-ALIGNTEXT32
- MOVSS ( M(0), XMM1 ) /* m0 */
- MOVSS ( M(5), XMM2 ) /* m5 */
- UNPCKLPS ( XMM2, XMM1 ) /* m5 | m0 */
- MOVLPS ( M(12), XMM2 ) /* m13 | m12 */
-
-ALIGNTEXT32
-LLBL(K_GTP23P2DNRR_top):
- MOVLPS( S(0), XMM0 ) /* oy | ox */
- MULPS( XMM1, XMM0 ) /* oy*m5 | ox*m0 */
- ADDPS( XMM2, XMM0 ) /* +m13 | +m12 */
- MOVLPS( XMM0, D(0) ) /* ->D(1) | ->D(0) */
-
-LLBL(K_GTP23P2DNRR_skip):
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(K_GTP23P2DNRR_top) )
-
-LLBL(K_GTP23P2DNRR_finish):
- POP_L( EDI )
- POP_L( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_sse_transform_points2_3d)
-HIDDEN(_mesa_sse_transform_points2_3d)
-GLNAME(_mesa_sse_transform_points2_3d):
-
-#define FRAME_OFFSET 8
- PUSH_L( ESI )
- PUSH_L( EDI )
-
- MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
- MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
-
- MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
-
- TEST_L( ECX, ECX)
- JZ( LLBL(K_GTP23P3DR_finish) ) /* count was zero; go to finish */
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
- OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
- MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
-
- SHL_L( CONST(4), ECX ) /* count *= 16 */
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
-
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
- ADD_L( EDI, ECX ) /* count += dest ptr */
-
-ALIGNTEXT32
- MOVAPS( M(0), XMM0 ) /* m2 | m1 | m0 */
- MOVAPS( M(4), XMM1 ) /* m6 | m5 | m4 */
- MOVAPS( M(12), XMM2 ) /* m14 | m13 | m12 */
-
-ALIGNTEXT32
-LLBL(K_GTP23P3DR_top):
- MOVSS( S(0), XMM3 ) /* ox */
- SHUFPS( CONST(0x0), XMM3, XMM3 ) /* ox | ox | ox */
- MULPS( XMM0, XMM3 ) /* ox*m2 | ox*m1 | ox*m0 */
-
- MOVSS( S(1), XMM4 ) /* oy */
- SHUFPS( CONST(0x0), XMM4, XMM4 ) /* oy | oy | oy */
- MULPS( XMM1, XMM4 ) /* oy*m6 | oy*m5 | oy*m4 */
-
- ADDPS( XMM4, XMM3 )
- ADDPS( XMM2, XMM3 )
-
- MOVLPS( XMM3, D(0) ) /* ->D(1) | ->D(0) */
- UNPCKHPS( XMM3, XMM3 )
- MOVSS( XMM3, D(2) ) /* ->D(2) */
-
-LLBL(K_GTP23P3DR_skip):
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(K_GTP23P3DR_top) )
-
-LLBL(K_GTP23P3DR_finish):
- POP_L( EDI )
- POP_L( ESI )
- RET
-#undef FRAME_OFFSET
-#endif
-
-#if defined (__ELF__) && defined (__linux__)
- .section .note.GNU-stack,"",%progbits
-#endif
+ +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** TODO: + * - insert PREFETCH instructions to avoid cache-misses ! + * - some more optimizations are possible... + * - for 40-50% more performance in the SSE-functions, the + * data (trans-matrix, src_vert, dst_vert) needs to be 16byte aligned ! + */ + +#ifdef USE_SSE_ASM +#include "assyntax.h" +#include "matypes.h" +#include "xform_args.h" + + SEG_TEXT + +#define S(i) REGOFF(i * 4, ESI) +#define D(i) REGOFF(i * 4, EDI) +#define M(i) REGOFF(i * 4, EDX) + + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_sse_transform_points2_general) +HIDDEN (_mesa_sse_transform_points2_general) +GLNAME( _mesa_sse_transform_points2_general ): + +#define FRAME_OFFSET 8 + PUSH_L ( ESI ) + PUSH_L ( EDI ) + + MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */ + MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */ + + MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */ + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */ + + TEST_L( ECX, ECX ) + JZ( LLBL(K_GTP2GR_finish) ) /* count was zero; go to finish */ + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */ + OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */ + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */ + MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) /* set dest size */ + + SHL_L( CONST(4), ECX ) /* count *= 16 */ + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + ADD_L( EDI, ECX ) /* count += dest ptr */ + +ALIGNTEXT32 + MOVAPS( M(0), XMM0 ) /* m3 | m2 | m1 | m0 */ + MOVAPS( M(4), XMM1 ) /* m7 | m6 | m5 | m4 */ + MOVAPS( M(12), XMM2 ) /* m15 | m14 | m13 | m12 */ + +ALIGNTEXT32 +LLBL(K_GTP2GR_top): + MOVSS( S(0), XMM3 ) /* ox */ + SHUFPS( CONST(0x0), XMM3, XMM3 ) /* ox | ox | ox | ox */ + MULPS( XMM0, XMM3 ) /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */ + MOVSS( S(1), XMM4 ) /* oy */ + SHUFPS( CONST(0x0), XMM4, XMM4 ) /* oy | oy | oy | oy */ + MULPS( XMM1, XMM4 ) /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */ + + ADDPS( XMM4, XMM3 ) + ADDPS( XMM2, XMM3 ) + MOVAPS( XMM3, D(0) ) + +LLBL(K_GTP2GR_skip): + ADD_L ( CONST(16), EDI ) + ADD_L ( EAX, ESI ) + CMP_L ( ECX, EDI ) + JNE ( LLBL(K_GTP2GR_top) ) + +LLBL(K_GTP2GR_finish): + POP_L ( EDI ) + POP_L ( ESI ) + RET +#undef FRAME_OFFSET + + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_sse_transform_points2_identity) +HIDDEN(_mesa_sse_transform_points2_identity) +GLNAME( _mesa_sse_transform_points2_identity ): + +#define FRAME_OFFSET 8 + PUSH_L ( ESI ) + PUSH_L ( EDI ) + + MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */ + MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */ + + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */ + + TEST_L( ECX, ECX) + JZ( LLBL(K_GTP2IR_finish) ) /* count was zero; go to finish */ + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */ + OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */ + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */ + MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) /* set dest size */ + + SHL_L( CONST(4), ECX ) /* count *= 16 */ + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + ADD_L( EDI, ECX ) /* count += dest ptr */ + + CMP_L( ESI, EDI ) + JE( LLBL(K_GTP2IR_finish) ) + + +ALIGNTEXT32 +LLBL(K_GTP2IR_top): + MOV_L ( S(0), EDX ) + MOV_L ( EDX, D(0) ) + MOV_L ( S(1), EDX ) + MOV_L ( EDX, D(1) ) + +LLBL(K_GTP2IR_skip): + ADD_L ( CONST(16), EDI ) + ADD_L ( EAX, ESI ) + CMP_L ( ECX, EDI ) + JNE ( LLBL(K_GTP2IR_top) ) + +LLBL(K_GTP2IR_finish): + POP_L ( EDI ) + POP_L ( ESI ) + RET +#undef FRAME_OFFSET + + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_sse_transform_points2_3d_no_rot) +HIDDEN(_mesa_sse_transform_points2_3d_no_rot) +GLNAME(_mesa_sse_transform_points2_3d_no_rot): + +#define FRAME_OFFSET 8 + PUSH_L( ESI ) + PUSH_L( EDI ) + + MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */ + MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */ + + MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */ + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */ + + TEST_L( ECX, ECX) + JZ( LLBL(K_GTP23DNRR_finish) ) /* count was zero; go to finish */ + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */ + OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */ + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */ + MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */ + + SHL_L( CONST(4), ECX ) /* count *= 16 */ + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + ADD_L( EDI, ECX ) /* count += dest ptr */ + + XORPS( XMM0, XMM0 ) /* clean the working register */ + +ALIGNTEXT32 + MOVSS ( M(0), XMM1 ) /* - | - | - | m0 */ + MOVSS ( M(5), XMM2 ) /* - | - | - | m5 */ + UNPCKLPS ( XMM2, XMM1 ) /* - | - | m5 | m0 */ + MOVLPS ( M(12), XMM2 ) /* - | - | m13 | m12 */ + MOVSS ( M(14), XMM3 ) /* - | - | - | m14 */ + +ALIGNTEXT32 +LLBL(K_GTP23DNRR_top): + MOVLPS ( S(0), XMM0 ) /* - | - | oy | ox */ + MULPS ( XMM1, XMM0 ) /* - | - | oy*m5 | ox*m0 */ + ADDPS ( XMM2, XMM0 ) /* - | - | +m13 | +m12 */ + MOVLPS ( XMM0, D(0) ) /* -> D(1) | -> D(0) */ + + MOVSS ( XMM3, D(2) ) /* -> D(2) */ + +LLBL(K_GTP23DNRR_skip): + ADD_L ( CONST(16), EDI ) + ADD_L ( EAX, ESI ) + CMP_L ( ECX, EDI ) + JNE ( LLBL(K_GTP23DNRR_top) ) + +LLBL(K_GTP23DNRR_finish): + POP_L ( EDI ) + POP_L ( ESI ) + RET +#undef FRAME_OFFSET + + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_sse_transform_points2_perspective) +HIDDEN(_mesa_sse_transform_points2_perspective) +GLNAME(_mesa_sse_transform_points2_perspective): + +#define FRAME_OFFSET 8 + PUSH_L ( ESI ) + PUSH_L ( EDI ) + + MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */ + MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */ + + MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */ + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */ + + TEST_L( ECX, ECX) + JZ( LLBL(K_GTP23PR_finish) ) /* count was zero; go to finish */ + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */ + OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */ + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */ + MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) /* set dest size */ + + SHL_L( CONST(4), ECX ) /* count *= 16 */ + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + ADD_L( EDI, ECX ) /* count += dest ptr */ + +ALIGNTEXT32 + MOVSS ( M(0), XMM1 ) /* - | - | - | m0 */ + MOVSS ( M(5), XMM2 ) /* - | - | - | m5 */ + UNPCKLPS ( XMM2, XMM1 ) /* - | - | m5 | m0 */ + MOVSS ( M(14), XMM3 ) /* m14 */ + XORPS ( XMM0, XMM0 ) /* 0 | 0 | 0 | 0 */ + +ALIGNTEXT32 +LLBL(K_GTP23PR_top): + MOVLPS( S(0), XMM4 ) /* oy | ox */ + MULPS( XMM1, XMM4 ) /* oy*m5 | ox*m0 */ + MOVLPS( XMM4, D(0) ) /* ->D(1) | ->D(0) */ + MOVSS( XMM3, D(2) ) /* ->D(2) */ + MOVSS( XMM0, D(3) ) /* ->D(3) */ + +LLBL(K_GTP23PR_skip): + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(K_GTP23PR_top) ) + +LLBL(K_GTP23PR_finish): + POP_L ( EDI ) + POP_L ( ESI ) + RET +#undef FRAME_OFFSET + + + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_sse_transform_points2_2d) +HIDDEN(_mesa_sse_transform_points2_2d) +GLNAME(_mesa_sse_transform_points2_2d): + +#define FRAME_OFFSET 8 + PUSH_L( ESI ) + PUSH_L( EDI ) + + MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */ + MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */ + + MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */ + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */ + + TEST_L( ECX, ECX) + JZ( LLBL(K_GTP23P2DR_finish) ) /* count was zero; go to finish */ + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */ + OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */ + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */ + MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) /* set dest size */ + + SHL_L( CONST(4), ECX ) /* count *= 16 */ + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + ADD_L( EDI, ECX ) /* count += dest ptr */ + +ALIGNTEXT32 + MOVLPS( M(0), XMM0 ) /* m1 | m0 */ + MOVLPS( M(4), XMM1 ) /* m5 | m4 */ + MOVLPS( M(12), XMM2 ) /* m13 | m12 */ + +ALIGNTEXT32 +LLBL(K_GTP23P2DR_top): + MOVSS( S(0), XMM3 ) /* ox */ + SHUFPS( CONST(0x0), XMM3, XMM3 ) /* ox | ox */ + MULPS( XMM0, XMM3 ) /* ox*m1 | ox*m0 */ + + MOVSS( S(1), XMM4 ) /* oy */ + SHUFPS( CONST(0x0), XMM4, XMM4 ) /* oy | oy */ + MULPS( XMM1, XMM4 ) /* oy*m5 | oy*m4 */ + + ADDPS( XMM4, XMM3 ) + ADDPS( XMM2, XMM3 ) + MOVLPS( XMM3, D(0) ) /* ->D(1) | ->D(0) */ + +LLBL(K_GTP23P2DR_skip): + ADD_L ( CONST(16), EDI ) + ADD_L ( EAX, ESI ) + CMP_L ( ECX, EDI ) + JNE ( LLBL(K_GTP23P2DR_top) ) + +LLBL(K_GTP23P2DR_finish): + POP_L ( EDI ) + POP_L ( ESI ) + RET +#undef FRAME_OFFSET + + + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_sse_transform_points2_2d_no_rot) +HIDDEN(_mesa_sse_transform_points2_2d_no_rot) +GLNAME(_mesa_sse_transform_points2_2d_no_rot): + +#define FRAME_OFFSET 8 + PUSH_L( ESI ) + PUSH_L( EDI ) + + MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */ + MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */ + + MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */ + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */ + + TEST_L( ECX, ECX) + JZ( LLBL(K_GTP23P2DNRR_finish) ) /* count was zero; go to finish */ + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */ + OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */ + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */ + MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) /* set dest size */ + + SHL_L( CONST(4), ECX ) /* count *= 16 */ + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + ADD_L( EDI, ECX ) /* count += dest ptr */ + +ALIGNTEXT32 + MOVSS ( M(0), XMM1 ) /* m0 */ + MOVSS ( M(5), XMM2 ) /* m5 */ + UNPCKLPS ( XMM2, XMM1 ) /* m5 | m0 */ + MOVLPS ( M(12), XMM2 ) /* m13 | m12 */ + +ALIGNTEXT32 +LLBL(K_GTP23P2DNRR_top): + MOVLPS( S(0), XMM0 ) /* oy | ox */ + MULPS( XMM1, XMM0 ) /* oy*m5 | ox*m0 */ + ADDPS( XMM2, XMM0 ) /* +m13 | +m12 */ + MOVLPS( XMM0, D(0) ) /* ->D(1) | ->D(0) */ + +LLBL(K_GTP23P2DNRR_skip): + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(K_GTP23P2DNRR_top) ) + +LLBL(K_GTP23P2DNRR_finish): + POP_L( EDI ) + POP_L( ESI ) + RET +#undef FRAME_OFFSET + + + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_sse_transform_points2_3d) +HIDDEN(_mesa_sse_transform_points2_3d) +GLNAME(_mesa_sse_transform_points2_3d): + +#define FRAME_OFFSET 8 + PUSH_L( ESI ) + PUSH_L( EDI ) + + MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */ + MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */ + + MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */ + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */ + + TEST_L( ECX, ECX) + JZ( LLBL(K_GTP23P3DR_finish) ) /* count was zero; go to finish */ + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */ + OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */ + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */ + MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */ + + SHL_L( CONST(4), ECX ) /* count *= 16 */ + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + ADD_L( EDI, ECX ) /* count += dest ptr */ + +ALIGNTEXT32 + MOVAPS( M(0), XMM0 ) /* m2 | m1 | m0 */ + MOVAPS( M(4), XMM1 ) /* m6 | m5 | m4 */ + MOVAPS( M(12), XMM2 ) /* m14 | m13 | m12 */ + +ALIGNTEXT32 +LLBL(K_GTP23P3DR_top): + MOVSS( S(0), XMM3 ) /* ox */ + SHUFPS( CONST(0x0), XMM3, XMM3 ) /* ox | ox | ox */ + MULPS( XMM0, XMM3 ) /* ox*m2 | ox*m1 | ox*m0 */ + + MOVSS( S(1), XMM4 ) /* oy */ + SHUFPS( CONST(0x0), XMM4, XMM4 ) /* oy | oy | oy */ + MULPS( XMM1, XMM4 ) /* oy*m6 | oy*m5 | oy*m4 */ + + ADDPS( XMM4, XMM3 ) + ADDPS( XMM2, XMM3 ) + + MOVLPS( XMM3, D(0) ) /* ->D(1) | ->D(0) */ + UNPCKHPS( XMM3, XMM3 ) + MOVSS( XMM3, D(2) ) /* ->D(2) */ + +LLBL(K_GTP23P3DR_skip): + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(K_GTP23P3DR_top) ) + +LLBL(K_GTP23P3DR_finish): + POP_L( EDI ) + POP_L( ESI ) + RET +#undef FRAME_OFFSET +#endif + +#if defined (__ELF__) && defined (__linux__) + .section .note.GNU-stack,"",%progbits +#endif diff --git a/mesalib/src/mesa/x86/sse_xform3.S b/mesalib/src/mesa/x86/sse_xform3.S index 4e39312bf..4bc22d8a5 100644 --- a/mesalib/src/mesa/x86/sse_xform3.S +++ b/mesalib/src/mesa/x86/sse_xform3.S @@ -1,512 +1,512 @@ -
-/*
- * Mesa 3-D graphics library
- * Version: 3.5
- *
- * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/** TODO:
- * - insert PREFETCH instructions to avoid cache-misses !
- * - some more optimizations are possible...
- * - for 40-50% more performance in the SSE-functions, the
- * data (trans-matrix, src_vert, dst_vert) needs to be 16byte aligned !
- */
-
-#ifdef USE_SSE_ASM
-#include "assyntax.h"
-#include "matypes.h"
-#include "xform_args.h"
-
- SEG_TEXT
-
-#define S(i) REGOFF(i * 4, ESI)
-#define D(i) REGOFF(i * 4, EDI)
-#define M(i) REGOFF(i * 4, EDX)
-
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_sse_transform_points3_general)
-HIDDEN(_mesa_sse_transform_points3_general)
-GLNAME( _mesa_sse_transform_points3_general ):
-
-#define FRAME_OFFSET 8
- PUSH_L ( ESI )
- PUSH_L ( EDI )
-
- MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
- MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
-
- MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
-
- CMP_L ( CONST(0), ECX ) /* count == 0 ? */
- JE ( LLBL(K_GTPGR_finish) ) /* yes -> nothing to do. */
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
- OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
- MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
-
- SHL_L( CONST(4), ECX ) /* count *= 16 */
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
-
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
- ADD_L( EDI, ECX ) /* count += dest ptr */
-
-
-ALIGNTEXT32
- MOVAPS ( REGOFF(0, EDX), XMM0 ) /* m0 | m1 | m2 | m3 */
- MOVAPS ( REGOFF(16, EDX), XMM1 ) /* m4 | m5 | m6 | m7 */
- MOVAPS ( REGOFF(32, EDX), XMM2 ) /* m8 | m9 | m10 | m11 */
- MOVAPS ( REGOFF(48, EDX), XMM3 ) /* m12 | m13 | m14 | m15 */
-
-
-ALIGNTEXT32
-LLBL(K_GTPGR_top):
- MOVSS ( REGOFF(0, ESI), XMM4 ) /* | | | ox */
- SHUFPS ( CONST(0x0), XMM4, XMM4 ) /* ox | ox | ox | ox */
- MOVSS ( REGOFF(4, ESI), XMM5 ) /* | | | oy */
- SHUFPS ( CONST(0x0), XMM5, XMM5 ) /* oy | oy | oy | oy */
- MOVSS ( REGOFF(8, ESI), XMM6 ) /* | | | oz */
- SHUFPS ( CONST(0x0), XMM6, XMM6 ) /* oz | oz | oz | oz */
-
- MULPS ( XMM0, XMM4 ) /* m3*ox | m2*ox | m1*ox | m0*ox */
- MULPS ( XMM1, XMM5 ) /* m7*oy | m6*oy | m5*oy | m4*oy */
- MULPS ( XMM2, XMM6 ) /* m11*oz | m10*oz | m9*oz | m8*oz */
-
- ADDPS ( XMM5, XMM4 )
- ADDPS ( XMM6, XMM4 )
- ADDPS ( XMM3, XMM4 )
-
- MOVAPS ( XMM4, REGOFF(0, EDI) )
-
-LLBL(K_GTPGR_skip):
- ADD_L ( CONST(16), EDI )
- ADD_L ( EAX, ESI )
- CMP_L ( ECX, EDI )
- JNE ( LLBL(K_GTPGR_top) )
-
-LLBL(K_GTPGR_finish):
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_sse_transform_points3_identity)
-HIDDEN(_mesa_sse_transform_points3_identity)
-GLNAME( _mesa_sse_transform_points3_identity ):
-
-#define FRAME_OFFSET 8
- PUSH_L ( ESI )
- PUSH_L ( EDI )
-
- MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
- MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
-
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
-
- TEST_L( ECX, ECX)
- JZ( LLBL(K_GTPIR_finish) ) /* count was zero; go to finish */
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
- OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
- MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
-
- SHL_L( CONST(4), ECX ) /* count *= 16 */
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
-
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
- ADD_L( EDI, ECX ) /* count += dest ptr */
-
- CMP_L( ESI, EDI )
- JE( LLBL(K_GTPIR_finish) )
-
-
-ALIGNTEXT32
-LLBL(K_GTPIR_top):
- MOVLPS ( S(0), XMM0 )
- MOVLPS ( XMM0, D(0) )
- MOVSS ( S(2), XMM0 )
- MOVSS ( XMM0, D(2) )
-
-LLBL(K_GTPIR_skip):
- ADD_L ( CONST(16), EDI )
- ADD_L ( EAX, ESI )
- CMP_L ( ECX, EDI )
- JNE ( LLBL(K_GTPIR_top) )
-
-LLBL(K_GTPIR_finish):
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_sse_transform_points3_3d_no_rot)
-HIDDEN(_mesa_sse_transform_points3_3d_no_rot)
-GLNAME(_mesa_sse_transform_points3_3d_no_rot):
-
-#define FRAME_OFFSET 8
- PUSH_L( ESI )
- PUSH_L( EDI )
-
- MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
- MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
-
-
- MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
-
- TEST_L( ECX, ECX)
- JZ( LLBL(K_GTP3DNRR_finish) ) /* count was zero; go to finish */
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
- OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
- MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
-
- SHL_L( CONST(4), ECX ) /* count *= 16 */
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
-
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
- ADD_L( EDI, ECX ) /* count += dest ptr */
-
- XORPS( XMM0, XMM0 ) /* clean the working register */
-
-ALIGNTEXT32
- MOVSS ( M(0), XMM1 ) /* - | - | - | m0 */
- MOVSS ( M(5), XMM2 ) /* - | - | - | m5 */
- UNPCKLPS ( XMM2, XMM1 ) /* - | - | m5 | m0 */
- MOVLPS ( M(12), XMM2 ) /* - | - | m13 | m12 */
- MOVSS ( M(10), XMM3 ) /* - | - | - | m10 */
- MOVSS ( M(14), XMM4 ) /* - | - | - | m14 */
-
-ALIGNTEXT32
-LLBL(K_GTP3DNRR_top):
-
- MOVLPS ( S(0), XMM0 ) /* - | - | s1 | s0 */
- MULPS ( XMM1, XMM0 ) /* - | - | s1*m5 | s0*m0 */
- ADDPS ( XMM2, XMM0 ) /* - | - | +m13 | +m12 */
- MOVLPS ( XMM0, D(0) ) /* -> D(1) | -> D(0) */
-
- MOVSS ( S(2), XMM0 ) /* sz */
- MULSS ( XMM3, XMM0 ) /* sz*m10 */
- ADDSS ( XMM4, XMM0 ) /* +m14 */
- MOVSS ( XMM0, D(2) ) /* -> D(2) */
-
-LLBL(K_GTP3DNRR_skip):
- ADD_L ( CONST(16), EDI )
- ADD_L ( EAX, ESI )
- CMP_L ( ECX, EDI )
- JNE ( LLBL(K_GTP3DNRR_top) )
-
-LLBL(K_GTP3DNRR_finish):
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_sse_transform_points3_perspective)
-HIDDEN(_mesa_sse_transform_points3_perspective)
-GLNAME(_mesa_sse_transform_points3_perspective):
-
-#define FRAME_OFFSET 8
- PUSH_L ( ESI )
- PUSH_L ( EDI )
-
- MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
- MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
-
- MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
-
- TEST_L( ECX, ECX)
- JZ( LLBL(K_GTP3PR_finish) ) /* count was zero; go to finish */
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
- OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
- MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
-
- SHL_L( CONST(4), ECX ) /* count *= 16 */
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
-
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
- ADD_L( EDI, ECX ) /* count += dest ptr */
-
-ALIGNTEXT32
- MOVSS ( M(0), XMM1 ) /* - | - | - | m0 */
- MOVSS ( M(5), XMM2 ) /* - | - | - | m5 */
- UNPCKLPS ( XMM2, XMM1 ) /* - | - | m5 | m0 */
- MOVLPS ( M(8), XMM2 ) /* - | - | m9 | m8 */
- MOVSS ( M(10), XMM3 ) /* m10 */
- MOVSS ( M(14), XMM4 ) /* m14 */
- XORPS ( XMM6, XMM6 ) /* 0 */
-
-ALIGNTEXT32
-LLBL(K_GTP3PR_top):
- MOVLPS ( S(0), XMM0 ) /* oy | ox */
- MULPS ( XMM1, XMM0 ) /* oy*m5 | ox*m0 */
- MOVSS ( S(2), XMM5 ) /* oz */
- SHUFPS ( CONST(0x0), XMM5, XMM5 ) /* oz | oz */
- MULPS ( XMM2, XMM5 ) /* oz*m9 | oz*m8 */
- ADDPS ( XMM5, XMM0 ) /* +oy*m5 | +ox*m0 */
- MOVLPS ( XMM0, D(0) ) /* ->D(1) | ->D(0) */
-
- MOVSS ( S(2), XMM0 ) /* oz */
- MULSS ( XMM3, XMM0 ) /* oz*m10 */
- ADDSS ( XMM4, XMM0 ) /* +m14 */
- MOVSS ( XMM0, D(2) ) /* ->D(2) */
-
- MOVSS ( S(2), XMM0 ) /* oz */
- MOVSS ( XMM6, XMM5 ) /* 0 */
- SUBPS ( XMM0, XMM5 ) /* -oz */
- MOVSS ( XMM5, D(3) ) /* ->D(3) */
-
-LLBL(K_GTP3PR_skip):
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(K_GTP3PR_top) )
-
-LLBL(K_GTP3PR_finish):
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_sse_transform_points3_2d)
-HIDDEN(_mesa_sse_transform_points3_2d)
-GLNAME(_mesa_sse_transform_points3_2d):
-
-#define FRAME_OFFSET 8
- PUSH_L( ESI )
- PUSH_L( EDI )
-
- MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
- MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
-
- MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
-
- TEST_L( ECX, ECX)
- JZ( LLBL(K_GTP3P2DR_finish) ) /* count was zero; go to finish */
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
- OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
- MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
-
- SHL_L( CONST(4), ECX ) /* count *= 16 */
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
-
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
- ADD_L( EDI, ECX ) /* count += dest ptr */
-
-ALIGNTEXT32
- MOVLPS( M(0), XMM0 ) /* m1 | m0 */
- MOVLPS( M(4), XMM1 ) /* m5 | m4 */
- MOVLPS( M(12), XMM2 ) /* m13 | m12 */
-
-ALIGNTEXT32
-LLBL(K_GTP3P2DR_top):
- MOVSS ( S(0), XMM3 ) /* ox */
- SHUFPS ( CONST(0x0), XMM3, XMM3 ) /* ox | ox */
- MULPS ( XMM0, XMM3 ) /* ox*m1 | ox*m0 */
- MOVSS ( S(1), XMM4 ) /* oy */
- SHUFPS ( CONST(0x0), XMM4, XMM4 ) /* oy | oy */
- MULPS ( XMM1, XMM4 ) /* oy*m5 | oy*m4 */
-
- ADDPS ( XMM4, XMM3 )
- ADDPS ( XMM2, XMM3 )
- MOVLPS ( XMM3, D(0) )
-
- MOVSS ( S(2), XMM3 )
- MOVSS ( XMM3, D(2) )
-
-LLBL(K_GTP3P2DR_skip):
- ADD_L ( CONST(16), EDI )
- ADD_L ( EAX, ESI )
- CMP_L ( ECX, EDI )
- JNE ( LLBL(K_GTP3P2DR_top) )
-
-LLBL(K_GTP3P2DR_finish):
- POP_L ( EDI )
- POP_L ( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_sse_transform_points3_2d_no_rot)
-HIDDEN(_mesa_sse_transform_points3_2d_no_rot)
-GLNAME(_mesa_sse_transform_points3_2d_no_rot):
-
-#define FRAME_OFFSET 8
- PUSH_L( ESI )
- PUSH_L( EDI )
-
- MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
- MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
-
- MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
-
- TEST_L( ECX, ECX)
- JZ( LLBL(K_GTP3P2DNRR_finish) ) /* count was zero; go to finish */
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
- OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
- MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
-
- SHL_L( CONST(4), ECX ) /* count *= 16 */
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
-
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
- ADD_L( EDI, ECX ) /* count += dest ptr */
-
-ALIGNTEXT32
- MOVSS ( M(0), XMM1 ) /* m0 */
- MOVSS ( M(5), XMM2 ) /* m5 */
- UNPCKLPS ( XMM2, XMM1 ) /* m5 | m0 */
- MOVLPS ( M(12), XMM2 ) /* m13 | m12 */
-
-ALIGNTEXT32
-LLBL(K_GTP3P2DNRR_top):
- MOVLPS( S(0), XMM0 ) /* oy | ox */
- MULPS( XMM1, XMM0 ) /* oy*m5 | ox*m0 */
- ADDPS( XMM2, XMM0 ) /* +m13 | +m12 */
- MOVLPS( XMM0, D(0) ) /* ->D(1) | ->D(0) */
-
- MOVSS( S(2), XMM0 )
- MOVSS( XMM0, D(2) )
-
-LLBL(K_GTP3P2DNRR_skip):
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(K_GTP3P2DNRR_top) )
-
-LLBL(K_GTP3P2DNRR_finish):
- POP_L( EDI )
- POP_L( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-
-ALIGNTEXT4
-GLOBL GLNAME(_mesa_sse_transform_points3_3d)
-HIDDEN(_mesa_sse_transform_points3_3d)
-GLNAME(_mesa_sse_transform_points3_3d):
-
-#define FRAME_OFFSET 8
- PUSH_L( ESI )
- PUSH_L( EDI )
-
- MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
- MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
-
-
- MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
-
- TEST_L( ECX, ECX)
- JZ( LLBL(K_GTP3P3DR_finish) ) /* count was zero; go to finish */
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
- OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
- MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
-
- SHL_L( CONST(4), ECX ) /* count *= 16 */
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
-
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
- ADD_L( EDI, ECX ) /* count += dest ptr */
-
-
-ALIGNTEXT32
- MOVAPS( M(0), XMM0 ) /* m2 | m1 | m0 */
- MOVAPS( M(4), XMM1 ) /* m6 | m5 | m4 */
- MOVAPS( M(8), XMM2 ) /* m10 | m9 | m8 */
- MOVAPS( M(12), XMM3 ) /* m14 | m13 | m12 */
-
-ALIGNTEXT32
-LLBL(K_GTP3P3DR_top):
- MOVSS( S(0), XMM4 )
- SHUFPS( CONST(0x0), XMM4, XMM4 ) /* ox | ox | ox */
- MULPS( XMM0, XMM4 ) /* ox*m2 | ox*m1 | ox*m0 */
-
- MOVSS( S(1), XMM5 )
- SHUFPS( CONST(0x0), XMM5, XMM5 ) /* oy | oy | oy */
- MULPS( XMM1, XMM5 ) /* oy*m6 | oy*m5 | oy*m4 */
-
- MOVSS( S(2), XMM6 )
- SHUFPS( CONST(0x0), XMM6, XMM6 ) /* oz | oz | oz */
- MULPS( XMM2, XMM6 ) /* oz*m10 | oz*m9 | oz*m8 */
-
- ADDPS( XMM5, XMM4 ) /* + | + | + */
- ADDPS( XMM6, XMM4 ) /* + | + | + */
- ADDPS( XMM3, XMM4 ) /* + | + | + */
-
- MOVLPS( XMM4, D(0) ) /* => D(1) | => D(0) */
- UNPCKHPS( XMM4, XMM4 )
- MOVSS( XMM4, D(2) )
-
-LLBL(K_GTP3P3DR_skip):
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(K_GTP3P3DR_top) )
-
-LLBL(K_GTP3P3DR_finish):
- POP_L( EDI )
- POP_L( ESI )
- RET
-#undef FRAME_OFFSET
-#endif
-
-#if defined (__ELF__) && defined (__linux__)
- .section .note.GNU-stack,"",%progbits
-#endif
+ +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** TODO: + * - insert PREFETCH instructions to avoid cache-misses ! + * - some more optimizations are possible... + * - for 40-50% more performance in the SSE-functions, the + * data (trans-matrix, src_vert, dst_vert) needs to be 16byte aligned ! + */ + +#ifdef USE_SSE_ASM +#include "assyntax.h" +#include "matypes.h" +#include "xform_args.h" + + SEG_TEXT + +#define S(i) REGOFF(i * 4, ESI) +#define D(i) REGOFF(i * 4, EDI) +#define M(i) REGOFF(i * 4, EDX) + + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_sse_transform_points3_general) +HIDDEN(_mesa_sse_transform_points3_general) +GLNAME( _mesa_sse_transform_points3_general ): + +#define FRAME_OFFSET 8 + PUSH_L ( ESI ) + PUSH_L ( EDI ) + + MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */ + MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */ + + MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */ + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */ + + CMP_L ( CONST(0), ECX ) /* count == 0 ? */ + JE ( LLBL(K_GTPGR_finish) ) /* yes -> nothing to do. */ + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */ + OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */ + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */ + MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) /* set dest size */ + + SHL_L( CONST(4), ECX ) /* count *= 16 */ + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + ADD_L( EDI, ECX ) /* count += dest ptr */ + + +ALIGNTEXT32 + MOVAPS ( REGOFF(0, EDX), XMM0 ) /* m0 | m1 | m2 | m3 */ + MOVAPS ( REGOFF(16, EDX), XMM1 ) /* m4 | m5 | m6 | m7 */ + MOVAPS ( REGOFF(32, EDX), XMM2 ) /* m8 | m9 | m10 | m11 */ + MOVAPS ( REGOFF(48, EDX), XMM3 ) /* m12 | m13 | m14 | m15 */ + + +ALIGNTEXT32 +LLBL(K_GTPGR_top): + MOVSS ( REGOFF(0, ESI), XMM4 ) /* | | | ox */ + SHUFPS ( CONST(0x0), XMM4, XMM4 ) /* ox | ox | ox | ox */ + MOVSS ( REGOFF(4, ESI), XMM5 ) /* | | | oy */ + SHUFPS ( CONST(0x0), XMM5, XMM5 ) /* oy | oy | oy | oy */ + MOVSS ( REGOFF(8, ESI), XMM6 ) /* | | | oz */ + SHUFPS ( CONST(0x0), XMM6, XMM6 ) /* oz | oz | oz | oz */ + + MULPS ( XMM0, XMM4 ) /* m3*ox | m2*ox | m1*ox | m0*ox */ + MULPS ( XMM1, XMM5 ) /* m7*oy | m6*oy | m5*oy | m4*oy */ + MULPS ( XMM2, XMM6 ) /* m11*oz | m10*oz | m9*oz | m8*oz */ + + ADDPS ( XMM5, XMM4 ) + ADDPS ( XMM6, XMM4 ) + ADDPS ( XMM3, XMM4 ) + + MOVAPS ( XMM4, REGOFF(0, EDI) ) + +LLBL(K_GTPGR_skip): + ADD_L ( CONST(16), EDI ) + ADD_L ( EAX, ESI ) + CMP_L ( ECX, EDI ) + JNE ( LLBL(K_GTPGR_top) ) + +LLBL(K_GTPGR_finish): + POP_L ( EDI ) + POP_L ( ESI ) + RET +#undef FRAME_OFFSET + + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_sse_transform_points3_identity) +HIDDEN(_mesa_sse_transform_points3_identity) +GLNAME( _mesa_sse_transform_points3_identity ): + +#define FRAME_OFFSET 8 + PUSH_L ( ESI ) + PUSH_L ( EDI ) + + MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */ + MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */ + + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */ + + TEST_L( ECX, ECX) + JZ( LLBL(K_GTPIR_finish) ) /* count was zero; go to finish */ + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */ + OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */ + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */ + MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */ + + SHL_L( CONST(4), ECX ) /* count *= 16 */ + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + ADD_L( EDI, ECX ) /* count += dest ptr */ + + CMP_L( ESI, EDI ) + JE( LLBL(K_GTPIR_finish) ) + + +ALIGNTEXT32 +LLBL(K_GTPIR_top): + MOVLPS ( S(0), XMM0 ) + MOVLPS ( XMM0, D(0) ) + MOVSS ( S(2), XMM0 ) + MOVSS ( XMM0, D(2) ) + +LLBL(K_GTPIR_skip): + ADD_L ( CONST(16), EDI ) + ADD_L ( EAX, ESI ) + CMP_L ( ECX, EDI ) + JNE ( LLBL(K_GTPIR_top) ) + +LLBL(K_GTPIR_finish): + POP_L ( EDI ) + POP_L ( ESI ) + RET +#undef FRAME_OFFSET + + + + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_sse_transform_points3_3d_no_rot) +HIDDEN(_mesa_sse_transform_points3_3d_no_rot) +GLNAME(_mesa_sse_transform_points3_3d_no_rot): + +#define FRAME_OFFSET 8 + PUSH_L( ESI ) + PUSH_L( EDI ) + + MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */ + MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */ + + + MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */ + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */ + + TEST_L( ECX, ECX) + JZ( LLBL(K_GTP3DNRR_finish) ) /* count was zero; go to finish */ + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */ + OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */ + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */ + MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */ + + SHL_L( CONST(4), ECX ) /* count *= 16 */ + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + ADD_L( EDI, ECX ) /* count += dest ptr */ + + XORPS( XMM0, XMM0 ) /* clean the working register */ + +ALIGNTEXT32 + MOVSS ( M(0), XMM1 ) /* - | - | - | m0 */ + MOVSS ( M(5), XMM2 ) /* - | - | - | m5 */ + UNPCKLPS ( XMM2, XMM1 ) /* - | - | m5 | m0 */ + MOVLPS ( M(12), XMM2 ) /* - | - | m13 | m12 */ + MOVSS ( M(10), XMM3 ) /* - | - | - | m10 */ + MOVSS ( M(14), XMM4 ) /* - | - | - | m14 */ + +ALIGNTEXT32 +LLBL(K_GTP3DNRR_top): + + MOVLPS ( S(0), XMM0 ) /* - | - | s1 | s0 */ + MULPS ( XMM1, XMM0 ) /* - | - | s1*m5 | s0*m0 */ + ADDPS ( XMM2, XMM0 ) /* - | - | +m13 | +m12 */ + MOVLPS ( XMM0, D(0) ) /* -> D(1) | -> D(0) */ + + MOVSS ( S(2), XMM0 ) /* sz */ + MULSS ( XMM3, XMM0 ) /* sz*m10 */ + ADDSS ( XMM4, XMM0 ) /* +m14 */ + MOVSS ( XMM0, D(2) ) /* -> D(2) */ + +LLBL(K_GTP3DNRR_skip): + ADD_L ( CONST(16), EDI ) + ADD_L ( EAX, ESI ) + CMP_L ( ECX, EDI ) + JNE ( LLBL(K_GTP3DNRR_top) ) + +LLBL(K_GTP3DNRR_finish): + POP_L ( EDI ) + POP_L ( ESI ) + RET +#undef FRAME_OFFSET + + + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_sse_transform_points3_perspective) +HIDDEN(_mesa_sse_transform_points3_perspective) +GLNAME(_mesa_sse_transform_points3_perspective): + +#define FRAME_OFFSET 8 + PUSH_L ( ESI ) + PUSH_L ( EDI ) + + MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */ + MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */ + + MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */ + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */ + + TEST_L( ECX, ECX) + JZ( LLBL(K_GTP3PR_finish) ) /* count was zero; go to finish */ + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */ + OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */ + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */ + MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) /* set dest size */ + + SHL_L( CONST(4), ECX ) /* count *= 16 */ + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + ADD_L( EDI, ECX ) /* count += dest ptr */ + +ALIGNTEXT32 + MOVSS ( M(0), XMM1 ) /* - | - | - | m0 */ + MOVSS ( M(5), XMM2 ) /* - | - | - | m5 */ + UNPCKLPS ( XMM2, XMM1 ) /* - | - | m5 | m0 */ + MOVLPS ( M(8), XMM2 ) /* - | - | m9 | m8 */ + MOVSS ( M(10), XMM3 ) /* m10 */ + MOVSS ( M(14), XMM4 ) /* m14 */ + XORPS ( XMM6, XMM6 ) /* 0 */ + +ALIGNTEXT32 +LLBL(K_GTP3PR_top): + MOVLPS ( S(0), XMM0 ) /* oy | ox */ + MULPS ( XMM1, XMM0 ) /* oy*m5 | ox*m0 */ + MOVSS ( S(2), XMM5 ) /* oz */ + SHUFPS ( CONST(0x0), XMM5, XMM5 ) /* oz | oz */ + MULPS ( XMM2, XMM5 ) /* oz*m9 | oz*m8 */ + ADDPS ( XMM5, XMM0 ) /* +oy*m5 | +ox*m0 */ + MOVLPS ( XMM0, D(0) ) /* ->D(1) | ->D(0) */ + + MOVSS ( S(2), XMM0 ) /* oz */ + MULSS ( XMM3, XMM0 ) /* oz*m10 */ + ADDSS ( XMM4, XMM0 ) /* +m14 */ + MOVSS ( XMM0, D(2) ) /* ->D(2) */ + + MOVSS ( S(2), XMM0 ) /* oz */ + MOVSS ( XMM6, XMM5 ) /* 0 */ + SUBPS ( XMM0, XMM5 ) /* -oz */ + MOVSS ( XMM5, D(3) ) /* ->D(3) */ + +LLBL(K_GTP3PR_skip): + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(K_GTP3PR_top) ) + +LLBL(K_GTP3PR_finish): + POP_L ( EDI ) + POP_L ( ESI ) + RET +#undef FRAME_OFFSET + + + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_sse_transform_points3_2d) +HIDDEN(_mesa_sse_transform_points3_2d) +GLNAME(_mesa_sse_transform_points3_2d): + +#define FRAME_OFFSET 8 + PUSH_L( ESI ) + PUSH_L( EDI ) + + MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */ + MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */ + + MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */ + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */ + + TEST_L( ECX, ECX) + JZ( LLBL(K_GTP3P2DR_finish) ) /* count was zero; go to finish */ + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */ + OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */ + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */ + MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */ + + SHL_L( CONST(4), ECX ) /* count *= 16 */ + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + ADD_L( EDI, ECX ) /* count += dest ptr */ + +ALIGNTEXT32 + MOVLPS( M(0), XMM0 ) /* m1 | m0 */ + MOVLPS( M(4), XMM1 ) /* m5 | m4 */ + MOVLPS( M(12), XMM2 ) /* m13 | m12 */ + +ALIGNTEXT32 +LLBL(K_GTP3P2DR_top): + MOVSS ( S(0), XMM3 ) /* ox */ + SHUFPS ( CONST(0x0), XMM3, XMM3 ) /* ox | ox */ + MULPS ( XMM0, XMM3 ) /* ox*m1 | ox*m0 */ + MOVSS ( S(1), XMM4 ) /* oy */ + SHUFPS ( CONST(0x0), XMM4, XMM4 ) /* oy | oy */ + MULPS ( XMM1, XMM4 ) /* oy*m5 | oy*m4 */ + + ADDPS ( XMM4, XMM3 ) + ADDPS ( XMM2, XMM3 ) + MOVLPS ( XMM3, D(0) ) + + MOVSS ( S(2), XMM3 ) + MOVSS ( XMM3, D(2) ) + +LLBL(K_GTP3P2DR_skip): + ADD_L ( CONST(16), EDI ) + ADD_L ( EAX, ESI ) + CMP_L ( ECX, EDI ) + JNE ( LLBL(K_GTP3P2DR_top) ) + +LLBL(K_GTP3P2DR_finish): + POP_L ( EDI ) + POP_L ( ESI ) + RET +#undef FRAME_OFFSET + + + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_sse_transform_points3_2d_no_rot) +HIDDEN(_mesa_sse_transform_points3_2d_no_rot) +GLNAME(_mesa_sse_transform_points3_2d_no_rot): + +#define FRAME_OFFSET 8 + PUSH_L( ESI ) + PUSH_L( EDI ) + + MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */ + MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */ + + MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */ + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */ + + TEST_L( ECX, ECX) + JZ( LLBL(K_GTP3P2DNRR_finish) ) /* count was zero; go to finish */ + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */ + OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */ + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */ + MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */ + + SHL_L( CONST(4), ECX ) /* count *= 16 */ + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + ADD_L( EDI, ECX ) /* count += dest ptr */ + +ALIGNTEXT32 + MOVSS ( M(0), XMM1 ) /* m0 */ + MOVSS ( M(5), XMM2 ) /* m5 */ + UNPCKLPS ( XMM2, XMM1 ) /* m5 | m0 */ + MOVLPS ( M(12), XMM2 ) /* m13 | m12 */ + +ALIGNTEXT32 +LLBL(K_GTP3P2DNRR_top): + MOVLPS( S(0), XMM0 ) /* oy | ox */ + MULPS( XMM1, XMM0 ) /* oy*m5 | ox*m0 */ + ADDPS( XMM2, XMM0 ) /* +m13 | +m12 */ + MOVLPS( XMM0, D(0) ) /* ->D(1) | ->D(0) */ + + MOVSS( S(2), XMM0 ) + MOVSS( XMM0, D(2) ) + +LLBL(K_GTP3P2DNRR_skip): + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(K_GTP3P2DNRR_top) ) + +LLBL(K_GTP3P2DNRR_finish): + POP_L( EDI ) + POP_L( ESI ) + RET +#undef FRAME_OFFSET + + + + +ALIGNTEXT4 +GLOBL GLNAME(_mesa_sse_transform_points3_3d) +HIDDEN(_mesa_sse_transform_points3_3d) +GLNAME(_mesa_sse_transform_points3_3d): + +#define FRAME_OFFSET 8 + PUSH_L( ESI ) + PUSH_L( EDI ) + + MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */ + MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */ + + + MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */ + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */ + + TEST_L( ECX, ECX) + JZ( LLBL(K_GTP3P3DR_finish) ) /* count was zero; go to finish */ + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */ + OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */ + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */ + MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */ + + SHL_L( CONST(4), ECX ) /* count *= 16 */ + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + ADD_L( EDI, ECX ) /* count += dest ptr */ + + +ALIGNTEXT32 + MOVAPS( M(0), XMM0 ) /* m2 | m1 | m0 */ + MOVAPS( M(4), XMM1 ) /* m6 | m5 | m4 */ + MOVAPS( M(8), XMM2 ) /* m10 | m9 | m8 */ + MOVAPS( M(12), XMM3 ) /* m14 | m13 | m12 */ + +ALIGNTEXT32 +LLBL(K_GTP3P3DR_top): + MOVSS( S(0), XMM4 ) + SHUFPS( CONST(0x0), XMM4, XMM4 ) /* ox | ox | ox */ + MULPS( XMM0, XMM4 ) /* ox*m2 | ox*m1 | ox*m0 */ + + MOVSS( S(1), XMM5 ) + SHUFPS( CONST(0x0), XMM5, XMM5 ) /* oy | oy | oy */ + MULPS( XMM1, XMM5 ) /* oy*m6 | oy*m5 | oy*m4 */ + + MOVSS( S(2), XMM6 ) + SHUFPS( CONST(0x0), XMM6, XMM6 ) /* oz | oz | oz */ + MULPS( XMM2, XMM6 ) /* oz*m10 | oz*m9 | oz*m8 */ + + ADDPS( XMM5, XMM4 ) /* + | + | + */ + ADDPS( XMM6, XMM4 ) /* + | + | + */ + ADDPS( XMM3, XMM4 ) /* + | + | + */ + + MOVLPS( XMM4, D(0) ) /* => D(1) | => D(0) */ + UNPCKHPS( XMM4, XMM4 ) + MOVSS( XMM4, D(2) ) + +LLBL(K_GTP3P3DR_skip): + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(K_GTP3P3DR_top) ) + +LLBL(K_GTP3P3DR_finish): + POP_L( EDI ) + POP_L( ESI ) + RET +#undef FRAME_OFFSET +#endif + +#if defined (__ELF__) && defined (__linux__) + .section .note.GNU-stack,"",%progbits +#endif diff --git a/mesalib/src/mesa/x86/sse_xform4.S b/mesalib/src/mesa/x86/sse_xform4.S index 75e0ff396..fb1fa741c 100644 --- a/mesalib/src/mesa/x86/sse_xform4.S +++ b/mesalib/src/mesa/x86/sse_xform4.S @@ -1,235 +1,235 @@ -
-/*
- * Mesa 3-D graphics library
- * Version: 3.5
- *
- * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#ifdef USE_SSE_ASM
-#include "assyntax.h"
-#include "matypes.h"
-#include "xform_args.h"
-
- SEG_TEXT
-
-#define FRAME_OFFSET 8
-
-#define SRC(i) REGOFF(i * 4, ESI)
-#define DST(i) REGOFF(i * 4, EDI)
-#define MAT(i) REGOFF(i * 4, EDX)
-
-#define SELECT(r0, r1, r2, r3) CONST( r0 * 64 + r1 * 16 + r2 * 4 + r3 )
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_sse_transform_points4_general )
-HIDDEN(_mesa_sse_transform_points4_general)
-GLNAME( _mesa_sse_transform_points4_general ):
-
- PUSH_L( ESI )
- PUSH_L( EDI )
-
- MOV_L( ARG_SOURCE, ESI )
- MOV_L( ARG_DEST, EDI )
-
- MOV_L( ARG_MATRIX, EDX )
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
-
- TEST_L( ECX, ECX ) /* verify non-zero count */
- JE( LLBL( sse_general_done ) )
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
- OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
- MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )/* set dest size */
-
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
-
- PREFETCHT0( REGIND(ESI) )
-
- MOVAPS( MAT(0), XMM4 ) /* m3 | m2 | m1 | m0 */
- MOVAPS( MAT(4), XMM5 ) /* m7 | m6 | m5 | m4 */
- MOVAPS( MAT(8), XMM6 ) /* m11 | m10 | m9 | m8 */
- MOVAPS( MAT(12), XMM7 ) /* m15 | m14 | m13 | m12 */
-
-ALIGNTEXT16
-LLBL( sse_general_loop ):
-
- MOVSS( SRC(0), XMM0 ) /* ox */
- SHUFPS( CONST(0x0), XMM0, XMM0 ) /* ox | ox | ox | ox */
- MULPS( XMM4, XMM0 ) /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
-
- MOVSS( SRC(1), XMM1 ) /* oy */
- SHUFPS( CONST(0x0), XMM1, XMM1 ) /* oy | oy | oy | oy */
- MULPS( XMM5, XMM1 ) /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
-
- MOVSS( SRC(2), XMM2 ) /* oz */
- SHUFPS( CONST(0x0), XMM2, XMM2 ) /* oz | oz | oz | oz */
- MULPS( XMM6, XMM2 ) /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
-
- MOVSS( SRC(3), XMM3 ) /* ow */
- SHUFPS( CONST(0x0), XMM3, XMM3 ) /* ow | ow | ow | ow */
- MULPS( XMM7, XMM3 ) /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
-
- ADDPS( XMM1, XMM0 ) /* ox*m3+oy*m7 | ... */
- ADDPS( XMM2, XMM0 ) /* ox*m3+oy*m7+oz*m11 | ... */
- ADDPS( XMM3, XMM0 ) /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
- MOVAPS( XMM0, DST(0) ) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
-
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
-
- DEC_L( ECX )
- JNZ( LLBL( sse_general_loop ) )
-
-LLBL( sse_general_done ):
-
- POP_L( EDI )
- POP_L( ESI )
- RET
-
-
-
-
-ALIGNTEXT4
-GLOBL GLNAME( _mesa_sse_transform_points4_3d )
-HIDDEN(_mesa_sse_transform_points4_3d)
-GLNAME( _mesa_sse_transform_points4_3d ):
-
- PUSH_L( ESI )
- PUSH_L( EDI )
-
- MOV_L( ARG_SOURCE, ESI ) /* ptr to source GLvector4f */
- MOV_L( ARG_DEST, EDI ) /* ptr to dest GLvector4f */
-
- MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
-
- TEST_L( ECX, ECX)
- JZ( LLBL(K_GTP43P3DR_finish) ) /* count was zero; go to finish */
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
- OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
- MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )/* set dest size */
-
- SHL_L( CONST(4), ECX ) /* count *= 16 */
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
-
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
- ADD_L( EDI, ECX ) /* count += dest ptr */
-
- MOVAPS( MAT(0), XMM0 ) /* m3 | m2 | m1 | m0 */
- MOVAPS( MAT(4), XMM1 ) /* m7 | m6 | m5 | m4 */
- MOVAPS( MAT(8), XMM2 ) /* m11 | m10 | m9 | m8 */
- MOVAPS( MAT(12), XMM3 ) /* m15 | m14 | m13 | m12 */
-
-ALIGNTEXT32
-LLBL( K_GTP43P3DR_top ):
- MOVSS( SRC(0), XMM4 ) /* ox */
- SHUFPS( CONST(0x0), XMM4, XMM4 ) /* ox | ox | ox | ox */
- MULPS( XMM0, XMM4 ) /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
-
- MOVSS( SRC(1), XMM5 ) /* oy */
- SHUFPS( CONST(0x0), XMM5, XMM5 ) /* oy | oy | oy | oy */
- MULPS( XMM1, XMM5 ) /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
-
- MOVSS( SRC(2), XMM6 ) /* oz */
- SHUFPS( CONST(0x0), XMM6, XMM6 ) /* oz | oz | oz | oz */
- MULPS( XMM2, XMM6 ) /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
-
- MOVSS( SRC(3), XMM7 ) /* ow */
- SHUFPS( CONST(0x0), XMM7, XMM7 ) /* ow | ow | ow | ow */
- MULPS( XMM3, XMM7 ) /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
-
- ADDPS( XMM5, XMM4 ) /* ox*m3+oy*m7 | ... */
- ADDPS( XMM6, XMM4 ) /* ox*m3+oy*m7+oz*m11 | ... */
- ADDPS( XMM7, XMM4 ) /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
- MOVAPS( XMM4, DST(0) ) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
-
- MOVSS( SRC(3), XMM4 ) /* ow */
- MOVSS( XMM4, DST(3) ) /* ->D(3) */
-
-LLBL( K_GTP43P3DR_skip ):
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(K_GTP43P3DR_top) )
-
-LLBL( K_GTP43P3DR_finish ):
- POP_L( EDI )
- POP_L( ESI )
- RET
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_sse_transform_points4_identity )
-HIDDEN(_mesa_sse_transform_points4_identity)
-GLNAME( _mesa_sse_transform_points4_identity ):
-
- PUSH_L( ESI )
- PUSH_L( EDI )
-
- MOV_L( ARG_SOURCE, ESI )
- MOV_L( ARG_DEST, EDI )
-
- MOV_L( ARG_MATRIX, EDX )
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
-
- TEST_L( ECX, ECX ) /* verify non-zero count */
- JE( LLBL( sse_identity_done ) )
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
- OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
- MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )/* set dest size */
-
- MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
- MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
-
-ALIGNTEXT16
-LLBL( sse_identity_loop ):
-
- PREFETCHNTA( REGOFF(32, ESI) )
-
- MOVAPS( REGIND(ESI), XMM0 )
- ADD_L( EAX, ESI )
-
- MOVAPS( XMM0, REGIND(EDI) )
- ADD_L( CONST(16), EDI )
-
- DEC_L( ECX )
- JNZ( LLBL( sse_identity_loop ) )
-
-LLBL( sse_identity_done ):
-
- POP_L( EDI )
- POP_L( ESI )
- RET
-#endif
-
-#if defined (__ELF__) && defined (__linux__)
- .section .note.GNU-stack,"",%progbits
-#endif
+ +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifdef USE_SSE_ASM +#include "assyntax.h" +#include "matypes.h" +#include "xform_args.h" + + SEG_TEXT + +#define FRAME_OFFSET 8 + +#define SRC(i) REGOFF(i * 4, ESI) +#define DST(i) REGOFF(i * 4, EDI) +#define MAT(i) REGOFF(i * 4, EDX) + +#define SELECT(r0, r1, r2, r3) CONST( r0 * 64 + r1 * 16 + r2 * 4 + r3 ) + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_sse_transform_points4_general ) +HIDDEN(_mesa_sse_transform_points4_general) +GLNAME( _mesa_sse_transform_points4_general ): + + PUSH_L( ESI ) + PUSH_L( EDI ) + + MOV_L( ARG_SOURCE, ESI ) + MOV_L( ARG_DEST, EDI ) + + MOV_L( ARG_MATRIX, EDX ) + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) + + TEST_L( ECX, ECX ) /* verify non-zero count */ + JE( LLBL( sse_general_done ) ) + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */ + OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */ + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */ + MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )/* set dest size */ + + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + + PREFETCHT0( REGIND(ESI) ) + + MOVAPS( MAT(0), XMM4 ) /* m3 | m2 | m1 | m0 */ + MOVAPS( MAT(4), XMM5 ) /* m7 | m6 | m5 | m4 */ + MOVAPS( MAT(8), XMM6 ) /* m11 | m10 | m9 | m8 */ + MOVAPS( MAT(12), XMM7 ) /* m15 | m14 | m13 | m12 */ + +ALIGNTEXT16 +LLBL( sse_general_loop ): + + MOVSS( SRC(0), XMM0 ) /* ox */ + SHUFPS( CONST(0x0), XMM0, XMM0 ) /* ox | ox | ox | ox */ + MULPS( XMM4, XMM0 ) /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */ + + MOVSS( SRC(1), XMM1 ) /* oy */ + SHUFPS( CONST(0x0), XMM1, XMM1 ) /* oy | oy | oy | oy */ + MULPS( XMM5, XMM1 ) /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */ + + MOVSS( SRC(2), XMM2 ) /* oz */ + SHUFPS( CONST(0x0), XMM2, XMM2 ) /* oz | oz | oz | oz */ + MULPS( XMM6, XMM2 ) /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */ + + MOVSS( SRC(3), XMM3 ) /* ow */ + SHUFPS( CONST(0x0), XMM3, XMM3 ) /* ow | ow | ow | ow */ + MULPS( XMM7, XMM3 ) /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */ + + ADDPS( XMM1, XMM0 ) /* ox*m3+oy*m7 | ... */ + ADDPS( XMM2, XMM0 ) /* ox*m3+oy*m7+oz*m11 | ... */ + ADDPS( XMM3, XMM0 ) /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */ + MOVAPS( XMM0, DST(0) ) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */ + + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + + DEC_L( ECX ) + JNZ( LLBL( sse_general_loop ) ) + +LLBL( sse_general_done ): + + POP_L( EDI ) + POP_L( ESI ) + RET + + + + +ALIGNTEXT4 +GLOBL GLNAME( _mesa_sse_transform_points4_3d ) +HIDDEN(_mesa_sse_transform_points4_3d) +GLNAME( _mesa_sse_transform_points4_3d ): + + PUSH_L( ESI ) + PUSH_L( EDI ) + + MOV_L( ARG_SOURCE, ESI ) /* ptr to source GLvector4f */ + MOV_L( ARG_DEST, EDI ) /* ptr to dest GLvector4f */ + + MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */ + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */ + + TEST_L( ECX, ECX) + JZ( LLBL(K_GTP43P3DR_finish) ) /* count was zero; go to finish */ + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */ + OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */ + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */ + MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )/* set dest size */ + + SHL_L( CONST(4), ECX ) /* count *= 16 */ + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + ADD_L( EDI, ECX ) /* count += dest ptr */ + + MOVAPS( MAT(0), XMM0 ) /* m3 | m2 | m1 | m0 */ + MOVAPS( MAT(4), XMM1 ) /* m7 | m6 | m5 | m4 */ + MOVAPS( MAT(8), XMM2 ) /* m11 | m10 | m9 | m8 */ + MOVAPS( MAT(12), XMM3 ) /* m15 | m14 | m13 | m12 */ + +ALIGNTEXT32 +LLBL( K_GTP43P3DR_top ): + MOVSS( SRC(0), XMM4 ) /* ox */ + SHUFPS( CONST(0x0), XMM4, XMM4 ) /* ox | ox | ox | ox */ + MULPS( XMM0, XMM4 ) /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */ + + MOVSS( SRC(1), XMM5 ) /* oy */ + SHUFPS( CONST(0x0), XMM5, XMM5 ) /* oy | oy | oy | oy */ + MULPS( XMM1, XMM5 ) /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */ + + MOVSS( SRC(2), XMM6 ) /* oz */ + SHUFPS( CONST(0x0), XMM6, XMM6 ) /* oz | oz | oz | oz */ + MULPS( XMM2, XMM6 ) /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */ + + MOVSS( SRC(3), XMM7 ) /* ow */ + SHUFPS( CONST(0x0), XMM7, XMM7 ) /* ow | ow | ow | ow */ + MULPS( XMM3, XMM7 ) /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */ + + ADDPS( XMM5, XMM4 ) /* ox*m3+oy*m7 | ... */ + ADDPS( XMM6, XMM4 ) /* ox*m3+oy*m7+oz*m11 | ... */ + ADDPS( XMM7, XMM4 ) /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */ + MOVAPS( XMM4, DST(0) ) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */ + + MOVSS( SRC(3), XMM4 ) /* ow */ + MOVSS( XMM4, DST(3) ) /* ->D(3) */ + +LLBL( K_GTP43P3DR_skip ): + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(K_GTP43P3DR_top) ) + +LLBL( K_GTP43P3DR_finish ): + POP_L( EDI ) + POP_L( ESI ) + RET + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_sse_transform_points4_identity ) +HIDDEN(_mesa_sse_transform_points4_identity) +GLNAME( _mesa_sse_transform_points4_identity ): + + PUSH_L( ESI ) + PUSH_L( EDI ) + + MOV_L( ARG_SOURCE, ESI ) + MOV_L( ARG_DEST, EDI ) + + MOV_L( ARG_MATRIX, EDX ) + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) + + TEST_L( ECX, ECX ) /* verify non-zero count */ + JE( LLBL( sse_identity_done ) ) + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */ + OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */ + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */ + MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )/* set dest size */ + + MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */ + MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */ + +ALIGNTEXT16 +LLBL( sse_identity_loop ): + + PREFETCHNTA( REGOFF(32, ESI) ) + + MOVAPS( REGIND(ESI), XMM0 ) + ADD_L( EAX, ESI ) + + MOVAPS( XMM0, REGIND(EDI) ) + ADD_L( CONST(16), EDI ) + + DEC_L( ECX ) + JNZ( LLBL( sse_identity_loop ) ) + +LLBL( sse_identity_done ): + + POP_L( EDI ) + POP_L( ESI ) + RET +#endif + +#if defined (__ELF__) && defined (__linux__) + .section .note.GNU-stack,"",%progbits +#endif diff --git a/mesalib/src/mesa/x86/x86_cliptest.S b/mesalib/src/mesa/x86/x86_cliptest.S index 8556a887a..e413aee61 100644 --- a/mesalib/src/mesa/x86/x86_cliptest.S +++ b/mesalib/src/mesa/x86/x86_cliptest.S @@ -1,407 +1,407 @@ -
-/*
- * Mesa 3-D graphics library
- * Version: 3.5
- *
- * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * NOTE: Avoid using spaces in between '(' ')' and arguments, especially
- * with macros like CONST, LLBL that expand to CONCAT(...). Putting spaces
- * in there will break the build on some platforms.
- */
-
-#include "assyntax.h"
-#include "matypes.h"
-#include "clip_args.h"
-
-#define SRC0 REGOFF(0, ESI)
-#define SRC1 REGOFF(4, ESI)
-#define SRC2 REGOFF(8, ESI)
-#define SRC3 REGOFF(12, ESI)
-#define DST0 REGOFF(0, EDI)
-#define DST1 REGOFF(4, EDI)
-#define DST2 REGOFF(8, EDI)
-#define DST3 REGOFF(12, EDI)
-#define MAT0 REGOFF(0, EDX)
-#define MAT1 REGOFF(4, EDX)
-#define MAT2 REGOFF(8, EDX)
-#define MAT3 REGOFF(12, EDX)
-
-
-/*
- * Table for clip test.
- *
- * bit6 = SRC3 < 0
- * bit5 = SRC2 < 0
- * bit4 = abs(S(2)) > abs(S(3))
- * bit3 = SRC1 < 0
- * bit2 = abs(S(1)) > abs(S(3))
- * bit1 = SRC0 < 0
- * bit0 = abs(S(0)) > abs(S(3))
- */
-
- SEG_DATA
-
-clip_table:
- D_BYTE 0x00, 0x01, 0x00, 0x02, 0x04, 0x05, 0x04, 0x06
- D_BYTE 0x00, 0x01, 0x00, 0x02, 0x08, 0x09, 0x08, 0x0a
- D_BYTE 0x20, 0x21, 0x20, 0x22, 0x24, 0x25, 0x24, 0x26
- D_BYTE 0x20, 0x21, 0x20, 0x22, 0x28, 0x29, 0x28, 0x2a
- D_BYTE 0x00, 0x01, 0x00, 0x02, 0x04, 0x05, 0x04, 0x06
- D_BYTE 0x00, 0x01, 0x00, 0x02, 0x08, 0x09, 0x08, 0x0a
- D_BYTE 0x10, 0x11, 0x10, 0x12, 0x14, 0x15, 0x14, 0x16
- D_BYTE 0x10, 0x11, 0x10, 0x12, 0x18, 0x19, 0x18, 0x1a
- D_BYTE 0x3f, 0x3d, 0x3f, 0x3e, 0x37, 0x35, 0x37, 0x36
- D_BYTE 0x3f, 0x3d, 0x3f, 0x3e, 0x3b, 0x39, 0x3b, 0x3a
- D_BYTE 0x2f, 0x2d, 0x2f, 0x2e, 0x27, 0x25, 0x27, 0x26
- D_BYTE 0x2f, 0x2d, 0x2f, 0x2e, 0x2b, 0x29, 0x2b, 0x2a
- D_BYTE 0x3f, 0x3d, 0x3f, 0x3e, 0x37, 0x35, 0x37, 0x36
- D_BYTE 0x3f, 0x3d, 0x3f, 0x3e, 0x3b, 0x39, 0x3b, 0x3a
- D_BYTE 0x1f, 0x1d, 0x1f, 0x1e, 0x17, 0x15, 0x17, 0x16
- D_BYTE 0x1f, 0x1d, 0x1f, 0x1e, 0x1b, 0x19, 0x1b, 0x1a
-
-
- SEG_TEXT
-
-/*
- * _mesa_x86_cliptest_points4
- *
- * AL: ormask
- * AH: andmask
- * EBX: temp0
- * ECX: temp1
- * EDX: clipmask[]
- * ESI: clip[]
- * EDI: proj[]
- * EBP: temp2
- */
-
-#if defined(__ELF__) && defined(__PIC__) && defined(GNU_ASSEMBLER) && !defined(ELFPIC)
-#define ELFPIC
-#endif
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_x86_cliptest_points4 )
-HIDDEN(_mesa_x86_cliptest_points4)
-GLNAME( _mesa_x86_cliptest_points4 ):
-
-#ifdef ELFPIC
-#define FRAME_OFFSET 20
-#else
-#define FRAME_OFFSET 16
-#endif
- PUSH_L( ESI )
- PUSH_L( EDI )
- PUSH_L( EBP )
- PUSH_L( EBX )
-
-#ifdef ELFPIC
- /* store pointer to clip_table on stack */
- CALL( LLBL(ctp4_get_eip) )
- ADD_L( CONST(_GLOBAL_OFFSET_TABLE_), EBX )
- MOV_L( REGOFF(clip_table@GOT, EBX), EBX )
- PUSH_L( EBX )
- JMP( LLBL(ctp4_clip_table_ready) )
-
-LLBL(ctp4_get_eip):
- /* store eip in ebx */
- MOV_L( REGIND(ESP), EBX )
- RET
-
-LLBL(ctp4_clip_table_ready):
-#endif
-
- MOV_L( ARG_SOURCE, ESI )
- MOV_L( ARG_DEST, EDI )
-
- MOV_L( ARG_CLIP, EDX )
- MOV_L( ARG_OR, EBX )
-
- MOV_L( ARG_AND, EBP )
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
-
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
- MOV_L( REGOFF(V4F_START, ESI), ESI )
-
- OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
- MOV_L( EAX, ARG_SOURCE ) /* put stride in ARG_SOURCE */
-
- MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
-
- MOV_L( REGOFF(V4F_START, EDI), EDI )
- ADD_L( EDX, ECX )
-
- MOV_L( ECX, ARG_CLIP ) /* put clipmask + count in ARG_CLIP */
- CMP_L( ECX, EDX )
-
- MOV_B( REGIND(EBX), AL )
- MOV_B( REGIND(EBP), AH )
-
- JZ( LLBL(ctp4_finish) )
-
-ALIGNTEXT16
-LLBL(ctp4_top):
-
- FLD1 /* F3 */
- FDIV_S( SRC3 ) /* GH: don't care about div-by-zero */
-
- MOV_L( SRC3, EBP )
- MOV_L( SRC2, EBX )
-
- XOR_L( ECX, ECX )
- ADD_L( EBP, EBP ) /* ebp = abs(S(3))*2 ; carry = sign of S(3) */
-
- ADC_L( ECX, ECX )
- ADD_L( EBX, EBX ) /* ebx = abs(S(2))*2 ; carry = sign of S(2) */
-
- ADC_L( ECX, ECX )
- CMP_L( EBX, EBP ) /* carry = abs(S(2))*2 > abs(S(3))*2 */
-
- ADC_L( ECX, ECX )
- MOV_L( SRC1, EBX )
-
- ADD_L( EBX, EBX ) /* ebx = abs(S(1))*2 ; carry = sign of S(1) */
-
- ADC_L( ECX, ECX )
- CMP_L( EBX, EBP ) /* carry = abs(S(1))*2 > abs(S(3))*2 */
-
- ADC_L( ECX, ECX )
- MOV_L( SRC0, EBX )
-
- ADD_L( EBX, EBX ) /* ebx = abs(S(0))*2 ; carry = sign of S(0) */
-
- ADC_L( ECX, ECX )
- CMP_L( EBX, EBP ) /* carry = abs(S(0))*2 > abs(S(3))*2 */
-
- ADC_L( ECX, ECX )
-
-#ifdef ELFPIC
- MOV_L( REGIND(ESP), EBP ) /* clip_table */
-
- MOV_B( REGBI(EBP, ECX), CL )
-#else
- MOV_B( REGOFF(clip_table,ECX), CL )
-#endif
-
- OR_B( CL, AL )
- AND_B( CL, AH )
-
- TEST_B( CL, CL )
- MOV_B( CL, REGIND(EDX) )
-
- JZ( LLBL(ctp4_proj) )
-
-LLBL(ctp4_noproj):
-
- FSTP( ST(0) ) /* */
-
- MOV_L( CONST(0), DST0 )
- MOV_L( CONST(0), DST1 )
- MOV_L( CONST(0), DST2 )
- MOV_L( CONST(0x3f800000), DST3 )
-
- JMP( LLBL(ctp4_next) )
-
-LLBL(ctp4_proj):
-
- FLD_S( SRC0 ) /* F0 F3 */
- FMUL2( ST(1), ST0 )
-
- FLD_S( SRC1 ) /* F1 F0 F3 */
- FMUL2( ST(2), ST0 )
-
- FLD_S( SRC2 ) /* F2 F1 F0 F3 */
- FMUL2( ST(3), ST0 )
-
- FXCH( ST(2) ) /* F0 F1 F2 F3 */
- FSTP_S( DST0 ) /* F1 F2 F3 */
- FSTP_S( DST1 ) /* F2 F3 */
- FSTP_S( DST2 ) /* F3 */
- FSTP_S( DST3 ) /* */
-
-LLBL(ctp4_next):
-
- INC_L( EDX )
- ADD_L( CONST(16), EDI )
-
- ADD_L( ARG_SOURCE, ESI )
- CMP_L( EDX, ARG_CLIP )
-
- JNZ( LLBL(ctp4_top) )
-
- MOV_L( ARG_OR, ECX )
- MOV_L( ARG_AND, EDX )
-
- MOV_B( AL, REGIND(ECX) )
- MOV_B( AH, REGIND(EDX) )
-
-LLBL(ctp4_finish):
-
- MOV_L( ARG_DEST, EAX )
-#ifdef ELFPIC
- POP_L( ESI ) /* discard ptr to clip_table */
-#endif
- POP_L( EBX )
- POP_L( EBP )
- POP_L( EDI )
- POP_L( ESI )
-
- RET
-
-
-
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_x86_cliptest_points4_np )
-HIDDEN(_mesa_x86_cliptest_points4_np)
-GLNAME( _mesa_x86_cliptest_points4_np ):
-
-#ifdef ELFPIC
-#define FRAME_OFFSET 20
-#else
-#define FRAME_OFFSET 16
-#endif
- PUSH_L( ESI )
- PUSH_L( EDI )
- PUSH_L( EBP )
- PUSH_L( EBX )
-
-#ifdef ELFPIC
- /* store pointer to clip_table on stack */
- CALL( LLBL(ctp4_np_get_eip) )
- ADD_L( CONST(_GLOBAL_OFFSET_TABLE_), EBX )
- MOV_L( REGOFF(clip_table@GOT, EBX), EBX )
- PUSH_L( EBX )
- JMP( LLBL(ctp4_np_clip_table_ready) )
-
-LLBL(ctp4_np_get_eip):
- /* store eip in ebx */
- MOV_L( REGIND(ESP), EBX )
- RET
-
-LLBL(ctp4_np_clip_table_ready):
-#endif
-
- MOV_L( ARG_SOURCE, ESI )
- /* slot */
-
- MOV_L( ARG_CLIP, EDX )
- MOV_L( ARG_OR, EBX )
-
- MOV_L( ARG_AND, EBP )
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
-
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
- MOV_L( REGOFF(V4F_START, ESI), ESI )
-
- MOV_L( EAX, ARG_DEST ) /* put stride in ARG_DEST */
- ADD_L( EDX, ECX )
-
- MOV_L( ECX, EDI ) /* put clipmask + count in EDI */
- CMP_L( ECX, EDX )
-
- MOV_B( REGIND(EBX), AL )
- MOV_B( REGIND(EBP), AH )
-
- JZ( LLBL(ctp4_np_finish) )
-
-ALIGNTEXT16
-LLBL(ctp4_np_top):
-
- MOV_L( SRC3, EBP )
- MOV_L( SRC2, EBX )
-
- XOR_L( ECX, ECX )
- ADD_L( EBP, EBP ) /* ebp = abs(S(3))*2 ; carry = sign of S(3) */
-
- ADC_L( ECX, ECX )
- ADD_L( EBX, EBX ) /* ebx = abs(S(2))*2 ; carry = sign of S(2) */
-
- ADC_L( ECX, ECX )
- CMP_L( EBX, EBP ) /* carry = abs(S(2))*2 > abs(S(3))*2 */
-
- ADC_L( ECX, ECX )
- MOV_L( SRC1, EBX )
-
- ADD_L( EBX, EBX ) /* ebx = abs(S(1))*2 ; carry = sign of S(1) */
-
- ADC_L( ECX, ECX )
- CMP_L( EBX, EBP ) /* carry = abs(S(1))*2 > abs(S(3))*2 */
-
- ADC_L( ECX, ECX )
- MOV_L( SRC0, EBX )
-
- ADD_L( EBX, EBX ) /* ebx = abs(S(0))*2 ; carry = sign of S(0) */
-
- ADC_L( ECX, ECX )
- CMP_L( EBX, EBP ) /* carry = abs(S(0))*2 > abs(S(3))*2 */
-
- ADC_L( ECX, ECX )
-
-#ifdef ELFPIC
- MOV_L( REGIND(ESP), EBP ) /* clip_table */
-
- MOV_B( REGBI(EBP, ECX), CL )
-#else
- MOV_B( REGOFF(clip_table,ECX), CL )
-#endif
-
- OR_B( CL, AL )
- AND_B( CL, AH )
-
- TEST_B( CL, CL )
- MOV_B( CL, REGIND(EDX) )
-
- INC_L( EDX )
- /* slot */
-
- ADD_L( ARG_DEST, ESI )
- CMP_L( EDX, EDI )
-
- JNZ( LLBL(ctp4_np_top) )
-
- MOV_L( ARG_OR, ECX )
- MOV_L( ARG_AND, EDX )
-
- MOV_B( AL, REGIND(ECX) )
- MOV_B( AH, REGIND(EDX) )
-
-LLBL(ctp4_np_finish):
-
- MOV_L( ARG_SOURCE, EAX )
-#ifdef ELFPIC
- POP_L( ESI ) /* discard ptr to clip_table */
-#endif
- POP_L( EBX )
- POP_L( EBP )
- POP_L( EDI )
- POP_L( ESI )
-
- RET
-
-#if defined (__ELF__) && defined (__linux__)
- .section .note.GNU-stack,"",%progbits
-#endif
+ +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * NOTE: Avoid using spaces in between '(' ')' and arguments, especially + * with macros like CONST, LLBL that expand to CONCAT(...). Putting spaces + * in there will break the build on some platforms. + */ + +#include "assyntax.h" +#include "matypes.h" +#include "clip_args.h" + +#define SRC0 REGOFF(0, ESI) +#define SRC1 REGOFF(4, ESI) +#define SRC2 REGOFF(8, ESI) +#define SRC3 REGOFF(12, ESI) +#define DST0 REGOFF(0, EDI) +#define DST1 REGOFF(4, EDI) +#define DST2 REGOFF(8, EDI) +#define DST3 REGOFF(12, EDI) +#define MAT0 REGOFF(0, EDX) +#define MAT1 REGOFF(4, EDX) +#define MAT2 REGOFF(8, EDX) +#define MAT3 REGOFF(12, EDX) + + +/* + * Table for clip test. + * + * bit6 = SRC3 < 0 + * bit5 = SRC2 < 0 + * bit4 = abs(S(2)) > abs(S(3)) + * bit3 = SRC1 < 0 + * bit2 = abs(S(1)) > abs(S(3)) + * bit1 = SRC0 < 0 + * bit0 = abs(S(0)) > abs(S(3)) + */ + + SEG_DATA + +clip_table: + D_BYTE 0x00, 0x01, 0x00, 0x02, 0x04, 0x05, 0x04, 0x06 + D_BYTE 0x00, 0x01, 0x00, 0x02, 0x08, 0x09, 0x08, 0x0a + D_BYTE 0x20, 0x21, 0x20, 0x22, 0x24, 0x25, 0x24, 0x26 + D_BYTE 0x20, 0x21, 0x20, 0x22, 0x28, 0x29, 0x28, 0x2a + D_BYTE 0x00, 0x01, 0x00, 0x02, 0x04, 0x05, 0x04, 0x06 + D_BYTE 0x00, 0x01, 0x00, 0x02, 0x08, 0x09, 0x08, 0x0a + D_BYTE 0x10, 0x11, 0x10, 0x12, 0x14, 0x15, 0x14, 0x16 + D_BYTE 0x10, 0x11, 0x10, 0x12, 0x18, 0x19, 0x18, 0x1a + D_BYTE 0x3f, 0x3d, 0x3f, 0x3e, 0x37, 0x35, 0x37, 0x36 + D_BYTE 0x3f, 0x3d, 0x3f, 0x3e, 0x3b, 0x39, 0x3b, 0x3a + D_BYTE 0x2f, 0x2d, 0x2f, 0x2e, 0x27, 0x25, 0x27, 0x26 + D_BYTE 0x2f, 0x2d, 0x2f, 0x2e, 0x2b, 0x29, 0x2b, 0x2a + D_BYTE 0x3f, 0x3d, 0x3f, 0x3e, 0x37, 0x35, 0x37, 0x36 + D_BYTE 0x3f, 0x3d, 0x3f, 0x3e, 0x3b, 0x39, 0x3b, 0x3a + D_BYTE 0x1f, 0x1d, 0x1f, 0x1e, 0x17, 0x15, 0x17, 0x16 + D_BYTE 0x1f, 0x1d, 0x1f, 0x1e, 0x1b, 0x19, 0x1b, 0x1a + + + SEG_TEXT + +/* + * _mesa_x86_cliptest_points4 + * + * AL: ormask + * AH: andmask + * EBX: temp0 + * ECX: temp1 + * EDX: clipmask[] + * ESI: clip[] + * EDI: proj[] + * EBP: temp2 + */ + +#if defined(__ELF__) && defined(__PIC__) && defined(GNU_ASSEMBLER) && !defined(ELFPIC) +#define ELFPIC +#endif + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_x86_cliptest_points4 ) +HIDDEN(_mesa_x86_cliptest_points4) +GLNAME( _mesa_x86_cliptest_points4 ): + +#ifdef ELFPIC +#define FRAME_OFFSET 20 +#else +#define FRAME_OFFSET 16 +#endif + PUSH_L( ESI ) + PUSH_L( EDI ) + PUSH_L( EBP ) + PUSH_L( EBX ) + +#ifdef ELFPIC + /* store pointer to clip_table on stack */ + CALL( LLBL(ctp4_get_eip) ) + ADD_L( CONST(_GLOBAL_OFFSET_TABLE_), EBX ) + MOV_L( REGOFF(clip_table@GOT, EBX), EBX ) + PUSH_L( EBX ) + JMP( LLBL(ctp4_clip_table_ready) ) + +LLBL(ctp4_get_eip): + /* store eip in ebx */ + MOV_L( REGIND(ESP), EBX ) + RET + +LLBL(ctp4_clip_table_ready): +#endif + + MOV_L( ARG_SOURCE, ESI ) + MOV_L( ARG_DEST, EDI ) + + MOV_L( ARG_CLIP, EDX ) + MOV_L( ARG_OR, EBX ) + + MOV_L( ARG_AND, EBP ) + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) + + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) + MOV_L( REGOFF(V4F_START, ESI), ESI ) + + OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) + MOV_L( EAX, ARG_SOURCE ) /* put stride in ARG_SOURCE */ + + MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) + + MOV_L( REGOFF(V4F_START, EDI), EDI ) + ADD_L( EDX, ECX ) + + MOV_L( ECX, ARG_CLIP ) /* put clipmask + count in ARG_CLIP */ + CMP_L( ECX, EDX ) + + MOV_B( REGIND(EBX), AL ) + MOV_B( REGIND(EBP), AH ) + + JZ( LLBL(ctp4_finish) ) + +ALIGNTEXT16 +LLBL(ctp4_top): + + FLD1 /* F3 */ + FDIV_S( SRC3 ) /* GH: don't care about div-by-zero */ + + MOV_L( SRC3, EBP ) + MOV_L( SRC2, EBX ) + + XOR_L( ECX, ECX ) + ADD_L( EBP, EBP ) /* ebp = abs(S(3))*2 ; carry = sign of S(3) */ + + ADC_L( ECX, ECX ) + ADD_L( EBX, EBX ) /* ebx = abs(S(2))*2 ; carry = sign of S(2) */ + + ADC_L( ECX, ECX ) + CMP_L( EBX, EBP ) /* carry = abs(S(2))*2 > abs(S(3))*2 */ + + ADC_L( ECX, ECX ) + MOV_L( SRC1, EBX ) + + ADD_L( EBX, EBX ) /* ebx = abs(S(1))*2 ; carry = sign of S(1) */ + + ADC_L( ECX, ECX ) + CMP_L( EBX, EBP ) /* carry = abs(S(1))*2 > abs(S(3))*2 */ + + ADC_L( ECX, ECX ) + MOV_L( SRC0, EBX ) + + ADD_L( EBX, EBX ) /* ebx = abs(S(0))*2 ; carry = sign of S(0) */ + + ADC_L( ECX, ECX ) + CMP_L( EBX, EBP ) /* carry = abs(S(0))*2 > abs(S(3))*2 */ + + ADC_L( ECX, ECX ) + +#ifdef ELFPIC + MOV_L( REGIND(ESP), EBP ) /* clip_table */ + + MOV_B( REGBI(EBP, ECX), CL ) +#else + MOV_B( REGOFF(clip_table,ECX), CL ) +#endif + + OR_B( CL, AL ) + AND_B( CL, AH ) + + TEST_B( CL, CL ) + MOV_B( CL, REGIND(EDX) ) + + JZ( LLBL(ctp4_proj) ) + +LLBL(ctp4_noproj): + + FSTP( ST(0) ) /* */ + + MOV_L( CONST(0), DST0 ) + MOV_L( CONST(0), DST1 ) + MOV_L( CONST(0), DST2 ) + MOV_L( CONST(0x3f800000), DST3 ) + + JMP( LLBL(ctp4_next) ) + +LLBL(ctp4_proj): + + FLD_S( SRC0 ) /* F0 F3 */ + FMUL2( ST(1), ST0 ) + + FLD_S( SRC1 ) /* F1 F0 F3 */ + FMUL2( ST(2), ST0 ) + + FLD_S( SRC2 ) /* F2 F1 F0 F3 */ + FMUL2( ST(3), ST0 ) + + FXCH( ST(2) ) /* F0 F1 F2 F3 */ + FSTP_S( DST0 ) /* F1 F2 F3 */ + FSTP_S( DST1 ) /* F2 F3 */ + FSTP_S( DST2 ) /* F3 */ + FSTP_S( DST3 ) /* */ + +LLBL(ctp4_next): + + INC_L( EDX ) + ADD_L( CONST(16), EDI ) + + ADD_L( ARG_SOURCE, ESI ) + CMP_L( EDX, ARG_CLIP ) + + JNZ( LLBL(ctp4_top) ) + + MOV_L( ARG_OR, ECX ) + MOV_L( ARG_AND, EDX ) + + MOV_B( AL, REGIND(ECX) ) + MOV_B( AH, REGIND(EDX) ) + +LLBL(ctp4_finish): + + MOV_L( ARG_DEST, EAX ) +#ifdef ELFPIC + POP_L( ESI ) /* discard ptr to clip_table */ +#endif + POP_L( EBX ) + POP_L( EBP ) + POP_L( EDI ) + POP_L( ESI ) + + RET + + + + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_x86_cliptest_points4_np ) +HIDDEN(_mesa_x86_cliptest_points4_np) +GLNAME( _mesa_x86_cliptest_points4_np ): + +#ifdef ELFPIC +#define FRAME_OFFSET 20 +#else +#define FRAME_OFFSET 16 +#endif + PUSH_L( ESI ) + PUSH_L( EDI ) + PUSH_L( EBP ) + PUSH_L( EBX ) + +#ifdef ELFPIC + /* store pointer to clip_table on stack */ + CALL( LLBL(ctp4_np_get_eip) ) + ADD_L( CONST(_GLOBAL_OFFSET_TABLE_), EBX ) + MOV_L( REGOFF(clip_table@GOT, EBX), EBX ) + PUSH_L( EBX ) + JMP( LLBL(ctp4_np_clip_table_ready) ) + +LLBL(ctp4_np_get_eip): + /* store eip in ebx */ + MOV_L( REGIND(ESP), EBX ) + RET + +LLBL(ctp4_np_clip_table_ready): +#endif + + MOV_L( ARG_SOURCE, ESI ) + /* slot */ + + MOV_L( ARG_CLIP, EDX ) + MOV_L( ARG_OR, EBX ) + + MOV_L( ARG_AND, EBP ) + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) + + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) + MOV_L( REGOFF(V4F_START, ESI), ESI ) + + MOV_L( EAX, ARG_DEST ) /* put stride in ARG_DEST */ + ADD_L( EDX, ECX ) + + MOV_L( ECX, EDI ) /* put clipmask + count in EDI */ + CMP_L( ECX, EDX ) + + MOV_B( REGIND(EBX), AL ) + MOV_B( REGIND(EBP), AH ) + + JZ( LLBL(ctp4_np_finish) ) + +ALIGNTEXT16 +LLBL(ctp4_np_top): + + MOV_L( SRC3, EBP ) + MOV_L( SRC2, EBX ) + + XOR_L( ECX, ECX ) + ADD_L( EBP, EBP ) /* ebp = abs(S(3))*2 ; carry = sign of S(3) */ + + ADC_L( ECX, ECX ) + ADD_L( EBX, EBX ) /* ebx = abs(S(2))*2 ; carry = sign of S(2) */ + + ADC_L( ECX, ECX ) + CMP_L( EBX, EBP ) /* carry = abs(S(2))*2 > abs(S(3))*2 */ + + ADC_L( ECX, ECX ) + MOV_L( SRC1, EBX ) + + ADD_L( EBX, EBX ) /* ebx = abs(S(1))*2 ; carry = sign of S(1) */ + + ADC_L( ECX, ECX ) + CMP_L( EBX, EBP ) /* carry = abs(S(1))*2 > abs(S(3))*2 */ + + ADC_L( ECX, ECX ) + MOV_L( SRC0, EBX ) + + ADD_L( EBX, EBX ) /* ebx = abs(S(0))*2 ; carry = sign of S(0) */ + + ADC_L( ECX, ECX ) + CMP_L( EBX, EBP ) /* carry = abs(S(0))*2 > abs(S(3))*2 */ + + ADC_L( ECX, ECX ) + +#ifdef ELFPIC + MOV_L( REGIND(ESP), EBP ) /* clip_table */ + + MOV_B( REGBI(EBP, ECX), CL ) +#else + MOV_B( REGOFF(clip_table,ECX), CL ) +#endif + + OR_B( CL, AL ) + AND_B( CL, AH ) + + TEST_B( CL, CL ) + MOV_B( CL, REGIND(EDX) ) + + INC_L( EDX ) + /* slot */ + + ADD_L( ARG_DEST, ESI ) + CMP_L( EDX, EDI ) + + JNZ( LLBL(ctp4_np_top) ) + + MOV_L( ARG_OR, ECX ) + MOV_L( ARG_AND, EDX ) + + MOV_B( AL, REGIND(ECX) ) + MOV_B( AH, REGIND(EDX) ) + +LLBL(ctp4_np_finish): + + MOV_L( ARG_SOURCE, EAX ) +#ifdef ELFPIC + POP_L( ESI ) /* discard ptr to clip_table */ +#endif + POP_L( EBX ) + POP_L( EBP ) + POP_L( EDI ) + POP_L( ESI ) + + RET + +#if defined (__ELF__) && defined (__linux__) + .section .note.GNU-stack,"",%progbits +#endif diff --git a/mesalib/src/mesa/x86/x86_xform.c b/mesalib/src/mesa/x86/x86_xform.c index 57b10c262..3dcc55e16 100644 --- a/mesalib/src/mesa/x86/x86_xform.c +++ b/mesalib/src/mesa/x86/x86_xform.c @@ -1,126 +1,126 @@ -
-/*
- * Mesa 3-D graphics library
- * Version: 3.5
- *
- * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * Intel x86 assembly code by Josh Vanderhoof
- */
-
-#include "main/glheader.h"
-#include "main/context.h"
-#include "math/m_xform.h"
-
-#include "x86_xform.h"
-#include "common_x86_asm.h"
-
-#ifdef USE_X86_ASM
-#ifdef USE_3DNOW_ASM
-#include "3dnow.h"
-#endif
-#ifdef USE_SSE_ASM
-#include "sse.h"
-#endif
-#endif
-
-#ifdef DEBUG_MATH
-#include "math/m_debug.h"
-#endif
-
-
-#ifdef USE_X86_ASM
-DECLARE_XFORM_GROUP( x86, 2 )
-DECLARE_XFORM_GROUP( x86, 3 )
-DECLARE_XFORM_GROUP( x86, 4 )
-
-
-extern GLvector4f * _ASMAPI
-_mesa_x86_cliptest_points4( GLvector4f *clip_vec,
- GLvector4f *proj_vec,
- GLubyte clipMask[],
- GLubyte *orMask,
- GLubyte *andMask,
- GLboolean viewport_z_clip );
-
-extern GLvector4f * _ASMAPI
-_mesa_x86_cliptest_points4_np( GLvector4f *clip_vec,
- GLvector4f *proj_vec,
- GLubyte clipMask[],
- GLubyte *orMask,
- GLubyte *andMask,
- GLboolean viewport_z_clip );
-
-extern void _ASMAPI
-_mesa_v16_x86_cliptest_points4( GLfloat *first_vert,
- GLfloat *last_vert,
- GLubyte *or_mask,
- GLubyte *and_mask,
- GLubyte *clip_mask,
- GLboolean viewport_z_clip );
-
-extern void _ASMAPI
-_mesa_v16_x86_general_xform( GLfloat *dest,
- const GLfloat *m,
- const GLfloat *src,
- GLuint src_stride,
- GLuint count );
-#endif
-
-
-#ifdef USE_X86_ASM
-static void _mesa_init_x86_transform_asm( void )
-{
- ASSIGN_XFORM_GROUP( x86, 2 );
- ASSIGN_XFORM_GROUP( x86, 3 );
- ASSIGN_XFORM_GROUP( x86, 4 );
-
- _mesa_clip_tab[4] = _mesa_x86_cliptest_points4;
- _mesa_clip_np_tab[4] = _mesa_x86_cliptest_points4_np;
-
-#ifdef DEBUG_MATH
- _math_test_all_transform_functions( "x86" );
- _math_test_all_cliptest_functions( "x86" );
-#endif
-}
-#endif
-
-
-void _mesa_init_all_x86_transform_asm( void )
-{
- _mesa_get_x86_features();
-
-#ifdef USE_X86_ASM
- if ( _mesa_x86_cpu_features ) {
- _mesa_init_x86_transform_asm();
- }
-
- if (cpu_has_3dnow) {
- _mesa_init_3dnow_transform_asm();
- }
-
- if ( cpu_has_xmm ) {
- _mesa_init_sse_transform_asm();
- }
-
-#endif
-}
+ +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * Intel x86 assembly code by Josh Vanderhoof + */ + +#include "main/glheader.h" +#include "main/context.h" +#include "math/m_xform.h" + +#include "x86_xform.h" +#include "common_x86_asm.h" + +#ifdef USE_X86_ASM +#ifdef USE_3DNOW_ASM +#include "3dnow.h" +#endif +#ifdef USE_SSE_ASM +#include "sse.h" +#endif +#endif + +#ifdef DEBUG_MATH +#include "math/m_debug.h" +#endif + + +#ifdef USE_X86_ASM +DECLARE_XFORM_GROUP( x86, 2 ) +DECLARE_XFORM_GROUP( x86, 3 ) +DECLARE_XFORM_GROUP( x86, 4 ) + + +extern GLvector4f * _ASMAPI +_mesa_x86_cliptest_points4( GLvector4f *clip_vec, + GLvector4f *proj_vec, + GLubyte clipMask[], + GLubyte *orMask, + GLubyte *andMask, + GLboolean viewport_z_clip ); + +extern GLvector4f * _ASMAPI +_mesa_x86_cliptest_points4_np( GLvector4f *clip_vec, + GLvector4f *proj_vec, + GLubyte clipMask[], + GLubyte *orMask, + GLubyte *andMask, + GLboolean viewport_z_clip ); + +extern void _ASMAPI +_mesa_v16_x86_cliptest_points4( GLfloat *first_vert, + GLfloat *last_vert, + GLubyte *or_mask, + GLubyte *and_mask, + GLubyte *clip_mask, + GLboolean viewport_z_clip ); + +extern void _ASMAPI +_mesa_v16_x86_general_xform( GLfloat *dest, + const GLfloat *m, + const GLfloat *src, + GLuint src_stride, + GLuint count ); +#endif + + +#ifdef USE_X86_ASM +static void _mesa_init_x86_transform_asm( void ) +{ + ASSIGN_XFORM_GROUP( x86, 2 ); + ASSIGN_XFORM_GROUP( x86, 3 ); + ASSIGN_XFORM_GROUP( x86, 4 ); + + _mesa_clip_tab[4] = _mesa_x86_cliptest_points4; + _mesa_clip_np_tab[4] = _mesa_x86_cliptest_points4_np; + +#ifdef DEBUG_MATH + _math_test_all_transform_functions( "x86" ); + _math_test_all_cliptest_functions( "x86" ); +#endif +} +#endif + + +void _mesa_init_all_x86_transform_asm( void ) +{ + _mesa_get_x86_features(); + +#ifdef USE_X86_ASM + if ( _mesa_x86_cpu_features ) { + _mesa_init_x86_transform_asm(); + } + + if (cpu_has_3dnow) { + _mesa_init_3dnow_transform_asm(); + } + + if ( cpu_has_xmm ) { + _mesa_init_sse_transform_asm(); + } + +#endif +} diff --git a/mesalib/src/mesa/x86/x86_xform.h b/mesalib/src/mesa/x86/x86_xform.h index 055fd6a1f..e886d9add 100644 --- a/mesalib/src/mesa/x86/x86_xform.h +++ b/mesalib/src/mesa/x86/x86_xform.h @@ -1,106 +1,106 @@ -
-/*
- * Mesa 3-D graphics library
- * Version: 3.5
- *
- * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- * Gareth Hughes
- */
-
-#ifndef X86_XFORM_H
-#define X86_XFORM_H
-
-
-/* =============================================================
- * Transformation function declarations:
- */
-
-#define XFORM_ARGS GLvector4f *to_vec, \
- const GLfloat m[16], \
- const GLvector4f *from_vec
-
-#define DECLARE_XFORM_GROUP( pfx, sz ) \
-extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_general( XFORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_identity( XFORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_3d_no_rot( XFORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_perspective( XFORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_2d( XFORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_2d_no_rot( XFORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_3d( XFORM_ARGS );
-
-#define ASSIGN_XFORM_GROUP( pfx, sz ) \
- _mesa_transform_tab[sz][MATRIX_GENERAL] = \
- _mesa_##pfx##_transform_points##sz##_general; \
- _mesa_transform_tab[sz][MATRIX_IDENTITY] = \
- _mesa_##pfx##_transform_points##sz##_identity; \
- _mesa_transform_tab[sz][MATRIX_3D_NO_ROT] = \
- _mesa_##pfx##_transform_points##sz##_3d_no_rot; \
- _mesa_transform_tab[sz][MATRIX_PERSPECTIVE] = \
- _mesa_##pfx##_transform_points##sz##_perspective; \
- _mesa_transform_tab[sz][MATRIX_2D] = \
- _mesa_##pfx##_transform_points##sz##_2d; \
- _mesa_transform_tab[sz][MATRIX_2D_NO_ROT] = \
- _mesa_##pfx##_transform_points##sz##_2d_no_rot; \
- _mesa_transform_tab[sz][MATRIX_3D] = \
- _mesa_##pfx##_transform_points##sz##_3d;
-
-
-/* =============================================================
- * Normal transformation function declarations:
- */
-
-#define NORM_ARGS const GLmatrix *mat, \
- GLfloat scale, \
- const GLvector4f *in, \
- const GLfloat *lengths, \
- GLvector4f *dest
-
-#define DECLARE_NORM_GROUP( pfx ) \
-extern void _ASMAPI _mesa_##pfx##_rescale_normals( NORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_normalize_normals( NORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_normals( NORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_normals_no_rot( NORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_rescale_normals( NORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_rescale_normals_no_rot( NORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_normalize_normals( NORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_normalize_normals_no_rot( NORM_ARGS );
-
-#define ASSIGN_NORM_GROUP( pfx ) \
- _mesa_normal_tab[NORM_RESCALE] = \
- _mesa_##pfx##_rescale_normals; \
- _mesa_normal_tab[NORM_NORMALIZE] = \
- _mesa_##pfx##_normalize_normals; \
- _mesa_normal_tab[NORM_TRANSFORM] = \
- _mesa_##pfx##_transform_normals; \
- _mesa_normal_tab[NORM_TRANSFORM_NO_ROT] = \
- _mesa_##pfx##_transform_normals_no_rot; \
- _mesa_normal_tab[NORM_TRANSFORM | NORM_RESCALE] = \
- _mesa_##pfx##_transform_rescale_normals; \
- _mesa_normal_tab[NORM_TRANSFORM_NO_ROT | NORM_RESCALE] = \
- _mesa_##pfx##_transform_rescale_normals_no_rot; \
- _mesa_normal_tab[NORM_TRANSFORM | NORM_NORMALIZE] = \
- _mesa_##pfx##_transform_normalize_normals; \
- _mesa_normal_tab[NORM_TRANSFORM_NO_ROT | NORM_NORMALIZE] = \
- _mesa_##pfx##_transform_normalize_normals_no_rot;
-
-
-#endif
+ +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Gareth Hughes + */ + +#ifndef X86_XFORM_H +#define X86_XFORM_H + + +/* ============================================================= + * Transformation function declarations: + */ + +#define XFORM_ARGS GLvector4f *to_vec, \ + const GLfloat m[16], \ + const GLvector4f *from_vec + +#define DECLARE_XFORM_GROUP( pfx, sz ) \ +extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_general( XFORM_ARGS ); \ +extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_identity( XFORM_ARGS ); \ +extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_3d_no_rot( XFORM_ARGS ); \ +extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_perspective( XFORM_ARGS ); \ +extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_2d( XFORM_ARGS ); \ +extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_2d_no_rot( XFORM_ARGS ); \ +extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_3d( XFORM_ARGS ); + +#define ASSIGN_XFORM_GROUP( pfx, sz ) \ + _mesa_transform_tab[sz][MATRIX_GENERAL] = \ + _mesa_##pfx##_transform_points##sz##_general; \ + _mesa_transform_tab[sz][MATRIX_IDENTITY] = \ + _mesa_##pfx##_transform_points##sz##_identity; \ + _mesa_transform_tab[sz][MATRIX_3D_NO_ROT] = \ + _mesa_##pfx##_transform_points##sz##_3d_no_rot; \ + _mesa_transform_tab[sz][MATRIX_PERSPECTIVE] = \ + _mesa_##pfx##_transform_points##sz##_perspective; \ + _mesa_transform_tab[sz][MATRIX_2D] = \ + _mesa_##pfx##_transform_points##sz##_2d; \ + _mesa_transform_tab[sz][MATRIX_2D_NO_ROT] = \ + _mesa_##pfx##_transform_points##sz##_2d_no_rot; \ + _mesa_transform_tab[sz][MATRIX_3D] = \ + _mesa_##pfx##_transform_points##sz##_3d; + + +/* ============================================================= + * Normal transformation function declarations: + */ + +#define NORM_ARGS const GLmatrix *mat, \ + GLfloat scale, \ + const GLvector4f *in, \ + const GLfloat *lengths, \ + GLvector4f *dest + +#define DECLARE_NORM_GROUP( pfx ) \ +extern void _ASMAPI _mesa_##pfx##_rescale_normals( NORM_ARGS ); \ +extern void _ASMAPI _mesa_##pfx##_normalize_normals( NORM_ARGS ); \ +extern void _ASMAPI _mesa_##pfx##_transform_normals( NORM_ARGS ); \ +extern void _ASMAPI _mesa_##pfx##_transform_normals_no_rot( NORM_ARGS ); \ +extern void _ASMAPI _mesa_##pfx##_transform_rescale_normals( NORM_ARGS ); \ +extern void _ASMAPI _mesa_##pfx##_transform_rescale_normals_no_rot( NORM_ARGS ); \ +extern void _ASMAPI _mesa_##pfx##_transform_normalize_normals( NORM_ARGS ); \ +extern void _ASMAPI _mesa_##pfx##_transform_normalize_normals_no_rot( NORM_ARGS ); + +#define ASSIGN_NORM_GROUP( pfx ) \ + _mesa_normal_tab[NORM_RESCALE] = \ + _mesa_##pfx##_rescale_normals; \ + _mesa_normal_tab[NORM_NORMALIZE] = \ + _mesa_##pfx##_normalize_normals; \ + _mesa_normal_tab[NORM_TRANSFORM] = \ + _mesa_##pfx##_transform_normals; \ + _mesa_normal_tab[NORM_TRANSFORM_NO_ROT] = \ + _mesa_##pfx##_transform_normals_no_rot; \ + _mesa_normal_tab[NORM_TRANSFORM | NORM_RESCALE] = \ + _mesa_##pfx##_transform_rescale_normals; \ + _mesa_normal_tab[NORM_TRANSFORM_NO_ROT | NORM_RESCALE] = \ + _mesa_##pfx##_transform_rescale_normals_no_rot; \ + _mesa_normal_tab[NORM_TRANSFORM | NORM_NORMALIZE] = \ + _mesa_##pfx##_transform_normalize_normals; \ + _mesa_normal_tab[NORM_TRANSFORM_NO_ROT | NORM_NORMALIZE] = \ + _mesa_##pfx##_transform_normalize_normals_no_rot; + + +#endif diff --git a/mesalib/src/mesa/x86/x86_xform2.S b/mesalib/src/mesa/x86/x86_xform2.S index 482035c1f..980725ef5 100644 --- a/mesalib/src/mesa/x86/x86_xform2.S +++ b/mesalib/src/mesa/x86/x86_xform2.S @@ -1,574 +1,574 @@ -
-/*
- * Mesa 3-D graphics library
- * Version: 3.5
- *
- * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * NOTE: Avoid using spaces in between '(' ')' and arguments, especially
- * with macros like CONST, LLBL that expand to CONCAT(...). Putting spaces
- * in there will break the build on some platforms.
- */
-
-#include "assyntax.h"
-#include "matypes.h"
-#include "xform_args.h"
-
- SEG_TEXT
-
-#define FP_ONE 1065353216
-#define FP_ZERO 0
-
-#define SRC0 REGOFF(0, ESI)
-#define SRC1 REGOFF(4, ESI)
-#define SRC2 REGOFF(8, ESI)
-#define SRC3 REGOFF(12, ESI)
-#define DST0 REGOFF(0, EDI)
-#define DST1 REGOFF(4, EDI)
-#define DST2 REGOFF(8, EDI)
-#define DST3 REGOFF(12, EDI)
-#define MAT0 REGOFF(0, EDX)
-#define MAT1 REGOFF(4, EDX)
-#define MAT2 REGOFF(8, EDX)
-#define MAT3 REGOFF(12, EDX)
-#define MAT4 REGOFF(16, EDX)
-#define MAT5 REGOFF(20, EDX)
-#define MAT6 REGOFF(24, EDX)
-#define MAT7 REGOFF(28, EDX)
-#define MAT8 REGOFF(32, EDX)
-#define MAT9 REGOFF(36, EDX)
-#define MAT10 REGOFF(40, EDX)
-#define MAT11 REGOFF(44, EDX)
-#define MAT12 REGOFF(48, EDX)
-#define MAT13 REGOFF(52, EDX)
-#define MAT14 REGOFF(56, EDX)
-#define MAT15 REGOFF(60, EDX)
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_x86_transform_points2_general )
-HIDDEN(_mesa_x86_transform_points2_general)
-GLNAME( _mesa_x86_transform_points2_general ):
-
-#define FRAME_OFFSET 8
- PUSH_L( ESI )
- PUSH_L( EDI )
-
- MOV_L( ARG_SOURCE, ESI )
- MOV_L( ARG_DEST, EDI )
-
- MOV_L( ARG_MATRIX, EDX )
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
-
- TEST_L( ECX, ECX )
- JZ( LLBL(x86_p2_gr_done) )
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
- OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
- MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
-
- SHL_L( CONST(4), ECX )
- MOV_L( REGOFF(V4F_START, ESI), ESI )
-
- MOV_L( REGOFF(V4F_START, EDI), EDI )
- ADD_L( EDI, ECX )
-
-ALIGNTEXT16
-LLBL(x86_p2_gr_loop):
-
- FLD_S( SRC0 ) /* F4 */
- FMUL_S( MAT0 )
- FLD_S( SRC0 ) /* F5 F4 */
- FMUL_S( MAT1 )
- FLD_S( SRC0 ) /* F6 F5 F4 */
- FMUL_S( MAT2 )
- FLD_S( SRC0 ) /* F7 F6 F5 F4 */
- FMUL_S( MAT3 )
-
- FLD_S( SRC1 ) /* F0 F7 F6 F5 F4 */
- FMUL_S( MAT4 )
- FLD_S( SRC1 ) /* F1 F0 F7 F6 F5 F4 */
- FMUL_S( MAT5 )
- FLD_S( SRC1 ) /* F2 F1 F0 F7 F6 F5 F4 */
- FMUL_S( MAT6 )
- FLD_S( SRC1 ) /* F3 F2 F1 F0 F7 F6 F5 F4 */
- FMUL_S( MAT7 )
-
- FXCH( ST(3) ) /* F0 F2 F1 F3 F7 F6 F5 F4 */
- FADDP( ST0, ST(7) ) /* F2 F1 F3 F7 F6 F5 F4 */
- FXCH( ST(1) ) /* F1 F2 F3 F7 F6 F5 F4 */
- FADDP( ST0, ST(5) ) /* F2 F3 F7 F6 F5 F4 */
- FADDP( ST0, ST(3) ) /* F3 F7 F6 F5 F4 */
- FADDP( ST0, ST(1) ) /* F7 F6 F5 F4 */
-
- FXCH( ST(3) ) /* F4 F6 F5 F7 */
- FADD_S( MAT12 )
- FXCH( ST(2) ) /* F5 F6 F4 F7 */
- FADD_S( MAT13 )
- FXCH( ST(1) ) /* F6 F5 F4 F7 */
- FADD_S( MAT14 )
- FXCH( ST(3) ) /* F7 F5 F4 F6 */
- FADD_S( MAT15 )
-
- FXCH( ST(2) ) /* F4 F5 F7 F6 */
- FSTP_S( DST0 ) /* F5 F7 F6 */
- FSTP_S( DST1 ) /* F7 F6 */
- FXCH( ST(1) ) /* F6 F7 */
- FSTP_S( DST2 ) /* F7 */
- FSTP_S( DST3 ) /* */
-
-LLBL(x86_p2_gr_skip):
-
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(x86_p2_gr_loop) )
-
-LLBL(x86_p2_gr_done):
-
- POP_L( EDI )
- POP_L( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_x86_transform_points2_perspective )
-HIDDEN(_mesa_x86_transform_points2_perspective)
-GLNAME( _mesa_x86_transform_points2_perspective ):
-
-#define FRAME_OFFSET 12
- PUSH_L( ESI )
- PUSH_L( EDI )
- PUSH_L( EBX )
-
- MOV_L( ARG_SOURCE, ESI )
- MOV_L( ARG_DEST, EDI )
-
- MOV_L( ARG_MATRIX, EDX )
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
-
- TEST_L( ECX, ECX )
- JZ( LLBL(x86_p2_pr_done) )
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
- OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
- MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
-
- SHL_L( CONST(4), ECX )
- MOV_L( REGOFF(V4F_START, ESI), ESI )
-
- MOV_L( REGOFF(V4F_START, EDI), EDI )
- ADD_L( EDI, ECX )
-
- MOV_L( MAT14, EBX )
-
-ALIGNTEXT16
-LLBL(x86_p2_pr_loop):
-
- FLD_S( SRC0 ) /* F4 */
- FMUL_S( MAT0 )
-
- FLD_S( SRC1 ) /* F1 F4 */
- FMUL_S( MAT5 )
-
- FXCH( ST(1) ) /* F4 F1 */
- FSTP_S( DST0 ) /* F1 */
- FSTP_S( DST1 ) /* */
- MOV_L( EBX, DST2 )
- MOV_L( CONST(FP_ZERO), DST3 )
-
-LLBL(x86_p2_pr_skip):
-
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(x86_p2_pr_loop) )
-
-LLBL(x86_p2_pr_done):
-
- POP_L( EBX )
- POP_L( EDI )
- POP_L( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_x86_transform_points2_3d )
-HIDDEN(_mesa_x86_transform_points2_3d)
-GLNAME( _mesa_x86_transform_points2_3d ):
-
-#define FRAME_OFFSET 8
- PUSH_L( ESI )
- PUSH_L( EDI )
-
- MOV_L( ARG_SOURCE, ESI )
- MOV_L( ARG_DEST, EDI )
-
- MOV_L( ARG_MATRIX, EDX )
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
-
- TEST_L( ECX, ECX )
- JZ( LLBL(x86_p2_3dr_done) )
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
- OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
- MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
-
- SHL_L( CONST(4), ECX )
- MOV_L( REGOFF(V4F_START, ESI), ESI )
-
- MOV_L( REGOFF(V4F_START, EDI), EDI )
- ADD_L( EDI, ECX )
-
-ALIGNTEXT16
-LLBL(x86_p2_3dr_loop):
-
- FLD_S( SRC0 ) /* F4 */
- FMUL_S( MAT0 )
- FLD_S( SRC0 ) /* F5 F4 */
- FMUL_S( MAT1 )
- FLD_S( SRC0 ) /* F6 F5 F4 */
- FMUL_S( MAT2 )
-
- FLD_S( SRC1 ) /* F0 F6 F5 F4 */
- FMUL_S( MAT4 )
- FLD_S( SRC1 ) /* F1 F0 F6 F5 F4 */
- FMUL_S( MAT5 )
- FLD_S( SRC1 ) /* F2 F1 F0 F6 F5 F4 */
- FMUL_S( MAT6 )
-
- FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */
- FADDP( ST0, ST(5) ) /* F1 F2 F6 F5 F4 */
- FADDP( ST0, ST(3) ) /* F2 F6 F5 F4 */
- FADDP( ST0, ST(1) ) /* F6 F5 F4 */
-
- FXCH( ST(2) ) /* F4 F5 F6 */
- FADD_S( MAT12 )
- FXCH( ST(1) ) /* F5 F4 F6 */
- FADD_S( MAT13 )
- FXCH( ST(2) ) /* F6 F4 F5 */
- FADD_S( MAT14 )
-
- FXCH( ST(1) ) /* F4 F6 F5 */
- FSTP_S( DST0 ) /* F6 F5 */
- FXCH( ST(1) ) /* F5 F6 */
- FSTP_S( DST1 ) /* F6 */
- FSTP_S( DST2 ) /* */
-
-LLBL(x86_p2_3dr_skip):
-
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(x86_p2_3dr_loop) )
-
-LLBL(x86_p2_3dr_done):
-
- POP_L( EDI )
- POP_L( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_x86_transform_points2_3d_no_rot )
-HIDDEN(_mesa_x86_transform_points2_3d_no_rot)
-GLNAME( _mesa_x86_transform_points2_3d_no_rot ):
-
-#define FRAME_OFFSET 12
- PUSH_L( ESI )
- PUSH_L( EDI )
- PUSH_L( EBX )
-
- MOV_L( ARG_SOURCE, ESI )
- MOV_L( ARG_DEST, EDI )
-
- MOV_L( ARG_MATRIX, EDX )
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
-
- TEST_L( ECX, ECX )
- JZ( LLBL(x86_p2_3dnrr_done) )
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
- OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
- MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
-
- SHL_L( CONST(4), ECX )
- MOV_L( REGOFF(V4F_START, ESI), ESI )
-
- MOV_L( REGOFF(V4F_START, EDI), EDI )
- ADD_L( EDI, ECX )
-
- MOV_L( MAT14, EBX )
-
-ALIGNTEXT16
-LLBL(x86_p2_3dnrr_loop):
-
- FLD_S( SRC0 ) /* F4 */
- FMUL_S( MAT0 )
-
- FLD_S( SRC1 ) /* F1 F4 */
- FMUL_S( MAT5 )
-
- FXCH( ST(1) ) /* F4 F1 */
- FADD_S( MAT12 )
- FLD_S( MAT13 ) /* F5 F4 F1 */
- FXCH( ST(2) ) /* F1 F4 F5 */
- FADDP( ST0, ST(2) ) /* F4 F5 */
-
- FSTP_S( DST0 ) /* F5 */
- FSTP_S( DST1 ) /* */
- MOV_L( EBX, DST2 )
-
-LLBL(x86_p2_3dnrr_skip):
-
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(x86_p2_3dnrr_loop) )
-
-LLBL(x86_p2_3dnrr_done):
-
- POP_L( EBX )
- POP_L( EDI )
- POP_L( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_x86_transform_points2_2d )
-HIDDEN(_mesa_x86_transform_points2_2d)
-GLNAME( _mesa_x86_transform_points2_2d ):
-
-#define FRAME_OFFSET 8
- PUSH_L( ESI )
- PUSH_L( EDI )
-
- MOV_L( ARG_SOURCE, ESI )
- MOV_L( ARG_DEST, EDI )
-
- MOV_L( ARG_MATRIX, EDX )
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
-
- TEST_L( ECX, ECX )
- JZ( LLBL(x86_p2_2dr_done) )
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
- OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) )
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
- MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) )
-
- SHL_L( CONST(4), ECX )
- MOV_L( REGOFF(V4F_START, ESI), ESI )
-
- MOV_L( REGOFF(V4F_START, EDI), EDI )
- ADD_L( EDI, ECX )
-
-ALIGNTEXT16
-LLBL(x86_p2_2dr_loop):
-
- FLD_S( SRC0 ) /* F4 */
- FMUL_S( MAT0 )
- FLD_S( SRC0 ) /* F5 F4 */
- FMUL_S( MAT1 )
-
- FLD_S( SRC1 ) /* F0 F5 F4 */
- FMUL_S( MAT4 )
- FLD_S( SRC1 ) /* F1 F0 F5 F4 */
- FMUL_S( MAT5 )
-
- FXCH( ST(1) ) /* F0 F1 F5 F4 */
- FADDP( ST0, ST(3) ) /* F1 F5 F4 */
- FADDP( ST0, ST(1) ) /* F5 F4 */
-
- FXCH( ST(1) ) /* F4 F5 */
- FADD_S( MAT12 )
- FXCH( ST(1) ) /* F5 F4 */
- FADD_S( MAT13 )
-
- FXCH( ST(1) ) /* F4 F5 */
- FSTP_S( DST0 ) /* F5 */
- FSTP_S( DST1 ) /* */
-
-LLBL(x86_p2_2dr_skip):
-
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(x86_p2_2dr_loop) )
-
-LLBL(x86_p2_2dr_done):
-
- POP_L( EDI )
- POP_L( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-
-ALIGNTEXT4
-GLOBL GLNAME( _mesa_x86_transform_points2_2d_no_rot )
-HIDDEN(_mesa_x86_transform_points2_2d_no_rot)
-GLNAME( _mesa_x86_transform_points2_2d_no_rot ):
-
-#define FRAME_OFFSET 8
- PUSH_L( ESI )
- PUSH_L( EDI )
-
- MOV_L( ARG_SOURCE, ESI )
- MOV_L( ARG_DEST, EDI )
-
- MOV_L( ARG_MATRIX, EDX )
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
-
- TEST_L( ECX, ECX )
- JZ( LLBL(x86_p2_2dnrr_done) )
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
- OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) )
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
- MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) )
-
- SHL_L( CONST(4), ECX )
- MOV_L( REGOFF(V4F_START, ESI), ESI )
-
- MOV_L( REGOFF(V4F_START, EDI), EDI )
- ADD_L( EDI, ECX )
-
-ALIGNTEXT16
-LLBL(x86_p2_2dnrr_loop):
-
- FLD_S( SRC0 ) /* F4 */
- FMUL_S( MAT0 )
-
- FLD_S( SRC1 ) /* F1 F4 */
- FMUL_S( MAT5 )
-
- FXCH( ST(1) ) /* F4 F1 */
- FADD_S( MAT12 )
- FLD_S( MAT13 ) /* F5 F4 F1 */
- FXCH( ST(2) ) /* F1 F4 F5 */
- FADDP( ST0, ST(2) ) /* F4 F5 */
-
- FSTP_S( DST0 ) /* F5 */
- FSTP_S( DST1 ) /* */
-
-LLBL(x86_p2_2dnrr_skip):
-
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(x86_p2_2dnrr_loop) )
-
-LLBL(x86_p2_2dnrr_done):
-
- POP_L( EDI )
- POP_L( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_x86_transform_points2_identity )
-HIDDEN(_mesa_x86_transform_points2_identity)
-GLNAME( _mesa_x86_transform_points2_identity ):
-
-#define FRAME_OFFSET 12
- PUSH_L( ESI )
- PUSH_L( EDI )
- PUSH_L( EBX )
-
- MOV_L( ARG_SOURCE, ESI )
- MOV_L( ARG_DEST, EDI )
-
- MOV_L( ARG_MATRIX, EDX )
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
-
- TEST_L( ECX, ECX )
- JZ( LLBL(x86_p2_ir_done) )
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
- OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) )
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
- MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) )
-
- SHL_L( CONST(4), ECX )
- MOV_L( REGOFF(V4F_START, ESI), ESI )
-
- MOV_L( REGOFF(V4F_START, EDI), EDI )
- ADD_L( EDI, ECX )
-
- CMP_L( ESI, EDI )
- JE( LLBL(x86_p2_ir_done) )
-
-ALIGNTEXT16
-LLBL(x86_p2_ir_loop):
-
- MOV_L( SRC0, EBX )
- MOV_L( SRC1, EDX )
-
- MOV_L( EBX, DST0 )
- MOV_L( EDX, DST1 )
-
-LLBL(x86_p2_ir_skip):
-
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(x86_p2_ir_loop) )
-
-LLBL(x86_p2_ir_done):
-
- POP_L( EBX )
- POP_L( EDI )
- POP_L( ESI )
- RET
-#undef FRAME_OFFSET
-
-#if defined (__ELF__) && defined (__linux__)
- .section .note.GNU-stack,"",%progbits
-#endif
+ +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * NOTE: Avoid using spaces in between '(' ')' and arguments, especially + * with macros like CONST, LLBL that expand to CONCAT(...). Putting spaces + * in there will break the build on some platforms. + */ + +#include "assyntax.h" +#include "matypes.h" +#include "xform_args.h" + + SEG_TEXT + +#define FP_ONE 1065353216 +#define FP_ZERO 0 + +#define SRC0 REGOFF(0, ESI) +#define SRC1 REGOFF(4, ESI) +#define SRC2 REGOFF(8, ESI) +#define SRC3 REGOFF(12, ESI) +#define DST0 REGOFF(0, EDI) +#define DST1 REGOFF(4, EDI) +#define DST2 REGOFF(8, EDI) +#define DST3 REGOFF(12, EDI) +#define MAT0 REGOFF(0, EDX) +#define MAT1 REGOFF(4, EDX) +#define MAT2 REGOFF(8, EDX) +#define MAT3 REGOFF(12, EDX) +#define MAT4 REGOFF(16, EDX) +#define MAT5 REGOFF(20, EDX) +#define MAT6 REGOFF(24, EDX) +#define MAT7 REGOFF(28, EDX) +#define MAT8 REGOFF(32, EDX) +#define MAT9 REGOFF(36, EDX) +#define MAT10 REGOFF(40, EDX) +#define MAT11 REGOFF(44, EDX) +#define MAT12 REGOFF(48, EDX) +#define MAT13 REGOFF(52, EDX) +#define MAT14 REGOFF(56, EDX) +#define MAT15 REGOFF(60, EDX) + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_x86_transform_points2_general ) +HIDDEN(_mesa_x86_transform_points2_general) +GLNAME( _mesa_x86_transform_points2_general ): + +#define FRAME_OFFSET 8 + PUSH_L( ESI ) + PUSH_L( EDI ) + + MOV_L( ARG_SOURCE, ESI ) + MOV_L( ARG_DEST, EDI ) + + MOV_L( ARG_MATRIX, EDX ) + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) + + TEST_L( ECX, ECX ) + JZ( LLBL(x86_p2_gr_done) ) + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) + OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) + MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) + + SHL_L( CONST(4), ECX ) + MOV_L( REGOFF(V4F_START, ESI), ESI ) + + MOV_L( REGOFF(V4F_START, EDI), EDI ) + ADD_L( EDI, ECX ) + +ALIGNTEXT16 +LLBL(x86_p2_gr_loop): + + FLD_S( SRC0 ) /* F4 */ + FMUL_S( MAT0 ) + FLD_S( SRC0 ) /* F5 F4 */ + FMUL_S( MAT1 ) + FLD_S( SRC0 ) /* F6 F5 F4 */ + FMUL_S( MAT2 ) + FLD_S( SRC0 ) /* F7 F6 F5 F4 */ + FMUL_S( MAT3 ) + + FLD_S( SRC1 ) /* F0 F7 F6 F5 F4 */ + FMUL_S( MAT4 ) + FLD_S( SRC1 ) /* F1 F0 F7 F6 F5 F4 */ + FMUL_S( MAT5 ) + FLD_S( SRC1 ) /* F2 F1 F0 F7 F6 F5 F4 */ + FMUL_S( MAT6 ) + FLD_S( SRC1 ) /* F3 F2 F1 F0 F7 F6 F5 F4 */ + FMUL_S( MAT7 ) + + FXCH( ST(3) ) /* F0 F2 F1 F3 F7 F6 F5 F4 */ + FADDP( ST0, ST(7) ) /* F2 F1 F3 F7 F6 F5 F4 */ + FXCH( ST(1) ) /* F1 F2 F3 F7 F6 F5 F4 */ + FADDP( ST0, ST(5) ) /* F2 F3 F7 F6 F5 F4 */ + FADDP( ST0, ST(3) ) /* F3 F7 F6 F5 F4 */ + FADDP( ST0, ST(1) ) /* F7 F6 F5 F4 */ + + FXCH( ST(3) ) /* F4 F6 F5 F7 */ + FADD_S( MAT12 ) + FXCH( ST(2) ) /* F5 F6 F4 F7 */ + FADD_S( MAT13 ) + FXCH( ST(1) ) /* F6 F5 F4 F7 */ + FADD_S( MAT14 ) + FXCH( ST(3) ) /* F7 F5 F4 F6 */ + FADD_S( MAT15 ) + + FXCH( ST(2) ) /* F4 F5 F7 F6 */ + FSTP_S( DST0 ) /* F5 F7 F6 */ + FSTP_S( DST1 ) /* F7 F6 */ + FXCH( ST(1) ) /* F6 F7 */ + FSTP_S( DST2 ) /* F7 */ + FSTP_S( DST3 ) /* */ + +LLBL(x86_p2_gr_skip): + + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(x86_p2_gr_loop) ) + +LLBL(x86_p2_gr_done): + + POP_L( EDI ) + POP_L( ESI ) + RET +#undef FRAME_OFFSET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_x86_transform_points2_perspective ) +HIDDEN(_mesa_x86_transform_points2_perspective) +GLNAME( _mesa_x86_transform_points2_perspective ): + +#define FRAME_OFFSET 12 + PUSH_L( ESI ) + PUSH_L( EDI ) + PUSH_L( EBX ) + + MOV_L( ARG_SOURCE, ESI ) + MOV_L( ARG_DEST, EDI ) + + MOV_L( ARG_MATRIX, EDX ) + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) + + TEST_L( ECX, ECX ) + JZ( LLBL(x86_p2_pr_done) ) + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) + OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) + MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) + + SHL_L( CONST(4), ECX ) + MOV_L( REGOFF(V4F_START, ESI), ESI ) + + MOV_L( REGOFF(V4F_START, EDI), EDI ) + ADD_L( EDI, ECX ) + + MOV_L( MAT14, EBX ) + +ALIGNTEXT16 +LLBL(x86_p2_pr_loop): + + FLD_S( SRC0 ) /* F4 */ + FMUL_S( MAT0 ) + + FLD_S( SRC1 ) /* F1 F4 */ + FMUL_S( MAT5 ) + + FXCH( ST(1) ) /* F4 F1 */ + FSTP_S( DST0 ) /* F1 */ + FSTP_S( DST1 ) /* */ + MOV_L( EBX, DST2 ) + MOV_L( CONST(FP_ZERO), DST3 ) + +LLBL(x86_p2_pr_skip): + + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(x86_p2_pr_loop) ) + +LLBL(x86_p2_pr_done): + + POP_L( EBX ) + POP_L( EDI ) + POP_L( ESI ) + RET +#undef FRAME_OFFSET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_x86_transform_points2_3d ) +HIDDEN(_mesa_x86_transform_points2_3d) +GLNAME( _mesa_x86_transform_points2_3d ): + +#define FRAME_OFFSET 8 + PUSH_L( ESI ) + PUSH_L( EDI ) + + MOV_L( ARG_SOURCE, ESI ) + MOV_L( ARG_DEST, EDI ) + + MOV_L( ARG_MATRIX, EDX ) + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) + + TEST_L( ECX, ECX ) + JZ( LLBL(x86_p2_3dr_done) ) + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) + OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) + MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) + + SHL_L( CONST(4), ECX ) + MOV_L( REGOFF(V4F_START, ESI), ESI ) + + MOV_L( REGOFF(V4F_START, EDI), EDI ) + ADD_L( EDI, ECX ) + +ALIGNTEXT16 +LLBL(x86_p2_3dr_loop): + + FLD_S( SRC0 ) /* F4 */ + FMUL_S( MAT0 ) + FLD_S( SRC0 ) /* F5 F4 */ + FMUL_S( MAT1 ) + FLD_S( SRC0 ) /* F6 F5 F4 */ + FMUL_S( MAT2 ) + + FLD_S( SRC1 ) /* F0 F6 F5 F4 */ + FMUL_S( MAT4 ) + FLD_S( SRC1 ) /* F1 F0 F6 F5 F4 */ + FMUL_S( MAT5 ) + FLD_S( SRC1 ) /* F2 F1 F0 F6 F5 F4 */ + FMUL_S( MAT6 ) + + FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */ + FADDP( ST0, ST(5) ) /* F1 F2 F6 F5 F4 */ + FADDP( ST0, ST(3) ) /* F2 F6 F5 F4 */ + FADDP( ST0, ST(1) ) /* F6 F5 F4 */ + + FXCH( ST(2) ) /* F4 F5 F6 */ + FADD_S( MAT12 ) + FXCH( ST(1) ) /* F5 F4 F6 */ + FADD_S( MAT13 ) + FXCH( ST(2) ) /* F6 F4 F5 */ + FADD_S( MAT14 ) + + FXCH( ST(1) ) /* F4 F6 F5 */ + FSTP_S( DST0 ) /* F6 F5 */ + FXCH( ST(1) ) /* F5 F6 */ + FSTP_S( DST1 ) /* F6 */ + FSTP_S( DST2 ) /* */ + +LLBL(x86_p2_3dr_skip): + + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(x86_p2_3dr_loop) ) + +LLBL(x86_p2_3dr_done): + + POP_L( EDI ) + POP_L( ESI ) + RET +#undef FRAME_OFFSET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_x86_transform_points2_3d_no_rot ) +HIDDEN(_mesa_x86_transform_points2_3d_no_rot) +GLNAME( _mesa_x86_transform_points2_3d_no_rot ): + +#define FRAME_OFFSET 12 + PUSH_L( ESI ) + PUSH_L( EDI ) + PUSH_L( EBX ) + + MOV_L( ARG_SOURCE, ESI ) + MOV_L( ARG_DEST, EDI ) + + MOV_L( ARG_MATRIX, EDX ) + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) + + TEST_L( ECX, ECX ) + JZ( LLBL(x86_p2_3dnrr_done) ) + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) + OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) + MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) + + SHL_L( CONST(4), ECX ) + MOV_L( REGOFF(V4F_START, ESI), ESI ) + + MOV_L( REGOFF(V4F_START, EDI), EDI ) + ADD_L( EDI, ECX ) + + MOV_L( MAT14, EBX ) + +ALIGNTEXT16 +LLBL(x86_p2_3dnrr_loop): + + FLD_S( SRC0 ) /* F4 */ + FMUL_S( MAT0 ) + + FLD_S( SRC1 ) /* F1 F4 */ + FMUL_S( MAT5 ) + + FXCH( ST(1) ) /* F4 F1 */ + FADD_S( MAT12 ) + FLD_S( MAT13 ) /* F5 F4 F1 */ + FXCH( ST(2) ) /* F1 F4 F5 */ + FADDP( ST0, ST(2) ) /* F4 F5 */ + + FSTP_S( DST0 ) /* F5 */ + FSTP_S( DST1 ) /* */ + MOV_L( EBX, DST2 ) + +LLBL(x86_p2_3dnrr_skip): + + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(x86_p2_3dnrr_loop) ) + +LLBL(x86_p2_3dnrr_done): + + POP_L( EBX ) + POP_L( EDI ) + POP_L( ESI ) + RET +#undef FRAME_OFFSET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_x86_transform_points2_2d ) +HIDDEN(_mesa_x86_transform_points2_2d) +GLNAME( _mesa_x86_transform_points2_2d ): + +#define FRAME_OFFSET 8 + PUSH_L( ESI ) + PUSH_L( EDI ) + + MOV_L( ARG_SOURCE, ESI ) + MOV_L( ARG_DEST, EDI ) + + MOV_L( ARG_MATRIX, EDX ) + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) + + TEST_L( ECX, ECX ) + JZ( LLBL(x86_p2_2dr_done) ) + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) + OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) + MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) + + SHL_L( CONST(4), ECX ) + MOV_L( REGOFF(V4F_START, ESI), ESI ) + + MOV_L( REGOFF(V4F_START, EDI), EDI ) + ADD_L( EDI, ECX ) + +ALIGNTEXT16 +LLBL(x86_p2_2dr_loop): + + FLD_S( SRC0 ) /* F4 */ + FMUL_S( MAT0 ) + FLD_S( SRC0 ) /* F5 F4 */ + FMUL_S( MAT1 ) + + FLD_S( SRC1 ) /* F0 F5 F4 */ + FMUL_S( MAT4 ) + FLD_S( SRC1 ) /* F1 F0 F5 F4 */ + FMUL_S( MAT5 ) + + FXCH( ST(1) ) /* F0 F1 F5 F4 */ + FADDP( ST0, ST(3) ) /* F1 F5 F4 */ + FADDP( ST0, ST(1) ) /* F5 F4 */ + + FXCH( ST(1) ) /* F4 F5 */ + FADD_S( MAT12 ) + FXCH( ST(1) ) /* F5 F4 */ + FADD_S( MAT13 ) + + FXCH( ST(1) ) /* F4 F5 */ + FSTP_S( DST0 ) /* F5 */ + FSTP_S( DST1 ) /* */ + +LLBL(x86_p2_2dr_skip): + + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(x86_p2_2dr_loop) ) + +LLBL(x86_p2_2dr_done): + + POP_L( EDI ) + POP_L( ESI ) + RET +#undef FRAME_OFFSET + + + + +ALIGNTEXT4 +GLOBL GLNAME( _mesa_x86_transform_points2_2d_no_rot ) +HIDDEN(_mesa_x86_transform_points2_2d_no_rot) +GLNAME( _mesa_x86_transform_points2_2d_no_rot ): + +#define FRAME_OFFSET 8 + PUSH_L( ESI ) + PUSH_L( EDI ) + + MOV_L( ARG_SOURCE, ESI ) + MOV_L( ARG_DEST, EDI ) + + MOV_L( ARG_MATRIX, EDX ) + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) + + TEST_L( ECX, ECX ) + JZ( LLBL(x86_p2_2dnrr_done) ) + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) + OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) + MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) + + SHL_L( CONST(4), ECX ) + MOV_L( REGOFF(V4F_START, ESI), ESI ) + + MOV_L( REGOFF(V4F_START, EDI), EDI ) + ADD_L( EDI, ECX ) + +ALIGNTEXT16 +LLBL(x86_p2_2dnrr_loop): + + FLD_S( SRC0 ) /* F4 */ + FMUL_S( MAT0 ) + + FLD_S( SRC1 ) /* F1 F4 */ + FMUL_S( MAT5 ) + + FXCH( ST(1) ) /* F4 F1 */ + FADD_S( MAT12 ) + FLD_S( MAT13 ) /* F5 F4 F1 */ + FXCH( ST(2) ) /* F1 F4 F5 */ + FADDP( ST0, ST(2) ) /* F4 F5 */ + + FSTP_S( DST0 ) /* F5 */ + FSTP_S( DST1 ) /* */ + +LLBL(x86_p2_2dnrr_skip): + + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(x86_p2_2dnrr_loop) ) + +LLBL(x86_p2_2dnrr_done): + + POP_L( EDI ) + POP_L( ESI ) + RET +#undef FRAME_OFFSET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_x86_transform_points2_identity ) +HIDDEN(_mesa_x86_transform_points2_identity) +GLNAME( _mesa_x86_transform_points2_identity ): + +#define FRAME_OFFSET 12 + PUSH_L( ESI ) + PUSH_L( EDI ) + PUSH_L( EBX ) + + MOV_L( ARG_SOURCE, ESI ) + MOV_L( ARG_DEST, EDI ) + + MOV_L( ARG_MATRIX, EDX ) + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) + + TEST_L( ECX, ECX ) + JZ( LLBL(x86_p2_ir_done) ) + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) + OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) + MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) + + SHL_L( CONST(4), ECX ) + MOV_L( REGOFF(V4F_START, ESI), ESI ) + + MOV_L( REGOFF(V4F_START, EDI), EDI ) + ADD_L( EDI, ECX ) + + CMP_L( ESI, EDI ) + JE( LLBL(x86_p2_ir_done) ) + +ALIGNTEXT16 +LLBL(x86_p2_ir_loop): + + MOV_L( SRC0, EBX ) + MOV_L( SRC1, EDX ) + + MOV_L( EBX, DST0 ) + MOV_L( EDX, DST1 ) + +LLBL(x86_p2_ir_skip): + + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(x86_p2_ir_loop) ) + +LLBL(x86_p2_ir_done): + + POP_L( EBX ) + POP_L( EDI ) + POP_L( ESI ) + RET +#undef FRAME_OFFSET + +#if defined (__ELF__) && defined (__linux__) + .section .note.GNU-stack,"",%progbits +#endif diff --git a/mesalib/src/mesa/x86/x86_xform3.S b/mesalib/src/mesa/x86/x86_xform3.S index 6f717caf9..1c782f1c5 100644 --- a/mesalib/src/mesa/x86/x86_xform3.S +++ b/mesalib/src/mesa/x86/x86_xform3.S @@ -1,644 +1,644 @@ -
-/*
- * Mesa 3-D graphics library
- * Version: 3.5
- *
- * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * NOTE: Avoid using spaces in between '(' ')' and arguments, especially
- * with macros like CONST, LLBL that expand to CONCAT(...). Putting spaces
- * in there will break the build on some platforms.
- */
-
-#include "assyntax.h"
-#include "matypes.h"
-#include "xform_args.h"
-
- SEG_TEXT
-
-#define FP_ONE 1065353216
-#define FP_ZERO 0
-
-#define SRC0 REGOFF(0, ESI)
-#define SRC1 REGOFF(4, ESI)
-#define SRC2 REGOFF(8, ESI)
-#define SRC3 REGOFF(12, ESI)
-#define DST0 REGOFF(0, EDI)
-#define DST1 REGOFF(4, EDI)
-#define DST2 REGOFF(8, EDI)
-#define DST3 REGOFF(12, EDI)
-#define MAT0 REGOFF(0, EDX)
-#define MAT1 REGOFF(4, EDX)
-#define MAT2 REGOFF(8, EDX)
-#define MAT3 REGOFF(12, EDX)
-#define MAT4 REGOFF(16, EDX)
-#define MAT5 REGOFF(20, EDX)
-#define MAT6 REGOFF(24, EDX)
-#define MAT7 REGOFF(28, EDX)
-#define MAT8 REGOFF(32, EDX)
-#define MAT9 REGOFF(36, EDX)
-#define MAT10 REGOFF(40, EDX)
-#define MAT11 REGOFF(44, EDX)
-#define MAT12 REGOFF(48, EDX)
-#define MAT13 REGOFF(52, EDX)
-#define MAT14 REGOFF(56, EDX)
-#define MAT15 REGOFF(60, EDX)
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_x86_transform_points3_general )
-HIDDEN(_mesa_x86_transform_points3_general)
-GLNAME( _mesa_x86_transform_points3_general ):
-
-#define FRAME_OFFSET 8
- PUSH_L( ESI )
- PUSH_L( EDI )
-
- MOV_L( ARG_SOURCE, ESI )
- MOV_L( ARG_DEST, EDI )
-
- MOV_L( ARG_MATRIX, EDX )
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
-
- TEST_L( ECX, ECX )
- JZ( LLBL(x86_p3_gr_done) )
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
- OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
- MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
-
- SHL_L( CONST(4), ECX )
- MOV_L( REGOFF(V4F_START, ESI), ESI )
-
- MOV_L( REGOFF(V4F_START, EDI), EDI )
- ADD_L( EDI, ECX )
-
-ALIGNTEXT16
-LLBL(x86_p3_gr_loop):
-
- FLD_S( SRC0 ) /* F4 */
- FMUL_S( MAT0 )
- FLD_S( SRC0 ) /* F5 F4 */
- FMUL_S( MAT1 )
- FLD_S( SRC0 ) /* F6 F5 F4 */
- FMUL_S( MAT2 )
- FLD_S( SRC0 ) /* F7 F6 F5 F4 */
- FMUL_S( MAT3 )
-
- FLD_S( SRC1 ) /* F0 F7 F6 F5 F4 */
- FMUL_S( MAT4 )
- FLD_S( SRC1 ) /* F1 F0 F7 F6 F5 F4 */
- FMUL_S( MAT5 )
- FLD_S( SRC1 ) /* F2 F1 F0 F7 F6 F5 F4 */
- FMUL_S( MAT6 )
- FLD_S( SRC1 ) /* F3 F2 F1 F0 F7 F6 F5 F4 */
- FMUL_S( MAT7 )
-
- FXCH( ST(3) ) /* F0 F2 F1 F3 F7 F6 F5 F4 */
- FADDP( ST0, ST(7) ) /* F2 F1 F3 F7 F6 F5 F4 */
- FXCH( ST(1) ) /* F1 F2 F3 F7 F6 F5 F4 */
- FADDP( ST0, ST(5) ) /* F2 F3 F7 F6 F5 F4 */
- FADDP( ST0, ST(3) ) /* F3 F7 F6 F5 F4 */
- FADDP( ST0, ST(1) ) /* F7 F6 F5 F4 */
-
- FLD_S( SRC2 ) /* F0 F7 F6 F5 F4 */
- FMUL_S( MAT8 )
- FLD_S( SRC2 ) /* F1 F0 F7 F6 F5 F4 */
- FMUL_S( MAT9 )
- FLD_S( SRC2 ) /* F2 F1 F0 F7 F6 F5 F4 */
- FMUL_S( MAT10 )
- FLD_S( SRC2 ) /* F3 F2 F1 F0 F7 F6 F5 F4 */
- FMUL_S( MAT11 )
-
- FXCH( ST(3) ) /* F0 F2 F1 F3 F7 F6 F5 F4 */
- FADDP( ST0, ST(7) ) /* F2 F1 F3 F7 F6 F5 F4 */
- FXCH( ST(1) ) /* F1 F2 F3 F7 F6 F5 F4 */
- FADDP( ST0, ST(5) ) /* F2 F3 F7 F6 F5 F4 */
- FADDP( ST0, ST(3) ) /* F3 F7 F6 F5 F4 */
- FADDP( ST0, ST(1) ) /* F7 F6 F5 F4 */
-
- FXCH( ST(3) ) /* F4 F6 F5 F7 */
- FADD_S( MAT12 )
- FXCH( ST(2) ) /* F5 F6 F4 F7 */
- FADD_S( MAT13 )
- FXCH( ST(1) ) /* F6 F5 F4 F7 */
- FADD_S( MAT14 )
- FXCH( ST(3) ) /* F7 F5 F4 F6 */
- FADD_S( MAT15 )
-
- FXCH( ST(2) ) /* F4 F5 F7 F6 */
- FSTP_S( DST0 ) /* F5 F7 F6 */
- FSTP_S( DST1 ) /* F7 F6 */
- FXCH( ST(1) ) /* F6 F7 */
- FSTP_S( DST2 ) /* F7 */
- FSTP_S( DST3 ) /* */
-
-LLBL(x86_p3_gr_skip):
-
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(x86_p3_gr_loop) )
-
-LLBL(x86_p3_gr_done):
-
- POP_L( EDI )
- POP_L( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_x86_transform_points3_perspective )
-HIDDEN(_mesa_x86_transform_points3_perspective)
-GLNAME( _mesa_x86_transform_points3_perspective ):
-
-#define FRAME_OFFSET 12
- PUSH_L( ESI )
- PUSH_L( EDI )
- PUSH_L( EBX )
-
- MOV_L( ARG_SOURCE, ESI )
- MOV_L( ARG_DEST, EDI )
-
- MOV_L( ARG_MATRIX, EDX )
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
-
- TEST_L( ECX, ECX )
- JZ( LLBL(x86_p3_pr_done) )
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
- OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
- MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
-
- SHL_L( CONST(4), ECX )
- MOV_L( REGOFF(V4F_START, ESI), ESI )
-
- MOV_L( REGOFF(V4F_START, EDI), EDI )
- ADD_L( EDI, ECX )
-
-ALIGNTEXT16
-LLBL(x86_p3_pr_loop):
-
- FLD_S( SRC0 ) /* F4 */
- FMUL_S( MAT0 )
-
- FLD_S( SRC1 ) /* F5 F4 */
- FMUL_S( MAT5 )
-
- FLD_S( SRC2 ) /* F0 F5 F4 */
- FMUL_S( MAT8 )
- FLD_S( SRC2 ) /* F1 F0 F5 F4 */
- FMUL_S( MAT9 )
- FLD_S( SRC2 ) /* F2 F1 F0 F5 F4 */
- FMUL_S( MAT10 )
-
- FXCH( ST(2) ) /* F0 F1 F2 F5 F4 */
- FADDP( ST0, ST(4) ) /* F1 F2 F5 F4 */
- FADDP( ST0, ST(2) ) /* F2 F5 F4 */
- FLD_S( MAT14 ) /* F6 F2 F5 F4 */
- FXCH( ST(1) ) /* F2 F6 F5 F4 */
- FADDP( ST0, ST(1) ) /* F6 F5 F4 */
-
- MOV_L( SRC2, EBX )
- XOR_L( CONST(-2147483648), EBX )/* change sign */
-
- FXCH( ST(2) ) /* F4 F5 F6 */
- FSTP_S( DST0 ) /* F5 F6 */
- FSTP_S( DST1 ) /* F6 */
- FSTP_S( DST2 ) /* */
- MOV_L( EBX, DST3 )
-
-LLBL(x86_p3_pr_skip):
-
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(x86_p3_pr_loop) )
-
-LLBL(x86_p3_pr_done):
-
- POP_L( EBX )
- POP_L( EDI )
- POP_L( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_x86_transform_points3_3d )
-HIDDEN(_mesa_x86_transform_points3_3d)
-GLNAME( _mesa_x86_transform_points3_3d ):
-
-#define FRAME_OFFSET 8
- PUSH_L( ESI )
- PUSH_L( EDI )
-
- MOV_L( ARG_SOURCE, ESI )
- MOV_L( ARG_DEST, EDI )
-
- MOV_L( ARG_MATRIX, EDX )
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
-
- TEST_L( ECX, ECX )
- JZ( LLBL(x86_p3_3dr_done) )
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
- OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
- MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
-
- SHL_L( CONST(4), ECX )
- MOV_L( REGOFF(V4F_START, ESI), ESI )
-
- MOV_L( REGOFF(V4F_START, EDI), EDI )
- ADD_L( EDI, ECX )
-
-ALIGNTEXT16
-LLBL(x86_p3_3dr_loop):
-
- FLD_S( SRC0 ) /* F4 */
- FMUL_S( MAT0 )
- FLD_S( SRC0 ) /* F5 F4 */
- FMUL_S( MAT1 )
- FLD_S( SRC0 ) /* F6 F5 F4 */
- FMUL_S( MAT2 )
-
- FLD_S( SRC1 ) /* F0 F6 F5 F4 */
- FMUL_S( MAT4 )
- FLD_S( SRC1 ) /* F1 F0 F6 F5 F4 */
- FMUL_S( MAT5 )
- FLD_S( SRC1 ) /* F2 F1 F0 F6 F5 F4 */
- FMUL_S( MAT6 )
-
- FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */
- FADDP( ST0, ST(5) ) /* F1 F2 F6 F5 F4 */
- FADDP( ST0, ST(3) ) /* F2 F6 F5 F4 */
- FADDP( ST0, ST(1) ) /* F6 F5 F4 */
-
- FLD_S( SRC2 ) /* F0 F6 F5 F4 */
- FMUL_S( MAT8 )
- FLD_S( SRC2 ) /* F1 F0 F6 F5 F4 */
- FMUL_S( MAT9 )
- FLD_S( SRC2 ) /* F2 F1 F0 F6 F5 F4 */
- FMUL_S( MAT10 )
-
- FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */
- FADDP( ST0, ST(5) ) /* F1 F2 F6 F5 F4 */
- FADDP( ST0, ST(3) ) /* F2 F6 F5 F4 */
- FADDP( ST0, ST(1) ) /* F6 F5 F4 */
-
- FXCH( ST(2) ) /* F4 F5 F6 */
- FADD_S( MAT12 )
- FXCH( ST(1) ) /* F5 F4 F6 */
- FADD_S( MAT13 )
- FXCH( ST(2) ) /* F6 F4 F5 */
- FADD_S( MAT14 )
-
- FXCH( ST(1) ) /* F4 F6 F5 */
- FSTP_S( DST0 ) /* F6 F5 */
- FXCH( ST(1) ) /* F5 F6 */
- FSTP_S( DST1 ) /* F6 */
- FSTP_S( DST2 ) /* */
-
-LLBL(x86_p3_3dr_skip):
-
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(x86_p3_3dr_loop) )
-
-LLBL(x86_p3_3dr_done):
-
- POP_L( EDI )
- POP_L( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_x86_transform_points3_3d_no_rot )
-HIDDEN(_mesa_x86_transform_points3_3d_no_rot)
-GLNAME( _mesa_x86_transform_points3_3d_no_rot ):
-
-#define FRAME_OFFSET 8
- PUSH_L( ESI )
- PUSH_L( EDI )
-
- MOV_L( ARG_SOURCE, ESI )
- MOV_L( ARG_DEST, EDI )
-
-
- MOV_L( ARG_MATRIX, EDX )
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
-
- TEST_L( ECX, ECX )
- JZ( LLBL(x86_p3_3dnrr_done) )
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
- OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
- MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
-
- SHL_L( CONST(4), ECX )
- MOV_L( REGOFF(V4F_START, ESI), ESI )
-
- MOV_L( REGOFF(V4F_START, EDI), EDI )
- ADD_L( EDI, ECX )
-
-ALIGNTEXT16
-LLBL(x86_p3_3dnrr_loop):
-
- FLD_S( SRC0 ) /* F4 */
- FMUL_S( MAT0 )
-
- FLD_S( SRC1 ) /* F1 F4 */
- FMUL_S( MAT5 )
-
- FLD_S( SRC2 ) /* F2 F1 F4 */
- FMUL_S( MAT10 )
-
- FXCH( ST(2) ) /* F4 F1 F2 */
- FADD_S( MAT12 )
- FLD_S( MAT13 ) /* F5 F4 F1 F2 */
- FXCH( ST(2) ) /* F1 F4 F5 F2 */
- FADDP( ST0, ST(2) ) /* F4 F5 F2 */
- FLD_S( MAT14 ) /* F6 F4 F5 F2 */
- FXCH( ST(3) ) /* F2 F4 F5 F6 */
- FADDP( ST0, ST(3) ) /* F4 F5 F6 */
-
- FSTP_S( DST0 ) /* F5 F6 */
- FSTP_S( DST1 ) /* F6 */
- FSTP_S( DST2 ) /* */
-
-LLBL(x86_p3_3dnrr_skip):
-
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(x86_p3_3dnrr_loop) )
-
-LLBL(x86_p3_3dnrr_done):
-
- POP_L( EDI )
- POP_L( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_x86_transform_points3_2d )
-HIDDEN(_mesa_x86_transform_points3_2d)
-GLNAME( _mesa_x86_transform_points3_2d ):
-
-#define FRAME_OFFSET 12
- PUSH_L( ESI )
- PUSH_L( EDI )
- PUSH_L( EBX )
-
- MOV_L( ARG_SOURCE, ESI )
- MOV_L( ARG_DEST, EDI )
-
- MOV_L( ARG_MATRIX, EDX )
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
-
- TEST_L( ECX, ECX )
- JZ( LLBL(x86_p3_2dr_done) )
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
- OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
- MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
-
- SHL_L( CONST(4), ECX )
- MOV_L( REGOFF(V4F_START, ESI), ESI )
-
- MOV_L( REGOFF(V4F_START, EDI), EDI )
- ADD_L( EDI, ECX )
-
-ALIGNTEXT16
-LLBL(x86_p3_2dr_loop):
-
- FLD_S( SRC0 ) /* F4 */
- FMUL_S( MAT0 )
- FLD_S( SRC0 ) /* F5 F4 */
- FMUL_S( MAT1 )
-
- FLD_S( SRC1 ) /* F0 F5 F4 */
- FMUL_S( MAT4 )
- FLD_S( SRC1 ) /* F1 F0 F5 F4 */
- FMUL_S( MAT5 )
-
- FXCH( ST(1) ) /* F0 F1 F5 F4 */
- FADDP( ST0, ST(3) ) /* F1 F5 F4 */
- FADDP( ST0, ST(1) ) /* F5 F4 */
-
- FXCH( ST(1) ) /* F4 F5 */
- FADD_S( MAT12 )
- FXCH( ST(1) ) /* F5 F4 */
- FADD_S( MAT13 )
-
- MOV_L( SRC2, EBX )
-
- FXCH( ST(1) ) /* F4 F5 */
- FSTP_S( DST0 ) /* F5 */
- FSTP_S( DST1 ) /* */
- MOV_L( EBX, DST2 )
-
-LLBL(x86_p3_2dr_skip):
-
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(x86_p3_2dr_loop) )
-
-LLBL(x86_p3_2dr_done):
-
- POP_L( EBX )
- POP_L( EDI )
- POP_L( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_x86_transform_points3_2d_no_rot )
-HIDDEN(_mesa_x86_transform_points3_2d_no_rot)
-GLNAME( _mesa_x86_transform_points3_2d_no_rot ):
-
-#define FRAME_OFFSET 12
- PUSH_L( ESI )
- PUSH_L( EDI )
- PUSH_L( EBX )
-
- MOV_L( ARG_SOURCE, ESI )
- MOV_L( ARG_DEST, EDI )
-
- MOV_L( ARG_MATRIX, EDX )
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
-
- TEST_L( ECX, ECX )
- JZ( LLBL(x86_p3_2dnrr_done) )
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
- OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
- MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
-
- SHL_L( CONST(4), ECX )
- MOV_L( REGOFF(V4F_START, ESI), ESI )
-
- MOV_L( REGOFF(V4F_START, EDI), EDI )
- ADD_L( EDI, ECX )
-
-ALIGNTEXT16
-LLBL(x86_p3_2dnrr_loop):
-
- FLD_S( SRC0 ) /* F4 */
- FMUL_S( MAT0 )
-
- FLD_S( SRC1 ) /* F1 F4 */
- FMUL_S( MAT5 )
-
- FXCH( ST(1) ) /* F4 F1 */
- FADD_S( MAT12 )
- FLD_S( MAT13 ) /* F5 F4 F1 */
-
- FXCH( ST(2) ) /* F1 F4 F5 */
- FADDP( ST0, ST(2) ) /* F4 F5 */
-
- MOV_L( SRC2, EBX )
-
- FSTP_S( DST0 ) /* F5 */
- FSTP_S( DST1 ) /* */
- MOV_L( EBX, DST2 )
-
-LLBL(x86_p3_2dnrr_skip):
-
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(x86_p3_2dnrr_loop) )
-
-LLBL(x86_p3_2dnrr_done):
-
- POP_L( EBX )
- POP_L( EDI )
- POP_L( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_x86_transform_points3_identity )
-HIDDEN(_mesa_x86_transform_points3_identity)
-GLNAME(_mesa_x86_transform_points3_identity ):
-
-#define FRAME_OFFSET 16
- PUSH_L( ESI )
- PUSH_L( EDI )
- PUSH_L( EBX )
- PUSH_L( EBP )
-
- MOV_L( ARG_SOURCE, ESI )
- MOV_L( ARG_DEST, EDI )
-
- MOV_L( ARG_MATRIX, EDX )
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
-
- TEST_L( ECX, ECX )
- JZ( LLBL(x86_p3_ir_done) )
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
- OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
- MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
-
- SHL_L( CONST(4), ECX )
- MOV_L( REGOFF(V4F_START, ESI), ESI )
-
- MOV_L( REGOFF(V4F_START, EDI), EDI )
- ADD_L( EDI, ECX )
-
- CMP_L( ESI, EDI )
- JE( LLBL(x86_p3_ir_done) )
-
-ALIGNTEXT16
-LLBL(x86_p3_ir_loop):
-
-#if 1
- MOV_L( SRC0, EBX )
- MOV_L( SRC1, EBP )
- MOV_L( SRC2, EDX )
-
- MOV_L( EBX, DST0 )
- MOV_L( EBP, DST1 )
- MOV_L( EDX, DST2 )
-#else
- FLD_S( SRC0 )
- FLD_S( SRC1 )
- FLD_S( SRC2 )
-
- FSTP_S( DST2 )
- FSTP_S( DST1 )
- FSTP_S( DST0 )
-#endif
-
-LLBL(x86_p3_ir_skip):
-
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(x86_p3_ir_loop) )
-
-LLBL(x86_p3_ir_done):
-
- POP_L( EBP )
- POP_L( EBX )
- POP_L( EDI )
- POP_L( ESI )
- RET
-
-#if defined (__ELF__) && defined (__linux__)
- .section .note.GNU-stack,"",%progbits
-#endif
+ +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * NOTE: Avoid using spaces in between '(' ')' and arguments, especially + * with macros like CONST, LLBL that expand to CONCAT(...). Putting spaces + * in there will break the build on some platforms. + */ + +#include "assyntax.h" +#include "matypes.h" +#include "xform_args.h" + + SEG_TEXT + +#define FP_ONE 1065353216 +#define FP_ZERO 0 + +#define SRC0 REGOFF(0, ESI) +#define SRC1 REGOFF(4, ESI) +#define SRC2 REGOFF(8, ESI) +#define SRC3 REGOFF(12, ESI) +#define DST0 REGOFF(0, EDI) +#define DST1 REGOFF(4, EDI) +#define DST2 REGOFF(8, EDI) +#define DST3 REGOFF(12, EDI) +#define MAT0 REGOFF(0, EDX) +#define MAT1 REGOFF(4, EDX) +#define MAT2 REGOFF(8, EDX) +#define MAT3 REGOFF(12, EDX) +#define MAT4 REGOFF(16, EDX) +#define MAT5 REGOFF(20, EDX) +#define MAT6 REGOFF(24, EDX) +#define MAT7 REGOFF(28, EDX) +#define MAT8 REGOFF(32, EDX) +#define MAT9 REGOFF(36, EDX) +#define MAT10 REGOFF(40, EDX) +#define MAT11 REGOFF(44, EDX) +#define MAT12 REGOFF(48, EDX) +#define MAT13 REGOFF(52, EDX) +#define MAT14 REGOFF(56, EDX) +#define MAT15 REGOFF(60, EDX) + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_x86_transform_points3_general ) +HIDDEN(_mesa_x86_transform_points3_general) +GLNAME( _mesa_x86_transform_points3_general ): + +#define FRAME_OFFSET 8 + PUSH_L( ESI ) + PUSH_L( EDI ) + + MOV_L( ARG_SOURCE, ESI ) + MOV_L( ARG_DEST, EDI ) + + MOV_L( ARG_MATRIX, EDX ) + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) + + TEST_L( ECX, ECX ) + JZ( LLBL(x86_p3_gr_done) ) + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) + OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) + MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) + + SHL_L( CONST(4), ECX ) + MOV_L( REGOFF(V4F_START, ESI), ESI ) + + MOV_L( REGOFF(V4F_START, EDI), EDI ) + ADD_L( EDI, ECX ) + +ALIGNTEXT16 +LLBL(x86_p3_gr_loop): + + FLD_S( SRC0 ) /* F4 */ + FMUL_S( MAT0 ) + FLD_S( SRC0 ) /* F5 F4 */ + FMUL_S( MAT1 ) + FLD_S( SRC0 ) /* F6 F5 F4 */ + FMUL_S( MAT2 ) + FLD_S( SRC0 ) /* F7 F6 F5 F4 */ + FMUL_S( MAT3 ) + + FLD_S( SRC1 ) /* F0 F7 F6 F5 F4 */ + FMUL_S( MAT4 ) + FLD_S( SRC1 ) /* F1 F0 F7 F6 F5 F4 */ + FMUL_S( MAT5 ) + FLD_S( SRC1 ) /* F2 F1 F0 F7 F6 F5 F4 */ + FMUL_S( MAT6 ) + FLD_S( SRC1 ) /* F3 F2 F1 F0 F7 F6 F5 F4 */ + FMUL_S( MAT7 ) + + FXCH( ST(3) ) /* F0 F2 F1 F3 F7 F6 F5 F4 */ + FADDP( ST0, ST(7) ) /* F2 F1 F3 F7 F6 F5 F4 */ + FXCH( ST(1) ) /* F1 F2 F3 F7 F6 F5 F4 */ + FADDP( ST0, ST(5) ) /* F2 F3 F7 F6 F5 F4 */ + FADDP( ST0, ST(3) ) /* F3 F7 F6 F5 F4 */ + FADDP( ST0, ST(1) ) /* F7 F6 F5 F4 */ + + FLD_S( SRC2 ) /* F0 F7 F6 F5 F4 */ + FMUL_S( MAT8 ) + FLD_S( SRC2 ) /* F1 F0 F7 F6 F5 F4 */ + FMUL_S( MAT9 ) + FLD_S( SRC2 ) /* F2 F1 F0 F7 F6 F5 F4 */ + FMUL_S( MAT10 ) + FLD_S( SRC2 ) /* F3 F2 F1 F0 F7 F6 F5 F4 */ + FMUL_S( MAT11 ) + + FXCH( ST(3) ) /* F0 F2 F1 F3 F7 F6 F5 F4 */ + FADDP( ST0, ST(7) ) /* F2 F1 F3 F7 F6 F5 F4 */ + FXCH( ST(1) ) /* F1 F2 F3 F7 F6 F5 F4 */ + FADDP( ST0, ST(5) ) /* F2 F3 F7 F6 F5 F4 */ + FADDP( ST0, ST(3) ) /* F3 F7 F6 F5 F4 */ + FADDP( ST0, ST(1) ) /* F7 F6 F5 F4 */ + + FXCH( ST(3) ) /* F4 F6 F5 F7 */ + FADD_S( MAT12 ) + FXCH( ST(2) ) /* F5 F6 F4 F7 */ + FADD_S( MAT13 ) + FXCH( ST(1) ) /* F6 F5 F4 F7 */ + FADD_S( MAT14 ) + FXCH( ST(3) ) /* F7 F5 F4 F6 */ + FADD_S( MAT15 ) + + FXCH( ST(2) ) /* F4 F5 F7 F6 */ + FSTP_S( DST0 ) /* F5 F7 F6 */ + FSTP_S( DST1 ) /* F7 F6 */ + FXCH( ST(1) ) /* F6 F7 */ + FSTP_S( DST2 ) /* F7 */ + FSTP_S( DST3 ) /* */ + +LLBL(x86_p3_gr_skip): + + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(x86_p3_gr_loop) ) + +LLBL(x86_p3_gr_done): + + POP_L( EDI ) + POP_L( ESI ) + RET +#undef FRAME_OFFSET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_x86_transform_points3_perspective ) +HIDDEN(_mesa_x86_transform_points3_perspective) +GLNAME( _mesa_x86_transform_points3_perspective ): + +#define FRAME_OFFSET 12 + PUSH_L( ESI ) + PUSH_L( EDI ) + PUSH_L( EBX ) + + MOV_L( ARG_SOURCE, ESI ) + MOV_L( ARG_DEST, EDI ) + + MOV_L( ARG_MATRIX, EDX ) + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) + + TEST_L( ECX, ECX ) + JZ( LLBL(x86_p3_pr_done) ) + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) + OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) + MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) + + SHL_L( CONST(4), ECX ) + MOV_L( REGOFF(V4F_START, ESI), ESI ) + + MOV_L( REGOFF(V4F_START, EDI), EDI ) + ADD_L( EDI, ECX ) + +ALIGNTEXT16 +LLBL(x86_p3_pr_loop): + + FLD_S( SRC0 ) /* F4 */ + FMUL_S( MAT0 ) + + FLD_S( SRC1 ) /* F5 F4 */ + FMUL_S( MAT5 ) + + FLD_S( SRC2 ) /* F0 F5 F4 */ + FMUL_S( MAT8 ) + FLD_S( SRC2 ) /* F1 F0 F5 F4 */ + FMUL_S( MAT9 ) + FLD_S( SRC2 ) /* F2 F1 F0 F5 F4 */ + FMUL_S( MAT10 ) + + FXCH( ST(2) ) /* F0 F1 F2 F5 F4 */ + FADDP( ST0, ST(4) ) /* F1 F2 F5 F4 */ + FADDP( ST0, ST(2) ) /* F2 F5 F4 */ + FLD_S( MAT14 ) /* F6 F2 F5 F4 */ + FXCH( ST(1) ) /* F2 F6 F5 F4 */ + FADDP( ST0, ST(1) ) /* F6 F5 F4 */ + + MOV_L( SRC2, EBX ) + XOR_L( CONST(-2147483648), EBX )/* change sign */ + + FXCH( ST(2) ) /* F4 F5 F6 */ + FSTP_S( DST0 ) /* F5 F6 */ + FSTP_S( DST1 ) /* F6 */ + FSTP_S( DST2 ) /* */ + MOV_L( EBX, DST3 ) + +LLBL(x86_p3_pr_skip): + + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(x86_p3_pr_loop) ) + +LLBL(x86_p3_pr_done): + + POP_L( EBX ) + POP_L( EDI ) + POP_L( ESI ) + RET +#undef FRAME_OFFSET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_x86_transform_points3_3d ) +HIDDEN(_mesa_x86_transform_points3_3d) +GLNAME( _mesa_x86_transform_points3_3d ): + +#define FRAME_OFFSET 8 + PUSH_L( ESI ) + PUSH_L( EDI ) + + MOV_L( ARG_SOURCE, ESI ) + MOV_L( ARG_DEST, EDI ) + + MOV_L( ARG_MATRIX, EDX ) + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) + + TEST_L( ECX, ECX ) + JZ( LLBL(x86_p3_3dr_done) ) + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) + OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) + MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) + + SHL_L( CONST(4), ECX ) + MOV_L( REGOFF(V4F_START, ESI), ESI ) + + MOV_L( REGOFF(V4F_START, EDI), EDI ) + ADD_L( EDI, ECX ) + +ALIGNTEXT16 +LLBL(x86_p3_3dr_loop): + + FLD_S( SRC0 ) /* F4 */ + FMUL_S( MAT0 ) + FLD_S( SRC0 ) /* F5 F4 */ + FMUL_S( MAT1 ) + FLD_S( SRC0 ) /* F6 F5 F4 */ + FMUL_S( MAT2 ) + + FLD_S( SRC1 ) /* F0 F6 F5 F4 */ + FMUL_S( MAT4 ) + FLD_S( SRC1 ) /* F1 F0 F6 F5 F4 */ + FMUL_S( MAT5 ) + FLD_S( SRC1 ) /* F2 F1 F0 F6 F5 F4 */ + FMUL_S( MAT6 ) + + FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */ + FADDP( ST0, ST(5) ) /* F1 F2 F6 F5 F4 */ + FADDP( ST0, ST(3) ) /* F2 F6 F5 F4 */ + FADDP( ST0, ST(1) ) /* F6 F5 F4 */ + + FLD_S( SRC2 ) /* F0 F6 F5 F4 */ + FMUL_S( MAT8 ) + FLD_S( SRC2 ) /* F1 F0 F6 F5 F4 */ + FMUL_S( MAT9 ) + FLD_S( SRC2 ) /* F2 F1 F0 F6 F5 F4 */ + FMUL_S( MAT10 ) + + FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */ + FADDP( ST0, ST(5) ) /* F1 F2 F6 F5 F4 */ + FADDP( ST0, ST(3) ) /* F2 F6 F5 F4 */ + FADDP( ST0, ST(1) ) /* F6 F5 F4 */ + + FXCH( ST(2) ) /* F4 F5 F6 */ + FADD_S( MAT12 ) + FXCH( ST(1) ) /* F5 F4 F6 */ + FADD_S( MAT13 ) + FXCH( ST(2) ) /* F6 F4 F5 */ + FADD_S( MAT14 ) + + FXCH( ST(1) ) /* F4 F6 F5 */ + FSTP_S( DST0 ) /* F6 F5 */ + FXCH( ST(1) ) /* F5 F6 */ + FSTP_S( DST1 ) /* F6 */ + FSTP_S( DST2 ) /* */ + +LLBL(x86_p3_3dr_skip): + + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(x86_p3_3dr_loop) ) + +LLBL(x86_p3_3dr_done): + + POP_L( EDI ) + POP_L( ESI ) + RET +#undef FRAME_OFFSET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_x86_transform_points3_3d_no_rot ) +HIDDEN(_mesa_x86_transform_points3_3d_no_rot) +GLNAME( _mesa_x86_transform_points3_3d_no_rot ): + +#define FRAME_OFFSET 8 + PUSH_L( ESI ) + PUSH_L( EDI ) + + MOV_L( ARG_SOURCE, ESI ) + MOV_L( ARG_DEST, EDI ) + + + MOV_L( ARG_MATRIX, EDX ) + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) + + TEST_L( ECX, ECX ) + JZ( LLBL(x86_p3_3dnrr_done) ) + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) + OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) + MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) + + SHL_L( CONST(4), ECX ) + MOV_L( REGOFF(V4F_START, ESI), ESI ) + + MOV_L( REGOFF(V4F_START, EDI), EDI ) + ADD_L( EDI, ECX ) + +ALIGNTEXT16 +LLBL(x86_p3_3dnrr_loop): + + FLD_S( SRC0 ) /* F4 */ + FMUL_S( MAT0 ) + + FLD_S( SRC1 ) /* F1 F4 */ + FMUL_S( MAT5 ) + + FLD_S( SRC2 ) /* F2 F1 F4 */ + FMUL_S( MAT10 ) + + FXCH( ST(2) ) /* F4 F1 F2 */ + FADD_S( MAT12 ) + FLD_S( MAT13 ) /* F5 F4 F1 F2 */ + FXCH( ST(2) ) /* F1 F4 F5 F2 */ + FADDP( ST0, ST(2) ) /* F4 F5 F2 */ + FLD_S( MAT14 ) /* F6 F4 F5 F2 */ + FXCH( ST(3) ) /* F2 F4 F5 F6 */ + FADDP( ST0, ST(3) ) /* F4 F5 F6 */ + + FSTP_S( DST0 ) /* F5 F6 */ + FSTP_S( DST1 ) /* F6 */ + FSTP_S( DST2 ) /* */ + +LLBL(x86_p3_3dnrr_skip): + + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(x86_p3_3dnrr_loop) ) + +LLBL(x86_p3_3dnrr_done): + + POP_L( EDI ) + POP_L( ESI ) + RET +#undef FRAME_OFFSET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_x86_transform_points3_2d ) +HIDDEN(_mesa_x86_transform_points3_2d) +GLNAME( _mesa_x86_transform_points3_2d ): + +#define FRAME_OFFSET 12 + PUSH_L( ESI ) + PUSH_L( EDI ) + PUSH_L( EBX ) + + MOV_L( ARG_SOURCE, ESI ) + MOV_L( ARG_DEST, EDI ) + + MOV_L( ARG_MATRIX, EDX ) + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) + + TEST_L( ECX, ECX ) + JZ( LLBL(x86_p3_2dr_done) ) + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) + OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) + MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) + + SHL_L( CONST(4), ECX ) + MOV_L( REGOFF(V4F_START, ESI), ESI ) + + MOV_L( REGOFF(V4F_START, EDI), EDI ) + ADD_L( EDI, ECX ) + +ALIGNTEXT16 +LLBL(x86_p3_2dr_loop): + + FLD_S( SRC0 ) /* F4 */ + FMUL_S( MAT0 ) + FLD_S( SRC0 ) /* F5 F4 */ + FMUL_S( MAT1 ) + + FLD_S( SRC1 ) /* F0 F5 F4 */ + FMUL_S( MAT4 ) + FLD_S( SRC1 ) /* F1 F0 F5 F4 */ + FMUL_S( MAT5 ) + + FXCH( ST(1) ) /* F0 F1 F5 F4 */ + FADDP( ST0, ST(3) ) /* F1 F5 F4 */ + FADDP( ST0, ST(1) ) /* F5 F4 */ + + FXCH( ST(1) ) /* F4 F5 */ + FADD_S( MAT12 ) + FXCH( ST(1) ) /* F5 F4 */ + FADD_S( MAT13 ) + + MOV_L( SRC2, EBX ) + + FXCH( ST(1) ) /* F4 F5 */ + FSTP_S( DST0 ) /* F5 */ + FSTP_S( DST1 ) /* */ + MOV_L( EBX, DST2 ) + +LLBL(x86_p3_2dr_skip): + + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(x86_p3_2dr_loop) ) + +LLBL(x86_p3_2dr_done): + + POP_L( EBX ) + POP_L( EDI ) + POP_L( ESI ) + RET +#undef FRAME_OFFSET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_x86_transform_points3_2d_no_rot ) +HIDDEN(_mesa_x86_transform_points3_2d_no_rot) +GLNAME( _mesa_x86_transform_points3_2d_no_rot ): + +#define FRAME_OFFSET 12 + PUSH_L( ESI ) + PUSH_L( EDI ) + PUSH_L( EBX ) + + MOV_L( ARG_SOURCE, ESI ) + MOV_L( ARG_DEST, EDI ) + + MOV_L( ARG_MATRIX, EDX ) + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) + + TEST_L( ECX, ECX ) + JZ( LLBL(x86_p3_2dnrr_done) ) + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) + OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) + MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) + + SHL_L( CONST(4), ECX ) + MOV_L( REGOFF(V4F_START, ESI), ESI ) + + MOV_L( REGOFF(V4F_START, EDI), EDI ) + ADD_L( EDI, ECX ) + +ALIGNTEXT16 +LLBL(x86_p3_2dnrr_loop): + + FLD_S( SRC0 ) /* F4 */ + FMUL_S( MAT0 ) + + FLD_S( SRC1 ) /* F1 F4 */ + FMUL_S( MAT5 ) + + FXCH( ST(1) ) /* F4 F1 */ + FADD_S( MAT12 ) + FLD_S( MAT13 ) /* F5 F4 F1 */ + + FXCH( ST(2) ) /* F1 F4 F5 */ + FADDP( ST0, ST(2) ) /* F4 F5 */ + + MOV_L( SRC2, EBX ) + + FSTP_S( DST0 ) /* F5 */ + FSTP_S( DST1 ) /* */ + MOV_L( EBX, DST2 ) + +LLBL(x86_p3_2dnrr_skip): + + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(x86_p3_2dnrr_loop) ) + +LLBL(x86_p3_2dnrr_done): + + POP_L( EBX ) + POP_L( EDI ) + POP_L( ESI ) + RET +#undef FRAME_OFFSET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_x86_transform_points3_identity ) +HIDDEN(_mesa_x86_transform_points3_identity) +GLNAME(_mesa_x86_transform_points3_identity ): + +#define FRAME_OFFSET 16 + PUSH_L( ESI ) + PUSH_L( EDI ) + PUSH_L( EBX ) + PUSH_L( EBP ) + + MOV_L( ARG_SOURCE, ESI ) + MOV_L( ARG_DEST, EDI ) + + MOV_L( ARG_MATRIX, EDX ) + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) + + TEST_L( ECX, ECX ) + JZ( LLBL(x86_p3_ir_done) ) + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) + OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) + MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) + + SHL_L( CONST(4), ECX ) + MOV_L( REGOFF(V4F_START, ESI), ESI ) + + MOV_L( REGOFF(V4F_START, EDI), EDI ) + ADD_L( EDI, ECX ) + + CMP_L( ESI, EDI ) + JE( LLBL(x86_p3_ir_done) ) + +ALIGNTEXT16 +LLBL(x86_p3_ir_loop): + +#if 1 + MOV_L( SRC0, EBX ) + MOV_L( SRC1, EBP ) + MOV_L( SRC2, EDX ) + + MOV_L( EBX, DST0 ) + MOV_L( EBP, DST1 ) + MOV_L( EDX, DST2 ) +#else + FLD_S( SRC0 ) + FLD_S( SRC1 ) + FLD_S( SRC2 ) + + FSTP_S( DST2 ) + FSTP_S( DST1 ) + FSTP_S( DST0 ) +#endif + +LLBL(x86_p3_ir_skip): + + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(x86_p3_ir_loop) ) + +LLBL(x86_p3_ir_done): + + POP_L( EBP ) + POP_L( EBX ) + POP_L( EDI ) + POP_L( ESI ) + RET + +#if defined (__ELF__) && defined (__linux__) + .section .note.GNU-stack,"",%progbits +#endif diff --git a/mesalib/src/mesa/x86/x86_xform4.S b/mesalib/src/mesa/x86/x86_xform4.S index 687426eb5..97a841138 100644 --- a/mesalib/src/mesa/x86/x86_xform4.S +++ b/mesalib/src/mesa/x86/x86_xform4.S @@ -1,677 +1,677 @@ -
-/*
- * Mesa 3-D graphics library
- * Version: 3.5
- *
- * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * NOTE: Avoid using spaces in between '(' ')' and arguments, especially
- * with macros like CONST, LLBL that expand to CONCAT(...). Putting spaces
- * in there will break the build on some platforms.
- */
-
-#include "assyntax.h"
-#include "matypes.h"
-#include "xform_args.h"
-
- SEG_TEXT
-
-#define FP_ONE 1065353216
-#define FP_ZERO 0
-
-#define SRC0 REGOFF(0, ESI)
-#define SRC1 REGOFF(4, ESI)
-#define SRC2 REGOFF(8, ESI)
-#define SRC3 REGOFF(12, ESI)
-#define DST0 REGOFF(0, EDI)
-#define DST1 REGOFF(4, EDI)
-#define DST2 REGOFF(8, EDI)
-#define DST3 REGOFF(12, EDI)
-#define MAT0 REGOFF(0, EDX)
-#define MAT1 REGOFF(4, EDX)
-#define MAT2 REGOFF(8, EDX)
-#define MAT3 REGOFF(12, EDX)
-#define MAT4 REGOFF(16, EDX)
-#define MAT5 REGOFF(20, EDX)
-#define MAT6 REGOFF(24, EDX)
-#define MAT7 REGOFF(28, EDX)
-#define MAT8 REGOFF(32, EDX)
-#define MAT9 REGOFF(36, EDX)
-#define MAT10 REGOFF(40, EDX)
-#define MAT11 REGOFF(44, EDX)
-#define MAT12 REGOFF(48, EDX)
-#define MAT13 REGOFF(52, EDX)
-#define MAT14 REGOFF(56, EDX)
-#define MAT15 REGOFF(60, EDX)
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_x86_transform_points4_general )
-HIDDEN(_mesa_x86_transform_points4_general)
-GLNAME( _mesa_x86_transform_points4_general ):
-
-#define FRAME_OFFSET 8
- PUSH_L( ESI )
- PUSH_L( EDI )
-
- MOV_L( ARG_SOURCE, ESI )
- MOV_L( ARG_DEST, EDI )
-
- MOV_L( ARG_MATRIX, EDX )
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
-
- TEST_L( ECX, ECX )
- JZ( LLBL(x86_p4_gr_done) )
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
- OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
- MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
-
- SHL_L( CONST(4), ECX )
- MOV_L( REGOFF(V4F_START, ESI), ESI )
-
- MOV_L( REGOFF(V4F_START, EDI), EDI )
- ADD_L( EDI, ECX )
-
-ALIGNTEXT16
-LLBL(x86_p4_gr_loop):
-
- FLD_S( SRC0 ) /* F4 */
- FMUL_S( MAT0 )
- FLD_S( SRC0 ) /* F5 F4 */
- FMUL_S( MAT1 )
- FLD_S( SRC0 ) /* F6 F5 F4 */
- FMUL_S( MAT2 )
- FLD_S( SRC0 ) /* F7 F6 F5 F4 */
- FMUL_S( MAT3 )
-
- FLD_S( SRC1 ) /* F0 F7 F6 F5 F4 */
- FMUL_S( MAT4 )
- FLD_S( SRC1 ) /* F1 F0 F7 F6 F5 F4 */
- FMUL_S( MAT5 )
- FLD_S( SRC1 ) /* F2 F1 F0 F7 F6 F5 F4 */
- FMUL_S( MAT6 )
- FLD_S( SRC1 ) /* F3 F2 F1 F0 F7 F6 F5 F4 */
- FMUL_S( MAT7 )
-
- FXCH( ST(3) ) /* F0 F2 F1 F3 F7 F6 F5 F4 */
- FADDP( ST0, ST(7) ) /* F2 F1 F3 F7 F6 F5 F4 */
- FXCH( ST(1) ) /* F1 F2 F3 F7 F6 F5 F4 */
- FADDP( ST0, ST(5) ) /* F2 F3 F7 F6 F5 F4 */
- FADDP( ST0, ST(3) ) /* F3 F7 F6 F5 F4 */
- FADDP( ST0, ST(1) ) /* F7 F6 F5 F4 */
-
- FLD_S( SRC2 ) /* F0 F7 F6 F5 F4 */
- FMUL_S( MAT8 )
- FLD_S( SRC2 ) /* F1 F0 F7 F6 F5 F4 */
- FMUL_S( MAT9 )
- FLD_S( SRC2 ) /* F2 F1 F0 F7 F6 F5 F4 */
- FMUL_S( MAT10 )
- FLD_S( SRC2 ) /* F3 F2 F1 F0 F7 F6 F5 F4 */
- FMUL_S( MAT11 )
-
- FXCH( ST(3) ) /* F0 F2 F1 F3 F7 F6 F5 F4 */
- FADDP( ST0, ST(7) ) /* F2 F1 F3 F7 F6 F5 F4 */
- FXCH( ST(1) ) /* F1 F2 F3 F7 F6 F5 F4 */
- FADDP( ST0, ST(5) ) /* F2 F3 F7 F6 F5 F4 */
- FADDP( ST0, ST(3) ) /* F3 F7 F6 F5 F4 */
- FADDP( ST0, ST(1) ) /* F7 F6 F5 F4 */
-
- FLD_S( SRC3 ) /* F0 F7 F6 F5 F4 */
- FMUL_S( MAT12 )
- FLD_S( SRC3 ) /* F1 F0 F7 F6 F5 F4 */
- FMUL_S( MAT13 )
- FLD_S( SRC3 ) /* F2 F1 F0 F7 F6 F5 F4 */
- FMUL_S( MAT14 )
- FLD_S( SRC3 ) /* F3 F2 F1 F0 F7 F6 F5 F4 */
- FMUL_S( MAT15 )
-
- FXCH( ST(3) ) /* F0 F2 F1 F3 F7 F6 F5 F4 */
- FADDP( ST0, ST(7) ) /* F2 F1 F3 F7 F6 F5 F4 */
- FXCH( ST(1) ) /* F1 F2 F3 F7 F6 F5 F4 */
- FADDP( ST0, ST(5) ) /* F2 F3 F7 F6 F5 F4 */
- FADDP( ST0, ST(3) ) /* F3 F7 F6 F5 F4 */
- FADDP( ST0, ST(1) ) /* F7 F6 F5 F4 */
-
- FXCH( ST(3) ) /* F4 F6 F5 F7 */
- FSTP_S( DST0 ) /* F6 F5 F7 */
- FXCH( ST(1) ) /* F5 F6 F7 */
- FSTP_S( DST1 ) /* F6 F7 */
- FSTP_S( DST2 ) /* F7 */
- FSTP_S( DST3 ) /* */
-
-LLBL(x86_p4_gr_skip):
-
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(x86_p4_gr_loop) )
-
-LLBL(x86_p4_gr_done):
-
- POP_L( EDI )
- POP_L( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_x86_transform_points4_perspective )
-HIDDEN(_mesa_x86_transform_points4_perspective)
-GLNAME( _mesa_x86_transform_points4_perspective ):
-
-#define FRAME_OFFSET 12
- PUSH_L( ESI )
- PUSH_L( EDI )
- PUSH_L( EBX )
-
- MOV_L( ARG_SOURCE, ESI )
- MOV_L( ARG_DEST, EDI )
-
- MOV_L( ARG_MATRIX, EDX )
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
-
- TEST_L( ECX, ECX )
- JZ( LLBL(x86_p4_pr_done) )
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
- OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
- MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
-
- SHL_L( CONST(4), ECX )
- MOV_L( REGOFF(V4F_START, ESI), ESI )
-
- MOV_L( REGOFF(V4F_START, EDI), EDI )
- ADD_L( EDI, ECX )
-
-ALIGNTEXT16
-LLBL(x86_p4_pr_loop):
-
- FLD_S( SRC0 ) /* F4 */
- FMUL_S( MAT0 )
-
- FLD_S( SRC1 ) /* F5 F4 */
- FMUL_S( MAT5 )
-
- FLD_S( SRC2 ) /* F0 F5 F4 */
- FMUL_S( MAT8 )
- FLD_S( SRC2 ) /* F1 F0 F5 F4 */
- FMUL_S( MAT9 )
- FLD_S( SRC2 ) /* F6 F1 F0 F5 F4 */
- FMUL_S( MAT10 )
-
- FXCH( ST(2) ) /* F0 F1 F6 F5 F4 */
- FADDP( ST0, ST(4) ) /* F1 F6 F5 F4 */
- FADDP( ST0, ST(2) ) /* F6 F5 F4 */
-
- FLD_S( SRC3 ) /* F2 F6 F5 F4 */
- FMUL_S( MAT14 )
-
- FADDP( ST0, ST(1) ) /* F6 F5 F4 */
-
- MOV_L( SRC2, EBX )
- XOR_L( CONST(-2147483648), EBX )/* change sign */
-
- FXCH( ST(2) ) /* F4 F5 F6 */
- FSTP_S( DST0 ) /* F5 F6 */
- FSTP_S( DST1 ) /* F6 */
- FSTP_S( DST2 ) /* */
- MOV_L( EBX, DST3 )
-
-LLBL(x86_p4_pr_skip):
-
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(x86_p4_pr_loop) )
-
-LLBL(x86_p4_pr_done):
-
- POP_L( EBX )
- POP_L( EDI )
- POP_L( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_x86_transform_points4_3d )
-HIDDEN(_mesa_x86_transform_points4_3d)
-GLNAME( _mesa_x86_transform_points4_3d ):
-
-#define FRAME_OFFSET 12
- PUSH_L( ESI )
- PUSH_L( EDI )
- PUSH_L( EBX )
-
- MOV_L( ARG_SOURCE, ESI )
- MOV_L( ARG_DEST, EDI )
-
- MOV_L( ARG_MATRIX, EDX )
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
-
- TEST_L( ECX, ECX )
- JZ( LLBL(x86_p4_3dr_done) )
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
- OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
- MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
-
- SHL_L( CONST(4), ECX )
- MOV_L( REGOFF(V4F_START, ESI), ESI )
-
- MOV_L( REGOFF(V4F_START, EDI), EDI )
- ADD_L( EDI, ECX )
-
-ALIGNTEXT16
-LLBL(x86_p4_3dr_loop):
-
- FLD_S( SRC0 ) /* F4 */
- FMUL_S( MAT0 )
- FLD_S( SRC0 ) /* F5 F4 */
- FMUL_S( MAT1 )
- FLD_S( SRC0 ) /* F6 F5 F4 */
- FMUL_S( MAT2 )
-
- FLD_S( SRC1 ) /* F0 F6 F5 F4 */
- FMUL_S( MAT4 )
- FLD_S( SRC1 ) /* F1 F0 F6 F5 F4 */
- FMUL_S( MAT5 )
- FLD_S( SRC1 ) /* F2 F1 F0 F6 F5 F4 */
- FMUL_S( MAT6 )
-
- FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */
- FADDP( ST0, ST(5) ) /* F1 F2 F6 F5 F4 */
- FADDP( ST0, ST(3) ) /* F2 F6 F5 F4 */
- FADDP( ST0, ST(1) ) /* F6 F5 F4 */
-
- FLD_S( SRC2 ) /* F0 F6 F5 F4 */
- FMUL_S( MAT8 )
- FLD_S( SRC2 ) /* F1 F0 F6 F5 F4 */
- FMUL_S( MAT9 )
- FLD_S( SRC2 ) /* F2 F1 F0 F6 F5 F4 */
- FMUL_S( MAT10 )
-
- FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */
- FADDP( ST0, ST(5) ) /* F1 F2 F6 F5 F4 */
- FADDP( ST0, ST(3) ) /* F2 F6 F5 F4 */
- FADDP( ST0, ST(1) ) /* F6 F5 F4 */
-
- FLD_S( SRC3 ) /* F0 F6 F5 F4 */
- FMUL_S( MAT12 )
- FLD_S( SRC3 ) /* F1 F0 F6 F5 F4 */
- FMUL_S( MAT13 )
- FLD_S( SRC3 ) /* F2 F1 F0 F6 F5 F4 */
- FMUL_S( MAT14 )
-
- FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */
- FADDP( ST0, ST(5) ) /* F1 F2 F6 F5 F4 */
- FADDP( ST0, ST(3) ) /* F2 F6 F5 F4 */
- FADDP( ST0, ST(1) ) /* F6 F5 F4 */
-
- MOV_L( SRC3, EBX )
-
- FXCH( ST(2) ) /* F4 F5 F6 */
- FSTP_S( DST0 ) /* F5 F6 */
- FSTP_S( DST1 ) /* F6 */
- FSTP_S( DST2 ) /* */
- MOV_L( EBX, DST3 )
-
-LLBL(x86_p4_3dr_skip):
-
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(x86_p4_3dr_loop) )
-
-LLBL(x86_p4_3dr_done):
-
- POP_L( EBX )
- POP_L( EDI )
- POP_L( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME(_mesa_x86_transform_points4_3d_no_rot)
-HIDDEN(_mesa_x86_transform_points4_3d_no_rot)
-GLNAME(_mesa_x86_transform_points4_3d_no_rot):
-
-#define FRAME_OFFSET 12
- PUSH_L( ESI )
- PUSH_L( EDI )
- PUSH_L( EBX )
-
- MOV_L( ARG_SOURCE, ESI )
- MOV_L( ARG_DEST, EDI )
-
- MOV_L( ARG_MATRIX, EDX )
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
-
- TEST_L( ECX, ECX )
- JZ( LLBL(x86_p4_3dnrr_done) )
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
- OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
- MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
-
- SHL_L( CONST(4), ECX )
- MOV_L( REGOFF(V4F_START, ESI), ESI )
-
- MOV_L( REGOFF(V4F_START, EDI), EDI )
- ADD_L( EDI, ECX )
-
-ALIGNTEXT16
-LLBL(x86_p4_3dnrr_loop):
-
- FLD_S( SRC0 ) /* F4 */
- FMUL_S( MAT0 )
-
- FLD_S( SRC1 ) /* F5 F4 */
- FMUL_S( MAT5 )
-
- FLD_S( SRC2 ) /* F6 F5 F4 */
- FMUL_S( MAT10 )
-
- FLD_S( SRC3 ) /* F0 F6 F5 F4 */
- FMUL_S( MAT12 )
- FLD_S( SRC3 ) /* F1 F0 F6 F5 F4 */
- FMUL_S( MAT13 )
- FLD_S( SRC3 ) /* F2 F1 F0 F6 F5 F4 */
- FMUL_S( MAT14 )
-
- FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */
- FADDP( ST0, ST(5) ) /* F1 F2 F6 F5 F4 */
- FADDP( ST0, ST(3) ) /* F2 F6 F5 F4 */
- FADDP( ST0, ST(1) ) /* F6 F5 F4 */
-
- MOV_L( SRC3, EBX )
-
- FXCH( ST(2) ) /* F4 F5 F6 */
- FSTP_S( DST0 ) /* F5 F6 */
- FSTP_S( DST1 ) /* F6 */
- FSTP_S( DST2 ) /* */
- MOV_L( EBX, DST3 )
-
-LLBL(x86_p4_3dnrr_skip):
-
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(x86_p4_3dnrr_loop) )
-
-LLBL(x86_p4_3dnrr_done):
-
- POP_L( EBX )
- POP_L( EDI )
- POP_L( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_x86_transform_points4_2d )
-HIDDEN(_mesa_x86_transform_points4_2d)
-GLNAME( _mesa_x86_transform_points4_2d ):
-
-#define FRAME_OFFSET 16
- PUSH_L( ESI )
- PUSH_L( EDI )
- PUSH_L( EBX )
- PUSH_L( EBP )
-
- MOV_L( ARG_SOURCE, ESI )
- MOV_L( ARG_DEST, EDI )
-
- MOV_L( ARG_MATRIX, EDX )
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
-
- TEST_L( ECX, ECX )
- JZ( LLBL(x86_p4_2dr_done) )
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
- OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
- MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
-
- SHL_L( CONST(4), ECX )
- MOV_L( REGOFF(V4F_START, ESI), ESI )
-
- MOV_L( REGOFF(V4F_START, EDI), EDI )
- ADD_L( EDI, ECX )
-
-ALIGNTEXT16
-LLBL(x86_p4_2dr_loop):
-
- FLD_S( SRC0 ) /* F4 */
- FMUL_S( MAT0 )
- FLD_S( SRC0 ) /* F5 F4 */
- FMUL_S( MAT1 )
-
- FLD_S( SRC1 ) /* F0 F5 F4 */
- FMUL_S( MAT4 )
- FLD_S( SRC1 ) /* F1 F0 F5 F4 */
- FMUL_S( MAT5 )
-
- FXCH( ST(1) ) /* F0 F1 F5 F4 */
- FADDP( ST0, ST(3) ) /* F1 F5 F4 */
- FADDP( ST0, ST(1) ) /* F5 F4 */
-
- FLD_S( SRC3 ) /* F0 F5 F4 */
- FMUL_S( MAT12 )
- FLD_S( SRC3 ) /* F1 F0 F5 F4 */
- FMUL_S( MAT13 )
-
- FXCH( ST(1) ) /* F0 F1 F5 F4 */
- FADDP( ST0, ST(3) ) /* F1 F5 F4 */
- FADDP( ST0, ST(1) ) /* F5 F4 */
-
- MOV_L( SRC2, EBX )
- MOV_L( SRC3, EBP )
-
- FXCH( ST(1) ) /* F4 F5 */
- FSTP_S( DST0 ) /* F5 */
- FSTP_S( DST1 ) /* */
- MOV_L( EBX, DST2 )
- MOV_L( EBP, DST3 )
-
-LLBL(x86_p4_2dr_skip):
-
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(x86_p4_2dr_loop) )
-
-LLBL(x86_p4_2dr_done):
-
- POP_L( EBP )
- POP_L( EBX )
- POP_L( EDI )
- POP_L( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_x86_transform_points4_2d_no_rot )
-HIDDEN(_mesa_x86_transform_points4_2d_no_rot)
-GLNAME( _mesa_x86_transform_points4_2d_no_rot ):
-
-#define FRAME_OFFSET 16
- PUSH_L( ESI )
- PUSH_L( EDI )
- PUSH_L( EBX )
- PUSH_L( EBP )
-
- MOV_L( ARG_SOURCE, ESI )
- MOV_L( ARG_DEST, EDI )
-
- MOV_L( ARG_MATRIX, EDX )
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
-
- TEST_L( ECX, ECX )
- JZ( LLBL(x86_p4_2dnrr_done) )
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
- OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
- MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
-
- SHL_L( CONST(4), ECX )
- MOV_L( REGOFF(V4F_START, ESI), ESI )
-
- MOV_L( REGOFF(V4F_START, EDI), EDI )
- ADD_L( EDI, ECX )
-
-ALIGNTEXT16
-LLBL(x86_p4_2dnrr_loop):
-
- FLD_S( SRC0 ) /* F4 */
- FMUL_S( MAT0 )
-
- FLD_S( SRC1 ) /* F5 F4 */
- FMUL_S( MAT5 )
-
- FLD_S( SRC3 ) /* F0 F5 F4 */
- FMUL_S( MAT12 )
- FLD_S( SRC3 ) /* F1 F0 F5 F4 */
- FMUL_S( MAT13 )
-
- FXCH( ST(1) ) /* F0 F1 F5 F4 */
- FADDP( ST0, ST(3) ) /* F1 F5 F4 */
- FADDP( ST0, ST(1) ) /* F5 F4 */
-
- MOV_L( SRC2, EBX )
- MOV_L( SRC3, EBP )
-
- FXCH( ST(1) ) /* F4 F5 */
- FSTP_S( DST0 ) /* F5 */
- FSTP_S( DST1 ) /* */
- MOV_L( EBX, DST2 )
- MOV_L( EBP, DST3 )
-
-LLBL(x86_p4_2dnrr_skip):
-
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(x86_p4_2dnrr_loop) )
-
-LLBL(x86_p4_2dnrr_done):
-
- POP_L( EBP )
- POP_L( EBX )
- POP_L( EDI )
- POP_L( ESI )
- RET
-#undef FRAME_OFFSET
-
-
-
-
-ALIGNTEXT16
-GLOBL GLNAME( _mesa_x86_transform_points4_identity )
-HIDDEN(_mesa_x86_transform_points4_identity)
-GLNAME( _mesa_x86_transform_points4_identity ):
-
-#define FRAME_OFFSET 12
- PUSH_L( ESI )
- PUSH_L( EDI )
- PUSH_L( EBX )
-
- MOV_L( ARG_SOURCE, ESI )
- MOV_L( ARG_DEST, EDI )
-
- MOV_L( ARG_MATRIX, EDX )
- MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
-
- TEST_L( ECX, ECX )
- JZ( LLBL(x86_p4_ir_done) )
-
- MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
- OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
-
- MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
- MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
-
- SHL_L( CONST(4), ECX )
- MOV_L( REGOFF(V4F_START, ESI), ESI )
-
- MOV_L( REGOFF(V4F_START, EDI), EDI )
- ADD_L( EDI, ECX )
-
- CMP_L( ESI, EDI )
- JE( LLBL(x86_p4_ir_done) )
-
-ALIGNTEXT16
-LLBL(x86_p4_ir_loop):
-
- MOV_L( SRC0, EBX )
- MOV_L( SRC1, EDX )
-
- MOV_L( EBX, DST0 )
- MOV_L( EDX, DST1 )
-
- MOV_L( SRC2, EBX )
- MOV_L( SRC3, EDX )
-
- MOV_L( EBX, DST2 )
- MOV_L( EDX, DST3 )
-
-LLBL(x86_p4_ir_skip):
-
- ADD_L( CONST(16), EDI )
- ADD_L( EAX, ESI )
- CMP_L( ECX, EDI )
- JNE( LLBL(x86_p4_ir_loop) )
-
-LLBL(x86_p4_ir_done):
-
- POP_L( EBX )
- POP_L( EDI )
- POP_L( ESI )
- RET
-
-#if defined (__ELF__) && defined (__linux__)
- .section .note.GNU-stack,"",%progbits
-#endif
+ +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * NOTE: Avoid using spaces in between '(' ')' and arguments, especially + * with macros like CONST, LLBL that expand to CONCAT(...). Putting spaces + * in there will break the build on some platforms. + */ + +#include "assyntax.h" +#include "matypes.h" +#include "xform_args.h" + + SEG_TEXT + +#define FP_ONE 1065353216 +#define FP_ZERO 0 + +#define SRC0 REGOFF(0, ESI) +#define SRC1 REGOFF(4, ESI) +#define SRC2 REGOFF(8, ESI) +#define SRC3 REGOFF(12, ESI) +#define DST0 REGOFF(0, EDI) +#define DST1 REGOFF(4, EDI) +#define DST2 REGOFF(8, EDI) +#define DST3 REGOFF(12, EDI) +#define MAT0 REGOFF(0, EDX) +#define MAT1 REGOFF(4, EDX) +#define MAT2 REGOFF(8, EDX) +#define MAT3 REGOFF(12, EDX) +#define MAT4 REGOFF(16, EDX) +#define MAT5 REGOFF(20, EDX) +#define MAT6 REGOFF(24, EDX) +#define MAT7 REGOFF(28, EDX) +#define MAT8 REGOFF(32, EDX) +#define MAT9 REGOFF(36, EDX) +#define MAT10 REGOFF(40, EDX) +#define MAT11 REGOFF(44, EDX) +#define MAT12 REGOFF(48, EDX) +#define MAT13 REGOFF(52, EDX) +#define MAT14 REGOFF(56, EDX) +#define MAT15 REGOFF(60, EDX) + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_x86_transform_points4_general ) +HIDDEN(_mesa_x86_transform_points4_general) +GLNAME( _mesa_x86_transform_points4_general ): + +#define FRAME_OFFSET 8 + PUSH_L( ESI ) + PUSH_L( EDI ) + + MOV_L( ARG_SOURCE, ESI ) + MOV_L( ARG_DEST, EDI ) + + MOV_L( ARG_MATRIX, EDX ) + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) + + TEST_L( ECX, ECX ) + JZ( LLBL(x86_p4_gr_done) ) + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) + OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) + MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) + + SHL_L( CONST(4), ECX ) + MOV_L( REGOFF(V4F_START, ESI), ESI ) + + MOV_L( REGOFF(V4F_START, EDI), EDI ) + ADD_L( EDI, ECX ) + +ALIGNTEXT16 +LLBL(x86_p4_gr_loop): + + FLD_S( SRC0 ) /* F4 */ + FMUL_S( MAT0 ) + FLD_S( SRC0 ) /* F5 F4 */ + FMUL_S( MAT1 ) + FLD_S( SRC0 ) /* F6 F5 F4 */ + FMUL_S( MAT2 ) + FLD_S( SRC0 ) /* F7 F6 F5 F4 */ + FMUL_S( MAT3 ) + + FLD_S( SRC1 ) /* F0 F7 F6 F5 F4 */ + FMUL_S( MAT4 ) + FLD_S( SRC1 ) /* F1 F0 F7 F6 F5 F4 */ + FMUL_S( MAT5 ) + FLD_S( SRC1 ) /* F2 F1 F0 F7 F6 F5 F4 */ + FMUL_S( MAT6 ) + FLD_S( SRC1 ) /* F3 F2 F1 F0 F7 F6 F5 F4 */ + FMUL_S( MAT7 ) + + FXCH( ST(3) ) /* F0 F2 F1 F3 F7 F6 F5 F4 */ + FADDP( ST0, ST(7) ) /* F2 F1 F3 F7 F6 F5 F4 */ + FXCH( ST(1) ) /* F1 F2 F3 F7 F6 F5 F4 */ + FADDP( ST0, ST(5) ) /* F2 F3 F7 F6 F5 F4 */ + FADDP( ST0, ST(3) ) /* F3 F7 F6 F5 F4 */ + FADDP( ST0, ST(1) ) /* F7 F6 F5 F4 */ + + FLD_S( SRC2 ) /* F0 F7 F6 F5 F4 */ + FMUL_S( MAT8 ) + FLD_S( SRC2 ) /* F1 F0 F7 F6 F5 F4 */ + FMUL_S( MAT9 ) + FLD_S( SRC2 ) /* F2 F1 F0 F7 F6 F5 F4 */ + FMUL_S( MAT10 ) + FLD_S( SRC2 ) /* F3 F2 F1 F0 F7 F6 F5 F4 */ + FMUL_S( MAT11 ) + + FXCH( ST(3) ) /* F0 F2 F1 F3 F7 F6 F5 F4 */ + FADDP( ST0, ST(7) ) /* F2 F1 F3 F7 F6 F5 F4 */ + FXCH( ST(1) ) /* F1 F2 F3 F7 F6 F5 F4 */ + FADDP( ST0, ST(5) ) /* F2 F3 F7 F6 F5 F4 */ + FADDP( ST0, ST(3) ) /* F3 F7 F6 F5 F4 */ + FADDP( ST0, ST(1) ) /* F7 F6 F5 F4 */ + + FLD_S( SRC3 ) /* F0 F7 F6 F5 F4 */ + FMUL_S( MAT12 ) + FLD_S( SRC3 ) /* F1 F0 F7 F6 F5 F4 */ + FMUL_S( MAT13 ) + FLD_S( SRC3 ) /* F2 F1 F0 F7 F6 F5 F4 */ + FMUL_S( MAT14 ) + FLD_S( SRC3 ) /* F3 F2 F1 F0 F7 F6 F5 F4 */ + FMUL_S( MAT15 ) + + FXCH( ST(3) ) /* F0 F2 F1 F3 F7 F6 F5 F4 */ + FADDP( ST0, ST(7) ) /* F2 F1 F3 F7 F6 F5 F4 */ + FXCH( ST(1) ) /* F1 F2 F3 F7 F6 F5 F4 */ + FADDP( ST0, ST(5) ) /* F2 F3 F7 F6 F5 F4 */ + FADDP( ST0, ST(3) ) /* F3 F7 F6 F5 F4 */ + FADDP( ST0, ST(1) ) /* F7 F6 F5 F4 */ + + FXCH( ST(3) ) /* F4 F6 F5 F7 */ + FSTP_S( DST0 ) /* F6 F5 F7 */ + FXCH( ST(1) ) /* F5 F6 F7 */ + FSTP_S( DST1 ) /* F6 F7 */ + FSTP_S( DST2 ) /* F7 */ + FSTP_S( DST3 ) /* */ + +LLBL(x86_p4_gr_skip): + + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(x86_p4_gr_loop) ) + +LLBL(x86_p4_gr_done): + + POP_L( EDI ) + POP_L( ESI ) + RET +#undef FRAME_OFFSET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_x86_transform_points4_perspective ) +HIDDEN(_mesa_x86_transform_points4_perspective) +GLNAME( _mesa_x86_transform_points4_perspective ): + +#define FRAME_OFFSET 12 + PUSH_L( ESI ) + PUSH_L( EDI ) + PUSH_L( EBX ) + + MOV_L( ARG_SOURCE, ESI ) + MOV_L( ARG_DEST, EDI ) + + MOV_L( ARG_MATRIX, EDX ) + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) + + TEST_L( ECX, ECX ) + JZ( LLBL(x86_p4_pr_done) ) + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) + OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) + MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) + + SHL_L( CONST(4), ECX ) + MOV_L( REGOFF(V4F_START, ESI), ESI ) + + MOV_L( REGOFF(V4F_START, EDI), EDI ) + ADD_L( EDI, ECX ) + +ALIGNTEXT16 +LLBL(x86_p4_pr_loop): + + FLD_S( SRC0 ) /* F4 */ + FMUL_S( MAT0 ) + + FLD_S( SRC1 ) /* F5 F4 */ + FMUL_S( MAT5 ) + + FLD_S( SRC2 ) /* F0 F5 F4 */ + FMUL_S( MAT8 ) + FLD_S( SRC2 ) /* F1 F0 F5 F4 */ + FMUL_S( MAT9 ) + FLD_S( SRC2 ) /* F6 F1 F0 F5 F4 */ + FMUL_S( MAT10 ) + + FXCH( ST(2) ) /* F0 F1 F6 F5 F4 */ + FADDP( ST0, ST(4) ) /* F1 F6 F5 F4 */ + FADDP( ST0, ST(2) ) /* F6 F5 F4 */ + + FLD_S( SRC3 ) /* F2 F6 F5 F4 */ + FMUL_S( MAT14 ) + + FADDP( ST0, ST(1) ) /* F6 F5 F4 */ + + MOV_L( SRC2, EBX ) + XOR_L( CONST(-2147483648), EBX )/* change sign */ + + FXCH( ST(2) ) /* F4 F5 F6 */ + FSTP_S( DST0 ) /* F5 F6 */ + FSTP_S( DST1 ) /* F6 */ + FSTP_S( DST2 ) /* */ + MOV_L( EBX, DST3 ) + +LLBL(x86_p4_pr_skip): + + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(x86_p4_pr_loop) ) + +LLBL(x86_p4_pr_done): + + POP_L( EBX ) + POP_L( EDI ) + POP_L( ESI ) + RET +#undef FRAME_OFFSET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_x86_transform_points4_3d ) +HIDDEN(_mesa_x86_transform_points4_3d) +GLNAME( _mesa_x86_transform_points4_3d ): + +#define FRAME_OFFSET 12 + PUSH_L( ESI ) + PUSH_L( EDI ) + PUSH_L( EBX ) + + MOV_L( ARG_SOURCE, ESI ) + MOV_L( ARG_DEST, EDI ) + + MOV_L( ARG_MATRIX, EDX ) + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) + + TEST_L( ECX, ECX ) + JZ( LLBL(x86_p4_3dr_done) ) + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) + OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) + MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) + + SHL_L( CONST(4), ECX ) + MOV_L( REGOFF(V4F_START, ESI), ESI ) + + MOV_L( REGOFF(V4F_START, EDI), EDI ) + ADD_L( EDI, ECX ) + +ALIGNTEXT16 +LLBL(x86_p4_3dr_loop): + + FLD_S( SRC0 ) /* F4 */ + FMUL_S( MAT0 ) + FLD_S( SRC0 ) /* F5 F4 */ + FMUL_S( MAT1 ) + FLD_S( SRC0 ) /* F6 F5 F4 */ + FMUL_S( MAT2 ) + + FLD_S( SRC1 ) /* F0 F6 F5 F4 */ + FMUL_S( MAT4 ) + FLD_S( SRC1 ) /* F1 F0 F6 F5 F4 */ + FMUL_S( MAT5 ) + FLD_S( SRC1 ) /* F2 F1 F0 F6 F5 F4 */ + FMUL_S( MAT6 ) + + FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */ + FADDP( ST0, ST(5) ) /* F1 F2 F6 F5 F4 */ + FADDP( ST0, ST(3) ) /* F2 F6 F5 F4 */ + FADDP( ST0, ST(1) ) /* F6 F5 F4 */ + + FLD_S( SRC2 ) /* F0 F6 F5 F4 */ + FMUL_S( MAT8 ) + FLD_S( SRC2 ) /* F1 F0 F6 F5 F4 */ + FMUL_S( MAT9 ) + FLD_S( SRC2 ) /* F2 F1 F0 F6 F5 F4 */ + FMUL_S( MAT10 ) + + FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */ + FADDP( ST0, ST(5) ) /* F1 F2 F6 F5 F4 */ + FADDP( ST0, ST(3) ) /* F2 F6 F5 F4 */ + FADDP( ST0, ST(1) ) /* F6 F5 F4 */ + + FLD_S( SRC3 ) /* F0 F6 F5 F4 */ + FMUL_S( MAT12 ) + FLD_S( SRC3 ) /* F1 F0 F6 F5 F4 */ + FMUL_S( MAT13 ) + FLD_S( SRC3 ) /* F2 F1 F0 F6 F5 F4 */ + FMUL_S( MAT14 ) + + FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */ + FADDP( ST0, ST(5) ) /* F1 F2 F6 F5 F4 */ + FADDP( ST0, ST(3) ) /* F2 F6 F5 F4 */ + FADDP( ST0, ST(1) ) /* F6 F5 F4 */ + + MOV_L( SRC3, EBX ) + + FXCH( ST(2) ) /* F4 F5 F6 */ + FSTP_S( DST0 ) /* F5 F6 */ + FSTP_S( DST1 ) /* F6 */ + FSTP_S( DST2 ) /* */ + MOV_L( EBX, DST3 ) + +LLBL(x86_p4_3dr_skip): + + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(x86_p4_3dr_loop) ) + +LLBL(x86_p4_3dr_done): + + POP_L( EBX ) + POP_L( EDI ) + POP_L( ESI ) + RET +#undef FRAME_OFFSET + + + + +ALIGNTEXT16 +GLOBL GLNAME(_mesa_x86_transform_points4_3d_no_rot) +HIDDEN(_mesa_x86_transform_points4_3d_no_rot) +GLNAME(_mesa_x86_transform_points4_3d_no_rot): + +#define FRAME_OFFSET 12 + PUSH_L( ESI ) + PUSH_L( EDI ) + PUSH_L( EBX ) + + MOV_L( ARG_SOURCE, ESI ) + MOV_L( ARG_DEST, EDI ) + + MOV_L( ARG_MATRIX, EDX ) + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) + + TEST_L( ECX, ECX ) + JZ( LLBL(x86_p4_3dnrr_done) ) + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) + OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) + MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) + + SHL_L( CONST(4), ECX ) + MOV_L( REGOFF(V4F_START, ESI), ESI ) + + MOV_L( REGOFF(V4F_START, EDI), EDI ) + ADD_L( EDI, ECX ) + +ALIGNTEXT16 +LLBL(x86_p4_3dnrr_loop): + + FLD_S( SRC0 ) /* F4 */ + FMUL_S( MAT0 ) + + FLD_S( SRC1 ) /* F5 F4 */ + FMUL_S( MAT5 ) + + FLD_S( SRC2 ) /* F6 F5 F4 */ + FMUL_S( MAT10 ) + + FLD_S( SRC3 ) /* F0 F6 F5 F4 */ + FMUL_S( MAT12 ) + FLD_S( SRC3 ) /* F1 F0 F6 F5 F4 */ + FMUL_S( MAT13 ) + FLD_S( SRC3 ) /* F2 F1 F0 F6 F5 F4 */ + FMUL_S( MAT14 ) + + FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */ + FADDP( ST0, ST(5) ) /* F1 F2 F6 F5 F4 */ + FADDP( ST0, ST(3) ) /* F2 F6 F5 F4 */ + FADDP( ST0, ST(1) ) /* F6 F5 F4 */ + + MOV_L( SRC3, EBX ) + + FXCH( ST(2) ) /* F4 F5 F6 */ + FSTP_S( DST0 ) /* F5 F6 */ + FSTP_S( DST1 ) /* F6 */ + FSTP_S( DST2 ) /* */ + MOV_L( EBX, DST3 ) + +LLBL(x86_p4_3dnrr_skip): + + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(x86_p4_3dnrr_loop) ) + +LLBL(x86_p4_3dnrr_done): + + POP_L( EBX ) + POP_L( EDI ) + POP_L( ESI ) + RET +#undef FRAME_OFFSET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_x86_transform_points4_2d ) +HIDDEN(_mesa_x86_transform_points4_2d) +GLNAME( _mesa_x86_transform_points4_2d ): + +#define FRAME_OFFSET 16 + PUSH_L( ESI ) + PUSH_L( EDI ) + PUSH_L( EBX ) + PUSH_L( EBP ) + + MOV_L( ARG_SOURCE, ESI ) + MOV_L( ARG_DEST, EDI ) + + MOV_L( ARG_MATRIX, EDX ) + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) + + TEST_L( ECX, ECX ) + JZ( LLBL(x86_p4_2dr_done) ) + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) + OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) + MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) + + SHL_L( CONST(4), ECX ) + MOV_L( REGOFF(V4F_START, ESI), ESI ) + + MOV_L( REGOFF(V4F_START, EDI), EDI ) + ADD_L( EDI, ECX ) + +ALIGNTEXT16 +LLBL(x86_p4_2dr_loop): + + FLD_S( SRC0 ) /* F4 */ + FMUL_S( MAT0 ) + FLD_S( SRC0 ) /* F5 F4 */ + FMUL_S( MAT1 ) + + FLD_S( SRC1 ) /* F0 F5 F4 */ + FMUL_S( MAT4 ) + FLD_S( SRC1 ) /* F1 F0 F5 F4 */ + FMUL_S( MAT5 ) + + FXCH( ST(1) ) /* F0 F1 F5 F4 */ + FADDP( ST0, ST(3) ) /* F1 F5 F4 */ + FADDP( ST0, ST(1) ) /* F5 F4 */ + + FLD_S( SRC3 ) /* F0 F5 F4 */ + FMUL_S( MAT12 ) + FLD_S( SRC3 ) /* F1 F0 F5 F4 */ + FMUL_S( MAT13 ) + + FXCH( ST(1) ) /* F0 F1 F5 F4 */ + FADDP( ST0, ST(3) ) /* F1 F5 F4 */ + FADDP( ST0, ST(1) ) /* F5 F4 */ + + MOV_L( SRC2, EBX ) + MOV_L( SRC3, EBP ) + + FXCH( ST(1) ) /* F4 F5 */ + FSTP_S( DST0 ) /* F5 */ + FSTP_S( DST1 ) /* */ + MOV_L( EBX, DST2 ) + MOV_L( EBP, DST3 ) + +LLBL(x86_p4_2dr_skip): + + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(x86_p4_2dr_loop) ) + +LLBL(x86_p4_2dr_done): + + POP_L( EBP ) + POP_L( EBX ) + POP_L( EDI ) + POP_L( ESI ) + RET +#undef FRAME_OFFSET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_x86_transform_points4_2d_no_rot ) +HIDDEN(_mesa_x86_transform_points4_2d_no_rot) +GLNAME( _mesa_x86_transform_points4_2d_no_rot ): + +#define FRAME_OFFSET 16 + PUSH_L( ESI ) + PUSH_L( EDI ) + PUSH_L( EBX ) + PUSH_L( EBP ) + + MOV_L( ARG_SOURCE, ESI ) + MOV_L( ARG_DEST, EDI ) + + MOV_L( ARG_MATRIX, EDX ) + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) + + TEST_L( ECX, ECX ) + JZ( LLBL(x86_p4_2dnrr_done) ) + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) + OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) + MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) + + SHL_L( CONST(4), ECX ) + MOV_L( REGOFF(V4F_START, ESI), ESI ) + + MOV_L( REGOFF(V4F_START, EDI), EDI ) + ADD_L( EDI, ECX ) + +ALIGNTEXT16 +LLBL(x86_p4_2dnrr_loop): + + FLD_S( SRC0 ) /* F4 */ + FMUL_S( MAT0 ) + + FLD_S( SRC1 ) /* F5 F4 */ + FMUL_S( MAT5 ) + + FLD_S( SRC3 ) /* F0 F5 F4 */ + FMUL_S( MAT12 ) + FLD_S( SRC3 ) /* F1 F0 F5 F4 */ + FMUL_S( MAT13 ) + + FXCH( ST(1) ) /* F0 F1 F5 F4 */ + FADDP( ST0, ST(3) ) /* F1 F5 F4 */ + FADDP( ST0, ST(1) ) /* F5 F4 */ + + MOV_L( SRC2, EBX ) + MOV_L( SRC3, EBP ) + + FXCH( ST(1) ) /* F4 F5 */ + FSTP_S( DST0 ) /* F5 */ + FSTP_S( DST1 ) /* */ + MOV_L( EBX, DST2 ) + MOV_L( EBP, DST3 ) + +LLBL(x86_p4_2dnrr_skip): + + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(x86_p4_2dnrr_loop) ) + +LLBL(x86_p4_2dnrr_done): + + POP_L( EBP ) + POP_L( EBX ) + POP_L( EDI ) + POP_L( ESI ) + RET +#undef FRAME_OFFSET + + + + +ALIGNTEXT16 +GLOBL GLNAME( _mesa_x86_transform_points4_identity ) +HIDDEN(_mesa_x86_transform_points4_identity) +GLNAME( _mesa_x86_transform_points4_identity ): + +#define FRAME_OFFSET 12 + PUSH_L( ESI ) + PUSH_L( EDI ) + PUSH_L( EBX ) + + MOV_L( ARG_SOURCE, ESI ) + MOV_L( ARG_DEST, EDI ) + + MOV_L( ARG_MATRIX, EDX ) + MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) + + TEST_L( ECX, ECX ) + JZ( LLBL(x86_p4_ir_done) ) + + MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) + OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) + + MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) + MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) + + SHL_L( CONST(4), ECX ) + MOV_L( REGOFF(V4F_START, ESI), ESI ) + + MOV_L( REGOFF(V4F_START, EDI), EDI ) + ADD_L( EDI, ECX ) + + CMP_L( ESI, EDI ) + JE( LLBL(x86_p4_ir_done) ) + +ALIGNTEXT16 +LLBL(x86_p4_ir_loop): + + MOV_L( SRC0, EBX ) + MOV_L( SRC1, EDX ) + + MOV_L( EBX, DST0 ) + MOV_L( EDX, DST1 ) + + MOV_L( SRC2, EBX ) + MOV_L( SRC3, EDX ) + + MOV_L( EBX, DST2 ) + MOV_L( EDX, DST3 ) + +LLBL(x86_p4_ir_skip): + + ADD_L( CONST(16), EDI ) + ADD_L( EAX, ESI ) + CMP_L( ECX, EDI ) + JNE( LLBL(x86_p4_ir_loop) ) + +LLBL(x86_p4_ir_done): + + POP_L( EBX ) + POP_L( EDI ) + POP_L( ESI ) + RET + +#if defined (__ELF__) && defined (__linux__) + .section .note.GNU-stack,"",%progbits +#endif diff --git a/mesalib/src/mesa/x86/xform_args.h b/mesalib/src/mesa/x86/xform_args.h index 772ab9d1f..b773f5198 100644 --- a/mesalib/src/mesa/x86/xform_args.h +++ b/mesalib/src/mesa/x86/xform_args.h @@ -1,51 +1,51 @@ -
-/*
- * Mesa 3-D graphics library
- * Version: 3.5
- *
- * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * Transform function interface for assembly code. Simply define
- * FRAME_OFFSET to the number of bytes pushed onto the stack before
- * using the ARG_* argument macros.
- *
- * Gareth Hughes
- */
-
-#ifndef __XFORM_ARGS_H__
-#define __XFORM_ARGS_H__
-
-/* Offsets for transform_func arguments
- *
- * typedef void (*transform_func)( GLvector4f *to_vec,
- * const GLfloat m[16],
- * const GLvector4f *from_vec );
- */
-#define OFFSET_DEST 4
-#define OFFSET_MATRIX 8
-#define OFFSET_SOURCE 12
-
-#define ARG_DEST REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP)
-#define ARG_MATRIX REGOFF(FRAME_OFFSET+OFFSET_MATRIX, ESP)
-#define ARG_SOURCE REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP)
-
-#endif
+ +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * Transform function interface for assembly code. Simply define + * FRAME_OFFSET to the number of bytes pushed onto the stack before + * using the ARG_* argument macros. + * + * Gareth Hughes + */ + +#ifndef __XFORM_ARGS_H__ +#define __XFORM_ARGS_H__ + +/* Offsets for transform_func arguments + * + * typedef void (*transform_func)( GLvector4f *to_vec, + * const GLfloat m[16], + * const GLvector4f *from_vec ); + */ +#define OFFSET_DEST 4 +#define OFFSET_MATRIX 8 +#define OFFSET_SOURCE 12 + +#define ARG_DEST REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP) +#define ARG_MATRIX REGOFF(FRAME_OFFSET+OFFSET_MATRIX, ESP) +#define ARG_SOURCE REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP) + +#endif |