From a0c4815433ccd57322f4f7703ca35e9ccfa59250 Mon Sep 17 00:00:00 2001
From: marha <marha@users.sourceforge.net>
Date: Thu, 8 Oct 2009 13:15:52 +0000
Subject: Added MesaLib-7.6

---
 mesalib/src/mesa/math/descrip.mms     |   47 +
 mesalib/src/mesa/math/m_clip_tmp.h    |  243 +++++
 mesalib/src/mesa/math/m_copy_tmp.h    |   86 ++
 mesalib/src/mesa/math/m_debug.h       |   42 +
 mesalib/src/mesa/math/m_debug_clip.c  |  371 ++++++++
 mesalib/src/mesa/math/m_debug_norm.c  |  383 ++++++++
 mesalib/src/mesa/math/m_debug_util.h  |  320 +++++++
 mesalib/src/mesa/math/m_debug_xform.c |  339 +++++++
 mesalib/src/mesa/math/m_dotprod_tmp.h |  102 ++
 mesalib/src/mesa/math/m_eval.c        |  461 +++++++++
 mesalib/src/mesa/math/m_eval.h        |  103 +++
 mesalib/src/mesa/math/m_matrix.c      | 1642 +++++++++++++++++++++++++++++++++
 mesalib/src/mesa/math/m_matrix.h      |  210 +++++
 mesalib/src/mesa/math/m_norm_tmp.h    |  390 ++++++++
 mesalib/src/mesa/math/m_trans_tmp.h   |  281 ++++++
 mesalib/src/mesa/math/m_translate.c   |  751 +++++++++++++++
 mesalib/src/mesa/math/m_translate.h   |  122 +++
 mesalib/src/mesa/math/m_vector.c      |  185 ++++
 mesalib/src/mesa/math/m_vector.h      |   92 ++
 mesalib/src/mesa/math/m_xform.c       |  128 +++
 mesalib/src/mesa/math/m_xform.h       |  166 ++++
 mesalib/src/mesa/math/m_xform_tmp.h   |  810 ++++++++++++++++
 22 files changed, 7274 insertions(+)
 create mode 100644 mesalib/src/mesa/math/descrip.mms
 create mode 100644 mesalib/src/mesa/math/m_clip_tmp.h
 create mode 100644 mesalib/src/mesa/math/m_copy_tmp.h
 create mode 100644 mesalib/src/mesa/math/m_debug.h
 create mode 100644 mesalib/src/mesa/math/m_debug_clip.c
 create mode 100644 mesalib/src/mesa/math/m_debug_norm.c
 create mode 100644 mesalib/src/mesa/math/m_debug_util.h
 create mode 100644 mesalib/src/mesa/math/m_debug_xform.c
 create mode 100644 mesalib/src/mesa/math/m_dotprod_tmp.h
 create mode 100644 mesalib/src/mesa/math/m_eval.c
 create mode 100644 mesalib/src/mesa/math/m_eval.h
 create mode 100644 mesalib/src/mesa/math/m_matrix.c
 create mode 100644 mesalib/src/mesa/math/m_matrix.h
 create mode 100644 mesalib/src/mesa/math/m_norm_tmp.h
 create mode 100644 mesalib/src/mesa/math/m_trans_tmp.h
 create mode 100644 mesalib/src/mesa/math/m_translate.c
 create mode 100644 mesalib/src/mesa/math/m_translate.h
 create mode 100644 mesalib/src/mesa/math/m_vector.c
 create mode 100644 mesalib/src/mesa/math/m_vector.h
 create mode 100644 mesalib/src/mesa/math/m_xform.c
 create mode 100644 mesalib/src/mesa/math/m_xform.h
 create mode 100644 mesalib/src/mesa/math/m_xform_tmp.h

(limited to 'mesalib/src/mesa/math')

diff --git a/mesalib/src/mesa/math/descrip.mms b/mesalib/src/mesa/math/descrip.mms
new file mode 100644
index 000000000..3aaa6eb8b
--- /dev/null
+++ b/mesalib/src/mesa/math/descrip.mms
@@ -0,0 +1,47 @@
+# MMS description file (VMS make) for the math part of the Mesa core library
+# contributed by Jouk Jansen  joukj@hrem.nano.tudelft.nl
+# Last revision : 3 October 2007
+
+.first
+	define gl [---.include.gl]
+	define math [-.math]
+	define glapi [-.glapi]
+	define main [-.main]
+
+.include [---]mms-config.
+
+##### MACROS #####
+
+VPATH = RCS
+
+INCDIR = [---.include],[-.main],[-.glapi]
+LIBDIR = [---.lib]
+CFLAGS = /include=($(INCDIR),[])/define=(PTHREADS=1)/name=(as_is,short)/float=ieee/ieee=denorm
+
+SOURCES = m_debug_clip.c m_debug_norm.c m_debug_xform.c m_eval.c m_matrix.c\
+	m_translate.c m_vector.c m_xform.c
+
+OBJECTS = m_debug_clip.obj,m_debug_norm.obj,m_debug_xform.obj,m_eval.obj,\
+	m_matrix.obj,m_translate.obj,m_vector.obj,m_xform.obj
+ 
+##### RULES (none are defined here; only the VERSION macro) #####
+
+VERSION=Mesa V3.4
+
+##### TARGETS #####
+# Make the library: insert the math objects into $(LIBDIR)$(GL_LIB) (GL_LIB is not set here; presumably it comes from mms-config)
+$(LIBDIR)$(GL_LIB) : $(OBJECTS)
+  @ library $(LIBDIR)$(GL_LIB) $(OBJECTS)
+
+clean :
+	purge
+	delete *.obj;*
+
+m_debug_clip.obj : m_debug_clip.c
+m_debug_norm.obj : m_debug_norm.c
+m_debug_xform.obj : m_debug_xform.c
+m_eval.obj : m_eval.c
+m_matrix.obj : m_matrix.c
+m_translate.obj : m_translate.c
+m_vector.obj : m_vector.c
+m_xform.obj : m_xform.c
diff --git a/mesalib/src/mesa/math/m_clip_tmp.h b/mesalib/src/mesa/math/m_clip_tmp.h
new file mode 100644
index 000000000..f3a589be0
--- /dev/null
+++ b/mesalib/src/mesa/math/m_clip_tmp.h
@@ -0,0 +1,243 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  6.2
+ *
+ * Copyright (C) 1999-2004  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * New (3.1) transformation code written by Keith Whitwell.
+ */
+
+
+/* Clip-test 4-component points; perspective-divide those that pass.
+ * KW: a clever asm implementation would nestle integer versions of the
+ * outcode calculation underneath the division.  Gcc won't do this, so the
+ * divide is only performed when the cliptest passes.  This isn't essential,
+ * and an asm implementation needn't replicate that behaviour.
+ *
+ * \param clip_vec vector of incoming clip-space coords
+ * \param proj_vec vector of resultant NDC-space projected coords
+ * \param clipMask resulting array of clip flags (one per point)
+ * \param orMask in/out: OR-accumulates the flags of clipped points
+ * \param andMask in/out: AND of clipped flags, forced to 0 if any point passes
+ * \return proj_vec pointer
+ */
+static GLvector4f * _XFORMAPI TAG(cliptest_points4)( GLvector4f *clip_vec,
+                                                     GLvector4f *proj_vec,
+                                                     GLubyte clipMask[],
+                                                     GLubyte *orMask,
+                                                     GLubyte *andMask )
+{
+   GLfloat (*vProj)[4] = (GLfloat (*)[4])proj_vec->start;
+   const GLfloat *from = (GLfloat *)clip_vec->start;
+   const GLuint stride = clip_vec->stride;
+   const GLuint count = clip_vec->count;
+   GLubyte accumAnd = *andMask;
+   GLubyte accumOr = *orMask;
+   GLuint numClipped = 0;
+   GLuint i;
+   STRIDE_LOOP {
+      const GLfloat cx = from[0];
+      const GLfloat cy = from[1];
+      const GLfloat cz = from[2];
+      const GLfloat cw = from[3];
+#if defined(macintosh) || defined(__powerpc__)
+      /* on powerpc cliptest is 17% faster in this way. */
+      GLuint mask = (((cw < cx) << CLIP_RIGHT_SHIFT));
+      mask |= (((cw < -cx) << CLIP_LEFT_SHIFT));
+      mask |= (((cw < cy) << CLIP_TOP_SHIFT));
+      mask |= (((cw < -cy) << CLIP_BOTTOM_SHIFT));
+      mask |= (((cw < cz) << CLIP_FAR_SHIFT));
+      mask |= (((cw < -cz) << CLIP_NEAR_SHIFT));
+#else /* !defined(macintosh)) */
+      GLubyte mask = 0;
+      if (cw - cx < 0) mask |= CLIP_RIGHT_BIT;
+      if (cw + cx < 0) mask |= CLIP_LEFT_BIT;
+      if (cw - cy < 0) mask |= CLIP_TOP_BIT;
+      if (cw + cy < 0) mask |= CLIP_BOTTOM_BIT;
+      if (cw - cz < 0) mask |= CLIP_FAR_BIT;
+      if (cw + cz < 0) mask |= CLIP_NEAR_BIT;
+#endif /* defined(macintosh) */
+
+      clipMask[i] = mask;
+      if (!mask) {
+         /* no clip bit set: project, keeping 1/w in the w slot */
+         const GLfloat oow = 1.0F / cw;
+         vProj[i][0] = cx * oow;
+         vProj[i][1] = cy * oow;
+         vProj[i][2] = cz * oow;
+         vProj[i][3] = oow;
+      } else {
+         numClipped++;
+         accumAnd &= mask;
+         accumOr |= mask;
+         vProj[i][0] = 0;
+         vProj[i][1] = 0;
+         vProj[i][2] = 0;
+         vProj[i][3] = 1;
+      }
+   }
+
+   *orMask = accumOr;
+   *andMask = (GLubyte) (numClipped < count ? 0 : accumAnd);
+
+   proj_vec->flags |= VEC_SIZE_4;
+   proj_vec->size = 4;
+   proj_vec->count = count;
+   return proj_vec;
+}
+
+
+
+/* Clip-test 4-component points without projecting them.
+ * \param clip_vec vector of incoming clip-space coords
+ * \param proj_vec ignored; kept so the signature matches the projecting variant
+ * \param clipMask resulting array of clip flags (one per point)
+ * \param orMask in/out: OR-accumulates the flags of clipped points
+ * \param andMask in/out: AND of clipped flags, forced to 0 if any point passes
+ * \return clip_vec pointer
+ */
+static GLvector4f * _XFORMAPI TAG(cliptest_np_points4)( GLvector4f *clip_vec,
+							GLvector4f *proj_vec,
+							GLubyte clipMask[],
+							GLubyte *orMask,
+							GLubyte *andMask )
+{
+   const GLfloat *from = (GLfloat *)clip_vec->start;
+   const GLuint stride = clip_vec->stride;
+   const GLuint count = clip_vec->count;
+   GLubyte accumAnd = *andMask;
+   GLubyte accumOr = *orMask;
+   GLuint numClipped = 0;
+   GLuint i;
+   (void) proj_vec;
+   STRIDE_LOOP {
+      const GLfloat cx = from[0];
+      const GLfloat cy = from[1];
+      const GLfloat cz = from[2];
+      const GLfloat cw = from[3];
+#if defined(macintosh) || defined(__powerpc__)
+      /* on powerpc cliptest is 17% faster in this way. */
+      GLuint mask = (((cw < cx) << CLIP_RIGHT_SHIFT));
+      mask |= (((cw < -cx) << CLIP_LEFT_SHIFT));
+      mask |= (((cw < cy) << CLIP_TOP_SHIFT));
+      mask |= (((cw < -cy) << CLIP_BOTTOM_SHIFT));
+      mask |= (((cw < cz) << CLIP_FAR_SHIFT));
+      mask |= (((cw < -cz) << CLIP_NEAR_SHIFT));
+#else /* !defined(macintosh)) */
+      GLubyte mask = 0;
+      if (cw - cx < 0) mask |= CLIP_RIGHT_BIT;
+      if (cw + cx < 0) mask |= CLIP_LEFT_BIT;
+      if (cw - cy < 0) mask |= CLIP_TOP_BIT;
+      if (cw + cy < 0) mask |= CLIP_BOTTOM_BIT;
+      if (cw - cz < 0) mask |= CLIP_FAR_BIT;
+      if (cw + cz < 0) mask |= CLIP_NEAR_BIT;
+#endif /* defined(macintosh) */
+
+      clipMask[i] = mask;
+      /* only clipped points contribute to the accumulated masks */
+      if (mask) {
+         numClipped++;
+         accumAnd &= mask;
+         accumOr |= mask;
+      }
+   }
+
+   *orMask = accumOr;
+   *andMask = (GLubyte) (numClipped < count ? 0 : accumAnd);
+   return clip_vec;
+}
+
+
+/* Clip-test 3-component points against the [-1,1] cube; proj_vec is unused. */
+static GLvector4f * _XFORMAPI TAG(cliptest_points3)( GLvector4f *clip_vec,
+                                                     GLvector4f *proj_vec,
+                                                     GLubyte clipMask[],
+                                                     GLubyte *orMask,
+                                                     GLubyte *andMask )
+{
+   const GLfloat *from = (GLfloat *)clip_vec->start;
+   const GLuint stride = clip_vec->stride;
+   const GLuint count = clip_vec->count;
+   GLubyte accumOr = *orMask, accumAnd = *andMask;
+   GLuint i;
+   (void) proj_vec;
+   STRIDE_LOOP {
+      const GLfloat cx = from[0], cy = from[1], cz = from[2];
+      GLubyte mask = 0;
+      if (cx < -1.0)       mask |= CLIP_LEFT_BIT;
+      else if (cx >  1.0)  mask |= CLIP_RIGHT_BIT;
+      if (cy < -1.0)       mask |= CLIP_BOTTOM_BIT;
+      else if (cy >  1.0)  mask |= CLIP_TOP_BIT;
+      if (cz < -1.0)       mask |= CLIP_NEAR_BIT;
+      else if (cz >  1.0)  mask |= CLIP_FAR_BIT;
+      clipMask[i] = mask;
+      accumAnd &= mask;
+      accumOr |= mask;
+   }
+
+   *orMask = accumOr;
+   *andMask = accumAnd;
+   return clip_vec;
+}
+
+
+/* Clip-test 2-component points against the [-1,1] square; proj_vec is unused. */
+static GLvector4f * _XFORMAPI TAG(cliptest_points2)( GLvector4f *clip_vec,
+                                                     GLvector4f *proj_vec,
+                                                     GLubyte clipMask[],
+                                                     GLubyte *orMask,
+                                                     GLubyte *andMask )
+{
+   const GLfloat *from = (GLfloat *)clip_vec->start;
+   const GLuint stride = clip_vec->stride;
+   const GLuint count = clip_vec->count;
+   GLubyte accumOr = *orMask, accumAnd = *andMask;
+   GLuint i;
+   (void) proj_vec;
+   STRIDE_LOOP {
+      const GLfloat cx = from[0], cy = from[1];
+      GLubyte mask = 0;
+      if (cx < -1.0)       mask |= CLIP_LEFT_BIT;
+      else if (cx >  1.0)  mask |= CLIP_RIGHT_BIT;
+      if (cy < -1.0)       mask |= CLIP_BOTTOM_BIT;
+      else if (cy >  1.0)  mask |= CLIP_TOP_BIT;
+      clipMask[i] = mask;
+      accumAnd &= mask;
+      accumOr |= mask;
+   }
+
+   *orMask = accumOr;
+   *andMask = accumAnd;
+   return clip_vec;
+}
+
+
+static void TAG(init_c_cliptest)( void )
+{
+   /* Sizes 2 and 3 share one routine in both tables; only size 4 differs. */
+   _mesa_clip_tab[2] = TAG(cliptest_points2);
+   _mesa_clip_np_tab[2] = TAG(cliptest_points2);
+   _mesa_clip_tab[3] = TAG(cliptest_points3);
+   _mesa_clip_np_tab[3] = TAG(cliptest_points3);
+   _mesa_clip_tab[4] = TAG(cliptest_points4);
+   _mesa_clip_np_tab[4] = TAG(cliptest_np_points4);
+}
diff --git a/mesalib/src/mesa/math/m_copy_tmp.h b/mesalib/src/mesa/math/m_copy_tmp.h
new file mode 100644
index 000000000..07ab1f7b2
--- /dev/null
+++ b/mesalib/src/mesa/math/m_copy_tmp.h
@@ -0,0 +1,86 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * New (3.1) transformation code written by Keith Whitwell.
+ */
+
+
+/* Copy component k of each of to->count elements (read with f's stride) iff bit k of BITS is set. */
+#define COPY_FUNC( BITS )						\
+static void TAG2(copy, BITS)( GLvector4f *to, const GLvector4f *f )	\
+{									\
+   GLfloat (*dst)[4] = (GLfloat (*)[4])to->start;			\
+   const GLuint stride = f->stride;					\
+   const GLuint count = to->count;					\
+   GLfloat *from = f->start;						\
+   GLuint i;								\
+   if (BITS)								\
+      STRIDE_LOOP {							\
+	 if (BITS&1) dst[i][0] = from[0];				\
+	 if (BITS&2) dst[i][1] = from[1];				\
+	 if (BITS&4) dst[i][2] = from[2];				\
+	 if (BITS&8) dst[i][3] = from[3];				\
+      }									\
+}
+
+/* Instantiate one copy function per 4-bit component mask, 0x0 through 0xf:
+ */
+COPY_FUNC( 0x0 )		/* noop: "if (BITS)" is false, nothing is copied */
+COPY_FUNC( 0x1 )
+COPY_FUNC( 0x2 )
+COPY_FUNC( 0x3 )
+COPY_FUNC( 0x4 )
+COPY_FUNC( 0x5 )
+COPY_FUNC( 0x6 )
+COPY_FUNC( 0x7 )
+COPY_FUNC( 0x8 )
+COPY_FUNC( 0x9 )
+COPY_FUNC( 0xa )
+COPY_FUNC( 0xb )
+COPY_FUNC( 0xc )
+COPY_FUNC( 0xd )
+COPY_FUNC( 0xe )
+COPY_FUNC( 0xf )
+/* Install the sixteen copy variants in _mesa_copy_tab, indexed by component mask. */
+static void TAG2(init_copy, 0)( void )
+{
+   _mesa_copy_tab[0x0] = TAG2(copy, 0x0);
+   _mesa_copy_tab[0x1] = TAG2(copy, 0x1);
+   _mesa_copy_tab[0x2] = TAG2(copy, 0x2);
+   _mesa_copy_tab[0x3] = TAG2(copy, 0x3);
+   _mesa_copy_tab[0x4] = TAG2(copy, 0x4);
+   _mesa_copy_tab[0x5] = TAG2(copy, 0x5);
+   _mesa_copy_tab[0x6] = TAG2(copy, 0x6);
+   _mesa_copy_tab[0x7] = TAG2(copy, 0x7);
+   _mesa_copy_tab[0x8] = TAG2(copy, 0x8);
+   _mesa_copy_tab[0x9] = TAG2(copy, 0x9);
+   _mesa_copy_tab[0xa] = TAG2(copy, 0xa);
+   _mesa_copy_tab[0xb] = TAG2(copy, 0xb);
+   _mesa_copy_tab[0xc] = TAG2(copy, 0xc);
+   _mesa_copy_tab[0xd] = TAG2(copy, 0xd);
+   _mesa_copy_tab[0xe] = TAG2(copy, 0xe);
+   _mesa_copy_tab[0xf] = TAG2(copy, 0xf);
+}
diff --git a/mesalib/src/mesa/math/m_debug.h b/mesalib/src/mesa/math/m_debug.h
new file mode 100644
index 000000000..6476b6de2
--- /dev/null
+++ b/mesalib/src/mesa/math/m_debug.h
@@ -0,0 +1,42 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Gareth Hughes
+ */
+
+#ifndef __M_DEBUG_H__
+#define __M_DEBUG_H__
+/* Self-test entry points.  The cliptest one echoes 'description' in its messages; the others presumably do too. */
+extern void _math_test_all_transform_functions( char *description );
+extern void _math_test_all_normal_transform_functions( char *description );
+extern void _math_test_all_cliptest_functions( char *description );
+
+/* Deprecated?
+ */
+extern void _math_test_all_vertex_functions( char *description );
+/* Assigned from the MESA_PROFILE environment variable in m_debug_clip.c; non-NULL selects the benchmark path. */
+extern char *mesa_profile;
+
+#endif /* __M_DEBUG_H__ */
diff --git a/mesalib/src/mesa/math/m_debug_clip.c b/mesalib/src/mesa/math/m_debug_clip.c
new file mode 100644
index 000000000..460fed4a7
--- /dev/null
+++ b/mesalib/src/mesa/math/m_debug_clip.c
@@ -0,0 +1,371 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  6.1
+ *
+ * Copyright (C) 1999-2005  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Gareth Hughes
+ */
+
+#include "main/glheader.h"
+#include "main/context.h"
+#include "main/macros.h"
+#include "main/imports.h"
+
+#include "m_matrix.h"
+#include "m_xform.h"
+
+#include "m_debug.h"
+#include "m_debug_util.h"
+
+#ifdef __UNIXOS2__
+/* The linker doesn't like empty files; everything else here is compiled only with DEBUG_MATH */
+static char dummy;
+#endif
+
+#ifdef DEBUG_MATH  /* This code only used for debugging */
+
+static clip_func *clip_tab[2] = {     /* tables under test, indexed by np */
+   _mesa_clip_tab,                    /* np == 0 */
+   _mesa_clip_np_tab                  /* np == 1 */
+};
+static char *cnames[2] = {            /* table names used in failure messages */
+   "_mesa_clip_tab",
+   "_mesa_clip_np_tab"
+};
+#ifdef RUN_DEBUG_BENCHMARK
+static char *cstrings[2] = {          /* row labels for the benchmark listing */
+   "clip, perspective divide",
+   "clip, no divide"
+};
+#endif
+
+
+/* =============================================================
+ * Reference cliptests
+ */
+
+/* Plain-C reference for the 4-component cliptest with perspective divide;
+ * the clip_func variants under test are compared against its results. */
+static GLvector4f *ref_cliptest_points4( GLvector4f *clip_vec,
+					 GLvector4f *proj_vec,
+					 GLubyte clipMask[],
+					 GLubyte *orMask,
+					 GLubyte *andMask )
+{
+   GLfloat (*vProj)[4] = (GLfloat (*)[4])proj_vec->start;
+   const GLfloat *src = (GLfloat *)clip_vec->start;
+   const GLuint stride = clip_vec->stride;
+   const GLuint count = clip_vec->count;
+   GLubyte accumAnd = *andMask;
+   GLubyte accumOr = *orMask;
+   GLuint nClipped = 0;
+   GLuint i;
+   for ( i = 0 ; i < count ; i++, STRIDE_F(src, stride) ) {
+      const GLfloat cx = src[0], cy = src[1];
+      const GLfloat cz = src[2], cw = src[3];
+      GLubyte mask = 0;
+      if ( -cx + cw < 0 ) { mask |= CLIP_RIGHT_BIT; }
+      if (  cx + cw < 0 ) { mask |= CLIP_LEFT_BIT; }
+      if ( -cy + cw < 0 ) { mask |= CLIP_TOP_BIT; }
+      if (  cy + cw < 0 ) { mask |= CLIP_BOTTOM_BIT; }
+      if ( -cz + cw < 0 ) { mask |= CLIP_FAR_BIT; }
+      if (  cz + cw < 0 ) { mask |= CLIP_NEAR_BIT; }
+      clipMask[i] = mask;
+      if ( !mask ) {
+         const GLfloat oow = 1.0F / cw;
+         vProj[i][0] = cx * oow;
+         vProj[i][1] = cy * oow;
+         vProj[i][2] = cz * oow;
+         vProj[i][3] = oow;
+      } else {
+         nClipped++;
+         accumAnd &= mask;
+         accumOr |= mask;
+         vProj[i][0] = 0;
+         vProj[i][1] = 0;
+         vProj[i][2] = 0;
+         vProj[i][3] = 1;
+      }
+   }
+
+   *orMask = accumOr;
+   *andMask = (GLubyte) (nClipped < count ? 0 : accumAnd);
+
+   proj_vec->flags |= VEC_SIZE_4;
+   proj_vec->size = 4;
+   proj_vec->count = count;
+   return proj_vec;
+}
+
+/* Reference cliptest for 3-component points: flags coords outside [-1,1].
+ * Reached through ref_cliptest[3]; proj_vec is ignored. */
+static GLvector4f *ref_cliptest_points3( GLvector4f *clip_vec,
+					 GLvector4f *proj_vec,
+					 GLubyte clipMask[],
+					 GLubyte *orMask,
+					 GLubyte *andMask )
+{
+   const GLfloat *src = (GLfloat *)clip_vec->start;
+   const GLuint stride = clip_vec->stride;
+   const GLuint count = clip_vec->count;
+   GLubyte accumOr = *orMask;
+   GLubyte accumAnd = *andMask;
+   GLuint i;
+   (void) proj_vec;
+   for ( i = 0 ; i < count ; i++, STRIDE_F(src, stride) ) {
+      const GLfloat cx = src[0], cy = src[1], cz = src[2];
+      GLubyte mask = 0;
+      if ( cx < -1.0 )		mask |= CLIP_LEFT_BIT;
+      else if ( cx >  1.0 )	mask |= CLIP_RIGHT_BIT;
+      if ( cy < -1.0 )		mask |= CLIP_BOTTOM_BIT;
+      else if ( cy >  1.0 )	mask |= CLIP_TOP_BIT;
+      if ( cz < -1.0 )		mask |= CLIP_NEAR_BIT;
+      else if ( cz >  1.0 )	mask |= CLIP_FAR_BIT;
+      clipMask[i] = mask;
+      accumAnd &= mask;
+      accumOr |= mask;
+   }
+
+   *orMask = accumOr;
+   *andMask = accumAnd;
+   return clip_vec;
+}
+
+/* Reference cliptest for 2-component points (x and y only); proj_vec is ignored. */
+static GLvector4f * ref_cliptest_points2( GLvector4f *clip_vec,
+					  GLvector4f *proj_vec,
+					  GLubyte clipMask[],
+					  GLubyte *orMask,
+					  GLubyte *andMask )
+{
+   const GLfloat *src = (GLfloat *)clip_vec->start;
+   const GLuint stride = clip_vec->stride;
+   const GLuint count = clip_vec->count;
+   GLubyte accumOr = *orMask, accumAnd = *andMask;
+   GLuint i;
+   (void) proj_vec;
+   for ( i = 0 ; i < count ; i++, STRIDE_F(src, stride) ) {
+      const GLfloat cx = src[0], cy = src[1];
+      GLubyte mask = 0;
+      if ( cx < -1.0 )		mask |= CLIP_LEFT_BIT;
+      else if ( cx >  1.0 )	mask |= CLIP_RIGHT_BIT;
+      if ( cy < -1.0 )		mask |= CLIP_BOTTOM_BIT;
+      else if ( cy >  1.0 )	mask |= CLIP_TOP_BIT;
+      clipMask[i] = mask;
+      accumAnd &= mask;
+      accumOr |= mask;
+   }
+
+   *orMask = accumOr;
+   *andMask = accumAnd;
+   return clip_vec;
+}
+
+static clip_func ref_cliptest[5] = {   /* indexed by point size (psize) */
+   0,                                  /* size 0: no reference routine */
+   0,                                  /* size 1: no reference routine */
+   ref_cliptest_points2,
+   ref_cliptest_points3,
+   ref_cliptest_points4
+};
+
+
+/* =============================================================
+ * Cliptest tests: s = source points, d = tested output, r = reference output
+ */
+
+ALIGN16(static GLfloat, s[TEST_COUNT][4]);
+ALIGN16(static GLfloat, d[TEST_COUNT][4]);
+ALIGN16(static GLfloat, r[TEST_COUNT][4]);
+
+
+/* Run one clip function on rnd() points and compare it with the reference.
+ *
+ * \param func   clip function under test
+ * \param np     non-zero for the no-divide table; skips the coordinate check
+ * \param psize  number of leading components filled from rnd(), 2..4
+ * \param cycles benchmark counter handed to BEGIN_RACE/END_RACE
+ * \return 1 if the masks (and, for psize 4 with divide, the coords) match, else 0
+ */
+static int test_cliptest_function( clip_func func, int np,
+				   int psize, long *cycles )
+{
+   GLvector4f source[1], dest[1], ref[1];
+   GLubyte dm[TEST_COUNT], dco, dca;
+   GLubyte rm[TEST_COUNT], rco, rca;
+   int i, j, k;
+#ifdef  RUN_DEBUG_BENCHMARK
+   int cycle_i;                /* the counter for the benchmarks we run */
+#endif
+
+   (void) cycles;   /* may be unused, depending on how BEGIN_RACE/END_RACE expand */
+   /* ref_cliptest[] has routines for sizes 2..4 only; reject everything else. */
+   if ( psize < 2 || psize > 4 ) {
+      _mesa_problem( NULL, "test_cliptest_function called with psize outside 2..4\n" );
+      return 0;
+   }
+
+   for ( i = 0 ; i < TEST_COUNT ; i++) {
+      ASSIGN_4V( d[i], 0.0, 0.0, 0.0, 1.0 );
+      ASSIGN_4V( s[i], 0.0, 0.0, 0.0, 1.0 );
+      for ( j = 0 ; j < psize ; j++ )
+         s[i][j] = rnd();
+   }
+
+   source->data = (GLfloat(*)[4])s;
+   source->start = (GLfloat *)s;
+   source->count = TEST_COUNT;
+   source->stride = sizeof(s[0]);
+   source->size = 4;
+   source->flags = 0;
+
+   dest->data = (GLfloat(*)[4])d;
+   dest->start = (GLfloat *)d;
+   dest->count = TEST_COUNT;
+   dest->stride = sizeof(float[4]);
+   dest->size = 0;
+   dest->flags = 0;
+
+   ref->data = (GLfloat(*)[4])r;
+   ref->start = (GLfloat *)r;
+   ref->count = TEST_COUNT;
+   ref->stride = sizeof(float[4]);
+   ref->size = 0;
+   ref->flags = 0;
+
+   dco = rco = 0;
+   dca = rca = CLIP_FRUSTUM_BITS;
+
+   ref_cliptest[psize]( source, ref, rm, &rco, &rca );
+
+   if ( mesa_profile ) {
+      BEGIN_RACE( *cycles );
+      func( source, dest, dm, &dco, &dca );
+      END_RACE( *cycles );
+   }
+   else {
+      func( source, dest, dm, &dco, &dca );
+   }
+
+   if ( dco != rco ) {
+      _mesa_printf( "\n-----------------------------\n" );
+      _mesa_printf( "dco = 0x%02x   rco = 0x%02x\n", dco, rco );
+      return 0;
+   }
+   if ( dca != rca ) {
+      _mesa_printf( "\n-----------------------------\n" );
+      _mesa_printf( "dca = 0x%02x   rca = 0x%02x\n", dca, rca );
+      return 0;
+   }
+   for ( i = 0 ; i < TEST_COUNT ; i++ ) {
+      if ( dm[i] != rm[i] ) {
+	 _mesa_printf( "\n-----------------------------\n" );
+	 _mesa_printf( "(i = %i)\n", i );
+	 _mesa_printf( "dm = 0x%02x   rm = 0x%02x\n", dm[i], rm[i] );
+	 return 0;
+      }
+   }
+
+   /* Only verify output on projected points4 case.  FIXME: Do we need
+    * to test other cases?
+    */
+   if ( np || psize < 4 )
+      return 1;
+
+   for ( i = 0 ; i < TEST_COUNT ; i++ ) {
+      for ( j = 0 ; j < 4 ; j++ ) {
+         if ( significand_match( d[i][j], r[i][j] ) < REQUIRED_PRECISION ) {
+            _mesa_printf( "\n-----------------------------\n" );
+            _mesa_printf( "(i = %i, j = %i)  dm = 0x%02x   rm = 0x%02x\n",
+		    i, j, dm[i], rm[i] );
+            for ( k = 0 ; k < 4 ; k++ )   /* dump all four components of the bad point */
+               _mesa_printf( "%f \t %f \t [diff = %e - %i bit missed]\n",
+		    d[i][k], r[i][k], r[i][k]-d[i][k],
+		    MAX_PRECISION - significand_match( d[i][k], r[i][k] ) );
+            return 0;
+         }
+      }
+   }
+
+   return 1;
+}
+
+/* Test every routine in _mesa_clip_tab and _mesa_clip_np_tab (point sizes
+ * 2..4) against the reference cliptests and report mismatches.
+ *
+ * \param description label for the function set; echoed in the messages
+ */
+void _math_test_all_cliptest_functions( char *description )
+{
+   int np, psize;
+   long benchmark_tab[2][4];
+   static int first_time = 1;
+
+   if ( first_time ) {
+      first_time = 0;
+      mesa_profile = _mesa_getenv( "MESA_PROFILE" );
+   }
+
+#ifdef RUN_DEBUG_BENCHMARK
+   if ( mesa_profile ) {
+      if ( !counter_overhead ) {
+	 INIT_COUNTER();
+	 _mesa_printf( "counter overhead: %ld cycles\n\n", counter_overhead );
+      }
+      _mesa_printf( "cliptest results after hooking in %s functions:\n", description );
+      _mesa_printf( "\n\t" );
+      for ( psize = 2 ; psize <= 4 ; psize++ ) {
+	 _mesa_printf( " p%d\t", psize );
+      }
+      _mesa_printf( "\n--------------------------------------------------------\n\t" );
+   }
+#endif
+
+   for ( np = 0 ; np < 2 ; np++ ) {
+      for ( psize = 2 ; psize <= 4 ; psize++ ) {
+	 clip_func func = clip_tab[np][psize];
+	 long *cycles = &(benchmark_tab[np][psize-1]);
+
+	 if ( test_cliptest_function( func, np, psize, cycles ) == 0 ) {
+	    /* Format directly: no fixed-size buffer to overflow, and the
+	     * caller-supplied description is never used as a format string. */
+	    _mesa_problem( NULL, "%s[%d] failed test (%s)",
+			   cnames[np], psize, description );
+	 }
+#ifdef RUN_DEBUG_BENCHMARK
+	 if ( mesa_profile )
+	    _mesa_printf( " %li\t", benchmark_tab[np][psize-1] );
+#endif
+      }
+#ifdef RUN_DEBUG_BENCHMARK
+      if ( mesa_profile )
+	 _mesa_printf( " | [%s]\n\t", cstrings[np] );
+#endif
+   }
+#ifdef RUN_DEBUG_BENCHMARK
+   if ( mesa_profile )
+      _mesa_printf( "\n" );
+#endif
+}
+
+
+#endif /* DEBUG_MATH */
diff --git a/mesalib/src/mesa/math/m_debug_norm.c b/mesalib/src/mesa/math/m_debug_norm.c
new file mode 100644
index 000000000..89c632e7d
--- /dev/null
+++ b/mesalib/src/mesa/math/m_debug_norm.c
@@ -0,0 +1,383 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  5.1
+ *
+ * Copyright (C) 1999-2003  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Gareth Hughes
+ */
+
+#include "main/glheader.h"
+#include "main/context.h"
+#include "main/macros.h"
+#include "main/imports.h"
+
+#include "m_matrix.h"
+#include "m_xform.h"
+
+#include "m_debug.h"
+#include "m_debug_util.h"
+
+
+#ifdef __UNIXOS2__
+/* The linker doesn't like empty files */
+static char dummy;
+#endif
+
+#ifdef DEBUG_MATH  /* This code only used for debugging */
+
+
+static int m_norm_identity[16] = {   /* ONE on the first three diagonal slots */
+   ONE, NIL, NIL, NIL,
+   NIL, ONE, NIL, NIL,
+   NIL, NIL, ONE, NIL,
+   NIL, NIL, NIL, NIL
+};
+static int m_norm_general[16] = {    /* upper-left 3x3 is VAR, the rest NIL */
+   VAR, VAR, VAR, NIL,
+   VAR, VAR, VAR, NIL,
+   VAR, VAR, VAR, NIL,
+   NIL, NIL, NIL, NIL
+};
+static int m_norm_no_rot[16] = {     /* VAR on the first three diagonal slots */
+   VAR, NIL, NIL, NIL,
+   NIL, VAR, NIL, NIL,
+   NIL, NIL, VAR, NIL,
+   NIL, NIL, NIL, NIL
+};
+static int *norm_templates[8] = {    /* matrix template per test case 0..7 */
+   m_norm_no_rot,
+   m_norm_no_rot,
+   m_norm_no_rot,
+   m_norm_general,
+   m_norm_general,
+   m_norm_general,
+   m_norm_identity,
+   m_norm_identity
+};
+static int norm_types[8] = {         /* _mesa_normal_tab[] index per test case */
+   NORM_TRANSFORM_NO_ROT,
+   NORM_TRANSFORM_NO_ROT | NORM_RESCALE,
+   NORM_TRANSFORM_NO_ROT | NORM_NORMALIZE,
+   NORM_TRANSFORM,
+   NORM_TRANSFORM | NORM_RESCALE,
+   NORM_TRANSFORM | NORM_NORMALIZE,
+   NORM_RESCALE,
+   NORM_NORMALIZE
+};
+static int norm_scale_types[8] = {               /*  rescale factor          */
+   NIL,                                          /*  NIL disables rescaling  */
+   VAR,
+   NIL,
+   NIL,
+   VAR,
+   NIL,
+   VAR,
+   NIL
+};
+static int norm_normalize_types[8] = {           /*  normalizing ?? (no = 0) */
+   0,
+   0,
+   1,
+   0,
+   0,
+   1,
+   0,
+   1
+};
+static char *norm_strings[8] = {     /* printable name of each norm_types[] entry */
+   "NORM_TRANSFORM_NO_ROT",
+   "NORM_TRANSFORM_NO_ROT | NORM_RESCALE",
+   "NORM_TRANSFORM_NO_ROT | NORM_NORMALIZE",
+   "NORM_TRANSFORM",
+   "NORM_TRANSFORM | NORM_RESCALE",
+   "NORM_TRANSFORM | NORM_NORMALIZE",
+   "NORM_RESCALE",
+   "NORM_NORMALIZE"
+};
+
+
+/* =============================================================
+ * Reference transformations
+ */
+
+/* Reference path: out[i] = scale * (normal i transformed by mat->inv).
+ * Input is stepped by in->stride bytes; 'lengths' is unused here. */
+static void ref_norm_transform_rescale( const GLmatrix *mat,
+                                        GLfloat scale,
+                                        const GLvector4f *in,
+                                        const GLfloat *lengths,
+                                        GLvector4f *dest )
+{
+   GLfloat (*result)[4] = (GLfloat (*)[4]) dest->start;
+   const GLfloat *inverse = mat->inv;
+   const char *cursor = (const char *) in->start;
+   GLuint n;
+
+   (void) lengths;
+
+   for ( n = 0 ; n < in->count ; n++, cursor += in->stride ) {
+      const GLfloat *normal = (const GLfloat *) cursor;
+      GLfloat xformed[3];
+      TRANSFORM_NORMAL( xformed, normal, inverse );
+      SCALE_SCALAR_3V( result[n], scale, xformed );
+   }
+}
+
+/* Reference path: transform each normal by mat->inv, then scale it to
+ * unit length (lengths == NULL) or by lengths[i].  'scale' is not read. */
+static void ref_norm_transform_normalize( const GLmatrix *mat,
+                                          GLfloat scale,
+                                          const GLvector4f *in,
+                                          const GLfloat *lengths,
+                                          GLvector4f *dest )
+{
+   GLfloat (*result)[4] = (GLfloat (*)[4]) dest->start;
+   const GLfloat *inverse = mat->inv;
+   const char *cursor = (const char *) in->start;
+   GLuint n;
+
+   (void) scale;   /* the factor is recomputed for every normal */
+
+   for ( n = 0 ; n < in->count ; n++, cursor += in->stride ) {
+      const GLfloat *normal = (const GLfloat *) cursor;
+      GLfloat xformed[3], factor;
+
+      TRANSFORM_NORMAL( xformed, normal, inverse );
+      if ( lengths ) {
+         /* Precalculated case: lengths[i] is applied as the factor. */
+         factor = lengths[n];
+      } else {
+         GLfloat len = LEN_SQUARED_3FV( xformed );
+         if ( !(len > 1e-20) ) {
+            /* Too short to normalize: emit a zero vector instead. */
+            result[n][0] = result[n][1] = result[n][2] = 0;
+            continue;
+         }
+         factor = 1.0 / SQRTF( len );
+      }
+      SCALE_SCALAR_3V( result[n], factor, xformed );
+   }
+}
+
+
+/* =============================================================
+ * Normal transformation tests
+ */
+
+/* Load the sixteen fixed test values into m[0]..m[15]. */
+static void init_matrix( GLfloat *m )
+{
+   static const GLfloat seed[16] = { 63, 55, 44, 11,  43, 17, 9, 23,  29, 31, 7, 91,  43, 7, 3, 9 };
+   int k;
+   for ( k = 0 ; k < 16 ; k++ ) m[k] = seed[k];
+}
+
+
+/* Point a GLvector4f at TEST_COUNT elements of caller-owned storage. */
+static void init_test_vector( GLvector4f *v, GLfloat *start, GLuint stride )
+{
+   v->data = (GLfloat (*)[4]) start;
+   v->start = start;
+   v->count = TEST_COUNT;
+   v->stride = stride;
+   v->flags = 0;
+}
+
+/* Shared failure report: the banner, the failing (i, j) and, for each of
+ * the three components, computed value, reference, ratio and missed bits.
+ */
+static void report_norm_mismatch( const char *banner, int i, int j,
+                                  const GLfloat *got, const GLfloat *want )
+{
+   int k;
+
+   _mesa_printf( "%s", banner );
+   _mesa_printf( "(i = %i, j = %i)\n", i, j );
+   for ( k = 0 ; k < 3 ; k++ ) {
+      _mesa_printf( "%f \t %f \t [ratio = %e - %i bit missed]\n",
+                    got[k], want[k], want[k]/got[k],
+                    MAX_PRECISION - significand_match( got[k], want[k] ) );
+   }
+}
+
+/* Check one normal-transform function against the reference code.
+ *
+ * func   - function under test
+ * mtype  - index into norm_templates[], norm_scale_types[] and
+ *          norm_normalize_types[] selecting the test case
+ * cycles - passed to BEGIN_RACE/END_RACE when mesa_profile is set
+ *
+ * Returns 1 when every compared component matches the reference to at
+ * least REQUIRED_PRECISION bits; otherwise prints the first mismatch and
+ * returns 0.  The temporary matrix is freed on both paths.
+ */
+static int test_norm_function( normal_func func, int mtype, long *cycles )
+{
+   GLvector4f source[1], dest[1], dest2[1], ref[1], ref2[1];
+   GLmatrix mat[1];
+   GLfloat s[TEST_COUNT][5], d[TEST_COUNT][4], r[TEST_COUNT][4];
+   GLfloat d2[TEST_COUNT][4], r2[TEST_COUNT][4], length[TEST_COUNT];
+   GLfloat scale;
+   GLfloat *m;
+   int i, j;
+   int ok = 1;
+#ifdef  RUN_DEBUG_BENCHMARK
+   int cycle_i;		/* the counter for the benchmarks we run */
+#endif
+
+   (void) cycles;
+
+   mat->m = (GLfloat *) ALIGN_MALLOC( 16 * sizeof(GLfloat), 16 );
+   mat->inv = m = mat->m;
+
+   init_matrix( m );
+
+   scale = 1.0F + rnd () * norm_scale_types[mtype];
+
+   /* Template entry [i * 4 + j] governs m[j * 4 + i]; VAR keeps the
+    * value that init_matrix() stored there. */
+   for ( i = 0 ; i < 4 ; i++ ) {
+      for ( j = 0 ; j < 4 ; j++ ) {
+         switch ( norm_templates[mtype][i * 4 + j] ) {
+         case NIL:
+            m[j * 4 + i] = 0.0;
+            break;
+         case ONE:
+            m[j * 4 + i] = 1.0;
+            break;
+         case NEG:
+            m[j * 4 + i] = -1.0;
+            break;
+         case VAR:
+            break;
+         default:
+            _mesa_exit(1);
+         }
+      }
+   }
+
+   for ( i = 0 ; i < TEST_COUNT ; i++ ) {
+      ASSIGN_3V( d[i],  0.0, 0.0, 0.0 );
+      ASSIGN_3V( s[i],  0.0, 0.0, 0.0 );
+      ASSIGN_3V( d2[i], 0.0, 0.0, 0.0 );
+      for ( j = 0 ; j < 3 ; j++ )
+         s[i][j] = rnd();
+      length[i] = 1 / SQRTF( LEN_SQUARED_3FV( s[i] ) );
+   }
+
+   /* Source rows are 5 floats wide; all other vectors use packed rows. */
+   init_test_vector( source, (GLfloat *) s, sizeof(s[0]) );
+   init_test_vector( dest, (GLfloat *) d, sizeof(float[4]) );
+   init_test_vector( dest2, (GLfloat *) d2, sizeof(float[4]) );
+   init_test_vector( ref, (GLfloat *) r, sizeof(float[4]) );
+   init_test_vector( ref2, (GLfloat *) r2, sizeof(float[4]) );
+
+   if ( norm_normalize_types[mtype] == 0 ) {
+      ref_norm_transform_rescale( mat, scale, source, NULL, ref );
+   } else {
+      ref_norm_transform_normalize( mat, scale, source, NULL, ref );
+      ref_norm_transform_normalize( mat, scale, source, length, ref2 );
+   }
+
+   if ( mesa_profile ) {
+      BEGIN_RACE( *cycles );
+      func( mat, scale, source, NULL, dest );
+      END_RACE( *cycles );
+      func( mat, scale, source, length, dest2 );
+   } else {
+      func( mat, scale, source, NULL, dest );
+      func( mat, scale, source, length, dest2 );
+   }
+
+   /* Stop at the first mismatch, but fall through to the cleanup below
+    * instead of returning with the matrix still allocated. */
+   for ( i = 0 ; ok && i < TEST_COUNT ; i++ ) {
+      for ( j = 0 ; ok && j < 3 ; j++ ) {
+         if ( significand_match( d[i][j], r[i][j] ) < REQUIRED_PRECISION ) {
+            report_norm_mismatch( "-----------------------------\n",
+                                  i, j, d[i], r[i] );
+            ok = 0;
+         }
+         else if ( norm_normalize_types[mtype] != 0 &&
+                   significand_match( d2[i][j], r2[i][j] ) < REQUIRED_PRECISION ) {
+            report_norm_mismatch( "------------------- precalculated length case ------\n",
+                                  i, j, d2[i], r2[i] );
+            ok = 0;
+         }
+      }
+   }
+
+   ALIGN_FREE( mat->m );
+   return ok;
+}
+
+/* Test every _mesa_normal_tab[] entry listed in norm_types[] against the
+ * reference code and report failures through _mesa_problem().  Cycle
+ * counts are printed only with RUN_DEBUG_BENCHMARK and MESA_PROFILE set.
+ */
+void _math_test_all_normal_transform_functions( char *description )
+{
+   int mtype;
+   long benchmark_tab[0xf];
+   static int first_time = 1;
+
+   if ( first_time ) {
+      first_time = 0;
+      mesa_profile = _mesa_getenv( "MESA_PROFILE" );
+   }
+
+#ifdef RUN_DEBUG_BENCHMARK
+   if ( mesa_profile ) {
+      if ( !counter_overhead ) {
+         INIT_COUNTER();
+         _mesa_printf( "counter overhead: %ld cycles\n\n", counter_overhead );
+      }
+      _mesa_printf( "normal transform results after hooking in %s functions:\n",
+                    description );
+      _mesa_printf( "\n-------------------------------------------------------\n" );
+   }
+#endif
+
+   for ( mtype = 0 ; mtype < 8 ; mtype++ ) {
+      normal_func func = _mesa_normal_tab[norm_types[mtype]];
+      long *cycles = &benchmark_tab[mtype];
+
+      if ( test_norm_function( func, mtype, cycles ) == 0 ) {
+         /* Format inside _mesa_problem(): no fixed-size buffer to overflow. */
+         _mesa_problem( NULL, "_mesa_normal_tab[0][%s] failed test (%s)",
+                        norm_strings[mtype], description );
+      }
+
+#ifdef RUN_DEBUG_BENCHMARK
+      if ( mesa_profile )
+         _mesa_printf( " %li\t | [%s]\n", benchmark_tab[mtype], norm_strings[mtype] );
+#endif
+   }
+#ifdef RUN_DEBUG_BENCHMARK
+   if ( mesa_profile )
+      _mesa_printf( "\n" );
+#endif
+}
+
+
+#endif /* DEBUG_MATH */
diff --git a/mesalib/src/mesa/math/m_debug_util.h b/mesalib/src/mesa/math/m_debug_util.h
new file mode 100644
index 000000000..2e67db8e5
--- /dev/null
+++ b/mesalib/src/mesa/math/m_debug_util.h
@@ -0,0 +1,320 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  6.1
+ *
+ * Copyright (C) 1999-2004  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Gareth Hughes
+ */
+
+#ifndef __M_DEBUG_UTIL_H__
+#define __M_DEBUG_UTIL_H__
+
+
+#ifdef DEBUG_MATH  /* This code only used for debugging */
+
+
+/* Comment this out to deactivate the cycle counter.
+ * NOTE: it works only on CPUs which know the 'rdtsc' command (586 or higher)
+ * (hope, you don't try to debug Mesa on a 386 ;)
+ */
+#if defined(__GNUC__) && \
+    ((defined(__i386__) && defined(USE_X86_ASM)) || \
+     (defined(__sparc__) && defined(USE_SPARC_ASM)))
+#define  RUN_DEBUG_BENCHMARK
+#endif
+
+#define TEST_COUNT		128	/* size of the tested vector array   */
+
+#define REQUIRED_PRECISION	10	/* 14 of the 24 bits may miss        */
+#define MAX_PRECISION		24	/* max. precision possible           */
+
+
+#ifdef  RUN_DEBUG_BENCHMARK
+/* Overhead of profiling counter in cycles.  Automatically adjusted to
+ * your machine at run time - counter initialization should give very
+ * consistent results.
+ */
+extern long counter_overhead;
+
+/* This is the value of the environment variable MESA_PROFILE, and is
+ * used to determine if we should benchmark the functions as well as
+ * verify their correctness.
+ */
+extern char *mesa_profile;
+
+/* Modify the number of tests if you like.
+ * We take the minimum of all results, because every error should be
+ * positive (time used by other processes, task switches etc).
+ * It is assumed that all calculations are done in the cache.
+ */
+
+#if defined(__i386__)
+
+#if 1 /* PPro, PII, PIII version */
+
+/* P6 (out-of-order) version: a serializing 'cpuid' is issued before and
+ * after each 'rdtsc' so no other uops run while the counter is sampled.
+ * INIT_COUNTER keeps the smallest of 8 back-to-back sample differences;
+ * BEGIN_RACE opens and END_RACE closes a 10-pass loop (the caller declares
+ * cycle_i) that keeps the smallest difference, less counter_overhead. */
+#define  INIT_COUNTER()							\
+   do {									\
+      int cycle_i;							\
+      counter_overhead = LONG_MAX;					\
+      for ( cycle_i = 0 ; cycle_i < 8 ; cycle_i++ ) {			\
+	 long cycle_tmp1 = 0, cycle_tmp2 = 0;				\
+	 __asm__ __volatile__ ( "push %%ebx       \n"			\
+				"xor %%eax, %%eax \n"			\
+				"cpuid            \n"			\
+				"rdtsc            \n"			\
+				"mov %%eax, %0    \n"			\
+				"xor %%eax, %%eax \n"			\
+				"cpuid            \n"			\
+				"pop %%ebx        \n"			\
+				"push %%ebx       \n"			\
+				"xor %%eax, %%eax \n"			\
+				"cpuid            \n"			\
+				"rdtsc            \n"			\
+				"mov %%eax, %1    \n"			\
+				"xor %%eax, %%eax \n"			\
+				"cpuid            \n"			\
+				"pop %%ebx        \n"			\
+				: "=m" (cycle_tmp1), "=m" (cycle_tmp2)	\
+				: : "eax", "ecx", "edx" );		\
+	 if ( counter_overhead > (cycle_tmp2 - cycle_tmp1) ) {		\
+	    counter_overhead = cycle_tmp2 - cycle_tmp1;			\
+	 }								\
+      }									\
+   } while (0)
+
+#define  BEGIN_RACE(x)							\
+   x = LONG_MAX;							\
+   for ( cycle_i = 0 ; cycle_i < 10 ; cycle_i++ ) {			\
+      long cycle_tmp1 = 0, cycle_tmp2 = 0;				\
+      __asm__ __volatile__ ( "push %%ebx       \n"			\
+			     "xor %%eax, %%eax \n"			\
+			     "cpuid            \n"			\
+			     "rdtsc            \n"			\
+			     "mov %%eax, %0    \n"			\
+			     "xor %%eax, %%eax \n"			\
+			     "cpuid            \n"			\
+			     "pop %%ebx        \n"			\
+			     : "=m" (cycle_tmp1)			\
+			     : : "eax", "ecx", "edx" );
+
+#define END_RACE(x)							\
+      __asm__ __volatile__ ( "push %%ebx       \n"			\
+			     "xor %%eax, %%eax \n"			\
+			     "cpuid            \n"			\
+			     "rdtsc            \n"			\
+			     "mov %%eax, %0    \n"			\
+			     "xor %%eax, %%eax \n"			\
+			     "cpuid            \n"			\
+			     "pop %%ebx        \n"			\
+			     : "=m" (cycle_tmp2)			\
+			     : : "eax", "ecx", "edx" );			\
+      if ( x > (cycle_tmp2 - cycle_tmp1) ) {				\
+	 x = cycle_tmp2 - cycle_tmp1;					\
+      }									\
+   }									\
+   x -= counter_overhead;
+
+#else /* PPlain, PMMX version */
+
+/* To ensure accurate results, we stall the pipelines with the
+ * non-pairable 'cdq' instruction so the profiled code is complete when
+ * 'rdtsc' executes.  Compiled out by the '#if 1' above; note that this
+ * INIT_COUNTER takes the variable to fill as its argument. */
+#define  INIT_COUNTER(x)						\
+   do {									\
+      int cycle_i;							\
+      x = LONG_MAX;							\
+      for ( cycle_i = 0 ; cycle_i < 32 ; cycle_i++ ) {			\
+	 long cycle_tmp1, cycle_tmp2, dummy;				\
+	 __asm__ ( "mov %%eax, %0" : "=a" (cycle_tmp1) );		\
+	 __asm__ ( "mov %%eax, %0" : "=a" (cycle_tmp2) );		\
+	 __asm__ ( "cdq" );						\
+	 __asm__ ( "cdq" );						\
+	 __asm__ ( "rdtsc" : "=a" (cycle_tmp1), "=d" (dummy) );		\
+	 __asm__ ( "cdq" );						\
+	 __asm__ ( "cdq" );						\
+	 __asm__ ( "rdtsc" : "=a" (cycle_tmp2), "=d" (dummy) );		\
+	 if ( x > (cycle_tmp2 - cycle_tmp1) )				\
+	    x = cycle_tmp2 - cycle_tmp1;				\
+      }									\
+   } while (0)
+
+#define  BEGIN_RACE(x)							\
+   x = LONG_MAX;							\
+   for ( cycle_i = 0 ; cycle_i < 16 ; cycle_i++ ) {			\
+      long cycle_tmp1, cycle_tmp2, dummy;				\
+      __asm__ ( "mov %%eax, %0" : "=a" (cycle_tmp1) );			\
+      __asm__ ( "mov %%eax, %0" : "=a" (cycle_tmp2) );			\
+      __asm__ ( "cdq" );						\
+      __asm__ ( "cdq" );						\
+      __asm__ ( "rdtsc" : "=a" (cycle_tmp1), "=d" (dummy) );
+
+
+#define END_RACE(x)							\
+      __asm__ ( "cdq" );						\
+      __asm__ ( "cdq" );						\
+      __asm__ ( "rdtsc" : "=a" (cycle_tmp2), "=d" (dummy) );		\
+      if ( x > (cycle_tmp2 - cycle_tmp1) )				\
+	 x = cycle_tmp2 - cycle_tmp1;					\
+   }									\
+   x -= counter_overhead;
+
+#endif
+
+#elif defined(__x86_64__)
+
+#define rdtscll(val) do { \
+     unsigned int a,d; \
+     __asm__ volatile("rdtsc" : "=a" (a), "=d" (d)); \
+     (val) = ((unsigned long)a) | (((unsigned long)d)<<32); \
+} while(0) 
+/* Same min-of-N loops as the i386 PIII macros, but without cpuid.  NOTE:
+ * RUN_DEBUG_BENCHMARK above is only defined for i386/sparc builds. */
+#define  INIT_COUNTER()							\
+   do {									\
+      int cycle_i;							\
+      counter_overhead = LONG_MAX;					\
+      for ( cycle_i = 0 ; cycle_i < 16 ; cycle_i++ ) {			\
+	 unsigned long cycle_tmp1, cycle_tmp2;        			\
+	 rdtscll(cycle_tmp1);						\
+	 rdtscll(cycle_tmp2);						\
+	 if ( counter_overhead > (cycle_tmp2 - cycle_tmp1) ) {		\
+	    counter_overhead = cycle_tmp2 - cycle_tmp1;			\
+	 }								\
+      }									\
+   } while (0)
+
+
+#define  BEGIN_RACE(x)							\
+   x = LONG_MAX;							\
+   for ( cycle_i = 0 ; cycle_i < 10 ; cycle_i++ ) {			\
+      unsigned long cycle_tmp1, cycle_tmp2;				\
+      rdtscll(cycle_tmp1);						\
+
+#define END_RACE(x)							\
+      rdtscll(cycle_tmp2);						\
+      if ( x > (cycle_tmp2 - cycle_tmp1) ) {				\
+	 x = cycle_tmp2 - cycle_tmp1;					\
+      }									\
+   }									\
+   x -= counter_overhead;
+
+#elif defined(__sparc__)
+
+#define  INIT_COUNTER()	\
+	 do { counter_overhead = 5; } while(0)   /* fixed value, not measured */
+
+#define  BEGIN_RACE(x)                                                        \
+x = LONG_MAX;                                                                 \
+for (cycle_i = 0; cycle_i <10; cycle_i++) {                                   \
+   register long cycle_tmp1 __asm__("l0");				      \
+   register long cycle_tmp2 __asm__("l1");				      \
+   /* rd %tick, %l0 */							      \
+   __asm__ __volatile__ (".word 0xa1410000" : "=r" (cycle_tmp1));  /*  save timestamp   */
+
+#define END_RACE(x)                                                           \
+   /* rd %tick, %l1 */							      \
+   __asm__ __volatile__ (".word 0xa3410000" : "=r" (cycle_tmp2));	      \
+   if (x > (cycle_tmp2-cycle_tmp1)) x = cycle_tmp2 - cycle_tmp1;              \
+}                                                                             \
+x -= counter_overhead;
+
+#else
+#error Your processor is not supported for RUN_XFORM_BENCHMARK
+#endif
+
+#else
+
+#define BEGIN_RACE(x)
+#define END_RACE(x)
+
+#endif
+
+
+/* =============================================================
+ * Helper functions
+ */
+
+/* Pseudo-random value in [-1, 1], quantized to multiples of 1/4096. */
+static GLfloat rnd( void )
+{
+   const GLfloat steps = (GLfloat)(1 << 13);
+   GLfloat unit = (GLfloat)rand() / (GLfloat)RAND_MAX;
+   GLint bucket = (GLint)(unit * steps);   /* truncate to a 1/8192 grid */
+   unit = (GLfloat)bucket / steps;
+   return unit * 2.0 - 1.0;
+}
+
+/* Score how closely a and b agree, in bits (callers compare the result
+ * against REQUIRED_PRECISION):
+ *   - MAX_PRECISION when a - b is exactly zero,
+ *   - 0 when the difference is non-zero and either value is zero,
+ *   - otherwise the smaller of the exponents FREXPF reports for a and b,
+ *     minus the exponent it reports for their difference.
+ */
+static int significand_match( GLfloat a, GLfloat b )
+{
+   const GLfloat delta = a - b;
+   int exp_a, exp_b, exp_delta;
+
+   if ( delta == 0.0F )
+      return MAX_PRECISION;   /* Exact match */
+
+   /* It would probably be better to check if the non-zero number is
+    * denormalized and return the index of the highest set bit here.
+    */
+   if ( a == 0.0F || b == 0.0F )
+      return 0;
+
+   FREXPF( a, &exp_a );
+   FREXPF( b, &exp_b );
+   FREXPF( delta, &exp_delta );
+
+   return ( exp_a < exp_b ? exp_a : exp_b ) - exp_delta;
+}
+
+enum { NIL = 0, ONE = 1, NEG = -1, VAR = 2 };   /* matrix-template cell codes */
+
+/* Ensure our arrays are correctly aligned; compilers without a known
+ * alignment syntax get the plain, unaligned declaration. */
+#if defined(__GNUC__)
+#  define ALIGN16(type, array)	type array __attribute__ ((aligned (16)))
+#elif defined(_MSC_VER)
+#  define ALIGN16(type, array)	type array __declspec(align(16)) /* GH: Does this work? */
+#elif defined(__WATCOMC__)
+#  define ALIGN16(type, array)	type array          /* Watcom does not support this */
+#elif defined(__xlC__)
+#  define ALIGN16(type, array)       type __align (16) array 
+#else
+#  warning "ALIGN16 will not 16-byte align!\n"
+#  define ALIGN16(type, array)	type array
+#endif
+
+
+#endif /* DEBUG_MATH */
+
+#endif /* __M_DEBUG_UTIL_H__ */
diff --git a/mesalib/src/mesa/math/m_debug_xform.c b/mesalib/src/mesa/math/m_debug_xform.c
new file mode 100644
index 000000000..df8cc066b
--- /dev/null
+++ b/mesalib/src/mesa/math/m_debug_xform.c
@@ -0,0 +1,339 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  6.1
+ *
+ * Copyright (C) 1999-2004  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Updated for P6 architecture by Gareth Hughes.
+ */
+
+#include "main/glheader.h"
+#include "main/context.h"
+#include "main/macros.h"
+#include "main/imports.h"
+
+#include "m_matrix.h"
+#include "m_xform.h"
+
+#include "m_debug.h"
+#include "m_debug_util.h"
+
+#ifdef __UNIXOS2__
+/* The linker doesn't like empty files */
+static char dummy;
+#endif
+
+#ifdef DEBUG_MATH  /* This code only used for debugging */
+
+
+/* Overhead of profiling counter in cycles.  Automatically adjusted to
+ * your machine at run time - counter initialization should give very
+ * consistent results.
+ */
+long counter_overhead = 0;   /* 0 means INIT_COUNTER() has not run yet */
+
+/* This is the value of the environment variable MESA_PROFILE, and is
+ * used to determine if we should benchmark the functions as well as
+ * verify their correctness.
+ */
+char *mesa_profile = NULL;   /* assigned from _mesa_getenv() on the first test run */
+
+
+static int m_general[16] = {         /* template paired with MATRIX_GENERAL */
+   VAR, VAR, VAR, VAR,
+   VAR, VAR, VAR, VAR,
+   VAR, VAR, VAR, VAR,
+   VAR, VAR, VAR, VAR
+};
+static int m_identity[16] = {        /* template paired with MATRIX_IDENTITY */
+   ONE, NIL, NIL, NIL,
+   NIL, ONE, NIL, NIL,
+   NIL, NIL, ONE, NIL,
+   NIL, NIL, NIL, ONE
+};
+static int  m_2d[16]  = {            /* template paired with MATRIX_2D */
+   VAR, VAR, NIL, VAR,
+   VAR, VAR, NIL, VAR,
+   NIL, NIL, ONE, NIL,
+   NIL, NIL, NIL, ONE
+};
+static int m_2d_no_rot[16] = {       /* template paired with MATRIX_2D_NO_ROT */
+   VAR, NIL, NIL, VAR,
+   NIL, VAR, NIL, VAR,
+   NIL, NIL, ONE, NIL,
+   NIL, NIL, NIL, ONE
+};
+static int m_3d[16] = {              /* template paired with MATRIX_3D */
+   VAR, VAR, VAR, VAR,
+   VAR, VAR, VAR, VAR,
+   VAR, VAR, VAR, VAR,
+   NIL, NIL, NIL, ONE
+};
+static int m_3d_no_rot[16] = {       /* template paired with MATRIX_3D_NO_ROT */
+   VAR, NIL, NIL, VAR,
+   NIL, VAR, NIL, VAR,
+   NIL, NIL, VAR, VAR,
+   NIL, NIL, NIL, ONE
+};
+static int m_perspective[16] = {     /* template paired with MATRIX_PERSPECTIVE */
+   VAR, NIL, VAR, NIL,
+   NIL, VAR, VAR, NIL,
+   NIL, NIL, VAR, VAR,
+   NIL, NIL, NEG, NIL
+};
+static int *templates[7] = {         /* indexed by test case, parallel to mtypes[] */
+   m_general,
+   m_identity,
+   m_3d_no_rot,
+   m_perspective,
+   m_2d,
+   m_2d_no_rot,
+   m_3d
+};
+static enum GLmatrixtype mtypes[7] = {   /* matrix type stored in mat->type per case */
+   MATRIX_GENERAL,
+   MATRIX_IDENTITY,
+   MATRIX_3D_NO_ROT,
+   MATRIX_PERSPECTIVE,
+   MATRIX_2D,
+   MATRIX_2D_NO_ROT,
+   MATRIX_3D
+};
+static char *mstrings[7] = {         /* printable name of each mtypes[] entry */
+   "MATRIX_GENERAL",
+   "MATRIX_IDENTITY",
+   "MATRIX_3D_NO_ROT",
+   "MATRIX_PERSPECTIVE",
+   "MATRIX_2D",
+   "MATRIX_2D_NO_ROT",
+   "MATRIX_3D"
+};
+
+
+/* =============================================================
+ * Reference transformations
+ */
+
+/* Reference path: TRANSFORM_POINT with mat->m for every source point. */
+static void ref_transform( GLvector4f *dst, const GLmatrix *mat,
+                           const GLvector4f *src )
+{
+   GLfloat (*result)[4] = (GLfloat (*)[4])dst->start;
+   const GLfloat *matrix = mat->m;
+   const char *cursor = (const char *)src->start;
+   GLuint n;
+
+   for ( n = 0 ; n < src->count ; n++, cursor += src->stride ) {
+      const GLfloat *point = (const GLfloat *)cursor;
+      TRANSFORM_POINT( result[n], matrix, point );
+   }
+}
+
+
+/* =============================================================
+ * Vertex transformation tests
+ */
+
+/* Load the sixteen fixed test values into m[0]..m[15]. */
+static void init_matrix( GLfloat *m )
+{
+   static const GLfloat seed[16] = { 63, 55, 44, 11,  43, 17, 9, 23,  29, 31, 7, 91,  43, 7, 3, 9 };
+   int k;
+   for ( k = 0 ; k < 16 ; k++ ) m[k] = seed[k];
+}
+
+ALIGN16(static GLfloat, s[TEST_COUNT][4]);   /* source points */
+ALIGN16(static GLfloat, d[TEST_COUNT][4]);   /* output of the function under test */
+ALIGN16(static GLfloat, r[TEST_COUNT][4]);   /* output of ref_transform() */
+
+/* Check one vertex-transform function against ref_transform(), using
+ * psize random components per source point and a matrix shaped by
+ * templates[mtype].  Returns 1 if every output component matches the
+ * reference to REQUIRED_PRECISION bits, 0 otherwise.  The temporary
+ * matrix is freed on every path that allocated it.
+ */
+static int test_transform_function( transform_func func, int psize,
+                                    int mtype, unsigned long *cycles )
+{
+   GLvector4f source[1], dest[1], ref[1];
+   GLmatrix mat[1];
+   GLfloat *m;
+   int i, j, k, ok = 1;
+#ifdef  RUN_DEBUG_BENCHMARK
+   int cycle_i;                /* the counter for the benchmarks we run */
+#endif
+
+   (void) cycles;
+
+   if ( psize > 4 ) {
+      _mesa_problem( NULL, "test_transform_function called with psize > 4\n" );
+      return 0;
+   }
+
+   mat->m = (GLfloat *) ALIGN_MALLOC( 16 * sizeof(GLfloat), 16 );
+   mat->type = mtypes[mtype];
+
+   m = mat->m;
+   ASSERT( ((long)m & 15) == 0 );
+
+   init_matrix( m );
+
+   for ( i = 0 ; i < 4 ; i++ ) {
+      for ( j = 0 ; j < 4 ; j++ ) {
+         switch ( templates[mtype][i * 4 + j] ) {
+         case NIL:
+            m[j * 4 + i] = 0.0;
+            break;
+         case ONE:
+            m[j * 4 + i] = 1.0;
+            break;
+         case NEG:
+            m[j * 4 + i] = -1.0;
+            break;
+         case VAR:
+            break;
+         default:
+            ASSERT(0);
+            ALIGN_FREE( mat->m );
+            return 0;
+         }
+      }
+   }
+
+   for ( i = 0 ; i < TEST_COUNT ; i++) {
+      ASSIGN_4V( d[i], 0.0, 0.0, 0.0, 1.0 );
+      ASSIGN_4V( s[i], 0.0, 0.0, 0.0, 1.0 );
+      for ( j = 0 ; j < psize ; j++ )
+         s[i][j] = rnd();
+   }
+
+   source->data = (GLfloat(*)[4])s;
+   source->start = (GLfloat *)s;
+   source->count = TEST_COUNT;
+   source->stride = sizeof(s[0]);
+   source->size = 4;
+   source->flags = 0;
+
+   dest->data = (GLfloat(*)[4])d;
+   dest->start = (GLfloat *)d;
+   dest->count = TEST_COUNT;
+   dest->stride = sizeof(float[4]);
+   dest->size = 0;
+   dest->flags = 0;
+
+   ref->data = (GLfloat(*)[4])r;
+   ref->start = (GLfloat *)r;
+   ref->count = TEST_COUNT;
+   ref->stride = sizeof(float[4]);
+   ref->size = 0;
+   ref->flags = 0;
+
+   ref_transform( ref, mat, source );
+
+   if ( mesa_profile ) {
+      BEGIN_RACE( *cycles );
+      func( dest, mat->m, source );
+      END_RACE( *cycles );
+   } else {
+      func( dest, mat->m, source );
+   }
+
+   /* Report only the first mismatch, then fall through to the cleanup. */
+   for ( i = 0 ; ok && i < TEST_COUNT ; i++ ) {
+      for ( j = 0 ; ok && j < 4 ; j++ ) {
+         if ( significand_match( d[i][j], r[i][j] ) < REQUIRED_PRECISION ) {
+            _mesa_printf("-----------------------------\n" );
+            _mesa_printf("(i = %i, j = %i)\n", i, j );
+            for ( k = 0 ; k < 4 ; k++ ) {
+               _mesa_printf("%f \t %f \t [diff = %e - %i bit missed]\n",
+                            d[i][k], r[i][k], r[i][k]-d[i][k],
+                            MAX_PRECISION - significand_match( d[i][k], r[i][k] ) );
+            }
+            ok = 0;
+         }
+      }
+   }
+
+   ALIGN_FREE( mat->m );
+   return ok;
+}
+
+/* Test every _mesa_transform_tab[psize][type] entry (psize 1..4, the seven
+ * types in mtypes[]) and report failures through _mesa_problem().  The
+ * cycle-count table is printed only when RUN_DEBUG_BENCHMARK is defined
+ * and the MESA_PROFILE environment variable is set.
+ */
+void _math_test_all_transform_functions( char *description )
+{
+   int psize, mtype;
+   unsigned long benchmark_tab[4][7];
+   static int first_time = 1;
+
+   if ( first_time ) {
+      first_time = 0;
+      mesa_profile = _mesa_getenv( "MESA_PROFILE" );
+   }
+
+#ifdef RUN_DEBUG_BENCHMARK
+   if ( mesa_profile ) {
+      if ( !counter_overhead ) {
+         INIT_COUNTER();
+         _mesa_printf("counter overhead: %ld cycles\n\n", counter_overhead );
+      }
+      _mesa_printf("transform results after hooking in %s functions:\n", description );
+      _mesa_printf("\n" );
+      for ( psize = 1 ; psize <= 4 ; psize++ ) {
+         _mesa_printf(" p%d\t", psize );
+      }
+      _mesa_printf("\n--------------------------------------------------------\n" );
+   }
+#endif
+
+   for ( mtype = 0 ; mtype < 7 ; mtype++ ) {
+      for ( psize = 1 ; psize <= 4 ; psize++ ) {
+         transform_func func = _mesa_transform_tab[psize][mtypes[mtype]];
+         unsigned long *cycles = &(benchmark_tab[psize-1][mtype]);
+
+         if ( test_transform_function( func, psize, mtype, cycles ) == 0 ) {
+            /* Format inside _mesa_problem(): no fixed-size buffer to
+             * overflow, and 'description' is never treated as a format. */
+            _mesa_problem( NULL, "_mesa_transform_tab[0][%d][%s] failed test (%s)",
+                           psize, mstrings[mtype], description );
+         }
+#ifdef RUN_DEBUG_BENCHMARK
+         if ( mesa_profile )
+            _mesa_printf(" %li\t", (long) benchmark_tab[psize-1][mtype] );
+#endif
+      }
+#ifdef RUN_DEBUG_BENCHMARK
+      if ( mesa_profile )
+         _mesa_printf(" | [%s]\n", mstrings[mtype] );
+#endif
+   }
+#ifdef RUN_DEBUG_BENCHMARK
+   if ( mesa_profile )
+      _mesa_printf( "\n" );
+#endif
+}
+
+
+#endif /* DEBUG_MATH */
diff --git a/mesalib/src/mesa/math/m_dotprod_tmp.h b/mesalib/src/mesa/math/m_dotprod_tmp.h
new file mode 100644
index 000000000..03e65af6c
--- /dev/null
+++ b/mesalib/src/mesa/math/m_dotprod_tmp.h
@@ -0,0 +1,102 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * New (3.1) transformation code written by Keith Whitwell.
+ */
+
+
+/* Note - respects the stride of the output vector.
+ */
+static void TAG(dotprod_vec2)( GLfloat *out,
+			       GLuint outstride,
+			       const GLvector4f *coord_vec,
+			       const GLfloat plane[4] )
+{
+   /* Per vertex: out = x * plane[0] + y * plane[1] + plane[3]. */
+   const GLfloat px = plane[0], py = plane[1], pw = plane[3];
+   const GLuint instride = coord_vec->stride;
+   GLfloat *v = coord_vec->start;
+   GLuint n;
+
+   for (n = coord_vec->count; n > 0; n--) {
+      *out = (v[0] * px +
+	      v[1] * py +
+	      pw);
+      STRIDE_F(v, instride);
+      STRIDE_F(out, outstride);
+   }
+}
+
+static void TAG(dotprod_vec3)( GLfloat *out,
+			       GLuint outstride,
+			       const GLvector4f *coord_vec,
+			       const GLfloat plane[4] )
+{
+   /* Per vertex: out = x * plane[0] + y * plane[1] + z * plane[2] + plane[3];
+    * the fourth input component is not read. */
+   const GLfloat px = plane[0], py = plane[1], pz = plane[2], pw = plane[3];
+   const GLuint instride = coord_vec->stride;
+   GLfloat *v = coord_vec->start;
+   GLuint n;
+
+   for (n = coord_vec->count; n > 0; n--) {
+      *out = (v[0] * px +
+	      v[1] * py +
+	      v[2] * pz +
+	      pw);
+      STRIDE_F(v, instride);
+      STRIDE_F(out, outstride);
+   }
+}
+
+static void TAG(dotprod_vec4)( GLfloat *out,
+			       GLuint outstride,
+			       const GLvector4f *coord_vec,
+			       const GLfloat plane[4] )
+{
+   /* Per vertex: full four-component dot product of the point and plane. */
+   const GLfloat px = plane[0], py = plane[1], pz = plane[2], pw = plane[3];
+   const GLuint instride = coord_vec->stride;
+   GLfloat *v = coord_vec->start;
+   GLuint n;
+
+   for (n = coord_vec->count; n > 0; n--) {
+      *out = (v[0] * px +
+	      v[1] * py +
+	      v[2] * pz +
+	      v[3] * pw);
+      STRIDE_F(v, instride);
+      STRIDE_F(out, outstride);
+   }
+}
+
+
+static void TAG(init_dotprod)( void )
+{
+   _mesa_dotprod_tab[4] = TAG(dotprod_vec4);	/* reads coord[0..3] */
+   _mesa_dotprod_tab[3] = TAG(dotprod_vec3);	/* reads coord[0..2] */
+   _mesa_dotprod_tab[2] = TAG(dotprod_vec2);	/* reads coord[0..1] */
+}
diff --git a/mesalib/src/mesa/math/m_eval.c b/mesalib/src/mesa/math/m_eval.c
new file mode 100644
index 000000000..d324673c5
--- /dev/null
+++ b/mesalib/src/mesa/math/m_eval.c
@@ -0,0 +1,461 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+/*
+ * eval.c was written by
+ * Bernd Barsuhn (bdbarsuh@cip.informatik.uni-erlangen.de) and
+ * Volker Weiss (vrweiss@cip.informatik.uni-erlangen.de).
+ *
+ * My original implementation of evaluators was simplistic and didn't
+ * compute surface normal vectors properly.  Bernd and Volker used
+ * more sophisticated methods to get better results.
+ *
+ * Thanks guys!
+ */
+
+
+#include "main/glheader.h"
+#include "main/config.h"
+#include "m_eval.h"
+
+static GLfloat inv_tab[MAX_EVAL_ORDER];
+
+
+
+/*
+ * Horner scheme for Bezier curves
+ *
+ * Bezier curves can be computed via a Horner scheme.
+ * Horner is numerically less stable than the de Casteljau
+ * algorithm, but it is faster. For curves of degree n
+ * the complexity of Horner is O(n) and de Casteljau is O(n^2).
+ * Since stability is not important for displaying curve
+ * points I decided to use the Horner scheme.
+ *
+ * A cubic Bezier curve with control points b0, b1, b2, b3 can be
+ * written as
+ *
+ *        (([3]        [3]     )     [3]       )     [3]
+ * c(t) = (([0]*s*b0 + [1]*t*b1)*s + [2]*t^2*b2)*s + [3]*t^3*b3
+ *
+ *                                           [n]
+ * where s=1-t and the binomial coefficients [i]. These can
+ * be computed iteratively using the identity:
+ *
+ * [n]               [n  ]             [n]
+ * [i] = (n-i+1)/i * [i-1]     and     [0] = 1
+ */
+
+
+void
+_math_horner_bezier_curve(const GLfloat * cp, GLfloat * out, GLfloat t,
+			  GLuint dim, GLuint order)
+{
+   const GLfloat s = 1.0F - t;
+   GLfloat tpow, binom;
+   GLuint i, k;
+
+   if (order < 2) {		/* order=1 -> constant curve */
+      for (k = 0; k < dim; k++)
+	 out[k] = cp[k];
+      return;
+   }
+
+   /* First Horner step combines the first two control points. */
+   binom = (GLfloat) (order - 1);
+   for (k = 0; k < dim; k++)
+      out[k] = s * cp[k] + binom * t * cp[dim + k];
+
+   /* Fold in the remaining points; binom is updated via inv_tab[]. */
+   cp += 2 * dim;
+   tpow = t * t;
+   for (i = 2; i < order; i++, tpow *= t, cp += dim) {
+      binom *= (GLfloat) (order - i);
+      binom *= inv_tab[i];
+      for (k = 0; k < dim; k++)
+	 out[k] = s * out[k] + binom * tpow * cp[k];
+   }
+}
+
+/*
+ * Tensor product Bezier surfaces
+ *
+ * Again the Horner scheme is used to compute a point on a
+ * TP Bezier surface. First a control polygon for a curve
+ * on the surface in one parameter direction is computed,
+ * then the point on the curve for the other parameter
+ * direction is evaluated.
+ *
+ * To store the curve control polygon additional storage
+ * for max(uorder,vorder) points is needed in the
+ * control net cn.
+ */
+
+void
+_math_horner_bezier_surf(GLfloat * cn, GLfloat * out, GLfloat u, GLfloat v,
+			 GLuint dim, GLuint uorder, GLuint vorder)
+{
+   /* cp: scratch control polygon, stored right behind the uorder * vorder
+    * points of the control net.  The net is reduced along its shorter
+    * direction first, then the resulting curve is evaluated.
+    */
+   GLfloat *cp = cn + uorder * vorder * dim;
+   const GLuint uinc = vorder * dim;	/* floats from cn[i][j] to cn[i+1][j] */
+   GLuint i, j, k;
+
+   if (vorder > uorder) {
+      const GLfloat s = 1.0F - u;
+
+      if (uorder < 2) {		/* uorder=1 -> cn defines a curve in v */
+	 _math_horner_bezier_curve(cn, out, v, dim, vorder);
+	 return;
+      }
+
+      /* For each j, run an inlined Horner pass over cn[0..uorder-1][j] at
+       * parameter u; the vorder results in cp define a curve in v.
+       */
+      for (j = 0; j < vorder; j++) {
+	 const GLfloat *src = &cn[j * dim];
+	 GLfloat *dst = &cp[j * dim];
+	 GLfloat binom = (GLfloat) (uorder - 1);
+	 GLfloat upow = u * u;
+
+	 /* first step: control points i = 0 and i = 1 */
+	 for (k = 0; k < dim; k++)
+	    dst[k] = s * src[k] + binom * u * src[uinc + k];
+
+	 /* remaining control points i = 2 .. uorder-1 */
+	 src += 2 * uinc;
+	 for (i = 2; i < uorder; i++, upow *= u, src += uinc) {
+	    binom *= (GLfloat) (uorder - i);
+	    binom *= inv_tab[i];
+	    for (k = 0; k < dim; k++)
+	       dst[k] = s * dst[k] + binom * upow * src[k];
+	 }
+      }
+
+      /* Evaluate curve point in v */
+      _math_horner_bezier_curve(cp, out, v, dim, vorder);
+   }
+   else if (vorder > 1) {	/* 1 < vorder <= uorder */
+      /* For constant i all cn[i][j] (j = 0..vorder-1) are contiguous, so
+       * horner_bezier_curve can reduce row i in v; the uorder results in
+       * cp define a curve in u.
+       */
+      for (i = 0; i < uorder; i++, cn += uinc) {
+	 _math_horner_bezier_curve(cn, &cp[i * dim], v, dim, vorder);
+      }
+
+      /* Evaluate curve point in u */
+      _math_horner_bezier_curve(cp, out, u, dim, uorder);
+   }
+   else {			/* vorder=1 -> cn defines a curve in u */
+      _math_horner_bezier_curve(cn, out, u, dim, uorder);
+   }
+}
+
+/*
+ * The direct de Casteljau algorithm is used when a point on the
+ * surface and the tangent directions spanning the tangent plane
+ * should be computed (this is needed to compute normals to the
+ * surface). In this case the de Casteljau algorithm approach is
+ * nicer because a point and the partial derivatives can be computed
+ * at the same time. To get the correct tangent length du and dv
+ * must be multiplied with (uorder-1)/(u2-u1) and (vorder-1)/(v2-v1).
+ * Since only the directions are needed, this scaling step is omitted.
+ *
+ * De Casteljau needs additional storage for uorder*vorder
+ * values in the control net cn.
+ */
+
+void
+_math_de_casteljau_surf(GLfloat * cn, GLfloat * out, GLfloat * du,
+			GLfloat * dv, GLfloat u, GLfloat v, GLuint dim,
+			GLuint uorder, GLuint vorder)
+{
+   GLfloat *dcn = cn + uorder * vorder * dim;	/* scratch, right behind the net */
+   GLfloat us = 1.0F - u, vs = 1.0F - v;
+   GLuint h, i, j, k;
+   GLuint minorder = uorder < vorder ? uorder : vorder;
+   GLuint uinc = vorder * dim;	/* net floats per step in I */
+   GLuint dcuinc = vorder;	/* scratch floats per step in I */
+
+   /* Each component is evaluated separately to save buffer space. */
+   /* This does not drastically decrease the performance of the    */
+   /* algorithm. If additional storage for (uorder-1)*(vorder-1)   */
+   /* points were available, the components could be accessed in   */
+   /* the innermost loop, which could lead to fewer cache misses.  */
+
+#define CN(I,J,K) cn[(I)*uinc+(J)*dim+(K)]	/* control point (I,J), component K */
+#define DCN(I, J) dcn[(I)*dcuinc+(J)]	/* scratch value (I,J) of the current component */
+   if (minorder < 3) {	/* NOTE(review): paths below read index 1 in u and v, i.e. assume both orders >= 2 -- confirm */
+      if (uorder == vorder) {
+	 for (k = 0; k < dim; k++) {
+	    /* Derivative direction in u */
+	    du[k] = vs * (CN(1, 0, k) - CN(0, 0, k)) +
+	       v * (CN(1, 1, k) - CN(0, 1, k));
+
+	    /* Derivative direction in v */
+	    dv[k] = us * (CN(0, 1, k) - CN(0, 0, k)) +
+	       u * (CN(1, 1, k) - CN(1, 0, k));
+
+	    /* bilinear de Casteljau step */
+	    out[k] = us * (vs * CN(0, 0, k) + v * CN(0, 1, k)) +
+	       u * (vs * CN(1, 0, k) + v * CN(1, 1, k));
+	 }
+      }
+      else if (minorder == uorder) {
+	 for (k = 0; k < dim; k++) {
+	    /* bilinear de Casteljau step */
+	    DCN(1, 0) = CN(1, 0, k) - CN(0, 0, k);
+	    DCN(0, 0) = us * CN(0, 0, k) + u * CN(1, 0, k);
+
+	    for (j = 0; j < vorder - 1; j++) {
+	       /* for the derivative in u */
+	       DCN(1, j + 1) = CN(1, j + 1, k) - CN(0, j + 1, k);
+	       DCN(1, j) = vs * DCN(1, j) + v * DCN(1, j + 1);
+
+	       /* for the `point' */
+	       DCN(0, j + 1) = us * CN(0, j + 1, k) + u * CN(1, j + 1, k);
+	       DCN(0, j) = vs * DCN(0, j) + v * DCN(0, j + 1);
+	    }
+
+	    /* remaining linear de Casteljau steps until the second last step */
+	    for (h = minorder; h < vorder - 1; h++)
+	       for (j = 0; j < vorder - h; j++) {
+		  /* for the derivative in u */
+		  DCN(1, j) = vs * DCN(1, j) + v * DCN(1, j + 1);
+
+		  /* for the `point' */
+		  DCN(0, j) = vs * DCN(0, j) + v * DCN(0, j + 1);
+	       }
+
+	    /* derivative direction in v */
+	    dv[k] = DCN(0, 1) - DCN(0, 0);
+
+	    /* derivative direction in u */
+	    du[k] = vs * DCN(1, 0) + v * DCN(1, 1);
+
+	    /* last linear de Casteljau step */
+	    out[k] = vs * DCN(0, 0) + v * DCN(0, 1);
+	 }
+      }
+      else {			/* minorder == vorder */
+
+	 for (k = 0; k < dim; k++) {
+	    /* bilinear de Casteljau step */
+	    DCN(0, 1) = CN(0, 1, k) - CN(0, 0, k);
+	    DCN(0, 0) = vs * CN(0, 0, k) + v * CN(0, 1, k);
+	    for (i = 0; i < uorder - 1; i++) {
+	       /* for the derivative in v */
+	       DCN(i + 1, 1) = CN(i + 1, 1, k) - CN(i + 1, 0, k);
+	       DCN(i, 1) = us * DCN(i, 1) + u * DCN(i + 1, 1);
+
+	       /* for the `point' */
+	       DCN(i + 1, 0) = vs * CN(i + 1, 0, k) + v * CN(i + 1, 1, k);
+	       DCN(i, 0) = us * DCN(i, 0) + u * DCN(i + 1, 0);
+	    }
+
+	    /* remaining linear de Casteljau steps until the second last step */
+	    for (h = minorder; h < uorder - 1; h++)
+	       for (i = 0; i < uorder - h; i++) {
+		  /* for the derivative in v */
+		  DCN(i, 1) = us * DCN(i, 1) + u * DCN(i + 1, 1);
+
+		  /* for the `point' */
+		  DCN(i, 0) = us * DCN(i, 0) + u * DCN(i + 1, 0);
+	       }
+
+	    /* derivative direction in u */
+	    du[k] = DCN(1, 0) - DCN(0, 0);
+
+	    /* derivative direction in v */
+	    dv[k] = us * DCN(0, 1) + u * DCN(1, 1);
+
+	    /* last linear de Casteljau step */
+	    out[k] = us * DCN(0, 0) + u * DCN(1, 0);
+	 }
+      }
+   }
+   else if (uorder == vorder) {
+      for (k = 0; k < dim; k++) {
+	 /* first bilinear de Casteljau step */
+	 for (i = 0; i < uorder - 1; i++) {
+	    DCN(i, 0) = us * CN(i, 0, k) + u * CN(i + 1, 0, k);
+	    for (j = 0; j < vorder - 1; j++) {
+	       DCN(i, j + 1) = us * CN(i, j + 1, k) + u * CN(i + 1, j + 1, k);
+	       DCN(i, j) = vs * DCN(i, j) + v * DCN(i, j + 1);
+	    }
+	 }
+
+	 /* remaining bilinear de Casteljau steps until the second last step */
+	 for (h = 2; h < minorder - 1; h++)
+	    for (i = 0; i < uorder - h; i++) {
+	       DCN(i, 0) = us * DCN(i, 0) + u * DCN(i + 1, 0);
+	       for (j = 0; j < vorder - h; j++) {
+		  DCN(i, j + 1) = us * DCN(i, j + 1) + u * DCN(i + 1, j + 1);
+		  DCN(i, j) = vs * DCN(i, j) + v * DCN(i, j + 1);
+	       }
+	    }
+
+	 /* derivative direction in u */
+	 du[k] = vs * (DCN(1, 0) - DCN(0, 0)) + v * (DCN(1, 1) - DCN(0, 1));
+
+	 /* derivative direction in v */
+	 dv[k] = us * (DCN(0, 1) - DCN(0, 0)) + u * (DCN(1, 1) - DCN(1, 0));
+
+	 /* last bilinear de Casteljau step */
+	 out[k] = us * (vs * DCN(0, 0) + v * DCN(0, 1)) +
+	    u * (vs * DCN(1, 0) + v * DCN(1, 1));
+      }
+   }
+   else if (minorder == uorder) {
+      for (k = 0; k < dim; k++) {
+	 /* first bilinear de Casteljau step */
+	 for (i = 0; i < uorder - 1; i++) {
+	    DCN(i, 0) = us * CN(i, 0, k) + u * CN(i + 1, 0, k);
+	    for (j = 0; j < vorder - 1; j++) {
+	       DCN(i, j + 1) = us * CN(i, j + 1, k) + u * CN(i + 1, j + 1, k);
+	       DCN(i, j) = vs * DCN(i, j) + v * DCN(i, j + 1);
+	    }
+	 }
+
+	 /* remaining bilinear de Casteljau steps until the second last step */
+	 for (h = 2; h < minorder - 1; h++)
+	    for (i = 0; i < uorder - h; i++) {
+	       DCN(i, 0) = us * DCN(i, 0) + u * DCN(i + 1, 0);
+	       for (j = 0; j < vorder - h; j++) {
+		  DCN(i, j + 1) = us * DCN(i, j + 1) + u * DCN(i + 1, j + 1);
+		  DCN(i, j) = vs * DCN(i, j) + v * DCN(i, j + 1);
+	       }
+	    }
+
+	 /* last bilinear de Casteljau step */
+	 DCN(2, 0) = DCN(1, 0) - DCN(0, 0);
+	 DCN(0, 0) = us * DCN(0, 0) + u * DCN(1, 0);
+	 for (j = 0; j < vorder - 1; j++) {
+	    /* for the derivative in u */
+	    DCN(2, j + 1) = DCN(1, j + 1) - DCN(0, j + 1);
+	    DCN(2, j) = vs * DCN(2, j) + v * DCN(2, j + 1);
+
+	    /* for the `point' */
+	    DCN(0, j + 1) = us * DCN(0, j + 1) + u * DCN(1, j + 1);
+	    DCN(0, j) = vs * DCN(0, j) + v * DCN(0, j + 1);
+	 }
+
+	 /* remaining linear de Casteljau steps until the second last step */
+	 for (h = minorder; h < vorder - 1; h++)
+	    for (j = 0; j < vorder - h; j++) {
+	       /* for the derivative in u */
+	       DCN(2, j) = vs * DCN(2, j) + v * DCN(2, j + 1);
+
+	       /* for the `point' */
+	       DCN(0, j) = vs * DCN(0, j) + v * DCN(0, j + 1);
+	    }
+
+	 /* derivative direction in v */
+	 dv[k] = DCN(0, 1) - DCN(0, 0);
+
+	 /* derivative direction in u */
+	 du[k] = vs * DCN(2, 0) + v * DCN(2, 1);
+
+	 /* last linear de Casteljau step */
+	 out[k] = vs * DCN(0, 0) + v * DCN(0, 1);
+      }
+   }
+   else {			/* minorder == vorder */
+
+      for (k = 0; k < dim; k++) {
+	 /* first bilinear de Casteljau step */
+	 for (i = 0; i < uorder - 1; i++) {
+	    DCN(i, 0) = us * CN(i, 0, k) + u * CN(i + 1, 0, k);
+	    for (j = 0; j < vorder - 1; j++) {
+	       DCN(i, j + 1) = us * CN(i, j + 1, k) + u * CN(i + 1, j + 1, k);
+	       DCN(i, j) = vs * DCN(i, j) + v * DCN(i, j + 1);
+	    }
+	 }
+
+	 /* remaining bilinear de Casteljau steps until the second last step */
+	 for (h = 2; h < minorder - 1; h++)
+	    for (i = 0; i < uorder - h; i++) {
+	       DCN(i, 0) = us * DCN(i, 0) + u * DCN(i + 1, 0);
+	       for (j = 0; j < vorder - h; j++) {
+		  DCN(i, j + 1) = us * DCN(i, j + 1) + u * DCN(i + 1, j + 1);
+		  DCN(i, j) = vs * DCN(i, j) + v * DCN(i, j + 1);
+	       }
+	    }
+
+	 /* last bilinear de Casteljau step */
+	 DCN(0, 2) = DCN(0, 1) - DCN(0, 0);
+	 DCN(0, 0) = vs * DCN(0, 0) + v * DCN(0, 1);
+	 for (i = 0; i < uorder - 1; i++) {
+	    /* for the derivative in v */
+	    DCN(i + 1, 2) = DCN(i + 1, 1) - DCN(i + 1, 0);
+	    DCN(i, 2) = us * DCN(i, 2) + u * DCN(i + 1, 2);
+
+	    /* for the `point' */
+	    DCN(i + 1, 0) = vs * DCN(i + 1, 0) + v * DCN(i + 1, 1);
+	    DCN(i, 0) = us * DCN(i, 0) + u * DCN(i + 1, 0);
+	 }
+
+	 /* remaining linear de Casteljau steps until the second last step */
+	 for (h = minorder; h < uorder - 1; h++)
+	    for (i = 0; i < uorder - h; i++) {
+	       /* for the derivative in v */
+	       DCN(i, 2) = us * DCN(i, 2) + u * DCN(i + 1, 2);
+
+	       /* for the `point' */
+	       DCN(i, 0) = us * DCN(i, 0) + u * DCN(i + 1, 0);
+	    }
+
+	 /* derivative direction in u */
+	 du[k] = DCN(1, 0) - DCN(0, 0);
+
+	 /* derivative direction in v */
+	 dv[k] = us * DCN(0, 2) + u * DCN(1, 2);
+
+	 /* last linear de Casteljau step */
+	 out[k] = us * DCN(0, 0) + u * DCN(1, 0);
+      }
+   }
+#undef DCN
+#undef CN
+}
+
+
+/*
+ * Do one-time initialization for evaluators.
+ */
+void
+_math_init_eval(void)
+{
+   GLuint n = 1;
+
+   /* Store 1/n for n = 1 .. MAX_EVAL_ORDER-1; inv_tab[0] is not written. */
+   for (; n < MAX_EVAL_ORDER; ++n) {
+      inv_tab[n] = 1.0F / (GLfloat) n;
+   }
+}
diff --git a/mesalib/src/mesa/math/m_eval.h b/mesalib/src/mesa/math/m_eval.h
new file mode 100644
index 000000000..d73ecaafb
--- /dev/null
+++ b/mesalib/src/mesa/math/m_eval.h
@@ -0,0 +1,103 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _M_EVAL_H
+#define _M_EVAL_H
+
+#include "main/glheader.h"
+
+void _math_init_eval( void );
+
+
+/*
+ * Horner scheme for Bezier curves
+ *
+ * Bezier curves can be computed via a Horner scheme.
+ * Horner is numerically less stable than the de Casteljau
+ * algorithm, but it is faster. For curves of degree n
+ * the complexity of Horner is O(n) and de Casteljau is O(n^2).
+ * Since stability is not important for displaying curve
+ * points I decided to use the Horner scheme.
+ *
+ * A cubic Bezier curve with control points b0, b1, b2, b3 can be
+ * written as
+ *
+ *        (([3]        [3]     )     [3]       )     [3]
+ * c(t) = (([0]*s*b0 + [1]*t*b1)*s + [2]*t^2*b2)*s + [3]*t^3*b3
+ *
+ *                                           [n]
+ * where s=1-t and the binomial coefficients [i]. These can
+ * be computed iteratively using the identity:
+ *
+ * [n]               [n  ]             [n]
+ * [i] = (n-i+1)/i * [i-1]     and     [0] = 1
+ */
+
+
+void
+_math_horner_bezier_curve(const GLfloat *cp, GLfloat *out, GLfloat t,
+			  GLuint dim, GLuint order);
+
+
+/*
+ * Tensor product Bezier surfaces
+ *
+ * Again the Horner scheme is used to compute a point on a
+ * TP Bezier surface. First a control polygon for a curve
+ * on the surface in one parameter direction is computed,
+ * then the point on the curve for the other parameter
+ * direction is evaluated.
+ *
+ * To store the curve control polygon additional storage
+ * for max(uorder,vorder) points is needed in the
+ * control net cn.
+ */
+
+void
+_math_horner_bezier_surf(GLfloat *cn, GLfloat *out, GLfloat u, GLfloat v,
+			 GLuint dim, GLuint uorder, GLuint vorder);
+
+
+/*
+ * The direct de Casteljau algorithm is used when a point on the
+ * surface and the tangent directions spanning the tangent plane
+ * should be computed (this is needed to compute normals to the
+ * surface). In this case the de Casteljau algorithm approach is
+ * nicer because a point and the partial derivatives can be computed
+ * at the same time. To get the correct tangent length du and dv
+ * must be multiplied with (uorder-1)/(u2-u1) and (vorder-1)/(v2-v1).
+ * Since only the directions are needed, this scaling step is omitted.
+ *
+ * De Casteljau needs additional storage for uorder*vorder
+ * values in the control net cn.
+ */
+
+void
+_math_de_casteljau_surf(GLfloat *cn, GLfloat *out, GLfloat *du, GLfloat *dv,
+			GLfloat u, GLfloat v, GLuint dim,
+			GLuint uorder, GLuint vorder);
+
+
+#endif
diff --git a/mesalib/src/mesa/math/m_matrix.c b/mesalib/src/mesa/math/m_matrix.c
new file mode 100644
index 000000000..da6956efe
--- /dev/null
+++ b/mesalib/src/mesa/math/m_matrix.c
@@ -0,0 +1,1642 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  6.3
+ *
+ * Copyright (C) 1999-2005  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+/**
+ * \file m_matrix.c
+ * Matrix operations.
+ *
+ * \note
+ * -# 4x4 transformation matrices are stored in memory in column major order.
+ * -# Points/vertices are to be thought of as column vectors.
+ * -# Transformation of a point p by a matrix M is: p' = M * p
+ */
+
+
+#include "main/glheader.h"
+#include "main/imports.h"
+#include "main/macros.h"
+#include "main/imports.h"
+
+#include "m_matrix.h"
+
+
+/**
+ * \defgroup MatFlags MAT_FLAG_XXX-flags
+ *
+ * Bitmasks to indicate different kinds of 4x4 matrices in GLmatrix::flags
+ * It would be nice to make all these flags private to m_matrix.c
+ */
+/*@{*/
+#define MAT_FLAG_IDENTITY       0     /**< is an identity matrix flag.
+                                       *   (Not actually used - the identity
+                                       *   matrix is identified by the absence
+                                       *   of all other flags.)
+                                       */
+#define MAT_FLAG_GENERAL        0x1   /**< is a general matrix flag */
+#define MAT_FLAG_ROTATION       0x2   /**< is a rotation matrix flag */
+#define MAT_FLAG_TRANSLATION    0x4   /**< is a translation matrix flag */
+#define MAT_FLAG_UNIFORM_SCALE  0x8   /**< is a uniform scaling matrix flag */
+#define MAT_FLAG_GENERAL_SCALE  0x10  /**< is a general scaling matrix flag */
+#define MAT_FLAG_GENERAL_3D     0x20  /**< general 3D matrix flag */
+#define MAT_FLAG_PERSPECTIVE    0x40  /**< is a perspective proj matrix flag */
+#define MAT_FLAG_SINGULAR       0x80  /**< is a singular matrix flag */
+#define MAT_DIRTY_TYPE          0x100  /**< matrix type is dirty */
+#define MAT_DIRTY_FLAGS         0x200  /**< matrix flags are dirty */
+#define MAT_DIRTY_INVERSE       0x400  /**< matrix inverse is dirty */
+
+/** angle preserving matrix flags mask */
+#define MAT_FLAGS_ANGLE_PRESERVING (MAT_FLAG_ROTATION | \
+				    MAT_FLAG_TRANSLATION | \
+				    MAT_FLAG_UNIFORM_SCALE)
+
+/** geometry related matrix flags mask (every MAT_FLAG_* bit, no dirty bits) */
+#define MAT_FLAGS_GEOMETRY (MAT_FLAG_GENERAL | \
+			    MAT_FLAG_ROTATION | \
+			    MAT_FLAG_TRANSLATION | \
+			    MAT_FLAG_UNIFORM_SCALE | \
+			    MAT_FLAG_GENERAL_SCALE | \
+			    MAT_FLAG_GENERAL_3D | \
+			    MAT_FLAG_PERSPECTIVE | \
+	                    MAT_FLAG_SINGULAR)
+
+/** length preserving matrix flags mask */
+#define MAT_FLAGS_LENGTH_PRESERVING (MAT_FLAG_ROTATION | \
+				     MAT_FLAG_TRANSLATION)
+
+
+/** 3D (non-perspective) matrix flags mask */
+#define MAT_FLAGS_3D (MAT_FLAG_ROTATION | \
+		      MAT_FLAG_TRANSLATION | \
+		      MAT_FLAG_UNIFORM_SCALE | \
+		      MAT_FLAG_GENERAL_SCALE | \
+		      MAT_FLAG_GENERAL_3D)
+
+/** dirty matrix flags mask (type, flags and inverse) */
+#define MAT_DIRTY          (MAT_DIRTY_TYPE | \
+			    MAT_DIRTY_FLAGS | \
+			    MAT_DIRTY_INVERSE)
+
+/*@}*/
+
+
+/** 
+ * Test geometry related matrix flags.
+ * 
+ * \param mat a pointer to a GLmatrix structure.
+ * \param a flags mask.
+ *
+ * \returns non-zero if every geometry related flag set in \p mat is also
+ * set in the mask \p a, or zero otherwise.
+ */ 
+#define TEST_MAT_FLAGS(mat, a)  \
+    ((MAT_FLAGS_GEOMETRY & (~(a)) & ((mat)->flags) ) == 0)
+
+
+
+/**
+ * GLmatrixtype names, indexed by GLmatrix::type in _math_matrix_print().
+ */
+static const char *types[] = {
+   "MATRIX_GENERAL",
+   "MATRIX_IDENTITY",
+   "MATRIX_3D_NO_ROT",
+   "MATRIX_PERSPECTIVE",
+   "MATRIX_2D",
+   "MATRIX_2D_NO_ROT",
+   "MATRIX_3D"
+};
+
+
+/**
+ * Identity matrix.
+ */
+static GLfloat Identity[16] = {
+   1.0, 0.0, 0.0, 0.0,
+   0.0, 1.0, 0.0, 0.0,
+   0.0, 0.0, 1.0, 0.0,
+   0.0, 0.0, 0.0, 1.0
+};
+
+
+
+/**********************************************************************/
+/** \name Matrix multiplication */
+/*@{*/
+
+#define A(row,col)  a[((col)<<2)+(row)]	/* column-major element of a; args parenthesised */
+#define B(row,col)  b[((col)<<2)+(row)]	/* column-major element of b */
+#define P(row,col)  product[((col)<<2)+(row)]	/* column-major element of product */
+
+/**
+ * Perform a full 4x4 matrix multiplication.
+ *
+ * \param a matrix.
+ * \param b matrix.
+ * \param product will receive the product of \p a and \p b.
+ *
+ * \warning Is assumed that \p product != \p b. \p product == \p a is allowed.
+ *
+ * \note KW: 4*16 = 64 multiplications
+ * 
+ * \author This \c matmul was contributed by Thomas Malik
+ */
+static void matmul4( GLfloat *product, const GLfloat *a, const GLfloat *b )
+{
+   GLint r, c;
+   for (r = 0; r < 4; r++) {
+      /* Cache row r of a before writing row r: product may alias a. */
+      const GLfloat ar0=A(r,0),  ar1=A(r,1),  ar2=A(r,2),  ar3=A(r,3);
+      for (c = 0; c < 4; c++) {
+         P(r,c) = ar0 * B(0,c) + ar1 * B(1,c) + ar2 * B(2,c) + ar3 * B(3,c);
+      }
+   }
+}
+
+/**
+ * Multiply two matrices known to occupy only the top three rows, such
+ * as typical model matrices, and orthogonal matrices.
+ *
+ * \param a matrix.
+ * \param b matrix.
+ * \param product will receive the product of \p a and \p b.
+ */
+static void matmul34( GLfloat *product, const GLfloat *a, const GLfloat *b )
+{
+   GLint r, c;
+   for (r = 0; r < 3; r++) {
+      const GLfloat ar0=A(r,0),  ar1=A(r,1),  ar2=A(r,2),  ar3=A(r,3);
+      /* b's bottom row is taken as (0 0 0 1): only column 3 adds ar3. */
+      for (c = 0; c < 3; c++)
+         P(r,c) = ar0 * B(0,c) + ar1 * B(1,c) + ar2 * B(2,c);
+      P(r,3) = ar0 * B(0,3) + ar1 * B(1,3) + ar2 * B(2,3) + ar3;
+   }
+   P(3,0) = 0;
+   P(3,1) = 0;
+   P(3,2) = 0;
+   P(3,3) = 1;
+}
+
+#undef A
+#undef B
+#undef P
+
+/**
+ * Multiply a matrix by an array of floats with known properties.
+ *
+ * \param mat pointer to a GLmatrix structure containing the left multiplication
+ * matrix, and that will receive the product result.
+ * \param m right multiplication matrix array.
+ * \param flags flags of the matrix \p m.
+ * 
+ * Joins both flags and marks the type and inverse as dirty.  Calls matmul34()
+ * if both matrices are 3D, or matmul4() otherwise.
+ */
+static void matrix_multf( GLmatrix *mat, const GLfloat *m, GLuint flags )
+{
+   GLfloat *const dst = mat->m;	/* updated in place: dst = dst * m */
+   mat->flags |= flags | (MAT_DIRTY_TYPE | MAT_DIRTY_INVERSE);
+   if (!TEST_MAT_FLAGS(mat, MAT_FLAGS_3D))
+      matmul4( dst, dst, m );
+   else
+      matmul34( dst, dst, m );
+}
+
+/**
+ * Matrix multiplication.
+ *
+ * \param dest destination matrix.
+ * \param a left matrix.
+ * \param b right matrix.
+ * 
+ * Joins both flags and marks the type and inverse as dirty.  Calls matmul34()
+ * if both matrices are 3D, or matmul4() otherwise.
+ */
+void
+_math_matrix_mul_matrix( GLmatrix *dest, const GLmatrix *a, const GLmatrix *b )
+{
+   /* The product carries every flag of both operands; its type and
+    * inverse must be recomputed.
+    */
+   dest->flags = a->flags | b->flags | (MAT_DIRTY_TYPE | MAT_DIRTY_INVERSE);
+
+   if (!TEST_MAT_FLAGS(dest, MAT_FLAGS_3D))
+      matmul4( dest->m, a->m, b->m );
+   else
+      matmul34( dest->m, a->m, b->m );
+}
+
+/**
+ * Matrix multiplication.
+ *
+ * \param dest left and destination matrix.
+ * \param m right matrix array.
+ * 
+ * Marks the matrix flags with general flag, and type and inverse dirty flags.
+ * Calls matmul4() for the multiplication.
+ */
+void
+_math_matrix_mul_floats( GLmatrix *dest, const GLfloat *m )
+{
+   GLfloat *const dst = dest->m;
+
+   /* No flags are supplied for m: flag the product as general and mark its
+    * type, flags and inverse dirty (MAT_DIRTY covers all three). */
+   dest->flags |= (MAT_FLAG_GENERAL | MAT_DIRTY);
+   matmul4( dst, dst, m );
+}
+
+/*@}*/
+
+
+/**********************************************************************/
+/** \name Matrix output */
+/*@{*/
+
+/**
+ * Print a matrix array.
+ *
+ * \param m matrix array.
+ *
+ * Called by _math_matrix_print() to print a matrix or its inverse.
+ */
+static void print_matrix_floats( const GLfloat m[16] )
+{
+   int row = 0;
+   /* One output line per row: elements row, row+4, row+8, row+12. */
+   for (; row < 4; row++)
+      _mesa_debug(NULL,"\t%f %f %f %f\n", m[row], m[4+row], m[8+row], m[12+row] );
+}
+
+/**
+ * Dumps the contents of a GLmatrix structure.
+ * 
+ * \param m pointer to the GLmatrix structure.
+ */
+void
+_math_matrix_print( const GLmatrix *m )
+{
+   GLfloat prod[16];
+   _mesa_debug(NULL, "Matrix type: %s, flags: %x\n", types[m->type], m->flags);
+   print_matrix_floats(m->m);
+   _mesa_debug(NULL, "Inverse: \n");
+   if (!m->inv) {
+      _mesa_debug(NULL, "  - not available\n");
+      return;
+   }
+   /* Print the inverse, then m * inverse so it can be checked by eye. */
+   print_matrix_floats(m->inv);
+   matmul4(prod, m->m, m->inv);
+   _mesa_debug(NULL, "Mat * Inverse:\n");
+   print_matrix_floats(prod);
+}
+
+/*@}*/
+
+
+/**
+ * References an element of 4x4 matrix.
+ *
+ * \param m matrix array.
+ * \param r row of the desired element.
+ * \param c column of the desired element.
+ * 
+ * \return value of the desired element.
+ *
+ * Calculate the linear storage index of the element and references it. 
+ */
+#define MAT(m,r,c) (m)[(c)*4+(r)]
+
+
+/**********************************************************************/
+/** \name Matrix inversion */
+/*@{*/
+
+/**
+ * Swaps the values of two GLfloat pointer variables.
+ *
+ * Used by invert_matrix_general() to swap the row pointers.
+ */
+#define SWAP_ROWS(a, b) { GLfloat *_tmp = a; (a)=(b); (b)=_tmp; }
+
+/**
+ * Compute inverse of 4x4 transformation matrix.
+ * 
+ * \param mat pointer to a GLmatrix structure. The matrix inverse will be
+ * stored in the GLmatrix::inv attribute.
+ * 
+ * \return GL_TRUE for success, GL_FALSE for failure (\p singular matrix).
+ * 
+ * \author
+ * Code contributed by Jacques Leroy jle@star.be
+ *
+ * Calculates the inverse matrix by performing the gaussian matrix reduction
+ * with partial pivoting followed by back/substitution with the loops manually
+ * unrolled.
+ */
+static GLboolean invert_matrix_general( GLmatrix *mat )
+{
+   const GLfloat *m = mat->m;
+   GLfloat *out = mat->inv;
+   GLfloat wtmp[4][8];           /* augmented work matrix, 4 rows of 8 */
+   GLfloat m0, m1, m2, m3, s;
+   GLfloat *r0, *r1, *r2, *r3;   /* row pointers: pivoting swaps these, not the data */
+
+   r0 = wtmp[0], r1 = wtmp[1], r2 = wtmp[2], r3 = wtmp[3];
+   /* load [ M | I ]: columns 0-3 take a row of M, columns 4-7 a row of identity */
+   r0[0] = MAT(m,0,0), r0[1] = MAT(m,0,1),
+   r0[2] = MAT(m,0,2), r0[3] = MAT(m,0,3),
+   r0[4] = 1.0, r0[5] = r0[6] = r0[7] = 0.0,
+
+   r1[0] = MAT(m,1,0), r1[1] = MAT(m,1,1),
+   r1[2] = MAT(m,1,2), r1[3] = MAT(m,1,3),
+   r1[5] = 1.0, r1[4] = r1[6] = r1[7] = 0.0,
+
+   r2[0] = MAT(m,2,0), r2[1] = MAT(m,2,1),
+   r2[2] = MAT(m,2,2), r2[3] = MAT(m,2,3),
+   r2[6] = 1.0, r2[4] = r2[5] = r2[7] = 0.0,
+
+   r3[0] = MAT(m,3,0), r3[1] = MAT(m,3,1),
+   r3[2] = MAT(m,3,2), r3[3] = MAT(m,3,3),
+   r3[7] = 1.0, r3[4] = r3[5] = r3[6] = 0.0;
+
+   /* choose pivot (row with the largest |column 0| ends up in r0) - or die */
+   if (FABSF(r3[0])>FABSF(r2[0])) SWAP_ROWS(r3, r2);
+   if (FABSF(r2[0])>FABSF(r1[0])) SWAP_ROWS(r2, r1);
+   if (FABSF(r1[0])>FABSF(r0[0])) SWAP_ROWS(r1, r0);
+   if (0.0 == r0[0])  return GL_FALSE;
+
+   /* eliminate first variable from r1..r3 (zero multiples of r0 are skipped) */
+   m1 = r1[0]/r0[0]; m2 = r2[0]/r0[0]; m3 = r3[0]/r0[0];
+   s = r0[1]; r1[1] -= m1 * s; r2[1] -= m2 * s; r3[1] -= m3 * s;
+   s = r0[2]; r1[2] -= m1 * s; r2[2] -= m2 * s; r3[2] -= m3 * s;
+   s = r0[3]; r1[3] -= m1 * s; r2[3] -= m2 * s; r3[3] -= m3 * s;
+   s = r0[4];
+   if (s != 0.0) { r1[4] -= m1 * s; r2[4] -= m2 * s; r3[4] -= m3 * s; }
+   s = r0[5];
+   if (s != 0.0) { r1[5] -= m1 * s; r2[5] -= m2 * s; r3[5] -= m3 * s; }
+   s = r0[6];
+   if (s != 0.0) { r1[6] -= m1 * s; r2[6] -= m2 * s; r3[6] -= m3 * s; }
+   s = r0[7];
+   if (s != 0.0) { r1[7] -= m1 * s; r2[7] -= m2 * s; r3[7] -= m3 * s; }
+
+   /* choose pivot (largest remaining |column 1| ends up in r1) - or die */
+   if (FABSF(r3[1])>FABSF(r2[1])) SWAP_ROWS(r3, r2);
+   if (FABSF(r2[1])>FABSF(r1[1])) SWAP_ROWS(r2, r1);
+   if (0.0 == r1[1])  return GL_FALSE;
+
+   /* eliminate second variable from r2 and r3 */
+   m2 = r2[1]/r1[1]; m3 = r3[1]/r1[1];
+   r2[2] -= m2 * r1[2]; r3[2] -= m3 * r1[2];
+   r2[3] -= m2 * r1[3]; r3[3] -= m3 * r1[3];
+   s = r1[4]; if (0.0 != s) { r2[4] -= m2 * s; r3[4] -= m3 * s; }
+   s = r1[5]; if (0.0 != s) { r2[5] -= m2 * s; r3[5] -= m3 * s; }
+   s = r1[6]; if (0.0 != s) { r2[6] -= m2 * s; r3[6] -= m3 * s; }
+   s = r1[7]; if (0.0 != s) { r2[7] -= m2 * s; r3[7] -= m3 * s; }
+
+   /* choose pivot (larger |column 2| of the last two rows ends up in r2) - or die */
+   if (FABSF(r3[2])>FABSF(r2[2])) SWAP_ROWS(r3, r2);
+   if (0.0 == r2[2])  return GL_FALSE;
+
+   /* eliminate third variable from r3 */
+   m3 = r3[2]/r2[2];
+   r3[3] -= m3 * r2[3], r3[4] -= m3 * r2[4],
+   r3[5] -= m3 * r2[5], r3[6] -= m3 * r2[6],
+   r3[7] -= m3 * r2[7];
+
+   /* last check: a zero final pivot also means the matrix is singular */
+   if (0.0 == r3[3]) return GL_FALSE;
+
+   s = 1.0F/r3[3];             /* now back substitute row 3 */
+   r3[4] *= s; r3[5] *= s; r3[6] *= s; r3[7] *= s;
+
+   m2 = r2[3];                 /* now back substitute row 2 */
+   s  = 1.0F/r2[2];
+   r2[4] = s * (r2[4] - r3[4] * m2), r2[5] = s * (r2[5] - r3[5] * m2),
+   r2[6] = s * (r2[6] - r3[6] * m2), r2[7] = s * (r2[7] - r3[7] * m2);
+   m1 = r1[3];
+   r1[4] -= r3[4] * m1, r1[5] -= r3[5] * m1,
+   r1[6] -= r3[6] * m1, r1[7] -= r3[7] * m1;
+   m0 = r0[3];
+   r0[4] -= r3[4] * m0, r0[5] -= r3[5] * m0,
+   r0[6] -= r3[6] * m0, r0[7] -= r3[7] * m0;
+
+   m1 = r1[2];                 /* now back substitute row 1 */
+   s  = 1.0F/r1[1];
+   r1[4] = s * (r1[4] - r2[4] * m1), r1[5] = s * (r1[5] - r2[5] * m1),
+   r1[6] = s * (r1[6] - r2[6] * m1), r1[7] = s * (r1[7] - r2[7] * m1);
+   m0 = r0[2];
+   r0[4] -= r2[4] * m0, r0[5] -= r2[5] * m0,
+   r0[6] -= r2[6] * m0, r0[7] -= r2[7] * m0;
+
+   m0 = r0[1];                 /* now back substitute row 0 */
+   s  = 1.0F/r0[0];
+   r0[4] = s * (r0[4] - r1[4] * m0), r0[5] = s * (r0[5] - r1[5] * m0),
+   r0[6] = s * (r0[6] - r1[6] * m0), r0[7] = s * (r0[7] - r1[7] * m0);
+   /* columns 4-7 of the work rows now hold the inverse; copy them out */
+   MAT(out,0,0) = r0[4]; MAT(out,0,1) = r0[5],
+   MAT(out,0,2) = r0[6]; MAT(out,0,3) = r0[7],
+   MAT(out,1,0) = r1[4]; MAT(out,1,1) = r1[5],
+   MAT(out,1,2) = r1[6]; MAT(out,1,3) = r1[7],
+   MAT(out,2,0) = r2[4]; MAT(out,2,1) = r2[5],
+   MAT(out,2,2) = r2[6]; MAT(out,2,3) = r2[7],
+   MAT(out,3,0) = r3[4]; MAT(out,3,1) = r3[5],
+   MAT(out,3,2) = r3[6]; MAT(out,3,3) = r3[7];
+
+   return GL_TRUE;
+}
+#undef SWAP_ROWS
+
+/**
+ * Compute inverse of a general 3d transformation matrix.
+ * 
+ * \param mat pointer to a GLmatrix structure. The matrix inverse will be
+ * stored in the GLmatrix::inv attribute.
+ * 
+ * \return GL_TRUE for success, GL_FALSE for failure (\p singular matrix).
+ *
+ * \author Adapted from graphics gems II.
+ *
+ * Calculates the inverse of the upper-left 3x3 submatrix by first computing
+ * its determinant and then scaling the adjugate (transposed cofactor) matrix
+ * by the reciprocal of it. Finally deals with the translation part by
+ * transforming the negated translation vector by that submatrix inverse.
+ */
+static GLboolean invert_matrix_3d_general( GLmatrix *mat )
+{
+   const GLfloat *in = mat->m;
+   GLfloat *out = mat->inv;
+   GLfloat pos, neg, t;
+   GLfloat det;
+
+   /* Calculate the determinant of upper left 3x3 submatrix and
+    * determine if the matrix is singular.  Positive and negative terms
+    * are accumulated separately and only added together at the end.
+    */
+   pos = neg = 0.0;
+   t =  MAT(in,0,0) * MAT(in,1,1) * MAT(in,2,2);
+   if (t >= 0.0) pos += t; else neg += t;
+
+   t =  MAT(in,1,0) * MAT(in,2,1) * MAT(in,0,2);
+   if (t >= 0.0) pos += t; else neg += t;
+
+   t =  MAT(in,2,0) * MAT(in,0,1) * MAT(in,1,2);
+   if (t >= 0.0) pos += t; else neg += t;
+
+   t = -MAT(in,2,0) * MAT(in,1,1) * MAT(in,0,2);
+   if (t >= 0.0) pos += t; else neg += t;
+
+   t = -MAT(in,1,0) * MAT(in,0,1) * MAT(in,2,2);
+   if (t >= 0.0) pos += t; else neg += t;
+
+   t = -MAT(in,0,0) * MAT(in,2,1) * MAT(in,1,2);
+   if (t >= 0.0) pos += t; else neg += t;
+
+   det = pos + neg;
+   /* Test |det| itself: squaring it first rejected small but valid scales. */
+   if (FABSF(det) < 1e-25)
+      return GL_FALSE;
+
+   det = 1.0F / det;   /* from here on det holds the reciprocal determinant */
+   MAT(out,0,0) = (  (MAT(in,1,1)*MAT(in,2,2) - MAT(in,2,1)*MAT(in,1,2) )*det);
+   MAT(out,0,1) = (- (MAT(in,0,1)*MAT(in,2,2) - MAT(in,2,1)*MAT(in,0,2) )*det);
+   MAT(out,0,2) = (  (MAT(in,0,1)*MAT(in,1,2) - MAT(in,1,1)*MAT(in,0,2) )*det);
+   MAT(out,1,0) = (- (MAT(in,1,0)*MAT(in,2,2) - MAT(in,2,0)*MAT(in,1,2) )*det);
+   MAT(out,1,1) = (  (MAT(in,0,0)*MAT(in,2,2) - MAT(in,2,0)*MAT(in,0,2) )*det);
+   MAT(out,1,2) = (- (MAT(in,0,0)*MAT(in,1,2) - MAT(in,1,0)*MAT(in,0,2) )*det);
+   MAT(out,2,0) = (  (MAT(in,1,0)*MAT(in,2,1) - MAT(in,2,0)*MAT(in,1,1) )*det);
+   MAT(out,2,1) = (- (MAT(in,0,0)*MAT(in,2,1) - MAT(in,2,0)*MAT(in,0,1) )*det);
+   MAT(out,2,2) = (  (MAT(in,0,0)*MAT(in,1,1) - MAT(in,1,0)*MAT(in,0,1) )*det);
+
+   /* Do the translation part; row 3 of out is not written by this function */
+   MAT(out,0,3) = - (MAT(in,0,3) * MAT(out,0,0) +
+		     MAT(in,1,3) * MAT(out,0,1) +
+		     MAT(in,2,3) * MAT(out,0,2) );
+   MAT(out,1,3) = - (MAT(in,0,3) * MAT(out,1,0) +
+		     MAT(in,1,3) * MAT(out,1,1) +
+		     MAT(in,2,3) * MAT(out,1,2) );
+   MAT(out,2,3) = - (MAT(in,0,3) * MAT(out,2,0) +
+		     MAT(in,1,3) * MAT(out,2,1) +
+		     MAT(in,2,3) * MAT(out,2,2) );
+
+   return GL_TRUE;
+}
+
+/**
+ * Compute inverse of a 3d transformation matrix.
+ * 
+ * \param mat pointer to a GLmatrix structure. The matrix inverse will be
+ * stored in the GLmatrix::inv attribute.
+ * 
+ * \return GL_TRUE for success, GL_FALSE for failure (\p singular matrix).
+ *
+ * If the matrix is not an angle preserving matrix then calls
+ * invert_matrix_3d_general for the actual calculation. Otherwise calculates
+ * the inverse matrix analyzing and inverting each of the scaling, rotation and
+ * translation parts.
+ */
+static GLboolean invert_matrix_3d( GLmatrix *mat )
+{
+   const GLfloat *in = mat->m;
+   GLfloat *out = mat->inv;
+   /* Not flagged angle preserving: no shortcut applies, use the cofactor method. */
+   if (!TEST_MAT_FLAGS(mat, MAT_FLAGS_ANGLE_PRESERVING)) {
+      return invert_matrix_3d_general( mat );
+   }
+
+   if (mat->flags & MAT_FLAG_UNIFORM_SCALE) {
+      GLfloat scale = (MAT(in,0,0) * MAT(in,0,0) +
+                       MAT(in,0,1) * MAT(in,0,1) +
+                       MAT(in,0,2) * MAT(in,0,2));
+      /* scale is the squared length of row 0 of the 3x3 part */
+      if (scale == 0.0)
+         return GL_FALSE;
+
+      scale = 1.0F / scale;
+
+      /* Transpose the 3 by 3 upper-left submatrix, dividing by that square. */
+      MAT(out,0,0) = scale * MAT(in,0,0);
+      MAT(out,1,0) = scale * MAT(in,0,1);
+      MAT(out,2,0) = scale * MAT(in,0,2);
+      MAT(out,0,1) = scale * MAT(in,1,0);
+      MAT(out,1,1) = scale * MAT(in,1,1);
+      MAT(out,2,1) = scale * MAT(in,1,2);
+      MAT(out,0,2) = scale * MAT(in,2,0);
+      MAT(out,1,2) = scale * MAT(in,2,1);
+      MAT(out,2,2) = scale * MAT(in,2,2);
+   }
+   else if (mat->flags & MAT_FLAG_ROTATION) {
+      /* Transpose the 3 by 3 upper-left submatrix. */
+      MAT(out,0,0) = MAT(in,0,0);
+      MAT(out,1,0) = MAT(in,0,1);
+      MAT(out,2,0) = MAT(in,0,2);
+      MAT(out,0,1) = MAT(in,1,0);
+      MAT(out,1,1) = MAT(in,1,1);
+      MAT(out,2,1) = MAT(in,1,2);
+      MAT(out,0,2) = MAT(in,2,0);
+      MAT(out,1,2) = MAT(in,2,1);
+      MAT(out,2,2) = MAT(in,2,2);
+   }
+   else {
+      /* pure translation: identity with the negated translation column */
+      MEMCPY( out, Identity, sizeof(Identity) );
+      MAT(out,0,3) = - MAT(in,0,3);
+      MAT(out,1,3) = - MAT(in,1,3);
+      MAT(out,2,3) = - MAT(in,2,3);
+      return GL_TRUE;
+   }
+
+   if (mat->flags & MAT_FLAG_TRANSLATION) {
+      /* Translation part: minus the new 3x3 block applied to the old translation */
+      MAT(out,0,3) = - (MAT(in,0,3) * MAT(out,0,0) +
+			MAT(in,1,3) * MAT(out,0,1) +
+			MAT(in,2,3) * MAT(out,0,2) );
+      MAT(out,1,3) = - (MAT(in,0,3) * MAT(out,1,0) +
+			MAT(in,1,3) * MAT(out,1,1) +
+			MAT(in,2,3) * MAT(out,1,2) );
+      MAT(out,2,3) = - (MAT(in,0,3) * MAT(out,2,0) +
+			MAT(in,1,3) * MAT(out,2,1) +
+			MAT(in,2,3) * MAT(out,2,2) );
+   }
+   else {
+      MAT(out,0,3) = MAT(out,1,3) = MAT(out,2,3) = 0.0;
+   }
+
+   return GL_TRUE;
+}
+
+/**
+ * Compute inverse of an identity transformation matrix.
+ *
+ * \param mat pointer to a GLmatrix structure; GLmatrix::inv receives the
+ * result.
+ *
+ * \return always GL_TRUE, since the identity is its own inverse.
+ */
+static GLboolean invert_matrix_identity( GLmatrix *mat )
+{
+   GLfloat *out = mat->inv;
+
+   MEMCPY( out, Identity, sizeof(Identity) );
+   return GL_TRUE;
+}
+
+/**
+ * Compute inverse of a no-rotation 3d transformation matrix.
+ *
+ * \param mat pointer to a GLmatrix structure. The matrix inverse will be
+ * stored in the GLmatrix::inv attribute.
+ *
+ * \return GL_TRUE for success, GL_FALSE for failure (\p singular matrix).
+ *
+ * Inverts each diagonal scale factor and rescales the negated translation.
+ */
+static GLboolean invert_matrix_3d_no_rot( GLmatrix *mat )
+{
+   const GLfloat *src = mat->m;
+   GLfloat *dst = mat->inv;
+
+   /* A zero scale factor on any axis cannot be inverted. */
+   if (MAT(src,0,0) == 0 || MAT(src,1,1) == 0 || MAT(src,2,2) == 0 )
+      return GL_FALSE;
+
+   MEMCPY( dst, Identity, 16 * sizeof(GLfloat) );
+   MAT(dst,0,0) = 1.0F / MAT(src,0,0);
+   MAT(dst,1,1) = 1.0F / MAT(src,1,1);
+   MAT(dst,2,2) = 1.0F / MAT(src,2,2);
+
+   if (!(mat->flags & MAT_FLAG_TRANSLATION))
+      return GL_TRUE;
+   MAT(dst,0,3) = - (MAT(src,0,3) * MAT(dst,0,0));
+   MAT(dst,1,3) = - (MAT(src,1,3) * MAT(dst,1,1));
+   MAT(dst,2,3) = - (MAT(src,2,3) * MAT(dst,2,2));
+   return GL_TRUE;
+}
+
+/**
+ * Compute inverse of a no-rotation 2d transformation matrix.
+ * 
+ * \param mat pointer to a GLmatrix structure. The matrix inverse will be
+ * stored in the GLmatrix::inv attribute.
+ * 
+ * \return GL_TRUE for success, GL_FALSE for failure (\p singular matrix).
+ *
+ * Calculates the inverse matrix by applying the inverse scaling and
+ * translation to the identity matrix.
+ */
+static GLboolean invert_matrix_2d_no_rot( GLmatrix *mat )
+{
+   const GLfloat *src = mat->m;
+   GLfloat *dst = mat->inv;
+
+   /* A zero x or y scale factor cannot be inverted. */
+   if (MAT(src,0,0) == 0 || MAT(src,1,1) == 0)
+      return GL_FALSE;
+
+   MEMCPY( dst, Identity, 16 * sizeof(GLfloat) );
+   MAT(dst,0,0) = 1.0F / MAT(src,0,0);
+   MAT(dst,1,1) = 1.0F / MAT(src,1,1);
+
+   if (!(mat->flags & MAT_FLAG_TRANSLATION))
+      return GL_TRUE;
+   MAT(dst,0,3) = - (MAT(src,0,3) * MAT(dst,0,0));
+   MAT(dst,1,3) = - (MAT(src,1,3) * MAT(dst,1,1));
+   return GL_TRUE;
+}
+
+#if 0
+/* broken */
+static GLboolean invert_matrix_perspective( GLmatrix *mat )
+{
+   const GLfloat *in = mat->m;
+   GLfloat *out = mat->inv;
+
+   if (MAT(in,2,3) == 0)
+      return GL_FALSE;
+
+   MEMCPY( out, Identity, 16 * sizeof(GLfloat) );
+
+   MAT(out,0,0) = 1.0F / MAT(in,0,0);
+   MAT(out,1,1) = 1.0F / MAT(in,1,1);
+
+   MAT(out,0,3) = MAT(in,0,2);
+   MAT(out,1,3) = MAT(in,1,2);
+
+   MAT(out,2,2) = 0;
+   MAT(out,2,3) = -1;
+
+   MAT(out,3,2) = 1.0F / MAT(in,2,3);
+   MAT(out,3,3) = MAT(in,2,2) * MAT(out,3,2);
+
+   return GL_TRUE;
+}
+#endif
+
+/**
+ * Matrix inversion function pointer type.
+ */
+typedef GLboolean (*inv_mat_func)( GLmatrix *mat );
+
+/**
+ * Table of the matrix inversion functions according to the matrix type.
+ */
+static inv_mat_func inv_mat_tab[7] = {   /* indexed by GLmatrix::type in matrix_invert() */
+   invert_matrix_general,
+   invert_matrix_identity,
+   invert_matrix_3d_no_rot,
+#if 0
+   /* Don't use this function for now - it fails when the projection matrix
+    * is premultiplied by a translation (ala Chromium's tilesort SPU).
+    */
+   invert_matrix_perspective,
+#else
+   invert_matrix_general,	/* stands in for the disabled perspective routine */
+#endif
+   invert_matrix_3d,		/* lazy! (presumably the 2D slot reusing the 3D routine - confirm) */
+   invert_matrix_2d_no_rot,
+   invert_matrix_3d
+};
+
+/**
+ * Compute inverse of a transformation matrix.
+ * 
+ * \param mat pointer to a GLmatrix structure. The matrix inverse will be
+ * stored in the GLmatrix::inv attribute.
+ * 
+ * \return GL_TRUE for success, GL_FALSE for failure (\p singular matrix).
+ *
+ * Calls the matrix inversion function in inv_mat_tab corresponding to the
+ * given matrix type.  In case of failure, updates the MAT_FLAG_SINGULAR flag,
+ * and copies the identity matrix into GLmatrix::inv.
+ */
+static GLboolean matrix_invert( GLmatrix *mat )
+{
+   if (!inv_mat_tab[mat->type](mat)) {
+      /* Singular: record it and fall back to an identity inverse. */
+      mat->flags |= MAT_FLAG_SINGULAR;
+      MEMCPY( mat->inv, Identity, sizeof(Identity) );
+      return GL_FALSE;
+   }
+   mat->flags &= ~MAT_FLAG_SINGULAR;
+   return GL_TRUE;
+}
+
+/*@}*/
+
+
+/**********************************************************************/
+/** \name Matrix generation */
+/*@{*/
+
+/**
+ * Generate a 4x4 transformation matrix from glRotate parameters, and
+ * post-multiply the input matrix by it.
+ *
+ * \author
+ * This function was contributed by Erich Boleyn (erich@uruk.org).
+ * Optimizations contributed by Rudolf Opalla (rudi@khm.de).
+ */
+void
+_math_matrix_rotate( GLmatrix *mat,
+		     GLfloat angle, GLfloat x, GLfloat y, GLfloat z )
+{
+   GLfloat xx, yy, zz, xy, yz, zx, xs, ys, zs, one_c, s, c;
+   GLfloat m[16];
+   GLboolean optimized;   /* set when the axis lies exactly on x, y or z */
+
+   s = (GLfloat) _mesa_sin( angle * DEG2RAD );
+   c = (GLfloat) _mesa_cos( angle * DEG2RAD );
+
+   MEMCPY(m, Identity, sizeof(GLfloat)*16);
+   optimized = GL_FALSE;
+
+#define M(row,col)  m[col*4+row]
+   /* Fast paths: a rotation about a coordinate axis only touches a 2x2 block. */
+   if (x == 0.0F) {
+      if (y == 0.0F) {
+         if (z != 0.0F) {
+            optimized = GL_TRUE;
+            /* rotate only around z-axis */
+            M(0,0) = c;
+            M(1,1) = c;
+            if (z < 0.0F) {
+               M(0,1) = s;
+               M(1,0) = -s;
+            }
+            else {
+               M(0,1) = -s;
+               M(1,0) = s;
+            }
+         }
+      }
+      else if (z == 0.0F) {
+         optimized = GL_TRUE;
+         /* rotate only around y-axis */
+         M(0,0) = c;
+         M(2,2) = c;
+         if (y < 0.0F) {
+            M(0,2) = -s;
+            M(2,0) = s;
+         }
+         else {
+            M(0,2) = s;
+            M(2,0) = -s;
+         }
+      }
+   }
+   else if (y == 0.0F) {
+      if (z == 0.0F) {
+         optimized = GL_TRUE;
+         /* rotate only around x-axis */
+         M(1,1) = c;
+         M(2,2) = c;
+         if (x < 0.0F) {
+            M(1,2) = s;
+            M(2,1) = -s;
+         }
+         else {
+            M(1,2) = -s;
+            M(2,1) = s;
+         }
+      }
+   }
+
+   if (!optimized) {   /* general axis, or an all-zero axis (rejected below) */
+      const GLfloat mag = SQRTF(x * x + y * y + z * z);
+
+      if (mag <= 1.0e-4) {
+         /* degenerate (near-zero) axis: no rotation, leave mat as-is */
+         return;
+      }
+
+      x /= mag;
+      y /= mag;
+      z /= mag;
+
+
+      /*
+       *     Arbitrary axis rotation matrix.
+       *
+       *  This is composed of 5 matrices, Rz, Ry, T, Ry', Rz', multiplied
+       *  like so:  Rz * Ry * T * Ry' * Rz'.  T is the final rotation
+       *  (which is about the X-axis), and the two composite transforms
+       *  Ry' * Rz' and Rz * Ry are (respectively) the rotations necessary
+       *  from the arbitrary axis to the X-axis then back.  They are
+       *  all elementary rotations.
+       *
+       *  Rz' is a rotation about the Z-axis, to bring the axis vector
+       *  into the x-z plane.  Then Ry' is applied, rotating about the
+       *  Y-axis to bring the axis vector parallel with the X-axis.  The
+       *  rotation about the X-axis is then performed.  Ry and Rz are
+       *  simply the respective inverse transforms to bring the arbitrary
+       *  axis back to its original orientation.  The first transforms
+       *  Rz' and Ry' are considered inverses, since the data from the
+       *  arbitrary axis gives you info on how to get to it, not how
+       *  to get away from it, and an inverse must be applied.
+       *
+       *  The basic calculation used is to recognize that the arbitrary
+       *  axis vector (x, y, z), since it is of unit length, actually
+       *  represents the sines and cosines of the angles to rotate the
+       *  X-axis to the same orientation, with theta being the angle about
+       *  Z and phi the angle about Y (in the order described above)
+       *  as follows:
+       *
+       *  cos ( theta ) = x / sqrt ( 1 - z^2 )
+       *  sin ( theta ) = y / sqrt ( 1 - z^2 )
+       *
+       *  cos ( phi ) = sqrt ( 1 - z^2 )
+       *  sin ( phi ) = z
+       *
+       *  Note that cos ( phi ) can further be inserted to the above
+       *  formulas:
+       *
+       *  cos ( theta ) = x / cos ( phi )
+       *  sin ( theta ) = y / cos ( phi )
+       *
+       *  ...etc.  Because of those relations and the standard trigonometric
+       *  relations, it is possible to reduce the transforms down to what
+       *  is used below.  It may be that any primary axis chosen will give the
+       *  same results (modulo a sign convention) using this method.
+       *
+       *  Particularly nice is to notice that all divisions that might
+       *  have caused trouble when parallel to certain planes or
+       *  axis go away with care paid to reducing the expressions.
+       *  After checking, it does perform correctly under all cases, since
+       *  in all the cases of division where the denominator would have
+       *  been zero, the numerator would have been zero as well, giving
+       *  the expected result.
+       */
+
+      xx = x * x;
+      yy = y * y;
+      zz = z * z;
+      xy = x * y;
+      yz = y * z;
+      zx = z * x;
+      xs = x * s;
+      ys = y * s;
+      zs = z * s;
+      one_c = 1.0F - c;
+
+      /* We already hold the identity-matrix so we can skip some statements */
+      M(0,0) = (one_c * xx) + c;
+      M(0,1) = (one_c * xy) - zs;
+      M(0,2) = (one_c * zx) + ys;
+/*    M(0,3) = 0.0F; */
+
+      M(1,0) = (one_c * xy) + zs;
+      M(1,1) = (one_c * yy) + c;
+      M(1,2) = (one_c * yz) - xs;
+/*    M(1,3) = 0.0F; */
+
+      M(2,0) = (one_c * zx) - ys;
+      M(2,1) = (one_c * yz) + xs;
+      M(2,2) = (one_c * zz) + c;
+/*    M(2,3) = 0.0F; */
+
+/*
+      M(3,0) = 0.0F;
+      M(3,1) = 0.0F;
+      M(3,2) = 0.0F;
+      M(3,3) = 1.0F;
+*/
+   }
+#undef M
+
+   matrix_multf( mat, m, MAT_FLAG_ROTATION );
+}
+
+/**
+ * Apply a perspective projection matrix.
+ *
+ * \param mat matrix to apply the projection.
+ * \param left left clipping plane coordinate.
+ * \param right right clipping plane coordinate.
+ * \param bottom bottom clipping plane coordinate.
+ * \param top top clipping plane coordinate.
+ * \param nearval distance to the near clipping plane.
+ * \param farval distance to the far clipping plane.
+ *
+ * Creates the projection matrix and multiplies it with \p mat, marking the
+ * MAT_FLAG_PERSPECTIVE flag.
+ */
+void
+_math_matrix_frustum( GLmatrix *mat,
+		      GLfloat left, GLfloat right,
+		      GLfloat bottom, GLfloat top,
+		      GLfloat nearval, GLfloat farval )
+{
+   /* Coefficients of the perspective projection matrix. */
+   const GLfloat x = (2.0F*nearval) / (right-left);
+   const GLfloat y = (2.0F*nearval) / (top-bottom);
+   const GLfloat a = (right+left) / (right-left);
+   const GLfloat b = (top+bottom) / (top-bottom);
+   const GLfloat c = -(farval+nearval) / ( farval-nearval);
+   const GLfloat d = -(2.0F*farval*nearval) / (farval-nearval);
+   GLfloat m[16];
+
+   /* Fill m in storage order: element (row, col) lives at m[col*4+row],
+    * so each line below is one column.  Row 3 is (0, 0, -1, 0).
+    */
+   m[0]  = x;     m[1]  = 0.0F;  m[2]  = 0.0F;  m[3]  = 0.0F;
+   m[4]  = 0.0F;  m[5]  = y;     m[6]  = 0.0F;  m[7]  = 0.0F;
+   m[8]  = a;     m[9]  = b;     m[10] = c;     m[11] = -1.0F;
+   m[12] = 0.0F;  m[13] = 0.0F;  m[14] = d;     m[15] = 0.0F;
+
+   matrix_multf( mat, m, MAT_FLAG_PERSPECTIVE );
+}
+
+/**
+ * Apply an orthographic projection matrix.
+ *
+ * \param mat matrix to apply the projection.
+ * \param left left clipping plane coordinate.
+ * \param right right clipping plane coordinate.
+ * \param bottom bottom clipping plane coordinate.
+ * \param top top clipping plane coordinate.
+ * \param nearval distance to the near clipping plane.
+ * \param farval distance to the far clipping plane.
+ *
+ * Creates the projection matrix and multiplies it with \p mat, marking the
+ * MAT_FLAG_GENERAL_SCALE and MAT_FLAG_TRANSLATION flags.
+ */
+void
+_math_matrix_ortho( GLmatrix *mat,
+		    GLfloat left, GLfloat right,
+		    GLfloat bottom, GLfloat top,
+		    GLfloat nearval, GLfloat farval )
+{
+   GLfloat m[16];
+   /* Column 0: x scale.  Element (row, col) lives at m[col*4+row]. */
+   m[0] = 2.0F / (right-left);
+   m[1] = 0.0F;
+   m[2] = 0.0F;
+   m[3] = 0.0F;
+
+   /* Column 1: y scale. */
+   m[4] = 0.0F;
+   m[5] = 2.0F / (top-bottom);
+   m[6] = 0.0F;
+   m[7] = 0.0F;
+
+   /* Column 2: z scale. */
+   m[8]  = 0.0F;
+   m[9]  = 0.0F;
+   m[10] = -2.0F / (farval-nearval);
+   m[11] = 0.0F;
+
+   /* Column 3: translation. */
+   m[12] = -(right+left) / (right-left);
+   m[13] = -(top+bottom) / (top-bottom);
+   m[14] = -(farval+nearval) / (farval-nearval);
+   m[15] = 1.0F;
+   matrix_multf( mat, m, (MAT_FLAG_GENERAL_SCALE|MAT_FLAG_TRANSLATION));
+}
+
+/**
+ * Multiply a matrix with a general scaling matrix.
+ *
+ * \param mat matrix.
+ * \param x x axis scale factor.
+ * \param y y axis scale factor.
+ * \param z z axis scale factor.
+ *
+ * Multiplies in-place the elements of \p mat by the scale factors. Checks if
+ * the scales factors are roughly the same, marking the MAT_FLAG_UNIFORM_SCALE
+ * flag, or MAT_FLAG_GENERAL_SCALE. Marks the MAT_DIRTY_TYPE and
+ * MAT_DIRTY_INVERSE dirty flags.
+ */
+void
+_math_matrix_scale( GLmatrix *mat, GLfloat x, GLfloat y, GLfloat z )
+{
+   GLfloat *m = mat->m;
+   const GLboolean uniform = FABSF(x - y) < 1e-8 && FABSF(x - z) < 1e-8;
+   int row;
+
+   /* Scale columns 0..2 in place; column 3 (m[12..15]) is left alone. */
+   for (row = 0; row < 4; row++) {
+      m[row]     *= x;
+      m[row + 4] *= y;
+      m[row + 8] *= z;
+   }
+
+   mat->flags |= uniform ? MAT_FLAG_UNIFORM_SCALE : MAT_FLAG_GENERAL_SCALE;
+   mat->flags |= (MAT_DIRTY_TYPE | MAT_DIRTY_INVERSE);
+}
+
+/**
+ * Multiply a matrix with a translation matrix.
+ *
+ * \param mat matrix.
+ * \param x translation vector x coordinate.
+ * \param y translation vector y coordinate.
+ * \param z translation vector z coordinate.
+ *
+ * Adds the translation coordinates to the elements of \p mat in-place.  Marks
+ * the MAT_FLAG_TRANSLATION flag, and the MAT_DIRTY_TYPE and MAT_DIRTY_INVERSE
+ * dirty flags.
+ */
+void
+_math_matrix_translate( GLmatrix *mat, GLfloat x, GLfloat y, GLfloat z )
+{
+   GLfloat *m = mat->m;
+   int row;
+
+   /* Last column += first three columns weighted by x, y and z. */
+   for (row = 0; row < 4; row++)
+      m[12 + row] = m[row] * x + m[4 + row] * y + m[8 + row] * z + m[12 + row];
+
+   mat->flags |= MAT_FLAG_TRANSLATION;
+   mat->flags |= MAT_DIRTY_TYPE | MAT_DIRTY_INVERSE;
+}
+
+
+/**
+ * Set the slots mapping NDC to window x/y and depth; nothing else is written.
+ */
+void
+_math_matrix_viewport(GLmatrix *m, GLint x, GLint y, GLint width, GLint height,
+                      GLfloat zNear, GLfloat zFar, GLfloat depthMax)
+{
+   GLfloat *v = m->m;
+   v[MAT_SX] = (GLfloat) width / 2.0F;
+   v[MAT_SY] = (GLfloat) height / 2.0F;
+   v[MAT_SZ] = depthMax * ((zFar - zNear) / 2.0F);
+   v[MAT_TX] = v[MAT_SX] + x;
+   v[MAT_TY] = v[MAT_SY] + y;
+   v[MAT_TZ] = depthMax * ((zFar - zNear) / 2.0F + zNear);
+   m->flags = MAT_FLAG_GENERAL_SCALE | MAT_FLAG_TRANSLATION;
+   m->type = MATRIX_3D_NO_ROT;
+}
+
+
+/**
+ * Set a matrix to the identity matrix.
+ *
+ * \param mat matrix.
+ *
+ * Copies ::Identity into \p GLmatrix::m, and into GLmatrix::inv if not NULL.
+ * Sets the matrix type to identity, and clear the dirty flags.
+ */
+void
+_math_matrix_set_identity( GLmatrix *mat )
+{
+   GLfloat *inverse = mat->inv;
+
+   MEMCPY( mat->m, Identity, 16*sizeof(GLfloat) );
+   if (inverse != NULL)
+      MEMCPY( inverse, Identity, 16*sizeof(GLfloat) );
+
+   /* Only the dirty bits are cleared; every other flag bit is kept. */
+   mat->type = MATRIX_IDENTITY;
+   mat->flags &= ~(MAT_DIRTY_FLAGS | MAT_DIRTY_TYPE | MAT_DIRTY_INVERSE);
+}
+
+/*@}*/
+
+
+/**********************************************************************/
+/** \name Matrix analysis */
+/*@{*/
+
+#define ZERO(x) (1u<<(x))       /* mask bit: element x is exactly 0 */
+#define ONE(x)  (1u<<((x)+16))  /* mask bit: element x is exactly 1; unsigned keeps ONE(15) defined */
+
+#define MASK_NO_TRX      (ZERO(12) | ZERO(13) | ZERO(14))
+#define MASK_NO_2D_SCALE ( ONE(0)  | ONE(5))
+
+#define MASK_IDENTITY    ( ONE(0)  | ZERO(4)  | ZERO(8)  | ZERO(12) |\
+			  ZERO(1)  |  ONE(5)  | ZERO(9)  | ZERO(13) |\
+			  ZERO(2)  | ZERO(6)  |  ONE(10) | ZERO(14) |\
+			  ZERO(3)  | ZERO(7)  | ZERO(11) |  ONE(15) )
+
+#define MASK_2D_NO_ROT   (           ZERO(4)  | ZERO(8)  |           \
+			  ZERO(1)  |            ZERO(9)  |           \
+			  ZERO(2)  | ZERO(6)  |  ONE(10) | ZERO(14) |\
+			  ZERO(3)  | ZERO(7)  | ZERO(11) |  ONE(15) )
+
+#define MASK_2D          (                      ZERO(8)  |           \
+			                        ZERO(9)  |           \
+			  ZERO(2)  | ZERO(6)  |  ONE(10) | ZERO(14) |\
+			  ZERO(3)  | ZERO(7)  | ZERO(11) |  ONE(15) )
+
+
+#define MASK_3D_NO_ROT   (           ZERO(4)  | ZERO(8)  |           \
+			  ZERO(1)  |            ZERO(9)  |           \
+			  ZERO(2)  | ZERO(6)  |                      \
+			  ZERO(3)  | ZERO(7)  | ZERO(11) |  ONE(15) )
+
+#define MASK_3D          (                                           \
+			                                             \
+			                                             \
+			  ZERO(3)  | ZERO(7)  | ZERO(11) |  ONE(15) )
+
+
+#define MASK_PERSPECTIVE (           ZERO(4)  |            ZERO(12) |\
+			  ZERO(1)  |                       ZERO(13) |\
+			  ZERO(2)  | ZERO(6)  |                      \
+			  ZERO(3)  | ZERO(7)  |            ZERO(15) )
+
+#define SQ(x) ((x)*(x))
+
+/**
+ * Determine type and flags from scratch.  
+ *
+ * \param mat matrix.
+ * 
+ * This is expensive enough to only want to do it once.
+ */
+static void analyse_from_scratch( GLmatrix *mat )
+{
+   const GLfloat *m = mat->m;
+   GLuint mask = 0;
+   GLuint i;
+   /* bit i: m[i] == 0;  bit 16+i (diagonal elements only): m[i] == 1 */
+   for (i = 0 ; i < 16 ; i++) {
+      if (m[i] == 0.0) mask |= (1<<i);
+   }
+   /* unsigned constants: 1<<31 would overflow a signed int */
+   if (m[0] == 1.0F) mask |= (1u<<16);
+   if (m[5] == 1.0F) mask |= (1u<<21);
+   if (m[10] == 1.0F) mask |= (1u<<26);
+   if (m[15] == 1.0F) mask |= (1u<<31);
+
+   mat->flags &= ~MAT_FLAGS_GEOMETRY;
+
+   /* Check for translation - no-one really cares
+    */
+   if ((mask & MASK_NO_TRX) != MASK_NO_TRX)
+      mat->flags |= MAT_FLAG_TRANSLATION;
+
+   /* Do the real work: try the patterns from most to least specific
+    */
+   if (mask == (GLuint) MASK_IDENTITY) {
+      mat->type = MATRIX_IDENTITY;
+   }
+   else if ((mask & MASK_2D_NO_ROT) == (GLuint) MASK_2D_NO_ROT) {
+      mat->type = MATRIX_2D_NO_ROT;
+
+      if ((mask & MASK_NO_2D_SCALE) != MASK_NO_2D_SCALE)
+	 mat->flags |= MAT_FLAG_GENERAL_SCALE;
+   }
+   else if ((mask & MASK_2D) == (GLuint) MASK_2D) {
+      GLfloat mm = DOT2(m, m);
+      GLfloat m4m4 = DOT2(m+4,m+4);
+      GLfloat mm4 = DOT2(m,m+4);
+
+      mat->type = MATRIX_2D;
+
+      /* Check for scale */
+      if (SQ(mm-1) > SQ(1e-6) ||
+	  SQ(m4m4-1) > SQ(1e-6))
+	 mat->flags |= MAT_FLAG_GENERAL_SCALE;
+
+      /* Check for rotation */
+      if (SQ(mm4) > SQ(1e-6))
+	 mat->flags |= MAT_FLAG_GENERAL_3D;
+      else
+	 mat->flags |= MAT_FLAG_ROTATION;
+
+   }
+   else if ((mask & MASK_3D_NO_ROT) == (GLuint) MASK_3D_NO_ROT) {
+      mat->type = MATRIX_3D_NO_ROT;
+
+      /* Check for scale */
+      if (SQ(m[0]-m[5]) < SQ(1e-6) &&
+	  SQ(m[0]-m[10]) < SQ(1e-6)) {
+	 if (SQ(m[0]-1.0) > SQ(1e-6)) {
+	    mat->flags |= MAT_FLAG_UNIFORM_SCALE;
+         }
+      }
+      else {
+	 mat->flags |= MAT_FLAG_GENERAL_SCALE;
+      }
+   }
+   else if ((mask & MASK_3D) == (GLuint) MASK_3D) {
+      GLfloat c1 = DOT3(m,m);
+      GLfloat c2 = DOT3(m+4,m+4);
+      GLfloat c3 = DOT3(m+8,m+8);
+      GLfloat d1 = DOT3(m, m+4);
+      GLfloat cp[3];
+
+      mat->type = MATRIX_3D;
+
+      /* Check for scale */
+      if (SQ(c1-c2) < SQ(1e-6) && SQ(c1-c3) < SQ(1e-6)) {
+	 if (SQ(c1-1.0) > SQ(1e-6))
+	    mat->flags |= MAT_FLAG_UNIFORM_SCALE;
+	 /* else no scale at all */
+      }
+      else {
+	 mat->flags |= MAT_FLAG_GENERAL_SCALE;
+      }
+
+      /* Check for rotation */
+      if (SQ(d1) < SQ(1e-6)) {
+	 CROSS3( cp, m, m+4 );
+	 SUB_3V( cp, cp, (m+8) );
+	 if (LEN_SQUARED_3FV(cp) < SQ(1e-6))
+	    mat->flags |= MAT_FLAG_ROTATION;
+	 else
+	    mat->flags |= MAT_FLAG_GENERAL_3D;
+      }
+      else {
+	 mat->flags |= MAT_FLAG_GENERAL_3D; /* shear, etc */
+      }
+   }
+   else if ((mask & MASK_PERSPECTIVE) == MASK_PERSPECTIVE && m[11]==-1.0F) {
+      mat->type = MATRIX_PERSPECTIVE;
+      mat->flags |= MAT_FLAG_GENERAL;
+   }
+   else {
+      mat->type = MATRIX_GENERAL;
+      mat->flags |= MAT_FLAG_GENERAL;
+   }
+}
+
+/**
+ * Analyze a matrix given that its flags are accurate.
+ * 
+ * This is the more common operation, hopefully.
+ */
+static void analyse_from_flags( GLmatrix *mat )
+{
+   const GLfloat *m = mat->m;
+   /* NOTE(review): TEST_MAT_FLAGS is defined elsewhere; presumably true when no geometry flag outside the given set is present - confirm */
+   if (TEST_MAT_FLAGS(mat, 0)) {
+      mat->type = MATRIX_IDENTITY;
+   }
+   else if (TEST_MAT_FLAGS(mat, (MAT_FLAG_TRANSLATION |
+				 MAT_FLAG_UNIFORM_SCALE |
+				 MAT_FLAG_GENERAL_SCALE))) {
+      if ( m[10]==1.0F && m[14]==0.0F ) {   /* z neither scaled nor translated */
+	 mat->type = MATRIX_2D_NO_ROT;
+      }
+      else {
+	 mat->type = MATRIX_3D_NO_ROT;
+      }
+   }
+   else if (TEST_MAT_FLAGS(mat, MAT_FLAGS_3D)) {
+      if (                                 m[ 8]==0.0F
+            &&                             m[ 9]==0.0F
+            && m[2]==0.0F && m[6]==0.0F && m[10]==1.0F && m[14]==0.0F) {
+	 mat->type = MATRIX_2D;   /* z row and z column are those of the identity */
+      }
+      else {
+	 mat->type = MATRIX_3D;
+      }
+   }
+   else if (                 m[4]==0.0F                 && m[12]==0.0F
+            && m[1]==0.0F                               && m[13]==0.0F
+            && m[2]==0.0F && m[6]==0.0F
+            && m[3]==0.0F && m[7]==0.0F && m[11]==-1.0F && m[15]==0.0F) {
+      mat->type = MATRIX_PERSPECTIVE;   /* same zero pattern as _math_matrix_frustum() builds */
+   }
+   else {
+      mat->type = MATRIX_GENERAL;
+   }
+}
+
+/**
+ * Bring a matrix's cached type and inverse up to date.
+ *
+ * \param mat matrix.
+ *
+ * A stale type is recomputed by analyse_from_flags() when the flags can be
+ * trusted and by analyse_from_scratch() when they are dirty too. A stale
+ * inverse is recomputed with matrix_invert(), but only if an inverse array
+ * is allocated. The type and flags dirty bits are always cleared on exit.
+ */
+void
+_math_matrix_analyse( GLmatrix *mat )
+{
+   if ((mat->flags & MAT_DIRTY_TYPE) != 0) {
+      if ((mat->flags & MAT_DIRTY_FLAGS) == 0)
+	 analyse_from_flags( mat );
+      else
+	 analyse_from_scratch( mat );
+   }
+
+   if (mat->inv != NULL && (mat->flags & MAT_DIRTY_INVERSE) != 0) {
+      matrix_invert( mat );
+      mat->flags &= ~MAT_DIRTY_INVERSE;
+   }
+
+   mat->flags &= ~(MAT_DIRTY_FLAGS | MAT_DIRTY_TYPE);
+}
+
+/*@}*/
+
+
+/**
+ * Test if the given matrix preserves vector lengths, judged from flags only.
+ */
+GLboolean
+_math_matrix_is_length_preserving( const GLmatrix *m )
+{
+   return TEST_MAT_FLAGS( m, MAT_FLAGS_LENGTH_PRESERVING); /* presumably reads m->flags; macro is defined out of view */
+}
+
+
+/**
+ * Test if the given matrix does any rotation, judged from its flags:
+ * general, rotation, general-3D or perspective all count.
+ */
+GLboolean
+_math_matrix_has_rotation( const GLmatrix *m )
+{
+   const GLuint rot_bits = MAT_FLAG_GENERAL
+                         | MAT_FLAG_ROTATION
+                         | MAT_FLAG_GENERAL_3D
+                         | MAT_FLAG_PERSPECTIVE;
+
+   /* Any one of the bits is enough. */
+   return (m->flags & rot_bits) ? GL_TRUE : GL_FALSE;
+}
+
+/** Test whether MAT_FLAG_GENERAL_SCALE is set in the matrix flags. */
+GLboolean
+_math_matrix_is_general_scale( const GLmatrix *m )
+{
+   return (m->flags & MAT_FLAG_GENERAL_SCALE) ? GL_TRUE : GL_FALSE;
+}
+
+/** Test whether any bit of MAT_DIRTY is set in the matrix flags. */
+GLboolean
+_math_matrix_is_dirty( const GLmatrix *m )
+{
+   return (m->flags & MAT_DIRTY) ? GL_TRUE : GL_FALSE;
+}
+
+
+/**********************************************************************/
+/** \name Matrix setup */
+/*@{*/
+
+/**
+ * Copy a matrix.
+ *
+ * \param to destination matrix.
+ * \param from source matrix.
+ * Copies m, flags and type.  No inverse array is created here: to->inv is
+ * only filled (copied, or recomputed if \p from has none) when already set.
+ */
+void
+_math_matrix_copy( GLmatrix *to, const GLmatrix *from )
+{
+   MEMCPY( to->m, from->m, sizeof(Identity) ); /* Identity: file-level array, presumably 16 GLfloats */
+   to->flags = from->flags;
+   to->type = from->type;
+
+   if (to->inv != 0) {
+      if (from->inv == 0) {
+	 matrix_invert( to ); /* no source inverse to copy: derive one for "to" */
+      }
+      else {
+	 MEMCPY(to->inv, from->inv, sizeof(GLfloat)*16);
+      }
+   }
+}
+
+/**
+ * Loads a matrix array into GLmatrix.
+ *
+ * \param mat matrix.
+ * \param m matrix array (16 elements are read).
+ *
+ * Copies \p m into GLmatrix::m and sets the flags to exactly
+ * MAT_FLAG_GENERAL | MAT_DIRTY; type and inverse are left untouched.
+ */
+void
+_math_matrix_loadf( GLmatrix *mat, const GLfloat *m )
+{
+   MEMCPY( mat->m, m, 16*sizeof(GLfloat) );
+   mat->flags = (MAT_FLAG_GENERAL | MAT_DIRTY);
+}
+
+/**
+ * Matrix constructor.
+ *
+ * \param m matrix.
+ *
+ * Allocates m->m and sets it to Identity; no inverse is allocated.
+ */
+void
+_math_matrix_ctr( GLmatrix *m )
+{
+   m->m = (GLfloat *) ALIGN_MALLOC( 16 * sizeof(GLfloat), 16 );
+   if (m->m) /* on allocation failure m->m stays NULL and nothing is reported */
+      MEMCPY( m->m, Identity, sizeof(Identity) );
+   m->inv = NULL;
+   m->type = MATRIX_IDENTITY;
+   m->flags = 0;
+}
+
+/**
+ * Matrix destructor.
+ *
+ * \param m matrix.
+ *
+ * Frees m->m and m->inv if set and resets both to NULL (safe to call twice).
+ */
+void
+_math_matrix_dtr( GLmatrix *m )
+{
+   if (m->m) {
+      ALIGN_FREE( m->m );
+      m->m = NULL;
+   }
+   if (m->inv) {
+      ALIGN_FREE( m->inv );
+      m->inv = NULL;
+   }
+}
+
+/**
+ * Allocate a matrix inverse.
+ *
+ * \param m matrix.
+ *
+ * No-op if GLmatrix::inv exists; else allocates it and sets it to Identity.
+ */
+void
+_math_matrix_alloc_inv( GLmatrix *m )
+{
+   if (!m->inv) {
+      m->inv = (GLfloat *) ALIGN_MALLOC( 16 * sizeof(GLfloat), 16 );
+      if (m->inv) /* allocation failure leaves inv NULL, silently */
+         MEMCPY( m->inv, Identity, 16 * sizeof(GLfloat) );
+   }
+}
+
+/*@}*/
+
+
+/**********************************************************************/
+/** \name Matrix transpose */
+/*@{*/
+
+/**
+ * Transpose a GLfloat matrix: to[4*i + j] = from[4*j + i].
+ *
+ * \param to destination array; must not be the same array as \p from.
+ * \param from source array.
+ */
+void
+_math_transposef( GLfloat to[16], const GLfloat from[16] )
+{
+   to[0] = from[0];
+   to[1] = from[4]; /* to[1] is overwritten before from[1] is read below */
+   to[2] = from[8];
+   to[3] = from[12];
+   to[4] = from[1];
+   to[5] = from[5];
+   to[6] = from[9];
+   to[7] = from[13];
+   to[8] = from[2];
+   to[9] = from[6];
+   to[10] = from[10];
+   to[11] = from[14];
+   to[12] = from[3];
+   to[13] = from[7];
+   to[14] = from[11];
+   to[15] = from[15];
+}
+
+/**
+ * Transpose a GLdouble matrix: to[4*i + j] = from[4*j + i].
+ *
+ * \param to destination array; must not be the same array as \p from.
+ * \param from source array.
+ */
+void
+_math_transposed( GLdouble to[16], const GLdouble from[16] )
+{
+   to[0] = from[0];
+   to[1] = from[4]; /* to[1] is overwritten before from[1] is read below */
+   to[2] = from[8];
+   to[3] = from[12];
+   to[4] = from[1];
+   to[5] = from[5];
+   to[6] = from[9];
+   to[7] = from[13];
+   to[8] = from[2];
+   to[9] = from[6];
+   to[10] = from[10];
+   to[11] = from[14];
+   to[12] = from[3];
+   to[13] = from[7];
+   to[14] = from[11];
+   to[15] = from[15];
+}
+
+/**
+ * Transpose a GLdouble matrix and narrow each element to GLfloat.
+ *
+ * \param to destination array: to[4*i + j] = (GLfloat) from[4*j + i].
+ * \param from source array.
+ */
+void
+_math_transposefd( GLfloat to[16], const GLdouble from[16] )
+{
+   to[0] = (GLfloat) from[0];
+   to[1] = (GLfloat) from[4];
+   to[2] = (GLfloat) from[8];
+   to[3] = (GLfloat) from[12];
+   to[4] = (GLfloat) from[1];
+   to[5] = (GLfloat) from[5];
+   to[6] = (GLfloat) from[9];
+   to[7] = (GLfloat) from[13];
+   to[8] = (GLfloat) from[2];
+   to[9] = (GLfloat) from[6];
+   to[10] = (GLfloat) from[10];
+   to[11] = (GLfloat) from[14];
+   to[12] = (GLfloat) from[3];
+   to[13] = (GLfloat) from[7];
+   to[14] = (GLfloat) from[11];
+   to[15] = (GLfloat) from[15];
+}
+
+/*@}*/
+
+
+/**
+ * Transform a 4-element row vector (1x4 matrix) by a 4x4 matrix.  This
+ * function is used for transforming clipping plane equations and spotlight
+ * directions.
+ * Mathematically,  u = v * m.
+ * Input:  v - input vector (read in full before u is written, so u may be v)
+ *         m - transformation matrix, element (row,col) stored at m[row + col*4]
+ * Output:  u - transformed vector
+ */
+void
+_mesa_transform_vector( GLfloat u[4], const GLfloat v[4], const GLfloat m[16] )
+{
+   const GLfloat v0 = v[0], v1 = v[1], v2 = v[2], v3 = v[3];
+#define M(row,col)  m[row + col*4]
+   u[0] = v0 * M(0,0) + v1 * M(1,0) + v2 * M(2,0) + v3 * M(3,0);
+   u[1] = v0 * M(0,1) + v1 * M(1,1) + v2 * M(2,1) + v3 * M(3,1);
+   u[2] = v0 * M(0,2) + v1 * M(1,2) + v2 * M(2,2) + v3 * M(3,2);
+   u[3] = v0 * M(0,3) + v1 * M(1,3) + v2 * M(2,3) + v3 * M(3,3);
+#undef M
+}
diff --git a/mesalib/src/mesa/math/m_matrix.h b/mesalib/src/mesa/math/m_matrix.h
new file mode 100644
index 000000000..3bc5de6cd
--- /dev/null
+++ b/mesalib/src/mesa/math/m_matrix.h
@@ -0,0 +1,210 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  6.3
+ *
+ * Copyright (C) 1999-2005  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+/**
+ * \file math/m_matrix.h
+ * Defines basic structures for matrix-handling.
+ */
+
+#ifndef _M_MATRIX_H
+#define _M_MATRIX_H
+
+
+
+/**
+ * \name Symbolic names to some of the entries in the matrix
+ *
+ * These are handy for the viewport mapping, which is expressed as a matrix.
+ */
+/*@{*/
+#define MAT_SX 0
+#define MAT_SY 5
+#define MAT_SZ 10
+#define MAT_TX 12
+#define MAT_TY 13
+#define MAT_TZ 14
+/*@}*/
+
+
+/**
+ * Different kinds of 4x4 transformation matrices.
+ * We use these to select specific optimized vertex transformation routines.
+ */
+enum GLmatrixtype {
+   MATRIX_GENERAL,	/**< general 4x4 matrix (the fallback classification) */
+   MATRIX_IDENTITY,	/**< identity matrix */
+   MATRIX_3D_NO_ROT,	/**< 3-D scale & translate only, e.g. orthogonal projection */
+   MATRIX_PERSPECTIVE,	/**< perspective projection matrix, last row (0, 0, -1, 0) */
+   MATRIX_2D,		/**< 2-D transformation */
+   MATRIX_2D_NO_ROT,	/**< 2-D scale & translate only */
+   MATRIX_3D		/**< 3-D transformation */
+} ;
+
+/**
+ * Matrix type to represent 4x4 transformation matrices.
+ */
+typedef struct {
+   GLfloat *m;		/**< 16 matrix elements (16-byte aligned) */
+   GLfloat *inv;	/**< optional 16-element inverse (16-byte aligned) */
+   GLuint flags;        /**< bitwise OR of \link MatFlags MAT_FLAG_*
+                         * flags\endlink and MAT_DIRTY_* bits
+                         */
+   enum GLmatrixtype type; /**< classification cached by _math_matrix_analyse() */
+} GLmatrix;
+
+
+
+
+extern void
+_math_matrix_ctr( GLmatrix *m );
+
+extern void
+_math_matrix_dtr( GLmatrix *m );
+
+extern void
+_math_matrix_alloc_inv( GLmatrix *m );
+
+extern void
+_math_matrix_mul_matrix( GLmatrix *dest, const GLmatrix *a, const GLmatrix *b );
+
+extern void
+_math_matrix_mul_floats( GLmatrix *dest, const GLfloat *b );
+
+extern void
+_math_matrix_loadf( GLmatrix *mat, const GLfloat *m );
+
+extern void
+_math_matrix_translate( GLmatrix *mat, GLfloat x, GLfloat y, GLfloat z );
+
+extern void
+_math_matrix_rotate( GLmatrix *m, GLfloat angle,
+		     GLfloat x, GLfloat y, GLfloat z );
+
+extern void
+_math_matrix_scale( GLmatrix *mat, GLfloat x, GLfloat y, GLfloat z );
+
+extern void
+_math_matrix_ortho( GLmatrix *mat,
+		    GLfloat left, GLfloat right,
+		    GLfloat bottom, GLfloat top,
+		    GLfloat nearval, GLfloat farval );
+
+extern void
+_math_matrix_frustum( GLmatrix *mat,
+		      GLfloat left, GLfloat right,
+		      GLfloat bottom, GLfloat top,
+		      GLfloat nearval, GLfloat farval );
+
+extern void
+_math_matrix_viewport(GLmatrix *m, GLint x, GLint y, GLint width, GLint height,
+                      GLfloat zNear, GLfloat zFar, GLfloat depthMax);
+
+extern void
+_math_matrix_set_identity( GLmatrix *dest );
+
+extern void
+_math_matrix_copy( GLmatrix *to, const GLmatrix *from );
+
+extern void
+_math_matrix_analyse( GLmatrix *mat );
+
+extern void
+_math_matrix_print( const GLmatrix *m );
+
+extern GLboolean
+_math_matrix_is_length_preserving( const GLmatrix *m );
+
+extern GLboolean
+_math_matrix_has_rotation( const GLmatrix *m );
+
+extern GLboolean
+_math_matrix_is_general_scale( const GLmatrix *m );
+
+extern GLboolean
+_math_matrix_is_dirty( const GLmatrix *m );
+
+
+/**
+ * \name Related functions that don't actually operate on GLmatrix structs
+ */
+/*@{*/
+
+extern void
+_math_transposef( GLfloat to[16], const GLfloat from[16] );
+
+extern void
+_math_transposed( GLdouble to[16], const GLdouble from[16] );
+
+extern void
+_math_transposefd( GLfloat to[16], const GLdouble from[16] );
+
+
+/* Transform a point (column vector) by a matrix:   Q = M * P
+ * Expands to four bare statements: brace it under if/else; Q must not alias P.
+ */
+#define TRANSFORM_POINT( Q, M, P )					\
+   Q[0] = M[0] * P[0] + M[4] * P[1] + M[8] *  P[2] + M[12] * P[3];	\
+   Q[1] = M[1] * P[0] + M[5] * P[1] + M[9] *  P[2] + M[13] * P[3];	\
+   Q[2] = M[2] * P[0] + M[6] * P[1] + M[10] * P[2] + M[14] * P[3];	\
+   Q[3] = M[3] * P[0] + M[7] * P[1] + M[11] * P[2] + M[15] * P[3];
+
+/* Q = M * (P[0], P[1], P[2], 1): four bare statements, not do/while-wrapped. */
+#define TRANSFORM_POINT3( Q, M, P )				\
+   Q[0] = M[0] * P[0] + M[4] * P[1] + M[8] *  P[2] + M[12];	\
+   Q[1] = M[1] * P[0] + M[5] * P[1] + M[9] *  P[2] + M[13];	\
+   Q[2] = M[2] * P[0] + M[6] * P[1] + M[10] * P[2] + M[14];	\
+   Q[3] = M[3] * P[0] + M[7] * P[1] + M[11] * P[2] + M[15];
+
+
+/* Transform a normal (row vector) by a matrix:  [NX NY NZ] = N * MAT
+ * (upper-left 3x3 of MAT only).  TO must not alias N.
+ */
+#define TRANSFORM_NORMAL( TO, N, MAT )				\
+do {								\
+   TO[0] = N[0] * MAT[0] + N[1] * MAT[1] + N[2] * MAT[2];	\
+   TO[1] = N[0] * MAT[4] + N[1] * MAT[5] + N[2] * MAT[6];	\
+   TO[2] = N[0] * MAT[8] + N[1] * MAT[9] + N[2] * MAT[10];	\
+} while (0)
+
+
+/**
+ * Transform a direction (column vector) by the upper-left 3x3 of MAT.
+ */
+#define TRANSFORM_DIRECTION( TO, DIR, MAT )			\
+do {								\
+   TO[0] = DIR[0] * MAT[0] + DIR[1] * MAT[4] + DIR[2] * MAT[8];	\
+   TO[1] = DIR[0] * MAT[1] + DIR[1] * MAT[5] + DIR[2] * MAT[9];	\
+   TO[2] = DIR[0] * MAT[2] + DIR[1] * MAT[6] + DIR[2] * MAT[10];\
+} while (0)
+
+
+extern void
+_mesa_transform_vector(GLfloat u[4], const GLfloat v[4], const GLfloat m[16]);
+
+
+/*@}*/
+
+
+#endif
diff --git a/mesalib/src/mesa/math/m_norm_tmp.h b/mesalib/src/mesa/math/m_norm_tmp.h
new file mode 100644
index 000000000..a20cb0501
--- /dev/null
+++ b/mesalib/src/mesa/math/m_norm_tmp.h
@@ -0,0 +1,390 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  5.1
+ *
+ * Copyright (C) 1999-2003  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * New (3.1) transformation code written by Keith Whitwell.
+ */
+
+/* Functions to transform a vector of normals.  This includes applying
+ * the transformation matrix, rescaling and normalization.
+ */
+
+/*
+ * mat - the 4x4 transformation matrix; only mat->inv is read
+ * scale - uniform scale factor of the transformation matrix (not always used)
+ * in - the source vector of normals
+ * lengths - per-normal factor the result is multiplied by (may be NULL);
+ *           presumably 1/length of each incoming normal - confirm with callers
+ * dest - the destination vector of normals; dest->count is set to in->count
+ */
+static void _XFORMAPI
+TAG(transform_normalize_normals)( const GLmatrix *mat,
+                                  GLfloat scale,
+                                  const GLvector4f *in,
+                                  const GLfloat *lengths,
+                                  GLvector4f *dest )
+{
+   GLfloat (*out)[4] = (GLfloat (*)[4])dest->start;
+   const GLfloat *from = in->start;
+   const GLuint stride = in->stride;
+   const GLuint count = in->count;
+   const GLfloat *m = mat->inv;
+   GLfloat m0 = m[0],  m4 = m[4],  m8 = m[8];
+   GLfloat m1 = m[1],  m5 = m[5],  m9 = m[9];
+   GLfloat m2 = m[2],  m6 = m[6],  m10 = m[10];
+   GLuint i;
+
+   if (!lengths) { /* no precomputed factors: normalize explicitly; scale is unused */
+      STRIDE_LOOP {
+	 GLfloat tx, ty, tz;
+	 {
+	    const GLfloat ux = from[0],  uy = from[1],  uz = from[2];
+	    tx = ux * m0 + uy * m1 + uz * m2;
+	    ty = ux * m4 + uy * m5 + uz * m6;
+	    tz = ux * m8 + uy * m9 + uz * m10;
+	 }
+	 {
+	    GLdouble len = tx*tx + ty*ty + tz*tz;
+	    if (len > 1e-20) {
+	       GLfloat invlen = INV_SQRTF(len); /* named so the scale parameter is not shadowed */
+	       out[i][0] = tx * invlen;
+	       out[i][1] = ty * invlen;
+	       out[i][2] = tz * invlen;
+	    }
+	    else { /* too short to normalize: emit a zero normal */
+	       out[i][0] = out[i][1] = out[i][2] = 0;
+	    }
+	 }
+      }
+   }
+   else {
+      if (scale != 1.0) { /* fold the uniform scale into the coefficients once */
+	 m0 *= scale,  m4 *= scale,  m8 *= scale;
+	 m1 *= scale,  m5 *= scale,  m9 *= scale;
+	 m2 *= scale,  m6 *= scale,  m10 *= scale;
+      }
+
+      STRIDE_LOOP {
+	 GLfloat tx, ty, tz;
+	 {
+	    const GLfloat ux = from[0],  uy = from[1],  uz = from[2];
+	    tx = ux * m0 + uy * m1 + uz * m2;
+	    ty = ux * m4 + uy * m5 + uz * m6;
+	    tz = ux * m8 + uy * m9 + uz * m10;
+	 }
+	 {
+	    GLfloat len = lengths[i]; /* used as a multiplier, not a divisor */
+	    out[i][0] = tx * len;
+	    out[i][1] = ty * len;
+	    out[i][2] = tz * len;
+	 }
+      }
+   }
+   dest->count = in->count; /* out[i][3] is never written */
+}
+
+/* Diagonal-only normalizing transform: uses just mat->inv[0], [5] and [10]. */
+static void _XFORMAPI
+TAG(transform_normalize_normals_no_rot)( const GLmatrix *mat,
+                                         GLfloat scale,
+                                         const GLvector4f *in,
+                                         const GLfloat *lengths,
+                                         GLvector4f *dest )
+{
+   GLfloat (*out)[4] = (GLfloat (*)[4])dest->start;
+   const GLfloat *from = in->start;
+   const GLuint stride = in->stride;
+   const GLuint count = in->count;
+   const GLfloat *m = mat->inv;
+   GLfloat m0 = m[0];
+   GLfloat m5 = m[5];
+   GLfloat m10 = m[10];
+   GLuint i;
+
+   if (!lengths) { /* no precomputed factors: normalize explicitly; scale is unused */
+      STRIDE_LOOP {
+	 GLfloat tx, ty, tz;
+	 {
+	    const GLfloat ux = from[0],  uy = from[1],  uz = from[2];
+	    tx = ux * m0                    ;
+	    ty =           uy * m5          ;
+	    tz =                     uz * m10;
+	 }
+	 {
+	    GLdouble len = tx*tx + ty*ty + tz*tz;
+	    if (len > 1e-20) {
+	       GLfloat invlen = INV_SQRTF(len); /* named so the scale parameter is not shadowed */
+	       out[i][0] = tx * invlen;
+	       out[i][1] = ty * invlen;
+	       out[i][2] = tz * invlen;
+	    }
+	    else { /* too short to normalize: emit a zero normal */
+	       out[i][0] = out[i][1] = out[i][2] = 0;
+	    }
+	 }
+      }
+   }
+   else {
+      m0 *= scale; /* fold the uniform scale into the diagonal once */
+      m5 *= scale;
+      m10 *= scale;
+
+      STRIDE_LOOP {
+	 GLfloat tx, ty, tz;
+	 {
+	    const GLfloat ux = from[0],  uy = from[1],  uz = from[2];
+	    tx = ux * m0                    ;
+	    ty =           uy * m5          ;
+	    tz =                     uz * m10;
+	 }
+	 {
+	    GLfloat len = lengths[i]; /* used as a multiplier, not a divisor */
+	    out[i][0] = tx * len;
+	    out[i][1] = ty * len;
+	    out[i][2] = tz * len;
+	 }
+      }
+   }
+   dest->count = in->count; /* out[i][3] is never written */
+}
+
+/* Scale each normal component by scale * mat->inv[0], [5] and [10]. */
+static void _XFORMAPI
+TAG(transform_rescale_normals_no_rot)( const GLmatrix *mat,
+                                       GLfloat scale,
+                                       const GLvector4f *in,
+                                       const GLfloat *lengths,
+                                       GLvector4f *dest )
+{
+   const GLfloat *inv = mat->inv;
+   const GLfloat sx = scale*inv[0];
+   const GLfloat sy = scale*inv[5];
+   const GLfloat sz = scale*inv[10];
+   GLfloat (*out)[4] = (GLfloat (*)[4])dest->start;
+   const GLfloat *from = in->start;
+   const GLuint stride = in->stride;
+   const GLuint count = in->count;
+   GLuint i;
+
+   (void) lengths;
+
+   STRIDE_LOOP {
+      const GLfloat nx = from[0],  ny = from[1],  nz = from[2];
+      out[i][0] = nx * sx;
+      out[i][1] = ny * sy;
+      out[i][2] = nz * sz;
+   }
+   dest->count = in->count;
+}
+
+/* Multiply each normal (row vector) by scale * upper-left 3x3 of mat->inv. */
+static void _XFORMAPI
+TAG(transform_rescale_normals)( const GLmatrix *mat,
+                                GLfloat scale,
+                                const GLvector4f *in,
+                                const GLfloat *lengths,
+                                GLvector4f *dest )
+{
+   GLfloat (*out)[4] = (GLfloat (*)[4])dest->start;
+   const GLfloat *from = in->start;
+   const GLuint stride = in->stride;
+   const GLuint count = in->count;
+   /* Fold the scale into the nine coefficients once, up front, so the
+    * per-normal loop below needs no extra multiplies.
+    */
+   const GLfloat *inv = mat->inv;
+   const GLfloat a0 = scale*inv[0],  a4 = scale*inv[4],  a8 = scale*inv[8];
+   const GLfloat a1 = scale*inv[1],  a5 = scale*inv[5],  a9 = scale*inv[9];
+   const GLfloat a2 = scale*inv[2],  a6 = scale*inv[6],  a10 = scale*inv[10];
+   GLuint i;
+
+   (void) lengths;
+
+   STRIDE_LOOP {
+      const GLfloat nx = from[0],  ny = from[1],  nz = from[2];
+      out[i][0] = nx * a0 + ny * a1 + nz * a2;
+      out[i][1] = nx * a4 + ny * a5 + nz * a6;
+      out[i][2] = nx * a8 + ny * a9 + nz * a10;
+   }
+   dest->count = in->count;
+}
+
+/* Scale each normal component by mat->inv[0], [5] and [10]; no normalization. */
+static void _XFORMAPI
+TAG(transform_normals_no_rot)( const GLmatrix *mat,
+                               GLfloat scale,
+                               const GLvector4f *in,
+                               const GLfloat *lengths,
+                               GLvector4f *dest )
+{
+   const GLfloat *inv = mat->inv;
+   const GLfloat dx = inv[0];
+   const GLfloat dy = inv[5];
+   const GLfloat dz = inv[10];
+   GLfloat (*out)[4] = (GLfloat (*)[4])dest->start;
+   const GLfloat *from = in->start;
+   const GLuint stride = in->stride;
+   const GLuint count = in->count;
+   GLuint i;
+
+   (void) lengths;
+   (void) scale;
+
+   STRIDE_LOOP {
+      const GLfloat nx = from[0],  ny = from[1],  nz = from[2];
+      out[i][0] = nx * dx;
+      out[i][1] = ny * dy;
+      out[i][2] = nz * dz;
+   }
+   dest->count = in->count;
+}
+
+/* Multiply each normal (row vector) by the upper-left 3x3 of mat->inv. */
+static void _XFORMAPI
+TAG(transform_normals)( const GLmatrix *mat,
+                        GLfloat scale,
+                        const GLvector4f *in,
+                        const GLfloat *lengths,
+                        GLvector4f *dest )
+{
+   const GLfloat *inv = mat->inv;
+   const GLfloat a0 = inv[0],  a4 = inv[4],  a8 = inv[8];
+   const GLfloat a1 = inv[1],  a5 = inv[5],  a9 = inv[9];
+   const GLfloat a2 = inv[2],  a6 = inv[6],  a10 = inv[10];
+   GLfloat (*out)[4] = (GLfloat (*)[4])dest->start;
+   const GLfloat *from = in->start;
+   const GLuint stride = in->stride;
+   const GLuint count = in->count;
+   GLuint i;
+
+   (void) lengths;
+   (void) scale;
+
+   STRIDE_LOOP {
+      const GLfloat nx = from[0],  ny = from[1],  nz = from[2];
+      out[i][0] = nx * a0 + ny * a1 + nz * a2;
+      out[i][1] = nx * a4 + ny * a5 + nz * a6;
+      out[i][2] = nx * a8 + ny * a9 + nz * a10;
+   }
+   dest->count = in->count;
+}
+
+/* Normalize normals without transforming them; mat and scale are ignored. */
+static void _XFORMAPI
+TAG(normalize_normals)( const GLmatrix *mat,
+                        GLfloat scale,
+                        const GLvector4f *in,
+                        const GLfloat *lengths,
+                        GLvector4f *dest )
+{
+   GLfloat (*out)[4] = (GLfloat (*)[4])dest->start;
+   const GLfloat *from = in->start;
+   const GLuint stride = in->stride;
+   const GLuint count = in->count;
+   GLuint i;
+
+   (void) mat;
+   (void) scale;
+
+   if (lengths) { /* caller-supplied factors: multiply, no square root needed */
+      STRIDE_LOOP {
+	 const GLfloat x = from[0], y = from[1], z = from[2];
+	 GLfloat invlen = lengths[i];
+	 out[i][0] = x * invlen;
+	 out[i][1] = y * invlen;
+	 out[i][2] = z * invlen;
+      }
+   }
+   else {
+      STRIDE_LOOP {
+	 const GLfloat x = from[0], y = from[1], z = from[2];
+	 GLdouble len = x * x + y * y + z * z;
+	 if (len > 1e-50) { /* NOTE(review): the transform_normalize variants use 1e-20 */
+	    len = INV_SQRTF(len); /* len now holds the reciprocal length */
+	    out[i][0] = (GLfloat)(x * len);
+	    out[i][1] = (GLfloat)(y * len);
+	    out[i][2] = (GLfloat)(z * len);
+	 }
+	 else { /* too short to normalize: copied through unchanged, not zeroed */
+	    out[i][0] = x;
+	    out[i][1] = y;
+	    out[i][2] = z;
+	 }
+      }
+   }
+   dest->count = in->count;
+}
+
+/* Rescale normals by the uniform scale only; mat and lengths are ignored. */
+static void _XFORMAPI
+TAG(rescale_normals)( const GLmatrix *mat,
+                      GLfloat scale,
+                      const GLvector4f *in,
+                      const GLfloat *lengths,
+                      GLvector4f *dest )
+{
+   GLfloat (*out)[4] = (GLfloat (*)[4])dest->start;
+   const GLfloat *from = in->start;
+   const GLuint stride = in->stride;
+   const GLuint count = in->count;
+   GLuint i;
+
+   (void) mat;
+   (void) lengths;
+
+   STRIDE_LOOP {
+      SCALE_SCALAR_3V( out[i], scale, from ); /* presumably out[i][0..2] = scale * from[0..2] */
+   }
+   dest->count = in->count;
+}
+
+/* Install the functions of this file in _mesa_normal_tab, keyed by NORM_* flags. */
+static void _XFORMAPI
+TAG(init_c_norm_transform)( void )
+{
+   _mesa_normal_tab[NORM_TRANSFORM_NO_ROT] =
+      TAG(transform_normals_no_rot);
+
+   _mesa_normal_tab[NORM_TRANSFORM_NO_ROT | NORM_RESCALE] =
+      TAG(transform_rescale_normals_no_rot);
+
+   _mesa_normal_tab[NORM_TRANSFORM_NO_ROT | NORM_NORMALIZE] =
+      TAG(transform_normalize_normals_no_rot);
+
+   _mesa_normal_tab[NORM_TRANSFORM] =
+      TAG(transform_normals);
+
+   _mesa_normal_tab[NORM_TRANSFORM | NORM_RESCALE] =
+      TAG(transform_rescale_normals);
+
+   _mesa_normal_tab[NORM_TRANSFORM | NORM_NORMALIZE] =
+      TAG(transform_normalize_normals);
+
+   _mesa_normal_tab[NORM_RESCALE] =
+      TAG(rescale_normals);
+
+   _mesa_normal_tab[NORM_NORMALIZE] =
+      TAG(normalize_normals);
+}
diff --git a/mesalib/src/mesa/math/m_trans_tmp.h b/mesalib/src/mesa/math/m_trans_tmp.h
new file mode 100644
index 000000000..08fb4d1e9
--- /dev/null
+++ b/mesalib/src/mesa/math/m_trans_tmp.h
@@ -0,0 +1,281 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  6.5.1
+ *
+ * Copyright (C) 1999-2006  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \brief  Templates for vector conversions.
+ * \author Keith Whitwell.
+ */
+
+#ifdef DEST_4F /* converter: SZ source components -> GLfloat[4], w defaulting to 1.0 */
+static void DEST_4F( GLfloat (*t)[4],	/* out: one 4-float row per element */
+		     CONST void *ptr,	/* in: base of the source array */
+		     GLuint stride,	/* in: byte step, f being a byte pointer */
+		     ARGS )		/* presumably declares start and n */
+{
+   const GLubyte *f = (const GLubyte *) ptr + SRC_START * stride; /* keep ptr's constness */
+   const GLubyte *first = f;
+   GLuint i;
+
+   (void) first; /* referenced, if at all, only by the including file's macros */
+   (void) start;
+   for (i = DST_START ; i < n ; i++, NEXT_F) {
+      CHECK {
+         NEXT_F2;
+	 if (SZ >= 1) t[i][0] = TRX_4F(f, 0);
+	 if (SZ >= 2) t[i][1] = TRX_4F(f, 1);
+	 if (SZ >= 3) t[i][2] = TRX_4F(f, 2); /* components 1..2 beyond SZ stay untouched */
+	 if (SZ == 4) t[i][3] = TRX_4F(f, 3); else t[i][3] = 1.0;
+      }
+   }
+}
+#endif
+
+
+
+#ifdef DEST_4FN /* as the 4F converter, but each component goes through TRX_4FN */
+static void DEST_4FN( GLfloat (*t)[4],	/* out: one 4-float row per element */
+		      CONST void *ptr,	/* in: base of the source array */
+		      GLuint stride,	/* in: byte step, f being a byte pointer */
+		      ARGS )		/* presumably declares start and n */
+{
+   const GLubyte *f = (const GLubyte *) ptr + SRC_START * stride; /* keep ptr's constness */
+   const GLubyte *first = f;
+   GLuint i;
+
+   (void) first; /* referenced, if at all, only by the including file's macros */
+   (void) start;
+   for (i = DST_START ; i < n ; i++, NEXT_F) {
+      CHECK {
+         NEXT_F2;
+	 if (SZ >= 1) t[i][0] = TRX_4FN(f, 0);
+	 if (SZ >= 2) t[i][1] = TRX_4FN(f, 1);
+	 if (SZ >= 3) t[i][2] = TRX_4FN(f, 2); /* components 1..2 beyond SZ stay untouched */
+	 if (SZ == 4) t[i][3] = TRX_4FN(f, 3); else t[i][3] = 1.0;
+      }
+   }
+}
+#endif
+
+
+#ifdef DEST_3FN /* converter: three source components -> GLfloat[3] via TRX_3FN */
+static void DEST_3FN( GLfloat (*t)[3],	/* out: one 3-float row per element */
+		     CONST void *ptr,	/* in: base of the source array */
+		     GLuint stride,	/* in: byte step, f being a byte pointer */
+		     ARGS )		/* presumably declares start and n */
+{
+   const GLubyte *f = (const GLubyte *) ptr + SRC_START * stride; /* keep ptr's constness */
+   const GLubyte *first = f;
+   GLuint i;
+   (void) first; /* referenced, if at all, only by the including file's macros */
+   (void) start;
+   for (i = DST_START ; i < n ; i++, NEXT_F) {
+      CHECK {
+         NEXT_F2;
+	 t[i][0] = TRX_3FN(f, 0);
+	 t[i][1] = TRX_3FN(f, 1);
+	 t[i][2] = TRX_3FN(f, 2);
+      }
+   }
+}
+#endif
+
+#ifdef DEST_1F /* converter: one source component -> one GLfloat via TRX_1F */
+static void DEST_1F( GLfloat *t,	/* out: one float per element */
+		     CONST void *ptr,	/* in: base of the source array */
+		     GLuint stride,	/* in: byte step, f being a byte pointer */
+		     ARGS )		/* presumably declares start and n */
+{
+   const GLubyte *f = (const GLubyte *) ptr + SRC_START * stride; /* keep ptr's constness */
+   const GLubyte *first = f;
+   GLuint i;
+   (void) first; /* referenced, if at all, only by the including file's macros */
+   (void) start;
+   for (i = DST_START ; i < n ; i++, NEXT_F) {
+      CHECK {
+         NEXT_F2;
+	 t[i] = TRX_1F(f, 0);
+      }
+   }
+}
+#endif
+
+#ifdef DEST_4UB /* converter: SZ source components -> GLubyte[4], 4th defaulting to 255 */
+static void DEST_4UB( GLubyte (*t)[4],	/* out: one 4-byte row per element */
+                      CONST void *ptr,	/* in: base of the source array */
+                      GLuint stride,	/* in: byte step, f being a byte pointer */
+                      ARGS )		/* presumably declares start and n */
+{
+   const GLubyte *f = (const GLubyte *) ptr + SRC_START * stride; /* keep ptr's constness */
+   const GLubyte *first = f;
+   GLuint i;
+   (void) start;
+   (void) first; /* referenced, if at all, only by the including file's macros */
+   for (i = DST_START ; i < n ; i++, NEXT_F) {
+      CHECK {
+         NEXT_F2;
+	 if (SZ >= 1) TRX_UB(t[i][0], f, 0);
+	 if (SZ >= 2) TRX_UB(t[i][1], f, 1);
+	 if (SZ >= 3) TRX_UB(t[i][2], f, 2); /* components 1..2 beyond SZ stay untouched */
+	 if (SZ == 4) TRX_UB(t[i][3], f, 3); else t[i][3] = 255;
+      }
+   }
+}
+#endif
+
+
+#ifdef DEST_4US /* converter: SZ source components -> GLushort[4], 4th defaulting to 65535 */
+static void DEST_4US( GLushort (*t)[4],	/* out: one 4-short row per element */
+                      CONST void *ptr,	/* in: base of the source array */
+                      GLuint stride,	/* in: byte step, f being a byte pointer */
+                      ARGS )		/* presumably declares start and n */
+{
+   const GLubyte *f = (const GLubyte *) ptr + SRC_START * stride; /* one const-correct cast */
+   const GLubyte *first = f;
+   GLuint i;
+   (void) start;
+   (void) first; /* referenced, if at all, only by the including file's macros */
+   for (i = DST_START ; i < n ; i++, NEXT_F) {
+      CHECK {
+         NEXT_F2;
+	 if (SZ >= 1) TRX_US(t[i][0], f, 0);
+	 if (SZ >= 2) TRX_US(t[i][1], f, 1);
+	 if (SZ >= 3) TRX_US(t[i][2], f, 2); /* components 1..2 beyond SZ stay untouched */
+	 if (SZ == 4) TRX_US(t[i][3], f, 3); else t[i][3] = 65535;
+      }
+   }
+}
+#endif
+
+
+#ifdef DEST_1UB /* converter: one source component -> one GLubyte via TRX_UB */
+static void DEST_1UB( GLubyte *t,	/* out: one byte per element */
+		      CONST void *ptr,	/* in: base of the source array */
+		      GLuint stride,	/* in: byte step, f being a byte pointer */
+		      ARGS )		/* presumably declares start and n */
+{
+   const GLubyte *f = (const GLubyte *) ptr + SRC_START * stride; /* keep ptr's constness */
+   const GLubyte *first = f;
+   GLuint i;
+   (void) start;
+   (void) first; /* referenced, if at all, only by the including file's macros */
+   for (i = DST_START ; i < n ; i++, NEXT_F) {
+      CHECK {
+         NEXT_F2;
+	  TRX_UB(t[i], f, 0);
+      }
+   }
+}
+#endif
+
+
+#ifdef DEST_1UI /* converter: one source component -> one GLuint via TRX_UI */
+static void DEST_1UI( GLuint *t,	/* out: one unsigned int per element */
+		      CONST void *ptr,	/* in: base of the source array */
+		      GLuint stride,	/* in: byte step, f being a byte pointer */
+		      ARGS )		/* presumably declares start and n */
+{
+   const GLubyte *f = (const GLubyte *) ptr + SRC_START * stride; /* keep ptr's constness */
+   const GLubyte *first = f;
+   GLuint i;
+   (void) start;
+   (void) first; /* referenced, if at all, only by the including file's macros */
+
+   for (i = DST_START ; i < n ; i++, NEXT_F) {
+      CHECK {
+         NEXT_F2;
+	 t[i] = TRX_UI(f, 0);
+      }
+   }
+}
+#endif
+
+/* Store whichever DEST_* converters this inclusion defined into the TAB() tables. */
+static void INIT(void)
+{
+#ifdef DEST_1UI
+   ASSERT(SZ == 1); /* the single-component tables are indexed by SRC_IDX only */
+   TAB(_1ui)[SRC_IDX] = DEST_1UI;
+#endif
+#ifdef DEST_1UB
+   ASSERT(SZ == 1);
+   TAB(_1ub)[SRC_IDX] = DEST_1UB;
+#endif
+#ifdef DEST_1F
+   ASSERT(SZ == 1);
+   TAB(_1f)[SRC_IDX] = DEST_1F;
+#endif
+#ifdef DEST_3FN
+   ASSERT(SZ == 3);
+   TAB(_3fn)[SRC_IDX] = DEST_3FN;
+#endif
+#ifdef DEST_4UB
+   TAB(_4ub)[SZ][SRC_IDX] = DEST_4UB; /* four-wide tables are indexed by [SZ][SRC_IDX] */
+#endif
+#ifdef DEST_4US
+   TAB(_4us)[SZ][SRC_IDX] = DEST_4US;
+#endif
+#ifdef DEST_4F
+   TAB(_4f)[SZ][SRC_IDX] = DEST_4F;
+#endif
+#ifdef DEST_4FN
+   TAB(_4fn)[SZ][SRC_IDX] = DEST_4FN;
+#endif
+
+}
+
+/* Drop the per-inclusion parameters, presumably so this template can be re-included. */
+#ifdef INIT
+#undef INIT
+#endif
+#ifdef DEST_1UI
+#undef DEST_1UI
+#endif
+#ifdef DEST_1UB
+#undef DEST_1UB
+#endif
+#ifdef DEST_4UB
+#undef DEST_4UB
+#endif
+#ifdef DEST_4US
+#undef DEST_4US
+#endif
+#ifdef DEST_3FN
+#undef DEST_3FN
+#endif
+#ifdef DEST_4F
+#undef DEST_4F
+#endif
+#ifdef DEST_4FN
+#undef DEST_4FN
+#endif
+#ifdef DEST_1F
+#undef DEST_1F
+#endif
+#ifdef SZ
+#undef SZ
+#endif
+#ifdef TAG /* NOTE(review): TAG is not referenced anywhere in this template */
+#undef TAG
+#endif
+
+
diff --git a/mesalib/src/mesa/math/m_translate.c b/mesalib/src/mesa/math/m_translate.c
new file mode 100644
index 000000000..4a20f45ee
--- /dev/null
+++ b/mesalib/src/mesa/math/m_translate.c
@@ -0,0 +1,751 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  6.5.1
+ *
+ * Copyright (C) 1999-2006  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \brief  Translate vectors of numbers between various types.
+ * \author Keith Whitwell.
+ */
+
+
+#include "main/glheader.h"
+#include "main/mtypes.h"		/* GLchan hack */
+#include "main/colormac.h"
+
+#include "m_translate.h"
+
+
+
+typedef void (*trans_1f_func)(GLfloat *to,
+			      CONST void *ptr,
+			      GLuint stride,
+			      GLuint start,
+			      GLuint n );
+
+typedef void (*trans_1ui_func)(GLuint *to,
+			       CONST void *ptr,
+			       GLuint stride,
+			       GLuint start,
+			       GLuint n );
+
+typedef void (*trans_1ub_func)(GLubyte *to,
+			       CONST void *ptr,
+			       GLuint stride,
+			       GLuint start,
+			       GLuint n );
+
+typedef void (*trans_4ub_func)(GLubyte (*to)[4],
+                               CONST void *ptr,
+                               GLuint stride,
+                               GLuint start,
+                               GLuint n );
+
+typedef void (*trans_4us_func)(GLushort (*to)[4],
+                               CONST void *ptr,
+                               GLuint stride,
+                               GLuint start,
+                               GLuint n );
+
+typedef void (*trans_4f_func)(GLfloat (*to)[4],
+			      CONST void *ptr,
+			      GLuint stride,
+			      GLuint start,
+			      GLuint n );
+
+typedef void (*trans_3fn_func)(GLfloat (*to)[3],
+			      CONST void *ptr,
+			      GLuint stride,
+			      GLuint start,
+			      GLuint n );
+
+
+
+
+#define TYPE_IDX(t) ((t) & 0xf)              /* low four bits of the GL type enum */
+#define MAX_TYPES (TYPE_IDX(GL_DOUBLE)+1)    /* 0xa + 1; parenthesized so it binds as one value */
+
+
+/* This macro is used on other systems, so undefine it for this module */
+
+#undef	CHECK
+
+static trans_1f_func  _math_trans_1f_tab[MAX_TYPES];
+static trans_1ui_func _math_trans_1ui_tab[MAX_TYPES];
+static trans_1ub_func _math_trans_1ub_tab[MAX_TYPES];
+static trans_3fn_func  _math_trans_3fn_tab[MAX_TYPES];
+static trans_4ub_func _math_trans_4ub_tab[5][MAX_TYPES];
+static trans_4us_func _math_trans_4us_tab[5][MAX_TYPES];
+static trans_4f_func  _math_trans_4f_tab[5][MAX_TYPES];
+static trans_4f_func  _math_trans_4fn_tab[5][MAX_TYPES];
+
+
+#define PTR_ELT(ptr, elt) (((SRC *)ptr)[elt])
+
+
+#define TAB(x) _math_trans##x##_tab
+#define ARGS   GLuint start, GLuint n
+#define SRC_START  start
+#define DST_START  0
+#define STRIDE stride
+#define NEXT_F f += stride
+#define NEXT_F2
+#define CHECK
+
+
+
+
+/**
+ * Translate from GL_BYTE.
+ */
+#define SRC GLbyte
+#define SRC_IDX TYPE_IDX(GL_BYTE)
+#define TRX_3FN(f,n)   BYTE_TO_FLOAT( PTR_ELT(f,n) )
+#if 1
+#define TRX_4F(f,n)   BYTE_TO_FLOAT( PTR_ELT(f,n) )
+#else
+#define TRX_4F(f,n)   (GLfloat)( PTR_ELT(f,n) )
+#endif
+#define TRX_4FN(f,n)   BYTE_TO_FLOAT( PTR_ELT(f,n) )
+#define TRX_UB(ub, f,n)  ub = BYTE_TO_UBYTE( PTR_ELT(f,n) )
+#define TRX_US(ch, f,n)  ch = BYTE_TO_USHORT( PTR_ELT(f,n) )
+#define TRX_UI(f,n)  (PTR_ELT(f,n) < 0 ? 0 : (GLuint)  PTR_ELT(f,n))
+
+
+#define SZ 4
+#define INIT init_trans_4_GLbyte_raw
+#define DEST_4F trans_4_GLbyte_4f_raw
+#define DEST_4FN trans_4_GLbyte_4fn_raw
+#define DEST_4UB trans_4_GLbyte_4ub_raw
+#define DEST_4US trans_4_GLbyte_4us_raw
+#include "m_trans_tmp.h"
+
+#define SZ 3
+#define INIT init_trans_3_GLbyte_raw
+#define DEST_4F trans_3_GLbyte_4f_raw
+#define DEST_4FN trans_3_GLbyte_4fn_raw
+#define DEST_4UB trans_3_GLbyte_4ub_raw
+#define DEST_4US trans_3_GLbyte_4us_raw
+#define DEST_3FN trans_3_GLbyte_3fn_raw
+#include "m_trans_tmp.h"
+
+#define SZ 2
+#define INIT init_trans_2_GLbyte_raw
+#define DEST_4F trans_2_GLbyte_4f_raw
+#define DEST_4FN trans_2_GLbyte_4fn_raw
+#include "m_trans_tmp.h"
+
+#define SZ 1
+#define INIT init_trans_1_GLbyte_raw
+#define DEST_4F trans_1_GLbyte_4f_raw
+#define DEST_4FN trans_1_GLbyte_4fn_raw
+#define DEST_1UB trans_1_GLbyte_1ub_raw
+#define DEST_1UI trans_1_GLbyte_1ui_raw
+#include "m_trans_tmp.h"
+
+#undef SRC
+#undef TRX_3FN
+#undef TRX_4F
+#undef TRX_4FN
+#undef TRX_UB
+#undef TRX_US
+#undef TRX_UI
+#undef SRC_IDX
+
+
+/**
+ * Translate from GL_UNSIGNED_BYTE.
+ */
+#define SRC GLubyte
+#define SRC_IDX TYPE_IDX(GL_UNSIGNED_BYTE)
+#define TRX_3FN(f,n)	     UBYTE_TO_FLOAT(PTR_ELT(f,n))
+#define TRX_4F(f,n)	     (GLfloat)( PTR_ELT(f,n) )
+#define TRX_4FN(f,n)	     UBYTE_TO_FLOAT(PTR_ELT(f,n))
+#define TRX_UB(ub, f,n)	     ub = PTR_ELT(f,n)
+#define TRX_US(us, f,n)      us = UBYTE_TO_USHORT(PTR_ELT(f,n))
+#define TRX_UI(f,n)          (GLuint)PTR_ELT(f,n)
+
+/* 4ub->4ub handled in special case below.
+ */
+#define SZ 4
+#define INIT init_trans_4_GLubyte_raw
+#define DEST_4F trans_4_GLubyte_4f_raw
+#define DEST_4FN trans_4_GLubyte_4fn_raw
+#define DEST_4US trans_4_GLubyte_4us_raw
+#include "m_trans_tmp.h"
+
+
+#define SZ 3
+#define INIT init_trans_3_GLubyte_raw
+#define DEST_4UB trans_3_GLubyte_4ub_raw
+#define DEST_4US trans_3_GLubyte_4us_raw
+#define DEST_3FN trans_3_GLubyte_3fn_raw
+#define DEST_4F trans_3_GLubyte_4f_raw
+#define DEST_4FN trans_3_GLubyte_4fn_raw
+#include "m_trans_tmp.h"
+
+
+#define SZ 1
+#define INIT init_trans_1_GLubyte_raw
+#define DEST_1UI trans_1_GLubyte_1ui_raw
+#define DEST_1UB trans_1_GLubyte_1ub_raw
+#include "m_trans_tmp.h"
+
+#undef SRC
+#undef SRC_IDX
+#undef TRX_3FN
+#undef TRX_4F
+#undef TRX_4FN
+#undef TRX_UB
+#undef TRX_US
+#undef TRX_UI
+
+
+/* GL_SHORT
+ */
+#define SRC GLshort
+#define SRC_IDX TYPE_IDX(GL_SHORT)
+#define TRX_3FN(f,n)   SHORT_TO_FLOAT( PTR_ELT(f,n) )
+#define TRX_4F(f,n)   (GLfloat)( PTR_ELT(f,n) )
+#define TRX_4FN(f,n)  SHORT_TO_FLOAT( PTR_ELT(f,n) )
+#define TRX_UB(ub, f,n)  ub = SHORT_TO_UBYTE(PTR_ELT(f,n))
+#define TRX_US(us, f,n)  us = SHORT_TO_USHORT(PTR_ELT(f,n))
+#define TRX_UI(f,n)  (PTR_ELT(f,n) < 0 ? 0 : (GLuint)  PTR_ELT(f,n))
+
+
+#define SZ  4
+#define INIT init_trans_4_GLshort_raw
+#define DEST_4F trans_4_GLshort_4f_raw
+#define DEST_4FN trans_4_GLshort_4fn_raw
+#define DEST_4UB trans_4_GLshort_4ub_raw
+#define DEST_4US trans_4_GLshort_4us_raw
+#include "m_trans_tmp.h"
+
+#define SZ 3
+#define INIT init_trans_3_GLshort_raw
+#define DEST_4F trans_3_GLshort_4f_raw
+#define DEST_4FN trans_3_GLshort_4fn_raw
+#define DEST_4UB trans_3_GLshort_4ub_raw
+#define DEST_4US trans_3_GLshort_4us_raw
+#define DEST_3FN trans_3_GLshort_3fn_raw
+#include "m_trans_tmp.h"
+
+#define SZ 2
+#define INIT init_trans_2_GLshort_raw
+#define DEST_4F trans_2_GLshort_4f_raw
+#define DEST_4FN trans_2_GLshort_4fn_raw
+#include "m_trans_tmp.h"
+
+#define SZ 1
+#define INIT init_trans_1_GLshort_raw
+#define DEST_4F trans_1_GLshort_4f_raw
+#define DEST_4FN trans_1_GLshort_4fn_raw
+#define DEST_1UB trans_1_GLshort_1ub_raw
+#define DEST_1UI trans_1_GLshort_1ui_raw
+#include "m_trans_tmp.h"
+
+
+#undef SRC
+#undef SRC_IDX
+#undef TRX_3FN
+#undef TRX_4F
+#undef TRX_4FN
+#undef TRX_UB
+#undef TRX_US
+#undef TRX_UI
+
+
+/* GL_UNSIGNED_SHORT
+ */
+#define SRC GLushort
+#define SRC_IDX TYPE_IDX(GL_UNSIGNED_SHORT)
+#define TRX_3FN(f,n)   USHORT_TO_FLOAT( PTR_ELT(f,n) )
+#define TRX_4F(f,n)   (GLfloat)( PTR_ELT(f,n) )
+#define TRX_4FN(f,n)  USHORT_TO_FLOAT( PTR_ELT(f,n) )
+#define TRX_UB(ub,f,n)  ub = (GLubyte) (PTR_ELT(f,n) >> 8)
+#define TRX_US(us,f,n)  us = PTR_ELT(f,n)
+#define TRX_UI(f,n)  (GLuint)   PTR_ELT(f,n)
+
+
+#define SZ 4
+#define INIT init_trans_4_GLushort_raw
+#define DEST_4F trans_4_GLushort_4f_raw
+#define DEST_4FN trans_4_GLushort_4fn_raw
+#define DEST_4UB trans_4_GLushort_4ub_raw
+#define DEST_4US trans_4_GLushort_4us_raw
+#include "m_trans_tmp.h"
+
+#define SZ 3
+#define INIT init_trans_3_GLushort_raw
+#define DEST_4F trans_3_GLushort_4f_raw
+#define DEST_4FN trans_3_GLushort_4fn_raw
+#define DEST_4UB trans_3_GLushort_4ub_raw
+#define DEST_4US trans_3_GLushort_4us_raw
+#define DEST_3FN trans_3_GLushort_3fn_raw
+#include "m_trans_tmp.h"
+
+#define SZ 2
+#define INIT init_trans_2_GLushort_raw
+#define DEST_4F trans_2_GLushort_4f_raw
+#define DEST_4FN trans_2_GLushort_4fn_raw
+#include "m_trans_tmp.h"
+
+#define SZ 1
+#define INIT init_trans_1_GLushort_raw
+#define DEST_4F trans_1_GLushort_4f_raw
+#define DEST_4FN trans_1_GLushort_4fn_raw
+#define DEST_1UB trans_1_GLushort_1ub_raw
+#define DEST_1UI trans_1_GLushort_1ui_raw
+#include "m_trans_tmp.h"
+
+#undef SRC
+#undef SRC_IDX
+#undef TRX_3FN
+#undef TRX_4F
+#undef TRX_4FN
+#undef TRX_UB
+#undef TRX_US
+#undef TRX_UI
+
+
+/* GL_INT
+ */
+#define SRC GLint
+#define SRC_IDX TYPE_IDX(GL_INT)
+#define TRX_3FN(f,n)   INT_TO_FLOAT( PTR_ELT(f,n) )
+#define TRX_4F(f,n)   (GLfloat)( PTR_ELT(f,n) )
+#define TRX_4FN(f,n)  INT_TO_FLOAT( PTR_ELT(f,n) )
+#define TRX_UB(ub, f,n)  ub = INT_TO_UBYTE(PTR_ELT(f,n))
+#define TRX_US(us, f,n)  us = INT_TO_USHORT(PTR_ELT(f,n))
+#define TRX_UI(f,n)  (PTR_ELT(f,n) < 0 ? 0 : (GLuint)  PTR_ELT(f,n))
+
+
+#define SZ 4
+#define INIT init_trans_4_GLint_raw
+#define DEST_4F trans_4_GLint_4f_raw
+#define DEST_4FN trans_4_GLint_4fn_raw
+#define DEST_4UB trans_4_GLint_4ub_raw
+#define DEST_4US trans_4_GLint_4us_raw
+#include "m_trans_tmp.h"
+
+#define SZ 3
+#define INIT init_trans_3_GLint_raw
+#define DEST_4F trans_3_GLint_4f_raw
+#define DEST_4FN trans_3_GLint_4fn_raw
+#define DEST_4UB trans_3_GLint_4ub_raw
+#define DEST_4US trans_3_GLint_4us_raw
+#define DEST_3FN trans_3_GLint_3fn_raw
+#include "m_trans_tmp.h"
+
+#define SZ 2
+#define INIT init_trans_2_GLint_raw
+#define DEST_4F trans_2_GLint_4f_raw
+#define DEST_4FN trans_2_GLint_4fn_raw
+#include "m_trans_tmp.h"
+
+#define SZ 1
+#define INIT init_trans_1_GLint_raw
+#define DEST_4F trans_1_GLint_4f_raw
+#define DEST_4FN trans_1_GLint_4fn_raw
+#define DEST_1UB trans_1_GLint_1ub_raw
+#define DEST_1UI trans_1_GLint_1ui_raw
+#include "m_trans_tmp.h"
+
+
+#undef SRC
+#undef SRC_IDX
+#undef TRX_3FN
+#undef TRX_4F
+#undef TRX_4FN
+#undef TRX_UB
+#undef TRX_US
+#undef TRX_UI
+
+
+/* GL_UNSIGNED_INT -- normalized conversions use the unsigned mapping throughout.
+ */
+#define SRC GLuint
+#define SRC_IDX TYPE_IDX(GL_UNSIGNED_INT)
+#define TRX_3FN(f,n)   UINT_TO_FLOAT( PTR_ELT(f,n) )
+#define TRX_4F(f,n)   (GLfloat)( PTR_ELT(f,n) )
+#define TRX_4FN(f,n)  UINT_TO_FLOAT( PTR_ELT(f,n) )
+#define TRX_UB(ub, f,n)  ub = (GLubyte) (PTR_ELT(f,n) >> 24)
+#define TRX_US(us, f,n)  us = (GLushort) (PTR_ELT(f,n) >> 16)
+#define TRX_UI(f,n)		PTR_ELT(f,n)
+
+
+#define SZ 4
+#define INIT init_trans_4_GLuint_raw
+#define DEST_4F trans_4_GLuint_4f_raw
+#define DEST_4FN trans_4_GLuint_4fn_raw
+#define DEST_4UB trans_4_GLuint_4ub_raw
+#define DEST_4US trans_4_GLuint_4us_raw
+#include "m_trans_tmp.h"
+
+#define SZ 3
+#define INIT init_trans_3_GLuint_raw
+#define DEST_4F trans_3_GLuint_4f_raw
+#define DEST_4FN trans_3_GLuint_4fn_raw
+#define DEST_4UB trans_3_GLuint_4ub_raw
+#define DEST_4US trans_3_GLuint_4us_raw
+#define DEST_3FN trans_3_GLuint_3fn_raw
+#include "m_trans_tmp.h"
+
+#define SZ 2
+#define INIT init_trans_2_GLuint_raw
+#define DEST_4F trans_2_GLuint_4f_raw
+#define DEST_4FN trans_2_GLuint_4fn_raw
+#include "m_trans_tmp.h"
+
+#define SZ 1
+#define INIT init_trans_1_GLuint_raw
+#define DEST_4F trans_1_GLuint_4f_raw
+#define DEST_4FN trans_1_GLuint_4fn_raw
+#define DEST_1UB trans_1_GLuint_1ub_raw
+#define DEST_1UI trans_1_GLuint_1ui_raw
+#include "m_trans_tmp.h"
+
+#undef SRC
+#undef SRC_IDX
+#undef TRX_3FN
+#undef TRX_4F
+#undef TRX_4FN
+#undef TRX_UB
+#undef TRX_US
+#undef TRX_UI
+
+
+/* GL_DOUBLE
+ */
+#define SRC GLdouble
+#define SRC_IDX TYPE_IDX(GL_DOUBLE)
+#define TRX_3FN(f,n)   (GLfloat) PTR_ELT(f,n)
+#define TRX_4F(f,n)   (GLfloat) PTR_ELT(f,n)
+#define TRX_4FN(f,n)   (GLfloat) PTR_ELT(f,n)
+#define TRX_UB(ub,f,n) UNCLAMPED_FLOAT_TO_UBYTE(ub, PTR_ELT(f,n))
+#define TRX_US(us,f,n) UNCLAMPED_FLOAT_TO_USHORT(us, PTR_ELT(f,n))
+#define TRX_UI(f,n)  (GLuint) (GLint) PTR_ELT(f,n)
+#define TRX_1F(f,n)   (GLfloat) PTR_ELT(f,n)
+
+
+#define SZ 4
+#define INIT init_trans_4_GLdouble_raw
+#define DEST_4F trans_4_GLdouble_4f_raw
+#define DEST_4FN trans_4_GLdouble_4fn_raw
+#define DEST_4UB trans_4_GLdouble_4ub_raw
+#define DEST_4US trans_4_GLdouble_4us_raw
+#include "m_trans_tmp.h"
+
+#define SZ 3
+#define INIT init_trans_3_GLdouble_raw
+#define DEST_4F trans_3_GLdouble_4f_raw
+#define DEST_4FN trans_3_GLdouble_4fn_raw
+#define DEST_4UB trans_3_GLdouble_4ub_raw
+#define DEST_4US trans_3_GLdouble_4us_raw
+#define DEST_3FN trans_3_GLdouble_3fn_raw
+#include "m_trans_tmp.h"
+
+#define SZ 2
+#define INIT init_trans_2_GLdouble_raw
+#define DEST_4F trans_2_GLdouble_4f_raw
+#define DEST_4FN trans_2_GLdouble_4fn_raw
+#include "m_trans_tmp.h"
+
+#define SZ 1
+#define INIT init_trans_1_GLdouble_raw
+#define DEST_4F trans_1_GLdouble_4f_raw
+#define DEST_4FN trans_1_GLdouble_4fn_raw
+#define DEST_1UB trans_1_GLdouble_1ub_raw
+#define DEST_1UI trans_1_GLdouble_1ui_raw
+#define DEST_1F trans_1_GLdouble_1f_raw
+#include "m_trans_tmp.h"
+
+#undef SRC
+#undef SRC_IDX
+
+/* GL_FLOAT -- this section defines no TRX_* macros of its own: the GL_DOUBLE
+ * definitions above were not #undef'd and are reused here with the new SRC. */
+#define SRC GLfloat
+#define SRC_IDX TYPE_IDX(GL_FLOAT)
+#define SZ 4
+#define INIT init_trans_4_GLfloat_raw
+#define DEST_4UB trans_4_GLfloat_4ub_raw
+#define DEST_4US trans_4_GLfloat_4us_raw
+#define DEST_4F  trans_4_GLfloat_4f_raw
+#define DEST_4FN  trans_4_GLfloat_4fn_raw
+#include "m_trans_tmp.h"
+
+#define SZ 3
+#define INIT init_trans_3_GLfloat_raw
+#define DEST_4F  trans_3_GLfloat_4f_raw
+#define DEST_4FN  trans_3_GLfloat_4fn_raw
+#define DEST_4UB trans_3_GLfloat_4ub_raw
+#define DEST_4US trans_3_GLfloat_4us_raw
+#define DEST_3FN trans_3_GLfloat_3fn_raw
+#include "m_trans_tmp.h"
+
+#define SZ 2
+#define INIT init_trans_2_GLfloat_raw
+#define DEST_4F trans_2_GLfloat_4f_raw
+#define DEST_4FN trans_2_GLfloat_4fn_raw
+#include "m_trans_tmp.h"
+
+#define SZ 1
+#define INIT init_trans_1_GLfloat_raw
+#define DEST_4F  trans_1_GLfloat_4f_raw
+#define DEST_4FN  trans_1_GLfloat_4fn_raw
+#define DEST_1UB trans_1_GLfloat_1ub_raw
+#define DEST_1UI trans_1_GLfloat_1ui_raw
+#define DEST_1F trans_1_GLfloat_1f_raw
+
+#include "m_trans_tmp.h"
+
+#undef SRC
+#undef SRC_IDX
+#undef TRX_3FN
+#undef TRX_4F
+#undef TRX_4FN
+#undef TRX_UB
+#undef TRX_US
+#undef TRX_UI
+
+
+/* GLubyte[4] -> GLubyte[4]: plain copy of n elements starting at source
+ * element 'start'; COPY_4UBV is used when pointer and stride are 4-aligned. */
+static void trans_4_GLubyte_4ub_raw(GLubyte (*t)[4], CONST void *Ptr,
+				    GLuint stride, ARGS )
+{
+   const GLubyte *src = (GLubyte *) Ptr + SRC_START * stride;
+   const GLboolean aligned =
+      ((((uintptr_t) src) | ((uintptr_t) stride)) & 3L) == 0L;
+   GLuint elt = DST_START;
+
+   if (!aligned) {
+      /* Byte-wise copy for an unaligned source pointer or stride. */
+      for ( ; elt < n ; elt++, src += stride) {
+	 GLuint c;
+	 for (c = 0 ; c < 4 ; c++)
+	    t[elt][c] = src[c];
+      }
+   } else {
+      for ( ; elt < n ; elt++, src += stride) {
+	 COPY_4UBV( t[elt], src );
+      }
+   }
+}
+
+
+static void init_translate_raw(void)
+{
+   /* Clear every table (including _1f) first: unregistered slots stay NULL. */
+   MEMSET( TAB(_1f),  0, sizeof(TAB(_1f)) );
+   MEMSET( TAB(_1ui), 0, sizeof(TAB(_1ui)) );
+   MEMSET( TAB(_1ub), 0, sizeof(TAB(_1ub)) );
+   MEMSET( TAB(_3fn),  0, sizeof(TAB(_3fn)) );
+   MEMSET( TAB(_4ub), 0, sizeof(TAB(_4ub)) );
+   MEMSET( TAB(_4us), 0, sizeof(TAB(_4us)) );
+   MEMSET( TAB(_4f),  0, sizeof(TAB(_4f)) );
+   MEMSET( TAB(_4fn),  0, sizeof(TAB(_4fn)) );
+   init_trans_4_GLbyte_raw();
+   init_trans_3_GLbyte_raw();
+   init_trans_2_GLbyte_raw();
+   init_trans_1_GLbyte_raw();
+   init_trans_1_GLubyte_raw();
+   init_trans_3_GLubyte_raw();
+   init_trans_4_GLubyte_raw();
+   init_trans_4_GLshort_raw();
+   init_trans_3_GLshort_raw();
+   init_trans_2_GLshort_raw();
+   init_trans_1_GLshort_raw();
+   init_trans_4_GLushort_raw();
+   init_trans_3_GLushort_raw();
+   init_trans_2_GLushort_raw();
+   init_trans_1_GLushort_raw();
+   init_trans_4_GLint_raw();
+   init_trans_3_GLint_raw();
+   init_trans_2_GLint_raw();
+   init_trans_1_GLint_raw();
+   init_trans_4_GLuint_raw();
+   init_trans_3_GLuint_raw();
+   init_trans_2_GLuint_raw();
+   init_trans_1_GLuint_raw();
+   init_trans_4_GLdouble_raw();
+   init_trans_3_GLdouble_raw();
+   init_trans_2_GLdouble_raw();
+   init_trans_1_GLdouble_raw();
+   init_trans_4_GLfloat_raw();
+   init_trans_3_GLfloat_raw();
+   init_trans_2_GLfloat_raw();
+   init_trans_1_GLfloat_raw();
+   TAB(_4ub)[4][TYPE_IDX(GL_UNSIGNED_BYTE)] = trans_4_GLubyte_4ub_raw; /* hand-written copy */
+}
+
+
+#undef TAB
+#ifdef CLASS
+#undef CLASS
+#endif
+#undef ARGS
+#undef CHECK
+#undef SRC_START
+#undef DST_START
+#undef NEXT_F
+#undef NEXT_F2
+
+
+
+
+
+void _math_init_translate( void )	/* exported entry point for table setup */
+{
+   init_translate_raw();	/* fills the static _math_trans_*_tab tables */
+}
+
+
+/**
+ * Convert \p n source elements of GL type \p type into \p to (one GLfloat
+ * each), reading from \p ptr at element \p start with byte stride \p stride.
+ */
+void
+_math_trans_1f(GLfloat *to, CONST void *ptr, GLuint stride,
+               GLenum type, GLuint start, GLuint n)
+{
+   const trans_1f_func convert = _math_trans_1f_tab[TYPE_IDX(type)];
+
+   convert( to, ptr, stride, start, n );
+}
+
+/**
+ * Convert \p n source elements of GL type \p type into \p to (one GLuint
+ * each), reading from \p ptr at element \p start with byte stride \p stride.
+ */
+void
+_math_trans_1ui(GLuint *to, CONST void *ptr, GLuint stride,
+                GLenum type, GLuint start, GLuint n)
+{
+   const trans_1ui_func convert = _math_trans_1ui_tab[TYPE_IDX(type)];
+
+   convert( to, ptr, stride, start, n );
+}
+
+/**
+ * Convert \p n source elements of GL type \p type into \p to (one GLubyte
+ * each), reading from \p ptr at element \p start with byte stride \p stride.
+ */
+void
+_math_trans_1ub(GLubyte *to, CONST void *ptr, GLuint stride,
+                GLenum type, GLuint start, GLuint n)
+{
+   const trans_1ub_func convert = _math_trans_1ub_tab[TYPE_IDX(type)];
+
+   convert( to, ptr, stride, start, n );
+}
+
+
+/**
+ * Convert \p n source elements (\p size components each, GL type \p type)
+ * into GLubyte[4] tuples; the converter is looked up by size and type.
+ */
+void
+_math_trans_4ub(GLubyte (*to)[4], CONST void *ptr, GLuint stride,
+                GLenum type, GLuint size,
+                GLuint start, GLuint n)
+{
+   const trans_4ub_func convert = _math_trans_4ub_tab[size][TYPE_IDX(type)];
+
+   convert( to, ptr, stride, start, n );
+}
+
+/**
+ * Translate vector of values to GLchan [4]: forwards unchanged to the 4ub,
+ * 4us or 4fn translator, chosen at compile time from CHAN_TYPE. */
+void _math_trans_4chan( GLchan (*to)[4],
+			CONST void *ptr,
+			GLuint stride,
+			GLenum type,
+			GLuint size,
+			GLuint start,
+			GLuint n )
+{
+#if CHAN_TYPE == GL_UNSIGNED_BYTE
+   _math_trans_4ub( to, ptr, stride, type, size, start, n );
+#elif CHAN_TYPE == GL_UNSIGNED_SHORT
+   _math_trans_4us( to, ptr, stride, type, size, start, n );
+#elif CHAN_TYPE == GL_FLOAT
+   _math_trans_4fn( to, ptr, stride, type, size, start, n );
+#endif /* no #else: any other CHAN_TYPE leaves this function body empty */
+}
+
+/**
+ * Convert \p n source elements (\p size components each, GL type \p type)
+ * into GLushort[4] tuples; the converter is looked up by size and type.
+ */
+void
+_math_trans_4us(GLushort (*to)[4], CONST void *ptr, GLuint stride,
+                GLenum type, GLuint size,
+                GLuint start, GLuint n)
+{
+   const trans_4us_func convert = _math_trans_4us_tab[size][TYPE_IDX(type)];
+
+   convert( to, ptr, stride, start, n );
+}
+
+/**
+ * Convert \p n source elements (\p size components each, GL type \p type)
+ * into GLfloat[4] tuples; the converter is looked up by size and type.
+ */
+void
+_math_trans_4f(GLfloat (*to)[4], CONST void *ptr, GLuint stride,
+               GLenum type, GLuint size,
+               GLuint start, GLuint n)
+{
+   const trans_4f_func convert = _math_trans_4f_tab[size][TYPE_IDX(type)];
+
+   convert( to, ptr, stride, start, n );
+}
+
+/**
+ * Like _math_trans_4f, but dispatches through the "4fn" table, whose
+ * converters produce normalized GLfloat[4] tuples.
+ */
+void
+_math_trans_4fn(GLfloat (*to)[4], CONST void *ptr, GLuint stride,
+                GLenum type, GLuint size,
+                GLuint start, GLuint n)
+{
+   const trans_4f_func convert = _math_trans_4fn_tab[size][TYPE_IDX(type)];
+
+   convert( to, ptr, stride, start, n );
+}
+
+/**
+ * Convert \p n source elements of GL type \p type into normalized
+ * GLfloat[3] tuples; the converter is looked up by source type only.
+ */
+void
+_math_trans_3fn(GLfloat (*to)[3], CONST void *ptr, GLuint stride,
+                GLenum type, GLuint start, GLuint n)
+{
+   const trans_3fn_func convert = _math_trans_3fn_tab[TYPE_IDX(type)];
+
+   convert( to, ptr, stride, start, n );
+}
diff --git a/mesalib/src/mesa/math/m_translate.h b/mesalib/src/mesa/math/m_translate.h
new file mode 100644
index 000000000..c677682d5
--- /dev/null
+++ b/mesalib/src/mesa/math/m_translate.h
@@ -0,0 +1,122 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  6.5.1
+ *
+ * Copyright (C) 1999-2006  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+#ifndef _M_TRANSLATE_H_
+#define _M_TRANSLATE_H_
+
+#include "main/config.h"
+#include "main/mtypes.h"		/* hack for GLchan */
+
+
+/**
+ * Array translation.
+ * For example, convert array of GLushort[3] to GLfloat[4].
+ * The function name specifies the destination format/size.
+ * \param  to  the destination address
+ * \param  ptr  the source address
+ * \param  stride  the source stride (in bytes) between elements
+ * \param  type  the source datatype (GL_SHORT, GL_UNSIGNED_INT, etc)
+ * \param  size  number of values per element in source array (1,2,3 or 4)
+ * \param  start  first element in source array to convert
+ * \param  n  number of elements to convert
+ *
+ * Note: "element" means a tuple like GLfloat[3] or GLubyte[4].
+ */
+
+
+extern void _math_trans_1f(GLfloat *to,
+			   CONST void *ptr,
+			   GLuint stride,
+			   GLenum type,
+			   GLuint start,
+			   GLuint n );
+
+extern void _math_trans_1ui(GLuint *to,
+			    CONST void *ptr,
+			    GLuint stride,
+			    GLenum type,
+			    GLuint start,
+			    GLuint n );
+
+extern void _math_trans_1ub(GLubyte *to,
+			    CONST void *ptr,
+			    GLuint stride,
+			    GLenum type,
+			    GLuint start,
+			    GLuint n );
+
+extern void _math_trans_4ub(GLubyte (*to)[4],
+			    CONST void *ptr,
+			    GLuint stride,
+			    GLenum type,
+			    GLuint size,
+			    GLuint start,
+			    GLuint n );
+
+extern void _math_trans_4chan( GLchan (*to)[4],
+			       CONST void *ptr,
+			       GLuint stride,
+			       GLenum type,
+			       GLuint size,
+			       GLuint start,
+			       GLuint n );
+
+extern void _math_trans_4us(GLushort (*to)[4],
+			    CONST void *ptr,
+			    GLuint stride,
+			    GLenum type,
+			    GLuint size,
+			    GLuint start,
+			    GLuint n );
+
+/** Convert to floats w/out normalization (i.e. just cast) */
+extern void _math_trans_4f(GLfloat (*to)[4],
+			   CONST void *ptr,
+			   GLuint stride,
+			   GLenum type,
+			   GLuint size,
+			   GLuint start,
+			   GLuint n );
+
+/** Convert to normalized floats in [0,1] or [-1, 1] */
+extern void _math_trans_4fn(GLfloat (*to)[4],
+			    CONST void *ptr,
+			    GLuint stride,
+			    GLenum type,
+			    GLuint size,
+			    GLuint start,
+			    GLuint n );
+
+extern void _math_trans_3fn(GLfloat (*to)[3],
+			   CONST void *ptr,
+			   GLuint stride,
+			   GLenum type,
+			   GLuint start,
+			   GLuint n );
+
+extern void _math_init_translate( void );
+
+
+#endif
diff --git a/mesalib/src/mesa/math/m_vector.c b/mesalib/src/mesa/math/m_vector.c
new file mode 100644
index 000000000..4cbab11a3
--- /dev/null
+++ b/mesalib/src/mesa/math/m_vector.c
@@ -0,0 +1,185 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * New (3.1) transformation code written by Keith Whitwell.
+ */
+
+
+#include "main/glheader.h"
+#include "main/imports.h"
+#include "main/macros.h"
+#include "main/imports.h"
+
+#include "m_vector.h"
+
+
+
+/**
+ * Reset one column of a GLfloat[4] vector to its default value: 0 for
+ * elt 0..2, 1.0 for elt 3.  The first \p count rows beginning at
+ * vec->start are written, then the matching VEC_DIRTY_<elt> flag is cleared.
+ */
+void
+_mesa_vector4f_clean_elem( GLvector4f *vec, GLuint count, GLuint elt )
+{
+   static const GLfloat default_value[4] = { 0, 0, 0, 1 };
+   static const GLubyte dirty_bit[4] = {
+      VEC_DIRTY_0, VEC_DIRTY_1, VEC_DIRTY_2, VEC_DIRTY_3
+   };
+   const GLfloat fill = default_value[elt];
+   GLfloat *column = vec->start + elt;
+   GLuint remaining;
+
+   /* Rows are addressed as packed GLfloat[4]; vec->stride is not used. */
+   for (remaining = count; remaining > 0; remaining--, column += 4)
+      *column = fill;
+
+   /* Column now holds its default, so it no longer counts as dirty. */
+   vec->flags &= ~dirty_bit[elt];
+}
+
+
+static const GLubyte size_bits[5] = {
+   0,
+   VEC_SIZE_1,
+   VEC_SIZE_2,
+   VEC_SIZE_3,
+   VEC_SIZE_4,
+};
+
+
+/**
+ * Set up a GLvector4f that wraps caller-provided storage.
+ * \param v  vector object to fill in
+ * \param flags  VEC_* flag bits; all four size bits are OR'ed in as well
+ * \param storage  array of GLfloat[4] that backs the vector
+ */
+void
+_mesa_vector4f_init( GLvector4f *v, GLbitfield flags, GLfloat (*storage)[4] )
+{
+   v->data = storage;
+   v->start = (GLfloat *) storage;
+   v->count = 0;
+   v->size = 2;   /* may change: 2-4 for vertices and 1-4 for texcoords */
+   v->stride = sizeof(GLfloat) * 4;
+   v->flags = flags | size_bits[4];
+}
+
+
+/**
+ * Set up a GLvector4f backed by freshly allocated, aligned storage.
+ * \param v  vector object to fill in
+ * \param flags  VEC_* flag bits; the size bits and VEC_MALLOC are added
+ * \param count  number of GLfloat[4] elements to allocate
+ * \param alignment  requested alignment of the allocation (in bytes)
+ */
+void
+_mesa_vector4f_alloc( GLvector4f *v, GLbitfield flags, GLuint count,
+                      GLuint alignment )
+{
+   v->storage = ALIGN_MALLOC( count * 4 * sizeof(GLfloat), alignment );
+   v->storage_count = count;
+   v->data = (GLfloat (*)[4]) v->storage;
+   v->start = (GLfloat *) v->data;
+   v->count = 0;
+   v->size = 2;
+   v->stride = sizeof(GLfloat) * 4;
+   v->flags = VEC_MALLOC | flags | size_bits[4];
+}
+
+
+/**
+ * Release the vector's self-allocated storage, if any.  Acts only when
+ * VEC_MALLOC is set; the GLvector4f object itself is never freed and
+ * storage_count is left as it was.
+ */
+void
+_mesa_vector4f_free( GLvector4f *v )
+{
+   if ((v->flags & VEC_MALLOC) == 0)
+      return;
+   ALIGN_FREE( v->storage );
+   v->storage = NULL;
+   v->start = NULL;
+   v->data = NULL;
+   v->flags &= ~VEC_MALLOC;
+}
+
+
+/**
+ * For debugging: print the vector's elements, then check that each column
+ * at or beyond v->size whose dirty bit is clear really holds (0, 0, 0, 1). */
+void
+_mesa_vector4f_print( const GLvector4f *v, const GLubyte *cullmask,
+                      GLboolean culling )
+{
+   static const GLfloat c[4] = { 0, 0, 0, 1 };	/* expected value of a clean column */
+   static const char *templates[5] = {		/* one format per v->size, 0..4 */
+      "%d:\t0, 0, 0, 1\n",
+      "%d:\t%f, 0, 0, 1\n",
+      "%d:\t%f, %f, 0, 1\n",
+      "%d:\t%f, %f, %f, 1\n",
+      "%d:\t%f, %f, %f, %f\n"
+   };
+
+   const char *t = templates[v->size];
+   GLfloat *d = (GLfloat *)v->data;
+   GLuint j, i = 0, count;
+
+   _mesa_printf("data-start\n");
+   for (; d != v->start; STRIDE_F(d, v->stride), i++)	/* elements before v->start */
+      _mesa_printf(t, i, d[0], d[1], d[2], d[3]);
+
+   _mesa_printf("start-count(%u)\n", v->count);
+   count = i + v->count;	/* total elements counted from v->data */
+
+   if (culling) {
+      for (; i < count; STRIDE_F(d, v->stride), i++)
+	 if (cullmask[i])	/* only entries with a nonzero mask byte are printed */
+	    _mesa_printf(t, i, d[0], d[1], d[2], d[3]);
+   }
+   else {
+      for (; i < count; STRIDE_F(d, v->stride), i++)
+	 _mesa_printf(t, i, d[0], d[1], d[2], d[3]);
+   }
+
+   for (j = v->size; j < 4; j++) {
+      if ((v->flags & (1<<j)) == 0) {	/* bit j is VEC_DIRTY_j; clear means "clean" */
+
+	 _mesa_printf("checking col %u is clean as advertised ", j);
+
+	 for (i = 0, d = (GLfloat *) v->data;
+	      i < count && d[j] == c[j];
+	      i++, STRIDE_F(d, v->stride)) {
+            /* no-op */
+         }
+
+	 if (i == count)	/* loop ran to the end: every row matched */
+	    _mesa_printf(" --> ok\n");
+	 else
+	    _mesa_printf(" --> Failed at %u ******\n", i);
+      }
+   }
+}
diff --git a/mesalib/src/mesa/math/m_vector.h b/mesalib/src/mesa/math/m_vector.h
new file mode 100644
index 000000000..71281d575
--- /dev/null
+++ b/mesalib/src/mesa/math/m_vector.h
@@ -0,0 +1,92 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.3
+ *
+ * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * New (3.1) transformation code written by Keith Whitwell.
+ */
+
+
+#ifndef _M_VECTOR_H_
+#define _M_VECTOR_H_
+
+#include "main/glheader.h"
+
+
+#define VEC_DIRTY_0        0x1
+#define VEC_DIRTY_1        0x2
+#define VEC_DIRTY_2        0x4
+#define VEC_DIRTY_3        0x8
+#define VEC_MALLOC         0x10 /* storage field points to self-allocated mem*/
+#define VEC_NOT_WRITEABLE  0x40	/* writable elements to hold clipped data */
+#define VEC_BAD_STRIDE     0x100 /* matches tnl's preferred stride */
+/* NOTE(review): the last two comments read as the opposite of the flag names; confirm. */
+/* VEC_SIZE_n: the dirty bits of components 0 .. n-1 OR'ed together. */
+#define VEC_SIZE_1   VEC_DIRTY_0
+#define VEC_SIZE_2   (VEC_DIRTY_0|VEC_DIRTY_1)
+#define VEC_SIZE_3   (VEC_DIRTY_0|VEC_DIRTY_1|VEC_DIRTY_2)
+#define VEC_SIZE_4   (VEC_DIRTY_0|VEC_DIRTY_1|VEC_DIRTY_2|VEC_DIRTY_3)
+
+
+
+/**
+ * Wrap all the information about vectors up in a struct.  Has
+ * additional fields compared to the other vectors to help us keep track
+ * of different vertex sizes, and whether we need to clean columns out
+ * because they contain non-(0,0,0,1) values.
+ *
+ * The start field is used to reserve data for copied vertices at the
+ * end of _mesa_transform_vb, and avoids the need for a multiplication in
+ * the transformation routines.
+ */
+typedef struct {
+   GLfloat (*data)[4];	/**< may be malloc'd or point to client data */
+   GLfloat *start;	/**< points somewhere inside of <data> */
+   GLuint count;	/**< size of the vector (in elements) */
+   GLuint stride;	/**< stride from one element to the next (in bytes) */
+   GLuint size;		/**< 2-4 for vertices and 1-4 for texcoords */
+   GLbitfield flags;	/**< bitmask of VEC_x flags */
+   void *storage;	/**< self-allocated storage */
+   GLuint storage_count; /**< storage size in elements */
+} GLvector4f;
+
+
+extern void _mesa_vector4f_init( GLvector4f *v, GLbitfield flags,
+			      GLfloat (*storage)[4] );
+extern void _mesa_vector4f_alloc( GLvector4f *v, GLbitfield flags,
+			       GLuint count, GLuint alignment );
+extern void _mesa_vector4f_free( GLvector4f *v );
+extern void _mesa_vector4f_print( const GLvector4f *v, const GLubyte *, GLboolean );
+extern void _mesa_vector4f_clean_elem( GLvector4f *vec, GLuint nr, GLuint elt );
+
+
+/**
+ * Given vector <v>, return a pointer (cast to <type *>) to the <i>-th element,
+ * computed as a byte offset of i * v->stride from v->data.
+ * End up doing a lot of slow imuls if not careful.
+ */
+#define VEC_ELT( v, type, i ) \
+       ( (type *)  ( ((GLbyte *) ((v)->data)) + (i) * (v)->stride) )
+
+
+#endif
diff --git a/mesalib/src/mesa/math/m_xform.c b/mesalib/src/mesa/math/m_xform.c
new file mode 100644
index 000000000..369f2c6e9
--- /dev/null
+++ b/mesalib/src/mesa/math/m_xform.c
@@ -0,0 +1,128 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  5.1
+ *
+ * Copyright (C) 1999-2003  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+/*
+ * Matrix/vertex/vector transformation stuff
+ *
+ *
+ * NOTES:
+ * 1. 4x4 transformation matrices are stored in memory in column major order.
+ * 2. Points/vertices are to be thought of as column vectors.
+ * 3. Transformation of a point p by a matrix M is: p' = M * p
+ */
+
+#include "main/glheader.h"
+#include "main/macros.h"
+
+#include "m_eval.h"
+#include "m_matrix.h"
+#include "m_translate.h"
+#include "m_xform.h"
+
+
+#ifdef DEBUG_MATH
+#include "m_debug.h"
+#endif
+
+#ifdef USE_X86_ASM
+#include "x86/common_x86_asm.h"
+#endif
+
+#ifdef USE_X86_64_ASM
+#include "x86-64/x86-64.h"
+#endif
+
+#ifdef USE_SPARC_ASM
+#include "sparc/sparc.h"
+#endif
+
+#ifdef USE_PPC_ASM
+#include "ppc/common_ppc_features.h"
+#endif
+/* Dispatch tables; sizes must stay in sync with the extern declarations in m_xform.h. */
+clip_func _mesa_clip_tab[5];
+clip_func _mesa_clip_np_tab[5];
+dotprod_func _mesa_dotprod_tab[5];
+vec_copy_func _mesa_copy_tab[0x10];
+normal_func _mesa_normal_tab[0xf];
+transform_func *_mesa_transform_tab[5];
+
+
+/* Raw data format used for:
+ *    - Object-to-eye transform prior to culling, although this too
+ *      could be culled under some circumstances.
+ *    - Eye-to-clip transform (via the function above).
+ *    - Cliptesting
+ *    - And everything else too, if culling happens to be disabled.
+ *
+ * GH: It's used for everything now, as clipping/culling is done
+ *     elsewhere (most often by the driver itself).
+ */
+#define TAG(x) x   /* this instantiation keeps the template's base names unchanged */
+#define TAG2(x,y) x##y
+#define STRIDE_LOOP for ( i = 0 ; i < count ; i++, STRIDE_F(from, stride) )   /* needs locals i, count, from, stride */
+#define LOOP for ( i = 0 ; i < n ; i++ )
+#define ARGS
+#include "m_xform_tmp.h"   /* the transform_points1..4_* kernels */
+#include "m_clip_tmp.h"
+#include "m_norm_tmp.h"
+#include "m_dotprod_tmp.h"
+#include "m_copy_tmp.h"
+#undef TAG
+#undef TAG2
+#undef LOOP
+#undef ARGS   /* NOTE(review): STRIDE_LOOP is left defined; harmless here, but not symmetric */
+
+
+/*
+ * This is called only once.  It initializes several tables with pointers
+ * to optimized transformation functions.  This is where we can test for
+ * AMD 3Dnow! capability, Intel SSE, etc. and hook in the right code.
+ */
+void
+_math_init_transformation( void )
+{
+   init_c_transformations();   /* the C set-up calls run first; any asm set-up below runs afterwards */
+   init_c_norm_transform();
+   init_c_cliptest();
+   init_copy0();
+   init_dotprod();
+
+#ifdef DEBUG_MATH
+   _math_test_all_transform_functions( "default" );   /* runs before any asm back end is initialised */
+   _math_test_all_normal_transform_functions( "default" );
+   _math_test_all_cliptest_functions( "default" );
+#endif
+
+#ifdef USE_X86_ASM   /* #elif chain: at most one back end is initialised, first match wins */
+   _mesa_init_all_x86_transform_asm();
+#elif defined( USE_SPARC_ASM )
+   _mesa_init_all_sparc_transform_asm();
+#elif defined( USE_PPC_ASM )
+   _mesa_init_all_ppc_transform_asm();
+#elif defined( USE_X86_64_ASM )
+   _mesa_init_all_x86_64_transform_asm();
+#endif
+}
diff --git a/mesalib/src/mesa/math/m_xform.h b/mesalib/src/mesa/math/m_xform.h
new file mode 100644
index 000000000..7ef76e0b9
--- /dev/null
+++ b/mesalib/src/mesa/math/m_xform.h
@@ -0,0 +1,166 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.3
+ *
+ * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+#ifndef _M_XFORM_H
+#define _M_XFORM_H
+
+
+#include "main/glheader.h"
+#include "main/config.h"
+#include "math/m_vector.h"
+#include "math/m_matrix.h"
+/* Decoration for transform/clip functions and pointers to them; x86 asm builds map it to _ASMAPI/_ASMAPIP. */
+#ifdef USE_X86_ASM
+#define _XFORMAPI _ASMAPI
+#define _XFORMAPIP _ASMAPIP
+#else
+#define _XFORMAPI
+#define _XFORMAPIP *
+#endif
+
+/* One-time initialisation of the function tables declared at the end of this header. */
+extern void
+_math_init_transformation(void);
+
+
+/* KW: Clip functions now do projective divide as well.  The projected
+ * coordinates are very useful to us because they let us cull
+ * backfaces and eliminate vertices from lighting, fogging, etc
+ * calculations.  Despite the fact that this divide could be done one
+ * day in hardware, we would still have a reason to want to do it here
+ * as long as those other calculations remain in software.
+ *
+ * Clipping is a convenient place to do the divide on x86 as it should be
+ * possible to overlap with integer outcode calculations.
+ *
+ * There are two cases where we wouldn't want to do the divide in cliptest:
+ *    - When we aren't clipping.  We still might want to cull backfaces
+ *      so the divide should be done elsewhere.  This currently never
+ *      happens.
+ *
+ *    - When culling isn't likely to help us, such as when the GL culling
+ *      is disabled and we are not lighting or are only lighting
+ *      one-sided.  In this situation, backface determination provides
+ *      us with no useful information.  A tricky case to detect is when
+ *      all input data is already culled, although hopefully the
+ *      application wouldn't turn on culling in such cases.
+ *
+ * We supply a buffer to hold the [x/w,y/w,z/w,1/w] values which
+ * are the result of the projection.  This is only used in the
+ * 4-vector case - in other cases, we just use the clip coordinates
+ * as the projected coordinates - they are identical.
+ *
+ * This is doubly convenient because it means the Win[] array is now
+ * of the same stride as all the others, so I can now turn map_vertices
+ * into a straight-forward matrix transformation, with asm acceleration
+ * automatically available.
+ */
+
+/* Vertex buffer clipping flags.  For the six frustum planes,
+ * CLIP_x_BIT == 1 << CLIP_x_SHIFT; CLIP_FRUSTUM_BITS is their union. */
+#define CLIP_RIGHT_SHIFT 	0
+#define CLIP_LEFT_SHIFT 	1
+#define CLIP_TOP_SHIFT  	2
+#define CLIP_BOTTOM_SHIFT       3
+#define CLIP_NEAR_SHIFT  	4
+#define CLIP_FAR_SHIFT  	5
+
+#define CLIP_RIGHT_BIT   0x01
+#define CLIP_LEFT_BIT    0x02
+#define CLIP_TOP_BIT     0x04
+#define CLIP_BOTTOM_BIT  0x08
+#define CLIP_NEAR_BIT    0x10
+#define CLIP_FAR_BIT     0x20
+#define CLIP_USER_BIT    0x40
+#define CLIP_CULL_BIT    0x80
+#define CLIP_FRUSTUM_BITS    0x3f
+
+/* Clip-test routine: takes clip and projected vectors, a mask array (presumably one byte per vertex) and OR/AND masks. */
+typedef GLvector4f * (_XFORMAPIP clip_func)( GLvector4f *vClip,
+					     GLvector4f *vProj,
+					     GLubyte clipMask[],
+					     GLubyte *orMask,
+					     GLubyte *andMask );
+/* Presumably the dot product of each coordinate with 'plane', written to 'out' -- inferred from names, confirm. */
+typedef void (*dotprod_func)( GLfloat *out,
+			      GLuint out_stride,
+			      CONST GLvector4f *coord_vec,
+			      CONST GLfloat plane[4] );
+/* Vector copy routine; the entries of _mesa_copy_tab below have this type. */
+typedef void (*vec_copy_func)( GLvector4f *to,
+			       CONST GLvector4f *from );
+
+
+
+/*
+ * Functions for transformation of normals in the VB.
+ * NOTE(review): _NORMAPIP is not defined in this header; presumably main/glheader.h supplies it. */
+typedef void (_NORMAPIP normal_func)( CONST GLmatrix *mat,
+				      GLfloat scale,
+				      CONST GLvector4f *in,
+				      CONST GLfloat lengths[],
+				      GLvector4f *dest );
+
+
+/* Flags for selecting a normal transformation function.
+ */
+#define NORM_RESCALE   0x1		/* apply the scale factor */
+#define NORM_NORMALIZE 0x2		/* normalize */
+#define NORM_TRANSFORM 0x4		/* apply the transformation matrix */
+#define NORM_TRANSFORM_NO_ROT 0x8	/* apply the matrix, no-rotation variant (presumably -- confirm) */
+
+
+
+
+/* KW: Earlier versions of the transform function took a mask array
+ *     so that individual vectors could be skipped when their mask
+ *     byte was zero.  NOTE(review): the typedef below has no mask
+ *     parameter any more; only (to_vec, m, from_vec) remain.
+ */
+typedef void (_XFORMAPIP transform_func)( GLvector4f *to_vec,
+					  CONST GLfloat m[16],
+					  CONST GLvector4f *from_vec );
+
+
+extern dotprod_func  _mesa_dotprod_tab[5];
+extern vec_copy_func _mesa_copy_tab[0x10];
+extern vec_copy_func _mesa_copy_clean_tab[5];   /* NOTE(review): no definition in m_xform.c -- confirm one exists */
+extern clip_func     _mesa_clip_tab[5];
+extern clip_func     _mesa_clip_np_tab[5];
+extern normal_func   _mesa_normal_tab[0xf];
+
+/* Use of 2 layers of linked 1-dimensional arrays to reduce
+ * cost of lookup.
+ */
+extern transform_func *_mesa_transform_tab[5];   /* TransformRaw indexes this as [vector size][matrix type] */
+
+
+/* Runs the table entry chosen by (from->size, mat->type) on 'from', writing into 'to'; evaluates to 'to'. */
+#define TransformRaw( to, mat, from ) \
+   ( _mesa_transform_tab[(from)->size][(mat)->type]( to, (mat)->m, from ), \
+     (to) )
+
+
+#endif
diff --git a/mesalib/src/mesa/math/m_xform_tmp.h b/mesalib/src/mesa/math/m_xform_tmp.h
new file mode 100644
index 000000000..e93837725
--- /dev/null
+++ b/mesalib/src/mesa/math/m_xform_tmp.h
@@ -0,0 +1,810 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * New (3.1) transformation code written by Keith Whitwell.
+ */
+
+
+/*----------------------------------------------------------------------
+ * Begin Keith's new code
+ *
+ *----------------------------------------------------------------------
+ */
+
+/* KW: Fixed stride, now measured in bytes as is the OpenGL array stride.
+ */
+
+/* KW: These are now parameterized to produce two versions, one
+ *     which transforms all incoming points, and a second which
+ *     takes notice of a cullmask array, and only transforms
+ *     unculled vertices.
+ */
+
+/* KW: 1-vectors can sneak into the texture pipeline via the array
+ *     interface.  These functions are here because I want consistent
+ *     treatment of the vertex sizes and a lazy strategy for
+ *     cleaning unused parts of the vector, and so as not to exclude
+ *     them from the vertex array interface.
+ *
+ *     Under our current analysis of matrices, there is no way that
+ *     the product of a matrix and a 1-vector can remain a 1-vector,
+ *     with the exception of the identity transform.
+ */
+
+/* KW: No longer zero-pad outgoing vectors.  Now that external
+ *     vectors can get into the pipeline we cannot ever assume
+ *     that there is more to a vector than indicated by its
+ *     size.
+ */
+
+/* KW: Now uses clipmask and a flag to allow us to skip both/either
+ *     clipped and/or culled vertices.
+ */
+
+/* GH: Not any more -- it's easier (and faster) to just process the
+ *     entire vector.  Clipping and culling are handled further down
+ *     the pipe, most often during or after the conversion to some
+ *     driver-specific vertex format.
+ */
+/* 1-component input, general matrix: to = M * (x, 0, 0, 1); all four outputs are written. */
+static void _XFORMAPI
+TAG(transform_points1_general)( GLvector4f *to_vec,
+				const GLfloat m[16],
+				const GLvector4f *from_vec )
+{
+   const GLuint stride = from_vec->stride;
+   GLfloat *from = from_vec->start;
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLuint count = from_vec->count;
+   const GLfloat m0 = m[0],  m12 = m[12];
+   const GLfloat m1 = m[1],  m13 = m[13];
+   const GLfloat m2 = m[2],  m14 = m[14];
+   const GLfloat m3 = m[3],  m15 = m[15];
+   GLuint i;
+   STRIDE_LOOP {   /* includer-supplied macro; relies on the locals i, count, from, stride */
+      const GLfloat ox = from[0];
+      to[i][0] = m0 * ox + m12;
+      to[i][1] = m1 * ox + m13;
+      to[i][2] = m2 * ox + m14;
+      to[i][3] = m3 * ox + m15;
+   }
+   to_vec->size = 4;
+   to_vec->flags |= VEC_SIZE_4;
+   to_vec->count = from_vec->count;
+}
+/* 1-component input, identity matrix: copies x only; returns at once when transforming in place. */
+static void _XFORMAPI
+TAG(transform_points1_identity)( GLvector4f *to_vec,
+				 const GLfloat m[16],
+				 const GLvector4f *from_vec )
+{
+   const GLuint stride = from_vec->stride;
+   GLfloat *from = from_vec->start;
+   GLuint count = from_vec->count;
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLuint i;
+   (void) m;
+   if (to_vec == from_vec) return;   /* same vector object: nothing to copy or update */
+   STRIDE_LOOP {
+      to[i][0] = from[0];
+   }
+   to_vec->size = 1;
+   to_vec->flags |= VEC_SIZE_1;
+   to_vec->count = from_vec->count;
+}
+/* 1-component input, 2D matrix: x' = m0*x + m12, y' = m1*x + m13; output size 2. */
+static void _XFORMAPI
+TAG(transform_points1_2d)( GLvector4f *to_vec,
+			   const GLfloat m[16],
+			   const GLvector4f *from_vec )
+{
+   const GLuint stride = from_vec->stride;
+   GLfloat *from = from_vec->start;
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLuint count = from_vec->count;
+   const GLfloat m0 = m[0], m1 = m[1];
+   const GLfloat m12 = m[12], m13 = m[13];
+   GLuint i;
+   STRIDE_LOOP {
+      const GLfloat ox = from[0];
+      to[i][0] = m0 * ox + m12;
+      to[i][1] = m1 * ox + m13;
+   }
+   to_vec->size = 2;
+   to_vec->flags |= VEC_SIZE_2;
+   to_vec->count = from_vec->count;
+}
+/* 1-component input, 2D matrix without rotation: x' = m0*x + m12, y' is the constant m13; output size 2. */
+static void _XFORMAPI
+TAG(transform_points1_2d_no_rot)( GLvector4f *to_vec,
+				  const GLfloat m[16],
+				  const GLvector4f *from_vec )
+{
+   const GLuint stride = from_vec->stride;
+   GLfloat *from = from_vec->start;
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLuint count = from_vec->count;
+   const GLfloat m0 = m[0], m12 = m[12], m13 = m[13];
+   GLuint i;
+   STRIDE_LOOP {
+      const GLfloat ox = from[0];
+      to[i][0] = m0 * ox + m12;
+      to[i][1] =           m13;
+   }
+   to_vec->size = 2;
+   to_vec->flags |= VEC_SIZE_2;
+   to_vec->count = from_vec->count;
+}
+/* 1-component input, 3D matrix: (m0, m1, m2) * x + (m12, m13, m14); output size 3. */
+static void _XFORMAPI
+TAG(transform_points1_3d)( GLvector4f *to_vec,
+			   const GLfloat m[16],
+			   const GLvector4f *from_vec )
+{
+   const GLuint stride = from_vec->stride;
+   GLfloat *from = from_vec->start;
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLuint count = from_vec->count;
+   const GLfloat m0 = m[0], m1 = m[1], m2 = m[2];
+   const GLfloat m12 = m[12], m13 = m[13], m14 = m[14];
+   GLuint i;
+   STRIDE_LOOP {
+      const GLfloat ox = from[0];
+      to[i][0] = m0 * ox + m12;
+      to[i][1] = m1 * ox + m13;
+      to[i][2] = m2 * ox + m14;
+   }
+   to_vec->size = 3;
+   to_vec->flags |= VEC_SIZE_3;
+   to_vec->count = from_vec->count;
+}
+
+/* 1-component input, 3D matrix without rotation: x' = m0*x + m12; y', z' are the constants m13, m14. */
+static void _XFORMAPI
+TAG(transform_points1_3d_no_rot)( GLvector4f *to_vec,
+				  const GLfloat m[16],
+				  const GLvector4f *from_vec )
+{
+   const GLuint stride = from_vec->stride;
+   GLfloat *from = from_vec->start;
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLuint count = from_vec->count;
+   const GLfloat m0 = m[0];
+   const GLfloat m12 = m[12], m13 = m[13], m14 = m[14];
+   GLuint i;
+   STRIDE_LOOP {
+      const GLfloat ox = from[0];
+      to[i][0] = m0 * ox           + m12;
+      to[i][1] =                     m13;
+      to[i][2] =                     m14;
+   }
+   to_vec->size = 3;
+   to_vec->flags |= VEC_SIZE_3;
+   to_vec->count = from_vec->count;
+}
+/* 1-component input, perspective matrix: x' = m0*x, y' = 0, z' = m14, w' = 0; output size 4. */
+static void _XFORMAPI
+TAG(transform_points1_perspective)( GLvector4f *to_vec,
+				    const GLfloat m[16],
+				    const GLvector4f *from_vec )
+{
+   const GLuint stride = from_vec->stride;
+   GLfloat *from = from_vec->start;
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLuint count = from_vec->count;
+   const GLfloat m0 = m[0], m14 = m[14];
+   GLuint i;
+   STRIDE_LOOP {
+      const GLfloat ox = from[0];
+      to[i][0] = m0 * ox                ;
+      to[i][1] =           0            ;
+      to[i][2] =                     m14;
+      to[i][3] = 0;
+   }
+   to_vec->size = 4;
+   to_vec->flags |= VEC_SIZE_4;
+   to_vec->count = from_vec->count;
+}
+
+
+
+
+/* 2-vectors, which are a lot more relevant than 1-vectors, are
+ * present early in the geometry pipeline and throughout the
+ * texture pipeline.  This one handles a general matrix:
+ * to = M * (x, y, 0, 1), all four outputs written. */
+static void _XFORMAPI
+TAG(transform_points2_general)( GLvector4f *to_vec,
+				const GLfloat m[16],
+				const GLvector4f *from_vec )
+{
+   const GLuint stride = from_vec->stride;
+   GLfloat *from = from_vec->start;
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLuint count = from_vec->count;
+   const GLfloat m0 = m[0],  m4 = m[4],  m12 = m[12];
+   const GLfloat m1 = m[1],  m5 = m[5],  m13 = m[13];
+   const GLfloat m2 = m[2],  m6 = m[6],  m14 = m[14];
+   const GLfloat m3 = m[3],  m7 = m[7],  m15 = m[15];
+   GLuint i;
+   STRIDE_LOOP {   /* includer-supplied macro; relies on the locals i, count, from, stride */
+      const GLfloat ox = from[0], oy = from[1];
+      to[i][0] = m0 * ox + m4 * oy + m12;
+      to[i][1] = m1 * ox + m5 * oy + m13;
+      to[i][2] = m2 * ox + m6 * oy + m14;
+      to[i][3] = m3 * ox + m7 * oy + m15;
+   }
+   to_vec->size = 4;
+   to_vec->flags |= VEC_SIZE_4;
+   to_vec->count = from_vec->count;
+}
+/* 2-component input, identity matrix: copies x and y; returns at once when transforming in place. */
+static void _XFORMAPI
+TAG(transform_points2_identity)( GLvector4f *to_vec,
+				 const GLfloat m[16],
+				 const GLvector4f *from_vec )
+{
+   const GLuint stride = from_vec->stride;
+   GLfloat *from = from_vec->start;
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLuint count = from_vec->count;
+   GLuint i;
+   (void) m;
+   if (to_vec == from_vec) return;   /* same vector object: nothing to copy or update */
+   STRIDE_LOOP {
+      to[i][0] = from[0];
+      to[i][1] = from[1];
+   }
+   to_vec->size = 2;
+   to_vec->flags |= VEC_SIZE_2;
+   to_vec->count = from_vec->count;
+}
+/* 2-component input, 2D matrix: x' = m0*x + m4*y + m12, y' = m1*x + m5*y + m13; output size 2. */
+static void _XFORMAPI
+TAG(transform_points2_2d)( GLvector4f *to_vec,
+			   const GLfloat m[16],
+			   const GLvector4f *from_vec )
+{
+   const GLuint stride = from_vec->stride;
+   GLfloat *from = from_vec->start;
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLuint count = from_vec->count;
+   const GLfloat m0 = m[0], m1 = m[1], m4 = m[4], m5 = m[5];
+   const GLfloat m12 = m[12], m13 = m[13];
+   GLuint i;
+   STRIDE_LOOP {
+      const GLfloat ox = from[0], oy = from[1];
+      to[i][0] = m0 * ox + m4 * oy + m12;
+      to[i][1] = m1 * ox + m5 * oy + m13;
+   }
+   to_vec->size = 2;
+   to_vec->flags |= VEC_SIZE_2;
+   to_vec->count = from_vec->count;
+}
+/* 2-component input, 2D matrix without rotation: x' = m0*x + m12, y' = m5*y + m13; output size 2. */
+static void _XFORMAPI
+TAG(transform_points2_2d_no_rot)( GLvector4f *to_vec,
+				  const GLfloat m[16],
+				  const GLvector4f *from_vec )
+{
+   const GLuint stride = from_vec->stride;
+   GLfloat *from = from_vec->start;
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLuint count = from_vec->count;
+   const GLfloat m0 = m[0], m5 = m[5], m12 = m[12], m13 = m[13];
+   GLuint i;
+   STRIDE_LOOP {
+      const GLfloat ox = from[0], oy = from[1];
+      to[i][0] = m0 * ox           + m12;
+      to[i][1] =           m5 * oy + m13;
+   }
+   to_vec->size = 2;
+   to_vec->flags |= VEC_SIZE_2;
+   to_vec->count = from_vec->count;
+}
+/* 2-component input, 3D matrix: first three rows of M applied to (x, y, 0, 1); output size 3. */
+static void _XFORMAPI
+TAG(transform_points2_3d)( GLvector4f *to_vec,
+			   const GLfloat m[16],
+			   const GLvector4f *from_vec )
+{
+   const GLuint stride = from_vec->stride;
+   GLfloat *from = from_vec->start;
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLuint count = from_vec->count;
+   const GLfloat m0 = m[0], m1 = m[1], m2 = m[2], m4 = m[4], m5 = m[5];
+   const GLfloat m6 = m[6], m12 = m[12], m13 = m[13], m14 = m[14];
+   GLuint i;
+   STRIDE_LOOP {
+      const GLfloat ox = from[0], oy = from[1];
+      to[i][0] = m0 * ox + m4 * oy + m12;
+      to[i][1] = m1 * ox + m5 * oy + m13;
+      to[i][2] = m2 * ox + m6 * oy + m14;
+   }
+   to_vec->size = 3;
+   to_vec->flags |= VEC_SIZE_3;
+   to_vec->count = from_vec->count;
+}
+
+
+/* I would actually say this was a fairly important function, from
+ * a texture transformation point of view.  Scales/translates x and y,
+ * sets z to m14; reports size 2 when m14 is zero, else size 3. */
+static void _XFORMAPI
+TAG(transform_points2_3d_no_rot)( GLvector4f *to_vec,
+				  const GLfloat m[16],
+				  const GLvector4f *from_vec )
+{
+   const GLuint stride = from_vec->stride;
+   GLfloat *from = from_vec->start;
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLuint count = from_vec->count;
+   const GLfloat m0 = m[0], m5 = m[5];
+   const GLfloat m12 = m[12], m13 = m[13], m14 = m[14];
+   GLuint i;
+   STRIDE_LOOP {
+      const GLfloat ox = from[0], oy = from[1];
+      to[i][0] = m0 * ox           + m12;
+      to[i][1] =           m5 * oy + m13;
+      to[i][2] =                     m14;
+   }
+   if (m14 == 0) {   /* z was still written above, but the result is reported as a 2-vector */
+      to_vec->size = 2;
+      to_vec->flags |= VEC_SIZE_2;
+   } else {
+      to_vec->size = 3;
+      to_vec->flags |= VEC_SIZE_3;
+   }
+   to_vec->count = from_vec->count;
+}
+
+/* 2-component input, perspective matrix: x' = m0*x, y' = m5*y, z' = m14, w' = 0; output size 4. */
+static void _XFORMAPI
+TAG(transform_points2_perspective)( GLvector4f *to_vec,
+				    const GLfloat m[16],
+				    const GLvector4f *from_vec )
+{
+   const GLuint stride = from_vec->stride;
+   GLfloat *from = from_vec->start;
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLuint count = from_vec->count;
+   const GLfloat m0 = m[0], m5 = m[5], m14 = m[14];
+   GLuint i;
+   STRIDE_LOOP {
+      const GLfloat ox = from[0], oy = from[1];
+      to[i][0] = m0 * ox                ;
+      to[i][1] =           m5 * oy      ;
+      to[i][2] =                     m14;
+      to[i][3] = 0;
+   }
+   to_vec->size = 4;
+   to_vec->flags |= VEC_SIZE_4;
+   to_vec->count = from_vec->count;
+}
+
+
+/* 3-component input, general matrix: to = M * (x, y, z, 1); all four outputs are written. */
+static void _XFORMAPI
+TAG(transform_points3_general)( GLvector4f *to_vec,
+				const GLfloat m[16],
+				const GLvector4f *from_vec )
+{
+   const GLuint stride = from_vec->stride;
+   GLfloat *from = from_vec->start;
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLuint count = from_vec->count;
+   const GLfloat m0 = m[0],  m4 = m[4],  m8 = m[8],  m12 = m[12];
+   const GLfloat m1 = m[1],  m5 = m[5],  m9 = m[9],  m13 = m[13];
+   const GLfloat m2 = m[2],  m6 = m[6],  m10 = m[10],  m14 = m[14];
+   const GLfloat m3 = m[3],  m7 = m[7],  m11 = m[11],  m15 = m[15];
+   GLuint i;
+   STRIDE_LOOP {   /* includer-supplied macro; relies on the locals i, count, from, stride */
+      const GLfloat ox = from[0], oy = from[1], oz = from[2];
+      to[i][0] = m0 * ox + m4 * oy + m8  * oz + m12;
+      to[i][1] = m1 * ox + m5 * oy + m9  * oz + m13;
+      to[i][2] = m2 * ox + m6 * oy + m10 * oz + m14;
+      to[i][3] = m3 * ox + m7 * oy + m11 * oz + m15;
+   }
+   to_vec->size = 4;
+   to_vec->flags |= VEC_SIZE_4;
+   to_vec->count = from_vec->count;
+}
+/* 3-component input, identity matrix: copies x, y, z; returns at once when transforming in place. */
+static void _XFORMAPI
+TAG(transform_points3_identity)( GLvector4f *to_vec,
+				 const GLfloat m[16],
+				 const GLvector4f *from_vec )
+{
+   const GLuint stride = from_vec->stride;
+   GLfloat *from = from_vec->start;
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLuint count = from_vec->count;
+   GLuint i;
+   (void) m;
+   if (to_vec == from_vec) return;   /* same vector object: nothing to copy or update */
+   STRIDE_LOOP {
+      to[i][0] = from[0];
+      to[i][1] = from[1];
+      to[i][2] = from[2];
+   }
+   to_vec->size = 3;
+   to_vec->flags |= VEC_SIZE_3;
+   to_vec->count = from_vec->count;
+}
+/* 3-component input, 2D matrix: x, y transformed via m0, m1, m4, m5, m12, m13; z is copied through. */
+static void _XFORMAPI
+TAG(transform_points3_2d)( GLvector4f *to_vec,
+			   const GLfloat m[16],
+			   const GLvector4f *from_vec )
+{
+   const GLuint stride = from_vec->stride;
+   GLfloat *from = from_vec->start;
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLuint count = from_vec->count;
+   const GLfloat m0 = m[0], m1 = m[1], m4 = m[4], m5 = m[5];
+   const GLfloat m12 = m[12], m13 = m[13];
+   GLuint i;
+   STRIDE_LOOP {
+      const GLfloat ox = from[0], oy = from[1], oz = from[2];
+      to[i][0] = m0 * ox + m4 * oy            + m12       ;
+      to[i][1] = m1 * ox + m5 * oy            + m13       ;
+      to[i][2] =                   +       oz             ;
+   }
+   to_vec->size = 3;
+   to_vec->flags |= VEC_SIZE_3;
+   to_vec->count = from_vec->count;
+}
+/* 3-component input, 2D matrix without rotation: x' = m0*x + m12, y' = m5*y + m13; z is copied through. */
+static void _XFORMAPI
+TAG(transform_points3_2d_no_rot)( GLvector4f *to_vec,
+				  const GLfloat m[16],
+				  const GLvector4f *from_vec )
+{
+   const GLuint stride = from_vec->stride;
+   GLfloat *from = from_vec->start;
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLuint count = from_vec->count;
+   const GLfloat m0 = m[0], m5 = m[5], m12 = m[12], m13 = m[13];
+   GLuint i;
+   STRIDE_LOOP {
+      const GLfloat ox = from[0], oy = from[1], oz = from[2];
+      to[i][0] = m0 * ox                      + m12       ;
+      to[i][1] =           m5 * oy            + m13       ;
+      to[i][2] =                   +       oz             ;
+   }
+   to_vec->size = 3;
+   to_vec->flags |= VEC_SIZE_3;
+   to_vec->count = from_vec->count;
+}
+/* 3-component input, 3D matrix: first three rows of M applied to (x, y, z, 1); output size 3. */
+static void _XFORMAPI
+TAG(transform_points3_3d)( GLvector4f *to_vec,
+			   const GLfloat m[16],
+			   const GLvector4f *from_vec )
+{
+   const GLuint stride = from_vec->stride;
+   GLfloat *from = from_vec->start;
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLuint count = from_vec->count;
+   const GLfloat m0 = m[0], m1 = m[1], m2 = m[2], m4 = m[4], m5 = m[5];
+   const GLfloat m6 = m[6], m8 = m[8], m9 = m[9], m10 = m[10];
+   const GLfloat m12 = m[12], m13 = m[13], m14 = m[14];
+   GLuint i;
+   STRIDE_LOOP {
+      const GLfloat ox = from[0], oy = from[1], oz = from[2];
+      to[i][0] = m0 * ox + m4 * oy +  m8 * oz + m12       ;
+      to[i][1] = m1 * ox + m5 * oy +  m9 * oz + m13       ;
+      to[i][2] = m2 * ox + m6 * oy + m10 * oz + m14       ;
+   }
+   to_vec->size = 3;
+   to_vec->flags |= VEC_SIZE_3;
+   to_vec->count = from_vec->count;
+}
+
+/* Previously known as ortho.  3-component input; per-axis scale
+ * (m0, m5, m10) plus translation (m12, m13, m14); output size 3. */
+static void _XFORMAPI
+TAG(transform_points3_3d_no_rot)( GLvector4f *to_vec,
+				  const GLfloat m[16],
+				  const GLvector4f *from_vec )
+{
+   const GLuint stride = from_vec->stride;
+   GLfloat *from = from_vec->start;
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLuint count = from_vec->count;
+   const GLfloat m0 = m[0], m5 = m[5];
+   const GLfloat m10 = m[10], m12 = m[12], m13 = m[13], m14 = m[14];
+   GLuint i;
+   STRIDE_LOOP {
+      const GLfloat ox = from[0], oy = from[1], oz = from[2];
+      to[i][0] = m0 * ox                      + m12       ;
+      to[i][1] =           m5 * oy            + m13       ;
+      to[i][2] =                     m10 * oz + m14       ;
+   }
+   to_vec->size = 3;
+   to_vec->flags |= VEC_SIZE_3;
+   to_vec->count = from_vec->count;
+}
+/* 3-component input, perspective matrix: only m0, m5, m8, m9, m10, m14 are read; w' = -z; output size 4. */
+static void _XFORMAPI
+TAG(transform_points3_perspective)( GLvector4f *to_vec,
+				    const GLfloat m[16],
+				    const GLvector4f *from_vec )
+{
+   const GLuint stride = from_vec->stride;
+   GLfloat *from = from_vec->start;
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLuint count = from_vec->count;
+   const GLfloat m0 = m[0], m5 = m[5], m8 = m[8], m9 = m[9];
+   const GLfloat m10 = m[10], m14 = m[14];
+   GLuint i;
+   STRIDE_LOOP {
+      const GLfloat ox = from[0], oy = from[1], oz = from[2];
+      to[i][0] = m0 * ox           + m8  * oz       ;
+      to[i][1] =           m5 * oy + m9  * oz       ;
+      to[i][2] =                     m10 * oz + m14 ;
+      to[i][3] =                          -oz       ;
+   }
+   to_vec->size = 4;
+   to_vec->flags |= VEC_SIZE_4;
+   to_vec->count = from_vec->count;
+}
+
+
+/* 4-component input, general matrix: to = M * (x, y, z, w); all four outputs are written. */
+static void _XFORMAPI
+TAG(transform_points4_general)( GLvector4f *to_vec,
+				const GLfloat m[16],
+				const GLvector4f *from_vec )
+{
+   const GLuint stride = from_vec->stride;
+   GLfloat *from = from_vec->start;
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLuint count = from_vec->count;
+   const GLfloat m0 = m[0],  m4 = m[4],  m8 = m[8],  m12 = m[12];
+   const GLfloat m1 = m[1],  m5 = m[5],  m9 = m[9],  m13 = m[13];
+   const GLfloat m2 = m[2],  m6 = m[6],  m10 = m[10],  m14 = m[14];
+   const GLfloat m3 = m[3],  m7 = m[7],  m11 = m[11],  m15 = m[15];
+   GLuint i;
+   STRIDE_LOOP {   /* includer-supplied macro; relies on the locals i, count, from, stride */
+      const GLfloat ox = from[0], oy = from[1], oz = from[2], ow = from[3];
+      to[i][0] = m0 * ox + m4 * oy + m8  * oz + m12 * ow;
+      to[i][1] = m1 * ox + m5 * oy + m9  * oz + m13 * ow;
+      to[i][2] = m2 * ox + m6 * oy + m10 * oz + m14 * ow;
+      to[i][3] = m3 * ox + m7 * oy + m11 * oz + m15 * ow;
+   }
+   to_vec->size = 4;
+   to_vec->flags |= VEC_SIZE_4;
+   to_vec->count = from_vec->count;
+}
+/* 4-component input, identity matrix: copies x, y, z, w; returns at once when transforming in place. */
+static void _XFORMAPI
+TAG(transform_points4_identity)( GLvector4f *to_vec,
+				 const GLfloat m[16],
+				 const GLvector4f *from_vec )
+{
+   const GLuint stride = from_vec->stride;
+   GLfloat *from = from_vec->start;
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLuint count = from_vec->count;
+   GLuint i;
+   (void) m;
+   if (to_vec == from_vec) return;   /* same vector object: nothing to copy or update */
+   STRIDE_LOOP {
+      to[i][0] = from[0];
+      to[i][1] = from[1];
+      to[i][2] = from[2];
+      to[i][3] = from[3];
+   }
+   to_vec->size = 4;
+   to_vec->flags |= VEC_SIZE_4;
+   to_vec->count = from_vec->count;
+}
+/* 4-component input, 2D matrix: x, y transformed with the translation weighted by w; z and w copied through. */
+static void _XFORMAPI
+TAG(transform_points4_2d)( GLvector4f *to_vec,
+			   const GLfloat m[16],
+			   const GLvector4f *from_vec )
+{
+   const GLuint stride = from_vec->stride;
+   GLfloat *from = from_vec->start;
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLuint count = from_vec->count;
+   const GLfloat m0 = m[0], m1 = m[1], m4 = m[4], m5 = m[5];
+   const GLfloat m12 = m[12], m13 = m[13];
+   GLuint i;
+   STRIDE_LOOP {
+      const GLfloat ox = from[0], oy = from[1], oz = from[2], ow = from[3];
+      to[i][0] = m0 * ox + m4 * oy            + m12 * ow;
+      to[i][1] = m1 * ox + m5 * oy            + m13 * ow;
+      to[i][2] =                   +       oz           ;
+      to[i][3] =                                      ow;
+   }
+   to_vec->size = 4;
+   to_vec->flags |= VEC_SIZE_4;
+   to_vec->count = from_vec->count;
+}
+/* 4-component input, 2D matrix without rotation: x' = m0*x + m12*w, y' = m5*y + m13*w; z and w copied through. */
+static void _XFORMAPI
+TAG(transform_points4_2d_no_rot)( GLvector4f *to_vec,
+				  const GLfloat m[16],
+				  const GLvector4f *from_vec )
+{
+   const GLuint stride = from_vec->stride;
+   GLfloat *from = from_vec->start;
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLuint count = from_vec->count;
+   const GLfloat m0 = m[0], m5 = m[5], m12 = m[12], m13 = m[13];
+   GLuint i;
+   STRIDE_LOOP {
+      const GLfloat ox = from[0], oy = from[1], oz = from[2], ow = from[3];
+      to[i][0] = m0 * ox                      + m12 * ow;
+      to[i][1] =           m5 * oy            + m13 * ow;
+      to[i][2] =                   +       oz           ;
+      to[i][3] =                                      ow;
+   }
+   to_vec->size = 4;
+   to_vec->flags |= VEC_SIZE_4;
+   to_vec->count = from_vec->count;
+}
+/* 4-component input, 3D matrix: first three rows of M applied to (x, y, z, w); w is copied through. */
+static void _XFORMAPI
+TAG(transform_points4_3d)( GLvector4f *to_vec,
+			   const GLfloat m[16],
+			   const GLvector4f *from_vec )
+{
+   const GLuint stride = from_vec->stride;
+   GLfloat *from = from_vec->start;
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLuint count = from_vec->count;
+   const GLfloat m0 = m[0], m1 = m[1], m2 = m[2], m4 = m[4], m5 = m[5];
+   const GLfloat m6 = m[6], m8 = m[8], m9 = m[9], m10 = m[10];
+   const GLfloat m12 = m[12], m13 = m[13], m14 = m[14];
+   GLuint i;
+   STRIDE_LOOP {
+      const GLfloat ox = from[0], oy = from[1], oz = from[2], ow = from[3];
+      to[i][0] = m0 * ox + m4 * oy +  m8 * oz + m12 * ow;
+      to[i][1] = m1 * ox + m5 * oy +  m9 * oz + m13 * ow;
+      to[i][2] = m2 * ox + m6 * oy + m10 * oz + m14 * ow;
+      to[i][3] =                                      ow;
+   }
+   to_vec->size = 4;
+   to_vec->flags |= VEC_SIZE_4;
+   to_vec->count = from_vec->count;
+}
+
+/* 4-component points: each of x, y, z is scaled by its own diagonal entry and
+ * offset by w times m[12..14]; w is copied.  Locals: <output><input>. */
+static void _XFORMAPI
+TAG(transform_points4_3d_no_rot)( GLvector4f *to_vec, const GLfloat m[16],
+                                  const GLvector4f *from_vec )
+{
+   const GLfloat xx = m[0],  yy = m[5],  zz = m[10];
+   const GLfloat xw = m[12], yw = m[13], zw = m[14];
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLfloat *from = from_vec->start;
+   const GLuint stride = from_vec->stride;
+   GLuint count = from_vec->count, i;
+   STRIDE_LOOP {   /* external macro; presumably steps i and from */
+      const GLfloat x = from[0], y = from[1], z = from[2], w = from[3];
+      to[i][0] = xx * x + xw * w;
+      to[i][1] = yy * y + yw * w;
+      to[i][2] = zz * z + zw * w;
+      to[i][3] = w;
+   }
+   to_vec->count = from_vec->count;
+   to_vec->size = 4;
+   to_vec->flags |= VEC_SIZE_4;
+}
+
+/* 4-component points: x and y pick up a z term (m[8], m[9]), z is built from
+ * z and w, and the output w is -z.  Locals are named <output><input>. */
+static void _XFORMAPI
+TAG(transform_points4_perspective)( GLvector4f *to_vec, const GLfloat m[16],
+                                    const GLvector4f *from_vec )
+{
+   const GLfloat xx = m[0], xz = m[8], yy = m[5], yz = m[9];
+   const GLfloat zz = m[10], zw = m[14];
+   GLfloat (*to)[4] = (GLfloat (*)[4])to_vec->start;
+   GLfloat *from = from_vec->start;
+   const GLuint stride = from_vec->stride;
+   GLuint count = from_vec->count, i;
+   STRIDE_LOOP {   /* external macro; presumably steps i and from */
+      const GLfloat x = from[0], y = from[1], z = from[2], w = from[3];
+      to[i][0] = xx * x + xz * z;
+      to[i][1] = yy * y + yz * z;
+      to[i][2] = zz * z + zw * w;
+      to[i][3] = -z;
+   }
+   to_vec->count = from_vec->count;
+   to_vec->size = 4;
+   to_vec->flags |= VEC_SIZE_4;
+}
+
+/* One table per input point size (1 to 4 components), each with one slot
+ * for every MATRIX_* type assigned in init_c_transformations. */
+static transform_func TAG(transform_tab_1)[7], TAG(transform_tab_2)[7];
+static transform_func TAG(transform_tab_3)[7], TAG(transform_tab_4)[7];
+
+/* Register the plain C transform routines.
+ *
+ * _mesa_transform_tab[n] is pointed at the table for n-component points
+ * (n = 1..4), and each of those tables gets one routine per MATRIX_* type.
+ * The entries are grouped by matrix type below.
+ *
+ * Similar functions could be called several times, with more highly
+ * optimized routines overwriting the arrays.  This only occurs during
+ * startup.
+ */
+static void _XFORMAPI TAG(init_c_transformations)( void )
+{
+   _mesa_transform_tab[1] = TAG(transform_tab_1);
+   _mesa_transform_tab[2] = TAG(transform_tab_2);
+   _mesa_transform_tab[3] = TAG(transform_tab_3);
+   _mesa_transform_tab[4] = TAG(transform_tab_4);
+
+   /* Routines for MATRIX_GENERAL */
+   TAG(transform_tab_1)[MATRIX_GENERAL] = TAG(transform_points1_general);
+   TAG(transform_tab_2)[MATRIX_GENERAL] = TAG(transform_points2_general);
+   TAG(transform_tab_3)[MATRIX_GENERAL] = TAG(transform_points3_general);
+   TAG(transform_tab_4)[MATRIX_GENERAL] = TAG(transform_points4_general);
+
+   /* Routines for MATRIX_IDENTITY */
+   TAG(transform_tab_1)[MATRIX_IDENTITY] = TAG(transform_points1_identity);
+   TAG(transform_tab_2)[MATRIX_IDENTITY] = TAG(transform_points2_identity);
+   TAG(transform_tab_3)[MATRIX_IDENTITY] = TAG(transform_points3_identity);
+   TAG(transform_tab_4)[MATRIX_IDENTITY] = TAG(transform_points4_identity);
+
+   /* Routines for MATRIX_3D_NO_ROT */
+   TAG(transform_tab_1)[MATRIX_3D_NO_ROT] = TAG(transform_points1_3d_no_rot);
+   TAG(transform_tab_2)[MATRIX_3D_NO_ROT] = TAG(transform_points2_3d_no_rot);
+   TAG(transform_tab_3)[MATRIX_3D_NO_ROT] = TAG(transform_points3_3d_no_rot);
+   TAG(transform_tab_4)[MATRIX_3D_NO_ROT] = TAG(transform_points4_3d_no_rot);
+
+   /* Routines for MATRIX_PERSPECTIVE */
+   TAG(transform_tab_1)[MATRIX_PERSPECTIVE] = TAG(transform_points1_perspective);
+   TAG(transform_tab_2)[MATRIX_PERSPECTIVE] = TAG(transform_points2_perspective);
+   TAG(transform_tab_3)[MATRIX_PERSPECTIVE] = TAG(transform_points3_perspective);
+   TAG(transform_tab_4)[MATRIX_PERSPECTIVE] = TAG(transform_points4_perspective);
+
+   /* Routines for MATRIX_2D */
+   TAG(transform_tab_1)[MATRIX_2D] = TAG(transform_points1_2d);
+   TAG(transform_tab_2)[MATRIX_2D] = TAG(transform_points2_2d);
+   TAG(transform_tab_3)[MATRIX_2D] = TAG(transform_points3_2d);
+   TAG(transform_tab_4)[MATRIX_2D] = TAG(transform_points4_2d);
+
+   /* Routines for MATRIX_2D_NO_ROT */
+   TAG(transform_tab_1)[MATRIX_2D_NO_ROT] = TAG(transform_points1_2d_no_rot);
+   TAG(transform_tab_2)[MATRIX_2D_NO_ROT] = TAG(transform_points2_2d_no_rot);
+   TAG(transform_tab_3)[MATRIX_2D_NO_ROT] = TAG(transform_points3_2d_no_rot);
+   TAG(transform_tab_4)[MATRIX_2D_NO_ROT] = TAG(transform_points4_2d_no_rot);
+
+   /* Routines for MATRIX_3D */
+   TAG(transform_tab_1)[MATRIX_3D] = TAG(transform_points1_3d);
+   TAG(transform_tab_2)[MATRIX_3D] = TAG(transform_points2_3d);
+   TAG(transform_tab_3)[MATRIX_3D] = TAG(transform_points3_3d);
+   TAG(transform_tab_4)[MATRIX_3D] = TAG(transform_points4_3d);
+}
-- 
cgit v1.2.3