From 01df5d59e56a1b060568f8cad2e89f7eea22fc70 Mon Sep 17 00:00:00 2001
From: marha <marha@users.sourceforge.net>
Date: Mon, 29 Aug 2011 08:51:20 +0200
Subject: xwininfo libX11 libXmu libxcb mesa xserver xkeyboard-config git
 update 29 aug 2011

---
 mesalib/src/mesa/swrast/s_aatritemp.h  |  674 ++++-----
 mesalib/src/mesa/swrast/s_context.c    |  105 +-
 mesalib/src/mesa/swrast/s_stencil.c    | 2491 ++++++++++++++++----------------
 mesalib/src/mesa/swrast/s_texcombine.c | 1506 +++++++++----------
 4 files changed, 2364 insertions(+), 2412 deletions(-)

(limited to 'mesalib/src/mesa/swrast')

diff --git a/mesalib/src/mesa/swrast/s_aatritemp.h b/mesalib/src/mesa/swrast/s_aatritemp.h
index 4136df3a7..77b3ae6ec 100644
--- a/mesalib/src/mesa/swrast/s_aatritemp.h
+++ b/mesalib/src/mesa/swrast/s_aatritemp.h
@@ -1,331 +1,343 @@
-/*
- * Mesa 3-D graphics library
- * Version:  7.0.3
- *
- * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-
-/*
- * Antialiased Triangle Rasterizer Template
- *
- * This file is #include'd to generate custom AA triangle rasterizers.
- * NOTE: this code hasn't been optimized yet.  That'll come after it
- * works correctly.
- *
- * The following macros may be defined to indicate what auxillary information
- * must be copmuted across the triangle:
- *    DO_Z         - if defined, compute Z values
- *    DO_ATTRIBS   - if defined, compute texcoords, varying, etc.
- */
-
-/*void triangle( struct gl_context *ctx, GLuint v0, GLuint v1, GLuint v2, GLuint pv )*/
-{
-   const SWcontext *swrast = SWRAST_CONTEXT(ctx);
-   const GLfloat *p0 = v0->attrib[FRAG_ATTRIB_WPOS];
-   const GLfloat *p1 = v1->attrib[FRAG_ATTRIB_WPOS];
-   const GLfloat *p2 = v2->attrib[FRAG_ATTRIB_WPOS];
-   const SWvertex *vMin, *vMid, *vMax;
-   GLint iyMin, iyMax;
-   GLfloat yMin, yMax;
-   GLboolean ltor;
-   GLfloat majDx, majDy;  /* major (i.e. long) edge dx and dy */
-   
-   SWspan span;
-   
-#ifdef DO_Z
-   GLfloat zPlane[4];
-#endif
-   GLfloat rPlane[4], gPlane[4], bPlane[4], aPlane[4];
-#if defined(DO_ATTRIBS)
-   GLfloat attrPlane[FRAG_ATTRIB_MAX][4][4];
-   GLfloat wPlane[4];  /* win[3] */
-#endif
-   GLfloat bf = SWRAST_CONTEXT(ctx)->_BackfaceCullSign;
-   
-   (void) swrast;
-
-   INIT_SPAN(span, GL_POLYGON);
-   span.arrayMask = SPAN_COVERAGE;
-
-   /* determine bottom to top order of vertices */
-   {
-      GLfloat y0 = v0->attrib[FRAG_ATTRIB_WPOS][1];
-      GLfloat y1 = v1->attrib[FRAG_ATTRIB_WPOS][1];
-      GLfloat y2 = v2->attrib[FRAG_ATTRIB_WPOS][1];
-      if (y0 <= y1) {
-	 if (y1 <= y2) {
-	    vMin = v0;   vMid = v1;   vMax = v2;   /* y0<=y1<=y2 */
-	 }
-	 else if (y2 <= y0) {
-	    vMin = v2;   vMid = v0;   vMax = v1;   /* y2<=y0<=y1 */
-	 }
-	 else {
-	    vMin = v0;   vMid = v2;   vMax = v1;  bf = -bf; /* y0<=y2<=y1 */
-	 }
-      }
-      else {
-	 if (y0 <= y2) {
-	    vMin = v1;   vMid = v0;   vMax = v2;  bf = -bf; /* y1<=y0<=y2 */
-	 }
-	 else if (y2 <= y1) {
-	    vMin = v2;   vMid = v1;   vMax = v0;  bf = -bf; /* y2<=y1<=y0 */
-	 }
-	 else {
-	    vMin = v1;   vMid = v2;   vMax = v0;   /* y1<=y2<=y0 */
-	 }
-      }
-   }
-
-   majDx = vMax->attrib[FRAG_ATTRIB_WPOS][0] - vMin->attrib[FRAG_ATTRIB_WPOS][0];
-   majDy = vMax->attrib[FRAG_ATTRIB_WPOS][1] - vMin->attrib[FRAG_ATTRIB_WPOS][1];
-
-   /* front/back-face determination and cullling */
-   {
-      const GLfloat botDx = vMid->attrib[FRAG_ATTRIB_WPOS][0] - vMin->attrib[FRAG_ATTRIB_WPOS][0];
-      const GLfloat botDy = vMid->attrib[FRAG_ATTRIB_WPOS][1] - vMin->attrib[FRAG_ATTRIB_WPOS][1];
-      const GLfloat area = majDx * botDy - botDx * majDy;
-      /* Do backface culling */
-      if (area * bf < 0 || area == 0 || IS_INF_OR_NAN(area))
-	 return;
-      ltor = (GLboolean) (area < 0.0F);
-
-      span.facing = area * swrast->_BackfaceSign > 0.0F;
-   }
-
-   /* Plane equation setup:
-    * We evaluate plane equations at window (x,y) coordinates in order
-    * to compute color, Z, fog, texcoords, etc.  This isn't terribly
-    * efficient but it's easy and reliable.
-    */
-#ifdef DO_Z
-   compute_plane(p0, p1, p2, p0[2], p1[2], p2[2], zPlane);
-   span.arrayMask |= SPAN_Z;
-#endif
-   if (ctx->Light.ShadeModel == GL_SMOOTH) {
-      compute_plane(p0, p1, p2, v0->color[RCOMP], v1->color[RCOMP], v2->color[RCOMP], rPlane);
-      compute_plane(p0, p1, p2, v0->color[GCOMP], v1->color[GCOMP], v2->color[GCOMP], gPlane);
-      compute_plane(p0, p1, p2, v0->color[BCOMP], v1->color[BCOMP], v2->color[BCOMP], bPlane);
-      compute_plane(p0, p1, p2, v0->color[ACOMP], v1->color[ACOMP], v2->color[ACOMP], aPlane);
-   }
-   else {
-      constant_plane(v2->color[RCOMP], rPlane);
-      constant_plane(v2->color[GCOMP], gPlane);
-      constant_plane(v2->color[BCOMP], bPlane);
-      constant_plane(v2->color[ACOMP], aPlane);
-   }
-   span.arrayMask |= SPAN_RGBA;
-#if defined(DO_ATTRIBS)
-   {
-      const GLfloat invW0 = v0->attrib[FRAG_ATTRIB_WPOS][3];
-      const GLfloat invW1 = v1->attrib[FRAG_ATTRIB_WPOS][3];
-      const GLfloat invW2 = v2->attrib[FRAG_ATTRIB_WPOS][3];
-      compute_plane(p0, p1, p2, invW0, invW1, invW2, wPlane);
-      span.attrStepX[FRAG_ATTRIB_WPOS][3] = plane_dx(wPlane);
-      span.attrStepY[FRAG_ATTRIB_WPOS][3] = plane_dy(wPlane);
-      ATTRIB_LOOP_BEGIN
-         GLuint c;
-         if (swrast->_InterpMode[attr] == GL_FLAT) {
-            for (c = 0; c < 4; c++) {
-               constant_plane(v2->attrib[attr][c] * invW2, attrPlane[attr][c]);
-            }
-         }
-         else {
-            for (c = 0; c < 4; c++) {
-               const GLfloat a0 = v0->attrib[attr][c] * invW0;
-               const GLfloat a1 = v1->attrib[attr][c] * invW1;
-               const GLfloat a2 = v2->attrib[attr][c] * invW2;
-               compute_plane(p0, p1, p2, a0, a1, a2, attrPlane[attr][c]);
-            }
-         }
-         for (c = 0; c < 4; c++) {
-            span.attrStepX[attr][c] = plane_dx(attrPlane[attr][c]);
-            span.attrStepY[attr][c] = plane_dy(attrPlane[attr][c]);
-         }
-      ATTRIB_LOOP_END
-   }
-#endif
-
-   /* Begin bottom-to-top scan over the triangle.
-    * The long edge will either be on the left or right side of the
-    * triangle.  We always scan from the long edge toward the shorter
-    * edges, stopping when we find that coverage = 0.  If the long edge
-    * is on the left we scan left-to-right.  Else, we scan right-to-left.
-    */
-   yMin = vMin->attrib[FRAG_ATTRIB_WPOS][1];
-   yMax = vMax->attrib[FRAG_ATTRIB_WPOS][1];
-   iyMin = (GLint) yMin;
-   iyMax = (GLint) yMax + 1;
-
-   if (ltor) {
-      /* scan left to right */
-      const GLfloat *pMin = vMin->attrib[FRAG_ATTRIB_WPOS];
-      const GLfloat *pMid = vMid->attrib[FRAG_ATTRIB_WPOS];
-      const GLfloat *pMax = vMax->attrib[FRAG_ATTRIB_WPOS];
-      const GLfloat dxdy = majDx / majDy;
-      const GLfloat xAdj = dxdy < 0.0F ? -dxdy : 0.0F;
-      GLfloat x = pMin[0] - (yMin - iyMin) * dxdy;
-      GLint iy;
-      for (iy = iyMin; iy < iyMax; iy++, x += dxdy) {
-         GLint ix, startX = (GLint) (x - xAdj);
-         GLuint count;
-         GLfloat coverage = 0.0F;
-
-         /* skip over fragments with zero coverage */
-         while (startX < MAX_WIDTH) {
-            coverage = compute_coveragef(pMin, pMid, pMax, startX, iy);
-            if (coverage > 0.0F)
-               break;
-            startX++;
-         }
-
-         /* enter interior of triangle */
-         ix = startX;
-
-#if defined(DO_ATTRIBS)
-         /* compute attributes at left-most fragment */
-         span.attrStart[FRAG_ATTRIB_WPOS][3] = solve_plane(ix + 0.5F, iy + 0.5F, wPlane);
-         ATTRIB_LOOP_BEGIN
-            GLuint c;
-            for (c = 0; c < 4; c++) {
-               span.attrStart[attr][c] = solve_plane(ix + 0.5F, iy + 0.5F, attrPlane[attr][c]);
-            }
-         ATTRIB_LOOP_END
-#endif
-
-         count = 0;
-         while (coverage > 0.0F) {
-            /* (cx,cy) = center of fragment */
-            const GLfloat cx = ix + 0.5F, cy = iy + 0.5F;
-            SWspanarrays *array = span.array;
-            array->coverage[count] = coverage;
-#ifdef DO_Z
-            array->z[count] = (GLuint) solve_plane(cx, cy, zPlane);
-#endif
-            array->rgba[count][RCOMP] = solve_plane_chan(cx, cy, rPlane);
-            array->rgba[count][GCOMP] = solve_plane_chan(cx, cy, gPlane);
-            array->rgba[count][BCOMP] = solve_plane_chan(cx, cy, bPlane);
-            array->rgba[count][ACOMP] = solve_plane_chan(cx, cy, aPlane);
-            ix++;
-            count++;
-            coverage = compute_coveragef(pMin, pMid, pMax, ix, iy);
-         }
-         
-         if (ix <= startX)
-            continue;
-         
-         span.x = startX;
-         span.y = iy;
-         span.end = (GLuint) ix - (GLuint) startX;
-         _swrast_write_rgba_span(ctx, &span);
-      }
-   }
-   else {
-      /* scan right to left */
-      const GLfloat *pMin = vMin->attrib[FRAG_ATTRIB_WPOS];
-      const GLfloat *pMid = vMid->attrib[FRAG_ATTRIB_WPOS];
-      const GLfloat *pMax = vMax->attrib[FRAG_ATTRIB_WPOS];
-      const GLfloat dxdy = majDx / majDy;
-      const GLfloat xAdj = dxdy > 0 ? dxdy : 0.0F;
-      GLfloat x = pMin[0] - (yMin - iyMin) * dxdy;
-      GLint iy;
-      for (iy = iyMin; iy < iyMax; iy++, x += dxdy) {
-         GLint ix, left, startX = (GLint) (x + xAdj);
-         GLuint count, n;
-         GLfloat coverage = 0.0F;
-         
-         /* make sure we're not past the window edge */
-         if (startX >= ctx->DrawBuffer->_Xmax) {
-            startX = ctx->DrawBuffer->_Xmax - 1;
-         }
-
-         /* skip fragments with zero coverage */
-         while (startX > 0) {
-            coverage = compute_coveragef(pMin, pMax, pMid, startX, iy);
-            if (coverage > 0.0F)
-               break;
-            startX--;
-         }
-         
-         /* enter interior of triangle */
-         ix = startX;
-         count = 0;
-         while (coverage > 0.0F) {
-            /* (cx,cy) = center of fragment */
-            const GLfloat cx = ix + 0.5F, cy = iy + 0.5F;
-            SWspanarrays *array = span.array;
-            ASSERT(ix >= 0);
-            array->coverage[ix] = coverage;
-#ifdef DO_Z
-            array->z[ix] = (GLuint) solve_plane(cx, cy, zPlane);
-#endif
-            array->rgba[ix][RCOMP] = solve_plane_chan(cx, cy, rPlane);
-            array->rgba[ix][GCOMP] = solve_plane_chan(cx, cy, gPlane);
-            array->rgba[ix][BCOMP] = solve_plane_chan(cx, cy, bPlane);
-            array->rgba[ix][ACOMP] = solve_plane_chan(cx, cy, aPlane);
-            ix--;
-            count++;
-            coverage = compute_coveragef(pMin, pMax, pMid, ix, iy);
-         }
-         
-#if defined(DO_ATTRIBS)
-         /* compute attributes at left-most fragment */
-         span.attrStart[FRAG_ATTRIB_WPOS][3] = solve_plane(ix + 1.5F, iy + 0.5F, wPlane);
-         ATTRIB_LOOP_BEGIN
-            GLuint c;
-            for (c = 0; c < 4; c++) {
-               span.attrStart[attr][c] = solve_plane(ix + 1.5F, iy + 0.5F, attrPlane[attr][c]);
-            }
-         ATTRIB_LOOP_END
-#endif
-
-         if (startX <= ix)
-            continue;
-
-         n = (GLuint) startX - (GLuint) ix;
-
-         left = ix + 1;
-
-         /* shift all values to the left */
-         /* XXX this is temporary */
-         {
-            SWspanarrays *array = span.array;
-            GLint j;
-            for (j = 0; j < (GLint) n; j++) {
-               array->coverage[j] = array->coverage[j + left];
-               COPY_CHAN4(array->rgba[j], array->rgba[j + left]);
-#ifdef DO_Z
-               array->z[j] = array->z[j + left];
-#endif
-            }
-         }
-
-         span.x = left;
-         span.y = iy;
-         span.end = n;
-         _swrast_write_rgba_span(ctx, &span);
-      }
-   }
-}
-
-
-#undef DO_Z
-#undef DO_ATTRIBS
-#undef DO_OCCLUSION_TEST
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.0.3
+ *
+ * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+/*
+ * Antialiased Triangle Rasterizer Template
+ *
+ * This file is #include'd to generate custom AA triangle rasterizers.
+ * NOTE: this code hasn't been optimized yet.  That'll come after it
+ * works correctly.
+ *
+ * The following macros may be defined to indicate what auxillary information
+ * must be copmuted across the triangle:
+ *    DO_Z         - if defined, compute Z values
+ *    DO_ATTRIBS   - if defined, compute texcoords, varying, etc.
+ */
+
+/*void triangle( struct gl_context *ctx, GLuint v0, GLuint v1, GLuint v2, GLuint pv )*/
+{
+   const SWcontext *swrast = SWRAST_CONTEXT(ctx);
+   const GLfloat *p0 = v0->attrib[FRAG_ATTRIB_WPOS];
+   const GLfloat *p1 = v1->attrib[FRAG_ATTRIB_WPOS];
+   const GLfloat *p2 = v2->attrib[FRAG_ATTRIB_WPOS];
+   const SWvertex *vMin, *vMid, *vMax;
+   GLint iyMin, iyMax;
+   GLfloat yMin, yMax;
+   GLboolean ltor;
+   GLfloat majDx, majDy;  /* major (i.e. long) edge dx and dy */
+   
+   SWspan span;
+   
+#ifdef DO_Z
+   GLfloat zPlane[4];
+#endif
+   GLfloat rPlane[4], gPlane[4], bPlane[4], aPlane[4];
+#if defined(DO_ATTRIBS)
+   GLfloat attrPlane[FRAG_ATTRIB_MAX][4][4];
+   GLfloat wPlane[4];  /* win[3] */
+#endif
+   GLfloat bf = SWRAST_CONTEXT(ctx)->_BackfaceCullSign;
+   
+   (void) swrast;
+
+   INIT_SPAN(span, GL_POLYGON);
+   span.arrayMask = SPAN_COVERAGE;
+
+   /* determine bottom to top order of vertices */
+   {
+      GLfloat y0 = v0->attrib[FRAG_ATTRIB_WPOS][1];
+      GLfloat y1 = v1->attrib[FRAG_ATTRIB_WPOS][1];
+      GLfloat y2 = v2->attrib[FRAG_ATTRIB_WPOS][1];
+      if (y0 <= y1) {
+	 if (y1 <= y2) {
+	    vMin = v0;   vMid = v1;   vMax = v2;   /* y0<=y1<=y2 */
+	 }
+	 else if (y2 <= y0) {
+	    vMin = v2;   vMid = v0;   vMax = v1;   /* y2<=y0<=y1 */
+	 }
+	 else {
+	    vMin = v0;   vMid = v2;   vMax = v1;  bf = -bf; /* y0<=y2<=y1 */
+	 }
+      }
+      else {
+	 if (y0 <= y2) {
+	    vMin = v1;   vMid = v0;   vMax = v2;  bf = -bf; /* y1<=y0<=y2 */
+	 }
+	 else if (y2 <= y1) {
+	    vMin = v2;   vMid = v1;   vMax = v0;  bf = -bf; /* y2<=y1<=y0 */
+	 }
+	 else {
+	    vMin = v1;   vMid = v2;   vMax = v0;   /* y1<=y2<=y0 */
+	 }
+      }
+   }
+
+   majDx = vMax->attrib[FRAG_ATTRIB_WPOS][0] - vMin->attrib[FRAG_ATTRIB_WPOS][0];
+   majDy = vMax->attrib[FRAG_ATTRIB_WPOS][1] - vMin->attrib[FRAG_ATTRIB_WPOS][1];
+
+   /* front/back-face determination and cullling */
+   {
+      const GLfloat botDx = vMid->attrib[FRAG_ATTRIB_WPOS][0] - vMin->attrib[FRAG_ATTRIB_WPOS][0];
+      const GLfloat botDy = vMid->attrib[FRAG_ATTRIB_WPOS][1] - vMin->attrib[FRAG_ATTRIB_WPOS][1];
+      const GLfloat area = majDx * botDy - botDx * majDy;
+      /* Do backface culling */
+      if (area * bf < 0 || area == 0 || IS_INF_OR_NAN(area))
+	 return;
+      ltor = (GLboolean) (area < 0.0F);
+
+      span.facing = area * swrast->_BackfaceSign > 0.0F;
+   }
+
+   /* Plane equation setup:
+    * We evaluate plane equations at window (x,y) coordinates in order
+    * to compute color, Z, fog, texcoords, etc.  This isn't terribly
+    * efficient but it's easy and reliable.
+    */
+#ifdef DO_Z
+   compute_plane(p0, p1, p2, p0[2], p1[2], p2[2], zPlane);
+   span.arrayMask |= SPAN_Z;
+#endif
+   if (ctx->Light.ShadeModel == GL_SMOOTH) {
+      compute_plane(p0, p1, p2, v0->color[RCOMP], v1->color[RCOMP], v2->color[RCOMP], rPlane);
+      compute_plane(p0, p1, p2, v0->color[GCOMP], v1->color[GCOMP], v2->color[GCOMP], gPlane);
+      compute_plane(p0, p1, p2, v0->color[BCOMP], v1->color[BCOMP], v2->color[BCOMP], bPlane);
+      compute_plane(p0, p1, p2, v0->color[ACOMP], v1->color[ACOMP], v2->color[ACOMP], aPlane);
+   }
+   else {
+      constant_plane(v2->color[RCOMP], rPlane);
+      constant_plane(v2->color[GCOMP], gPlane);
+      constant_plane(v2->color[BCOMP], bPlane);
+      constant_plane(v2->color[ACOMP], aPlane);
+   }
+   span.arrayMask |= SPAN_RGBA;
+#if defined(DO_ATTRIBS)
+   {
+      const GLfloat invW0 = v0->attrib[FRAG_ATTRIB_WPOS][3];
+      const GLfloat invW1 = v1->attrib[FRAG_ATTRIB_WPOS][3];
+      const GLfloat invW2 = v2->attrib[FRAG_ATTRIB_WPOS][3];
+      compute_plane(p0, p1, p2, invW0, invW1, invW2, wPlane);
+      span.attrStepX[FRAG_ATTRIB_WPOS][3] = plane_dx(wPlane);
+      span.attrStepY[FRAG_ATTRIB_WPOS][3] = plane_dy(wPlane);
+      ATTRIB_LOOP_BEGIN
+         GLuint c;
+         if (swrast->_InterpMode[attr] == GL_FLAT) {
+            for (c = 0; c < 4; c++) {
+               constant_plane(v2->attrib[attr][c] * invW2, attrPlane[attr][c]);
+            }
+         }
+         else {
+            for (c = 0; c < 4; c++) {
+               const GLfloat a0 = v0->attrib[attr][c] * invW0;
+               const GLfloat a1 = v1->attrib[attr][c] * invW1;
+               const GLfloat a2 = v2->attrib[attr][c] * invW2;
+               compute_plane(p0, p1, p2, a0, a1, a2, attrPlane[attr][c]);
+            }
+         }
+         for (c = 0; c < 4; c++) {
+            span.attrStepX[attr][c] = plane_dx(attrPlane[attr][c]);
+            span.attrStepY[attr][c] = plane_dy(attrPlane[attr][c]);
+         }
+      ATTRIB_LOOP_END
+   }
+#endif
+
+   /* Begin bottom-to-top scan over the triangle.
+    * The long edge will either be on the left or right side of the
+    * triangle.  We always scan from the long edge toward the shorter
+    * edges, stopping when we find that coverage = 0.  If the long edge
+    * is on the left we scan left-to-right.  Else, we scan right-to-left.
+    */
+   yMin = vMin->attrib[FRAG_ATTRIB_WPOS][1];
+   yMax = vMax->attrib[FRAG_ATTRIB_WPOS][1];
+   iyMin = (GLint) yMin;
+   iyMax = (GLint) yMax + 1;
+
+   if (ltor) {
+      /* scan left to right */
+      const GLfloat *pMin = vMin->attrib[FRAG_ATTRIB_WPOS];
+      const GLfloat *pMid = vMid->attrib[FRAG_ATTRIB_WPOS];
+      const GLfloat *pMax = vMax->attrib[FRAG_ATTRIB_WPOS];
+      const GLfloat dxdy = majDx / majDy;
+      const GLfloat xAdj = dxdy < 0.0F ? -dxdy : 0.0F;
+      GLint iy;
+#ifdef _OPENMP
+#pragma omp parallel for schedule(dynamic) private(iy) firstprivate(span)
+#endif
+      for (iy = iyMin; iy < iyMax; iy++) {
+         GLfloat x = pMin[0] - (yMin - iy) * dxdy;
+         GLint ix, startX = (GLint) (x - xAdj);
+         GLuint count;
+         GLfloat coverage = 0.0F;
+
+#ifdef _OPENMP
+         /* each thread needs to use a different (global) SpanArrays variable */
+         span.array = SWRAST_CONTEXT(ctx)->SpanArrays + omp_get_thread_num();
+#endif
+         /* skip over fragments with zero coverage */
+         while (startX < MAX_WIDTH) {
+            coverage = compute_coveragef(pMin, pMid, pMax, startX, iy);
+            if (coverage > 0.0F)
+               break;
+            startX++;
+         }
+
+         /* enter interior of triangle */
+         ix = startX;
+
+#if defined(DO_ATTRIBS)
+         /* compute attributes at left-most fragment */
+         span.attrStart[FRAG_ATTRIB_WPOS][3] = solve_plane(ix + 0.5F, iy + 0.5F, wPlane);
+         ATTRIB_LOOP_BEGIN
+            GLuint c;
+            for (c = 0; c < 4; c++) {
+               span.attrStart[attr][c] = solve_plane(ix + 0.5F, iy + 0.5F, attrPlane[attr][c]);
+            }
+         ATTRIB_LOOP_END
+#endif
+
+         count = 0;
+         while (coverage > 0.0F) {
+            /* (cx,cy) = center of fragment */
+            const GLfloat cx = ix + 0.5F, cy = iy + 0.5F;
+            SWspanarrays *array = span.array;
+            array->coverage[count] = coverage;
+#ifdef DO_Z
+            array->z[count] = (GLuint) solve_plane(cx, cy, zPlane);
+#endif
+            array->rgba[count][RCOMP] = solve_plane_chan(cx, cy, rPlane);
+            array->rgba[count][GCOMP] = solve_plane_chan(cx, cy, gPlane);
+            array->rgba[count][BCOMP] = solve_plane_chan(cx, cy, bPlane);
+            array->rgba[count][ACOMP] = solve_plane_chan(cx, cy, aPlane);
+            ix++;
+            count++;
+            coverage = compute_coveragef(pMin, pMid, pMax, ix, iy);
+         }
+         
+         if (ix > startX) {
+            span.x = startX;
+            span.y = iy;
+            span.end = (GLuint) ix - (GLuint) startX;
+            _swrast_write_rgba_span(ctx, &span);
+         }
+      }
+   }
+   else {
+      /* scan right to left */
+      const GLfloat *pMin = vMin->attrib[FRAG_ATTRIB_WPOS];
+      const GLfloat *pMid = vMid->attrib[FRAG_ATTRIB_WPOS];
+      const GLfloat *pMax = vMax->attrib[FRAG_ATTRIB_WPOS];
+      const GLfloat dxdy = majDx / majDy;
+      const GLfloat xAdj = dxdy > 0 ? dxdy : 0.0F;
+      GLint iy;
+#ifdef _OPENMP
+#pragma omp parallel for schedule(dynamic) private(iy) firstprivate(span)
+#endif
+      for (iy = iyMin; iy < iyMax; iy++) {
+         GLfloat x = pMin[0] - (yMin - iy) * dxdy;
+         GLint ix, left, startX = (GLint) (x + xAdj);
+         GLuint count, n;
+         GLfloat coverage = 0.0F;
+         
+#ifdef _OPENMP
+         /* each thread needs to use a different (global) SpanArrays variable */
+         span.array = SWRAST_CONTEXT(ctx)->SpanArrays + omp_get_thread_num();
+#endif
+         /* make sure we're not past the window edge */
+         if (startX >= ctx->DrawBuffer->_Xmax) {
+            startX = ctx->DrawBuffer->_Xmax - 1;
+         }
+
+         /* skip fragments with zero coverage */
+         while (startX > 0) {
+            coverage = compute_coveragef(pMin, pMax, pMid, startX, iy);
+            if (coverage > 0.0F)
+               break;
+            startX--;
+         }
+         
+         /* enter interior of triangle */
+         ix = startX;
+         count = 0;
+         while (coverage > 0.0F) {
+            /* (cx,cy) = center of fragment */
+            const GLfloat cx = ix + 0.5F, cy = iy + 0.5F;
+            SWspanarrays *array = span.array;
+            ASSERT(ix >= 0);
+            array->coverage[ix] = coverage;
+#ifdef DO_Z
+            array->z[ix] = (GLuint) solve_plane(cx, cy, zPlane);
+#endif
+            array->rgba[ix][RCOMP] = solve_plane_chan(cx, cy, rPlane);
+            array->rgba[ix][GCOMP] = solve_plane_chan(cx, cy, gPlane);
+            array->rgba[ix][BCOMP] = solve_plane_chan(cx, cy, bPlane);
+            array->rgba[ix][ACOMP] = solve_plane_chan(cx, cy, aPlane);
+            ix--;
+            count++;
+            coverage = compute_coveragef(pMin, pMax, pMid, ix, iy);
+         }
+         
+#if defined(DO_ATTRIBS)
+         /* compute attributes at left-most fragment */
+         span.attrStart[FRAG_ATTRIB_WPOS][3] = solve_plane(ix + 1.5F, iy + 0.5F, wPlane);
+         ATTRIB_LOOP_BEGIN
+            GLuint c;
+            for (c = 0; c < 4; c++) {
+               span.attrStart[attr][c] = solve_plane(ix + 1.5F, iy + 0.5F, attrPlane[attr][c]);
+            }
+         ATTRIB_LOOP_END
+#endif
+
+         if (startX > ix) {
+            n = (GLuint) startX - (GLuint) ix;
+
+            left = ix + 1;
+
+            /* shift all values to the left */
+            /* XXX this is temporary */
+            {
+               SWspanarrays *array = span.array;
+               GLint j;
+               for (j = 0; j < (GLint) n; j++) {
+                  array->coverage[j] = array->coverage[j + left];
+                  COPY_CHAN4(array->rgba[j], array->rgba[j + left]);
+#ifdef DO_Z
+                  array->z[j] = array->z[j + left];
+#endif
+               }
+            }
+
+            span.x = left;
+            span.y = iy;
+            span.end = n;
+            _swrast_write_rgba_span(ctx, &span);
+         }
+      }
+   }
+}
+
+
+#undef DO_Z
+#undef DO_ATTRIBS
+#undef DO_OCCLUSION_TEST
diff --git a/mesalib/src/mesa/swrast/s_context.c b/mesalib/src/mesa/swrast/s_context.c
index def1531d7..792b528ee 100644
--- a/mesalib/src/mesa/swrast/s_context.c
+++ b/mesalib/src/mesa/swrast/s_context.c
@@ -417,84 +417,6 @@ _swrast_validate_blend_func(struct gl_context *ctx, GLuint n, const GLubyte mask
    swrast->BlendFunc( ctx, n, mask, src, dst, chanType );
 }
 
-
-/**
- * Make sure we have texture image data for all the textures we may need
- * for subsequent rendering.
- */
-static void
-_swrast_validate_texture_images(struct gl_context *ctx)
-{
-   SWcontext *swrast = SWRAST_CONTEXT(ctx);
-   GLuint u;
-
-   if (!swrast->ValidateTextureImage || !ctx->Texture._EnabledUnits) {
-      /* no textures enabled, or no way to validate images! */
-      return;
-   }
-
-   for (u = 0; u < ctx->Const.MaxTextureImageUnits; u++) {
-      if (ctx->Texture.Unit[u]._ReallyEnabled) {
-         struct gl_texture_object *texObj = ctx->Texture.Unit[u]._Current;
-         ASSERT(texObj);
-         if (texObj) {
-            GLuint numFaces = (texObj->Target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
-            GLuint face;
-            for (face = 0; face < numFaces; face++) {
-               GLint lvl;
-               for (lvl = texObj->BaseLevel; lvl <= texObj->_MaxLevel; lvl++) {
-                  struct gl_texture_image *texImg = texObj->Image[face][lvl];
-                  if (texImg && !texImg->Data) {
-                     swrast->ValidateTextureImage(ctx, texObj, face, lvl);
-                     ASSERT(texObj->Image[face][lvl]->Data);
-                  }
-               }
-            }
-         }
-      }
-   }
-}
-
-
-/**
- * Free the texture image data attached to all currently enabled
- * textures.  Meant to be called by device drivers when transitioning
- * from software to hardware rendering.
- */
-void
-_swrast_eject_texture_images(struct gl_context *ctx)
-{
-   GLuint u;
-
-   if (!ctx->Texture._EnabledUnits) {
-      /* no textures enabled */
-      return;
-   }
-
-   for (u = 0; u < ctx->Const.MaxTextureImageUnits; u++) {
-      if (ctx->Texture.Unit[u]._ReallyEnabled) {
-         struct gl_texture_object *texObj = ctx->Texture.Unit[u]._Current;
-         ASSERT(texObj);
-         if (texObj) {
-            GLuint numFaces = (texObj->Target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
-            GLuint face;
-            for (face = 0; face < numFaces; face++) {
-               GLint lvl;
-               for (lvl = texObj->BaseLevel; lvl <= texObj->_MaxLevel; lvl++) {
-                  struct gl_texture_image *texImg = texObj->Image[face][lvl];
-                  if (texImg && texImg->Data) {
-                     _mesa_free_texmemory(texImg->Data);
-                     texImg->Data = NULL;
-                  }
-               }
-            }
-         }
-      }
-   }
-}
-
-
-
 static void
 _swrast_sleep( struct gl_context *ctx, GLbitfield new_state )
 {
@@ -640,7 +562,6 @@ _swrast_validate_derived( struct gl_context *ctx )
 
       if (swrast->NewState & (_NEW_TEXTURE | _NEW_PROGRAM)) {
          _swrast_update_texture_samplers( ctx );
-         _swrast_validate_texture_images(ctx);
       }
 
       if (swrast->NewState & (_NEW_COLOR | _NEW_PROGRAM))
@@ -772,6 +693,11 @@ _swrast_CreateContext( struct gl_context *ctx )
 {
    GLuint i;
    SWcontext *swrast = (SWcontext *)CALLOC(sizeof(SWcontext));
+#ifdef _OPENMP
+   const GLint maxThreads = omp_get_max_threads();
+#else
+   const GLint maxThreads = 1;
+#endif
 
    if (SWRAST_DEBUG) {
       _mesa_debug(ctx, "_swrast_CreateContext\n");
@@ -806,19 +732,25 @@ _swrast_CreateContext( struct gl_context *ctx )
    for (i = 0; i < MAX_TEXTURE_IMAGE_UNITS; i++)
       swrast->TextureSample[i] = NULL;
 
-   swrast->SpanArrays = MALLOC_STRUCT(sw_span_arrays);
+   /* SpanArrays is global and shared by all SWspan instances. However, when
+    * using multiple threads, it is necessary to have one SpanArrays instance
+    * per thread.
+    */
+   swrast->SpanArrays = (SWspanarrays *) MALLOC(maxThreads * sizeof(SWspanarrays));
    if (!swrast->SpanArrays) {
       FREE(swrast);
       return GL_FALSE;
    }
-   swrast->SpanArrays->ChanType = CHAN_TYPE;
+   for(i = 0; i < maxThreads; i++) {
+      swrast->SpanArrays[i].ChanType = CHAN_TYPE;
 #if CHAN_TYPE == GL_UNSIGNED_BYTE
-   swrast->SpanArrays->rgba = swrast->SpanArrays->rgba8;
+      swrast->SpanArrays[i].rgba = swrast->SpanArrays[i].rgba8;
 #elif CHAN_TYPE == GL_UNSIGNED_SHORT
-   swrast->SpanArrays->rgba = swrast->SpanArrays->rgba16;
+      swrast->SpanArrays[i].rgba = swrast->SpanArrays[i].rgba16;
 #else
-   swrast->SpanArrays->rgba = swrast->SpanArrays->attribs[FRAG_ATTRIB_COL0];
+      swrast->SpanArrays[i].rgba = swrast->SpanArrays[i].attribs[FRAG_ATTRIB_COL0];
 #endif
+   }
 
    /* init point span buffer */
    swrast->PointSpan.primitive = GL_POINT;
@@ -826,7 +758,10 @@ _swrast_CreateContext( struct gl_context *ctx )
    swrast->PointSpan.facing = 0;
    swrast->PointSpan.array = swrast->SpanArrays;
 
-   swrast->TexelBuffer = (GLfloat *) MALLOC(ctx->Const.MaxTextureImageUnits *
+   /* TexelBuffer is also global and normally shared by all SWspan instances;
+    * when running with multiple threads, create one per thread.
+    */
+   swrast->TexelBuffer = (GLfloat *) MALLOC(ctx->Const.MaxTextureImageUnits * maxThreads *
                                            MAX_WIDTH * 4 * sizeof(GLfloat));
    if (!swrast->TexelBuffer) {
       FREE(swrast->SpanArrays);
diff --git a/mesalib/src/mesa/swrast/s_stencil.c b/mesalib/src/mesa/swrast/s_stencil.c
index 999fe3c3f..fa5093a34 100644
--- a/mesalib/src/mesa/swrast/s_stencil.c
+++ b/mesalib/src/mesa/swrast/s_stencil.c
@@ -1,1245 +1,1246 @@
-/*
- * Mesa 3-D graphics library
- * Version:  7.1
- *
- * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-
-#include "main/glheader.h"
-#include "main/context.h"
-#include "main/imports.h"
-
-#include "s_context.h"
-#include "s_depth.h"
-#include "s_stencil.h"
-#include "s_span.h"
-
-
-
-/* Stencil Logic:
-
-IF stencil test fails THEN
-   Apply fail-op to stencil value
-   Don't write the pixel (RGBA,Z)
-ELSE
-   IF doing depth test && depth test fails THEN
-      Apply zfail-op to stencil value
-      Write RGBA and Z to appropriate buffers
-   ELSE
-      Apply zpass-op to stencil value
-ENDIF
-
-*/
-
-
-/**
- * Apply the given stencil operator to the array of stencil values.
- * Don't touch stencil[i] if mask[i] is zero.
- * Input:  n - size of stencil array
- *         oper - the stencil buffer operator
- *         face - 0 or 1 for front or back face operation
- *         stencil - array of stencil values
- *         mask - array [n] of flag:  1=apply operator, 0=don't apply operator
- * Output:  stencil - modified values
- */
-static void
-apply_stencil_op( const struct gl_context *ctx, GLenum oper, GLuint face,
-                  GLuint n, GLstencil stencil[], const GLubyte mask[] )
-{
-   const GLstencil ref = ctx->Stencil.Ref[face];
-   const GLstencil wrtmask = ctx->Stencil.WriteMask[face];
-   const GLstencil invmask = (GLstencil) (~wrtmask);
-   const GLstencil stencilMax = (1 << ctx->DrawBuffer->Visual.stencilBits) - 1;
-   GLuint i;
-
-   switch (oper) {
-      case GL_KEEP:
-         /* do nothing */
-         break;
-      case GL_ZERO:
-	 if (invmask==0) {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-		  stencil[i] = 0;
-	       }
-	    }
-	 }
-	 else {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-		  stencil[i] = (GLstencil) (stencil[i] & invmask);
-	       }
-	    }
-	 }
-	 break;
-      case GL_REPLACE:
-	 if (invmask==0) {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-                  stencil[i] = ref;
-	       }
-	    }
-	 }
-	 else {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-		  GLstencil s = stencil[i];
-		  stencil[i] = (GLstencil) ((invmask & s ) | (wrtmask & ref));
-	       }
-	    }
-	 }
-	 break;
-      case GL_INCR:
-	 if (invmask==0) {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-		  GLstencil s = stencil[i];
-		  if (s < stencilMax) {
-		     stencil[i] = (GLstencil) (s+1);
-		  }
-	       }
-	    }
-	 }
-	 else {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-		  /* VERIFY logic of adding 1 to a write-masked value */
-		  GLstencil s = stencil[i];
-		  if (s < stencilMax) {
-		     stencil[i] = (GLstencil) ((invmask & s) | (wrtmask & (s+1)));
-		  }
-	       }
-	    }
-	 }
-	 break;
-      case GL_DECR:
-	 if (invmask==0) {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-		  GLstencil s = stencil[i];
-		  if (s>0) {
-		     stencil[i] = (GLstencil) (s-1);
-		  }
-	       }
-	    }
-	 }
-	 else {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-		  /* VERIFY logic of subtracting 1 to a write-masked value */
-		  GLstencil s = stencil[i];
-		  if (s>0) {
-		     stencil[i] = (GLstencil) ((invmask & s) | (wrtmask & (s-1)));
-		  }
-	       }
-	    }
-	 }
-	 break;
-      case GL_INCR_WRAP_EXT:
-	 if (invmask==0) {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-                  stencil[i]++;
-	       }
-	    }
-	 }
-	 else {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-                  GLstencil s = stencil[i];
-                  stencil[i] = (GLstencil) ((invmask & s) | (wrtmask & (s+1)));
-	       }
-	    }
-	 }
-	 break;
-      case GL_DECR_WRAP_EXT:
-	 if (invmask==0) {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-		  stencil[i]--;
-	       }
-	    }
-	 }
-	 else {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-                  GLstencil s = stencil[i];
-                  stencil[i] = (GLstencil) ((invmask & s) | (wrtmask & (s-1)));
-	       }
-	    }
-	 }
-	 break;
-      case GL_INVERT:
-	 if (invmask==0) {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-		  GLstencil s = stencil[i];
-		  stencil[i] = (GLstencil) ~s;
-	       }
-	    }
-	 }
-	 else {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-		  GLstencil s = stencil[i];
-		  stencil[i] = (GLstencil) ((invmask & s) | (wrtmask & ~s));
-	       }
-	    }
-	 }
-	 break;
-      default:
-         _mesa_problem(ctx, "Bad stencil op in apply_stencil_op");
-   }
-}
-
-
-
-
-/**
- * Apply stencil test to an array of stencil values (before depth buffering).
- * Input:  face - 0 or 1 for front or back-face polygons
- *         n - number of pixels in the array
- *         stencil - array of [n] stencil values
- *         mask - array [n] of flag:  0=skip the pixel, 1=stencil the pixel
- * Output:  mask - pixels which fail the stencil test will have their
- *                 mask flag set to 0.
- *          stencil - updated stencil values (where the test passed)
- * Return:  GL_FALSE = all pixels failed, GL_TRUE = zero or more pixels passed.
- */
-static GLboolean
-do_stencil_test( struct gl_context *ctx, GLuint face, GLuint n, GLstencil stencil[],
-                 GLubyte mask[] )
-{
-   GLubyte fail[MAX_WIDTH];
-   GLboolean allfail = GL_FALSE;
-   GLuint i;
-   const GLuint valueMask = ctx->Stencil.ValueMask[face];
-   const GLstencil r = (GLstencil) (ctx->Stencil.Ref[face] & valueMask);
-   GLstencil s;
-
-   ASSERT(n <= MAX_WIDTH);
-
-   /*
-    * Perform stencil test.  The results of this operation are stored
-    * in the fail[] array:
-    *   IF fail[i] is non-zero THEN
-    *       the stencil fail operator is to be applied
-    *   ELSE
-    *       the stencil fail operator is not to be applied
-    *   ENDIF
-    */
-   switch (ctx->Stencil.Function[face]) {
-      case GL_NEVER:
-         /* never pass; always fail */
-         for (i=0;i<n;i++) {
-	    if (mask[i]) {
-	       mask[i] = 0;
-	       fail[i] = 1;
-	    }
-	    else {
-	       fail[i] = 0;
-	    }
-	 }
-	 allfail = GL_TRUE;
-	 break;
-      case GL_LESS:
-	 for (i=0;i<n;i++) {
-	    if (mask[i]) {
-	       s = (GLstencil) (stencil[i] & valueMask);
-	       if (r < s) {
-		  /* passed */
-		  fail[i] = 0;
-	       }
-	       else {
-		  fail[i] = 1;
-		  mask[i] = 0;
-	       }
-	    }
-	    else {
-	       fail[i] = 0;
-	    }
-	 }
-	 break;
-      case GL_LEQUAL:
-	 for (i=0;i<n;i++) {
-	    if (mask[i]) {
-	       s = (GLstencil) (stencil[i] & valueMask);
-	       if (r <= s) {
-		  /* pass */
-		  fail[i] = 0;
-	       }
-	       else {
-		  fail[i] = 1;
-		  mask[i] = 0;
-	       }
-	    }
-	    else {
-	       fail[i] = 0;
-	    }
-	 }
-	 break;
-      case GL_GREATER:
-	 for (i=0;i<n;i++) {
-	    if (mask[i]) {
-	       s = (GLstencil) (stencil[i] & valueMask);
-	       if (r > s) {
-		  /* passed */
-		  fail[i] = 0;
-	       }
-	       else {
-		  fail[i] = 1;
-		  mask[i] = 0;
-	       }
-	    }
-	    else {
-	       fail[i] = 0;
-	    }
-	 }
-	 break;
-      case GL_GEQUAL:
-	 for (i=0;i<n;i++) {
-	    if (mask[i]) {
-	       s = (GLstencil) (stencil[i] & valueMask);
-	       if (r >= s) {
-		  /* passed */
-		  fail[i] = 0;
-	       }
-	       else {
-		  fail[i] = 1;
-		  mask[i] = 0;
-	       }
-	    }
-	    else {
-	       fail[i] = 0;
-	    }
-	 }
-	 break;
-      case GL_EQUAL:
-	 for (i=0;i<n;i++) {
-	    if (mask[i]) {
-	       s = (GLstencil) (stencil[i] & valueMask);
-	       if (r == s) {
-		  /* passed */
-		  fail[i] = 0;
-	       }
-	       else {
-		  fail[i] = 1;
-		  mask[i] = 0;
-	       }
-	    }
-	    else {
-	       fail[i] = 0;
-	    }
-	 }
-	 break;
-      case GL_NOTEQUAL:
-	 for (i=0;i<n;i++) {
-	    if (mask[i]) {
-	       s = (GLstencil) (stencil[i] & valueMask);
-	       if (r != s) {
-		  /* passed */
-		  fail[i] = 0;
-	       }
-	       else {
-		  fail[i] = 1;
-		  mask[i] = 0;
-	       }
-	    }
-	    else {
-	       fail[i] = 0;
-	    }
-	 }
-	 break;
-      case GL_ALWAYS:
-	 /* always pass */
-	 for (i=0;i<n;i++) {
-	    fail[i] = 0;
-	 }
-	 break;
-      default:
-         _mesa_problem(ctx, "Bad stencil func in gl_stencil_span");
-         return 0;
-   }
-
-   if (ctx->Stencil.FailFunc[face] != GL_KEEP) {
-      apply_stencil_op( ctx, ctx->Stencil.FailFunc[face], face, n, stencil, fail );
-   }
-
-   return !allfail;
-}
-
-
-/**
- * Compute the zpass/zfail masks by comparing the pre- and post-depth test
- * masks.
- */
-static INLINE void
-compute_pass_fail_masks(GLuint n, const GLubyte origMask[],
-                        const GLubyte newMask[],
-                        GLubyte passMask[], GLubyte failMask[])
-{
-   GLuint i;
-   for (i = 0; i < n; i++) {
-      ASSERT(newMask[i] == 0 || newMask[i] == 1);
-      passMask[i] = origMask[i] & newMask[i];
-      failMask[i] = origMask[i] & (newMask[i] ^ 1);
-   }
-}
-
-
-/**
- * Apply stencil and depth testing to the span of pixels.
- * Both software and hardware stencil buffers are acceptable.
- * Input:  n - number of pixels in the span
- *         x, y - location of leftmost pixel in span
- *         z - array [n] of z values
- *         mask - array [n] of flags  (1=test this pixel, 0=skip the pixel)
- * Output:  mask - array [n] of flags (1=stencil and depth test passed)
- * Return: GL_FALSE - all fragments failed the testing
- *         GL_TRUE - one or more fragments passed the testing
- *
- */
-static GLboolean
-stencil_and_ztest_span(struct gl_context *ctx, SWspan *span, GLuint face)
-{
-   struct gl_framebuffer *fb = ctx->DrawBuffer;
-   struct gl_renderbuffer *rb = fb->_StencilBuffer;
-   GLstencil stencilRow[MAX_WIDTH];
-   GLstencil *stencil;
-   const GLuint n = span->end;
-   const GLint x = span->x;
-   const GLint y = span->y;
-   GLubyte *mask = span->array->mask;
-
-   ASSERT((span->arrayMask & SPAN_XY) == 0);
-   ASSERT(ctx->Stencil.Enabled);
-   ASSERT(n <= MAX_WIDTH);
-#ifdef DEBUG
-   if (ctx->Depth.Test) {
-      ASSERT(span->arrayMask & SPAN_Z);
-   }
-#endif
-
-   stencil = (GLstencil *) rb->GetPointer(ctx, rb, x, y);
-   if (!stencil) {
-      rb->GetRow(ctx, rb, n, x, y, stencilRow);
-      stencil = stencilRow;
-   }
-
-   /*
-    * Apply the stencil test to the fragments.
-    * failMask[i] is 1 if the stencil test failed.
-    */
-   if (do_stencil_test( ctx, face, n, stencil, mask ) == GL_FALSE) {
-      /* all fragments failed the stencil test, we're done. */
-      span->writeAll = GL_FALSE;
-      if (!rb->GetPointer(ctx, rb, 0, 0)) {
-         /* put updated stencil values into buffer */
-         rb->PutRow(ctx, rb, n, x, y, stencil, NULL);
-      }
-      return GL_FALSE;
-   }
-
-   /*
-    * Some fragments passed the stencil test, apply depth test to them
-    * and apply Zpass and Zfail stencil ops.
-    */
-   if (ctx->Depth.Test == GL_FALSE) {
-      /*
-       * No depth buffer, just apply zpass stencil function to active pixels.
-       */
-      apply_stencil_op( ctx, ctx->Stencil.ZPassFunc[face], face, n, stencil, mask );
-   }
-   else {
-      /*
-       * Perform depth buffering, then apply zpass or zfail stencil function.
-       */
-      GLubyte passMask[MAX_WIDTH], failMask[MAX_WIDTH], origMask[MAX_WIDTH];
-
-      /* save the current mask bits */
-      memcpy(origMask, mask, n * sizeof(GLubyte));
-
-      /* apply the depth test */
-      _swrast_depth_test_span(ctx, span);
-
-      compute_pass_fail_masks(n, origMask, mask, passMask, failMask);
-
-      /* apply the pass and fail operations */
-      if (ctx->Stencil.ZFailFunc[face] != GL_KEEP) {
-         apply_stencil_op( ctx, ctx->Stencil.ZFailFunc[face], face,
-                           n, stencil, failMask );
-      }
-      if (ctx->Stencil.ZPassFunc[face] != GL_KEEP) {
-         apply_stencil_op( ctx, ctx->Stencil.ZPassFunc[face], face,
-                           n, stencil, passMask );
-      }
-   }
-
-   /*
-    * Write updated stencil values back into hardware stencil buffer.
-    */
-   if (!rb->GetPointer(ctx, rb, 0, 0)) {
-      rb->PutRow(ctx, rb, n, x, y, stencil, NULL);
-   }
-   
-   span->writeAll = GL_FALSE;
-   
-   return GL_TRUE;  /* one or more fragments passed both tests */
-}
-
-
-
-/*
- * Return the address of a stencil buffer value given the window coords:
- */
-#define STENCIL_ADDRESS(X, Y)  (stencilStart + (Y) * stride + (X))
-
-
-
-/**
- * Apply the given stencil operator for each pixel in the array whose
- * mask flag is set.
- * \note  This is for software stencil buffers only.
- * Input:  n - number of pixels in the span
- *         x, y - array of [n] pixels
- *         operator - the stencil buffer operator
- *         mask - array [n] of flag:  1=apply operator, 0=don't apply operator
- */
-static void
-apply_stencil_op_to_pixels( struct gl_context *ctx,
-                            GLuint n, const GLint x[], const GLint y[],
-                            GLenum oper, GLuint face, const GLubyte mask[] )
-{
-   struct gl_framebuffer *fb = ctx->DrawBuffer;
-   struct gl_renderbuffer *rb = fb->_StencilBuffer;
-   const GLstencil stencilMax = (1 << fb->Visual.stencilBits) - 1;
-   const GLstencil ref = ctx->Stencil.Ref[face];
-   const GLstencil wrtmask = ctx->Stencil.WriteMask[face];
-   const GLstencil invmask = (GLstencil) (~wrtmask);
-   GLuint i;
-   GLstencil *stencilStart = (GLubyte *) rb->Data;
-   const GLuint stride = rb->Width;
-
-   ASSERT(rb->GetPointer(ctx, rb, 0, 0));
-   ASSERT(sizeof(GLstencil) == 1);
-
-   switch (oper) {
-      case GL_KEEP:
-         /* do nothing */
-         break;
-      case GL_ZERO:
-	 if (invmask==0) {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
-                  *sptr = 0;
-	       }
-	    }
-	 }
-	 else {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
-		  *sptr = (GLstencil) (invmask & *sptr);
-	       }
-	    }
-	 }
-	 break;
-      case GL_REPLACE:
-	 if (invmask==0) {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
-                  *sptr = ref;
-	       }
-	    }
-	 }
-	 else {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
-		  *sptr = (GLstencil) ((invmask & *sptr ) | (wrtmask & ref));
-	       }
-	    }
-	 }
-	 break;
-      case GL_INCR:
-	 if (invmask==0) {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
-		  if (*sptr < stencilMax) {
-		     *sptr = (GLstencil) (*sptr + 1);
-		  }
-	       }
-	    }
-	 }
-	 else {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
-		  if (*sptr < stencilMax) {
-		     *sptr = (GLstencil) ((invmask & *sptr) | (wrtmask & (*sptr+1)));
-		  }
-	       }
-	    }
-	 }
-	 break;
-      case GL_DECR:
-	 if (invmask==0) {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
-		  if (*sptr>0) {
-		     *sptr = (GLstencil) (*sptr - 1);
-		  }
-	       }
-	    }
-	 }
-	 else {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
-		  if (*sptr>0) {
-		     *sptr = (GLstencil) ((invmask & *sptr) | (wrtmask & (*sptr-1)));
-		  }
-	       }
-	    }
-	 }
-	 break;
-      case GL_INCR_WRAP_EXT:
-	 if (invmask==0) {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
-                  *sptr = (GLstencil) (*sptr + 1);
-	       }
-	    }
-	 }
-	 else {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
-                  *sptr = (GLstencil) ((invmask & *sptr) | (wrtmask & (*sptr+1)));
-	       }
-	    }
-	 }
-	 break;
-      case GL_DECR_WRAP_EXT:
-	 if (invmask==0) {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
-                  *sptr = (GLstencil) (*sptr - 1);
-	       }
-	    }
-	 }
-	 else {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
-                  *sptr = (GLstencil) ((invmask & *sptr) | (wrtmask & (*sptr-1)));
-	       }
-	    }
-	 }
-	 break;
-      case GL_INVERT:
-	 if (invmask==0) {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
-                  *sptr = (GLstencil) (~*sptr);
-	       }
-	    }
-	 }
-	 else {
-	    for (i=0;i<n;i++) {
-	       if (mask[i]) {
-                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
-                  *sptr = (GLstencil) ((invmask & *sptr) | (wrtmask & ~*sptr));
-	       }
-	    }
-	 }
-	 break;
-      default:
-         _mesa_problem(ctx, "Bad stencilop in apply_stencil_op_to_pixels");
-   }
-}
-
-
-
-/**
- * Apply stencil test to an array of pixels before depth buffering.
- *
- * \note Used for software stencil buffer only.
- * Input:  n - number of pixels in the span
- *         x, y - array of [n] pixels to stencil
- *         mask - array [n] of flag:  0=skip the pixel, 1=stencil the pixel
- * Output:  mask - pixels which fail the stencil test will have their
- *                 mask flag set to 0.
- * \return  GL_FALSE = all pixels failed, GL_TRUE = zero or more pixels passed.
- */
-static GLboolean
-stencil_test_pixels( struct gl_context *ctx, GLuint face, GLuint n,
-                     const GLint x[], const GLint y[], GLubyte mask[] )
-{
-   const struct gl_framebuffer *fb = ctx->DrawBuffer;
-   struct gl_renderbuffer *rb = fb->_StencilBuffer;
-   GLubyte fail[MAX_WIDTH];
-   GLstencil r, s;
-   GLuint i;
-   GLboolean allfail = GL_FALSE;
-   const GLuint valueMask = ctx->Stencil.ValueMask[face];
-   const GLstencil *stencilStart = (GLstencil *) rb->Data;
-   const GLuint stride = rb->Width;
-
-   ASSERT(rb->GetPointer(ctx, rb, 0, 0));
-   ASSERT(sizeof(GLstencil) == 1);
-
-   /*
-    * Perform stencil test.  The results of this operation are stored
-    * in the fail[] array:
-    *   IF fail[i] is non-zero THEN
-    *       the stencil fail operator is to be applied
-    *   ELSE
-    *       the stencil fail operator is not to be applied
-    *   ENDIF
-    */
-
-   switch (ctx->Stencil.Function[face]) {
-      case GL_NEVER:
-         /* always fail */
-         for (i=0;i<n;i++) {
-	    if (mask[i]) {
-	       mask[i] = 0;
-	       fail[i] = 1;
-	    }
-	    else {
-	       fail[i] = 0;
-	    }
-	 }
-	 allfail = GL_TRUE;
-	 break;
-      case GL_LESS:
-	 r = (GLstencil) (ctx->Stencil.Ref[face] & valueMask);
-	 for (i=0;i<n;i++) {
-	    if (mask[i]) {
-               const GLstencil *sptr = STENCIL_ADDRESS(x[i],y[i]);
-	       s = (GLstencil) (*sptr & valueMask);
-	       if (r < s) {
-		  /* passed */
-		  fail[i] = 0;
-	       }
-	       else {
-		  fail[i] = 1;
-		  mask[i] = 0;
-	       }
-	    }
-	    else {
-	       fail[i] = 0;
-	    }
-	 }
-	 break;
-      case GL_LEQUAL:
-	 r = (GLstencil) (ctx->Stencil.Ref[face] & valueMask);
-	 for (i=0;i<n;i++) {
-	    if (mask[i]) {
-               const GLstencil *sptr = STENCIL_ADDRESS(x[i],y[i]);
-	       s = (GLstencil) (*sptr & valueMask);
-	       if (r <= s) {
-		  /* pass */
-		  fail[i] = 0;
-	       }
-	       else {
-		  fail[i] = 1;
-		  mask[i] = 0;
-	       }
-	    }
-	    else {
-	       fail[i] = 0;
-	    }
-	 }
-	 break;
-      case GL_GREATER:
-	 r = (GLstencil) (ctx->Stencil.Ref[face] & valueMask);
-	 for (i=0;i<n;i++) {
-	    if (mask[i]) {
-               const GLstencil *sptr = STENCIL_ADDRESS(x[i],y[i]);
-	       s = (GLstencil) (*sptr & valueMask);
-	       if (r > s) {
-		  /* passed */
-		  fail[i] = 0;
-	       }
-	       else {
-		  fail[i] = 1;
-		  mask[i] = 0;
-	       }
-	    }
-	    else {
-	       fail[i] = 0;
-	    }
-	 }
-	 break;
-      case GL_GEQUAL:
-	 r = (GLstencil) (ctx->Stencil.Ref[face] & valueMask);
-	 for (i=0;i<n;i++) {
-	    if (mask[i]) {
-               const GLstencil *sptr = STENCIL_ADDRESS(x[i],y[i]);
-	       s = (GLstencil) (*sptr & valueMask);
-	       if (r >= s) {
-		  /* passed */
-		  fail[i] = 0;
-	       }
-	       else {
-		  fail[i] = 1;
-		  mask[i] = 0;
-	       }
-	    }
-	    else {
-	       fail[i] = 0;
-	    }
-	 }
-	 break;
-      case GL_EQUAL:
-	 r = (GLstencil) (ctx->Stencil.Ref[face] & valueMask);
-	 for (i=0;i<n;i++) {
-	    if (mask[i]) {
-               const GLstencil *sptr = STENCIL_ADDRESS(x[i],y[i]);
-	       s = (GLstencil) (*sptr & valueMask);
-	       if (r == s) {
-		  /* passed */
-		  fail[i] = 0;
-	       }
-	       else {
-		  fail[i] = 1;
-		  mask[i] = 0;
-	       }
-	    }
-	    else {
-	       fail[i] = 0;
-	    }
-	 }
-	 break;
-      case GL_NOTEQUAL:
-	 r = (GLstencil) (ctx->Stencil.Ref[face] & valueMask);
-	 for (i=0;i<n;i++) {
-	    if (mask[i]) {
-               const GLstencil *sptr = STENCIL_ADDRESS(x[i],y[i]);
-	       s = (GLstencil) (*sptr & valueMask);
-	       if (r != s) {
-		  /* passed */
-		  fail[i] = 0;
-	       }
-	       else {
-		  fail[i] = 1;
-		  mask[i] = 0;
-	       }
-	    }
-	    else {
-	       fail[i] = 0;
-	    }
-	 }
-	 break;
-      case GL_ALWAYS:
-	 /* always pass */
-	 for (i=0;i<n;i++) {
-	    fail[i] = 0;
-	 }
-	 break;
-      default:
-         _mesa_problem(ctx, "Bad stencil func in gl_stencil_pixels");
-         return 0;
-   }
-
-   if (ctx->Stencil.FailFunc[face] != GL_KEEP) {
-      apply_stencil_op_to_pixels( ctx, n, x, y, ctx->Stencil.FailFunc[face],
-                                  face, fail );
-   }
-
-   return !allfail;
-}
-
-
-
-
-/**
- * Apply stencil and depth testing to an array of pixels.
- * This is used both for software and hardware stencil buffers.
- *
- * The comments in this function are a bit sparse but the code is
- * almost identical to stencil_and_ztest_span(), which is well
- * commented.
- *
- * Input:  n - number of pixels in the array
- *         x, y - array of [n] pixel positions
- *         z - array [n] of z values
- *         mask - array [n] of flags  (1=test this pixel, 0=skip the pixel)
- * Output: mask - array [n] of flags (1=stencil and depth test passed)
- * Return: GL_FALSE - all fragments failed the testing
- *         GL_TRUE - one or more fragments passed the testing
- */
-static GLboolean
-stencil_and_ztest_pixels( struct gl_context *ctx, SWspan *span, GLuint face )
-{
-   GLubyte passMask[MAX_WIDTH], failMask[MAX_WIDTH], origMask[MAX_WIDTH];
-   struct gl_framebuffer *fb = ctx->DrawBuffer;
-   struct gl_renderbuffer *rb = fb->_StencilBuffer;
-   const GLuint n = span->end;
-   const GLint *x = span->array->x;
-   const GLint *y = span->array->y;
-   GLubyte *mask = span->array->mask;
-
-   ASSERT(span->arrayMask & SPAN_XY);
-   ASSERT(ctx->Stencil.Enabled);
-   ASSERT(n <= MAX_WIDTH);
-
-   if (!rb->GetPointer(ctx, rb, 0, 0)) {
-      /* No direct access */
-      GLstencil stencil[MAX_WIDTH];
-
-      ASSERT(rb->DataType == GL_UNSIGNED_BYTE);
-      _swrast_get_values(ctx, rb, n, x, y, stencil, sizeof(GLubyte));
-
-      memcpy(origMask, mask, n * sizeof(GLubyte));          
-
-      (void) do_stencil_test(ctx, face, n, stencil, mask);
-
-      if (ctx->Depth.Test == GL_FALSE) {
-         apply_stencil_op(ctx, ctx->Stencil.ZPassFunc[face], face,
-                          n, stencil, mask);
-      }
-      else {
-         GLubyte tmpMask[MAX_WIDTH]; 
-         memcpy(tmpMask, mask, n * sizeof(GLubyte));
-
-         _swrast_depth_test_span(ctx, span);
-
-         compute_pass_fail_masks(n, tmpMask, mask, passMask, failMask);
-
-         if (ctx->Stencil.ZFailFunc[face] != GL_KEEP) {
-            apply_stencil_op(ctx, ctx->Stencil.ZFailFunc[face], face,
-                             n, stencil, failMask);
-         }
-         if (ctx->Stencil.ZPassFunc[face] != GL_KEEP) {
-            apply_stencil_op(ctx, ctx->Stencil.ZPassFunc[face], face,
-                             n, stencil, passMask);
-         }
-      }
-
-      /* Write updated stencil values into hardware stencil buffer */
-      rb->PutValues(ctx, rb, n, x, y, stencil, origMask);
-
-      return GL_TRUE;
-   }
-   else {
-      /* Direct access to stencil buffer */
-
-      if (stencil_test_pixels(ctx, face, n, x, y, mask) == GL_FALSE) {
-         /* all fragments failed the stencil test, we're done. */
-         return GL_FALSE;
-      }
-
-      if (ctx->Depth.Test==GL_FALSE) {
-         apply_stencil_op_to_pixels(ctx, n, x, y,
-                                    ctx->Stencil.ZPassFunc[face], face, mask);
-      }
-      else {
-         memcpy(origMask, mask, n * sizeof(GLubyte));
-
-         _swrast_depth_test_span(ctx, span);
-
-         compute_pass_fail_masks(n, origMask, mask, passMask, failMask);
-
-         if (ctx->Stencil.ZFailFunc[face] != GL_KEEP) {
-            apply_stencil_op_to_pixels(ctx, n, x, y,
-                                       ctx->Stencil.ZFailFunc[face],
-                                       face, failMask);
-         }
-         if (ctx->Stencil.ZPassFunc[face] != GL_KEEP) {
-            apply_stencil_op_to_pixels(ctx, n, x, y,
-                                       ctx->Stencil.ZPassFunc[face],
-                                       face, passMask);
-         }
-      }
-
-      return GL_TRUE;  /* one or more fragments passed both tests */
-   }
-}
-
-
-/**
- * /return GL_TRUE = one or more fragments passed,
- * GL_FALSE = all fragments failed.
- */
-GLboolean
-_swrast_stencil_and_ztest_span(struct gl_context *ctx, SWspan *span)
-{
-   const GLuint face = (span->facing == 0) ? 0 : ctx->Stencil._BackFace;
-
-   if (span->arrayMask & SPAN_XY)
-      return stencil_and_ztest_pixels(ctx, span, face);
-   else
-      return stencil_and_ztest_span(ctx, span, face);
-}
-
-
-#if 0
-GLuint
-clip_span(GLuint bufferWidth, GLuint bufferHeight,
-          GLint x, GLint y, GLuint *count)
-{
-   GLuint n = *count;
-   GLuint skipPixels = 0;
-
-   if (y < 0 || y >= bufferHeight || x + n <= 0 || x >= bufferWidth) {
-      /* totally out of bounds */
-      n = 0;
-   }
-   else {
-      /* left clip */
-      if (x < 0) {
-         skipPixels = -x;
-         x = 0;
-         n -= skipPixels;
-      }
-      /* right clip */
-      if (x + n > bufferWidth) {
-         GLint dx = x + n - bufferWidth;
-         n -= dx;
-      }
-   }
-
-   *count = n;
-
-   return skipPixels;
-}
-#endif
-
-
-/**
- * Return a span of stencil values from the stencil buffer.
- * Used for glRead/CopyPixels
- * Input:  n - how many pixels
- *         x,y - location of first pixel
- * Output:  stencil - the array of stencil values
- */
-void
-_swrast_read_stencil_span(struct gl_context *ctx, struct gl_renderbuffer *rb,
-                          GLint n, GLint x, GLint y, GLstencil stencil[])
-{
-   if (y < 0 || y >= (GLint) rb->Height ||
-       x + n <= 0 || x >= (GLint) rb->Width) {
-      /* span is completely outside framebuffer */
-      return; /* undefined values OK */
-   }
-
-   if (x < 0) {
-      GLint dx = -x;
-      x = 0;
-      n -= dx;
-      stencil += dx;
-   }
-   if (x + n > (GLint) rb->Width) {
-      GLint dx = x + n - rb->Width;
-      n -= dx;
-   }
-   if (n <= 0) {
-      return;
-   }
-
-   rb->GetRow(ctx, rb, n, x, y, stencil);
-}
-
-
-
-/**
- * Write a span of stencil values to the stencil buffer.  This function
- * applies the stencil write mask when needed.
- * Used for glDraw/CopyPixels
- * Input:  n - how many pixels
- *         x, y - location of first pixel
- *         stencil - the array of stencil values
- */
-void
-_swrast_write_stencil_span(struct gl_context *ctx, GLint n, GLint x, GLint y,
-                           const GLstencil stencil[] )
-{
-   struct gl_framebuffer *fb = ctx->DrawBuffer;
-   struct gl_renderbuffer *rb = fb->_StencilBuffer;
-   const GLuint stencilMax = (1 << fb->Visual.stencilBits) - 1;
-   const GLuint stencilMask = ctx->Stencil.WriteMask[0];
-
-   if (y < 0 || y >= (GLint) rb->Height ||
-       x + n <= 0 || x >= (GLint) rb->Width) {
-      /* span is completely outside framebuffer */
-      return; /* undefined values OK */
-   }
-   if (x < 0) {
-      GLint dx = -x;
-      x = 0;
-      n -= dx;
-      stencil += dx;
-   }
-   if (x + n > (GLint) rb->Width) {
-      GLint dx = x + n - rb->Width;
-      n -= dx;
-   }
-   if (n <= 0) {
-      return;
-   }
-
-   if ((stencilMask & stencilMax) != stencilMax) {
-      /* need to apply writemask */
-      GLstencil destVals[MAX_WIDTH], newVals[MAX_WIDTH];
-      GLint i;
-      rb->GetRow(ctx, rb, n, x, y, destVals);
-      for (i = 0; i < n; i++) {
-         newVals[i]
-            = (stencil[i] & stencilMask) | (destVals[i] & ~stencilMask);
-      }
-      rb->PutRow(ctx, rb, n, x, y, newVals, NULL);
-   }
-   else {
-      rb->PutRow(ctx, rb, n, x, y, stencil, NULL);
-   }
-}
-
-
-
-/**
- * Clear the stencil buffer.
- */
-void
-_swrast_clear_stencil_buffer( struct gl_context *ctx, struct gl_renderbuffer *rb )
-{
-   const GLubyte stencilBits = ctx->DrawBuffer->Visual.stencilBits;
-   const GLuint mask = ctx->Stencil.WriteMask[0];
-   const GLuint invMask = ~mask;
-   const GLuint clearVal = (ctx->Stencil.Clear & mask);
-   const GLuint stencilMax = (1 << stencilBits) - 1;
-   GLint x, y, width, height;
-
-   if (!rb || mask == 0)
-      return;
-
-   ASSERT(rb->DataType == GL_UNSIGNED_BYTE ||
-          rb->DataType == GL_UNSIGNED_SHORT);
-
-   ASSERT(rb->_BaseFormat == GL_STENCIL_INDEX);
-
-   /* compute region to clear */
-   x = ctx->DrawBuffer->_Xmin;
-   y = ctx->DrawBuffer->_Ymin;
-   width  = ctx->DrawBuffer->_Xmax - ctx->DrawBuffer->_Xmin;
-   height = ctx->DrawBuffer->_Ymax - ctx->DrawBuffer->_Ymin;
-
-   if (rb->GetPointer(ctx, rb, 0, 0)) {
-      /* Direct buffer access */
-      if ((mask & stencilMax) != stencilMax) {
-         /* need to mask the clear */
-         if (rb->DataType == GL_UNSIGNED_BYTE) {
-            GLint i, j;
-            for (i = 0; i < height; i++) {
-               GLubyte *stencil = (GLubyte*) rb->GetPointer(ctx, rb, x, y + i);
-               for (j = 0; j < width; j++) {
-                  stencil[j] = (stencil[j] & invMask) | clearVal;
-               }
-            }
-         }
-         else {
-            GLint i, j;
-            for (i = 0; i < height; i++) {
-               GLushort *stencil = (GLushort*) rb->GetPointer(ctx, rb, x, y + i);
-               for (j = 0; j < width; j++) {
-                  stencil[j] = (stencil[j] & invMask) | clearVal;
-               }
-            }
-         }
-      }
-      else {
-         /* no bit masking */
-         if (width == (GLint) rb->Width && rb->DataType == GL_UNSIGNED_BYTE) {
-            /* optimized case */
-            /* Note: bottom-to-top raster assumed! */
-            GLubyte *stencil = (GLubyte *) rb->GetPointer(ctx, rb, x, y);
-            GLuint len = width * height * sizeof(GLubyte);
-            memset(stencil, clearVal, len);
-         }
-         else {
-            /* general case */
-            GLint i;
-            for (i = 0; i < height; i++) {
-               GLvoid *stencil = rb->GetPointer(ctx, rb, x, y + i);
-               if (rb->DataType == GL_UNSIGNED_BYTE) {
-                  memset(stencil, clearVal, width);
-               }
-               else {
-                  _mesa_memset16((short unsigned int*) stencil, clearVal, width);
-               }
-            }
-         }
-      }
-   }
-   else {
-      /* no direct access */
-      if ((mask & stencilMax) != stencilMax) {
-         /* need to mask the clear */
-         if (rb->DataType == GL_UNSIGNED_BYTE) {
-            GLint i, j;
-            for (i = 0; i < height; i++) {
-               GLubyte stencil[MAX_WIDTH];
-               rb->GetRow(ctx, rb, width, x, y + i, stencil);
-               for (j = 0; j < width; j++) {
-                  stencil[j] = (stencil[j] & invMask) | clearVal;
-               }
-               rb->PutRow(ctx, rb, width, x, y + i, stencil, NULL);
-            }
-         }
-         else {
-            GLint i, j;
-            for (i = 0; i < height; i++) {
-               GLushort stencil[MAX_WIDTH];
-               rb->GetRow(ctx, rb, width, x, y + i, stencil);
-               for (j = 0; j < width; j++) {
-                  stencil[j] = (stencil[j] & invMask) | clearVal;
-               }
-               rb->PutRow(ctx, rb, width, x, y + i, stencil, NULL);
-            }
-         }
-      }
-      else {
-         /* no bit masking */
-         const GLubyte clear8 = (GLubyte) clearVal;
-         const GLushort clear16 = (GLushort) clearVal;
-         const void *clear;
-         GLint i;
-         if (rb->DataType == GL_UNSIGNED_BYTE) {
-            clear = &clear8;
-         }
-         else {
-            clear = &clear16;
-         }
-         for (i = 0; i < height; i++) {
-            rb->PutMonoRow(ctx, rb, width, x, y + i, clear, NULL);
-         }
-      }
-   }
-}
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.1
+ *
+ * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+#include "main/glheader.h"
+#include "main/context.h"
+#include "main/imports.h"
+
+#include "s_context.h"
+#include "s_depth.h"
+#include "s_stencil.h"
+#include "s_span.h"
+
+
+
+/* Stencil Logic:
+
+IF stencil test fails THEN
+   Apply fail-op to stencil value
+   Don't write the pixel (RGBA,Z)
+ELSE
+   IF doing depth test && depth test fails THEN
+      Apply zfail-op to stencil value
+      Write RGBA and Z to appropriate buffers
+   ELSE
+      Apply zpass-op to stencil value
+ENDIF
+
+*/
+
+
+/**
+ * Apply the given stencil operator to the array of stencil values.
+ * Don't touch stencil[i] if mask[i] is zero.
+ * Input:  n - size of stencil array
+ *         oper - the stencil buffer operator
+ *         face - 0 or 1 for front or back face operation
+ *         stencil - array of stencil values
+ *         mask - array [n] of flag:  1=apply operator, 0=don't apply operator
+ * Output:  stencil - modified values
+ */
+static void
+apply_stencil_op( const struct gl_context *ctx, GLenum oper, GLuint face,
+                  GLuint n, GLstencil stencil[], const GLubyte mask[] )
+{
+   const GLstencil ref = ctx->Stencil.Ref[face];
+   const GLstencil wrtmask = ctx->Stencil.WriteMask[face];
+   const GLstencil invmask = (GLstencil) (~wrtmask);
+   const GLstencil stencilMax = (1 << ctx->DrawBuffer->Visual.stencilBits) - 1;
+   GLuint i;
+
+   switch (oper) {
+      case GL_KEEP:
+         /* do nothing */
+         break;
+      case GL_ZERO:
+	 if (invmask==0) {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+		  stencil[i] = 0;
+	       }
+	    }
+	 }
+	 else {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+		  stencil[i] = (GLstencil) (stencil[i] & invmask);
+	       }
+	    }
+	 }
+	 break;
+      case GL_REPLACE:
+	 if (invmask==0) {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+                  stencil[i] = ref;
+	       }
+	    }
+	 }
+	 else {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+		  GLstencil s = stencil[i];
+		  stencil[i] = (GLstencil) ((invmask & s ) | (wrtmask & ref));
+	       }
+	    }
+	 }
+	 break;
+      case GL_INCR:
+	 if (invmask==0) {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+		  GLstencil s = stencil[i];
+		  if (s < stencilMax) {
+		     stencil[i] = (GLstencil) (s+1);
+		  }
+	       }
+	    }
+	 }
+	 else {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+		  /* VERIFY logic of adding 1 to a write-masked value */
+		  GLstencil s = stencil[i];
+		  if (s < stencilMax) {
+		     stencil[i] = (GLstencil) ((invmask & s) | (wrtmask & (s+1)));
+		  }
+	       }
+	    }
+	 }
+	 break;
+      case GL_DECR:
+	 if (invmask==0) {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+		  GLstencil s = stencil[i];
+		  if (s>0) {
+		     stencil[i] = (GLstencil) (s-1);
+		  }
+	       }
+	    }
+	 }
+	 else {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+		  /* VERIFY logic of subtracting 1 to a write-masked value */
+		  GLstencil s = stencil[i];
+		  if (s>0) {
+		     stencil[i] = (GLstencil) ((invmask & s) | (wrtmask & (s-1)));
+		  }
+	       }
+	    }
+	 }
+	 break;
+      case GL_INCR_WRAP_EXT:
+	 if (invmask==0) {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+                  stencil[i]++;
+	       }
+	    }
+	 }
+	 else {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+                  GLstencil s = stencil[i];
+                  stencil[i] = (GLstencil) ((invmask & s) | (wrtmask & (s+1)));
+	       }
+	    }
+	 }
+	 break;
+      case GL_DECR_WRAP_EXT:
+	 if (invmask==0) {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+		  stencil[i]--;
+	       }
+	    }
+	 }
+	 else {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+                  GLstencil s = stencil[i];
+                  stencil[i] = (GLstencil) ((invmask & s) | (wrtmask & (s-1)));
+	       }
+	    }
+	 }
+	 break;
+      case GL_INVERT:
+	 if (invmask==0) {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+		  GLstencil s = stencil[i];
+		  stencil[i] = (GLstencil) ~s;
+	       }
+	    }
+	 }
+	 else {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+		  GLstencil s = stencil[i];
+		  stencil[i] = (GLstencil) ((invmask & s) | (wrtmask & ~s));
+	       }
+	    }
+	 }
+	 break;
+      default:
+         _mesa_problem(ctx, "Bad stencil op in apply_stencil_op");
+   }
+}
+
+
+
+
+/**
+ * Apply stencil test to an array of stencil values (before depth buffering).
+ * Input:  face - 0 or 1 for front or back-face polygons
+ *         n - number of pixels in the array
+ *         stencil - array of [n] stencil values
+ *         mask - array [n] of flag:  0=skip the pixel, 1=stencil the pixel
+ * Output:  mask - pixels which fail the stencil test will have their
+ *                 mask flag set to 0.
+ *          stencil - updated stencil values (where the test passed)
+ * Return:  GL_FALSE = all pixels failed, GL_TRUE = zero or more pixels passed.
+ */
+static GLboolean
+do_stencil_test( struct gl_context *ctx, GLuint face, GLuint n, GLstencil stencil[],
+                 GLubyte mask[] )
+{
+   GLubyte fail[MAX_WIDTH];
+   GLboolean allfail = GL_FALSE;
+   GLuint i;
+   const GLuint valueMask = ctx->Stencil.ValueMask[face];
+   const GLstencil r = (GLstencil) (ctx->Stencil.Ref[face] & valueMask);
+   GLstencil s;
+
+   ASSERT(n <= MAX_WIDTH);
+
+   /*
+    * Perform stencil test.  The results of this operation are stored
+    * in the fail[] array:
+    *   IF fail[i] is non-zero THEN
+    *       the stencil fail operator is to be applied
+    *   ELSE
+    *       the stencil fail operator is not to be applied
+    *   ENDIF
+    */
+   switch (ctx->Stencil.Function[face]) {
+      case GL_NEVER:
+         /* never pass; always fail */
+         for (i=0;i<n;i++) {
+	    if (mask[i]) {
+	       mask[i] = 0;
+	       fail[i] = 1;
+	    }
+	    else {
+	       fail[i] = 0;
+	    }
+	 }
+	 allfail = GL_TRUE;
+	 break;
+      case GL_LESS:
+	 for (i=0;i<n;i++) {
+	    if (mask[i]) {
+	       s = (GLstencil) (stencil[i] & valueMask);
+	       if (r < s) {
+		  /* passed */
+		  fail[i] = 0;
+	       }
+	       else {
+		  fail[i] = 1;
+		  mask[i] = 0;
+	       }
+	    }
+	    else {
+	       fail[i] = 0;
+	    }
+	 }
+	 break;
+      case GL_LEQUAL:
+	 for (i=0;i<n;i++) {
+	    if (mask[i]) {
+	       s = (GLstencil) (stencil[i] & valueMask);
+	       if (r <= s) {
+		  /* pass */
+		  fail[i] = 0;
+	       }
+	       else {
+		  fail[i] = 1;
+		  mask[i] = 0;
+	       }
+	    }
+	    else {
+	       fail[i] = 0;
+	    }
+	 }
+	 break;
+      case GL_GREATER:
+	 for (i=0;i<n;i++) {
+	    if (mask[i]) {
+	       s = (GLstencil) (stencil[i] & valueMask);
+	       if (r > s) {
+		  /* passed */
+		  fail[i] = 0;
+	       }
+	       else {
+		  fail[i] = 1;
+		  mask[i] = 0;
+	       }
+	    }
+	    else {
+	       fail[i] = 0;
+	    }
+	 }
+	 break;
+      case GL_GEQUAL:
+	 for (i=0;i<n;i++) {
+	    if (mask[i]) {
+	       s = (GLstencil) (stencil[i] & valueMask);
+	       if (r >= s) {
+		  /* passed */
+		  fail[i] = 0;
+	       }
+	       else {
+		  fail[i] = 1;
+		  mask[i] = 0;
+	       }
+	    }
+	    else {
+	       fail[i] = 0;
+	    }
+	 }
+	 break;
+      case GL_EQUAL:
+	 for (i=0;i<n;i++) {
+	    if (mask[i]) {
+	       s = (GLstencil) (stencil[i] & valueMask);
+	       if (r == s) {
+		  /* passed */
+		  fail[i] = 0;
+	       }
+	       else {
+		  fail[i] = 1;
+		  mask[i] = 0;
+	       }
+	    }
+	    else {
+	       fail[i] = 0;
+	    }
+	 }
+	 break;
+      case GL_NOTEQUAL:
+	 for (i=0;i<n;i++) {
+	    if (mask[i]) {
+	       s = (GLstencil) (stencil[i] & valueMask);
+	       if (r != s) {
+		  /* passed */
+		  fail[i] = 0;
+	       }
+	       else {
+		  fail[i] = 1;
+		  mask[i] = 0;
+	       }
+	    }
+	    else {
+	       fail[i] = 0;
+	    }
+	 }
+	 break;
+      case GL_ALWAYS:
+	 /* always pass */
+	 for (i=0;i<n;i++) {
+	    fail[i] = 0;
+	 }
+	 break;
+      default:
+         _mesa_problem(ctx, "Bad stencil func in gl_stencil_span");
+         return 0;
+   }
+
+   if (ctx->Stencil.FailFunc[face] != GL_KEEP) {
+      apply_stencil_op( ctx, ctx->Stencil.FailFunc[face], face, n, stencil, fail );
+   }
+
+   return !allfail;
+}
+
+
+/**
+ * Compute the zpass/zfail masks by comparing the pre- and post-depth test
+ * masks.
+ */
+static INLINE void
+compute_pass_fail_masks(GLuint n, const GLubyte origMask[],
+                        const GLubyte newMask[],
+                        GLubyte passMask[], GLubyte failMask[])
+{
+   GLuint i;
+   for (i = 0; i < n; i++) {
+      ASSERT(newMask[i] == 0 || newMask[i] == 1);
+      passMask[i] = origMask[i] & newMask[i];
+      failMask[i] = origMask[i] & (newMask[i] ^ 1);
+   }
+}
+
+
+/**
+ * Apply stencil and depth testing to the span of pixels.
+ * Both software and hardware stencil buffers are acceptable.
+ * Input:  n - number of pixels in the span
+ *         x, y - location of leftmost pixel in span
+ *         z - array [n] of z values
+ *         mask - array [n] of flags  (1=test this pixel, 0=skip the pixel)
+ * Output:  mask - array [n] of flags (1=stencil and depth test passed)
+ * Return: GL_FALSE - all fragments failed the testing
+ *         GL_TRUE - one or more fragments passed the testing
+ *
+ */
+static GLboolean
+stencil_and_ztest_span(struct gl_context *ctx, SWspan *span, GLuint face)
+{
+   struct gl_framebuffer *fb = ctx->DrawBuffer;
+   struct gl_renderbuffer *rb = fb->_StencilBuffer;
+   GLstencil stencilRow[MAX_WIDTH];
+   GLstencil *stencil;
+   const GLuint n = span->end;
+   const GLint x = span->x;
+   const GLint y = span->y;
+   GLubyte *mask = span->array->mask;
+
+   ASSERT((span->arrayMask & SPAN_XY) == 0);
+   ASSERT(ctx->Stencil.Enabled);
+   ASSERT(n <= MAX_WIDTH);
+#ifdef DEBUG
+   if (ctx->Depth.Test) {
+      ASSERT(span->arrayMask & SPAN_Z);
+   }
+#endif
+
+   stencil = (GLstencil *) rb->GetPointer(ctx, rb, x, y);
+   if (!stencil) {
+      rb->GetRow(ctx, rb, n, x, y, stencilRow);
+      stencil = stencilRow;
+   }
+
+   /*
+    * Apply the stencil test to the fragments.
+    * failMask[i] is 1 if the stencil test failed.
+    */
+   if (do_stencil_test( ctx, face, n, stencil, mask ) == GL_FALSE) {
+      /* all fragments failed the stencil test, we're done. */
+      span->writeAll = GL_FALSE;
+      if (!rb->GetPointer(ctx, rb, 0, 0)) {
+         /* put updated stencil values into buffer */
+         rb->PutRow(ctx, rb, n, x, y, stencil, NULL);
+      }
+      return GL_FALSE;
+   }
+
+   /*
+    * Some fragments passed the stencil test, apply depth test to them
+    * and apply Zpass and Zfail stencil ops.
+    */
+   if (ctx->Depth.Test == GL_FALSE ||
+       ctx->DrawBuffer->_DepthBuffer == NULL) {
+      /*
+       * No depth buffer, just apply zpass stencil function to active pixels.
+       */
+      apply_stencil_op( ctx, ctx->Stencil.ZPassFunc[face], face, n, stencil, mask );
+   }
+   else {
+      /*
+       * Perform depth buffering, then apply zpass or zfail stencil function.
+       */
+      GLubyte passMask[MAX_WIDTH], failMask[MAX_WIDTH], origMask[MAX_WIDTH];
+
+      /* save the current mask bits */
+      memcpy(origMask, mask, n * sizeof(GLubyte));
+
+      /* apply the depth test */
+      _swrast_depth_test_span(ctx, span);
+
+      compute_pass_fail_masks(n, origMask, mask, passMask, failMask);
+
+      /* apply the pass and fail operations */
+      if (ctx->Stencil.ZFailFunc[face] != GL_KEEP) {
+         apply_stencil_op( ctx, ctx->Stencil.ZFailFunc[face], face,
+                           n, stencil, failMask );
+      }
+      if (ctx->Stencil.ZPassFunc[face] != GL_KEEP) {
+         apply_stencil_op( ctx, ctx->Stencil.ZPassFunc[face], face,
+                           n, stencil, passMask );
+      }
+   }
+
+   /*
+    * Write updated stencil values back into hardware stencil buffer.
+    */
+   if (!rb->GetPointer(ctx, rb, 0, 0)) {
+      rb->PutRow(ctx, rb, n, x, y, stencil, NULL);
+   }
+   
+   span->writeAll = GL_FALSE;
+   
+   return GL_TRUE;  /* one or more fragments passed both tests */
+}
+
+
+
+/*
+ * Return the address of a stencil buffer value given the window coords:
+ */
+#define STENCIL_ADDRESS(X, Y)  (stencilStart + (Y) * stride + (X))
+
+
+
+/**
+ * Apply the given stencil operator for each pixel in the array whose
+ * mask flag is set.
+ * \note  This is for software stencil buffers only.
+ * Input:  n - number of pixels in the span
+ *         x, y - array of [n] pixels
+ *         operator - the stencil buffer operator
+ *         mask - array [n] of flag:  1=apply operator, 0=don't apply operator
+ */
+static void
+apply_stencil_op_to_pixels( struct gl_context *ctx,
+                            GLuint n, const GLint x[], const GLint y[],
+                            GLenum oper, GLuint face, const GLubyte mask[] )
+{
+   struct gl_framebuffer *fb = ctx->DrawBuffer;
+   struct gl_renderbuffer *rb = fb->_StencilBuffer;
+   const GLstencil stencilMax = (1 << fb->Visual.stencilBits) - 1;
+   const GLstencil ref = ctx->Stencil.Ref[face];
+   const GLstencil wrtmask = ctx->Stencil.WriteMask[face];
+   const GLstencil invmask = (GLstencil) (~wrtmask);
+   GLuint i;
+   GLstencil *stencilStart = (GLubyte *) rb->Data;
+   const GLuint stride = rb->Width;
+
+   ASSERT(rb->GetPointer(ctx, rb, 0, 0));
+   ASSERT(sizeof(GLstencil) == 1);
+
+   switch (oper) {
+      case GL_KEEP:
+         /* do nothing */
+         break;
+      case GL_ZERO:
+	 if (invmask==0) {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
+                  *sptr = 0;
+	       }
+	    }
+	 }
+	 else {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
+		  *sptr = (GLstencil) (invmask & *sptr);
+	       }
+	    }
+	 }
+	 break;
+      case GL_REPLACE:
+	 if (invmask==0) {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
+                  *sptr = ref;
+	       }
+	    }
+	 }
+	 else {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
+		  *sptr = (GLstencil) ((invmask & *sptr ) | (wrtmask & ref));
+	       }
+	    }
+	 }
+	 break;
+      case GL_INCR:
+	 if (invmask==0) {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
+		  if (*sptr < stencilMax) {
+		     *sptr = (GLstencil) (*sptr + 1);
+		  }
+	       }
+	    }
+	 }
+	 else {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
+		  if (*sptr < stencilMax) {
+		     *sptr = (GLstencil) ((invmask & *sptr) | (wrtmask & (*sptr+1)));
+		  }
+	       }
+	    }
+	 }
+	 break;
+      case GL_DECR:
+	 if (invmask==0) {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
+		  if (*sptr>0) {
+		     *sptr = (GLstencil) (*sptr - 1);
+		  }
+	       }
+	    }
+	 }
+	 else {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
+		  if (*sptr>0) {
+		     *sptr = (GLstencil) ((invmask & *sptr) | (wrtmask & (*sptr-1)));
+		  }
+	       }
+	    }
+	 }
+	 break;
+      case GL_INCR_WRAP_EXT:
+	 if (invmask==0) {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
+                  *sptr = (GLstencil) (*sptr + 1);
+	       }
+	    }
+	 }
+	 else {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
+                  *sptr = (GLstencil) ((invmask & *sptr) | (wrtmask & (*sptr+1)));
+	       }
+	    }
+	 }
+	 break;
+      case GL_DECR_WRAP_EXT:
+	 if (invmask==0) {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
+                  *sptr = (GLstencil) (*sptr - 1);
+	       }
+	    }
+	 }
+	 else {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
+                  *sptr = (GLstencil) ((invmask & *sptr) | (wrtmask & (*sptr-1)));
+	       }
+	    }
+	 }
+	 break;
+      case GL_INVERT:
+	 if (invmask==0) {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
+                  *sptr = (GLstencil) (~*sptr);
+	       }
+	    }
+	 }
+	 else {
+	    for (i=0;i<n;i++) {
+	       if (mask[i]) {
+                  GLstencil *sptr = STENCIL_ADDRESS( x[i], y[i] );
+                  *sptr = (GLstencil) ((invmask & *sptr) | (wrtmask & ~*sptr));
+	       }
+	    }
+	 }
+	 break;
+      default:
+         _mesa_problem(ctx, "Bad stencilop in apply_stencil_op_to_pixels");
+   }
+}
+
+
+
+/**
+ * Apply stencil test to an array of pixels before depth buffering.
+ *
+ * \note Used for software stencil buffer only.
+ * Input:  n - number of pixels in the span
+ *         x, y - array of [n] pixels to stencil
+ *         mask - array [n] of flag:  0=skip the pixel, 1=stencil the pixel
+ * Output:  mask - pixels which fail the stencil test will have their
+ *                 mask flag set to 0.
+ * \return  GL_FALSE = all pixels failed, GL_TRUE = zero or more pixels passed.
+ */
+static GLboolean
+stencil_test_pixels( struct gl_context *ctx, GLuint face, GLuint n,
+                     const GLint x[], const GLint y[], GLubyte mask[] )
+{
+   const struct gl_framebuffer *fb = ctx->DrawBuffer;
+   struct gl_renderbuffer *rb = fb->_StencilBuffer;
+   GLubyte fail[MAX_WIDTH];
+   GLstencil r, s;
+   GLuint i;
+   GLboolean allfail = GL_FALSE;
+   const GLuint valueMask = ctx->Stencil.ValueMask[face];
+   const GLstencil *stencilStart = (GLstencil *) rb->Data;
+   const GLuint stride = rb->Width;
+
+   ASSERT(rb->GetPointer(ctx, rb, 0, 0));
+   ASSERT(sizeof(GLstencil) == 1);
+
+   /*
+    * Perform stencil test.  The results of this operation are stored
+    * in the fail[] array:
+    *   IF fail[i] is non-zero THEN
+    *       the stencil fail operator is to be applied
+    *   ELSE
+    *       the stencil fail operator is not to be applied
+    *   ENDIF
+    */
+
+   switch (ctx->Stencil.Function[face]) {
+      case GL_NEVER:
+         /* always fail */
+         for (i=0;i<n;i++) {
+	    if (mask[i]) {
+	       mask[i] = 0;
+	       fail[i] = 1;
+	    }
+	    else {
+	       fail[i] = 0;
+	    }
+	 }
+	 allfail = GL_TRUE;
+	 break;
+      case GL_LESS:
+	 r = (GLstencil) (ctx->Stencil.Ref[face] & valueMask);
+	 for (i=0;i<n;i++) {
+	    if (mask[i]) {
+               const GLstencil *sptr = STENCIL_ADDRESS(x[i],y[i]);
+	       s = (GLstencil) (*sptr & valueMask);
+	       if (r < s) {
+		  /* passed */
+		  fail[i] = 0;
+	       }
+	       else {
+		  fail[i] = 1;
+		  mask[i] = 0;
+	       }
+	    }
+	    else {
+	       fail[i] = 0;
+	    }
+	 }
+	 break;
+      case GL_LEQUAL:
+	 r = (GLstencil) (ctx->Stencil.Ref[face] & valueMask);
+	 for (i=0;i<n;i++) {
+	    if (mask[i]) {
+               const GLstencil *sptr = STENCIL_ADDRESS(x[i],y[i]);
+	       s = (GLstencil) (*sptr & valueMask);
+	       if (r <= s) {
+		  /* pass */
+		  fail[i] = 0;
+	       }
+	       else {
+		  fail[i] = 1;
+		  mask[i] = 0;
+	       }
+	    }
+	    else {
+	       fail[i] = 0;
+	    }
+	 }
+	 break;
+      case GL_GREATER:
+	 r = (GLstencil) (ctx->Stencil.Ref[face] & valueMask);
+	 for (i=0;i<n;i++) {
+	    if (mask[i]) {
+               const GLstencil *sptr = STENCIL_ADDRESS(x[i],y[i]);
+	       s = (GLstencil) (*sptr & valueMask);
+	       if (r > s) {
+		  /* passed */
+		  fail[i] = 0;
+	       }
+	       else {
+		  fail[i] = 1;
+		  mask[i] = 0;
+	       }
+	    }
+	    else {
+	       fail[i] = 0;
+	    }
+	 }
+	 break;
+      case GL_GEQUAL:
+	 r = (GLstencil) (ctx->Stencil.Ref[face] & valueMask);
+	 for (i=0;i<n;i++) {
+	    if (mask[i]) {
+               const GLstencil *sptr = STENCIL_ADDRESS(x[i],y[i]);
+	       s = (GLstencil) (*sptr & valueMask);
+	       if (r >= s) {
+		  /* passed */
+		  fail[i] = 0;
+	       }
+	       else {
+		  fail[i] = 1;
+		  mask[i] = 0;
+	       }
+	    }
+	    else {
+	       fail[i] = 0;
+	    }
+	 }
+	 break;
+      case GL_EQUAL:
+	 r = (GLstencil) (ctx->Stencil.Ref[face] & valueMask);
+	 for (i=0;i<n;i++) {
+	    if (mask[i]) {
+               const GLstencil *sptr = STENCIL_ADDRESS(x[i],y[i]);
+	       s = (GLstencil) (*sptr & valueMask);
+	       if (r == s) {
+		  /* passed */
+		  fail[i] = 0;
+	       }
+	       else {
+		  fail[i] = 1;
+		  mask[i] = 0;
+	       }
+	    }
+	    else {
+	       fail[i] = 0;
+	    }
+	 }
+	 break;
+      case GL_NOTEQUAL:
+	 r = (GLstencil) (ctx->Stencil.Ref[face] & valueMask);
+	 for (i=0;i<n;i++) {
+	    if (mask[i]) {
+               const GLstencil *sptr = STENCIL_ADDRESS(x[i],y[i]);
+	       s = (GLstencil) (*sptr & valueMask);
+	       if (r != s) {
+		  /* passed */
+		  fail[i] = 0;
+	       }
+	       else {
+		  fail[i] = 1;
+		  mask[i] = 0;
+	       }
+	    }
+	    else {
+	       fail[i] = 0;
+	    }
+	 }
+	 break;
+      case GL_ALWAYS:
+	 /* always pass */
+	 for (i=0;i<n;i++) {
+	    fail[i] = 0;
+	 }
+	 break;
+      default:
+         _mesa_problem(ctx, "Bad stencil func in gl_stencil_pixels");
+         return 0;
+   }
+
+   if (ctx->Stencil.FailFunc[face] != GL_KEEP) {
+      apply_stencil_op_to_pixels( ctx, n, x, y, ctx->Stencil.FailFunc[face],
+                                  face, fail );
+   }
+
+   return !allfail;
+}
+
+
+
+
+/**
+ * Apply stencil and depth testing to an array of pixels.
+ * This is used both for software and hardware stencil buffers.
+ *
+ * The comments in this function are a bit sparse but the code is
+ * almost identical to stencil_and_ztest_span(), which is well
+ * commented.
+ *
+ * Input:  n - number of pixels in the array
+ *         x, y - array of [n] pixel positions
+ *         z - array [n] of z values
+ *         mask - array [n] of flags  (1=test this pixel, 0=skip the pixel)
+ * Output: mask - array [n] of flags (1=stencil and depth test passed)
+ * Return: GL_FALSE - all fragments failed the testing
+ *         GL_TRUE - one or more fragments passed the testing
+ */
+static GLboolean
+stencil_and_ztest_pixels( struct gl_context *ctx, SWspan *span, GLuint face )
+{
+   GLubyte passMask[MAX_WIDTH], failMask[MAX_WIDTH], origMask[MAX_WIDTH];
+   struct gl_framebuffer *fb = ctx->DrawBuffer;
+   struct gl_renderbuffer *rb = fb->_StencilBuffer;
+   const GLuint n = span->end;
+   const GLint *x = span->array->x;
+   const GLint *y = span->array->y;
+   GLubyte *mask = span->array->mask;
+
+   ASSERT(span->arrayMask & SPAN_XY);
+   ASSERT(ctx->Stencil.Enabled);
+   ASSERT(n <= MAX_WIDTH);
+
+   if (!rb->GetPointer(ctx, rb, 0, 0)) {
+      /* No direct access */
+      GLstencil stencil[MAX_WIDTH];
+
+      ASSERT(rb->DataType == GL_UNSIGNED_BYTE);
+      _swrast_get_values(ctx, rb, n, x, y, stencil, sizeof(GLubyte));
+
+      memcpy(origMask, mask, n * sizeof(GLubyte));          
+
+      (void) do_stencil_test(ctx, face, n, stencil, mask);
+
+      if (ctx->Depth.Test == GL_FALSE) {
+         apply_stencil_op(ctx, ctx->Stencil.ZPassFunc[face], face,
+                          n, stencil, mask);
+      }
+      else {
+         GLubyte tmpMask[MAX_WIDTH]; 
+         memcpy(tmpMask, mask, n * sizeof(GLubyte));
+
+         _swrast_depth_test_span(ctx, span);
+
+         compute_pass_fail_masks(n, tmpMask, mask, passMask, failMask);
+
+         if (ctx->Stencil.ZFailFunc[face] != GL_KEEP) {
+            apply_stencil_op(ctx, ctx->Stencil.ZFailFunc[face], face,
+                             n, stencil, failMask);
+         }
+         if (ctx->Stencil.ZPassFunc[face] != GL_KEEP) {
+            apply_stencil_op(ctx, ctx->Stencil.ZPassFunc[face], face,
+                             n, stencil, passMask);
+         }
+      }
+
+      /* Write updated stencil values into hardware stencil buffer */
+      rb->PutValues(ctx, rb, n, x, y, stencil, origMask);
+
+      return GL_TRUE;
+   }
+   else {
+      /* Direct access to stencil buffer */
+
+      if (stencil_test_pixels(ctx, face, n, x, y, mask) == GL_FALSE) {
+         /* all fragments failed the stencil test, we're done. */
+         return GL_FALSE;
+      }
+
+      if (ctx->Depth.Test==GL_FALSE) {
+         apply_stencil_op_to_pixels(ctx, n, x, y,
+                                    ctx->Stencil.ZPassFunc[face], face, mask);
+      }
+      else {
+         memcpy(origMask, mask, n * sizeof(GLubyte));
+
+         _swrast_depth_test_span(ctx, span);
+
+         compute_pass_fail_masks(n, origMask, mask, passMask, failMask);
+
+         if (ctx->Stencil.ZFailFunc[face] != GL_KEEP) {
+            apply_stencil_op_to_pixels(ctx, n, x, y,
+                                       ctx->Stencil.ZFailFunc[face],
+                                       face, failMask);
+         }
+         if (ctx->Stencil.ZPassFunc[face] != GL_KEEP) {
+            apply_stencil_op_to_pixels(ctx, n, x, y,
+                                       ctx->Stencil.ZPassFunc[face],
+                                       face, passMask);
+         }
+      }
+
+      return GL_TRUE;  /* one or more fragments passed both tests */
+   }
+}
+
+
+/**
+ * /return GL_TRUE = one or more fragments passed,
+ * GL_FALSE = all fragments failed.
+ */
+GLboolean
+_swrast_stencil_and_ztest_span(struct gl_context *ctx, SWspan *span)
+{
+   const GLuint face = (span->facing == 0) ? 0 : ctx->Stencil._BackFace;
+
+   if (span->arrayMask & SPAN_XY)
+      return stencil_and_ztest_pixels(ctx, span, face);
+   else
+      return stencil_and_ztest_span(ctx, span, face);
+}
+
+
+#if 0
+GLuint
+clip_span(GLuint bufferWidth, GLuint bufferHeight,
+          GLint x, GLint y, GLuint *count)
+{
+   GLuint n = *count;
+   GLuint skipPixels = 0;
+
+   if (y < 0 || y >= bufferHeight || x + n <= 0 || x >= bufferWidth) {
+      /* totally out of bounds */
+      n = 0;
+   }
+   else {
+      /* left clip */
+      if (x < 0) {
+         skipPixels = -x;
+         x = 0;
+         n -= skipPixels;
+      }
+      /* right clip */
+      if (x + n > bufferWidth) {
+         GLint dx = x + n - bufferWidth;
+         n -= dx;
+      }
+   }
+
+   *count = n;
+
+   return skipPixels;
+}
+#endif
+
+
+/**
+ * Return a span of stencil values from the stencil buffer.
+ * Used for glRead/CopyPixels
+ * Input:  n - how many pixels
+ *         x,y - location of first pixel
+ * Output:  stencil - the array of stencil values
+ */
+void
+_swrast_read_stencil_span(struct gl_context *ctx, struct gl_renderbuffer *rb,
+                          GLint n, GLint x, GLint y, GLstencil stencil[])
+{
+   if (y < 0 || y >= (GLint) rb->Height ||
+       x + n <= 0 || x >= (GLint) rb->Width) {
+      /* span is completely outside framebuffer */
+      return; /* undefined values OK */
+   }
+
+   if (x < 0) {
+      GLint dx = -x;
+      x = 0;
+      n -= dx;
+      stencil += dx;
+   }
+   if (x + n > (GLint) rb->Width) {
+      GLint dx = x + n - rb->Width;
+      n -= dx;
+   }
+   if (n <= 0) {
+      return;
+   }
+
+   rb->GetRow(ctx, rb, n, x, y, stencil);
+}
+
+
+
+/**
+ * Write a span of stencil values to the stencil buffer.  This function
+ * applies the stencil write mask when needed.
+ * Used for glDraw/CopyPixels
+ * Input:  n - how many pixels
+ *         x, y - location of first pixel
+ *         stencil - the array of stencil values
+ */
+void
+_swrast_write_stencil_span(struct gl_context *ctx, GLint n, GLint x, GLint y,
+                           const GLstencil stencil[] )
+{
+   struct gl_framebuffer *fb = ctx->DrawBuffer;
+   struct gl_renderbuffer *rb = fb->_StencilBuffer;
+   const GLuint stencilMax = (1 << fb->Visual.stencilBits) - 1;
+   const GLuint stencilMask = ctx->Stencil.WriteMask[0];
+
+   if (y < 0 || y >= (GLint) rb->Height ||
+       x + n <= 0 || x >= (GLint) rb->Width) {
+      /* span is completely outside framebuffer */
+      return; /* undefined values OK */
+   }
+   if (x < 0) {
+      GLint dx = -x;
+      x = 0;
+      n -= dx;
+      stencil += dx;
+   }
+   if (x + n > (GLint) rb->Width) {
+      GLint dx = x + n - rb->Width;
+      n -= dx;
+   }
+   if (n <= 0) {
+      return;
+   }
+
+   if ((stencilMask & stencilMax) != stencilMax) {
+      /* need to apply writemask */
+      GLstencil destVals[MAX_WIDTH], newVals[MAX_WIDTH];
+      GLint i;
+      rb->GetRow(ctx, rb, n, x, y, destVals);
+      for (i = 0; i < n; i++) {
+         newVals[i]
+            = (stencil[i] & stencilMask) | (destVals[i] & ~stencilMask);
+      }
+      rb->PutRow(ctx, rb, n, x, y, newVals, NULL);
+   }
+   else {
+      rb->PutRow(ctx, rb, n, x, y, stencil, NULL);
+   }
+}
+
+
+
+/**
+ * Clear the stencil buffer.
+ */
+void
+_swrast_clear_stencil_buffer( struct gl_context *ctx, struct gl_renderbuffer *rb )
+{
+   const GLubyte stencilBits = ctx->DrawBuffer->Visual.stencilBits;
+   const GLuint mask = ctx->Stencil.WriteMask[0];
+   const GLuint invMask = ~mask;
+   const GLuint clearVal = (ctx->Stencil.Clear & mask);
+   const GLuint stencilMax = (1 << stencilBits) - 1;
+   GLint x, y, width, height;
+
+   if (!rb || mask == 0)
+      return;
+
+   ASSERT(rb->DataType == GL_UNSIGNED_BYTE ||
+          rb->DataType == GL_UNSIGNED_SHORT);
+
+   ASSERT(rb->_BaseFormat == GL_STENCIL_INDEX);
+
+   /* compute region to clear */
+   x = ctx->DrawBuffer->_Xmin;
+   y = ctx->DrawBuffer->_Ymin;
+   width  = ctx->DrawBuffer->_Xmax - ctx->DrawBuffer->_Xmin;
+   height = ctx->DrawBuffer->_Ymax - ctx->DrawBuffer->_Ymin;
+
+   if (rb->GetPointer(ctx, rb, 0, 0)) {
+      /* Direct buffer access */
+      if ((mask & stencilMax) != stencilMax) {
+         /* need to mask the clear */
+         if (rb->DataType == GL_UNSIGNED_BYTE) {
+            GLint i, j;
+            for (i = 0; i < height; i++) {
+               GLubyte *stencil = (GLubyte*) rb->GetPointer(ctx, rb, x, y + i);
+               for (j = 0; j < width; j++) {
+                  stencil[j] = (stencil[j] & invMask) | clearVal;
+               }
+            }
+         }
+         else {
+            GLint i, j;
+            for (i = 0; i < height; i++) {
+               GLushort *stencil = (GLushort*) rb->GetPointer(ctx, rb, x, y + i);
+               for (j = 0; j < width; j++) {
+                  stencil[j] = (stencil[j] & invMask) | clearVal;
+               }
+            }
+         }
+      }
+      else {
+         /* no bit masking */
+         if (width == (GLint) rb->Width && rb->DataType == GL_UNSIGNED_BYTE) {
+            /* optimized case */
+            /* Note: bottom-to-top raster assumed! */
+            GLubyte *stencil = (GLubyte *) rb->GetPointer(ctx, rb, x, y);
+            GLuint len = width * height * sizeof(GLubyte);
+            memset(stencil, clearVal, len);
+         }
+         else {
+            /* general case */
+            GLint i;
+            for (i = 0; i < height; i++) {
+               GLvoid *stencil = rb->GetPointer(ctx, rb, x, y + i);
+               if (rb->DataType == GL_UNSIGNED_BYTE) {
+                  memset(stencil, clearVal, width);
+               }
+               else {
+                  _mesa_memset16((short unsigned int*) stencil, clearVal, width);
+               }
+            }
+         }
+      }
+   }
+   else {
+      /* no direct access */
+      if ((mask & stencilMax) != stencilMax) {
+         /* need to mask the clear */
+         if (rb->DataType == GL_UNSIGNED_BYTE) {
+            GLint i, j;
+            for (i = 0; i < height; i++) {
+               GLubyte stencil[MAX_WIDTH];
+               rb->GetRow(ctx, rb, width, x, y + i, stencil);
+               for (j = 0; j < width; j++) {
+                  stencil[j] = (stencil[j] & invMask) | clearVal;
+               }
+               rb->PutRow(ctx, rb, width, x, y + i, stencil, NULL);
+            }
+         }
+         else {
+            GLint i, j;
+            for (i = 0; i < height; i++) {
+               GLushort stencil[MAX_WIDTH];
+               rb->GetRow(ctx, rb, width, x, y + i, stencil);
+               for (j = 0; j < width; j++) {
+                  stencil[j] = (stencil[j] & invMask) | clearVal;
+               }
+               rb->PutRow(ctx, rb, width, x, y + i, stencil, NULL);
+            }
+         }
+      }
+      else {
+         /* no bit masking */
+         const GLubyte clear8 = (GLubyte) clearVal;
+         const GLushort clear16 = (GLushort) clearVal;
+         const void *clear;
+         GLint i;
+         if (rb->DataType == GL_UNSIGNED_BYTE) {
+            clear = &clear8;
+         }
+         else {
+            clear = &clear16;
+         }
+         for (i = 0; i < height; i++) {
+            rb->PutMonoRow(ctx, rb, width, x, y + i, clear, NULL);
+         }
+      }
+   }
+}
diff --git a/mesalib/src/mesa/swrast/s_texcombine.c b/mesalib/src/mesa/swrast/s_texcombine.c
index 53ef2f890..80b9dff3c 100644
--- a/mesalib/src/mesa/swrast/s_texcombine.c
+++ b/mesalib/src/mesa/swrast/s_texcombine.c
@@ -1,751 +1,755 @@
-/*
- * Mesa 3-D graphics library
- * Version:  7.5
- *
- * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
- * Copyright (C) 2009  VMware, Inc.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-
-#include "main/glheader.h"
-#include "main/context.h"
-#include "main/colormac.h"
-#include "main/imports.h"
-#include "main/pixeltransfer.h"
-#include "program/prog_instruction.h"
-
-#include "s_context.h"
-#include "s_texcombine.h"
-
-
-/**
- * Pointer to array of float[4]
- * This type makes the code below more concise and avoids a lot of casting.
- */
-typedef float (*float4_array)[4];
-
-
-/**
- * Return array of texels for given unit.
- */
-static INLINE float4_array
-get_texel_array(SWcontext *swrast, GLuint unit)
-{
-   return (float4_array) (swrast->TexelBuffer + unit * MAX_WIDTH * 4);
-}
-
-
-
-/**
- * Do texture application for:
- *  GL_EXT_texture_env_combine
- *  GL_ARB_texture_env_combine
- *  GL_EXT_texture_env_dot3
- *  GL_ARB_texture_env_dot3
- *  GL_ATI_texture_env_combine3
- *  GL_NV_texture_env_combine4
- *  conventional GL texture env modes
- *
- * \param ctx          rendering context
- * \param unit         the texture combiner unit
- * \param n            number of fragments to process (span width)
- * \param primary_rgba incoming fragment color array
- * \param texelBuffer  pointer to texel colors for all texture units
- * 
- * \param rgba         incoming/result fragment colors
- */
-static void
-texture_combine( struct gl_context *ctx, GLuint unit, GLuint n,
-                 const float4_array primary_rgba,
-                 const GLfloat *texelBuffer,
-                 GLchan (*rgbaChan)[4] )
-{
-   SWcontext *swrast = SWRAST_CONTEXT(ctx);
-   const struct gl_texture_unit *textureUnit = &(ctx->Texture.Unit[unit]);
-   const struct gl_tex_env_combine_state *combine = textureUnit->_CurrentCombine;
-   float4_array argRGB[MAX_COMBINER_TERMS];
-   float4_array argA[MAX_COMBINER_TERMS];
-   const GLfloat scaleRGB = (GLfloat) (1 << combine->ScaleShiftRGB);
-   const GLfloat scaleA = (GLfloat) (1 << combine->ScaleShiftA);
-   const GLuint numArgsRGB = combine->_NumArgsRGB;
-   const GLuint numArgsA = combine->_NumArgsA;
-   float4_array ccolor[4], rgba;
-   GLuint i, term;
-
-   /* alloc temp pixel buffers */
-   rgba = (float4_array) malloc(4 * n * sizeof(GLfloat));
-   if (!rgba) {
-      _mesa_error(ctx, GL_OUT_OF_MEMORY, "texture_combine");
-      return;
-   }
-
-   for (i = 0; i < numArgsRGB || i < numArgsA; i++) {
-      ccolor[i] = (float4_array) malloc(4 * n * sizeof(GLfloat));
-      if (!ccolor[i]) {
-         while (i) {
-            free(ccolor[i]);
-            i--;
-         }
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "texture_combine");
-         return;
-      }
-   }
-
-   for (i = 0; i < n; i++) {
-      rgba[i][RCOMP] = CHAN_TO_FLOAT(rgbaChan[i][RCOMP]);
-      rgba[i][GCOMP] = CHAN_TO_FLOAT(rgbaChan[i][GCOMP]);
-      rgba[i][BCOMP] = CHAN_TO_FLOAT(rgbaChan[i][BCOMP]);
-      rgba[i][ACOMP] = CHAN_TO_FLOAT(rgbaChan[i][ACOMP]);
-   }
-
-   /*
-   printf("modeRGB 0x%x  modeA 0x%x  srcRGB1 0x%x  srcA1 0x%x  srcRGB2 0x%x  srcA2 0x%x\n",
-          combine->ModeRGB,
-          combine->ModeA,
-          combine->SourceRGB[0],
-          combine->SourceA[0],
-          combine->SourceRGB[1],
-          combine->SourceA[1]);
-   */
-
-   /*
-    * Do operand setup for up to 4 operands.  Loop over the terms.
-    */
-   for (term = 0; term < numArgsRGB; term++) {
-      const GLenum srcRGB = combine->SourceRGB[term];
-      const GLenum operandRGB = combine->OperandRGB[term];
-
-      switch (srcRGB) {
-         case GL_TEXTURE:
-            argRGB[term] = get_texel_array(swrast, unit);
-            break;
-         case GL_PRIMARY_COLOR:
-            argRGB[term] = primary_rgba;
-            break;
-         case GL_PREVIOUS:
-            argRGB[term] = rgba;
-            break;
-         case GL_CONSTANT:
-            {
-               float4_array c = ccolor[term];
-               GLfloat red   = textureUnit->EnvColor[0];
-               GLfloat green = textureUnit->EnvColor[1];
-               GLfloat blue  = textureUnit->EnvColor[2];
-               GLfloat alpha = textureUnit->EnvColor[3];
-               for (i = 0; i < n; i++) {
-                  ASSIGN_4V(c[i], red, green, blue, alpha);
-               }
-               argRGB[term] = ccolor[term];
-            }
-            break;
-	 /* GL_ATI_texture_env_combine3 allows GL_ZERO & GL_ONE as sources.
-	  */
-	 case GL_ZERO:
-            {
-               float4_array c = ccolor[term];
-               for (i = 0; i < n; i++) {
-                  ASSIGN_4V(c[i], 0.0F, 0.0F, 0.0F, 0.0F);
-               }
-               argRGB[term] = ccolor[term];
-            }
-            break;
-	 case GL_ONE:
-            {
-               float4_array c = ccolor[term];
-               for (i = 0; i < n; i++) {
-                  ASSIGN_4V(c[i], 1.0F, 1.0F, 1.0F, 1.0F);
-               }
-               argRGB[term] = ccolor[term];
-            }
-            break;
-         default:
-            /* ARB_texture_env_crossbar source */
-            {
-               const GLuint srcUnit = srcRGB - GL_TEXTURE0;
-               ASSERT(srcUnit < ctx->Const.MaxTextureUnits);
-               if (!ctx->Texture.Unit[srcUnit]._ReallyEnabled)
-                  goto end;
-               argRGB[term] = get_texel_array(swrast, srcUnit);
-            }
-      }
-
-      if (operandRGB != GL_SRC_COLOR) {
-         float4_array src = argRGB[term];
-         float4_array dst = ccolor[term];
-
-         /* point to new arg[term] storage */
-         argRGB[term] = ccolor[term];
-
-         switch (operandRGB) {
-         case GL_ONE_MINUS_SRC_COLOR:
-            for (i = 0; i < n; i++) {
-               dst[i][RCOMP] = 1.0F - src[i][RCOMP];
-               dst[i][GCOMP] = 1.0F - src[i][GCOMP];
-               dst[i][BCOMP] = 1.0F - src[i][BCOMP];
-            }
-            break;
-         case GL_SRC_ALPHA:
-            for (i = 0; i < n; i++) {
-               dst[i][RCOMP] =
-               dst[i][GCOMP] =
-               dst[i][BCOMP] = src[i][ACOMP];
-            }
-            break;
-         case GL_ONE_MINUS_SRC_ALPHA:
-            for (i = 0; i < n; i++) {
-               dst[i][RCOMP] =
-               dst[i][GCOMP] =
-               dst[i][BCOMP] = 1.0F - src[i][ACOMP];
-            }
-            break;
-         default:
-            _mesa_problem(ctx, "Bad operandRGB");
-         }
-      }
-   }
-
-   /*
-    * Set up the argA[term] pointers
-    */
-   for (term = 0; term < numArgsA; term++) {
-      const GLenum srcA = combine->SourceA[term];
-      const GLenum operandA = combine->OperandA[term];
-
-      switch (srcA) {
-         case GL_TEXTURE:
-            argA[term] = get_texel_array(swrast, unit);
-            break;
-         case GL_PRIMARY_COLOR:
-            argA[term] = primary_rgba;
-            break;
-         case GL_PREVIOUS:
-            argA[term] = rgba;
-            break;
-         case GL_CONSTANT:
-            {
-               float4_array c = ccolor[term];
-               GLfloat alpha = textureUnit->EnvColor[3];
-               for (i = 0; i < n; i++)
-                  c[i][ACOMP] = alpha;
-               argA[term] = ccolor[term];
-            }
-            break;
-	 /* GL_ATI_texture_env_combine3 allows GL_ZERO & GL_ONE as sources.
-	  */
-	 case GL_ZERO:
-            {
-               float4_array c = ccolor[term];
-               for (i = 0; i < n; i++)
-                  c[i][ACOMP] = 0.0F;
-               argA[term] = ccolor[term];
-            }
-            break;
-	 case GL_ONE:
-            {
-               float4_array c = ccolor[term];
-               for (i = 0; i < n; i++)
-                  c[i][ACOMP] = 1.0F;
-               argA[term] = ccolor[term];
-            }
-            break;
-         default:
-            /* ARB_texture_env_crossbar source */
-            {
-               const GLuint srcUnit = srcA - GL_TEXTURE0;
-               ASSERT(srcUnit < ctx->Const.MaxTextureUnits);
-               if (!ctx->Texture.Unit[srcUnit]._ReallyEnabled)
-                  goto end;
-               argA[term] = get_texel_array(swrast, srcUnit);
-            }
-      }
-
-      if (operandA == GL_ONE_MINUS_SRC_ALPHA) {
-         float4_array src = argA[term];
-         float4_array dst = ccolor[term];
-         argA[term] = ccolor[term];
-         for (i = 0; i < n; i++) {
-            dst[i][ACOMP] = 1.0F - src[i][ACOMP];
-         }
-      }
-   }
-
-   /* RGB channel combine */
-   {
-      float4_array arg0 = argRGB[0];
-      float4_array arg1 = argRGB[1];
-      float4_array arg2 = argRGB[2];
-      float4_array arg3 = argRGB[3];
-
-      switch (combine->ModeRGB) {
-      case GL_REPLACE:
-         for (i = 0; i < n; i++) {
-            rgba[i][RCOMP] = arg0[i][RCOMP] * scaleRGB;
-            rgba[i][GCOMP] = arg0[i][GCOMP] * scaleRGB;
-            rgba[i][BCOMP] = arg0[i][BCOMP] * scaleRGB;
-         }
-         break;
-      case GL_MODULATE:
-         for (i = 0; i < n; i++) {
-            rgba[i][RCOMP] = arg0[i][RCOMP] * arg1[i][RCOMP] * scaleRGB;
-            rgba[i][GCOMP] = arg0[i][GCOMP] * arg1[i][GCOMP] * scaleRGB;
-            rgba[i][BCOMP] = arg0[i][BCOMP] * arg1[i][BCOMP] * scaleRGB;
-         }
-         break;
-      case GL_ADD:
-         if (textureUnit->EnvMode == GL_COMBINE4_NV) {
-            /* (a * b) + (c * d) */
-            for (i = 0; i < n; i++) {
-               rgba[i][RCOMP] = (arg0[i][RCOMP] * arg1[i][RCOMP] +
-                                 arg2[i][RCOMP] * arg3[i][RCOMP]) * scaleRGB;
-               rgba[i][GCOMP] = (arg0[i][GCOMP] * arg1[i][GCOMP] +
-                                 arg2[i][GCOMP] * arg3[i][GCOMP]) * scaleRGB;
-               rgba[i][BCOMP] = (arg0[i][BCOMP] * arg1[i][BCOMP] +
-                                 arg2[i][BCOMP] * arg3[i][BCOMP]) * scaleRGB;
-            }
-         }
-         else {
-            /* 2-term addition */
-            for (i = 0; i < n; i++) {
-               rgba[i][RCOMP] = (arg0[i][RCOMP] + arg1[i][RCOMP]) * scaleRGB;
-               rgba[i][GCOMP] = (arg0[i][GCOMP] + arg1[i][GCOMP]) * scaleRGB;
-               rgba[i][BCOMP] = (arg0[i][BCOMP] + arg1[i][BCOMP]) * scaleRGB;
-            }
-         }
-         break;
-      case GL_ADD_SIGNED:
-         if (textureUnit->EnvMode == GL_COMBINE4_NV) {
-            /* (a * b) + (c * d) - 0.5 */
-            for (i = 0; i < n; i++) {
-               rgba[i][RCOMP] = (arg0[i][RCOMP] * arg1[i][RCOMP] +
-                                 arg2[i][RCOMP] * arg3[i][RCOMP] - 0.5F) * scaleRGB;
-               rgba[i][GCOMP] = (arg0[i][GCOMP] * arg1[i][GCOMP] +
-                                 arg2[i][GCOMP] * arg3[i][GCOMP] - 0.5F) * scaleRGB;
-               rgba[i][BCOMP] = (arg0[i][BCOMP] * arg1[i][BCOMP] +
-                                 arg2[i][BCOMP] * arg3[i][BCOMP] - 0.5F) * scaleRGB;
-            }
-         }
-         else {
-            for (i = 0; i < n; i++) {
-               rgba[i][RCOMP] = (arg0[i][RCOMP] + arg1[i][RCOMP] - 0.5F) * scaleRGB;
-               rgba[i][GCOMP] = (arg0[i][GCOMP] + arg1[i][GCOMP] - 0.5F) * scaleRGB;
-               rgba[i][BCOMP] = (arg0[i][BCOMP] + arg1[i][BCOMP] - 0.5F) * scaleRGB;
-            }
-         }
-         break;
-      case GL_INTERPOLATE:
-         for (i = 0; i < n; i++) {
-            rgba[i][RCOMP] = (arg0[i][RCOMP] * arg2[i][RCOMP] +
-                          arg1[i][RCOMP] * (1.0F - arg2[i][RCOMP])) * scaleRGB;
-            rgba[i][GCOMP] = (arg0[i][GCOMP] * arg2[i][GCOMP] +
-                          arg1[i][GCOMP] * (1.0F - arg2[i][GCOMP])) * scaleRGB;
-            rgba[i][BCOMP] = (arg0[i][BCOMP] * arg2[i][BCOMP] +
-                          arg1[i][BCOMP] * (1.0F - arg2[i][BCOMP])) * scaleRGB;
-         }
-         break;
-      case GL_SUBTRACT:
-         for (i = 0; i < n; i++) {
-            rgba[i][RCOMP] = (arg0[i][RCOMP] - arg1[i][RCOMP]) * scaleRGB;
-            rgba[i][GCOMP] = (arg0[i][GCOMP] - arg1[i][GCOMP]) * scaleRGB;
-            rgba[i][BCOMP] = (arg0[i][BCOMP] - arg1[i][BCOMP]) * scaleRGB;
-         }
-         break;
-      case GL_DOT3_RGB_EXT:
-      case GL_DOT3_RGBA_EXT:
-         /* Do not scale the result by 1 2 or 4 */
-         for (i = 0; i < n; i++) {
-            GLfloat dot = ((arg0[i][RCOMP] - 0.5F) * (arg1[i][RCOMP] - 0.5F) +
-                           (arg0[i][GCOMP] - 0.5F) * (arg1[i][GCOMP] - 0.5F) +
-                           (arg0[i][BCOMP] - 0.5F) * (arg1[i][BCOMP] - 0.5F))
-               * 4.0F;
-            dot = CLAMP(dot, 0.0F, 1.0F);
-            rgba[i][RCOMP] = rgba[i][GCOMP] = rgba[i][BCOMP] = dot;
-         }
-         break;
-      case GL_DOT3_RGB:
-      case GL_DOT3_RGBA:
-         /* DO scale the result by 1 2 or 4 */
-         for (i = 0; i < n; i++) {
-            GLfloat dot = ((arg0[i][RCOMP] - 0.5F) * (arg1[i][RCOMP] - 0.5F) +
-                           (arg0[i][GCOMP] - 0.5F) * (arg1[i][GCOMP] - 0.5F) +
-                           (arg0[i][BCOMP] - 0.5F) * (arg1[i][BCOMP] - 0.5F))
-               * 4.0F * scaleRGB;
-            dot = CLAMP(dot, 0.0F, 1.0F);
-            rgba[i][RCOMP] = rgba[i][GCOMP] = rgba[i][BCOMP] = dot;
-         }
-         break;
-      case GL_MODULATE_ADD_ATI:
-         for (i = 0; i < n; i++) {
-            rgba[i][RCOMP] = ((arg0[i][RCOMP] * arg2[i][RCOMP]) +
-                              arg1[i][RCOMP]) * scaleRGB;
-            rgba[i][GCOMP] = ((arg0[i][GCOMP] * arg2[i][GCOMP]) +
-                              arg1[i][GCOMP]) * scaleRGB;
-            rgba[i][BCOMP] = ((arg0[i][BCOMP] * arg2[i][BCOMP]) +
-                              arg1[i][BCOMP]) * scaleRGB;
-	 }
-         break;
-      case GL_MODULATE_SIGNED_ADD_ATI:
-         for (i = 0; i < n; i++) {
-            rgba[i][RCOMP] = ((arg0[i][RCOMP] * arg2[i][RCOMP]) +
-                              arg1[i][RCOMP] - 0.5F) * scaleRGB;
-            rgba[i][GCOMP] = ((arg0[i][GCOMP] * arg2[i][GCOMP]) +
-                              arg1[i][GCOMP] - 0.5F) * scaleRGB;
-            rgba[i][BCOMP] = ((arg0[i][BCOMP] * arg2[i][BCOMP]) +
-                              arg1[i][BCOMP] - 0.5F) * scaleRGB;
-	 }
-         break;
-      case GL_MODULATE_SUBTRACT_ATI:
-         for (i = 0; i < n; i++) {
-            rgba[i][RCOMP] = ((arg0[i][RCOMP] * arg2[i][RCOMP]) -
-                              arg1[i][RCOMP]) * scaleRGB;
-            rgba[i][GCOMP] = ((arg0[i][GCOMP] * arg2[i][GCOMP]) -
-                              arg1[i][GCOMP]) * scaleRGB;
-            rgba[i][BCOMP] = ((arg0[i][BCOMP] * arg2[i][BCOMP]) -
-                              arg1[i][BCOMP]) * scaleRGB;
-	 }
-         break;
-      case GL_BUMP_ENVMAP_ATI:
-         /* this produces a fixed rgba color, and the coord calc is done elsewhere */
-         for (i = 0; i < n; i++) {
-            /* rgba result is 0,0,0,1 */
-            rgba[i][RCOMP] = 0.0;
-            rgba[i][GCOMP] = 0.0;
-            rgba[i][BCOMP] = 0.0;
-            rgba[i][ACOMP] = 1.0;
-	 }
-         goto end; /* no alpha processing */
-      default:
-         _mesa_problem(ctx, "invalid combine mode");
-      }
-   }
-
-   /* Alpha channel combine */
-   {
-      float4_array arg0 = argA[0];
-      float4_array arg1 = argA[1];
-      float4_array arg2 = argA[2];
-      float4_array arg3 = argA[3];
-
-      switch (combine->ModeA) {
-      case GL_REPLACE:
-         for (i = 0; i < n; i++) {
-            rgba[i][ACOMP] = arg0[i][ACOMP] * scaleA;
-         }
-         break;
-      case GL_MODULATE:
-         for (i = 0; i < n; i++) {
-            rgba[i][ACOMP] = arg0[i][ACOMP] * arg1[i][ACOMP] * scaleA;
-         }
-         break;
-      case GL_ADD:
-         if (textureUnit->EnvMode == GL_COMBINE4_NV) {
-            /* (a * b) + (c * d) */
-            for (i = 0; i < n; i++) {
-               rgba[i][ACOMP] = (arg0[i][ACOMP] * arg1[i][ACOMP] +
-                                 arg2[i][ACOMP] * arg3[i][ACOMP]) * scaleA;
-            }
-         }
-         else {
-            /* two-term add */
-            for (i = 0; i < n; i++) {
-               rgba[i][ACOMP] = (arg0[i][ACOMP] + arg1[i][ACOMP]) * scaleA;
-            }
-         }
-         break;
-      case GL_ADD_SIGNED:
-         if (textureUnit->EnvMode == GL_COMBINE4_NV) {
-            /* (a * b) + (c * d) - 0.5 */
-            for (i = 0; i < n; i++) {
-               rgba[i][ACOMP] = (arg0[i][ACOMP] * arg1[i][ACOMP] +
-                                 arg2[i][ACOMP] * arg3[i][ACOMP] -
-                                 0.5F) * scaleA;
-            }
-         }
-         else {
-            /* a + b - 0.5 */
-            for (i = 0; i < n; i++) {
-               rgba[i][ACOMP] = (arg0[i][ACOMP] + arg1[i][ACOMP] - 0.5F) * scaleA;
-            }
-         }
-         break;
-      case GL_INTERPOLATE:
-         for (i = 0; i < n; i++) {
-            rgba[i][ACOMP] = (arg0[i][ACOMP] * arg2[i][ACOMP] +
-                              arg1[i][ACOMP] * (1.0F - arg2[i][ACOMP]))
-               * scaleA;
-         }
-         break;
-      case GL_SUBTRACT:
-         for (i = 0; i < n; i++) {
-            rgba[i][ACOMP] = (arg0[i][ACOMP] - arg1[i][ACOMP]) * scaleA;
-         }
-         break;
-      case GL_MODULATE_ADD_ATI:
-         for (i = 0; i < n; i++) {
-            rgba[i][ACOMP] = ((arg0[i][ACOMP] * arg2[i][ACOMP])
-                              + arg1[i][ACOMP]) * scaleA;
-         }
-         break;
-      case GL_MODULATE_SIGNED_ADD_ATI:
-         for (i = 0; i < n; i++) {
-            rgba[i][ACOMP] = ((arg0[i][ACOMP] * arg2[i][ACOMP]) +
-                              arg1[i][ACOMP] - 0.5F) * scaleA;
-         }
-         break;
-      case GL_MODULATE_SUBTRACT_ATI:
-         for (i = 0; i < n; i++) {
-            rgba[i][ACOMP] = ((arg0[i][ACOMP] * arg2[i][ACOMP])
-                              - arg1[i][ACOMP]) * scaleA;
-         }
-         break;
-      default:
-         _mesa_problem(ctx, "invalid combine mode");
-      }
-   }
-
-   /* Fix the alpha component for GL_DOT3_RGBA_EXT/ARB combining.
-    * This is kind of a kludge.  It would have been better if the spec
-    * were written such that the GL_COMBINE_ALPHA value could be set to
-    * GL_DOT3.
-    */
-   if (combine->ModeRGB == GL_DOT3_RGBA_EXT ||
-       combine->ModeRGB == GL_DOT3_RGBA) {
-      for (i = 0; i < n; i++) {
-	 rgba[i][ACOMP] = rgba[i][RCOMP];
-      }
-   }
-
-   for (i = 0; i < n; i++) {
-      UNCLAMPED_FLOAT_TO_CHAN(rgbaChan[i][RCOMP], rgba[i][RCOMP]);
-      UNCLAMPED_FLOAT_TO_CHAN(rgbaChan[i][GCOMP], rgba[i][GCOMP]);
-      UNCLAMPED_FLOAT_TO_CHAN(rgbaChan[i][BCOMP], rgba[i][BCOMP]);
-      UNCLAMPED_FLOAT_TO_CHAN(rgbaChan[i][ACOMP], rgba[i][ACOMP]);
-   }
-
-end:
-   for (i = 0; i < numArgsRGB || i < numArgsA; i++) {
-      free(ccolor[i]);
-   }
-   free(rgba);
-}
-
-
-/**
- * Apply X/Y/Z/W/0/1 swizzle to an array of colors/texels.
- * See GL_EXT_texture_swizzle.
- */
-static void
-swizzle_texels(GLuint swizzle, GLuint count, float4_array texels)
-{
-   const GLuint swzR = GET_SWZ(swizzle, 0);
-   const GLuint swzG = GET_SWZ(swizzle, 1);
-   const GLuint swzB = GET_SWZ(swizzle, 2);
-   const GLuint swzA = GET_SWZ(swizzle, 3);
-   GLfloat vector[6];
-   GLuint i;
-
-   vector[SWIZZLE_ZERO] = 0;
-   vector[SWIZZLE_ONE] = 1.0F;
-
-   for (i = 0; i < count; i++) {
-      vector[SWIZZLE_X] = texels[i][0];
-      vector[SWIZZLE_Y] = texels[i][1];
-      vector[SWIZZLE_Z] = texels[i][2];
-      vector[SWIZZLE_W] = texels[i][3];
-      texels[i][RCOMP] = vector[swzR];
-      texels[i][GCOMP] = vector[swzG];
-      texels[i][BCOMP] = vector[swzB];
-      texels[i][ACOMP] = vector[swzA];
-   }
-}
-
-
-/**
- * Apply texture mapping to a span of fragments.
- */
-void
-_swrast_texture_span( struct gl_context *ctx, SWspan *span )
-{
-   SWcontext *swrast = SWRAST_CONTEXT(ctx);
-   float4_array primary_rgba;
-   GLuint unit;
-
-   primary_rgba = (float4_array) malloc(span->end * 4 * sizeof(GLfloat));
-
-   if (!primary_rgba) {
-      _mesa_error(ctx, GL_OUT_OF_MEMORY, "texture_span");
-      return;
-   }
-
-   ASSERT(span->end <= MAX_WIDTH);
-
-   /*
-    * Save copy of the incoming fragment colors (the GL_PRIMARY_COLOR)
-    */
-   if (swrast->_TextureCombinePrimary) {
-      GLuint i;
-      for (i = 0; i < span->end; i++) {
-         primary_rgba[i][RCOMP] = CHAN_TO_FLOAT(span->array->rgba[i][RCOMP]);
-         primary_rgba[i][GCOMP] = CHAN_TO_FLOAT(span->array->rgba[i][GCOMP]);
-         primary_rgba[i][BCOMP] = CHAN_TO_FLOAT(span->array->rgba[i][BCOMP]);
-         primary_rgba[i][ACOMP] = CHAN_TO_FLOAT(span->array->rgba[i][ACOMP]);
-      }
-   }
-
-   /* First must sample all bump maps */
-   for (unit = 0; unit < ctx->Const.MaxTextureUnits; unit++) {
-      const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-
-      if (texUnit->_ReallyEnabled &&
-         texUnit->_CurrentCombine->ModeRGB == GL_BUMP_ENVMAP_ATI) {
-         const GLfloat (*texcoords)[4] = (const GLfloat (*)[4])
-            span->array->attribs[FRAG_ATTRIB_TEX0 + unit];
-         float4_array targetcoords =
-            span->array->attribs[FRAG_ATTRIB_TEX0 +
-               ctx->Texture.Unit[unit].BumpTarget - GL_TEXTURE0];
-
-         const struct gl_texture_object *curObj = texUnit->_Current;
-         GLfloat *lambda = span->array->lambda[unit];
-         float4_array texels = get_texel_array(swrast, unit);
-         GLuint i;
-         GLfloat rotMatrix00 = ctx->Texture.Unit[unit].RotMatrix[0];
-         GLfloat rotMatrix01 = ctx->Texture.Unit[unit].RotMatrix[1];
-         GLfloat rotMatrix10 = ctx->Texture.Unit[unit].RotMatrix[2];
-         GLfloat rotMatrix11 = ctx->Texture.Unit[unit].RotMatrix[3];
-
-         /* adjust texture lod (lambda) */
-         if (span->arrayMask & SPAN_LAMBDA) {
-            if (texUnit->LodBias + curObj->Sampler.LodBias != 0.0F) {
-               /* apply LOD bias, but don't clamp yet */
-               const GLfloat bias = CLAMP(texUnit->LodBias + curObj->Sampler.LodBias,
-                                          -ctx->Const.MaxTextureLodBias,
-                                          ctx->Const.MaxTextureLodBias);
-               GLuint i;
-               for (i = 0; i < span->end; i++) {
-                  lambda[i] += bias;
-               }
-            }
-
-            if (curObj->Sampler.MinLod != -1000.0 ||
-                curObj->Sampler.MaxLod != 1000.0) {
-               /* apply LOD clamping to lambda */
-               const GLfloat min = curObj->Sampler.MinLod;
-               const GLfloat max = curObj->Sampler.MaxLod;
-               GLuint i;
-               for (i = 0; i < span->end; i++) {
-                  GLfloat l = lambda[i];
-                  lambda[i] = CLAMP(l, min, max);
-               }
-            }
-         }
-
-         /* Sample the texture (span->end = number of fragments) */
-         swrast->TextureSample[unit]( ctx, texUnit->_Current, span->end,
-                                      texcoords, lambda, texels );
-
-         /* manipulate the span values of the bump target
-            not sure this can work correctly even ignoring
-            the problem that channel is unsigned */
-         for (i = 0; i < span->end; i++) {
-            targetcoords[i][0] += (texels[i][0] * rotMatrix00 + texels[i][1] *
-                                  rotMatrix01) / targetcoords[i][3];
-            targetcoords[i][1] += (texels[i][0] * rotMatrix10 + texels[i][1] *
-                                  rotMatrix11) / targetcoords[i][3];
-         }
-      }
-   }
-
-   /*
-    * Must do all texture sampling before combining in order to
-    * accomodate GL_ARB_texture_env_crossbar.
-    */
-   for (unit = 0; unit < ctx->Const.MaxTextureUnits; unit++) {
-      const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-      if (texUnit->_ReallyEnabled &&
-          texUnit->_CurrentCombine->ModeRGB != GL_BUMP_ENVMAP_ATI) {
-         const GLfloat (*texcoords)[4] = (const GLfloat (*)[4])
-            span->array->attribs[FRAG_ATTRIB_TEX0 + unit];
-         const struct gl_texture_object *curObj = texUnit->_Current;
-         GLfloat *lambda = span->array->lambda[unit];
-         float4_array texels = get_texel_array(swrast, unit);
-
-         /* adjust texture lod (lambda) */
-         if (span->arrayMask & SPAN_LAMBDA) {
-            if (texUnit->LodBias + curObj->Sampler.LodBias != 0.0F) {
-               /* apply LOD bias, but don't clamp yet */
-               const GLfloat bias = CLAMP(texUnit->LodBias + curObj->Sampler.LodBias,
-                                          -ctx->Const.MaxTextureLodBias,
-                                          ctx->Const.MaxTextureLodBias);
-               GLuint i;
-               for (i = 0; i < span->end; i++) {
-                  lambda[i] += bias;
-               }
-            }
-
-            if (curObj->Sampler.MinLod != -1000.0 ||
-                curObj->Sampler.MaxLod != 1000.0) {
-               /* apply LOD clamping to lambda */
-               const GLfloat min = curObj->Sampler.MinLod;
-               const GLfloat max = curObj->Sampler.MaxLod;
-               GLuint i;
-               for (i = 0; i < span->end; i++) {
-                  GLfloat l = lambda[i];
-                  lambda[i] = CLAMP(l, min, max);
-               }
-            }
-         }
-         else if (curObj->Sampler.MaxAnisotropy > 1.0 &&
-                  curObj->Sampler.MinFilter == GL_LINEAR_MIPMAP_LINEAR) {
-            /* sample_lambda_2d_aniso is beeing used as texture_sample_func,
-             * it requires the current SWspan *span as an additional parameter.
-             * In order to keep the same function signature, the unused lambda
-             * parameter will be modified to actually contain the SWspan pointer.
-             * This is a Hack. To make it right, the texture_sample_func
-             * signature and all implementing functions need to be modified.
-             */
-            /* "hide" SWspan struct; cast to (GLfloat *) to suppress warning */
-            lambda = (GLfloat *)span;
-         }
-
-         /* Sample the texture (span->end = number of fragments) */
-         swrast->TextureSample[unit]( ctx, texUnit->_Current, span->end,
-                                      texcoords, lambda, texels );
-
-         /* GL_EXT_texture_swizzle */
-         if (curObj->_Swizzle != SWIZZLE_NOOP) {
-            swizzle_texels(curObj->_Swizzle, span->end, texels);
-         }
-      }
-   }
-
-   /*
-    * OK, now apply the texture (aka texture combine/blend).
-    * We modify the span->color.rgba values.
-    */
-   for (unit = 0; unit < ctx->Const.MaxTextureUnits; unit++) {
-      if (ctx->Texture.Unit[unit]._ReallyEnabled) {
-         texture_combine( ctx, unit, span->end,
-                          primary_rgba,
-                          swrast->TexelBuffer,
-                          span->array->rgba );
-      }
-   }
-
-   free(primary_rgba);
-}
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.5
+ *
+ * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
+ * Copyright (C) 2009  VMware, Inc.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+#include "main/glheader.h"
+#include "main/context.h"
+#include "main/colormac.h"
+#include "main/imports.h"
+#include "main/pixeltransfer.h"
+#include "program/prog_instruction.h"
+
+#include "s_context.h"
+#include "s_texcombine.h"
+
+
+/**
+ * Pointer to array of float[4]
+ * This type makes the code below more concise and avoids a lot of casting.
+ */
+typedef float (*float4_array)[4];
+
+
+/**
+ * Return array of texels for given unit.
+ */
+static INLINE float4_array
+get_texel_array(SWcontext *swrast, GLuint unit)
+{
+#ifdef _OPENMP
+   return (float4_array) (swrast->TexelBuffer + unit * MAX_WIDTH * 4 * omp_get_num_threads() + (MAX_WIDTH * 4 * omp_get_thread_num()));
+#else
+   return (float4_array) (swrast->TexelBuffer + unit * MAX_WIDTH * 4);
+#endif
+}
+
+
+
+/**
+ * Do texture application for:
+ *  GL_EXT_texture_env_combine
+ *  GL_ARB_texture_env_combine
+ *  GL_EXT_texture_env_dot3
+ *  GL_ARB_texture_env_dot3
+ *  GL_ATI_texture_env_combine3
+ *  GL_NV_texture_env_combine4
+ *  conventional GL texture env modes
+ *
+ * \param ctx          rendering context
+ * \param unit         the texture combiner unit
+ * \param n            number of fragments to process (span width)
+ * \param primary_rgba incoming fragment color array
+ * \param texelBuffer  pointer to texel colors for all texture units
+ * 
+ * \param rgba         incoming/result fragment colors
+ */
+static void
+texture_combine( struct gl_context *ctx, GLuint unit, GLuint n,
+                 const float4_array primary_rgba,
+                 const GLfloat *texelBuffer,
+                 GLchan (*rgbaChan)[4] )
+{
+   SWcontext *swrast = SWRAST_CONTEXT(ctx);
+   const struct gl_texture_unit *textureUnit = &(ctx->Texture.Unit[unit]);
+   const struct gl_tex_env_combine_state *combine = textureUnit->_CurrentCombine;
+   float4_array argRGB[MAX_COMBINER_TERMS];
+   float4_array argA[MAX_COMBINER_TERMS];
+   const GLfloat scaleRGB = (GLfloat) (1 << combine->ScaleShiftRGB);
+   const GLfloat scaleA = (GLfloat) (1 << combine->ScaleShiftA);
+   const GLuint numArgsRGB = combine->_NumArgsRGB;
+   const GLuint numArgsA = combine->_NumArgsA;
+   float4_array ccolor[4], rgba;
+   GLuint i, term;
+
+   /* alloc temp pixel buffers */
+   rgba = (float4_array) malloc(4 * n * sizeof(GLfloat));
+   if (!rgba) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "texture_combine");
+      return;
+   }
+
+   for (i = 0; i < numArgsRGB || i < numArgsA; i++) {
+      ccolor[i] = (float4_array) malloc(4 * n * sizeof(GLfloat));
+      if (!ccolor[i]) {
+         while (i) {
+            free(ccolor[i]);
+            i--;
+         }
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "texture_combine");
+         return;
+      }
+   }
+
+   for (i = 0; i < n; i++) {
+      rgba[i][RCOMP] = CHAN_TO_FLOAT(rgbaChan[i][RCOMP]);
+      rgba[i][GCOMP] = CHAN_TO_FLOAT(rgbaChan[i][GCOMP]);
+      rgba[i][BCOMP] = CHAN_TO_FLOAT(rgbaChan[i][BCOMP]);
+      rgba[i][ACOMP] = CHAN_TO_FLOAT(rgbaChan[i][ACOMP]);
+   }
+
+   /*
+   printf("modeRGB 0x%x  modeA 0x%x  srcRGB1 0x%x  srcA1 0x%x  srcRGB2 0x%x  srcA2 0x%x\n",
+          combine->ModeRGB,
+          combine->ModeA,
+          combine->SourceRGB[0],
+          combine->SourceA[0],
+          combine->SourceRGB[1],
+          combine->SourceA[1]);
+   */
+
+   /*
+    * Do operand setup for up to 4 operands.  Loop over the terms.
+    */
+   for (term = 0; term < numArgsRGB; term++) {
+      const GLenum srcRGB = combine->SourceRGB[term];
+      const GLenum operandRGB = combine->OperandRGB[term];
+
+      switch (srcRGB) {
+         case GL_TEXTURE:
+            argRGB[term] = get_texel_array(swrast, unit);
+            break;
+         case GL_PRIMARY_COLOR:
+            argRGB[term] = primary_rgba;
+            break;
+         case GL_PREVIOUS:
+            argRGB[term] = rgba;
+            break;
+         case GL_CONSTANT:
+            {
+               float4_array c = ccolor[term];
+               GLfloat red   = textureUnit->EnvColor[0];
+               GLfloat green = textureUnit->EnvColor[1];
+               GLfloat blue  = textureUnit->EnvColor[2];
+               GLfloat alpha = textureUnit->EnvColor[3];
+               for (i = 0; i < n; i++) {
+                  ASSIGN_4V(c[i], red, green, blue, alpha);
+               }
+               argRGB[term] = ccolor[term];
+            }
+            break;
+	 /* GL_ATI_texture_env_combine3 allows GL_ZERO & GL_ONE as sources.
+	  */
+	 case GL_ZERO:
+            {
+               float4_array c = ccolor[term];
+               for (i = 0; i < n; i++) {
+                  ASSIGN_4V(c[i], 0.0F, 0.0F, 0.0F, 0.0F);
+               }
+               argRGB[term] = ccolor[term];
+            }
+            break;
+	 case GL_ONE:
+            {
+               float4_array c = ccolor[term];
+               for (i = 0; i < n; i++) {
+                  ASSIGN_4V(c[i], 1.0F, 1.0F, 1.0F, 1.0F);
+               }
+               argRGB[term] = ccolor[term];
+            }
+            break;
+         default:
+            /* ARB_texture_env_crossbar source */
+            {
+               const GLuint srcUnit = srcRGB - GL_TEXTURE0;
+               ASSERT(srcUnit < ctx->Const.MaxTextureUnits);
+               if (!ctx->Texture.Unit[srcUnit]._ReallyEnabled)
+                  goto end;
+               argRGB[term] = get_texel_array(swrast, srcUnit);
+            }
+      }
+
+      if (operandRGB != GL_SRC_COLOR) {
+         float4_array src = argRGB[term];
+         float4_array dst = ccolor[term];
+
+         /* point to new arg[term] storage */
+         argRGB[term] = ccolor[term];
+
+         switch (operandRGB) {
+         case GL_ONE_MINUS_SRC_COLOR:
+            for (i = 0; i < n; i++) {
+               dst[i][RCOMP] = 1.0F - src[i][RCOMP];
+               dst[i][GCOMP] = 1.0F - src[i][GCOMP];
+               dst[i][BCOMP] = 1.0F - src[i][BCOMP];
+            }
+            break;
+         case GL_SRC_ALPHA:
+            for (i = 0; i < n; i++) {
+               dst[i][RCOMP] =
+               dst[i][GCOMP] =
+               dst[i][BCOMP] = src[i][ACOMP];
+            }
+            break;
+         case GL_ONE_MINUS_SRC_ALPHA:
+            for (i = 0; i < n; i++) {
+               dst[i][RCOMP] =
+               dst[i][GCOMP] =
+               dst[i][BCOMP] = 1.0F - src[i][ACOMP];
+            }
+            break;
+         default:
+            _mesa_problem(ctx, "Bad operandRGB");
+         }
+      }
+   }
+
+   /*
+    * Set up the argA[term] pointers
+    */
+   for (term = 0; term < numArgsA; term++) {
+      const GLenum srcA = combine->SourceA[term];
+      const GLenum operandA = combine->OperandA[term];
+
+      switch (srcA) {
+         case GL_TEXTURE:
+            argA[term] = get_texel_array(swrast, unit);
+            break;
+         case GL_PRIMARY_COLOR:
+            argA[term] = primary_rgba;
+            break;
+         case GL_PREVIOUS:
+            argA[term] = rgba;
+            break;
+         case GL_CONSTANT:
+            {
+               float4_array c = ccolor[term];
+               GLfloat alpha = textureUnit->EnvColor[3];
+               for (i = 0; i < n; i++)
+                  c[i][ACOMP] = alpha;
+               argA[term] = ccolor[term];
+            }
+            break;
+	 /* GL_ATI_texture_env_combine3 allows GL_ZERO & GL_ONE as sources.
+	  */
+	 case GL_ZERO:
+            {
+               float4_array c = ccolor[term];
+               for (i = 0; i < n; i++)
+                  c[i][ACOMP] = 0.0F;
+               argA[term] = ccolor[term];
+            }
+            break;
+	 case GL_ONE:
+            {
+               float4_array c = ccolor[term];
+               for (i = 0; i < n; i++)
+                  c[i][ACOMP] = 1.0F;
+               argA[term] = ccolor[term];
+            }
+            break;
+         default:
+            /* ARB_texture_env_crossbar source */
+            {
+               const GLuint srcUnit = srcA - GL_TEXTURE0;
+               ASSERT(srcUnit < ctx->Const.MaxTextureUnits);
+               if (!ctx->Texture.Unit[srcUnit]._ReallyEnabled)
+                  goto end;
+               argA[term] = get_texel_array(swrast, srcUnit);
+            }
+      }
+
+      if (operandA == GL_ONE_MINUS_SRC_ALPHA) {
+         float4_array src = argA[term];
+         float4_array dst = ccolor[term];
+         argA[term] = ccolor[term];
+         for (i = 0; i < n; i++) {
+            dst[i][ACOMP] = 1.0F - src[i][ACOMP];
+         }
+      }
+   }
+
+   /* RGB channel combine */
+   {
+      float4_array arg0 = argRGB[0];
+      float4_array arg1 = argRGB[1];
+      float4_array arg2 = argRGB[2];
+      float4_array arg3 = argRGB[3];
+
+      switch (combine->ModeRGB) {
+      case GL_REPLACE:
+         for (i = 0; i < n; i++) {
+            rgba[i][RCOMP] = arg0[i][RCOMP] * scaleRGB;
+            rgba[i][GCOMP] = arg0[i][GCOMP] * scaleRGB;
+            rgba[i][BCOMP] = arg0[i][BCOMP] * scaleRGB;
+         }
+         break;
+      case GL_MODULATE:
+         for (i = 0; i < n; i++) {
+            rgba[i][RCOMP] = arg0[i][RCOMP] * arg1[i][RCOMP] * scaleRGB;
+            rgba[i][GCOMP] = arg0[i][GCOMP] * arg1[i][GCOMP] * scaleRGB;
+            rgba[i][BCOMP] = arg0[i][BCOMP] * arg1[i][BCOMP] * scaleRGB;
+         }
+         break;
+      case GL_ADD:
+         if (textureUnit->EnvMode == GL_COMBINE4_NV) {
+            /* (a * b) + (c * d) */
+            for (i = 0; i < n; i++) {
+               rgba[i][RCOMP] = (arg0[i][RCOMP] * arg1[i][RCOMP] +
+                                 arg2[i][RCOMP] * arg3[i][RCOMP]) * scaleRGB;
+               rgba[i][GCOMP] = (arg0[i][GCOMP] * arg1[i][GCOMP] +
+                                 arg2[i][GCOMP] * arg3[i][GCOMP]) * scaleRGB;
+               rgba[i][BCOMP] = (arg0[i][BCOMP] * arg1[i][BCOMP] +
+                                 arg2[i][BCOMP] * arg3[i][BCOMP]) * scaleRGB;
+            }
+         }
+         else {
+            /* 2-term addition */
+            for (i = 0; i < n; i++) {
+               rgba[i][RCOMP] = (arg0[i][RCOMP] + arg1[i][RCOMP]) * scaleRGB;
+               rgba[i][GCOMP] = (arg0[i][GCOMP] + arg1[i][GCOMP]) * scaleRGB;
+               rgba[i][BCOMP] = (arg0[i][BCOMP] + arg1[i][BCOMP]) * scaleRGB;
+            }
+         }
+         break;
+      case GL_ADD_SIGNED:
+         if (textureUnit->EnvMode == GL_COMBINE4_NV) {
+            /* (a * b) + (c * d) - 0.5 */
+            for (i = 0; i < n; i++) {
+               rgba[i][RCOMP] = (arg0[i][RCOMP] * arg1[i][RCOMP] +
+                                 arg2[i][RCOMP] * arg3[i][RCOMP] - 0.5F) * scaleRGB;
+               rgba[i][GCOMP] = (arg0[i][GCOMP] * arg1[i][GCOMP] +
+                                 arg2[i][GCOMP] * arg3[i][GCOMP] - 0.5F) * scaleRGB;
+               rgba[i][BCOMP] = (arg0[i][BCOMP] * arg1[i][BCOMP] +
+                                 arg2[i][BCOMP] * arg3[i][BCOMP] - 0.5F) * scaleRGB;
+            }
+         }
+         else {
+            for (i = 0; i < n; i++) {
+               rgba[i][RCOMP] = (arg0[i][RCOMP] + arg1[i][RCOMP] - 0.5F) * scaleRGB;
+               rgba[i][GCOMP] = (arg0[i][GCOMP] + arg1[i][GCOMP] - 0.5F) * scaleRGB;
+               rgba[i][BCOMP] = (arg0[i][BCOMP] + arg1[i][BCOMP] - 0.5F) * scaleRGB;
+            }
+         }
+         break;
+      case GL_INTERPOLATE:
+         for (i = 0; i < n; i++) {
+            rgba[i][RCOMP] = (arg0[i][RCOMP] * arg2[i][RCOMP] +
+                          arg1[i][RCOMP] * (1.0F - arg2[i][RCOMP])) * scaleRGB;
+            rgba[i][GCOMP] = (arg0[i][GCOMP] * arg2[i][GCOMP] +
+                          arg1[i][GCOMP] * (1.0F - arg2[i][GCOMP])) * scaleRGB;
+            rgba[i][BCOMP] = (arg0[i][BCOMP] * arg2[i][BCOMP] +
+                          arg1[i][BCOMP] * (1.0F - arg2[i][BCOMP])) * scaleRGB;
+         }
+         break;
+      case GL_SUBTRACT:
+         for (i = 0; i < n; i++) {
+            rgba[i][RCOMP] = (arg0[i][RCOMP] - arg1[i][RCOMP]) * scaleRGB;
+            rgba[i][GCOMP] = (arg0[i][GCOMP] - arg1[i][GCOMP]) * scaleRGB;
+            rgba[i][BCOMP] = (arg0[i][BCOMP] - arg1[i][BCOMP]) * scaleRGB;
+         }
+         break;
+      case GL_DOT3_RGB_EXT:
+      case GL_DOT3_RGBA_EXT:
+         /* Do not scale the result by 1 2 or 4 */
+         for (i = 0; i < n; i++) {
+            GLfloat dot = ((arg0[i][RCOMP] - 0.5F) * (arg1[i][RCOMP] - 0.5F) +
+                           (arg0[i][GCOMP] - 0.5F) * (arg1[i][GCOMP] - 0.5F) +
+                           (arg0[i][BCOMP] - 0.5F) * (arg1[i][BCOMP] - 0.5F))
+               * 4.0F;
+            dot = CLAMP(dot, 0.0F, 1.0F);
+            rgba[i][RCOMP] = rgba[i][GCOMP] = rgba[i][BCOMP] = dot;
+         }
+         break;
+      case GL_DOT3_RGB:
+      case GL_DOT3_RGBA:
+         /* DO scale the result by 1 2 or 4 */
+         for (i = 0; i < n; i++) {
+            GLfloat dot = ((arg0[i][RCOMP] - 0.5F) * (arg1[i][RCOMP] - 0.5F) +
+                           (arg0[i][GCOMP] - 0.5F) * (arg1[i][GCOMP] - 0.5F) +
+                           (arg0[i][BCOMP] - 0.5F) * (arg1[i][BCOMP] - 0.5F))
+               * 4.0F * scaleRGB;
+            dot = CLAMP(dot, 0.0F, 1.0F);
+            rgba[i][RCOMP] = rgba[i][GCOMP] = rgba[i][BCOMP] = dot;
+         }
+         break;
+      case GL_MODULATE_ADD_ATI:
+         for (i = 0; i < n; i++) {
+            rgba[i][RCOMP] = ((arg0[i][RCOMP] * arg2[i][RCOMP]) +
+                              arg1[i][RCOMP]) * scaleRGB;
+            rgba[i][GCOMP] = ((arg0[i][GCOMP] * arg2[i][GCOMP]) +
+                              arg1[i][GCOMP]) * scaleRGB;
+            rgba[i][BCOMP] = ((arg0[i][BCOMP] * arg2[i][BCOMP]) +
+                              arg1[i][BCOMP]) * scaleRGB;
+	 }
+         break;
+      case GL_MODULATE_SIGNED_ADD_ATI:
+         for (i = 0; i < n; i++) {
+            rgba[i][RCOMP] = ((arg0[i][RCOMP] * arg2[i][RCOMP]) +
+                              arg1[i][RCOMP] - 0.5F) * scaleRGB;
+            rgba[i][GCOMP] = ((arg0[i][GCOMP] * arg2[i][GCOMP]) +
+                              arg1[i][GCOMP] - 0.5F) * scaleRGB;
+            rgba[i][BCOMP] = ((arg0[i][BCOMP] * arg2[i][BCOMP]) +
+                              arg1[i][BCOMP] - 0.5F) * scaleRGB;
+	 }
+         break;
+      case GL_MODULATE_SUBTRACT_ATI:
+         for (i = 0; i < n; i++) {
+            rgba[i][RCOMP] = ((arg0[i][RCOMP] * arg2[i][RCOMP]) -
+                              arg1[i][RCOMP]) * scaleRGB;
+            rgba[i][GCOMP] = ((arg0[i][GCOMP] * arg2[i][GCOMP]) -
+                              arg1[i][GCOMP]) * scaleRGB;
+            rgba[i][BCOMP] = ((arg0[i][BCOMP] * arg2[i][BCOMP]) -
+                              arg1[i][BCOMP]) * scaleRGB;
+	 }
+         break;
+      case GL_BUMP_ENVMAP_ATI:
+         /* this produces a fixed rgba color, and the coord calc is done elsewhere */
+         for (i = 0; i < n; i++) {
+            /* rgba result is 0,0,0,1 */
+            rgba[i][RCOMP] = 0.0;
+            rgba[i][GCOMP] = 0.0;
+            rgba[i][BCOMP] = 0.0;
+            rgba[i][ACOMP] = 1.0;
+	 }
+         goto end; /* no alpha processing */
+      default:
+         _mesa_problem(ctx, "invalid combine mode");
+      }
+   }
+
+   /* Alpha channel combine */
+   {
+      float4_array arg0 = argA[0];
+      float4_array arg1 = argA[1];
+      float4_array arg2 = argA[2];
+      float4_array arg3 = argA[3];
+
+      switch (combine->ModeA) {
+      case GL_REPLACE:
+         for (i = 0; i < n; i++) {
+            rgba[i][ACOMP] = arg0[i][ACOMP] * scaleA;
+         }
+         break;
+      case GL_MODULATE:
+         for (i = 0; i < n; i++) {
+            rgba[i][ACOMP] = arg0[i][ACOMP] * arg1[i][ACOMP] * scaleA;
+         }
+         break;
+      case GL_ADD:
+         if (textureUnit->EnvMode == GL_COMBINE4_NV) {
+            /* (a * b) + (c * d) */
+            for (i = 0; i < n; i++) {
+               rgba[i][ACOMP] = (arg0[i][ACOMP] * arg1[i][ACOMP] +
+                                 arg2[i][ACOMP] * arg3[i][ACOMP]) * scaleA;
+            }
+         }
+         else {
+            /* two-term add */
+            for (i = 0; i < n; i++) {
+               rgba[i][ACOMP] = (arg0[i][ACOMP] + arg1[i][ACOMP]) * scaleA;
+            }
+         }
+         break;
+      case GL_ADD_SIGNED:
+         if (textureUnit->EnvMode == GL_COMBINE4_NV) {
+            /* (a * b) + (c * d) - 0.5 */
+            for (i = 0; i < n; i++) {
+               rgba[i][ACOMP] = (arg0[i][ACOMP] * arg1[i][ACOMP] +
+                                 arg2[i][ACOMP] * arg3[i][ACOMP] -
+                                 0.5F) * scaleA;
+            }
+         }
+         else {
+            /* a + b - 0.5 */
+            for (i = 0; i < n; i++) {
+               rgba[i][ACOMP] = (arg0[i][ACOMP] + arg1[i][ACOMP] - 0.5F) * scaleA;
+            }
+         }
+         break;
+      case GL_INTERPOLATE:
+         for (i = 0; i < n; i++) {
+            rgba[i][ACOMP] = (arg0[i][ACOMP] * arg2[i][ACOMP] +
+                              arg1[i][ACOMP] * (1.0F - arg2[i][ACOMP]))
+               * scaleA;
+         }
+         break;
+      case GL_SUBTRACT:
+         for (i = 0; i < n; i++) {
+            rgba[i][ACOMP] = (arg0[i][ACOMP] - arg1[i][ACOMP]) * scaleA;
+         }
+         break;
+      case GL_MODULATE_ADD_ATI:
+         for (i = 0; i < n; i++) {
+            rgba[i][ACOMP] = ((arg0[i][ACOMP] * arg2[i][ACOMP])
+                              + arg1[i][ACOMP]) * scaleA;
+         }
+         break;
+      case GL_MODULATE_SIGNED_ADD_ATI:
+         for (i = 0; i < n; i++) {
+            rgba[i][ACOMP] = ((arg0[i][ACOMP] * arg2[i][ACOMP]) +
+                              arg1[i][ACOMP] - 0.5F) * scaleA;
+         }
+         break;
+      case GL_MODULATE_SUBTRACT_ATI:
+         for (i = 0; i < n; i++) {
+            rgba[i][ACOMP] = ((arg0[i][ACOMP] * arg2[i][ACOMP])
+                              - arg1[i][ACOMP]) * scaleA;
+         }
+         break;
+      default:
+         _mesa_problem(ctx, "invalid combine mode");
+      }
+   }
+
+   /* Fix the alpha component for GL_DOT3_RGBA_EXT/ARB combining.
+    * This is kind of a kludge.  It would have been better if the spec
+    * were written such that the GL_COMBINE_ALPHA value could be set to
+    * GL_DOT3.
+    */
+   if (combine->ModeRGB == GL_DOT3_RGBA_EXT ||
+       combine->ModeRGB == GL_DOT3_RGBA) {
+      for (i = 0; i < n; i++) {
+	 rgba[i][ACOMP] = rgba[i][RCOMP];
+      }
+   }
+
+   for (i = 0; i < n; i++) {
+      UNCLAMPED_FLOAT_TO_CHAN(rgbaChan[i][RCOMP], rgba[i][RCOMP]);
+      UNCLAMPED_FLOAT_TO_CHAN(rgbaChan[i][GCOMP], rgba[i][GCOMP]);
+      UNCLAMPED_FLOAT_TO_CHAN(rgbaChan[i][BCOMP], rgba[i][BCOMP]);
+      UNCLAMPED_FLOAT_TO_CHAN(rgbaChan[i][ACOMP], rgba[i][ACOMP]);
+   }
+
+end:
+   for (i = 0; i < numArgsRGB || i < numArgsA; i++) {
+      free(ccolor[i]);
+   }
+   free(rgba);
+}
+
+
+/**
+ * Apply X/Y/Z/W/0/1 swizzle to an array of colors/texels.
+ * See GL_EXT_texture_swizzle.
+ */
+static void
+swizzle_texels(GLuint swizzle, GLuint count, float4_array texels)
+{
+   const GLuint swzR = GET_SWZ(swizzle, 0);
+   const GLuint swzG = GET_SWZ(swizzle, 1);
+   const GLuint swzB = GET_SWZ(swizzle, 2);
+   const GLuint swzA = GET_SWZ(swizzle, 3);
+   GLfloat vector[6];
+   GLuint i;
+
+   vector[SWIZZLE_ZERO] = 0;
+   vector[SWIZZLE_ONE] = 1.0F;
+
+   for (i = 0; i < count; i++) {
+      vector[SWIZZLE_X] = texels[i][0];
+      vector[SWIZZLE_Y] = texels[i][1];
+      vector[SWIZZLE_Z] = texels[i][2];
+      vector[SWIZZLE_W] = texels[i][3];
+      texels[i][RCOMP] = vector[swzR];
+      texels[i][GCOMP] = vector[swzG];
+      texels[i][BCOMP] = vector[swzB];
+      texels[i][ACOMP] = vector[swzA];
+   }
+}
+
+
+/**
+ * Apply texture mapping to a span of fragments.
+ */
+void
+_swrast_texture_span( struct gl_context *ctx, SWspan *span )
+{
+   SWcontext *swrast = SWRAST_CONTEXT(ctx);
+   float4_array primary_rgba;
+   GLuint unit;
+
+   primary_rgba = (float4_array) malloc(span->end * 4 * sizeof(GLfloat));
+
+   if (!primary_rgba) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "texture_span");
+      return;
+   }
+
+   ASSERT(span->end <= MAX_WIDTH);
+
+   /*
+    * Save copy of the incoming fragment colors (the GL_PRIMARY_COLOR)
+    */
+   if (swrast->_TextureCombinePrimary) {
+      GLuint i;
+      for (i = 0; i < span->end; i++) {
+         primary_rgba[i][RCOMP] = CHAN_TO_FLOAT(span->array->rgba[i][RCOMP]);
+         primary_rgba[i][GCOMP] = CHAN_TO_FLOAT(span->array->rgba[i][GCOMP]);
+         primary_rgba[i][BCOMP] = CHAN_TO_FLOAT(span->array->rgba[i][BCOMP]);
+         primary_rgba[i][ACOMP] = CHAN_TO_FLOAT(span->array->rgba[i][ACOMP]);
+      }
+   }
+
+   /* First must sample all bump maps */
+   for (unit = 0; unit < ctx->Const.MaxTextureUnits; unit++) {
+      const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+
+      if (texUnit->_ReallyEnabled &&
+         texUnit->_CurrentCombine->ModeRGB == GL_BUMP_ENVMAP_ATI) {
+         const GLfloat (*texcoords)[4] = (const GLfloat (*)[4])
+            span->array->attribs[FRAG_ATTRIB_TEX0 + unit];
+         float4_array targetcoords =
+            span->array->attribs[FRAG_ATTRIB_TEX0 +
+               ctx->Texture.Unit[unit].BumpTarget - GL_TEXTURE0];
+
+         const struct gl_texture_object *curObj = texUnit->_Current;
+         GLfloat *lambda = span->array->lambda[unit];
+         float4_array texels = get_texel_array(swrast, unit);
+         GLuint i;
+         GLfloat rotMatrix00 = ctx->Texture.Unit[unit].RotMatrix[0];
+         GLfloat rotMatrix01 = ctx->Texture.Unit[unit].RotMatrix[1];
+         GLfloat rotMatrix10 = ctx->Texture.Unit[unit].RotMatrix[2];
+         GLfloat rotMatrix11 = ctx->Texture.Unit[unit].RotMatrix[3];
+
+         /* adjust texture lod (lambda) */
+         if (span->arrayMask & SPAN_LAMBDA) {
+            if (texUnit->LodBias + curObj->Sampler.LodBias != 0.0F) {
+               /* apply LOD bias, but don't clamp yet */
+               const GLfloat bias = CLAMP(texUnit->LodBias + curObj->Sampler.LodBias,
+                                          -ctx->Const.MaxTextureLodBias,
+                                          ctx->Const.MaxTextureLodBias);
+               GLuint i;
+               for (i = 0; i < span->end; i++) {
+                  lambda[i] += bias;
+               }
+            }
+
+            if (curObj->Sampler.MinLod != -1000.0 ||
+                curObj->Sampler.MaxLod != 1000.0) {
+               /* apply LOD clamping to lambda */
+               const GLfloat min = curObj->Sampler.MinLod;
+               const GLfloat max = curObj->Sampler.MaxLod;
+               GLuint i;
+               for (i = 0; i < span->end; i++) {
+                  GLfloat l = lambda[i];
+                  lambda[i] = CLAMP(l, min, max);
+               }
+            }
+         }
+
+         /* Sample the texture (span->end = number of fragments) */
+         swrast->TextureSample[unit]( ctx, texUnit->_Current, span->end,
+                                      texcoords, lambda, texels );
+
+         /* manipulate the span values of the bump target
+            not sure this can work correctly even ignoring
+            the problem that channel is unsigned */
+         for (i = 0; i < span->end; i++) {
+            targetcoords[i][0] += (texels[i][0] * rotMatrix00 + texels[i][1] *
+                                  rotMatrix01) / targetcoords[i][3];
+            targetcoords[i][1] += (texels[i][0] * rotMatrix10 + texels[i][1] *
+                                  rotMatrix11) / targetcoords[i][3];
+         }
+      }
+   }
+
+   /*
+    * Must do all texture sampling before combining in order to
+    * accomodate GL_ARB_texture_env_crossbar.
+    */
+   for (unit = 0; unit < ctx->Const.MaxTextureUnits; unit++) {
+      const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+      if (texUnit->_ReallyEnabled &&
+          texUnit->_CurrentCombine->ModeRGB != GL_BUMP_ENVMAP_ATI) {
+         const GLfloat (*texcoords)[4] = (const GLfloat (*)[4])
+            span->array->attribs[FRAG_ATTRIB_TEX0 + unit];
+         const struct gl_texture_object *curObj = texUnit->_Current;
+         GLfloat *lambda = span->array->lambda[unit];
+         float4_array texels = get_texel_array(swrast, unit);
+
+         /* adjust texture lod (lambda) */
+         if (span->arrayMask & SPAN_LAMBDA) {
+            if (texUnit->LodBias + curObj->Sampler.LodBias != 0.0F) {
+               /* apply LOD bias, but don't clamp yet */
+               const GLfloat bias = CLAMP(texUnit->LodBias + curObj->Sampler.LodBias,
+                                          -ctx->Const.MaxTextureLodBias,
+                                          ctx->Const.MaxTextureLodBias);
+               GLuint i;
+               for (i = 0; i < span->end; i++) {
+                  lambda[i] += bias;
+               }
+            }
+
+            if (curObj->Sampler.MinLod != -1000.0 ||
+                curObj->Sampler.MaxLod != 1000.0) {
+               /* apply LOD clamping to lambda */
+               const GLfloat min = curObj->Sampler.MinLod;
+               const GLfloat max = curObj->Sampler.MaxLod;
+               GLuint i;
+               for (i = 0; i < span->end; i++) {
+                  GLfloat l = lambda[i];
+                  lambda[i] = CLAMP(l, min, max);
+               }
+            }
+         }
+         else if (curObj->Sampler.MaxAnisotropy > 1.0 &&
+                  curObj->Sampler.MinFilter == GL_LINEAR_MIPMAP_LINEAR) {
+            /* sample_lambda_2d_aniso is beeing used as texture_sample_func,
+             * it requires the current SWspan *span as an additional parameter.
+             * In order to keep the same function signature, the unused lambda
+             * parameter will be modified to actually contain the SWspan pointer.
+             * This is a Hack. To make it right, the texture_sample_func
+             * signature and all implementing functions need to be modified.
+             */
+            /* "hide" SWspan struct; cast to (GLfloat *) to suppress warning */
+            lambda = (GLfloat *)span;
+         }
+
+         /* Sample the texture (span->end = number of fragments) */
+         swrast->TextureSample[unit]( ctx, texUnit->_Current, span->end,
+                                      texcoords, lambda, texels );
+
+         /* GL_EXT_texture_swizzle */
+         if (curObj->_Swizzle != SWIZZLE_NOOP) {
+            swizzle_texels(curObj->_Swizzle, span->end, texels);
+         }
+      }
+   }
+
+   /*
+    * OK, now apply the texture (aka texture combine/blend).
+    * We modify the span->color.rgba values.
+    */
+   for (unit = 0; unit < ctx->Const.MaxTextureUnits; unit++) {
+      if (ctx->Texture.Unit[unit]._ReallyEnabled) {
+         texture_combine( ctx, unit, span->end,
+                          primary_rgba,
+                          swrast->TexelBuffer,
+                          span->array->rgba );
+      }
+   }
+
+   free(primary_rgba);
+}
-- 
cgit v1.2.3