41 files changed, 5640 insertions, 5688 deletions
diff --git a/mesalib/src/SConscript b/mesalib/src/SConscript
index 1eee8761a..0a30838de 100644
--- a/mesalib/src/SConscript
+++ b/mesalib/src/SConscript
@@ -22,9 +22,11 @@ SConscript('mesa/SConscript')
 SConscript('mapi/vgapi/SConscript')
 
 if not env['embedded']:
-    SConscript('glx/SConscript')
-    SConscript('egl/main/SConscript')
-    SConscript('glu/sgi/SConscript')
+    if env['platform'] not in ['windows', 'darwin']:  # glx is skipped on 'windows' and 'darwin'
+        SConscript('glx/SConscript')
+    if env['platform'] not in ['darwin']:  # egl and glu are skipped on 'darwin' only
+        SConscript('egl/main/SConscript')
+        SConscript('glu/sgi/SConscript')
 
     if env['gles']:
         SConscript('mapi/shared-glapi/SConscript')
diff --git a/mesalib/src/gallium/auxiliary/util/u_blit.c b/mesalib/src/gallium/auxiliary/util/u_blit.c
index e892a4a77..87530e94a 100644
--- a/mesalib/src/gallium/auxiliary/util/u_blit.c
+++ b/mesalib/src/gallium/auxiliary/util/u_blit.c
@@ -1,790 +1,794 @@
-/**************************************************************************
- *
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * Copy/blit pixel rect between surfaces
- *  
- * @author Brian Paul
- */
-
-
-#include "pipe/p_context.h"
-#include "util/u_debug.h"
-#include "pipe/p_defines.h"
-#include "util/u_inlines.h"
-#include "pipe/p_shader_tokens.h"
-#include "pipe/p_state.h"
-
-#include "util/u_blit.h"
-#include "util/u_draw_quad.h"
-#include "util/u_format.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-#include "util/u_sampler.h"
-#include "util/u_simple_shaders.h"
-
-#include "cso_cache/cso_context.h"
-
-
-struct blit_state
-{
-   struct pipe_context *pipe;
-   struct cso_context *cso;
-
-   struct pipe_blend_state blend;
-   struct pipe_depth_stencil_alpha_state depthstencil_keep;
-   struct pipe_depth_stencil_alpha_state depthstencil_write;
-   struct pipe_rasterizer_state rasterizer;
-   struct pipe_sampler_state sampler;
-   struct pipe_viewport_state viewport;
-   struct pipe_clip_state clip;
-   struct pipe_vertex_element velem[2];
-   enum pipe_texture_target internal_target;
-
-   void *vs;
-   void *fs[TGSI_WRITEMASK_XYZW + 1];
-   void *fs_depth;
-
-   struct pipe_resource *vbuf;  /**< quad vertices */
-   unsigned vbuf_slot;
-
-   float vertices[4][2][4];   /**< vertex/texcoords for quad */
-};
-
-
-/**
- * Create state object for blit.
- * Intended to be created once and re-used for many blit() calls.
- */
-struct blit_state *
-util_create_blit(struct pipe_context *pipe, struct cso_context *cso)
-{
-   struct blit_state *ctx;
-   uint i;
-
-   ctx = CALLOC_STRUCT(blit_state);
-   if (!ctx)
-      return NULL;
-
-   ctx->pipe = pipe;
-   ctx->cso = cso;
-
-   /* disabled blending/masking */
-   memset(&ctx->blend, 0, sizeof(ctx->blend));
-   ctx->blend.rt[0].colormask = PIPE_MASK_RGBA;
-
-   /* no-op depth/stencil/alpha */
-   memset(&ctx->depthstencil_keep, 0, sizeof(ctx->depthstencil_keep));
-   memset(&ctx->depthstencil_write, 0, sizeof(ctx->depthstencil_write));
-   ctx->depthstencil_write.depth.enabled = 1;
-   ctx->depthstencil_write.depth.writemask = 1;
-   ctx->depthstencil_write.depth.func = PIPE_FUNC_ALWAYS;
-
-   /* rasterizer */
-   memset(&ctx->rasterizer, 0, sizeof(ctx->rasterizer));
-   ctx->rasterizer.cull_face = PIPE_FACE_NONE;
-   ctx->rasterizer.gl_rasterization_rules = 1;
-
-   /* samplers */
-   memset(&ctx->sampler, 0, sizeof(ctx->sampler));
-   ctx->sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-   ctx->sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-   ctx->sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-   ctx->sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
-   ctx->sampler.min_img_filter = 0; /* set later */
-   ctx->sampler.mag_img_filter = 0; /* set later */
-
-   /* vertex elements state */
-   memset(&ctx->velem[0], 0, sizeof(ctx->velem[0]) * 2);
-   for (i = 0; i < 2; i++) {
-      ctx->velem[i].src_offset = i * 4 * sizeof(float);
-      ctx->velem[i].instance_divisor = 0;
-      ctx->velem[i].vertex_buffer_index = 0;
-      ctx->velem[i].src_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
-   }
-
-   ctx->vbuf = NULL;
-
-   /* init vertex data that doesn't change */
-   for (i = 0; i < 4; i++) {
-      ctx->vertices[i][0][3] = 1.0f; /* w */
-      ctx->vertices[i][1][2] = 0.0f; /* r */
-      ctx->vertices[i][1][3] = 1.0f; /* q */
-   }
-
-   if(pipe->screen->get_param(pipe->screen, PIPE_CAP_NPOT_TEXTURES))
-      ctx->internal_target = PIPE_TEXTURE_2D;
-   else
-      ctx->internal_target = PIPE_TEXTURE_RECT;
-
-   return ctx;
-}
-
-
-/**
- * Destroy a blit context
- */
-void
-util_destroy_blit(struct blit_state *ctx)
-{
-   struct pipe_context *pipe = ctx->pipe;
-   unsigned i;
-
-   if (ctx->vs)
-      pipe->delete_vs_state(pipe, ctx->vs);
-
-   for (i = 0; i < Elements(ctx->fs); i++)
-      if (ctx->fs[i])
-         pipe->delete_fs_state(pipe, ctx->fs[i]);
-
-   if (ctx->fs_depth)
-      pipe->delete_fs_state(pipe, ctx->fs_depth);
-
-   pipe_resource_reference(&ctx->vbuf, NULL);
-
-   FREE(ctx);
-}
-
-
-/**
- * Helper function to set the fragment shaders.
- */
-static INLINE void
-set_fragment_shader(struct blit_state *ctx, uint writemask)
-{
-   if (!ctx->fs[writemask])
-      ctx->fs[writemask] =
-         util_make_fragment_tex_shader_writemask(ctx->pipe, TGSI_TEXTURE_2D,
-                                                 TGSI_INTERPOLATE_LINEAR,
-                                                 writemask);
-
-   cso_set_fragment_shader_handle(ctx->cso, ctx->fs[writemask]);
-}
-
-
-/**
- * Helper function to set the depthwrite shader.
- */
-static INLINE void
-set_depth_fragment_shader(struct blit_state *ctx)
-{
-   if (!ctx->fs_depth)
-      ctx->fs_depth =
-         util_make_fragment_tex_shader_writedepth(ctx->pipe, TGSI_TEXTURE_2D,
-                                                  TGSI_INTERPOLATE_LINEAR);
-
-   cso_set_fragment_shader_handle(ctx->cso, ctx->fs_depth);
-}
-
-
-/**
- * Helper function to set the vertex shader.
- */
-static INLINE void
-set_vertex_shader(struct blit_state *ctx)
-{
-   /* vertex shader - still required to provide the linkage between
-    * fragment shader input semantics and vertex_element/buffers.
-    */
-   if (!ctx->vs) {
-      const uint semantic_names[] = { TGSI_SEMANTIC_POSITION,
-                                      TGSI_SEMANTIC_GENERIC };
-      const uint semantic_indexes[] = { 0, 0 };
-      ctx->vs = util_make_vertex_passthrough_shader(ctx->pipe, 2,
-                                                    semantic_names,
-                                                    semantic_indexes);
-   }
-
-   cso_set_vertex_shader_handle(ctx->cso, ctx->vs);
-}
-
-
-/**
- * Get offset of next free slot in vertex buffer for quad vertices.
- */
-static unsigned
-get_next_slot( struct blit_state *ctx )
-{
-   const unsigned max_slots = 4096 / sizeof ctx->vertices;
-
-   if (ctx->vbuf_slot >= max_slots) 
-      util_blit_flush( ctx );
-
-   if (!ctx->vbuf) {
-      ctx->vbuf = pipe_buffer_create(ctx->pipe->screen,
-                                     PIPE_BIND_VERTEX_BUFFER,
-                                     PIPE_USAGE_STREAM,
-                                     max_slots * sizeof ctx->vertices);
-   }
-   
-   return ctx->vbuf_slot++ * sizeof ctx->vertices;
-}
-
-
-
-
-/**
- * Setup vertex data for the textured quad we'll draw.
- * Note: y=0=top
- */
-static unsigned
-setup_vertex_data_tex(struct blit_state *ctx,
-                      float x0, float y0, float x1, float y1,
-                      float s0, float t0, float s1, float t1,
-                      float z)
-{
-   unsigned offset;
-
-   ctx->vertices[0][0][0] = x0;
-   ctx->vertices[0][0][1] = y0;
-   ctx->vertices[0][0][2] = z;
-   ctx->vertices[0][1][0] = s0; /*s*/
-   ctx->vertices[0][1][1] = t0; /*t*/
-
-   ctx->vertices[1][0][0] = x1;
-   ctx->vertices[1][0][1] = y0;
-   ctx->vertices[1][0][2] = z;
-   ctx->vertices[1][1][0] = s1; /*s*/
-   ctx->vertices[1][1][1] = t0; /*t*/
-
-   ctx->vertices[2][0][0] = x1;
-   ctx->vertices[2][0][1] = y1;
-   ctx->vertices[2][0][2] = z;
-   ctx->vertices[2][1][0] = s1;
-   ctx->vertices[2][1][1] = t1;
-
-   ctx->vertices[3][0][0] = x0;
-   ctx->vertices[3][0][1] = y1;
-   ctx->vertices[3][0][2] = z;
-   ctx->vertices[3][1][0] = s0;
-   ctx->vertices[3][1][1] = t1;
-
-   offset = get_next_slot( ctx );
-
-   pipe_buffer_write_nooverlap(ctx->pipe, ctx->vbuf,
-                               offset, sizeof(ctx->vertices), ctx->vertices);
-
-   return offset;
-}
-
-
-/**
- * \return TRUE if two regions overlap, FALSE otherwise
- */
-static boolean
-regions_overlap(int srcX0, int srcY0,
-                int srcX1, int srcY1,
-                int dstX0, int dstY0,
-                int dstX1, int dstY1)
-{
-   if (MAX2(srcX0, srcX1) < MIN2(dstX0, dstX1))
-      return FALSE; /* src completely left of dst */
-
-   if (MAX2(dstX0, dstX1) < MIN2(srcX0, srcX1))
-      return FALSE; /* dst completely left of src */
-
-   if (MAX2(srcY0, srcY1) < MIN2(dstY0, dstY1))
-      return FALSE; /* src completely above dst */
-
-   if (MAX2(dstY0, dstY1) < MIN2(srcY0, srcY1))
-      return FALSE; /* dst completely above src */
-
-   return TRUE; /* some overlap */
-}
-
-
-/**
- * Copy pixel block from src surface to dst surface.
- * Overlapping regions are acceptable.
- * Flipping and stretching are supported.
- * \param filter  one of PIPE_TEX_MIPFILTER_NEAREST/LINEAR
- * \param writemask  controls which channels in the dest surface are sourced
- *                   from the src surface.  Disabled channels are sourced
- *                   from (0,0,0,1).
- * XXX need some control over blitting stencil.
- */
-void
-util_blit_pixels_writemask(struct blit_state *ctx,
-                           struct pipe_resource *src_tex,
-                           unsigned src_level,
-                           int srcX0, int srcY0,
-                           int srcX1, int srcY1,
-                           int srcZ0,
-                           struct pipe_surface *dst,
-                           int dstX0, int dstY0,
-                           int dstX1, int dstY1,
-                           float z, uint filter,
-                           uint writemask)
-{
-   struct pipe_context *pipe = ctx->pipe;
-   struct pipe_screen *screen = pipe->screen;
-   enum pipe_format src_format, dst_format;
-   struct pipe_sampler_view *sampler_view = NULL;
-   struct pipe_sampler_view sv_templ;
-   struct pipe_surface *dst_surface;
-   struct pipe_framebuffer_state fb;
-   const int srcW = abs(srcX1 - srcX0);
-   const int srcH = abs(srcY1 - srcY0);
-   unsigned offset;
-   boolean overlap, dst_is_depth;
-   float s0, t0, s1, t1;
-   boolean normalized;
-
-   assert(filter == PIPE_TEX_MIPFILTER_NEAREST ||
-          filter == PIPE_TEX_MIPFILTER_LINEAR);
-
-   assert(src_level <= src_tex->last_level);
-
-   /* do the regions overlap? */
-   overlap = src_tex == dst->texture &&
-             dst->u.tex.level == src_level &&
-             dst->u.tex.first_layer == srcZ0 &&
-      regions_overlap(srcX0, srcY0, srcX1, srcY1,
-                      dstX0, dstY0, dstX1, dstY1);
-
-   src_format = util_format_linear(src_tex->format);
-   dst_format = util_format_linear(dst->format);
-
-   /*
-    * Check for simple case:  no format conversion, no flipping, no stretching,
-    * no overlapping.
-    * Filter mode should not matter since there's no stretching.
-    */
-   if (dst_format == src_format &&
-       srcX0 < srcX1 &&
-       dstX0 < dstX1 &&
-       srcY0 < srcY1 &&
-       dstY0 < dstY1 &&
-       (dstX1 - dstX0) == (srcX1 - srcX0) &&
-       (dstY1 - dstY0) == (srcY1 - srcY0) &&
-       !overlap) {
-      struct pipe_box src_box;
-      src_box.x = srcX0;
-      src_box.y = srcY0;
-      src_box.z = srcZ0;
-      src_box.width = srcW;
-      src_box.height = srcH;
-      src_box.depth = 1;
-      pipe->resource_copy_region(pipe,
-                                 dst->texture, dst->u.tex.level,
-                                 dstX0, dstY0, dst->u.tex.first_layer,/* dest */
-                                 src_tex, src_level,
-                                 &src_box);
-       return;
-   }
-
-   if (dst_format == dst->format) {
-      dst_surface = dst;
-   } else {
-      struct pipe_surface templ = *dst;
-      templ.format = dst_format;
-      dst_surface = pipe->create_surface(pipe, dst->texture, &templ);
-   }
-
-   /* Create a temporary texture when src and dest alias or when src
-    * is anything other than a 2d texture.
-    * XXX should just use appropriate shader to access 1d / 3d slice / cube face,
-    * much like the u_blitter code does (should be pretty trivial).
-    * 
-    * This can still be improved upon.
-    */
-   if ((src_tex == dst_surface->texture &&
-       dst_surface->u.tex.level == src_level &&
-       dst_surface->u.tex.first_layer == srcZ0) ||
-       (src_tex->target != PIPE_TEXTURE_2D &&
-       src_tex->target != PIPE_TEXTURE_2D &&
-       src_tex->target != PIPE_TEXTURE_RECT))
-   {
-      struct pipe_resource texTemp;
-      struct pipe_resource *tex;
-      struct pipe_sampler_view sv_templ;
-      struct pipe_box src_box;
-      const int srcLeft = MIN2(srcX0, srcX1);
-      const int srcTop = MIN2(srcY0, srcY1);
-
-      if (srcLeft != srcX0) {
-         /* left-right flip */
-         int tmp = dstX0;
-         dstX0 = dstX1;
-         dstX1 = tmp;
-      }
-
-      if (srcTop != srcY0) {
-         /* up-down flip */
-         int tmp = dstY0;
-         dstY0 = dstY1;
-         dstY1 = tmp;
-      }
-
-      /* create temp texture */
-      memset(&texTemp, 0, sizeof(texTemp));
-      texTemp.target = ctx->internal_target;
-      texTemp.format = src_format;
-      texTemp.last_level = 0;
-      texTemp.width0 = srcW;
-      texTemp.height0 = srcH;
-      texTemp.depth0 = 1;
-      texTemp.array_size = 1;
-      texTemp.bind = PIPE_BIND_SAMPLER_VIEW;
-
-      tex = screen->resource_create(screen, &texTemp);
-      if (!tex)
-         return;
-
-      src_box.x = srcLeft;
-      src_box.y = srcTop;
-      src_box.z = srcZ0;
-      src_box.width = srcW;
-      src_box.height = srcH;
-      src_box.depth = 1;
-      /* load temp texture */
-      pipe->resource_copy_region(pipe,
-                                 tex, 0, 0, 0, 0,  /* dest */
-                                 src_tex, src_level, &src_box);
-
-      normalized = tex->target != PIPE_TEXTURE_RECT;
-      if(normalized) {
-         s0 = 0.0f;
-         s1 = 1.0f;
-         t0 = 0.0f;
-         t1 = 1.0f;
-      }
-      else {
-         s0 = 0;
-         s1 = srcW;
-         t0 = 0;
-         t1 = srcH;
-      }
-
-      u_sampler_view_default_template(&sv_templ, tex, tex->format);
-      sampler_view = pipe->create_sampler_view(pipe, tex, &sv_templ);
-
-      if (!sampler_view) {
-         pipe_resource_reference(&tex, NULL);
-         return;
-      }
-      pipe_resource_reference(&tex, NULL);
-   }
-   else {
-      u_sampler_view_default_template(&sv_templ, src_tex, src_format);
-      sampler_view = pipe->create_sampler_view(pipe, src_tex, &sv_templ);
-
-      if (!sampler_view) {
-         return;
-      }
-
-      s0 = srcX0;
-      s1 = srcX1;
-      t0 = srcY0;
-      t1 = srcY1;
-      normalized = sampler_view->texture->target != PIPE_TEXTURE_RECT;
-      if(normalized)
-      {
-         s0 /= (float)(u_minify(sampler_view->texture->width0, src_level));
-         s1 /= (float)(u_minify(sampler_view->texture->width0, src_level));
-         t0 /= (float)(u_minify(sampler_view->texture->height0, src_level));
-         t1 /= (float)(u_minify(sampler_view->texture->height0, src_level));
-      }
-   }
-
-   dst_is_depth = util_format_is_depth_or_stencil(dst_format);
-
-   assert(screen->is_format_supported(screen, sampler_view->format, ctx->internal_target,
-                                      sampler_view->texture->nr_samples,
-                                      PIPE_BIND_SAMPLER_VIEW));
-   assert(screen->is_format_supported(screen, dst_format, ctx->internal_target,
-                                      dst_surface->texture->nr_samples,
-                                      dst_is_depth ? PIPE_BIND_DEPTH_STENCIL :
-                                                     PIPE_BIND_RENDER_TARGET));
-   /* save state (restored below) */
-   cso_save_blend(ctx->cso);
-   cso_save_depth_stencil_alpha(ctx->cso);
-   cso_save_rasterizer(ctx->cso);
-   cso_save_samplers(ctx->cso);
-   cso_save_fragment_sampler_views(ctx->cso);
-   cso_save_viewport(ctx->cso);
-   cso_save_framebuffer(ctx->cso);
-   cso_save_fragment_shader(ctx->cso);
-   cso_save_vertex_shader(ctx->cso);
-   cso_save_clip(ctx->cso);
-   cso_save_vertex_elements(ctx->cso);
-   cso_save_vertex_buffers(ctx->cso);
-
-   /* set misc state we care about */
-   cso_set_blend(ctx->cso, &ctx->blend);
-   cso_set_depth_stencil_alpha(ctx->cso,
-                               dst_is_depth ? &ctx->depthstencil_write :
-                                              &ctx->depthstencil_keep);
-   cso_set_rasterizer(ctx->cso, &ctx->rasterizer);
-   cso_set_clip(ctx->cso, &ctx->clip);
-   cso_set_vertex_elements(ctx->cso, 2, ctx->velem);
-
-   /* sampler */
-   ctx->sampler.normalized_coords = normalized;
-   ctx->sampler.min_img_filter = filter;
-   ctx->sampler.mag_img_filter = filter;
-   ctx->sampler.min_lod = src_level;
-   ctx->sampler.max_lod = src_level;
-   cso_single_sampler(ctx->cso, 0, &ctx->sampler);
-   cso_single_sampler_done(ctx->cso);
-
-   /* viewport */
-   ctx->viewport.scale[0] = 0.5f * dst_surface->width;
-   ctx->viewport.scale[1] = 0.5f * dst_surface->height;
-   ctx->viewport.scale[2] = 0.5f;
-   ctx->viewport.scale[3] = 1.0f;
-   ctx->viewport.translate[0] = 0.5f * dst_surface->width;
-   ctx->viewport.translate[1] = 0.5f * dst_surface->height;
-   ctx->viewport.translate[2] = 0.5f;
-   ctx->viewport.translate[3] = 0.0f;
-   cso_set_viewport(ctx->cso, &ctx->viewport);
-
-   /* texture */
-   cso_set_fragment_sampler_views(ctx->cso, 1, &sampler_view);
-
-   /* shaders */
-   if (dst_is_depth) {
-      set_depth_fragment_shader(ctx);
-   } else {
-      set_fragment_shader(ctx, writemask);
-   }
-   set_vertex_shader(ctx);
-
-   /* drawing dest */
-   memset(&fb, 0, sizeof(fb));
-   fb.width = dst_surface->width;
-   fb.height = dst_surface->height;
-   if (dst_is_depth) {
-      fb.zsbuf = dst_surface;
-   } else {
-      fb.nr_cbufs = 1;
-      fb.cbufs[0] = dst_surface;
-   }
-   cso_set_framebuffer(ctx->cso, &fb);
-
-   /* draw quad */
-   offset = setup_vertex_data_tex(ctx,
-                                  (float) dstX0 / dst_surface->width * 2.0f - 1.0f,
-                                  (float) dstY0 / dst_surface->height * 2.0f - 1.0f,
-                                  (float) dstX1 / dst_surface->width * 2.0f - 1.0f,
-                                  (float) dstY1 / dst_surface->height * 2.0f - 1.0f,
-                                  s0, t0,
-                                  s1, t1,
-                                  z);
-
-   util_draw_vertex_buffer(ctx->pipe, ctx->cso, ctx->vbuf, offset,
-                           PIPE_PRIM_TRIANGLE_FAN,
-                           4,  /* verts */
-                           2); /* attribs/vert */
-
-   /* restore state we changed */
-   cso_restore_blend(ctx->cso);
-   cso_restore_depth_stencil_alpha(ctx->cso);
-   cso_restore_rasterizer(ctx->cso);
-   cso_restore_samplers(ctx->cso);
-   cso_restore_fragment_sampler_views(ctx->cso);
-   cso_restore_viewport(ctx->cso);
-   cso_restore_framebuffer(ctx->cso);
-   cso_restore_fragment_shader(ctx->cso);
-   cso_restore_vertex_shader(ctx->cso);
-   cso_restore_clip(ctx->cso);
-   cso_restore_vertex_elements(ctx->cso);
-   cso_restore_vertex_buffers(ctx->cso);
-
-   pipe_sampler_view_reference(&sampler_view, NULL);
-   if (dst_surface != dst)
-      pipe_surface_reference(&dst_surface, NULL);
-}
-
-
-void
-util_blit_pixels(struct blit_state *ctx,
-                 struct pipe_resource *src_tex,
-                 unsigned src_level,
-                 int srcX0, int srcY0,
-                 int srcX1, int srcY1,
-                 int srcZ,
-                 struct pipe_surface *dst,
-                 int dstX0, int dstY0,
-                 int dstX1, int dstY1,
-                 float z, uint filter )
-{
-   util_blit_pixels_writemask( ctx, src_tex,
-                               src_level,
-                               srcX0, srcY0,
-                               srcX1, srcY1,
-                               srcZ,
-                               dst,
-                               dstX0, dstY0,
-                               dstX1, dstY1,
-                               z, filter,
-                               TGSI_WRITEMASK_XYZW );
-}
-
-
-/* Release vertex buffer at end of frame to avoid synchronous
- * rendering.
- */
-void util_blit_flush( struct blit_state *ctx )
-{
-   pipe_resource_reference(&ctx->vbuf, NULL);
-   ctx->vbuf_slot = 0;
-} 
-
-
-
-/**
- * Copy pixel block from src texture to dst surface.
- *
- * XXX Should support selection of level.
- * XXX need some control over blitting Z and/or stencil.
- */
-void
-util_blit_pixels_tex(struct blit_state *ctx,
-                     struct pipe_sampler_view *src_sampler_view,
-                     int srcX0, int srcY0,
-                     int srcX1, int srcY1,
-                     struct pipe_surface *dst,
-                     int dstX0, int dstY0,
-                     int dstX1, int dstY1,
-                     float z, uint filter)
-{
-   boolean normalized = src_sampler_view->texture->target != PIPE_TEXTURE_RECT;
-   struct pipe_framebuffer_state fb;
-   float s0, t0, s1, t1;
-   unsigned offset;
-   struct pipe_resource *tex = src_sampler_view->texture;
-
-   assert(filter == PIPE_TEX_MIPFILTER_NEAREST ||
-          filter == PIPE_TEX_MIPFILTER_LINEAR);
-
-   assert(tex);
-   assert(tex->width0 != 0);
-   assert(tex->height0 != 0);
-
-   s0 = srcX0;
-   s1 = srcX1;
-   t0 = srcY0;
-   t1 = srcY1;
-
-   if(normalized)
-   {
-      s0 /= (float)tex->width0;
-      s1 /= (float)tex->width0;
-      t0 /= (float)tex->height0;
-      t1 /= (float)tex->height0;
-   }
-
-   assert(ctx->pipe->screen->is_format_supported(ctx->pipe->screen, dst->format,
-                                                 PIPE_TEXTURE_2D,
-                                                 dst->texture->nr_samples,
-                                                 PIPE_BIND_RENDER_TARGET));
-
-   /* save state (restored below) */
-   cso_save_blend(ctx->cso);
-   cso_save_depth_stencil_alpha(ctx->cso);
-   cso_save_rasterizer(ctx->cso);
-   cso_save_samplers(ctx->cso);
-   cso_save_fragment_sampler_views(ctx->cso);
-   cso_save_viewport(ctx->cso);
-   cso_save_framebuffer(ctx->cso);
-   cso_save_fragment_shader(ctx->cso);
-   cso_save_vertex_shader(ctx->cso);
-   cso_save_clip(ctx->cso);
-   cso_save_vertex_elements(ctx->cso);
-   cso_save_vertex_buffers(ctx->cso);
-
-   /* set misc state we care about */
-   cso_set_blend(ctx->cso, &ctx->blend);
-   cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil_keep);
-   cso_set_rasterizer(ctx->cso, &ctx->rasterizer);
-   cso_set_clip(ctx->cso, &ctx->clip);
-   cso_set_vertex_elements(ctx->cso, 2, ctx->velem);
-
-   /* sampler */
-   ctx->sampler.normalized_coords = normalized;
-   ctx->sampler.min_img_filter = filter;
-   ctx->sampler.mag_img_filter = filter;
-   cso_single_sampler(ctx->cso, 0, &ctx->sampler);
-   cso_single_sampler_done(ctx->cso);
-
-   /* viewport */
-   ctx->viewport.scale[0] = 0.5f * dst->width;
-   ctx->viewport.scale[1] = 0.5f * dst->height;
-   ctx->viewport.scale[2] = 0.5f;
-   ctx->viewport.scale[3] = 1.0f;
-   ctx->viewport.translate[0] = 0.5f * dst->width;
-   ctx->viewport.translate[1] = 0.5f * dst->height;
-   ctx->viewport.translate[2] = 0.5f;
-   ctx->viewport.translate[3] = 0.0f;
-   cso_set_viewport(ctx->cso, &ctx->viewport);
-
-   /* texture */
-   cso_set_fragment_sampler_views(ctx->cso, 1, &src_sampler_view);
-
-   /* shaders */
-   set_fragment_shader(ctx, TGSI_WRITEMASK_XYZW);
-   set_vertex_shader(ctx);
-
-   /* drawing dest */
-   memset(&fb, 0, sizeof(fb));
-   fb.width = dst->width;
-   fb.height = dst->height;
-   fb.nr_cbufs = 1;
-   fb.cbufs[0] = dst;
-   cso_set_framebuffer(ctx->cso, &fb);
-
-   /* draw quad */
-   offset = setup_vertex_data_tex(ctx,
-                                  (float) dstX0 / dst->width * 2.0f - 1.0f,
-                                  (float) dstY0 / dst->height * 2.0f - 1.0f,
-                                  (float) dstX1 / dst->width * 2.0f - 1.0f,
-                                  (float) dstY1 / dst->height * 2.0f - 1.0f,
-                                  s0, t0, s1, t1,
-                                  z);
-
-   util_draw_vertex_buffer(ctx->pipe, ctx->cso,
-                           ctx->vbuf, offset,
-                           PIPE_PRIM_TRIANGLE_FAN,
-                           4,  /* verts */
-                           2); /* attribs/vert */
-
-   /* restore state we changed */
-   cso_restore_blend(ctx->cso);
-   cso_restore_depth_stencil_alpha(ctx->cso);
-   cso_restore_rasterizer(ctx->cso);
-   cso_restore_samplers(ctx->cso);
-   cso_restore_fragment_sampler_views(ctx->cso);
-   cso_restore_viewport(ctx->cso);
-   cso_restore_framebuffer(ctx->cso);
-   cso_restore_fragment_shader(ctx->cso);
-   cso_restore_vertex_shader(ctx->cso);
-   cso_restore_clip(ctx->cso);
-   cso_restore_vertex_elements(ctx->cso);
-   cso_restore_vertex_buffers(ctx->cso);
-}
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Copy/blit pixel rect between surfaces
+ *  
+ * @author Brian Paul
+ */
+
+
+#include "pipe/p_context.h"
+#include "util/u_debug.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "pipe/p_shader_tokens.h"
+#include "pipe/p_state.h"
+
+#include "util/u_blit.h"
+#include "util/u_draw_quad.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_sampler.h"
+#include "util/u_simple_shaders.h"
+
+#include "cso_cache/cso_context.h"
+
+
+struct blit_state   /* reusable state for the quad-drawing blit helpers in this file */
+{
+   struct pipe_context *pipe;   /**< context passed to util_create_blit() */
+   struct cso_context *cso;     /**< CSO context passed to util_create_blit() */
+
+   struct pipe_blend_state blend;   /**< zeroed except rt[0].colormask = PIPE_MASK_RGBA */
+   struct pipe_depth_stencil_alpha_state depthstencil_keep;   /**< left all-zero */
+   struct pipe_depth_stencil_alpha_state depthstencil_write;  /**< depth enabled, writemask 1, func ALWAYS */
+   struct pipe_rasterizer_state rasterizer;   /**< PIPE_FACE_NONE culling, gl_rasterization_rules set */
+   struct pipe_sampler_state sampler;   /**< clamp-to-edge wraps, no mip filter; img filters set later */
+   struct pipe_viewport_state viewport;
+   struct pipe_clip_state clip;
+   struct pipe_vertex_element velem[2];   /**< two R32G32B32A32_FLOAT elements from buffer 0 */
+   enum pipe_texture_target internal_target;   /**< PIPE_TEXTURE_2D or PIPE_TEXTURE_RECT */
+
+   void *vs;   /**< passthrough vertex shader, created on first use */
+   void *fs[TGSI_WRITEMASK_XYZW + 1];   /**< texture fragment shaders, indexed by writemask */
+   void *fs_depth;   /**< depth-writing fragment shader, created on first use */
+
+   struct pipe_resource *vbuf;  /**< quad vertices */
+   unsigned vbuf_slot;   /**< slot counter compared against max_slots in get_next_slot() */
+
+   float vertices[4][2][4];   /**< vertex/texcoords for quad */
+};
+
+
+/**
+ * Create state object for blit (create once, re-use for many blit() calls).
+ * \return new state storing \p pipe and \p cso, or NULL if allocation fails.
+ */
+struct blit_state *
+util_create_blit(struct pipe_context *pipe, struct cso_context *cso)
+{
+   struct blit_state *ctx;
+   uint i;
+
+   ctx = CALLOC_STRUCT(blit_state);
+   if (!ctx)
+      return NULL;
+   /* NOTE(review): vs, fs[], fs_depth and vbuf_slot are never assigned below; presumably CALLOC_STRUCT zero-fills -- confirm */
+   ctx->pipe = pipe;
+   ctx->cso = cso;
+
+   /* disabled blending/masking */
+   memset(&ctx->blend, 0, sizeof(ctx->blend));
+   ctx->blend.rt[0].colormask = PIPE_MASK_RGBA;
+
+   /* depth/stencil/alpha: "keep" stays all-zero; "write" enables depth with writemask 1, func ALWAYS */
+   memset(&ctx->depthstencil_keep, 0, sizeof(ctx->depthstencil_keep));
+   memset(&ctx->depthstencil_write, 0, sizeof(ctx->depthstencil_write));
+   ctx->depthstencil_write.depth.enabled = 1;
+   ctx->depthstencil_write.depth.writemask = 1;
+   ctx->depthstencil_write.depth.func = PIPE_FUNC_ALWAYS;
+
+   /* rasterizer */
+   memset(&ctx->rasterizer, 0, sizeof(ctx->rasterizer));
+   ctx->rasterizer.cull_face = PIPE_FACE_NONE;
+   ctx->rasterizer.gl_rasterization_rules = 1;
+
+   /* samplers: clamp-to-edge on s/t/r, no mip filtering */
+   memset(&ctx->sampler, 0, sizeof(ctx->sampler));
+   ctx->sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   ctx->sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   ctx->sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   ctx->sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
+   ctx->sampler.min_img_filter = 0; /* set later */
+   ctx->sampler.mag_img_filter = 0; /* set later */
+
+   /* vertex elements state */
+   memset(&ctx->velem[0], 0, sizeof(ctx->velem[0]) * 2);
+   for (i = 0; i < 2; i++) {
+      ctx->velem[i].src_offset = i * 4 * sizeof(float);   /* matches vertices[v][i][0..3] */
+      ctx->velem[i].instance_divisor = 0;
+      ctx->velem[i].vertex_buffer_index = 0;
+      ctx->velem[i].src_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+   }
+
+   ctx->vbuf = NULL;   /* get_next_slot() tests this for NULL before use */
+
+   /* init vertex data that doesn't change: [i][0] is the position, [i][1] the texcoord */
+   for (i = 0; i < 4; i++) {
+      ctx->vertices[i][0][3] = 1.0f; /* w */
+      ctx->vertices[i][1][2] = 0.0f; /* r */
+      ctx->vertices[i][1][3] = 1.0f; /* q */
+   }
+   /* internal texture target: 2D when the screen reports PIPE_CAP_NPOT_TEXTURES, else RECT */
+   if(pipe->screen->get_param(pipe->screen, PIPE_CAP_NPOT_TEXTURES))
+      ctx->internal_target = PIPE_TEXTURE_2D;
+   else
+      ctx->internal_target = PIPE_TEXTURE_RECT;
+
+   return ctx;
+}
+
+
+/**
+ * Destroy a blit context: delete the shaders it created, release vbuf, free ctx.
+ */
+void
+util_destroy_blit(struct blit_state *ctx)
+{
+   struct pipe_context *pipe = ctx->pipe;   /* ctx is dereferenced without a NULL check */
+   unsigned i;
+
+   if (ctx->vs)
+      pipe->delete_vs_state(pipe, ctx->vs);
+
+   for (i = 0; i < Elements(ctx->fs); i++)   /* only slots that were actually created are non-NULL */
+      if (ctx->fs[i])
+         pipe->delete_fs_state(pipe, ctx->fs[i]);
+
+   if (ctx->fs_depth)
+      pipe->delete_fs_state(pipe, ctx->fs_depth);
+
+   pipe_resource_reference(&ctx->vbuf, NULL);   /* release the quad vertex buffer, if any */
+
+   FREE(ctx);
+}
+
+
+/**
+ * Bind the color fragment shader for \p writemask, creating it on first use.
+ */
+static INLINE void
+set_fragment_shader(struct blit_state *ctx, uint writemask)
+{
+   if (!ctx->fs[writemask]) {
+      ctx->fs[writemask] =
+         util_make_fragment_tex_shader_writemask(ctx->pipe, TGSI_TEXTURE_2D,
+                                                 TGSI_INTERPOLATE_LINEAR,
+                                                 writemask);
+   }
+   cso_set_fragment_shader_handle(ctx->cso, ctx->fs[writemask]);
+}
+
+
+/**
+ * Bind the depth-writing fragment shader, creating it on first use.
+ */
+static INLINE void
+set_depth_fragment_shader(struct blit_state *ctx)
+{
+   if (!ctx->fs_depth) {
+      ctx->fs_depth =
+         util_make_fragment_tex_shader_writedepth(ctx->pipe, TGSI_TEXTURE_2D,
+                                                  TGSI_INTERPOLATE_LINEAR);
+   }
+   cso_set_fragment_shader_handle(ctx->cso, ctx->fs_depth);
+}
+
+
+/**
+ * Bind the pass-through vertex shader, creating it on first use.
+ */
+static INLINE void
+set_vertex_shader(struct blit_state *ctx)
+{
+   /* A vertex shader is still needed: it provides the linkage between
+    * the fragment shader's input semantics and vertex_element/buffers.
+    */
+   if (!ctx->vs) {
+      const uint names[] = { TGSI_SEMANTIC_POSITION,
+                             TGSI_SEMANTIC_GENERIC };
+      const uint indexes[] = { 0, 0 };
+      ctx->vs = util_make_vertex_passthrough_shader(ctx->pipe, 2,
+                                                    names,
+                                                    indexes);
+   }
+
+   cso_set_vertex_shader_handle(ctx->cso, ctx->vs);
+}
+
+
+/**
+ * Return the byte offset of the next free quad slot in the vertex buffer.
+ */
+static unsigned
+get_next_slot( struct blit_state *ctx )
+{
+   const unsigned quad_size = sizeof ctx->vertices;
+   const unsigned max_slots = 4096 / quad_size;
+   unsigned offset;
+
+   if (ctx->vbuf_slot >= max_slots)
+      util_blit_flush( ctx );   /* buffer full: drop it, restart at slot 0 */
+
+   if (!ctx->vbuf)
+      ctx->vbuf = pipe_buffer_create(ctx->pipe->screen, PIPE_BIND_VERTEX_BUFFER,
+                                     PIPE_USAGE_STREAM, max_slots * quad_size);
+
+   offset = ctx->vbuf_slot++ * quad_size;
+   return offset;
+}
+
+
+
+
+/**
+ * Setup and upload vertex data for the textured quad we'll draw;
+ * returns the quad's byte offset in the vertex buffer.  Note: y=0=top
+ */
+static unsigned
+setup_vertex_data_tex(struct blit_state *ctx,
+                      float x0, float y0, float x1, float y1,
+                      float s0, float t0, float s1, float t1,
+                      float z)
+{
+   unsigned offset, i;
+
+   /* positions, corner order: (x0,y0) (x1,y0) (x1,y1) (x0,y1) */
+   ctx->vertices[0][0][0] = x0;
+   ctx->vertices[0][0][1] = y0;
+   ctx->vertices[1][0][0] = x1;
+   ctx->vertices[1][0][1] = y0;
+   ctx->vertices[2][0][0] = x1;
+   ctx->vertices[2][0][1] = y1;
+   ctx->vertices[3][0][0] = x0;
+   ctx->vertices[3][0][1] = y1;
+
+   /* every corner gets the same z */
+   for (i = 0; i < 4; i++)
+      ctx->vertices[i][0][2] = z;
+
+   /* texcoords (s, t), same corner order as the positions */
+   ctx->vertices[0][1][0] = s0;
+   ctx->vertices[0][1][1] = t0;
+   ctx->vertices[1][1][0] = s1;
+   ctx->vertices[1][1][1] = t0;
+   ctx->vertices[2][1][0] = s1;
+   ctx->vertices[2][1][1] = t1;
+   ctx->vertices[3][1][0] = s0;
+   ctx->vertices[3][1][1] = t1;
+
+   offset = get_next_slot( ctx );
+
+   pipe_buffer_write_nooverlap(ctx->pipe, ctx->vbuf,
+                               offset, sizeof(ctx->vertices), ctx->vertices);
+
+   return offset;
+}
+
+
+/**
+ * \return TRUE if two regions overlap (shared edges count), FALSE otherwise
+ */
+static boolean
+regions_overlap(int srcX0, int srcY0,
+                int srcX1, int srcY1,
+                int dstX0, int dstY0,
+                int dstX1, int dstY1)
+{
+   const int srcMinX = MIN2(srcX0, srcX1), srcMaxX = MAX2(srcX0, srcX1);
+   const int srcMinY = MIN2(srcY0, srcY1), srcMaxY = MAX2(srcY0, srcY1);
+   const int dstMinX = MIN2(dstX0, dstX1), dstMaxX = MAX2(dstX0, dstX1);
+   const int dstMinY = MIN2(dstY0, dstY1), dstMaxY = MAX2(dstY0, dstY1);
+
+   /* disjoint horizontally: one region lies completely left of the other */
+   if (srcMaxX < dstMinX || dstMaxX < srcMinX)
+      return FALSE;
+
+   /* disjoint vertically: one region lies completely above the other */
+   if (srcMaxY < dstMinY || dstMaxY < srcMinY)
+      return FALSE;
+   return TRUE; /* some overlap */
+}
+
+
+/**
+ * Copy pixel block from src surface to dst surface.
+ * Overlapping regions are acceptable.
+ * Flipping and stretching are supported.
+ * If a temporary texture, sampler view or surface cannot be created the
+ * function returns without drawing and without leaking the temporaries.
+ * \param filter  one of PIPE_TEX_MIPFILTER_NEAREST/LINEAR
+ * \param writemask  controls which channels in the dest surface are sourced
+ *                   from the src surface.  Disabled channels are sourced
+ *                   from (0,0,0,1).
+ * XXX need some control over blitting stencil.
+ */
+void
+util_blit_pixels_writemask(struct blit_state *ctx,
+                           struct pipe_resource *src_tex,
+                           unsigned src_level,
+                           int srcX0, int srcY0,
+                           int srcX1, int srcY1,
+                           int srcZ0,
+                           struct pipe_surface *dst,
+                           int dstX0, int dstY0,
+                           int dstX1, int dstY1,
+                           float z, uint filter,
+                           uint writemask)
+{
+   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_screen *screen = pipe->screen;
+   enum pipe_format src_format, dst_format;
+   struct pipe_sampler_view *sampler_view = NULL;
+   struct pipe_sampler_view sv_templ;
+   struct pipe_surface *dst_surface;
+   struct pipe_framebuffer_state fb;
+   const int srcW = abs(srcX1 - srcX0);
+   const int srcH = abs(srcY1 - srcY0);
+   unsigned offset;
+   boolean overlap, dst_is_depth;
+   float s0, t0, s1, t1;
+   boolean normalized;
+
+   assert(filter == PIPE_TEX_MIPFILTER_NEAREST ||
+          filter == PIPE_TEX_MIPFILTER_LINEAR);
+
+   assert(src_level <= src_tex->last_level);
+
+   /* do the regions overlap? */
+   overlap = src_tex == dst->texture &&
+             dst->u.tex.level == src_level &&
+             dst->u.tex.first_layer == srcZ0 &&
+      regions_overlap(srcX0, srcY0, srcX1, srcY1,
+                      dstX0, dstY0, dstX1, dstY1);
+
+   src_format = util_format_linear(src_tex->format);
+   dst_format = util_format_linear(dst->format);
+
+   /*
+    * Check for simple case:  no format conversion, no flipping, no stretching,
+    * no overlapping.
+    * Filter mode should not matter since there's no stretching.
+    */
+   if (dst_format == src_format &&
+       srcX0 < srcX1 &&
+       dstX0 < dstX1 &&
+       srcY0 < srcY1 &&
+       dstY0 < dstY1 &&
+       (dstX1 - dstX0) == (srcX1 - srcX0) &&
+       (dstY1 - dstY0) == (srcY1 - srcY0) &&
+       !overlap) {
+      struct pipe_box src_box;
+      src_box.x = srcX0;
+      src_box.y = srcY0;
+      src_box.z = srcZ0;
+      src_box.width = srcW;
+      src_box.height = srcH;
+      src_box.depth = 1;
+      pipe->resource_copy_region(pipe,
+                                 dst->texture, dst->u.tex.level,
+                                 dstX0, dstY0, dst->u.tex.first_layer,/* dest */
+                                 src_tex, src_level,
+                                 &src_box);
+      return;
+   }
+
+   if (dst_format == dst->format) {
+      dst_surface = dst;
+   } else {
+      struct pipe_surface templ = *dst;
+      templ.format = dst_format;
+      dst_surface = pipe->create_surface(pipe, dst->texture, &templ);
+      if (!dst_surface)
+         return;
+   }
+
+   /* Create a temporary texture when src and dest alias or when src
+    * is anything other than a 2d texture.
+    * XXX should just use appropriate shader to access 1d / 3d slice / cube face,
+    * much like the u_blitter code does (should be pretty trivial).
+    *
+    * This can still be improved upon.
+    */
+   if ((src_tex == dst_surface->texture &&
+       dst_surface->u.tex.level == src_level &&
+       dst_surface->u.tex.first_layer == srcZ0) ||
+       (src_tex->target != PIPE_TEXTURE_2D &&
+       src_tex->target != PIPE_TEXTURE_RECT))
+   {
+      struct pipe_resource texTemp;
+      struct pipe_resource *tex;
+      struct pipe_box src_box;
+      const int srcLeft = MIN2(srcX0, srcX1);
+      const int srcTop = MIN2(srcY0, srcY1);
+
+      if (srcLeft != srcX0) {
+         /* left-right flip */
+         int tmp = dstX0;
+         dstX0 = dstX1;
+         dstX1 = tmp;
+      }
+
+      if (srcTop != srcY0) {
+         /* up-down flip */
+         int tmp = dstY0;
+         dstY0 = dstY1;
+         dstY1 = tmp;
+      }
+
+      /* create temp texture */
+      memset(&texTemp, 0, sizeof(texTemp));
+      texTemp.target = ctx->internal_target;
+      texTemp.format = src_format;
+      texTemp.last_level = 0;
+      texTemp.width0 = srcW;
+      texTemp.height0 = srcH;
+      texTemp.depth0 = 1;
+      texTemp.array_size = 1;
+      texTemp.bind = PIPE_BIND_SAMPLER_VIEW;
+
+      tex = screen->resource_create(screen, &texTemp);
+      if (!tex)
+         goto out;
+
+      src_box.x = srcLeft;
+      src_box.y = srcTop;
+      src_box.z = srcZ0;
+      src_box.width = srcW;
+      src_box.height = srcH;
+      src_box.depth = 1;
+      /* load temp texture */
+      pipe->resource_copy_region(pipe,
+                                 tex, 0, 0, 0, 0,  /* dest */
+                                 src_tex, src_level, &src_box);
+
+      normalized = tex->target != PIPE_TEXTURE_RECT;
+      if(normalized) {
+         s0 = 0.0f;
+         s1 = 1.0f;
+         t0 = 0.0f;
+         t1 = 1.0f;
+      }
+      else {
+         s0 = 0;
+         s1 = srcW;
+         t0 = 0;
+         t1 = srcH;
+      }
+
+      u_sampler_view_default_template(&sv_templ, tex, tex->format);
+      sampler_view = pipe->create_sampler_view(pipe, tex, &sv_templ);
+
+      /* drop our temp texture reference whether or not the view exists */
+      pipe_resource_reference(&tex, NULL);
+      if (!sampler_view)
+         goto out;
+   }
+   else {
+      u_sampler_view_default_template(&sv_templ, src_tex, src_format);
+      sampler_view = pipe->create_sampler_view(pipe, src_tex, &sv_templ);
+
+      if (!sampler_view)
+         goto out;
+
+      s0 = srcX0;
+      s1 = srcX1;
+      t0 = srcY0;
+      t1 = srcY1;
+      normalized = sampler_view->texture->target != PIPE_TEXTURE_RECT;
+      if (normalized) {
+         s0 /= (float)(u_minify(sampler_view->texture->width0, src_level));
+         s1 /= (float)(u_minify(sampler_view->texture->width0, src_level));
+         t0 /= (float)(u_minify(sampler_view->texture->height0, src_level));
+         t1 /= (float)(u_minify(sampler_view->texture->height0, src_level));
+      }
+   }
+
+   dst_is_depth = util_format_is_depth_or_stencil(dst_format);
+
+   assert(screen->is_format_supported(screen, sampler_view->format, ctx->internal_target,
+                                      sampler_view->texture->nr_samples,
+                                      PIPE_BIND_SAMPLER_VIEW));
+   assert(screen->is_format_supported(screen, dst_format, ctx->internal_target,
+                                      dst_surface->texture->nr_samples,
+                                      dst_is_depth ? PIPE_BIND_DEPTH_STENCIL :
+                                                     PIPE_BIND_RENDER_TARGET));
+   /* save state (restored below) */
+   cso_save_blend(ctx->cso);
+   cso_save_depth_stencil_alpha(ctx->cso);
+   cso_save_rasterizer(ctx->cso);
+   cso_save_samplers(ctx->cso);
+   cso_save_fragment_sampler_views(ctx->cso);
+   cso_save_viewport(ctx->cso);
+   cso_save_framebuffer(ctx->cso);
+   cso_save_fragment_shader(ctx->cso);
+   cso_save_vertex_shader(ctx->cso);
+   cso_save_clip(ctx->cso);
+   cso_save_vertex_elements(ctx->cso);
+   cso_save_vertex_buffers(ctx->cso);
+
+   /* set misc state we care about */
+   cso_set_blend(ctx->cso, &ctx->blend);
+   cso_set_depth_stencil_alpha(ctx->cso,
+                               dst_is_depth ? &ctx->depthstencil_write :
+                                              &ctx->depthstencil_keep);
+   cso_set_rasterizer(ctx->cso, &ctx->rasterizer);
+   cso_set_clip(ctx->cso, &ctx->clip);
+   cso_set_vertex_elements(ctx->cso, 2, ctx->velem);
+
+   /* sampler */
+   ctx->sampler.normalized_coords = normalized;
+   ctx->sampler.min_img_filter = filter;
+   ctx->sampler.mag_img_filter = filter;
+   ctx->sampler.min_lod = src_level;
+   ctx->sampler.max_lod = src_level;
+   cso_single_sampler(ctx->cso, 0, &ctx->sampler);
+   cso_single_sampler_done(ctx->cso);
+
+   /* viewport */
+   ctx->viewport.scale[0] = 0.5f * dst_surface->width;
+   ctx->viewport.scale[1] = 0.5f * dst_surface->height;
+   ctx->viewport.scale[2] = 0.5f;
+   ctx->viewport.scale[3] = 1.0f;
+   ctx->viewport.translate[0] = 0.5f * dst_surface->width;
+   ctx->viewport.translate[1] = 0.5f * dst_surface->height;
+   ctx->viewport.translate[2] = 0.5f;
+   ctx->viewport.translate[3] = 0.0f;
+   cso_set_viewport(ctx->cso, &ctx->viewport);
+
+   /* texture */
+   cso_set_fragment_sampler_views(ctx->cso, 1, &sampler_view);
+
+   /* shaders */
+   if (dst_is_depth) {
+      set_depth_fragment_shader(ctx);
+   } else {
+      set_fragment_shader(ctx, writemask);
+   }
+   set_vertex_shader(ctx);
+
+   /* drawing dest */
+   memset(&fb, 0, sizeof(fb));
+   fb.width = dst_surface->width;
+   fb.height = dst_surface->height;
+   if (dst_is_depth) {
+      fb.zsbuf = dst_surface;
+   } else {
+      fb.nr_cbufs = 1;
+      fb.cbufs[0] = dst_surface;
+   }
+   cso_set_framebuffer(ctx->cso, &fb);
+
+   /* draw quad */
+   offset = setup_vertex_data_tex(ctx,
+                                  (float) dstX0 / dst_surface->width * 2.0f - 1.0f,
+                                  (float) dstY0 / dst_surface->height * 2.0f - 1.0f,
+                                  (float) dstX1 / dst_surface->width * 2.0f - 1.0f,
+                                  (float) dstY1 / dst_surface->height * 2.0f - 1.0f,
+                                  s0, t0,
+                                  s1, t1,
+                                  z);
+
+   util_draw_vertex_buffer(ctx->pipe, ctx->cso, ctx->vbuf, offset,
+                           PIPE_PRIM_TRIANGLE_FAN,
+                           4,  /* verts */
+                           2); /* attribs/vert */
+
+   /* restore state we changed */
+   cso_restore_blend(ctx->cso);
+   cso_restore_depth_stencil_alpha(ctx->cso);
+   cso_restore_rasterizer(ctx->cso);
+   cso_restore_samplers(ctx->cso);
+   cso_restore_fragment_sampler_views(ctx->cso);
+   cso_restore_viewport(ctx->cso);
+   cso_restore_framebuffer(ctx->cso);
+   cso_restore_fragment_shader(ctx->cso);
+   cso_restore_vertex_shader(ctx->cso);
+   cso_restore_clip(ctx->cso);
+   cso_restore_vertex_elements(ctx->cso);
+   cso_restore_vertex_buffers(ctx->cso);
+
+   pipe_sampler_view_reference(&sampler_view, NULL);
+out:
+   if (dst_surface != dst)
+      pipe_surface_reference(&dst_surface, NULL);
+}
+
+
+/**
+ * Copy pixel block from src surface to dst surface, writing all four
+ * channels.  Convenience wrapper around util_blit_pixels_writemask()
+ * with writemask = TGSI_WRITEMASK_XYZW.
+ */
+void
+util_blit_pixels(struct blit_state *ctx,
+                 struct pipe_resource *src_tex,
+                 unsigned src_level,
+                 int srcX0, int srcY0,
+                 int srcX1, int srcY1,
+                 int srcZ,
+                 struct pipe_surface *dst,
+                 int dstX0, int dstY0,
+                 int dstX1, int dstY1,
+                 float z, uint filter )
+{
+   util_blit_pixels_writemask(ctx, src_tex, src_level,
+                              srcX0, srcY0, srcX1, srcY1, srcZ,
+                              dst,
+                              dstX0, dstY0, dstX1, dstY1,
+                              z, filter, TGSI_WRITEMASK_XYZW);
+}
+
+
+/* Drop the vertex buffer and restart slot allocation at zero.  Meant to
+ * be called at end of frame to avoid synchronous rendering.
+ */
+void util_blit_flush( struct blit_state *ctx )
+{
+   ctx->vbuf_slot = 0;
+   pipe_resource_reference(&ctx->vbuf, NULL);
+}
+
+
+
+/**
+ * Copy pixel block from src texture to dst surface.
+ * The sampler view's first_level field indicates the source
+ * mipmap level to use.
+ * XXX need some control over blitting Z and/or stencil.
+ */
+void
+util_blit_pixels_tex(struct blit_state *ctx,
+                     struct pipe_sampler_view *src_sampler_view,
+                     int srcX0, int srcY0,
+                     int srcX1, int srcY1,
+                     struct pipe_surface *dst,
+                     int dstX0, int dstY0,
+                     int dstX1, int dstY1,
+                     float z, uint filter)
+{
+   boolean normalized;
+   struct pipe_framebuffer_state fb;
+   float s0, t0, s1, t1;
+   unsigned offset;
+   struct pipe_resource *tex = src_sampler_view->texture;
+
+   assert(filter == PIPE_TEX_MIPFILTER_NEAREST ||
+          filter == PIPE_TEX_MIPFILTER_LINEAR);
+   assert(tex);
+   assert(tex->width0 != 0);
+   assert(tex->height0 != 0);
+
+   /* only dereference the texture after it has been validated above */
+   normalized = tex->target != PIPE_TEXTURE_RECT;
+
+   s0 = srcX0;
+   s1 = srcX1;
+   t0 = srcY0;
+   t1 = srcY1;
+
+   if (normalized) {
+      /* normalize according to the mipmap level's size */
+      int level = src_sampler_view->u.tex.first_level;
+      float w = (float) u_minify(tex->width0, level);
+      float h = (float) u_minify(tex->height0, level);
+      s0 /= w;
+      s1 /= w;
+      t0 /= h;
+      t1 /= h;
+   }
+
+   assert(ctx->pipe->screen->is_format_supported(ctx->pipe->screen, dst->format,
+                                                 PIPE_TEXTURE_2D,
+                                                 dst->texture->nr_samples,
+                                                 PIPE_BIND_RENDER_TARGET));
+
+   /* save state (restored below) */
+   cso_save_blend(ctx->cso);
+   cso_save_depth_stencil_alpha(ctx->cso);
+   cso_save_rasterizer(ctx->cso);
+   cso_save_samplers(ctx->cso);
+   cso_save_fragment_sampler_views(ctx->cso);
+   cso_save_viewport(ctx->cso);
+   cso_save_framebuffer(ctx->cso);
+   cso_save_fragment_shader(ctx->cso);
+   cso_save_vertex_shader(ctx->cso);
+   cso_save_clip(ctx->cso);
+   cso_save_vertex_elements(ctx->cso);
+   cso_save_vertex_buffers(ctx->cso);
+
+   /* set misc state we care about */
+   cso_set_blend(ctx->cso, &ctx->blend);
+   cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil_keep);
+   cso_set_rasterizer(ctx->cso, &ctx->rasterizer);
+   cso_set_clip(ctx->cso, &ctx->clip);
+   cso_set_vertex_elements(ctx->cso, 2, ctx->velem);
+
+   /* sampler */
+   ctx->sampler.normalized_coords = normalized;
+   ctx->sampler.min_img_filter = filter;
+   ctx->sampler.mag_img_filter = filter;
+   cso_single_sampler(ctx->cso, 0, &ctx->sampler);
+   cso_single_sampler_done(ctx->cso);
+
+   /* viewport */
+   ctx->viewport.scale[0] = 0.5f * dst->width;
+   ctx->viewport.scale[1] = 0.5f * dst->height;
+   ctx->viewport.scale[2] = 0.5f;
+   ctx->viewport.scale[3] = 1.0f;
+   ctx->viewport.translate[0] = 0.5f * dst->width;
+   ctx->viewport.translate[1] = 0.5f * dst->height;
+   ctx->viewport.translate[2] = 0.5f;
+   ctx->viewport.translate[3] = 0.0f;
+   cso_set_viewport(ctx->cso, &ctx->viewport);
+
+   /* texture */
+   cso_set_fragment_sampler_views(ctx->cso, 1, &src_sampler_view);
+
+   /* shaders */
+   set_fragment_shader(ctx, TGSI_WRITEMASK_XYZW);
+   set_vertex_shader(ctx);
+
+   /* drawing dest */
+   memset(&fb, 0, sizeof(fb));
+   fb.width = dst->width;
+   fb.height = dst->height;
+   fb.nr_cbufs = 1;
+   fb.cbufs[0] = dst;
+   cso_set_framebuffer(ctx->cso, &fb);
+
+   /* draw quad */
+   offset = setup_vertex_data_tex(ctx,
+                                  (float) dstX0 / dst->width * 2.0f - 1.0f,
+                                  (float) dstY0 / dst->height * 2.0f - 1.0f,
+                                  (float) dstX1 / dst->width * 2.0f - 1.0f,
+                                  (float) dstY1 / dst->height * 2.0f - 1.0f,
+                                  s0, t0, s1, t1,
+                                  z);
+
+   util_draw_vertex_buffer(ctx->pipe, ctx->cso, ctx->vbuf, offset,
+                           PIPE_PRIM_TRIANGLE_FAN,
+                           4,  /* verts */
+                           2); /* attribs/vert */
+
+   /* restore state we changed */
+   cso_restore_blend(ctx->cso);
+   cso_restore_depth_stencil_alpha(ctx->cso);
+   cso_restore_rasterizer(ctx->cso);
+   cso_restore_samplers(ctx->cso);
+   cso_restore_fragment_sampler_views(ctx->cso);
+   cso_restore_viewport(ctx->cso);
+   cso_restore_framebuffer(ctx->cso);
+   cso_restore_fragment_shader(ctx->cso);
+   cso_restore_vertex_shader(ctx->cso);
+   cso_restore_clip(ctx->cso);
+   cso_restore_vertex_elements(ctx->cso);
+   cso_restore_vertex_buffers(ctx->cso);
+}
diff --git a/mesalib/src/gallium/auxiliary/util/u_format_latc.c b/mesalib/src/gallium/auxiliary/util/u_format_latc.c
index a25faf5d9..113a793e2 100644
--- a/mesalib/src/gallium/auxiliary/util/u_format_latc.c
+++ b/mesalib/src/gallium/auxiliary/util/u_format_latc.c
@@ -1,328 +1,326 @@
-/**************************************************************************
- *
- * Copyright (C) 2011 Red Hat Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#include <stdio.h>
-#include "u_math.h"
-#include "u_format.h"
-#include "u_format_rgtc.h"
-#include "u_format_latc.h"
-
-static void u_format_unsigned_encode_rgtc_chan(uint8_t *blkaddr, uint8_t srccolors[4][4],
-					       int numxpixels, int numypixels);
-
-static void u_format_unsigned_fetch_texel_rgtc(unsigned srcRowStride, const uint8_t *pixdata,
-					       unsigned i, unsigned j, uint8_t *value, unsigned comps);
-
-static void u_format_signed_encode_rgtc_chan(int8_t *blkaddr, int8_t srccolors[4][4],
-					     int numxpixels, int numypixels);
-
-static void u_format_signed_fetch_texel_rgtc(unsigned srcRowStride, const int8_t *pixdata,
-					       unsigned i, unsigned j, int8_t *value, unsigned comps);
-
-void
-util_format_latc1_unorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
-{
-   /* Fix warnings here: */
-   (void) u_format_unsigned_encode_rgtc_chan;
-   (void) u_format_signed_encode_rgtc_chan;
-
-   u_format_unsigned_fetch_texel_rgtc(0, src, i, j, dst, 1);
-}
-
-void
-util_format_latc1_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   util_format_rgtc1_unorm_unpack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, width, height);
-}
-
-void
-util_format_latc1_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row,
-					 unsigned src_stride, unsigned width, unsigned height)
-{
-   util_format_rgtc1_unorm_pack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, width, height);
-}
-
-void
-util_format_latc1_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   unsigned x, y, i, j;
-   int block_size = 8;
-
-   for(y = 0; y < height; y += 4) {
-      const uint8_t *src = src_row;
-      for(x = 0; x < width; x += 4) {
-         for(j = 0; j < 4; ++j) {
-            for(i = 0; i < 4; ++i) {
-               float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4;
-               uint8_t tmp_r;
-               u_format_unsigned_fetch_texel_rgtc(0, src, i, j, &tmp_r, 1);
-               dst[0] =
-               dst[1] =
-               dst[2] = ubyte_to_float(tmp_r);
-               dst[3] = 1.0;
-            }
-         }
-         src += block_size;
-      }
-      src_row += src_stride;
-   }
-}
-
-void
-util_format_latc1_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   util_format_rgtc1_unorm_pack_rgba_float(dst_row, dst_stride, src_row, src_stride, width, height);
-}
-
-void
-util_format_latc1_unorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j)
-{
-   uint8_t tmp_r;
-
-   u_format_unsigned_fetch_texel_rgtc(0, src, i, j, &tmp_r, 1);
-   dst[0] =
-   dst[1] =
-   dst[2] = ubyte_to_float(tmp_r);
-   dst[3] = 1.0;
-}
-
-void
-util_format_latc1_snorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
-{
-   fprintf(stderr,"%s\n", __func__);
-}
-
-void
-util_format_latc1_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   fprintf(stderr,"%s\n", __func__);
-}
-
-void
-util_format_latc1_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   fprintf(stderr,"%s\n", __func__);
-}
-
-void
-util_format_latc1_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   util_format_rgtc1_snorm_pack_rgba_float(dst_row, dst_stride, src_row, src_stride, width, height);
-}
-
-void
-util_format_latc1_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   unsigned x, y, i, j;
-   int block_size = 8;
-
-   for(y = 0; y < height; y += 4) {
-      const int8_t *src = (int8_t *)src_row;
-      for(x = 0; x < width; x += 4) {
-         for(j = 0; j < 4; ++j) {
-            for(i = 0; i < 4; ++i) {
-               float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4;
-               int8_t tmp_r;
-               u_format_signed_fetch_texel_rgtc(0, src, i, j, &tmp_r, 1);
-               dst[0] =
-               dst[1] =
-               dst[2] = byte_to_float_tex(tmp_r);
-               dst[3] = 1.0;
-            }
-         }
-         src += block_size;
-      }
-      src_row += src_stride;
-   }
-}
-
-void
-util_format_latc1_snorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j)
-{
-   int8_t tmp_r;
-
-   u_format_signed_fetch_texel_rgtc(0, (int8_t *)src, i, j, &tmp_r, 1);
-   dst[0] =
-   dst[1] =
-   dst[2] = byte_to_float_tex(tmp_r);
-   dst[3] = 1.0;
-}
-
-
-void
-util_format_latc2_unorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
-{
-   puts(__func__);
-
-   u_format_unsigned_fetch_texel_rgtc(0, src, i, j, dst, 2);
-   u_format_unsigned_fetch_texel_rgtc(0, src + 8, i, j, dst + 1, 2);
-}
-
-void
-util_format_latc2_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   util_format_rgtc2_unorm_unpack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, width, height);
-}
-
-void
-util_format_latc2_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   util_format_rgtc2_unorm_pack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, width, height);
-}
-
-void
-util_format_latc2_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   util_format_rxtc2_unorm_pack_rgba_float(dst_row, dst_stride, src_row, src_stride, width, height, 3);
-}
-
-void
-util_format_latc2_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   unsigned x, y, i, j;
-   int block_size = 16;
-
-   for(y = 0; y < height; y += 4) {
-      const uint8_t *src = src_row;
-      for(x = 0; x < width; x += 4) {
-         for(j = 0; j < 4; ++j) {
-            for(i = 0; i < 4; ++i) {
-               float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4;
-               uint8_t tmp_r, tmp_g;
-               u_format_unsigned_fetch_texel_rgtc(0, src, i, j, &tmp_r, 2);
-               u_format_unsigned_fetch_texel_rgtc(0, src + 8, i, j, &tmp_g, 2);
-               dst[0] =
-               dst[1] =
-               dst[2] = ubyte_to_float(tmp_r);
-               dst[3] = ubyte_to_float(tmp_g);
-            }
-         }
-         src += block_size;
-      }
-      src_row += src_stride;
-   }
-}
-
-void
-util_format_latc2_unorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j)
-{
-   uint8_t tmp_r, tmp_g;
-
-   u_format_unsigned_fetch_texel_rgtc(0, src, i, j, &tmp_r, 2);
-   u_format_unsigned_fetch_texel_rgtc(0, src + 8, i, j, &tmp_g, 2);
-   dst[0] =
-   dst[1] =
-   dst[2] = ubyte_to_float(tmp_r);
-   dst[3] = ubyte_to_float(tmp_g);
-}
-
-
-void
-util_format_latc2_snorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
-{
-   fprintf(stderr,"%s\n", __func__);
-}
-
-void
-util_format_latc2_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   fprintf(stderr,"%s\n", __func__);
-}
-
-void
-util_format_latc2_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   fprintf(stderr,"%s\n", __func__);
-}
-
-void
-util_format_latc2_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   unsigned x, y, i, j;
-   int block_size = 16;
-
-   for(y = 0; y < height; y += 4) {
-      const int8_t *src = (int8_t *)src_row;
-      for(x = 0; x < width; x += 4) {
-         for(j = 0; j < 4; ++j) {
-            for(i = 0; i < 4; ++i) {
-               float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4;
-               int8_t tmp_r, tmp_g;
-               u_format_signed_fetch_texel_rgtc(0, src, i, j, &tmp_r, 2);
-               u_format_signed_fetch_texel_rgtc(0, src + 8, i, j, &tmp_g, 2);
-               dst[0] =
-               dst[1] =
-               dst[2] = byte_to_float_tex(tmp_r);
-               dst[3] = byte_to_float_tex(tmp_g);
-            }
-         }
-         src += block_size;
-      }
-      src_row += src_stride;
-   }
-}
-
-void
-util_format_latc2_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   util_format_rxtc2_snorm_pack_rgba_float(dst_row, dst_stride, src_row, src_stride, width, height, 3);
-}
-
-void
-util_format_latc2_snorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j)
-{
-   int8_t tmp_r, tmp_g;
-
-   u_format_signed_fetch_texel_rgtc(0, (int8_t *)src, i, j, &tmp_r, 2);
-   u_format_signed_fetch_texel_rgtc(0, (int8_t *)src + 8, i, j, &tmp_g, 2);
-   dst[0] =
-   dst[1] =
-   dst[2] = byte_to_float_tex(tmp_r);
-   dst[3] = byte_to_float_tex(tmp_g);
-}
-
-
-#define TAG(x) u_format_unsigned_##x
-#define TYPE uint8_t
-#define T_MIN 0
-#define T_MAX 255
-
-#include "../../../mesa/main/texcompress_rgtc_tmp.h"
-
-#undef TYPE
-#undef TAG
-#undef T_MIN
-#undef T_MAX
-
-
-#define TAG(x) u_format_signed_##x
-#define TYPE int8_t
-#define T_MIN (int8_t)-128
-#define T_MAX (int8_t)127
-
-#include "../../../mesa/main/texcompress_rgtc_tmp.h"
-
-#undef TYPE
-#undef TAG
-#undef T_MIN
-#undef T_MAX
+/**************************************************************************
+ *
+ * Copyright (C) 2011 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include <stdio.h>
+#include "u_math.h"
+#include "u_format.h"
+#include "u_format_rgtc.h"
+#include "u_format_latc.h"
+/* Forward declarations; the bodies presumably come from the texcompress_rgtc_tmp.h includes at the end of this file. */
+static void u_format_unsigned_encode_rgtc_ubyte(uint8_t *blkaddr, uint8_t srccolors[4][4],
+					       int numxpixels, int numypixels);
+
+static void u_format_unsigned_fetch_texel_rgtc(unsigned srcRowStride, const uint8_t *pixdata,
+					       unsigned i, unsigned j, uint8_t *value, unsigned comps);
+/* int8_t variants of the two declarations above. */
+static void u_format_signed_encode_rgtc_ubyte(int8_t *blkaddr, int8_t srccolors[4][4],
+					     int numxpixels, int numypixels);
+
+static void u_format_signed_fetch_texel_rgtc(unsigned srcRowStride, const int8_t *pixdata,
+					       unsigned i, unsigned j, int8_t *value, unsigned comps);
+
+void
+util_format_latc1_unorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
+{  /* Fetch the texel addressed by (i, j) as RGBA8: (L, L, L, 255). */
+   (void) u_format_unsigned_encode_rgtc_ubyte;  /* referenced only to silence */
+   (void) u_format_signed_encode_rgtc_ubyte;    /* unused-function warnings */
+   u_format_unsigned_fetch_texel_rgtc(0, src, i, j, dst, 1);  /* luminance -> dst[0] */
+   dst[1] = dst[2] = dst[0];  /* replicate to G and B, as the float fetch does */
+   dst[3] = 255;              /* no alpha channel is decoded here: opaque */
+}
+
+void
+util_format_latc1_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{  /* Unpack by forwarding every argument unchanged to the RGTC1 unorm unpacker. */
+   util_format_rgtc1_unorm_unpack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, width, height);
+}
+
+void
+util_format_latc1_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row,
+					 unsigned src_stride, unsigned width, unsigned height)
+{  /* Pack by forwarding every argument unchanged to the RGTC1 unorm packer. */
+   util_format_rgtc1_unorm_pack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, width, height);
+}
+
+void
+util_format_latc1_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   /* Decode 8-byte LATC1 blocks (4x4 texels) into RGBA floats as (L, L, L, 1).
+    * Texels of a block lying at or beyond width/height are skipped, not written. */
+   const unsigned block_size = 8;
+   unsigned x, y, i, j;
+
+   for(y = 0; y < height; y += 4) {
+      const uint8_t *src = src_row;
+      for(x = 0; x < width; x += 4) {
+         for(j = 0; j < 4 && y + j < height; ++j) {
+            for(i = 0; i < 4 && x + i < width; ++i) {
+               float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4;
+               uint8_t lum;
+               u_format_unsigned_fetch_texel_rgtc(0, src, i, j, &lum, 1);
+               dst[0] = dst[1] = dst[2] = ubyte_to_float(lum);
+               dst[3] = 1.0;
+            }
+         }
+         src += block_size;
+      }
+      src_row += src_stride;
+   }
+}
+
+void
+util_format_latc1_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
+{  /* Pack by forwarding every argument unchanged to the RGTC1 unorm float packer. */
+   util_format_rgtc1_unorm_pack_rgba_float(dst_row, dst_stride, src_row, src_stride, width, height);
+}
+
+void
+util_format_latc1_unorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   /* Fetch the texel addressed by (i, j) and expand it to (L, L, L, 1). */
+   uint8_t lum;
+   float value;
+   u_format_unsigned_fetch_texel_rgtc(0, src, i, j, &lum, 1);
+   value = ubyte_to_float(lum);
+   dst[0] = dst[1] = dst[2] = value;
+   dst[3] = 1.0;
+}
+
+void
+util_format_latc1_snorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
+{  /* Stub: only prints this function's name to stderr; dst is not written. */
+   fprintf(stderr,"%s\n", __func__);
+}
+
+void
+util_format_latc1_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{  /* Stub: only prints this function's name to stderr; dst_row is not written. */
+   fprintf(stderr,"%s\n", __func__);
+}
+
+void
+util_format_latc1_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{  /* Stub: only prints this function's name to stderr; dst_row is not written. */
+   fprintf(stderr,"%s\n", __func__);
+}
+
+void
+util_format_latc1_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
+{  /* Pack by forwarding every argument unchanged to the RGTC1 snorm float packer. */
+   util_format_rgtc1_snorm_pack_rgba_float(dst_row, dst_stride, src_row, src_stride, width, height);
+}
+
+void
+util_format_latc1_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   /* Decode 8-byte signed LATC1 blocks (4x4 texels) into RGBA floats as (L, L, L, 1).
+    * Texels of a block lying at or beyond width/height are skipped, not written. */
+   const unsigned block_size = 8;
+   unsigned x, y, i, j;
+
+   for(y = 0; y < height; y += 4) {
+      const int8_t *src = (const int8_t *)src_row;
+      for(x = 0; x < width; x += 4) {
+         for(j = 0; j < 4 && y + j < height; ++j) {
+            for(i = 0; i < 4 && x + i < width; ++i) {
+               float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4;
+               int8_t lum;
+               u_format_signed_fetch_texel_rgtc(0, src, i, j, &lum, 1);
+               dst[0] = dst[1] = dst[2] = byte_to_float_tex(lum);
+               dst[3] = 1.0;
+            }
+         }
+         src += block_size;
+      }
+      src_row += src_stride;
+   }
+}
+
+void
+util_format_latc1_snorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   /* Fetch the signed texel addressed by (i, j) and expand it to (L, L, L, 1). */
+   int8_t lum;
+   float value;
+   u_format_signed_fetch_texel_rgtc(0, (int8_t *)src, i, j, &lum, 1);
+   value = byte_to_float_tex(lum);
+   dst[0] = dst[1] = dst[2] = value;
+   dst[3] = 1.0;
+}
+
+void
+util_format_latc2_unorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
+{  /* Fetch the texel addressed by (i, j) as RGBA8: (L, L, L, A), matching the float fetch. */
+   u_format_unsigned_fetch_texel_rgtc(0, src, i, j, dst, 2);          /* first 8 bytes: luminance */
+   dst[1] = dst[2] = dst[0];                                          /* replicate to G and B */
+   u_format_unsigned_fetch_texel_rgtc(0, src + 8, i, j, dst + 3, 2);  /* next 8 bytes: alpha */
+}
+
+void
+util_format_latc2_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{  /* Unpack by forwarding every argument unchanged to the RGTC2 unorm unpacker. */
+   util_format_rgtc2_unorm_unpack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, width, height);
+}
+
+void
+util_format_latc2_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{  /* Pack by forwarding every argument unchanged to the RGTC2 unorm packer. */
+   util_format_rgtc2_unorm_pack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, width, height);
+}
+
+void
+util_format_latc2_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
+{  /* Delegates to the shared two-channel packer; the trailing 3 is its last argument (presumably the alpha slot of each RGBA texel - confirm). */
+   util_format_rxtc2_unorm_pack_rgba_float(dst_row, dst_stride, src_row, src_stride, width, height, 3);
+}
+
+void
+util_format_latc2_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   /* Decode 16-byte LATC2 blocks (8 bytes luminance, then 8 bytes alpha) into
+    * RGBA floats as (L, L, L, A), skipping texels at or beyond width/height. */
+   const unsigned block_size = 16;
+   unsigned x, y, i, j;
+
+   for(y = 0; y < height; y += 4) {
+      const uint8_t *src = src_row;
+      for(x = 0; x < width; x += 4) {
+         for(j = 0; j < 4 && y + j < height; ++j) {
+            for(i = 0; i < 4 && x + i < width; ++i) {
+               float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4;
+               uint8_t lum, alpha;
+               u_format_unsigned_fetch_texel_rgtc(0, src, i, j, &lum, 2);
+               u_format_unsigned_fetch_texel_rgtc(0, src + 8, i, j, &alpha, 2);
+               dst[0] = dst[1] = dst[2] = ubyte_to_float(lum);
+               dst[3] = ubyte_to_float(alpha);
+            }
+         }
+         src += block_size;
+      }
+      src_row += src_stride;
+   }
+}
+
+void
+util_format_latc2_unorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   /* Luminance from the first 8 bytes, alpha from the next 8: (L, L, L, A). */
+   uint8_t lum;
+   uint8_t alpha;
+
+   u_format_unsigned_fetch_texel_rgtc(0, src, i, j, &lum, 2);
+   u_format_unsigned_fetch_texel_rgtc(0, src + 8, i, j, &alpha, 2);
+   dst[0] = dst[1] = dst[2] = ubyte_to_float(lum);
+   dst[3] = ubyte_to_float(alpha);
+}
+
+
+void
+util_format_latc2_snorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
+{  /* Stub: only prints this function's name to stderr; dst is not written. */
+   fprintf(stderr,"%s\n", __func__);
+}
+
+void
+util_format_latc2_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{  /* Stub: only prints this function's name to stderr; dst_row is not written. */
+   fprintf(stderr,"%s\n", __func__);
+}
+
+void
+util_format_latc2_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{  /* Stub: only prints this function's name to stderr; dst_row is not written. */
+   fprintf(stderr,"%s\n", __func__);
+}
+
+void
+util_format_latc2_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   /* Decode 16-byte signed LATC2 blocks (8 bytes luminance, then 8 bytes alpha) into
+    * RGBA floats as (L, L, L, A), skipping texels at or beyond width/height. */
+   const unsigned block_size = 16;
+   unsigned x, y, i, j;
+
+   for(y = 0; y < height; y += 4) {
+      const int8_t *src = (const int8_t *)src_row;
+      for(x = 0; x < width; x += 4) {
+         for(j = 0; j < 4 && y + j < height; ++j) {
+            for(i = 0; i < 4 && x + i < width; ++i) {
+               float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4;
+               int8_t lum, alpha;
+               u_format_signed_fetch_texel_rgtc(0, src, i, j, &lum, 2);
+               u_format_signed_fetch_texel_rgtc(0, src + 8, i, j, &alpha, 2);
+               dst[0] = dst[1] = dst[2] = byte_to_float_tex(lum);
+               dst[3] = byte_to_float_tex(alpha);
+            }
+         }
+         src += block_size;
+      }
+      src_row += src_stride;
+   }
+}
+
+void
+util_format_latc2_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
+{  /* Delegates to the shared signed two-channel packer; the trailing 3 is its last argument (presumably the alpha slot of each RGBA texel - confirm). */
+   util_format_rxtc2_snorm_pack_rgba_float(dst_row, dst_stride, src_row, src_stride, width, height, 3);
+}
+
+void
+util_format_latc2_snorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   /* Signed luminance from the first 8 bytes, alpha from the next 8: (L, L, L, A). */
+   int8_t lum;
+   int8_t alpha;
+
+   u_format_signed_fetch_texel_rgtc(0, (int8_t *)src, i, j, &lum, 2);
+   u_format_signed_fetch_texel_rgtc(0, (int8_t *)src + 8, i, j, &alpha, 2);
+   dst[0] = dst[1] = dst[2] = byte_to_float_tex(lum);
+   dst[3] = byte_to_float_tex(alpha);
+}
+
+/* Unsigned instantiation: TAG() prefixes names with u_format_unsigned_ and TYPE is uint8_t. */
+#define TAG(x) u_format_unsigned_##x
+#define TYPE uint8_t
+#define T_MIN 0
+#define T_MAX 255
+
+#include "../../../mesa/main/texcompress_rgtc_tmp.h"
+
+#undef TYPE
+#undef TAG
+#undef T_MIN
+#undef T_MAX
+
+/* Signed instantiation: the same header is included again with TYPE int8_t and range -128..127. */
+#define TAG(x) u_format_signed_##x
+#define TYPE int8_t
+#define T_MIN (int8_t)-128
+#define T_MAX (int8_t)127
+
+#include "../../../mesa/main/texcompress_rgtc_tmp.h"
+
+#undef TYPE
+#undef TAG
+#undef T_MIN
+#undef T_MAX
diff --git a/mesalib/src/gallium/auxiliary/util/u_format_rgtc.c b/mesalib/src/gallium/auxiliary/util/u_format_rgtc.c
index c3fa54c74..2371bab1e 100644
--- a/mesalib/src/gallium/auxiliary/util/u_format_rgtc.c
+++ b/mesalib/src/gallium/auxiliary/util/u_format_rgtc.c
@@ -1,464 +1,464 @@
-/**************************************************************************
- *
- * Copyright (C) 2011 Red Hat Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#include <stdio.h>
-#include "u_math.h"
-#include "u_format.h"
-#include "u_format_rgtc.h"
-
-static void u_format_unsigned_encode_rgtc_chan(uint8_t *blkaddr, uint8_t srccolors[4][4],
-					       int numxpixels, int numypixels);
-
-static void u_format_unsigned_fetch_texel_rgtc(unsigned srcRowStride, const uint8_t *pixdata,
-					       unsigned i, unsigned j, uint8_t *value, unsigned comps);
-
-static void u_format_signed_encode_rgtc_chan(int8_t *blkaddr, int8_t srccolors[4][4],
-					     int numxpixels, int numypixels);
-
-static void u_format_signed_fetch_texel_rgtc(unsigned srcRowStride, const int8_t *pixdata,
-					       unsigned i, unsigned j, int8_t *value, unsigned comps);
-
-void
-util_format_rgtc1_unorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
-{
-   u_format_unsigned_fetch_texel_rgtc(0, src, i, j, dst, 1);
-}
-
-void
-util_format_rgtc1_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   const unsigned bw = 4, bh = 4, comps = 4;
-   unsigned x, y, i, j;
-   unsigned block_size = 8;
-
-   for(y = 0; y < height; y += bh) {
-      const uint8_t *src = src_row;
-      for(x = 0; x < width; x += bw) {
-         for(j = 0; j < bh; ++j) {
-            for(i = 0; i < bw; ++i) {
-               uint8_t *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*comps;
-	       u_format_unsigned_fetch_texel_rgtc(0, src, i, j, dst, 1);
-	    }
-	 }
-	 src += block_size;
-      }
-      src_row += src_stride;
-   }
-}
-
-void
-util_format_rgtc1_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, 
-					 unsigned src_stride, unsigned width, unsigned height)
-{
-   const unsigned bw = 4, bh = 4, bytes_per_block = 8;
-   unsigned x, y, i, j;
-
-   for(y = 0; y < height; y += bh) {
-      uint8_t *dst = dst_row;
-      for(x = 0; x < width; x += bw) {
-         uint8_t tmp[4][4];  /* [bh][bw][comps] */
-         for(j = 0; j < bh; ++j) {
-            for(i = 0; i < bw; ++i) {
-	       tmp[j][i] = src_row[(y + j)*src_stride/sizeof(*src_row) + (x + i)*4];
-            }
-         }
-         u_format_unsigned_encode_rgtc_chan(dst, tmp, 4, 4);
-         dst += bytes_per_block;
-      }
-      dst_row += dst_stride / sizeof(*dst_row);
-   }
-}
-
-void
-util_format_rgtc1_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   unsigned x, y, i, j;
-   int block_size = 8;
-   for(y = 0; y < height; y += 4) {
-      const uint8_t *src = src_row;
-      for(x = 0; x < width; x += 4) {
-         for(j = 0; j < 4; ++j) {
-            for(i = 0; i < 4; ++i) {
-               float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4;
-               uint8_t tmp_r;
-               u_format_unsigned_fetch_texel_rgtc(0, src, i, j, &tmp_r, 1);
-               dst[0] = ubyte_to_float(tmp_r);
-               dst[1] = 0.0;
-               dst[2] = 0.0;
-               dst[3] = 1.0;
-            }
-         }
-         src += block_size;
-      }
-      src_row += src_stride;
-   }
-}
-
-void
-util_format_rgtc1_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   const unsigned bw = 4, bh = 4, bytes_per_block = 8;
-   unsigned x, y, i, j;
-
-   for(y = 0; y < height; y += bh) {
-      uint8_t *dst = dst_row;
-      for(x = 0; x < width; x += bw) {
-         uint8_t tmp[4][4];  /* [bh][bw][comps] */
-         for(j = 0; j < bh; ++j) {
-            for(i = 0; i < bw; ++i) {
-	       tmp[j][i] = float_to_ubyte(src_row[(y + j)*src_stride/sizeof(*src_row) + (x + i)*4]);
-            }
-         }
-         u_format_unsigned_encode_rgtc_chan(dst, tmp, 4, 4);
-         dst += bytes_per_block;
-      }
-      dst_row += dst_stride / sizeof(*dst_row);
-   }
-}
-
-void
-util_format_rgtc1_unorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j)
-{
-   uint8_t tmp_r;
-   u_format_unsigned_fetch_texel_rgtc(0, src, i, j, &tmp_r, 1);
-   dst[0] = ubyte_to_float(tmp_r);
-   dst[1] = 0.0;
-   dst[2] = 0.0;
-   dst[3] = 1.0;
-}
-
-void
-util_format_rgtc1_snorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
-{
-   fprintf(stderr,"%s\n", __func__);
-}
-
-void
-util_format_rgtc1_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   fprintf(stderr,"%s\n", __func__);
-}
-
-void
-util_format_rgtc1_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   fprintf(stderr,"%s\n", __func__);
-}
-
-void
-util_format_rgtc1_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   const unsigned bw = 4, bh = 4, bytes_per_block = 8;
-   unsigned x, y, i, j;
-
-   for(y = 0; y < height; y += bh) {
-      int8_t *dst = (int8_t *)dst_row;
-      for(x = 0; x < width; x += bw) {
-         int8_t tmp[4][4];  /* [bh][bw][comps] */
-         for(j = 0; j < bh; ++j) {
-            for(i = 0; i < bw; ++i) {
-	       tmp[j][i] = float_to_byte_tex(src_row[(y + j)*src_stride/sizeof(*src_row) + (x + i)*4]);
-            }
-         }
-         u_format_signed_encode_rgtc_chan(dst, tmp, 4, 4);
-         dst += bytes_per_block;
-      }
-      dst_row += dst_stride / sizeof(*dst_row);
-   }
-}
-
-void
-util_format_rgtc1_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   unsigned x, y, i, j;
-   int block_size = 8;
-   for(y = 0; y < height; y += 4) {
-      const int8_t *src = (int8_t *)src_row;
-      for(x = 0; x < width; x += 4) {
-         for(j = 0; j < 4; ++j) {
-            for(i = 0; i < 4; ++i) {
-               float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4;
-               int8_t tmp_r;
-               u_format_signed_fetch_texel_rgtc(0, src, i, j, &tmp_r, 1);
-               dst[0] = byte_to_float_tex(tmp_r);
-               dst[1] = 0.0;
-               dst[2] = 0.0;
-               dst[3] = 1.0;
-            }
-         }
-         src += block_size;
-      }
-      src_row += src_stride;
-   }
-}
-
-void
-util_format_rgtc1_snorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j)
-{
-   int8_t tmp_r;
-   u_format_signed_fetch_texel_rgtc(0, (int8_t *)src, i, j, &tmp_r, 1);
-   dst[0] = byte_to_float_tex(tmp_r);
-   dst[1] = 0.0;
-   dst[2] = 0.0;
-   dst[3] = 1.0;
-}
-
-
-void
-util_format_rgtc2_unorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
-{
-   u_format_unsigned_fetch_texel_rgtc(0, src, i, j, dst, 2);
-   u_format_unsigned_fetch_texel_rgtc(0, src + 8, i, j, dst + 1, 2);
-}
-
-void
-util_format_rgtc2_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   const unsigned bw = 4, bh = 4, comps = 4;
-   unsigned x, y, i, j;
-   unsigned block_size = 16;
-
-   for(y = 0; y < height; y += bh) {
-      const uint8_t *src = src_row;
-      for(x = 0; x < width; x += bw) {
-         for(j = 0; j < bh; ++j) {
-            for(i = 0; i < bw; ++i) {
-               uint8_t *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*comps;
-	       u_format_unsigned_fetch_texel_rgtc(0, src, i, j, dst, 2);
-	       u_format_unsigned_fetch_texel_rgtc(0, src + 8, i, j, dst + 1, 2);
-
-	    }
-	 }
-	 src += block_size;
-      }
-      src_row += src_stride;
-   }
-}
-
-void
-util_format_rgtc2_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   const unsigned bw = 4, bh = 4, bytes_per_block = 16;
-   unsigned x, y, i, j;
-
-   for(y = 0; y < height; y += bh) {
-      uint8_t *dst = dst_row;
-      for(x = 0; x < width; x += bw) {
-         uint8_t tmp_r[4][4];  /* [bh][bw] */
-         uint8_t tmp_g[4][4];  /* [bh][bw] */
-         for(j = 0; j < bh; ++j) {
-            for(i = 0; i < bw; ++i) {
-	       tmp_r[j][i] = src_row[(y + j)*src_stride/sizeof(*src_row) + (x + i)*4];
-	       tmp_g[j][i] = src_row[((y + j)*src_stride/sizeof(*src_row) + (x + i)*4) + 1];
-            }
-         }
-         u_format_unsigned_encode_rgtc_chan(dst, tmp_r, 4, 4);
-         u_format_unsigned_encode_rgtc_chan(dst + 8, tmp_g, 4, 4);
-         dst += bytes_per_block;
-      }
-      dst_row += dst_stride / sizeof(*dst_row);
-   }
-}
-
-void
-util_format_rxtc2_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height, unsigned chan2off)
-{
-   const unsigned bw = 4, bh = 4, bytes_per_block = 16;
-   unsigned x, y, i, j;
-
-   for(y = 0; y < height; y += bh) {
-      uint8_t *dst = dst_row;
-      for(x = 0; x < width; x += bw) {
-         uint8_t tmp_r[4][4];  /* [bh][bw][comps] */
-         uint8_t tmp_g[4][4];  /* [bh][bw][comps] */
-         for(j = 0; j < bh; ++j) {
-            for(i = 0; i < bw; ++i) {
-	       tmp_r[j][i] = float_to_ubyte(src_row[(y + j)*src_stride/sizeof(*src_row) + (x + i)*4]);
-               tmp_g[j][i] = float_to_ubyte(src_row[(y + j)*src_stride/sizeof(*src_row) + (x + i)*4 + chan2off]);
-            }
-         }
-         u_format_unsigned_encode_rgtc_chan(dst, tmp_r, 4, 4);
-         u_format_unsigned_encode_rgtc_chan(dst + 8, tmp_g, 4, 4);
-         dst += bytes_per_block;
-      }
-      dst_row += dst_stride / sizeof(*dst_row);
-   }
-}
-
-void
-util_format_rgtc2_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   util_format_rxtc2_unorm_pack_rgba_float(dst_row, dst_stride, src_row, src_stride, width, height, 1);
-}
-
-void
-util_format_rgtc2_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   unsigned x, y, i, j;
-   int block_size = 16;
-   for(y = 0; y < height; y += 4) {
-      const uint8_t *src = src_row;
-      for(x = 0; x < width; x += 4) {
-         for(j = 0; j < 4; ++j) {
-            for(i = 0; i < 4; ++i) {
-               float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4;
-               uint8_t tmp_r, tmp_g;
-               u_format_unsigned_fetch_texel_rgtc(0, src, i, j, &tmp_r, 2);
-               u_format_unsigned_fetch_texel_rgtc(0, src + 8, i, j, &tmp_g, 2);
-               dst[0] = ubyte_to_float(tmp_r);
-               dst[1] = ubyte_to_float(tmp_g);
-               dst[2] = 0.0;
-               dst[3] = 1.0;
-            }
-         }
-         src += block_size;
-      }
-      src_row += src_stride;
-   }
-}
-
-void
-util_format_rgtc2_unorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j)
-{
-   uint8_t tmp_r, tmp_g;
-   u_format_unsigned_fetch_texel_rgtc(0, src, i, j, &tmp_r, 2);
-   u_format_unsigned_fetch_texel_rgtc(0, src + 8, i, j, &tmp_g, 2);
-   dst[0] = ubyte_to_float(tmp_r);
-   dst[1] = ubyte_to_float(tmp_g);
-   dst[2] = 0.0;
-   dst[3] = 1.0;
-}
-
-
-void
-util_format_rgtc2_snorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
-{
-   fprintf(stderr,"%s\n", __func__);
-}
-
-void
-util_format_rgtc2_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   fprintf(stderr,"%s\n", __func__);
-}
-
-void
-util_format_rgtc2_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   fprintf(stderr,"%s\n", __func__);
-}
-
-void
-util_format_rgtc2_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   unsigned x, y, i, j;
-   int block_size = 16;
-   for(y = 0; y < height; y += 4) {
-      const int8_t *src = (int8_t *)src_row;
-      for(x = 0; x < width; x += 4) {
-         for(j = 0; j < 4; ++j) {
-            for(i = 0; i < 4; ++i) {
-               float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4;
-               int8_t tmp_r, tmp_g;
-               u_format_signed_fetch_texel_rgtc(0, src, i, j, &tmp_r, 2);
-               u_format_signed_fetch_texel_rgtc(0, src + 8, i, j, &tmp_g, 2);
-               dst[0] = byte_to_float_tex(tmp_r);
-               dst[1] = byte_to_float_tex(tmp_g);
-               dst[2] = 0.0;
-               dst[3] = 1.0;
-            }
-         }
-         src += block_size;
-      }
-      src_row += src_stride;
-   }
-}
-
-void
-util_format_rxtc2_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height, unsigned chan2off)
-{
-   const unsigned bw = 4, bh = 4, bytes_per_block = 16;
-   unsigned x, y, i, j;
-
-   for(y = 0; y < height; y += bh) {
-      int8_t *dst = (int8_t *)dst_row;
-      for(x = 0; x < width; x += bw) {
-         int8_t tmp_r[4][4];  /* [bh][bw][comps] */
-         int8_t tmp_g[4][4];  /* [bh][bw][comps] */
-         for(j = 0; j < bh; ++j) {
-            for(i = 0; i < bw; ++i) {
-	       tmp_r[j][i] = float_to_byte_tex(src_row[(y + j)*src_stride/sizeof(*src_row) + (x + i)*4]);
-               tmp_g[j][i] = float_to_byte_tex(src_row[(y + j)*src_stride/sizeof(*src_row) + (x + i)*4 + chan2off]);
-            }
-         }
-         u_format_signed_encode_rgtc_chan(dst, tmp_r, 4, 4);
-         u_format_signed_encode_rgtc_chan(dst + 8, tmp_g, 4, 4);
-         dst += bytes_per_block;
-      }
-      dst_row += dst_stride / sizeof(*dst_row);
-   }
-}
-
-void
-util_format_rgtc2_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
-{
-   util_format_rxtc2_snorm_pack_rgba_float(dst_row, dst_stride, src_row, src_stride, width, height, 1);
-}
-
-void
-util_format_rgtc2_snorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j)
-{
-   int8_t tmp_r, tmp_g;
-   u_format_signed_fetch_texel_rgtc(0, (int8_t *)src, i, j, &tmp_r, 2);
-   u_format_signed_fetch_texel_rgtc(0, (int8_t *)src + 8, i, j, &tmp_g, 2);
-   dst[0] = byte_to_float_tex(tmp_r);
-   dst[1] = byte_to_float_tex(tmp_g);
-   dst[2] = 0.0;
-   dst[3] = 1.0;
-}
-
-
-#define TAG(x) u_format_unsigned_##x
-#define TYPE uint8_t
-#define T_MIN 0
-#define T_MAX 255
-
-#include "../../../mesa/main/texcompress_rgtc_tmp.h"
-
-#undef TYPE
-#undef TAG
-#undef T_MIN
-#undef T_MAX
-
-
-#define TAG(x) u_format_signed_##x
-#define TYPE int8_t
-#define T_MIN (int8_t)-128
-#define T_MAX (int8_t)127
-
-#include "../../../mesa/main/texcompress_rgtc_tmp.h"
-
-#undef TYPE
-#undef TAG
-#undef T_MIN
-#undef T_MAX
+/**************************************************************************
+ *
+ * Copyright (C) 2011 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include <stdio.h>
+#include "u_math.h"
+#include "u_format.h"
+#include "u_format_rgtc.h"
+/* Forward declarations; the bodies are presumably supplied later in this file by a template header (not visible in this part of the patch). */
+static void u_format_unsigned_encode_rgtc_ubyte(uint8_t *blkaddr, uint8_t srccolors[4][4],
+					       int numxpixels, int numypixels);
+
+static void u_format_unsigned_fetch_texel_rgtc(unsigned srcRowStride, const uint8_t *pixdata,
+					       unsigned i, unsigned j, uint8_t *value, unsigned comps);
+/* int8_t variants of the two declarations above. */
+static void u_format_signed_encode_rgtc_ubyte(int8_t *blkaddr, int8_t srccolors[4][4],
+					     int numxpixels, int numypixels);
+
+static void u_format_signed_fetch_texel_rgtc(unsigned srcRowStride, const int8_t *pixdata,
+					       unsigned i, unsigned j, int8_t *value, unsigned comps);
+
+void
+util_format_rgtc1_unorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
+{  /* NOTE(review): only dst is handed to the fetch, so dst[1..3] look unwritten here (the float fetch sets 0, 0, 1) - confirm. */
+   u_format_unsigned_fetch_texel_rgtc(0, src, i, j, dst, 1);
+}
+
+void
+util_format_rgtc1_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{  /* Decode 8-byte RGTC1 blocks (4x4 texels); the decoded value goes to byte 0 of each 4-byte texel. */
+   const unsigned bw = 4, bh = 4, comps = 4, block_size = 8;
+   unsigned x, y, i, j;
+
+   /* The j/i bounds also stop at height/width so partial edge blocks never write past dst. */
+   for(y = 0; y < height; y += bh) {
+      const uint8_t *src = src_row;
+      for(x = 0; x < width; x += bw) {
+         for(j = 0; j < bh && y + j < height; ++j) {
+            for(i = 0; i < bw && x + i < width; ++i) {
+               uint8_t *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*comps;
+               u_format_unsigned_fetch_texel_rgtc(0, src, i, j, dst, 1);
+            }
+         }
+         src += block_size;
+      }
+      src_row += src_stride;
+   }
+}
+
+void
+util_format_rgtc1_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row,
+                                         unsigned src_stride, unsigned width, unsigned height)
+{  /* Gather byte 0 of every 4-byte texel in each 4x4 tile and encode the tile as one 8-byte block. */
+   const unsigned bw = 4, bh = 4, bytes_per_block = 8;
+   unsigned bx, by, tx, ty;
+   for(by = 0; by < height; by += bh) {
+      uint8_t *out = dst_row;
+      for(bx = 0; bx < width; bx += bw) {
+         uint8_t red[4][4];  /* [bh][bw] */
+         for(ty = 0; ty < bh; ++ty) {
+            const uint8_t *line = src_row + (by + ty)*src_stride/sizeof(*src_row);
+            for(tx = 0; tx < bw; ++tx) {
+               red[ty][tx] = line[(bx + tx)*4];
+            }
+         }
+         u_format_unsigned_encode_rgtc_ubyte(out, red, 4, 4);
+         out += bytes_per_block;
+      }
+      dst_row += dst_stride / sizeof(*dst_row);
+   }
+}
+
+void
+util_format_rgtc1_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{  /* Decode 8-byte RGTC1 blocks (4x4 texels) into RGBA floats as (R, 0, 0, 1). */
+   const unsigned block_size = 8;
+   unsigned x, y, i, j;
+   /* The j/i bounds also stop at height/width so partial edge blocks never write past dst. */
+   for(y = 0; y < height; y += 4) {
+      const uint8_t *src = src_row;
+      for(x = 0; x < width; x += 4) {
+         for(j = 0; j < 4 && y + j < height; ++j) {
+            for(i = 0; i < 4 && x + i < width; ++i) {
+               float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4;
+               uint8_t red;
+               u_format_unsigned_fetch_texel_rgtc(0, src, i, j, &red, 1);
+               dst[0] = ubyte_to_float(red);
+               dst[1] = dst[2] = 0.0;
+               dst[3] = 1.0;
+            }
+         }
+         src += block_size;
+      }
+      src_row += src_stride;
+   }
+}
+
+void
+util_format_rgtc1_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   const unsigned bw = 4, bh = 4, bytes_per_block = 8;   /* 4x4 pixel tiles, 8 output bytes per tile */
+   unsigned x, y, i, j;
+
+   for(y = 0; y < height; y += bh) {
+      uint8_t *dst = dst_row;
+      for(x = 0; x < width; x += bw) {
+         uint8_t tmp[4][4];  /* [bh][bw]: component 0 of every pixel, converted to ubyte */
+         for(j = 0; j < bh; ++j) {
+            for(i = 0; i < bw; ++i) {
+	       tmp[j][i] = float_to_ubyte(src_row[(y + j)*src_stride/sizeof(*src_row) + (x + i)*4]);   /* src_stride is in bytes; 4 floats per source pixel */
+            }
+         }
+         u_format_unsigned_encode_rgtc_ubyte(dst, tmp, 4, 4);
+         dst += bytes_per_block;
+      }
+      dst_row += dst_stride / sizeof(*dst_row);
+   }
+}
+
+void
+util_format_rgtc1_unorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   uint8_t tmp_r;   /* value fetched for texel (i, j) of the block at src */
+   u_format_unsigned_fetch_texel_rgtc(0, src, i, j, &tmp_r, 1);
+   dst[0] = ubyte_to_float(tmp_r);
+   dst[1] = 0.0;   /* only one channel is fetched; green and blue are written as 0 */
+   dst[2] = 0.0;
+   dst[3] = 1.0;   /* alpha is written as 1 */
+}
+
+void
+util_format_rgtc1_snorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   fprintf(stderr,"%s\n", __func__);   /* stub: only prints the function name to stderr; dst is not written */
+}
+
+void
+util_format_rgtc1_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   fprintf(stderr,"%s\n", __func__);   /* stub: only prints the function name to stderr; dst_row is not written */
+}
+
+void
+util_format_rgtc1_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   fprintf(stderr,"%s\n", __func__);   /* stub: only prints the function name to stderr; dst_row is not written */
+}
+
+void
+util_format_rgtc1_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   const unsigned bw = 4, bh = 4, bytes_per_block = 8;   /* 4x4 pixel tiles, 8 output bytes per tile */
+   unsigned x, y, i, j;
+
+   for(y = 0; y < height; y += bh) {
+      int8_t *dst = (int8_t *)dst_row;   /* the signed encoder takes an int8_t destination */
+      for(x = 0; x < width; x += bw) {
+         int8_t tmp[4][4];  /* [bh][bw]: component 0 of every pixel, converted to a signed byte */
+         for(j = 0; j < bh; ++j) {
+            for(i = 0; i < bw; ++i) {
+	       tmp[j][i] = float_to_byte_tex(src_row[(y + j)*src_stride/sizeof(*src_row) + (x + i)*4]);   /* src_stride is in bytes; 4 floats per source pixel */
+            }
+         }
+         u_format_signed_encode_rgtc_ubyte(dst, tmp, 4, 4);
+         dst += bytes_per_block;
+      }
+      dst_row += dst_stride / sizeof(*dst_row);
+   }
+}
+
+void
+util_format_rgtc1_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   unsigned x, y, i, j;
+   int block_size = 8;   /* source bytes consumed per 4x4 block */
+   for(y = 0; y < height; y += 4) {
+      const int8_t *src = (int8_t *)src_row;   /* the signed fetch helper takes int8_t input */
+      for(x = 0; x < width; x += 4) {
+         for(j = 0; j < 4; ++j) {
+            for(i = 0; i < 4; ++i) {
+               float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4;   /* dst_stride is in bytes; 4 floats per pixel */
+               int8_t tmp_r;
+               u_format_signed_fetch_texel_rgtc(0, src, i, j, &tmp_r, 1);
+               dst[0] = byte_to_float_tex(tmp_r);
+               dst[1] = 0.0;   /* only one channel is fetched; green and blue are written as 0 */
+               dst[2] = 0.0;
+               dst[3] = 1.0;   /* alpha is written as 1 */
+            }
+         }
+         src += block_size;
+      }
+      src_row += src_stride;   /* advance to the next row of blocks */
+   }
+}
+
+void
+util_format_rgtc1_snorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   int8_t tmp_r;   /* signed value fetched for texel (i, j) of the block at src */
+   u_format_signed_fetch_texel_rgtc(0, (int8_t *)src, i, j, &tmp_r, 1);
+   dst[0] = byte_to_float_tex(tmp_r);
+   dst[1] = 0.0;   /* only one channel is fetched; green and blue are written as 0 */
+   dst[2] = 0.0;
+   dst[3] = 1.0;   /* alpha is written as 1 */
+}
+
+
+void
+util_format_rgtc2_unorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   u_format_unsigned_fetch_texel_rgtc(0, src, i, j, dst, 2);           /* first 8 bytes of the block -> dst[0] */
+   u_format_unsigned_fetch_texel_rgtc(0, src + 8, i, j, dst + 1, 2);   /* second 8 bytes -> dst[1]; NOTE(review): dst[2..3] are not assigned here, the float fetch sets B = 0 and A = 1 -- confirm */
+}
+
+void
+util_format_rgtc2_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   const unsigned bw = 4, bh = 4, comps = 4, block_size = 16;   /* 4x4 texel blocks of 16 source bytes; 4 output bytes per pixel */
+   unsigned x, y, i, j;
+
+   for(y = 0; y < height; y += bh) {
+      const uint8_t *src = src_row;   /* first block of this row of blocks */
+      for(x = 0; x < width; x += bw) {
+         for(j = 0; j < bh; ++j) {
+            for(i = 0; i < bw; ++i) {
+               uint8_t *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*comps;
+               u_format_unsigned_fetch_texel_rgtc(0, src, i, j, dst, 2);           /* first 8 bytes of the block -> red */
+               u_format_unsigned_fetch_texel_rgtc(0, src + 8, i, j, dst + 1, 2);   /* second 8 bytes -> green */
+               dst[2] = 0;     /* blue: same default as the float unpack path, previously left unwritten */
+               dst[3] = 255;   /* alpha: 1.0 expressed as 8-bit unorm */
+            }
+         }
+         src += block_size;
+      }
+      src_row += src_stride;
+   }
+}
+
+void
+util_format_rgtc2_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   const unsigned bw = 4, bh = 4, bytes_per_block = 16;   /* 4x4 pixel tiles, 16 output bytes per tile */
+   unsigned x, y, i, j;
+
+   for(y = 0; y < height; y += bh) {
+      uint8_t *dst = dst_row;
+      for(x = 0; x < width; x += bw) {
+         uint8_t tmp_r[4][4];  /* [bh][bw] component 0 of each pixel */
+         uint8_t tmp_g[4][4];  /* [bh][bw] component 1 of each pixel */
+         for(j = 0; j < bh; ++j) {
+            for(i = 0; i < bw; ++i) {
+	       tmp_r[j][i] = src_row[(y + j)*src_stride/sizeof(*src_row) + (x + i)*4];
+	       tmp_g[j][i] = src_row[((y + j)*src_stride/sizeof(*src_row) + (x + i)*4) + 1];
+            }
+         }
+         u_format_unsigned_encode_rgtc_ubyte(dst, tmp_r, 4, 4);       /* first 8 bytes of the output block */
+         u_format_unsigned_encode_rgtc_ubyte(dst + 8, tmp_g, 4, 4);   /* second 8 bytes of the output block */
+         dst += bytes_per_block;
+      }
+      dst_row += dst_stride / sizeof(*dst_row);
+   }
+}
+
+void
+util_format_rxtc2_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height, unsigned chan2off)
+{
+   const unsigned bw = 4, bh = 4, bytes_per_block = 16;   /* 4x4 pixel tiles, 16 output bytes per tile */
+   unsigned x, y, i, j;
+
+   for(y = 0; y < height; y += bh) {
+      uint8_t *dst = dst_row;
+      for(x = 0; x < width; x += bw) {
+         uint8_t tmp_r[4][4];  /* [bh][bw] component 0 of each pixel, as ubyte */
+         uint8_t tmp_g[4][4];  /* [bh][bw] component chan2off of each pixel, as ubyte */
+         for(j = 0; j < bh; ++j) {
+            for(i = 0; i < bw; ++i) {
+	       tmp_r[j][i] = float_to_ubyte(src_row[(y + j)*src_stride/sizeof(*src_row) + (x + i)*4]);
+               tmp_g[j][i] = float_to_ubyte(src_row[(y + j)*src_stride/sizeof(*src_row) + (x + i)*4 + chan2off]);   /* chan2off: offset in floats from component 0 to the second packed channel */
+            }
+         }
+         u_format_unsigned_encode_rgtc_ubyte(dst, tmp_r, 4, 4);       /* first 8 bytes of the output block */
+         u_format_unsigned_encode_rgtc_ubyte(dst + 8, tmp_g, 4, 4);   /* second 8 bytes of the output block */
+         dst += bytes_per_block;
+      }
+      dst_row += dst_stride / sizeof(*dst_row);
+   }
+}
+
+void
+util_format_rgtc2_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   util_format_rxtc2_unorm_pack_rgba_float(dst_row, dst_stride, src_row, src_stride, width, height, 1);   /* chan2off = 1: the second packed channel is component 1 of each source pixel */
+}
+
+void
+util_format_rgtc2_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   unsigned x, y, i, j;
+   int block_size = 16;   /* source bytes consumed per 4x4 block */
+   for(y = 0; y < height; y += 4) {
+      const uint8_t *src = src_row;
+      for(x = 0; x < width; x += 4) {
+         for(j = 0; j < 4; ++j) {
+            for(i = 0; i < 4; ++i) {
+               float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4;   /* dst_stride is in bytes; 4 floats per pixel */
+               uint8_t tmp_r, tmp_g;
+               u_format_unsigned_fetch_texel_rgtc(0, src, i, j, &tmp_r, 2);       /* first 8 bytes of the block */
+               u_format_unsigned_fetch_texel_rgtc(0, src + 8, i, j, &tmp_g, 2);   /* second 8 bytes of the block */
+               dst[0] = ubyte_to_float(tmp_r);
+               dst[1] = ubyte_to_float(tmp_g);
+               dst[2] = 0.0;   /* blue is written as 0 */
+               dst[3] = 1.0;   /* alpha is written as 1 */
+            }
+         }
+         src += block_size;
+      }
+      src_row += src_stride;   /* advance to the next row of blocks */
+   }
+}
+
+void
+util_format_rgtc2_unorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   uint8_t tmp_r, tmp_g;   /* values fetched for texel (i, j) from the two 8-byte halves of the block */
+   u_format_unsigned_fetch_texel_rgtc(0, src, i, j, &tmp_r, 2);
+   u_format_unsigned_fetch_texel_rgtc(0, src + 8, i, j, &tmp_g, 2);
+   dst[0] = ubyte_to_float(tmp_r);
+   dst[1] = ubyte_to_float(tmp_g);
+   dst[2] = 0.0;   /* blue is written as 0 */
+   dst[3] = 1.0;   /* alpha is written as 1 */
+}
+
+
+void
+util_format_rgtc2_snorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   fprintf(stderr,"%s\n", __func__);   /* stub: only prints the function name to stderr; dst is not written */
+}
+
+void
+util_format_rgtc2_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   fprintf(stderr,"%s\n", __func__);   /* stub: only prints the function name to stderr; dst_row is not written */
+}
+
+void
+util_format_rgtc2_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   fprintf(stderr,"%s\n", __func__);   /* stub: only prints the function name to stderr; dst_row is not written */
+}
+
+void
+util_format_rgtc2_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   unsigned x, y, i, j;
+   int block_size = 16;   /* source bytes consumed per 4x4 block */
+   for(y = 0; y < height; y += 4) {
+      const int8_t *src = (int8_t *)src_row;   /* the signed fetch helper takes int8_t input */
+      for(x = 0; x < width; x += 4) {
+         for(j = 0; j < 4; ++j) {
+            for(i = 0; i < 4; ++i) {
+               float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4;   /* dst_stride is in bytes; 4 floats per pixel */
+               int8_t tmp_r, tmp_g;
+               u_format_signed_fetch_texel_rgtc(0, src, i, j, &tmp_r, 2);       /* first 8 bytes of the block */
+               u_format_signed_fetch_texel_rgtc(0, src + 8, i, j, &tmp_g, 2);   /* second 8 bytes of the block */
+               dst[0] = byte_to_float_tex(tmp_r);
+               dst[1] = byte_to_float_tex(tmp_g);
+               dst[2] = 0.0;   /* blue is written as 0 */
+               dst[3] = 1.0;   /* alpha is written as 1 */
+            }
+         }
+         src += block_size;
+      }
+      src_row += src_stride;   /* advance to the next row of blocks */
+   }
+}
+
+void
+util_format_rxtc2_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height, unsigned chan2off)
+{
+   const unsigned bw = 4, bh = 4, bytes_per_block = 16;   /* 4x4 pixel tiles, 16 output bytes per tile */
+   unsigned x, y, i, j;
+
+   for(y = 0; y < height; y += bh) {
+      int8_t *dst = (int8_t *)dst_row;   /* the signed encoder takes an int8_t destination */
+      for(x = 0; x < width; x += bw) {
+         int8_t tmp_r[4][4];  /* [bh][bw] component 0 of each pixel, as signed byte */
+         int8_t tmp_g[4][4];  /* [bh][bw] component chan2off of each pixel, as signed byte */
+         for(j = 0; j < bh; ++j) {
+            for(i = 0; i < bw; ++i) {
+	       tmp_r[j][i] = float_to_byte_tex(src_row[(y + j)*src_stride/sizeof(*src_row) + (x + i)*4]);
+               tmp_g[j][i] = float_to_byte_tex(src_row[(y + j)*src_stride/sizeof(*src_row) + (x + i)*4 + chan2off]);   /* chan2off: offset in floats from component 0 to the second packed channel */
+            }
+         }
+         u_format_signed_encode_rgtc_ubyte(dst, tmp_r, 4, 4);       /* first 8 bytes of the output block */
+         u_format_signed_encode_rgtc_ubyte(dst + 8, tmp_g, 4, 4);   /* second 8 bytes of the output block */
+         dst += bytes_per_block;
+      }
+      dst_row += dst_stride / sizeof(*dst_row);
+   }
+}
+
+void
+util_format_rgtc2_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   util_format_rxtc2_snorm_pack_rgba_float(dst_row, dst_stride, src_row, src_stride, width, height, 1);   /* chan2off = 1: the second packed channel is component 1 of each source pixel */
+}
+
+void
+util_format_rgtc2_snorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j)
+{
+   int8_t tmp_r, tmp_g;   /* signed values fetched for texel (i, j) from the two 8-byte halves of the block */
+   u_format_signed_fetch_texel_rgtc(0, (int8_t *)src, i, j, &tmp_r, 2);
+   u_format_signed_fetch_texel_rgtc(0, (int8_t *)src + 8, i, j, &tmp_g, 2);
+   dst[0] = byte_to_float_tex(tmp_r);
+   dst[1] = byte_to_float_tex(tmp_g);
+   dst[2] = 0.0;   /* blue is written as 0 */
+   dst[3] = 1.0;   /* alpha is written as 1 */
+}
+
+
+#define TAG(x) u_format_unsigned_##x
+#define TYPE uint8_t
+#define T_MIN 0
+#define T_MAX 255
+
+#include "../../../mesa/main/texcompress_rgtc_tmp.h"   /* presumably expands to the u_format_unsigned_* helpers called above -- confirm in the header */
+
+#undef TYPE
+#undef TAG
+#undef T_MIN
+#undef T_MAX
+
+
+#define TAG(x) u_format_signed_##x
+#define TYPE int8_t
+#define T_MIN (int8_t)-128
+#define T_MAX (int8_t)127
+
+#include "../../../mesa/main/texcompress_rgtc_tmp.h"   /* second inclusion, presumably expanding to the u_format_signed_* helpers -- confirm in the header */
+
+#undef TYPE
+#undef TAG
+#undef T_MIN
+#undef T_MAX
diff --git a/mesalib/src/glsl/Makefile b/mesalib/src/glsl/Makefile
index c20a6c9ed..00b7b9164 100644
--- a/mesalib/src/glsl/Makefile
+++ b/mesalib/src/glsl/Makefile
@@ -188,11 +188,11 @@ install-dricore: default
 
 ##### RULES #####
 
-glsl_compiler: $(GLSL2_OBJECTS) libglsl.a builtin_stubs.o
-	$(APP_CXX) $(INCLUDES) $(CFLAGS) $(LDFLAGS) $(GLSL2_OBJECTS) builtin_stubs.o $(LIBS) -o $@
+glsl_compiler: $(GLSL2_OBJECTS) libglsl.a
+	$(APP_CXX) $(INCLUDES) $(CFLAGS) $(LDFLAGS) $(GLSL2_OBJECTS) $(LIBS) -o $@
 
-glsl_test: $(TEST_OBJECTS) libglsl.a builtin_stubs.o
-	$(APP_CXX) $(INCLUDES) $(CFLAGS) $(LDFLAGS) $(TEST_OBJECTS) builtin_stubs.o $(LIBS) -o $@
+glsl_test: $(TEST_OBJECTS) libglsl.a
+	$(APP_CXX) $(INCLUDES) $(CFLAGS) $(LDFLAGS) $(TEST_OBJECTS) $(LIBS) -o $@
 
 glcpp: glcpp/glcpp
 glcpp/glcpp: $(GLCPP_OBJECTS)
diff --git a/mesalib/src/mesa/drivers/common/meta.c b/mesalib/src/mesa/drivers/common/meta.c
index 1b71aa194..b6e80d70a 100644
--- a/mesalib/src/mesa/drivers/common/meta.c
+++ b/mesalib/src/mesa/drivers/common/meta.c
@@ -740,6 +740,11 @@ _mesa_meta_end(struct gl_context *ctx)
 
       _mesa_reference_shader_program(ctx, &ctx->Shader.ActiveProgram,
 				     save->ActiveShader);
+
+      _mesa_reference_shader_program(ctx, &save->VertexShader, NULL);
+      _mesa_reference_shader_program(ctx, &save->GeometryShader, NULL);
+      _mesa_reference_shader_program(ctx, &save->FragmentShader, NULL);
+      _mesa_reference_shader_program(ctx, &save->ActiveShader, NULL);
    }
 
    if (state & MESA_META_STENCIL_TEST) {
@@ -1223,7 +1228,7 @@ blitframebuffer_texture(struct gl_context *ctx,
 				GL_SKIP_DECODE_EXT);
 	 }
          if (ctx->Extensions.EXT_framebuffer_sRGB) {
-            _mesa_Disable(GL_FRAMEBUFFER_SRGB_EXT);
+            _mesa_set_enable(ctx, GL_FRAMEBUFFER_SRGB_EXT, GL_FALSE);
          }
 
          _mesa_TexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);
@@ -1291,7 +1296,7 @@ blitframebuffer_texture(struct gl_context *ctx,
 	    _mesa_TexParameteri(target, GL_TEXTURE_SRGB_DECODE_EXT, srgbSave);
 	 }
 	 if (ctx->Extensions.EXT_framebuffer_sRGB && fbo_srgb_save) {
-	    _mesa_Enable(GL_FRAMEBUFFER_SRGB_EXT);
+	    _mesa_set_enable(ctx, GL_FRAMEBUFFER_SRGB_EXT, GL_TRUE);
 	 }
 
          /* Done with color buffer */
@@ -2452,6 +2457,15 @@ _mesa_meta_check_generate_mipmap_fallback(struct gl_context *ctx, GLenum target,
       return GL_TRUE;
    }
 
+   if (_mesa_get_format_color_encoding(baseImage->TexFormat) == GL_SRGB &&
+       !ctx->Extensions.EXT_texture_sRGB_decode) {
+      /* The texture format is sRGB but we can't turn off sRGB->linear
+       * texture sample conversion.  So we won't be able to generate the
+       * right colors when rendering.  Need to use a fallback.
+       */
+      return GL_TRUE;
+   }
+
    /*
     * Test that we can actually render in the texture's format.
     */
@@ -2669,6 +2683,8 @@ _mesa_meta_GenerateMipmap(struct gl_context *ctx, GLenum target,
    const GLenum wrapSSave = texObj->Sampler.WrapS;
    const GLenum wrapTSave = texObj->Sampler.WrapT;
    const GLenum wrapRSave = texObj->Sampler.WrapR;
+   const GLenum srgbDecodeSave = texObj->Sampler.sRGBDecode;
+   const GLenum srgbBufferSave = ctx->Color.sRGBEnabled;
    const GLuint fboSave = ctx->DrawBuffer->Name;
    const GLuint original_active_unit = ctx->Texture.CurrentUnit;
    GLenum faceTarget;
@@ -2731,6 +2747,15 @@ _mesa_meta_GenerateMipmap(struct gl_context *ctx, GLenum target,
    _mesa_TexParameteri(target, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
    _mesa_TexParameteri(target, GL_TEXTURE_WRAP_R, GL_CLAMP_TO_EDGE);
 
+   /* We don't want to encode or decode sRGB values; treat them as linear */
+   if (ctx->Extensions.EXT_texture_sRGB_decode) {
+      _mesa_TexParameteri(target, GL_TEXTURE_SRGB_DECODE_EXT,
+                          GL_SKIP_DECODE_EXT);
+   }
+   if (ctx->Extensions.EXT_framebuffer_sRGB) {
+      _mesa_set_enable(ctx, GL_FRAMEBUFFER_SRGB_EXT, GL_FALSE);
+   }
+
    _mesa_set_enable(ctx, target, GL_TRUE);
 
    /* setup texcoords (XXX what about border?) */
@@ -2875,6 +2900,14 @@ _mesa_meta_GenerateMipmap(struct gl_context *ctx, GLenum target,
       _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
    }
 
+   if (ctx->Extensions.EXT_texture_sRGB_decode) {
+      _mesa_TexParameteri(target, GL_TEXTURE_SRGB_DECODE_EXT,
+                          srgbDecodeSave);
+   }
+   if (ctx->Extensions.EXT_framebuffer_sRGB && srgbBufferSave) {
+      _mesa_set_enable(ctx, GL_FRAMEBUFFER_SRGB_EXT, GL_TRUE);
+   }
+
    _mesa_lock_texture(ctx, texObj); /* relock */
 
    _mesa_meta_end(ctx);
@@ -3154,7 +3187,7 @@ decompress_texture_image(struct gl_context *ctx,
 
    /* setup texture state */
    _mesa_BindTexture(target, texObj->Name);
-   _mesa_Enable(target);
+   _mesa_set_enable(ctx, target, GL_TRUE);
 
    {
       /* save texture object state */
@@ -3179,7 +3212,7 @@ decompress_texture_image(struct gl_context *ctx,
                              GL_SKIP_DECODE_EXT);
       }
       if (ctx->Extensions.EXT_framebuffer_sRGB) {
-         _mesa_Disable(GL_FRAMEBUFFER_SRGB_EXT);
+         _mesa_set_enable(ctx, GL_FRAMEBUFFER_SRGB_EXT, GL_FALSE);
       }
 
       /* render quad w/ texture into renderbuffer */
@@ -3205,6 +3238,9 @@ decompress_texture_image(struct gl_context *ctx,
    ctx->Pack.RowLength = destRowLength;
    _mesa_ReadPixels(0, 0, width, height, destFormat, destType, dest);
 
+   /* disable texture unit */
+   _mesa_set_enable(ctx, target, GL_FALSE);
+
    _mesa_meta_end(ctx);
 
    /* restore fbo bindings */
diff --git a/mesalib/src/mesa/drivers/dri/swrast/swrast_span.c b/mesalib/src/mesa/drivers/dri/swrast/swrast_span.c
index c7d0bfdac..772d09f5a 100644
--- a/mesalib/src/mesa/drivers/dri/swrast/swrast_span.c
+++ b/mesalib/src/mesa/drivers/dri/swrast/swrast_span.c
@@ -45,7 +45,7 @@ static const GLubyte kernel[16] = {
 #if DITHER
 #define DITHER_COMP(X, Y) kernel[((X) & 0x3) | (((Y) & 0x3) << 2)]
 
-#define DITHER_CLAMP(X) (((X) < CHAN_MAX) ? (X) : CHAN_MAX)
+#define DITHER_CLAMP(X) (((X) < 255) ? (X) : 255)
 #else
 #define DITHER_COMP(X, Y) 0
 
diff --git a/mesalib/src/mesa/main/api_validate.c b/mesalib/src/mesa/main/api_validate.c
index 699b414f5..1fcf5cd68 100644
--- a/mesalib/src/mesa/main/api_validate.c
+++ b/mesalib/src/mesa/main/api_validate.c
@@ -199,6 +199,27 @@ check_index_bounds(struct gl_context *ctx, GLsizei count, GLenum type,
 
 
 /**
+ * Is 'mode' a valid value for glBegin(), glDrawArrays(), glDrawElements(),
+ * etc?  The set of legal values depends on whether geometry shaders/programs
+ * are supported.
+ */
+GLboolean
+_mesa_valid_prim_mode(const struct gl_context *ctx, GLenum mode)
+{
+   /* The adjacency primitives added by ARB_geometry_shader4 have enum
+    * values above GL_POLYGON, so the upper bound depends on the extension.
+    * Each case needs its own bound: chaining the two tests rejects them.
+    */
+   if (ctx->Extensions.ARB_geometry_shader4) {
+      return mode <= GL_TRIANGLE_STRIP_ADJACENCY_ARB;
+   }
+   else {
+      return mode <= GL_POLYGON;
+   }
+}
+
+
+/**
  * Error checking for glDrawElements().  Includes parameter checking
  * and VBO bounds checking.
  * \return GL_TRUE if OK to render, GL_FALSE if error found
@@ -216,7 +237,7 @@ _mesa_validate_DrawElements(struct gl_context *ctx,
       return GL_FALSE;
    }
 
-   if (mode > GL_TRIANGLE_STRIP_ADJACENCY_ARB) {
+   if (!_mesa_valid_prim_mode(ctx, mode)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glDrawElements(mode)" );
       return GL_FALSE;
    }
@@ -273,7 +294,7 @@ _mesa_validate_DrawRangeElements(struct gl_context *ctx, GLenum mode,
       return GL_FALSE;
    }
 
-   if (mode > GL_TRIANGLE_STRIP_ADJACENCY_ARB) {
+   if (!_mesa_valid_prim_mode(ctx, mode)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glDrawRangeElements(mode)" );
       return GL_FALSE;
    }
@@ -332,7 +353,7 @@ _mesa_validate_DrawArrays(struct gl_context *ctx,
       return GL_FALSE;
    }
 
-   if (mode > GL_TRIANGLE_STRIP_ADJACENCY_ARB) {
+   if (!_mesa_valid_prim_mode(ctx, mode)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glDrawArrays(mode)" );
       return GL_FALSE;
    }
@@ -362,7 +383,7 @@ _mesa_validate_DrawArraysInstanced(struct gl_context *ctx, GLenum mode, GLint fi
       return GL_FALSE;
    }
 
-   if (mode > GL_TRIANGLE_STRIP_ADJACENCY_ARB) {
+   if (!_mesa_valid_prim_mode(ctx, mode)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glDrawArraysInstanced(mode=0x%x)", mode);
       return GL_FALSE;
@@ -408,7 +429,7 @@ _mesa_validate_DrawElementsInstanced(struct gl_context *ctx,
       return GL_FALSE;
    }
 
-   if (mode > GL_TRIANGLE_STRIP_ADJACENCY_ARB) {
+   if (!_mesa_valid_prim_mode(ctx, mode)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glDrawElementsInstanced(mode = 0x%x)", mode);
       return GL_FALSE;
diff --git a/mesalib/src/mesa/main/api_validate.h b/mesalib/src/mesa/main/api_validate.h
index 09e9522d2..7d6a66012 100644
--- a/mesalib/src/mesa/main/api_validate.h
+++ b/mesalib/src/mesa/main/api_validate.h
@@ -39,6 +39,11 @@ _mesa_max_buffer_index(struct gl_context *ctx, GLuint count, GLenum type,
                        const void *indices,
                        struct gl_buffer_object *elementBuf);
 
+
+extern GLboolean
+_mesa_valid_prim_mode(const struct gl_context *ctx, GLenum mode);
+
+
 extern GLboolean
 _mesa_validate_DrawArrays(struct gl_context *ctx,
 			  GLenum mode, GLint start, GLsizei count);
diff --git a/mesalib/src/mesa/main/attrib.c b/mesalib/src/mesa/main/attrib.c
index 9767740a3..d38a1a466 100644
--- a/mesalib/src/mesa/main/attrib.c
+++ b/mesalib/src/mesa/main/attrib.c
@@ -493,7 +493,7 @@ pop_enable_group(struct gl_context *ctx, const struct gl_enable_attrib *enable)
       }
    }
 
-   for (i=0;i<MAX_CLIP_PLANES;i++) {
+   for (i=0;i<ctx->Const.MaxClipPlanes;i++) {
       const GLuint mask = 1 << i;
       if ((ctx->Transform.ClipPlanesEnabled & mask) != (enable->ClipPlanes & mask))
 	  _mesa_set_enable(ctx, (GLenum) (GL_CLIP_PLANE0 + i),
@@ -1247,7 +1247,7 @@ _mesa_PopAttrib(void)
                   _math_matrix_analyse( ctx->ProjectionMatrixStack.Top );
 
                /* restore clip planes */
-               for (i = 0; i < MAX_CLIP_PLANES; i++) {
+               for (i = 0; i < ctx->Const.MaxClipPlanes; i++) {
                   const GLuint mask = 1 << i;
                   const GLfloat *eyePlane = xform->EyeUserPlane[i];
                   COPY_4V(ctx->Transform.EyeUserPlane[i], eyePlane);
diff --git a/mesalib/src/mesa/main/colormac.h b/mesalib/src/mesa/main/colormac.h
index 4b7c3b4a0..4294f3239 100644
--- a/mesalib/src/mesa/main/colormac.h
+++ b/mesalib/src/mesa/main/colormac.h
@@ -38,137 +38,6 @@
 #include "mtypes.h"
 
 
-/** \def BYTE_TO_CHAN
- * Convert from GLbyte to GLchan */
-
-/** \def UBYTE_TO_CHAN
- * Convert from GLubyte to GLchan */
-
-/** \def SHORT_TO_CHAN
- * Convert from GLshort to GLchan */
-
-/** \def USHORT_TO_CHAN
- * Convert from GLushort to GLchan */
-
-/** \def INT_TO_CHAN
- * Convert from GLint to GLchan */
-
-/** \def UINT_TO_CHAN
- * Convert from GLuint to GLchan */
-
-/** \def CHAN_TO_UBYTE
- * Convert from GLchan to GLubyte */
-
-/** \def CHAN_TO_FLOAT
- * Convert from GLchan to GLfloat */
-
-/** \def CLAMPED_FLOAT_TO_CHAN
- * Convert from GLclampf to GLchan */
-
-/** \def UNCLAMPED_FLOAT_TO_CHAN
- * Convert from GLfloat to GLchan */
-
-/** \def COPY_CHAN4
- * Copy a GLchan[4] array */
-
-#if CHAN_BITS == 8
-
-#define BYTE_TO_CHAN(b)   ((b) < 0 ? 0 : (GLchan) (b))
-#define UBYTE_TO_CHAN(b)  (b)
-#define SHORT_TO_CHAN(s)  ((s) < 0 ? 0 : (GLchan) ((s) >> 7))
-#define USHORT_TO_CHAN(s) ((GLchan) ((s) >> 8))
-#define INT_TO_CHAN(i)    ((i) < 0 ? 0 : (GLchan) ((i) >> 23))
-#define UINT_TO_CHAN(i)   ((GLchan) ((i) >> 24))
-
-#define CHAN_TO_UBYTE(c)  (c)
-#define CHAN_TO_USHORT(c) (((c) << 8) | (c))
-#define CHAN_TO_SHORT(c)  (((c) << 7) | ((c) >> 1))
-#define CHAN_TO_FLOAT(c)  UBYTE_TO_FLOAT(c)
-
-#define CLAMPED_FLOAT_TO_CHAN(c, f)    CLAMPED_FLOAT_TO_UBYTE(c, f)
-#define UNCLAMPED_FLOAT_TO_CHAN(c, f)  UNCLAMPED_FLOAT_TO_UBYTE(c, f)
-
-#define COPY_CHAN4(DST, SRC)  COPY_4UBV(DST, SRC)
-
-#elif CHAN_BITS == 16
-
-#define BYTE_TO_CHAN(b)   ((b) < 0 ? 0 : (((GLchan) (b)) * 516))
-#define UBYTE_TO_CHAN(b)  ((((GLchan) (b)) << 8) | ((GLchan) (b)))
-#define SHORT_TO_CHAN(s)  ((s) < 0 ? 0 : (GLchan) (s))
-#define USHORT_TO_CHAN(s) (s)
-#define INT_TO_CHAN(i)    ((i) < 0 ? 0 : (GLchan) ((i) >> 15))
-#define UINT_TO_CHAN(i)   ((GLchan) ((i) >> 16))
-
-#define CHAN_TO_UBYTE(c)  ((c) >> 8)
-#define CHAN_TO_USHORT(c) (c)
-#define CHAN_TO_SHORT(c)  ((c) >> 1)
-#define CHAN_TO_FLOAT(c)  ((GLfloat) ((c) * (1.0 / CHAN_MAXF)))
-
-#define CLAMPED_FLOAT_TO_CHAN(c, f)    CLAMPED_FLOAT_TO_USHORT(c, f)
-#define UNCLAMPED_FLOAT_TO_CHAN(c, f)  UNCLAMPED_FLOAT_TO_USHORT(c, f)
-
-#define COPY_CHAN4(DST, SRC)  COPY_4V(DST, SRC)
-
-#elif CHAN_BITS == 32
-
-/* XXX floating-point color channels not fully thought-out */
-#define BYTE_TO_CHAN(b)   ((GLfloat) ((b) * (1.0F / 127.0F)))
-#define UBYTE_TO_CHAN(b)  ((GLfloat) ((b) * (1.0F / 255.0F)))
-#define SHORT_TO_CHAN(s)  ((GLfloat) ((s) * (1.0F / 32767.0F)))
-#define USHORT_TO_CHAN(s) ((GLfloat) ((s) * (1.0F / 65535.0F)))
-#define INT_TO_CHAN(i)    ((GLfloat) ((i) * (1.0F / 2147483647.0F)))
-#define UINT_TO_CHAN(i)   ((GLfloat) ((i) * (1.0F / 4294967295.0F)))
-
-#define CHAN_TO_UBYTE(c)  FLOAT_TO_UBYTE(c)
-#define CHAN_TO_USHORT(c) ((GLushort) (CLAMP((c), 0.0f, 1.0f) * 65535.0))
-#define CHAN_TO_SHORT(c)  ((GLshort) (CLAMP((c), 0.0f, 1.0f) * 32767.0))
-#define CHAN_TO_FLOAT(c)  (c)
-
-#define CLAMPED_FLOAT_TO_CHAN(c, f)  c = (f)
-#define UNCLAMPED_FLOAT_TO_CHAN(c, f)      c = (f)
-
-#define COPY_CHAN4(DST, SRC)  COPY_4V(DST, SRC)
-
-#else
-
-#error unexpected CHAN_BITS size
-
-#endif
-
-
-/**
- * Convert 3 channels at once.
- *
- * \param dst pointer to destination GLchan[3] array.
- * \param f pointer to source GLfloat[3] array.
- *
- * \sa #UNCLAMPED_FLOAT_TO_CHAN.
- */
-#define UNCLAMPED_FLOAT_TO_RGB_CHAN(dst, f)	\
-do {						\
-   UNCLAMPED_FLOAT_TO_CHAN((dst)[0], (f)[0]);	\
-   UNCLAMPED_FLOAT_TO_CHAN((dst)[1], (f)[1]);	\
-   UNCLAMPED_FLOAT_TO_CHAN((dst)[2], (f)[2]);	\
-} while (0)
-
-
-/**
- * Convert 4 channels at once.
- *
- * \param dst pointer to destination GLchan[4] array.
- * \param f pointer to source GLfloat[4] array.
- *
- * \sa #UNCLAMPED_FLOAT_TO_CHAN.
- */
-#define UNCLAMPED_FLOAT_TO_RGBA_CHAN(dst, f)	\
-do {						\
-   UNCLAMPED_FLOAT_TO_CHAN((dst)[0], (f)[0]);	\
-   UNCLAMPED_FLOAT_TO_CHAN((dst)[1], (f)[1]);	\
-   UNCLAMPED_FLOAT_TO_CHAN((dst)[2], (f)[2]);	\
-   UNCLAMPED_FLOAT_TO_CHAN((dst)[3], (f)[3]);	\
-} while (0)
-
-
 /**
  * Convert four float values in [0,1] to ubytes in [0,255] with clamping.
  */
@@ -205,11 +74,11 @@ _mesa_unclamped_float_rgba_to_ubyte(GLubyte dst[4], const GLfloat src[4])
 
 #define PACK_COLOR_5551( R, G, B, A )					\
    ((((R) & 0xf8) << 8) | (((G) & 0xf8) << 3) | (((B) & 0xf8) >> 2) |	\
-    ((A) ? 1 : 0))
+    ((A) >> 7))
 
 #define PACK_COLOR_1555( A, B, G, R )					\
    ((((B) & 0xf8) << 7) | (((G) & 0xf8) << 2) | (((R) & 0xf8) >> 3) |	\
-    ((A) ? 0x8000 : 0))
+    (((A) & 0x80) << 8))
 
 #define PACK_COLOR_1555_REV( A, B, G, R )					\
    ((((B) & 0xf8) >> 1) | (((G) & 0xc0) >> 6) | (((G) & 0x38) << 10) | (((R) & 0xf8) << 5) |	\
diff --git a/mesalib/src/mesa/main/config.h b/mesalib/src/mesa/main/config.h
index 91aef90b7..7b7740ebe 100644
--- a/mesalib/src/mesa/main/config.h
+++ b/mesalib/src/mesa/main/config.h
@@ -61,8 +61,11 @@
 /** Maximum number of lights */
 #define MAX_LIGHTS 8
 
-/** Maximum user-defined clipping planes */
-#define MAX_CLIP_PLANES 6
+/**
+ * Maximum number of user-defined clipping planes supported by any driver in
+ * Mesa.  This is used to size arrays.
+ */
+#define MAX_CLIP_PLANES 8
 
 /** Maximum pixel map lookup table size */
 #define MAX_PIXEL_MAP_TABLE 256
@@ -329,7 +332,7 @@
 
 
 /**
- * Bits per color channel:  8, 16 or 32
+ * For swrast, bits per color channel:  8, 16 or 32
  */
 #ifndef CHAN_BITS
 #define CHAN_BITS 8
diff --git a/mesalib/src/mesa/main/context.c b/mesalib/src/mesa/main/context.c
index 0cf794735..b20063c33 100644
--- a/mesalib/src/mesa/main/context.c
+++ b/mesalib/src/mesa/main/context.c
@@ -582,7 +582,7 @@ _mesa_init_constants(struct gl_context *ctx)
    ctx->Const.MaxLineWidthAA = MAX_LINE_WIDTH;
    ctx->Const.LineWidthGranularity = (GLfloat) LINE_WIDTH_GRANULARITY;
    ctx->Const.MaxColorTableSize = MAX_COLOR_TABLE_SIZE;
-   ctx->Const.MaxClipPlanes = MAX_CLIP_PLANES;
+   ctx->Const.MaxClipPlanes = 6;
    ctx->Const.MaxLights = MAX_LIGHTS;
    ctx->Const.MaxShininess = 128.0;
    ctx->Const.MaxSpotExponent = 128.0;
diff --git a/mesalib/src/mesa/main/debug.c b/mesalib/src/mesa/main/debug.c
index 2bb37452d..0a393e5fa 100644
--- a/mesalib/src/mesa/main/debug.c
+++ b/mesalib/src/mesa/main/debug.c
@@ -567,9 +567,6 @@ _mesa_dump_image(const char *filename, const void *image, GLuint w, GLuint h,
 void
 _mesa_print_texture(struct gl_context *ctx, struct gl_texture_image *img)
 {
-#if CHAN_TYPE != GL_UNSIGNED_BYTE
-   _mesa_problem(NULL, "PrintTexture not supported");
-#else
    const GLint slice = 0;
    GLint srcRowStride;
    GLuint i, j, c;
@@ -626,5 +623,4 @@ _mesa_print_texture(struct gl_context *ctx, struct gl_texture_image *img)
    }
 
    ctx->Driver.UnmapTextureImage(ctx, img, slice);
-#endif
 }
diff --git a/mesalib/src/mesa/main/dlist.c b/mesalib/src/mesa/main/dlist.c
index 9bba52129..f11dae9d0 100644
--- a/mesalib/src/mesa/main/dlist.c
+++ b/mesalib/src/mesa/main/dlist.c
@@ -34,6 +34,7 @@
 #include "api_arrayelt.h"
 #include "api_exec.h"
 #include "api_loopback.h"
+#include "api_validate.h"
 #if FEATURE_ATI_fragment_shader
 #include "atifragshader.h"
 #endif
@@ -5762,8 +5763,8 @@ save_Begin(GLenum mode)
    Node *n;
    GLboolean error = GL_FALSE;
 
-   if ( /*mode < GL_POINTS || */ mode > GL_POLYGON) {
-      _mesa_compile_error(ctx, GL_INVALID_ENUM, "Begin (mode)");
+   if (!_mesa_valid_prim_mode(ctx, mode)) {
+      _mesa_compile_error(ctx, GL_INVALID_ENUM, "glBegin(mode)");
       error = GL_TRUE;
    }
    else if (ctx->Driver.CurrentSavePrimitive == PRIM_UNKNOWN) {
diff --git a/mesalib/src/mesa/main/macros.h b/mesalib/src/mesa/main/macros.h
index 01e4d20af..2a849e36a 100644
--- a/mesalib/src/mesa/main/macros.h
+++ b/mesalib/src/mesa/main/macros.h
@@ -175,10 +175,6 @@ extern GLfloat _mesa_ubyte_to_float_color_tab[256];
 #define STRIDE_4UB(p, i)  (p = (GLubyte (*)[4])((GLubyte *)p + i))
 /** Stepping a GLfloat[4] pointer by a byte stride */
 #define STRIDE_4F(p, i)  (p = (GLfloat (*)[4])((GLubyte *)p + i))
-/** Stepping a GLchan[4] pointer by a byte stride */
-#define STRIDE_4CHAN(p, i)  (p = (GLchan (*)[4])((GLubyte *)p + i))
-/** Stepping a GLchan pointer by a byte stride */
-#define STRIDE_CHAN(p, i)  (p = (GLchan *)((GLubyte *)p + i))
 /** Stepping a \p t pointer by a byte stride */
 #define STRIDE_T(p, t, i)  (p = (t)((GLubyte *)p + i))
 
@@ -602,14 +598,6 @@ do {                        \
    UNCLAMPED_FLOAT_TO_UBYTE( dstub, dstf ); \
 } while (0)
 
-#define INTERP_CHAN( t, dstc, outc, inc )   \
-do {                        \
-   GLfloat inf = CHAN_TO_FLOAT( inc );      \
-   GLfloat outf = CHAN_TO_FLOAT( outc );    \
-   GLfloat dstf = LINTERP( t, outf, inf );  \
-   UNCLAMPED_FLOAT_TO_CHAN( dstc, dstf );   \
-} while (0)
-
 #define INTERP_UI( t, dstui, outui, inui )  \
    dstui = (GLuint) (GLint) LINTERP( (t), (GLfloat) (outui), (GLfloat) (inui) )
 
@@ -631,21 +619,6 @@ do {                        \
    dst[2] = LINTERP( (t), (out)[2], (in)[2] );  \
 } while (0)
 
-#define INTERP_4CHAN( t, dst, out, in )         \
-do {                            \
-   INTERP_CHAN( (t), (dst)[0], (out)[0], (in)[0] ); \
-   INTERP_CHAN( (t), (dst)[1], (out)[1], (in)[1] ); \
-   INTERP_CHAN( (t), (dst)[2], (out)[2], (in)[2] ); \
-   INTERP_CHAN( (t), (dst)[3], (out)[3], (in)[3] ); \
-} while (0)
-
-#define INTERP_3CHAN( t, dst, out, in )         \
-do {                            \
-   INTERP_CHAN( (t), (dst)[0], (out)[0], (in)[0] ); \
-   INTERP_CHAN( (t), (dst)[1], (out)[1], (in)[1] ); \
-   INTERP_CHAN( (t), (dst)[2], (out)[2], (in)[2] ); \
-} while (0)
-
 #define INTERP_SZ( t, vec, to, out, in, sz )                \
 do {                                    \
    switch (sz) {                            \
diff --git a/mesalib/src/mesa/main/matrix.c b/mesalib/src/mesa/main/matrix.c
index 2579b7384..f479a22b0 100644
--- a/mesalib/src/mesa/main/matrix.c
+++ b/mesalib/src/mesa/main/matrix.c
@@ -1,789 +1,789 @@
-/*
- * Mesa 3-D graphics library
- * Version:  7.5
- *
- * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
- * Copyright (C) 2009  VMware, Inc.  All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-
-/**
- * \file matrix.c
- * Matrix operations.
- *
- * \note
- * -# 4x4 transformation matrices are stored in memory in column major order.
- * -# Points/vertices are to be thought of as column vectors.
- * -# Transformation of a point p by a matrix M is: p' = M * p
- */
-
-
-#include "glheader.h"
-#include "imports.h"
-#include "context.h"
-#include "enums.h"
-#include "macros.h"
-#include "mfeatures.h"
-#include "matrix.h"
-#include "mtypes.h"
-#include "math/m_matrix.h"
-
-
-/**
- * Apply a perspective projection matrix.
- *
- * \param left left clipping plane coordinate.
- * \param right right clipping plane coordinate.
- * \param bottom bottom clipping plane coordinate.
- * \param top top clipping plane coordinate.
- * \param nearval distance to the near clipping plane.
- * \param farval distance to the far clipping plane.
- *
- * \sa glFrustum().
- *
- * Flushes vertices and validates parameters. Calls _math_matrix_frustum() with
- * the top matrix of the current matrix stack and sets
- * __struct gl_contextRec::NewState.
- */
-void GLAPIENTRY
-_mesa_Frustum( GLdouble left, GLdouble right,
-               GLdouble bottom, GLdouble top,
-               GLdouble nearval, GLdouble farval )
-{
-   GET_CURRENT_CONTEXT(ctx);
-   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
-
-   if (nearval <= 0.0 ||
-       farval <= 0.0 ||
-       nearval == farval ||
-       left == right ||
-       top == bottom)
-   {
-      _mesa_error( ctx,  GL_INVALID_VALUE, "glFrustum" );
-      return;
-   }
-
-   _math_matrix_frustum( ctx->CurrentStack->Top,
-                         (GLfloat) left, (GLfloat) right, 
-			 (GLfloat) bottom, (GLfloat) top, 
-			 (GLfloat) nearval, (GLfloat) farval );
-   ctx->NewState |= ctx->CurrentStack->DirtyFlag;
-}
-
-
-/**
- * Apply an orthographic projection matrix.
- *
- * \param left left clipping plane coordinate.
- * \param right right clipping plane coordinate.
- * \param bottom bottom clipping plane coordinate.
- * \param top top clipping plane coordinate.
- * \param nearval distance to the near clipping plane.
- * \param farval distance to the far clipping plane.
- *
- * \sa glOrtho().
- *
- * Flushes vertices and validates parameters. Calls _math_matrix_ortho() with
- * the top matrix of the current matrix stack and sets
- * __struct gl_contextRec::NewState.
- */
-void GLAPIENTRY
-_mesa_Ortho( GLdouble left, GLdouble right,
-             GLdouble bottom, GLdouble top,
-             GLdouble nearval, GLdouble farval )
-{
-   GET_CURRENT_CONTEXT(ctx);
-   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
-
-   if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glOrtho(%f, %f, %f, %f, %f, %f)\n",
-                  left, right, bottom, top, nearval, farval);
-
-   if (left == right ||
-       bottom == top ||
-       nearval == farval)
-   {
-      _mesa_error( ctx,  GL_INVALID_VALUE, "glOrtho" );
-      return;
-   }
-
-   _math_matrix_ortho( ctx->CurrentStack->Top,
-                       (GLfloat) left, (GLfloat) right, 
-		       (GLfloat) bottom, (GLfloat) top, 
-		       (GLfloat) nearval, (GLfloat) farval );
-   ctx->NewState |= ctx->CurrentStack->DirtyFlag;
-}
-
-
-/**
- * Set the current matrix stack.
- *
- * \param mode matrix stack.
- *
- * \sa glMatrixMode().
- *
- * Flushes the vertices, validates the parameter and updates
- * __struct gl_contextRec::CurrentStack and gl_transform_attrib::MatrixMode
- * with the specified matrix stack.
- */
-void GLAPIENTRY
-_mesa_MatrixMode( GLenum mode )
-{
-   GET_CURRENT_CONTEXT(ctx);
-   ASSERT_OUTSIDE_BEGIN_END(ctx);
-
-   if (ctx->Transform.MatrixMode == mode && mode != GL_TEXTURE)
-      return;
-   FLUSH_VERTICES(ctx, _NEW_TRANSFORM);
-
-   switch (mode) {
-   case GL_MODELVIEW:
-      ctx->CurrentStack = &ctx->ModelviewMatrixStack;
-      break;
-   case GL_PROJECTION:
-      ctx->CurrentStack = &ctx->ProjectionMatrixStack;
-      break;
-   case GL_TEXTURE:
-      /* This error check is disabled because if we're called from
-       * glPopAttrib() when the active texture unit is >= MaxTextureCoordUnits
-       * we'll generate an unexpected error.
-       * From the GL_ARB_vertex_shader spec it sounds like we should instead
-       * do error checking in other places when we actually try to access
-       * texture matrices beyond MaxTextureCoordUnits.
-       */
-#if 0
-      if (ctx->Texture.CurrentUnit >= ctx->Const.MaxTextureCoordUnits) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glMatrixMode(invalid tex unit %d)",
-                     ctx->Texture.CurrentUnit);
-         return;
-      }
-#endif
-      ASSERT(ctx->Texture.CurrentUnit < Elements(ctx->TextureMatrixStack));
-      ctx->CurrentStack = &ctx->TextureMatrixStack[ctx->Texture.CurrentUnit];
-      break;
-   case GL_MATRIX0_NV:
-   case GL_MATRIX1_NV:
-   case GL_MATRIX2_NV:
-   case GL_MATRIX3_NV:
-   case GL_MATRIX4_NV:
-   case GL_MATRIX5_NV:
-   case GL_MATRIX6_NV:
-   case GL_MATRIX7_NV:
-      if (ctx->Extensions.NV_vertex_program) {
-         ctx->CurrentStack = &ctx->ProgramMatrixStack[mode - GL_MATRIX0_NV];
-      }
-      else {
-         _mesa_error( ctx,  GL_INVALID_ENUM, "glMatrixMode(mode)" );
-         return;
-      }
-      break;
-   case GL_MATRIX0_ARB:
-   case GL_MATRIX1_ARB:
-   case GL_MATRIX2_ARB:
-   case GL_MATRIX3_ARB:
-   case GL_MATRIX4_ARB:
-   case GL_MATRIX5_ARB:
-   case GL_MATRIX6_ARB:
-   case GL_MATRIX7_ARB:
-      if (ctx->Extensions.ARB_vertex_program ||
-          ctx->Extensions.ARB_fragment_program) {
-         const GLuint m = mode - GL_MATRIX0_ARB;
-         if (m > ctx->Const.MaxProgramMatrices) {
-            _mesa_error(ctx, GL_INVALID_ENUM,
-                        "glMatrixMode(GL_MATRIX%d_ARB)", m);
-            return;
-         }
-         ctx->CurrentStack = &ctx->ProgramMatrixStack[m];
-      }
-      else {
-         _mesa_error( ctx,  GL_INVALID_ENUM, "glMatrixMode(mode)" );
-         return;
-      }
-      break;
-   default:
-      _mesa_error( ctx,  GL_INVALID_ENUM, "glMatrixMode(mode)" );
-      return;
-   }
-
-   ctx->Transform.MatrixMode = mode;
-}
-
-
-/**
- * Push the current matrix stack.
- *
- * \sa glPushMatrix().
- * 
- * Verifies the current matrix stack is not full, and duplicates the top-most
- * matrix in the stack.
- * Marks __struct gl_contextRec::NewState with the stack dirty flag.
- */
-void GLAPIENTRY
-_mesa_PushMatrix( void )
-{
-   GET_CURRENT_CONTEXT(ctx);
-   struct gl_matrix_stack *stack = ctx->CurrentStack;
-   ASSERT_OUTSIDE_BEGIN_END(ctx);
-
-   if (MESA_VERBOSE&VERBOSE_API)
-      _mesa_debug(ctx, "glPushMatrix %s\n",
-                  _mesa_lookup_enum_by_nr(ctx->Transform.MatrixMode));
-
-   if (stack->Depth + 1 >= stack->MaxDepth) {
-      if (ctx->Transform.MatrixMode == GL_TEXTURE) {
-         _mesa_error(ctx,  GL_STACK_OVERFLOW,
-                     "glPushMatrix(mode=GL_TEXTURE, unit=%d)",
-                      ctx->Texture.CurrentUnit);
-      }
-      else {
-         _mesa_error(ctx,  GL_STACK_OVERFLOW, "glPushMatrix(mode=%s)",
-                     _mesa_lookup_enum_by_nr(ctx->Transform.MatrixMode));
-      }
-      return;
-   }
-   _math_matrix_copy( &stack->Stack[stack->Depth + 1],
-                      &stack->Stack[stack->Depth] );
-   stack->Depth++;
-   stack->Top = &(stack->Stack[stack->Depth]);
-   ctx->NewState |= stack->DirtyFlag;
-}
-
-
-/**
- * Pop the current matrix stack.
- *
- * \sa glPopMatrix().
- * 
- * Flushes the vertices, verifies the current matrix stack is not empty, and
- * moves the stack head down.
- * Marks __struct gl_contextRec::NewState with the dirty stack flag.
- */
-void GLAPIENTRY
-_mesa_PopMatrix( void )
-{
-   GET_CURRENT_CONTEXT(ctx);
-   struct gl_matrix_stack *stack = ctx->CurrentStack;
-   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
-
-   if (MESA_VERBOSE&VERBOSE_API)
-      _mesa_debug(ctx, "glPopMatrix %s\n",
-                  _mesa_lookup_enum_by_nr(ctx->Transform.MatrixMode));
-
-   if (stack->Depth == 0) {
-      if (ctx->Transform.MatrixMode == GL_TEXTURE) {
-         _mesa_error(ctx,  GL_STACK_UNDERFLOW,
-                     "glPopMatrix(mode=GL_TEXTURE, unit=%d)",
-                      ctx->Texture.CurrentUnit);
-      }
-      else {
-         _mesa_error(ctx,  GL_STACK_UNDERFLOW, "glPopMatrix(mode=%s)",
-                     _mesa_lookup_enum_by_nr(ctx->Transform.MatrixMode));
-      }
-      return;
-   }
-   stack->Depth--;
-   stack->Top = &(stack->Stack[stack->Depth]);
-   ctx->NewState |= stack->DirtyFlag;
-}
-
-
-/**
- * Replace the current matrix with the identity matrix.
- *
- * \sa glLoadIdentity().
- *
- * Flushes the vertices and calls _math_matrix_set_identity() with the
- * top-most matrix in the current stack.
- * Marks __struct gl_contextRec::NewState with the stack dirty flag.
- */
-void GLAPIENTRY
-_mesa_LoadIdentity( void )
-{
-   GET_CURRENT_CONTEXT(ctx);
-   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
-
-   if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glLoadIdentity()\n");
-
-   _math_matrix_set_identity( ctx->CurrentStack->Top );
-   ctx->NewState |= ctx->CurrentStack->DirtyFlag;
-}
-
-
-/**
- * Replace the current matrix with a given matrix.
- *
- * \param m matrix.
- *
- * \sa glLoadMatrixf().
- *
- * Flushes the vertices and calls _math_matrix_loadf() with the top-most
- * matrix in the current stack and the given matrix.
- * Marks __struct gl_contextRec::NewState with the dirty stack flag.
- */
-void GLAPIENTRY
-_mesa_LoadMatrixf( const GLfloat *m )
-{
-   GET_CURRENT_CONTEXT(ctx);
-   if (!m) return;
-   if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx,
-          "glLoadMatrix(%f %f %f %f, %f %f %f %f, %f %f %f %f, %f %f %f %f\n",
-          m[0], m[4], m[8], m[12],
-          m[1], m[5], m[9], m[13],
-          m[2], m[6], m[10], m[14],
-          m[3], m[7], m[11], m[15]);
-
-   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
-   _math_matrix_loadf( ctx->CurrentStack->Top, m );
-   ctx->NewState |= ctx->CurrentStack->DirtyFlag;
-}
-
-
-/**
- * Multiply the current matrix with a given matrix.
- *
- * \param m matrix.
- *
- * \sa glMultMatrixf().
- *
- * Flushes the vertices and calls _math_matrix_mul_floats() with the top-most
- * matrix in the current stack and the given matrix. Marks
- * __struct gl_contextRec::NewState with the dirty stack flag.
- */
-void GLAPIENTRY
-_mesa_MultMatrixf( const GLfloat *m )
-{
-   GET_CURRENT_CONTEXT(ctx);
-   if (!m) return;
-   if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx,
-          "glMultMatrix(%f %f %f %f, %f %f %f %f, %f %f %f %f, %f %f %f %f\n",
-          m[0], m[4], m[8], m[12],
-          m[1], m[5], m[9], m[13],
-          m[2], m[6], m[10], m[14],
-          m[3], m[7], m[11], m[15]);
-   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
-   _math_matrix_mul_floats( ctx->CurrentStack->Top, m );
-   ctx->NewState |= ctx->CurrentStack->DirtyFlag;
-}
-
-
-/**
- * Multiply the current matrix with a rotation matrix.
- *
- * \param angle angle of rotation, in degrees.
- * \param x rotation vector x coordinate.
- * \param y rotation vector y coordinate.
- * \param z rotation vector z coordinate.
- *
- * \sa glRotatef().
- *
- * Flushes the vertices and calls _math_matrix_rotate() with the top-most
- * matrix in the current stack and the given parameters. Marks
- * __struct gl_contextRec::NewState with the dirty stack flag.
- */
-void GLAPIENTRY
-_mesa_Rotatef( GLfloat angle, GLfloat x, GLfloat y, GLfloat z )
-{
-   GET_CURRENT_CONTEXT(ctx);
-   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
-   if (angle != 0.0F) {
-      _math_matrix_rotate( ctx->CurrentStack->Top, angle, x, y, z);
-      ctx->NewState |= ctx->CurrentStack->DirtyFlag;
-   }
-}
-
-
-/**
- * Multiply the current matrix with a general scaling matrix.
- *
- * \param x x axis scale factor.
- * \param y y axis scale factor.
- * \param z z axis scale factor.
- *
- * \sa glScalef().
- *
- * Flushes the vertices and calls _math_matrix_scale() with the top-most
- * matrix in the current stack and the given parameters. Marks
- * __struct gl_contextRec::NewState with the dirty stack flag.
- */
-void GLAPIENTRY
-_mesa_Scalef( GLfloat x, GLfloat y, GLfloat z )
-{
-   GET_CURRENT_CONTEXT(ctx);
-   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
-   _math_matrix_scale( ctx->CurrentStack->Top, x, y, z);
-   ctx->NewState |= ctx->CurrentStack->DirtyFlag;
-}
-
-
-/**
- * Multiply the current matrix with a translation matrix.
- *
- * \param x translation vector x coordinate.
- * \param y translation vector y coordinate.
- * \param z translation vector z coordinate.
- *
- * \sa glTranslatef().
- *
- * Flushes the vertices and calls _math_matrix_translate() with the top-most
- * matrix in the current stack and the given parameters. Marks
- * __struct gl_contextRec::NewState with the dirty stack flag.
- */
-void GLAPIENTRY
-_mesa_Translatef( GLfloat x, GLfloat y, GLfloat z )
-{
-   GET_CURRENT_CONTEXT(ctx);
-   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
-   _math_matrix_translate( ctx->CurrentStack->Top, x, y, z);
-   ctx->NewState |= ctx->CurrentStack->DirtyFlag;
-}
-
- 
-#if _HAVE_FULL_GL
-void GLAPIENTRY
-_mesa_LoadMatrixd( const GLdouble *m )
-{
-   GLint i;
-   GLfloat f[16];
-   if (!m) return;
-   for (i = 0; i < 16; i++)
-      f[i] = (GLfloat) m[i];
-   _mesa_LoadMatrixf(f);
-}
-
-void GLAPIENTRY
-_mesa_MultMatrixd( const GLdouble *m )
-{
-   GLint i;
-   GLfloat f[16];
-   if (!m) return;
-   for (i = 0; i < 16; i++)
-      f[i] = (GLfloat) m[i];
-   _mesa_MultMatrixf( f );
-}
-
-
-void GLAPIENTRY
-_mesa_Rotated( GLdouble angle, GLdouble x, GLdouble y, GLdouble z )
-{
-   _mesa_Rotatef((GLfloat) angle, (GLfloat) x, (GLfloat) y, (GLfloat) z);
-}
-
-
-void GLAPIENTRY
-_mesa_Scaled( GLdouble x, GLdouble y, GLdouble z )
-{
-   _mesa_Scalef((GLfloat) x, (GLfloat) y, (GLfloat) z);
-}
-
-
-void GLAPIENTRY
-_mesa_Translated( GLdouble x, GLdouble y, GLdouble z )
-{
-   _mesa_Translatef((GLfloat) x, (GLfloat) y, (GLfloat) z);
-}
-#endif
-
-
-#if _HAVE_FULL_GL
-void GLAPIENTRY
-_mesa_LoadTransposeMatrixfARB( const GLfloat *m )
-{
-   GLfloat tm[16];
-   if (!m) return;
-   _math_transposef(tm, m);
-   _mesa_LoadMatrixf(tm);
-}
-
-
-void GLAPIENTRY
-_mesa_LoadTransposeMatrixdARB( const GLdouble *m )
-{
-   GLfloat tm[16];
-   if (!m) return;
-   _math_transposefd(tm, m);
-   _mesa_LoadMatrixf(tm);
-}
-
-
-void GLAPIENTRY
-_mesa_MultTransposeMatrixfARB( const GLfloat *m )
-{
-   GLfloat tm[16];
-   if (!m) return;
-   _math_transposef(tm, m);
-   _mesa_MultMatrixf(tm);
-}
-
-
-void GLAPIENTRY
-_mesa_MultTransposeMatrixdARB( const GLdouble *m )
-{
-   GLfloat tm[16];
-   if (!m) return;
-   _math_transposefd(tm, m);
-   _mesa_MultMatrixf(tm);
-}
-#endif
-
-
-
-/**********************************************************************/
-/** \name State management */
-/*@{*/
-
-
-/**
- * Update the projection matrix stack.
- *
- * \param ctx GL context.
- *
- * Calls _math_matrix_analyse() with the top-matrix of the projection matrix
- * stack, and recomputes user clip positions if necessary.
- * 
- * \note This routine references __struct gl_contextRec::Tranform attribute
- * values to compute userclip positions in clip space, but is only called on
- * _NEW_PROJECTION.  The _mesa_ClipPlane() function keeps these values up to
- * date across changes to the __struct gl_contextRec::Transform attributes.
- */
-static void
-update_projection( struct gl_context *ctx )
-{
-   _math_matrix_analyse( ctx->ProjectionMatrixStack.Top );
-
-#if FEATURE_userclip
-   /* Recompute clip plane positions in clipspace.  This is also done
-    * in _mesa_ClipPlane().
-    */
-   if (ctx->Transform.ClipPlanesEnabled) {
-      GLuint p;
-      for (p = 0; p < ctx->Const.MaxClipPlanes; p++) {
-	 if (ctx->Transform.ClipPlanesEnabled & (1 << p)) {
-	    _mesa_transform_vector( ctx->Transform._ClipUserPlane[p],
-				 ctx->Transform.EyeUserPlane[p],
-				 ctx->ProjectionMatrixStack.Top->inv );
-	 }
-      }
-   }
-#endif
-}
-
-
-/**
- * Calculate the combined modelview-projection matrix.
- *
- * \param ctx GL context.
- *
- * Multiplies the top matrices of the projection and model view stacks into
- * __struct gl_contextRec::_ModelProjectMatrix via _math_matrix_mul_matrix()
- * and analyzes the resulting matrix via _math_matrix_analyse().
- */
-static void
-calculate_model_project_matrix( struct gl_context *ctx )
-{
-   _math_matrix_mul_matrix( &ctx->_ModelProjectMatrix,
-                            ctx->ProjectionMatrixStack.Top,
-                            ctx->ModelviewMatrixStack.Top );
-
-   _math_matrix_analyse( &ctx->_ModelProjectMatrix );
-}
-
-
-/**
- * Updates the combined modelview-projection matrix.
- *
- * \param ctx GL context.
- * \param new_state new state bit mask.
- *
- * If there is a new model view matrix then analyzes it. If there is a new
- * projection matrix, updates it. Finally calls
- * calculate_model_project_matrix() to recalculate the modelview-projection
- * matrix.
- */
-void _mesa_update_modelview_project( struct gl_context *ctx, GLuint new_state )
-{
-   if (new_state & _NEW_MODELVIEW) {
-      _math_matrix_analyse( ctx->ModelviewMatrixStack.Top );
-    
-      /* Bring cull position up to date.
-       */
-      TRANSFORM_POINT3( ctx->Transform.CullObjPos, 
-			ctx->ModelviewMatrixStack.Top->inv,
-			ctx->Transform.CullEyePos );
-   }
-
-
-   if (new_state & _NEW_PROJECTION)
-      update_projection( ctx );
-
-   /* Keep ModelviewProject up to date always to allow tnl
-    * implementations that go model->clip even when eye is required.
-    */
-   calculate_model_project_matrix(ctx);
-}
-
-/*@}*/
-
-
-/**********************************************************************/
-/** Matrix stack initialization */
-/*@{*/
-
-
-/**
- * Initialize a matrix stack.
- *
- * \param stack matrix stack.
- * \param maxDepth maximum stack depth.
- * \param dirtyFlag dirty flag.
- * 
- * Allocates an array of \p maxDepth elements for the matrix stack and calls
- * _math_matrix_ctr() and _math_matrix_alloc_inv() for each element to
- * initialize it.
- */
-static void
-init_matrix_stack( struct gl_matrix_stack *stack,
-                   GLuint maxDepth, GLuint dirtyFlag )
-{
-   GLuint i;
-
-   stack->Depth = 0;
-   stack->MaxDepth = maxDepth;
-   stack->DirtyFlag = dirtyFlag;
-   /* The stack */
-   stack->Stack = (GLmatrix *) CALLOC(maxDepth * sizeof(GLmatrix));
-   for (i = 0; i < maxDepth; i++) {
-      _math_matrix_ctr(&stack->Stack[i]);
-      _math_matrix_alloc_inv(&stack->Stack[i]);
-   }
-   stack->Top = stack->Stack;
-}
-
-/**
- * Free matrix stack.
- * 
- * \param stack matrix stack.
- * 
- * Calls _math_matrix_dtr() for each element of the matrix stack and
- * frees the array.
- */
-static void
-free_matrix_stack( struct gl_matrix_stack *stack )
-{
-   GLuint i;
-   for (i = 0; i < stack->MaxDepth; i++) {
-      _math_matrix_dtr(&stack->Stack[i]);
-   }
-   FREE(stack->Stack);
-   stack->Stack = stack->Top = NULL;
-}
-
-/*@}*/
-
-
-/**********************************************************************/
-/** \name Initialization */
-/*@{*/
-
-
-/**
- * Initialize the context matrix data.
- *
- * \param ctx GL context.
- *
- * Initializes each of the matrix stacks and the combined modelview-projection
- * matrix.
- */
-void _mesa_init_matrix( struct gl_context * ctx )
-{
-   GLint i;
-
-   /* Initialize matrix stacks */
-   init_matrix_stack(&ctx->ModelviewMatrixStack, MAX_MODELVIEW_STACK_DEPTH,
-                     _NEW_MODELVIEW);
-   init_matrix_stack(&ctx->ProjectionMatrixStack, MAX_PROJECTION_STACK_DEPTH,
-                     _NEW_PROJECTION);
-   for (i = 0; i < Elements(ctx->TextureMatrixStack); i++)
-      init_matrix_stack(&ctx->TextureMatrixStack[i], MAX_TEXTURE_STACK_DEPTH,
-                        _NEW_TEXTURE_MATRIX);
-   for (i = 0; i < Elements(ctx->ProgramMatrixStack); i++)
-      init_matrix_stack(&ctx->ProgramMatrixStack[i], 
-		        MAX_PROGRAM_MATRIX_STACK_DEPTH, _NEW_TRACK_MATRIX);
-   ctx->CurrentStack = &ctx->ModelviewMatrixStack;
-
-   /* Init combined Modelview*Projection matrix */
-   _math_matrix_ctr( &ctx->_ModelProjectMatrix );
-}
-
-
-/**
- * Free the context matrix data.
- * 
- * \param ctx GL context.
- *
- * Frees each of the matrix stacks and the combined modelview-projection
- * matrix.
- */
-void _mesa_free_matrix_data( struct gl_context *ctx )
-{
-   GLint i;
-
-   free_matrix_stack(&ctx->ModelviewMatrixStack);
-   free_matrix_stack(&ctx->ProjectionMatrixStack);
-   for (i = 0; i < Elements(ctx->TextureMatrixStack); i++)
-      free_matrix_stack(&ctx->TextureMatrixStack[i]);
-   for (i = 0; i < Elements(ctx->ProgramMatrixStack); i++)
-      free_matrix_stack(&ctx->ProgramMatrixStack[i]);
-   /* combined Modelview*Projection matrix */
-   _math_matrix_dtr( &ctx->_ModelProjectMatrix );
-
-}
-
-
-/** 
- * Initialize the context transform attribute group.
- *
- * \param ctx GL context.
- *
- * \todo Move this to a new file with other 'transform' routines.
- */
-void _mesa_init_transform( struct gl_context *ctx )
-{
-   GLint i;
-
-   /* Transformation group */
-   ctx->Transform.MatrixMode = GL_MODELVIEW;
-   ctx->Transform.Normalize = GL_FALSE;
-   ctx->Transform.RescaleNormals = GL_FALSE;
-   ctx->Transform.RasterPositionUnclipped = GL_FALSE;
-   for (i=0;i<MAX_CLIP_PLANES;i++) {
-      ASSIGN_4V( ctx->Transform.EyeUserPlane[i], 0.0, 0.0, 0.0, 0.0 );
-   }
-   ctx->Transform.ClipPlanesEnabled = 0;
-
-   ASSIGN_4V( ctx->Transform.CullObjPos, 0.0, 0.0, 1.0, 0.0 );
-   ASSIGN_4V( ctx->Transform.CullEyePos, 0.0, 0.0, 1.0, 0.0 );
-}
-
-
-/*@}*/
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.5
+ *
+ * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
+ * Copyright (C) 2009  VMware, Inc.  All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+/**
+ * \file matrix.c
+ * Matrix operations.
+ *
+ * \note
+ * -# 4x4 transformation matrices are stored in memory in column major order.
+ * -# Points/vertices are to be thought of as column vectors.
+ * -# Transformation of a point p by a matrix M is: p' = M * p
+ */
+
+
+#include "glheader.h"
+#include "imports.h"
+#include "context.h"
+#include "enums.h"
+#include "macros.h"
+#include "mfeatures.h"
+#include "matrix.h"
+#include "mtypes.h"
+#include "math/m_matrix.h"
+
+
+/**
+ * Apply a perspective projection matrix.
+ *
+ * \param left left clipping plane coordinate.
+ * \param right right clipping plane coordinate.
+ * \param bottom bottom clipping plane coordinate.
+ * \param top top clipping plane coordinate.
+ * \param nearval distance to the near clipping plane; must be positive.
+ * \param farval distance to the far clipping plane; must be positive.
+ *
+ * \sa glFrustum().
+ *
+ * After the begin/end check and flush, a non-positive distance or coinciding
+ * plane pair records GL_INVALID_VALUE. Otherwise _math_matrix_frustum() is
+ * applied to the current stack top and its DirtyFlag is OR'd into NewState.
+ */
+void GLAPIENTRY
+_mesa_Frustum( GLdouble left, GLdouble right,
+               GLdouble bottom, GLdouble top,
+               GLdouble nearval, GLdouble farval )
+{
+   GET_CURRENT_CONTEXT(ctx);
+   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
+
+   if (nearval <= 0.0 ||
+       farval <= 0.0 ||
+       nearval == farval ||
+       left == right ||
+       top == bottom)
+   {
+      _mesa_error( ctx,  GL_INVALID_VALUE, "glFrustum" );
+      return;
+   }
+
+   _math_matrix_frustum( ctx->CurrentStack->Top,
+                         (GLfloat) left, (GLfloat) right, 
+			 (GLfloat) bottom, (GLfloat) top, 
+			 (GLfloat) nearval, (GLfloat) farval );
+   ctx->NewState |= ctx->CurrentStack->DirtyFlag;
+}
+
+
+/**
+ * Apply an orthographic projection matrix.
+ *
+ * \param left left clipping plane coordinate.
+ * \param right right clipping plane coordinate.
+ * \param bottom bottom clipping plane coordinate.
+ * \param top top clipping plane coordinate.
+ * \param nearval distance to the near clipping plane.
+ * \param farval distance to the far clipping plane.
+ *
+ * \sa glOrtho().
+ *
+ * After the begin/end check and flush, a coinciding plane pair records
+ * GL_INVALID_VALUE. Otherwise _math_matrix_ortho() is applied to the
+ * current stack top and its DirtyFlag is OR'd into ctx->NewState.
+ */
+void GLAPIENTRY
+_mesa_Ortho( GLdouble left, GLdouble right,
+             GLdouble bottom, GLdouble top,
+             GLdouble nearval, GLdouble farval )
+{
+   GET_CURRENT_CONTEXT(ctx);
+   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glOrtho(%f, %f, %f, %f, %f, %f)\n",
+                  left, right, bottom, top, nearval, farval);
+
+   if (left == right ||
+       bottom == top ||
+       nearval == farval)
+   {
+      _mesa_error( ctx,  GL_INVALID_VALUE, "glOrtho" );
+      return;
+   }
+
+   _math_matrix_ortho( ctx->CurrentStack->Top,
+                       (GLfloat) left, (GLfloat) right, 
+		       (GLfloat) bottom, (GLfloat) top, 
+		       (GLfloat) nearval, (GLfloat) farval );
+   ctx->NewState |= ctx->CurrentStack->DirtyFlag;
+}
+
+
+/**
+ * Select the matrix stack that subsequent matrix calls operate on.
+ *
+ * \param mode GL_MODELVIEW, GL_PROJECTION, GL_TEXTURE or GL_MATRIXi_NV/ARB.
+ *
+ * \sa glMatrixMode().
+ *
+ * Returns early if \p mode is already current (GL_TEXTURE excepted), else
+ * flushes vertices and updates ctx->CurrentStack and
+ * ctx->Transform.MatrixMode; unsupported modes record GL_INVALID_ENUM.
+ */
+void GLAPIENTRY
+_mesa_MatrixMode( GLenum mode )
+{
+   GET_CURRENT_CONTEXT(ctx);
+   ASSERT_OUTSIDE_BEGIN_END(ctx);
+
+   if (ctx->Transform.MatrixMode == mode && mode != GL_TEXTURE)
+      return;
+   FLUSH_VERTICES(ctx, _NEW_TRANSFORM);
+
+   switch (mode) {
+   case GL_MODELVIEW:
+      ctx->CurrentStack = &ctx->ModelviewMatrixStack;
+      break;
+   case GL_PROJECTION:
+      ctx->CurrentStack = &ctx->ProjectionMatrixStack;
+      break;
+   case GL_TEXTURE:
+      /* This error check is disabled because if we're called from
+       * glPopAttrib() when the active texture unit is >= MaxTextureCoordUnits
+       * we'll generate an unexpected error.
+       * From the GL_ARB_vertex_shader spec it sounds like we should instead
+       * do error checking in other places when we actually try to access
+       * texture matrices beyond MaxTextureCoordUnits.
+       */
+#if 0
+      if (ctx->Texture.CurrentUnit >= ctx->Const.MaxTextureCoordUnits) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "glMatrixMode(invalid tex unit %d)",
+                     ctx->Texture.CurrentUnit);
+         return;
+      }
+#endif
+      ASSERT(ctx->Texture.CurrentUnit < Elements(ctx->TextureMatrixStack));
+      ctx->CurrentStack = &ctx->TextureMatrixStack[ctx->Texture.CurrentUnit];
+      break;
+   case GL_MATRIX0_NV:
+   case GL_MATRIX1_NV:
+   case GL_MATRIX2_NV:
+   case GL_MATRIX3_NV:
+   case GL_MATRIX4_NV:
+   case GL_MATRIX5_NV:
+   case GL_MATRIX6_NV:
+   case GL_MATRIX7_NV:
+      if (ctx->Extensions.NV_vertex_program) {
+         ctx->CurrentStack = &ctx->ProgramMatrixStack[mode - GL_MATRIX0_NV];
+      }
+      else {
+         _mesa_error( ctx,  GL_INVALID_ENUM, "glMatrixMode(mode)" );
+         return;
+      }
+      break;
+   case GL_MATRIX0_ARB:
+   case GL_MATRIX1_ARB:
+   case GL_MATRIX2_ARB:
+   case GL_MATRIX3_ARB:
+   case GL_MATRIX4_ARB:
+   case GL_MATRIX5_ARB:
+   case GL_MATRIX6_ARB:
+   case GL_MATRIX7_ARB:
+      if (ctx->Extensions.ARB_vertex_program ||
+          ctx->Extensions.ARB_fragment_program) {
+         const GLuint m = mode - GL_MATRIX0_ARB;   /* zero-based index */
+         if (m >= ctx->Const.MaxProgramMatrices) {  /* valid: 0..Max-1 */
+            _mesa_error(ctx, GL_INVALID_ENUM,
+                        "glMatrixMode(GL_MATRIX%d_ARB)", m);
+            return;
+         }
+         ctx->CurrentStack = &ctx->ProgramMatrixStack[m];
+      }
+      else {
+         _mesa_error( ctx,  GL_INVALID_ENUM, "glMatrixMode(mode)" );
+         return;
+      }
+      break;
+   default:
+      _mesa_error( ctx,  GL_INVALID_ENUM, "glMatrixMode(mode)" );
+      return;
+   }
+
+   ctx->Transform.MatrixMode = mode;
+}
+
+
+/**
+ * Push the current matrix stack.
+ *
+ * \sa glPushMatrix().
+ *
+ * Records GL_STACK_OVERFLOW and returns if the current stack is full;
+ * otherwise copies the top matrix into the next slot and makes it the top.
+ * ORs the stack's DirtyFlag into ctx->NewState.
+ */
+void GLAPIENTRY
+_mesa_PushMatrix( void )
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_matrix_stack *stack = ctx->CurrentStack;
+   ASSERT_OUTSIDE_BEGIN_END(ctx);
+
+   if (MESA_VERBOSE&VERBOSE_API)
+      _mesa_debug(ctx, "glPushMatrix %s\n",
+                  _mesa_lookup_enum_by_nr(ctx->Transform.MatrixMode));
+
+   if (stack->Depth + 1 >= stack->MaxDepth) {
+      if (ctx->Transform.MatrixMode == GL_TEXTURE) {
+         _mesa_error(ctx,  GL_STACK_OVERFLOW,
+                     "glPushMatrix(mode=GL_TEXTURE, unit=%d)",
+                      ctx->Texture.CurrentUnit);
+      }
+      else {
+         _mesa_error(ctx,  GL_STACK_OVERFLOW, "glPushMatrix(mode=%s)",
+                     _mesa_lookup_enum_by_nr(ctx->Transform.MatrixMode));
+      }
+      return;
+   }
+   _math_matrix_copy( &stack->Stack[stack->Depth + 1],
+                      &stack->Stack[stack->Depth] );
+   stack->Depth++;
+   stack->Top = &(stack->Stack[stack->Depth]);
+   ctx->NewState |= stack->DirtyFlag;
+}
+
+
+/**
+ * Pop the current matrix stack.
+ *
+ * \sa glPopMatrix().
+ *
+ * Flushes the vertices, then records GL_STACK_UNDERFLOW and returns when
+ * Depth is already 0.  Otherwise decrements Depth, repoints Top at the new
+ * top element and ORs the stack's DirtyFlag into gl_context::NewState.
+ */
+void GLAPIENTRY
+_mesa_PopMatrix( void )
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_matrix_stack *stack = ctx->CurrentStack;
+   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
+
+   if (MESA_VERBOSE&VERBOSE_API)
+      _mesa_debug(ctx, "glPopMatrix %s\n",
+                  _mesa_lookup_enum_by_nr(ctx->Transform.MatrixMode));
+
+   if (stack->Depth == 0) {
+      if (ctx->Transform.MatrixMode == GL_TEXTURE) {
+         _mesa_error(ctx,  GL_STACK_UNDERFLOW,
+                     "glPopMatrix(mode=GL_TEXTURE, unit=%d)",
+                      ctx->Texture.CurrentUnit);
+      }
+      else {
+         _mesa_error(ctx,  GL_STACK_UNDERFLOW, "glPopMatrix(mode=%s)",
+                     _mesa_lookup_enum_by_nr(ctx->Transform.MatrixMode));
+      }
+      return;
+   }
+   stack->Depth--;
+   stack->Top = &(stack->Stack[stack->Depth]);
+   ctx->NewState |= stack->DirtyFlag;
+}
+
+
+/**
+ * Replace the current matrix with the identity matrix.
+ *
+ * \sa glLoadIdentity().
+ *
+ * Flushes the vertices and calls _math_matrix_set_identity() with the
+ * top-most matrix in the current stack.
+ * ORs the current stack's DirtyFlag into gl_context::NewState.
+ */
+void GLAPIENTRY
+_mesa_LoadIdentity( void )
+{
+   GET_CURRENT_CONTEXT(ctx);
+   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glLoadIdentity()\n");
+
+   _math_matrix_set_identity( ctx->CurrentStack->Top );
+   ctx->NewState |= ctx->CurrentStack->DirtyFlag;
+}
+
+
+/**
+ * Replace the current matrix with a given matrix.
+ *
+ * \param m pointer to 16 floats; a NULL pointer makes the call a no-op.
+ *
+ * \sa glLoadMatrixf().
+ *
+ * Flushes the vertices and calls _math_matrix_loadf() with the top-most
+ * matrix in the current stack and the given matrix.
+ * ORs the current stack's DirtyFlag into gl_context::NewState.
+ */
+void GLAPIENTRY
+_mesa_LoadMatrixf( const GLfloat *m )
+{
+   GET_CURRENT_CONTEXT(ctx);
+   if (!m) return;
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx,
+          "glLoadMatrix(%f %f %f %f, %f %f %f %f, %f %f %f %f, %f %f %f %f\n",
+          m[0], m[4], m[8], m[12],
+          m[1], m[5], m[9], m[13],
+          m[2], m[6], m[10], m[14],
+          m[3], m[7], m[11], m[15]);
+
+   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
+   _math_matrix_loadf( ctx->CurrentStack->Top, m );
+   ctx->NewState |= ctx->CurrentStack->DirtyFlag;
+}
+
+
+/**
+ * Multiply the current matrix with a given matrix.
+ *
+ * \param m pointer to 16 floats; a NULL pointer makes the call a no-op.
+ *
+ * \sa glMultMatrixf().
+ *
+ * Flushes the vertices and calls _math_matrix_mul_floats() with the top-most
+ * matrix in the current stack and the given matrix, then ORs that stack's
+ * DirtyFlag into gl_context::NewState.
+ */
+void GLAPIENTRY
+_mesa_MultMatrixf( const GLfloat *m )
+{
+   GET_CURRENT_CONTEXT(ctx);
+   if (!m) return;
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx,
+          "glMultMatrix(%f %f %f %f, %f %f %f %f, %f %f %f %f, %f %f %f %f\n",
+          m[0], m[4], m[8], m[12],
+          m[1], m[5], m[9], m[13],
+          m[2], m[6], m[10], m[14],
+          m[3], m[7], m[11], m[15]);
+   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
+   _math_matrix_mul_floats( ctx->CurrentStack->Top, m );
+   ctx->NewState |= ctx->CurrentStack->DirtyFlag;
+}
+
+
+/**
+ * Multiply the current matrix with a rotation matrix.
+ *
+ * \param angle angle of rotation, in degrees.
+ * \param x rotation vector x coordinate.
+ * \param y rotation vector y coordinate.
+ * \param z rotation vector z coordinate.
+ *
+ * \sa glRotatef().
+ *
+ * Flushes the vertices.  Unless \p angle is exactly 0, calls
+ * _math_matrix_rotate() on the top-most matrix of the current stack and ORs
+ * that stack's DirtyFlag into gl_context::NewState.
+ */
+void GLAPIENTRY
+_mesa_Rotatef( GLfloat angle, GLfloat x, GLfloat y, GLfloat z )
+{
+   GET_CURRENT_CONTEXT(ctx);
+   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
+   if (angle != 0.0F) {
+      _math_matrix_rotate( ctx->CurrentStack->Top, angle, x, y, z);
+      ctx->NewState |= ctx->CurrentStack->DirtyFlag;
+   }
+}
+
+
+/**
+ * Multiply the current matrix with a general scaling matrix.
+ *
+ * \param x x axis scale factor.
+ * \param y y axis scale factor.
+ * \param z z axis scale factor.
+ *
+ * \sa glScalef().
+ *
+ * Flushes the vertices and calls _math_matrix_scale() with the top-most
+ * matrix in the current stack and the given parameters, then ORs that
+ * stack's DirtyFlag into gl_context::NewState.
+ */
+void GLAPIENTRY
+_mesa_Scalef( GLfloat x, GLfloat y, GLfloat z )
+{
+   GET_CURRENT_CONTEXT(ctx);
+   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
+   _math_matrix_scale( ctx->CurrentStack->Top, x, y, z);
+   ctx->NewState |= ctx->CurrentStack->DirtyFlag;
+}
+
+
+/**
+ * Multiply the current matrix with a translation matrix.
+ *
+ * \param x translation vector x coordinate.
+ * \param y translation vector y coordinate.
+ * \param z translation vector z coordinate.
+ *
+ * \sa glTranslatef().
+ *
+ * Flushes the vertices and calls _math_matrix_translate() with the top-most
+ * matrix in the current stack and the given parameters, then ORs that
+ * stack's DirtyFlag into gl_context::NewState.
+ */
+void GLAPIENTRY
+_mesa_Translatef( GLfloat x, GLfloat y, GLfloat z )
+{
+   GET_CURRENT_CONTEXT(ctx);
+   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
+   _math_matrix_translate( ctx->CurrentStack->Top, x, y, z);
+   ctx->NewState |= ctx->CurrentStack->DirtyFlag;
+}
+
+ 
+#if _HAVE_FULL_GL
+void GLAPIENTRY
+_mesa_LoadMatrixd( const GLdouble *m )
+{
+   GLfloat narrowed[16];   /* float copy handed to _mesa_LoadMatrixf() */
+   GLint k = 16;
+   if (!m) return;
+   while (k-- > 0)
+      narrowed[k] = (GLfloat) m[k];
+   _mesa_LoadMatrixf(narrowed);
+}
+
+void GLAPIENTRY
+_mesa_MultMatrixd( const GLdouble *m )
+{
+   GLfloat narrowed[16];   /* float copy handed to _mesa_MultMatrixf() */
+   GLint k = 16;
+   if (!m) return;
+   while (k-- > 0)
+      narrowed[k] = (GLfloat) m[k];
+   _mesa_MultMatrixf( narrowed );
+}
+
+/** Casts each argument to GLfloat and forwards to _mesa_Rotatef(). */
+void GLAPIENTRY
+_mesa_Rotated( GLdouble angle, GLdouble x, GLdouble y, GLdouble z )
+{
+   _mesa_Rotatef((GLfloat) angle, (GLfloat) x, (GLfloat) y, (GLfloat) z);
+}
+
+/** Casts each argument to GLfloat and forwards to _mesa_Scalef(). */
+void GLAPIENTRY
+_mesa_Scaled( GLdouble x, GLdouble y, GLdouble z )
+{
+   _mesa_Scalef((GLfloat) x, (GLfloat) y, (GLfloat) z);
+}
+
+/** Casts each argument to GLfloat and forwards to _mesa_Translatef(). */
+void GLAPIENTRY
+_mesa_Translated( GLdouble x, GLdouble y, GLdouble z )
+{
+   _mesa_Translatef((GLfloat) x, (GLfloat) y, (GLfloat) z);
+}
+#endif
+
+
+#if _HAVE_FULL_GL
+void GLAPIENTRY
+_mesa_LoadTransposeMatrixfARB( const GLfloat *m )
+{
+   GLfloat tm[16];            /* scratch matrix filled by _math_transposef() */
+   if (!m) return;            /* NULL input: ignore the call */
+   _math_transposef(tm, m);
+   _mesa_LoadMatrixf(tm);     /* tm, not m, is what gets loaded */
+}
+
+
+void GLAPIENTRY
+_mesa_LoadTransposeMatrixdARB( const GLdouble *m )
+{
+   GLfloat tm[16];            /* float scratch filled by _math_transposefd() */
+   if (!m) return;            /* NULL input: ignore the call */
+   _math_transposefd(tm, m);
+   _mesa_LoadMatrixf(tm);     /* tm, not m, is what gets loaded */
+}
+
+
+void GLAPIENTRY
+_mesa_MultTransposeMatrixfARB( const GLfloat *m )
+{
+   GLfloat tm[16];            /* scratch matrix filled by _math_transposef() */
+   if (!m) return;            /* NULL input: ignore the call */
+   _math_transposef(tm, m);
+   _mesa_MultMatrixf(tm);     /* tm, not m, is what gets multiplied in */
+}
+
+
+void GLAPIENTRY
+_mesa_MultTransposeMatrixdARB( const GLdouble *m )
+{
+   GLfloat tm[16];            /* float scratch filled by _math_transposefd() */
+   if (!m) return;            /* NULL input: ignore the call */
+   _math_transposefd(tm, m);
+   _mesa_MultMatrixf(tm);     /* tm, not m, is what gets multiplied in */
+}
+#endif
+
+
+
+/**********************************************************************/
+/** \name State management */
+/*@{*/
+
+
+/**
+ * Refresh the state derived from the projection matrix.
+ *
+ * \param ctx GL context.
+ *
+ * Runs _math_matrix_analyse() on the top of the projection matrix stack and,
+ * when FEATURE_userclip is enabled, recomputes every enabled user clip plane.
+ *
+ * \note This routine reads gl_context::Transform attribute values to compute
+ * userclip positions in clip space, but is only called on _NEW_PROJECTION.
+ * The _mesa_ClipPlane() function keeps these values up to date across
+ * changes to the gl_context::Transform attributes.
+ */
+static void
+update_projection( struct gl_context *ctx )
+{
+   GLmatrix *projection = ctx->ProjectionMatrixStack.Top;
+   _math_matrix_analyse( projection );
+
+#if FEATURE_userclip
+   /* Mirror of the work done in _mesa_ClipPlane(): refresh the clip-space
+    * position of each enabled user plane. */
+   if (ctx->Transform.ClipPlanesEnabled) {
+      GLuint plane;
+      for (plane = 0; plane < ctx->Const.MaxClipPlanes; plane++) {
+         if (!(ctx->Transform.ClipPlanesEnabled & (1 << plane)))
+            continue;
+         _mesa_transform_vector( ctx->Transform._ClipUserPlane[plane],
+                                 ctx->Transform.EyeUserPlane[plane],
+                                 projection->inv );
+      }
+   }
+#endif
+}
+
+
+/**
+ * Rebuild the combined modelview-projection matrix.
+ *
+ * \param ctx GL context.
+ *
+ * Stores the product of the projection and modelview stack tops in
+ * gl_context::_ModelProjectMatrix using _math_matrix_mul_matrix(), then runs
+ * _math_matrix_analyse() on the result.
+ */
+static void
+calculate_model_project_matrix( struct gl_context *ctx )
+{
+   GLmatrix *combined = &ctx->_ModelProjectMatrix;
+   _math_matrix_mul_matrix( combined,
+                            ctx->ProjectionMatrixStack.Top,
+                            ctx->ModelviewMatrixStack.Top );
+   _math_matrix_analyse( combined );
+}
+
+
+/**
+ * Bring the matrix-derived state in line with the dirty bits.
+ *
+ * \param ctx GL context.
+ * \param new_state new state bit mask.
+ *
+ * With _NEW_MODELVIEW set, analyzes the modelview top and recomputes the
+ * cull position.  With _NEW_PROJECTION set, calls update_projection().  In
+ * every case finishes with calculate_model_project_matrix() to rebuild the
+ * combined modelview-projection matrix.
+ */
+void _mesa_update_modelview_project( struct gl_context *ctx, GLuint new_state )
+{
+   const GLuint modelviewDirty = new_state & _NEW_MODELVIEW;
+   const GLuint projectionDirty = new_state & _NEW_PROJECTION;
+
+   if (modelviewDirty) {
+      GLmatrix *modelview = ctx->ModelviewMatrixStack.Top;
+      _math_matrix_analyse( modelview );
+      /* The cull position is recomputed from the modelview inverse. */
+      TRANSFORM_POINT3( ctx->Transform.CullObjPos,
+                        modelview->inv,
+                        ctx->Transform.CullEyePos );
+   }
+   if (projectionDirty) {
+      update_projection( ctx );
+   }
+
+   /* The combined matrix is refreshed unconditionally so that tnl
+    * implementations can go model->clip even when eye is required. */
+   calculate_model_project_matrix(ctx);
+}
+
+/*@}*/
+
+
+/**********************************************************************/
+/** \name Matrix stack initialization */
+/*@{*/
+
+
+/**
+ * Initialize a matrix stack.
+ *
+ * \param stack matrix stack.
+ * \param maxDepth maximum stack depth.
+ * \param dirtyFlag dirty flag.
+ *
+ * Allocates \p maxDepth elements with CALLOC() and runs _math_matrix_ctr()
+ * and _math_matrix_alloc_inv() on each.  If the allocation fails the stack
+ * is left empty (Stack and Top NULL, MaxDepth 0) instead of writing to NULL.
+ */
+static void
+init_matrix_stack( struct gl_matrix_stack *stack,
+                   GLuint maxDepth, GLuint dirtyFlag )
+{
+   GLuint i;
+
+   stack->Depth = 0;
+   stack->DirtyFlag = dirtyFlag;
+   stack->Stack = (GLmatrix *) CALLOC(maxDepth * sizeof(GLmatrix));
+   /* MaxDepth 0 on failure keeps every MaxDepth-bounded loop off NULL. */
+   stack->MaxDepth = stack->Stack ? maxDepth : 0;
+   for (i = 0; i < stack->MaxDepth; i++) {
+      _math_matrix_ctr(&stack->Stack[i]);
+      _math_matrix_alloc_inv(&stack->Stack[i]);
+   }
+   stack->Top = stack->Stack;
+}
+
+/**
+ * Release the storage owned by a matrix stack.
+ *
+ * \param stack matrix stack.
+ *
+ * Runs _math_matrix_dtr() on all MaxDepth elements, frees the element array
+ * and resets both Stack and Top to NULL.
+ */
+static void
+free_matrix_stack( struct gl_matrix_stack *stack )
+{
+   GLuint slot;
+   for (slot = 0; slot != stack->MaxDepth; ++slot)
+      _math_matrix_dtr(&stack->Stack[slot]);
+   FREE(stack->Stack);
+   stack->Top = NULL;
+   stack->Stack = NULL;
+}
+
+/*@}*/
+
+
+/**********************************************************************/
+/** \name Initialization */
+/*@{*/
+
+
+/**
+ * Initialize the context matrix data.
+ *
+ * \param ctx GL context.
+ *
+ * Initializes every matrix stack, makes the modelview stack current and
+ * constructs the combined modelview-projection matrix.
+ */
+void _mesa_init_matrix( struct gl_context * ctx )
+{
+   GLuint i;   /* unsigned: Elements() is assumed to yield an unsigned count */
+
+   /* Initialize matrix stacks */
+   init_matrix_stack(&ctx->ModelviewMatrixStack, MAX_MODELVIEW_STACK_DEPTH,
+                     _NEW_MODELVIEW);
+   init_matrix_stack(&ctx->ProjectionMatrixStack, MAX_PROJECTION_STACK_DEPTH,
+                     _NEW_PROJECTION);
+   for (i = 0; i < Elements(ctx->TextureMatrixStack); i++)
+      init_matrix_stack(&ctx->TextureMatrixStack[i], MAX_TEXTURE_STACK_DEPTH,
+                        _NEW_TEXTURE_MATRIX);
+   for (i = 0; i < Elements(ctx->ProgramMatrixStack); i++)
+      init_matrix_stack(&ctx->ProgramMatrixStack[i],
+                        MAX_PROGRAM_MATRIX_STACK_DEPTH, _NEW_TRACK_MATRIX);
+   ctx->CurrentStack = &ctx->ModelviewMatrixStack;
+
+   /* Init combined Modelview*Projection matrix */
+   _math_matrix_ctr( &ctx->_ModelProjectMatrix );
+}
+
+
+/**
+ * Free the context matrix data.
+ *
+ * \param ctx GL context.
+ *
+ * Frees each of the matrix stacks and the combined modelview-projection
+ * matrix.
+ */
+void _mesa_free_matrix_data( struct gl_context *ctx )
+{
+   GLuint i;   /* unsigned: Elements() is assumed to yield an unsigned count */
+
+   free_matrix_stack(&ctx->ModelviewMatrixStack);
+   free_matrix_stack(&ctx->ProjectionMatrixStack);
+   for (i = 0; i < Elements(ctx->TextureMatrixStack); i++)
+      free_matrix_stack(&ctx->TextureMatrixStack[i]);
+   for (i = 0; i < Elements(ctx->ProgramMatrixStack); i++)
+      free_matrix_stack(&ctx->ProgramMatrixStack[i]);
+
+   /* combined Modelview*Projection matrix */
+   _math_matrix_dtr( &ctx->_ModelProjectMatrix );
+}
+
+
+/**
+ * Initialize the context transform attribute group: GL_MODELVIEW matrix mode,
+ * normalization flags off, all user clip planes zeroed and disabled.
+ *
+ * \param ctx GL context.
+ * \todo Move this to a new file with other 'transform' routines.
+ */
+void _mesa_init_transform( struct gl_context *ctx )
+{
+   GLuint i;   /* same index type update_projection() uses for this bound */
+
+   /* Transformation group */
+   ctx->Transform.MatrixMode = GL_MODELVIEW;
+   ctx->Transform.Normalize = GL_FALSE;
+   ctx->Transform.RescaleNormals = GL_FALSE;
+   ctx->Transform.RasterPositionUnclipped = GL_FALSE;
+   for (i = 0; i < ctx->Const.MaxClipPlanes; i++) {
+      ASSIGN_4V( ctx->Transform.EyeUserPlane[i], 0.0, 0.0, 0.0, 0.0 );
+   }
+   ctx->Transform.ClipPlanesEnabled = 0;
+
+   ASSIGN_4V( ctx->Transform.CullObjPos, 0.0, 0.0, 1.0, 0.0 );
+   ASSIGN_4V( ctx->Transform.CullEyePos, 0.0, 0.0, 1.0, 0.0 );
+}
+
+
+/*@}*/
diff --git a/mesalib/src/mesa/main/mipmap.c b/mesalib/src/mesa/main/mipmap.c
index f170d235a..1ead5ee10 100644
--- a/mesalib/src/mesa/main/mipmap.c
+++ b/mesalib/src/mesa/main/mipmap.c
@@ -1986,7 +1986,7 @@ generate_mipmap_compressed(struct gl_context *ctx, GLenum target,
    gl_format temp_format;
    GLint components;
    GLuint temp_src_stride, temp_dst_stride; /* in bytes */
-   GLchan *temp_src = NULL, *temp_dst = NULL;
+   GLubyte *temp_src = NULL, *temp_dst = NULL;
    GLenum temp_datatype;
    GLenum temp_base_format;
 
@@ -2101,7 +2101,7 @@ generate_mipmap_compressed(struct gl_context *ctx, GLenum target,
 
       /* swap src and dest pointers */
       {
-	 GLchan *temp = temp_src;
+	 GLubyte *temp = temp_src;
 	 temp_src = temp_dst;
 	 temp_dst = temp;
 
@@ -2109,7 +2109,7 @@ generate_mipmap_compressed(struct gl_context *ctx, GLenum target,
       }
    } /* loop over mipmap levels */
 
-   free((void *) temp_src);
+   free(temp_src);
    free(temp_dst);
 }
 
@@ -2218,37 +2218,3 @@ do {									\
    }
 }
 
-
-/**
- * Upscale an image by replication, not (typical) stretching.
- * We use this when the image width or height is less than a
- * certain size (4, 8) and we need to upscale an image.
- */
-void
-_mesa_upscale_teximage2d(GLsizei inWidth, GLsizei inHeight,
-                         GLsizei outWidth, GLsizei outHeight,
-                         GLint comps, const GLchan *src, GLint srcRowStride,
-                         GLchan *dest )
-{
-   GLint i, j, k;
-
-   ASSERT(outWidth >= inWidth);
-   ASSERT(outHeight >= inHeight);
-#if 0
-   ASSERT(inWidth == 1 || inWidth == 2 || inHeight == 1 || inHeight == 2);
-   ASSERT((outWidth & 3) == 0);
-   ASSERT((outHeight & 3) == 0);
-#endif
-
-   for (i = 0; i < outHeight; i++) {
-      const GLint ii = i % inHeight;
-      for (j = 0; j < outWidth; j++) {
-         const GLint jj = j % inWidth;
-         for (k = 0; k < comps; k++) {
-            dest[(i * outWidth + j) * comps + k]
-               = src[ii * srcRowStride + jj * comps + k];
-         }
-      }
-   }
-}
-
diff --git a/mesalib/src/mesa/main/mipmap.h b/mesalib/src/mesa/main/mipmap.h
index c0c6c2592..478395021 100644
--- a/mesalib/src/mesa/main/mipmap.h
+++ b/mesalib/src/mesa/main/mipmap.h
@@ -1,64 +1,58 @@
-/*
- * Mesa 3-D graphics library
- * Version:  6.5.2
- *
- * Copyright (C) 1999-2006  Brian Paul   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-
-#ifndef MIPMAP_H
-#define MIPMAP_H
-
-#include "mtypes.h"
-
-
-extern void
-_mesa_generate_mipmap_level(GLenum target,
-                            GLenum datatype, GLuint comps,
-                            GLint border,
-                            GLint srcWidth, GLint srcHeight, GLint srcDepth,
-                            const GLubyte *srcData,
-                            GLint srcRowStride,
-                            GLint dstWidth, GLint dstHeight, GLint dstDepth,
-                            GLubyte *dstData,
-                            GLint dstRowStride);
-
-
-extern void
-_mesa_generate_mipmap(struct gl_context *ctx, GLenum target,
-                      struct gl_texture_object *texObj);
-
-
-extern void
-_mesa_rescale_teximage2d(GLuint bytesPerPixel,
-                         GLuint srcStrideInPixels,
-                         GLuint dstRowStride,
-                         GLint srcWidth, GLint srcHeight,
-                         GLint dstWidth, GLint dstHeight,
-                         const GLvoid *srcImage, GLvoid *dstImage);
-
-extern void
-_mesa_upscale_teximage2d(GLsizei inWidth, GLsizei inHeight,
-                         GLsizei outWidth, GLsizei outHeight,
-                         GLint comps, const GLchan *src, GLint srcRowStride,
-                         GLchan *dest);
-
-
-#endif /* MIPMAP_H */
+/*
+ * Mesa 3-D graphics library
+ * Version:  6.5.2
+ *
+ * Copyright (C) 1999-2006  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+#ifndef MIPMAP_H
+#define MIPMAP_H
+
+#include "mtypes.h"
+
+
+extern void
+_mesa_generate_mipmap_level(GLenum target,
+                            GLenum datatype, GLuint comps,
+                            GLint border,
+                            GLint srcWidth, GLint srcHeight, GLint srcDepth,
+                            const GLubyte *srcData,
+                            GLint srcRowStride,
+                            GLint dstWidth, GLint dstHeight, GLint dstDepth,
+                            GLubyte *dstData,
+                            GLint dstRowStride);
+
+
+extern void
+_mesa_generate_mipmap(struct gl_context *ctx, GLenum target,
+                      struct gl_texture_object *texObj);
+
+
+extern void
+_mesa_rescale_teximage2d(GLuint bytesPerPixel,
+                         GLuint srcStrideInPixels,
+                         GLuint dstRowStride,
+                         GLint srcWidth, GLint srcHeight,
+                         GLint dstWidth, GLint dstHeight,
+                         const GLvoid *srcImage, GLvoid *dstImage);
+
+
+#endif /* MIPMAP_H */
diff --git a/mesalib/src/mesa/main/mtypes.h b/mesalib/src/mesa/main/mtypes.h
index 3b44ec6d5..42831d773 100644
--- a/mesalib/src/mesa/main/mtypes.h
+++ b/mesalib/src/mesa/main/mtypes.h
@@ -44,29 +44,6 @@
 
 
 /**
- * Color channel data type.
- */
-#if CHAN_BITS == 8
-   typedef GLubyte GLchan;
-#define CHAN_MAX 255
-#define CHAN_MAXF 255.0F
-#define CHAN_TYPE GL_UNSIGNED_BYTE
-#elif CHAN_BITS == 16
-   typedef GLushort GLchan;
-#define CHAN_MAX 65535
-#define CHAN_MAXF 65535.0F
-#define CHAN_TYPE GL_UNSIGNED_SHORT
-#elif CHAN_BITS == 32
-   typedef GLfloat GLchan;
-#define CHAN_MAX 1.0
-#define CHAN_MAXF 1.0F
-#define CHAN_TYPE GL_FLOAT
-#else
-#error "illegal number of color channel bits"
-#endif
-
-
-/**
  * Stencil buffer data type.
  */
 #if STENCIL_BITS==8
@@ -1950,6 +1927,8 @@ struct gl_vertex_program_state
    GLboolean _Enabled;           /**< Enabled and _valid_ user program? */
    GLboolean PointSizeEnabled;   /**< GL_VERTEX_PROGRAM_POINT_SIZE_ARB/NV */
    GLboolean TwoSideEnabled;     /**< GL_VERTEX_PROGRAM_TWO_SIDE_ARB/NV */
+   /** Computed two sided lighting for fixed function/programs. */
+   GLboolean _TwoSideEnabled;
    struct gl_vertex_program *Current;  /**< User-bound vertex program */
 
    /** Currently enabled and valid vertex program (including internal
diff --git a/mesalib/src/mesa/main/pack.c b/mesalib/src/mesa/main/pack.c
index 8388708a4..6d6ae59f4 100644
--- a/mesalib/src/mesa/main/pack.c
+++ b/mesalib/src/mesa/main/pack.c
@@ -3437,7 +3437,7 @@ extract_uint_rgba(GLuint n, GLuint rgba[][4],
 /*
  * Unpack a row of color image data from a client buffer according to
  * the pixel unpacking parameters.
- * Return GLchan values in the specified dest image format.
+ * Return GLubyte values in the specified dest image format.
  * This is used by glDrawPixels and glTexImage?D().
  * \param ctx - the context
  *         n - number of pixels in the span
@@ -3452,8 +3452,8 @@ extract_uint_rgba(GLuint n, GLuint rgba[][4],
  * XXX perhaps expand this to process whole images someday.
  */
 void
-_mesa_unpack_color_span_chan( struct gl_context *ctx,
-                              GLuint n, GLenum dstFormat, GLchan dest[],
+_mesa_unpack_color_span_ubyte(struct gl_context *ctx,
+                              GLuint n, GLenum dstFormat, GLubyte dest[],
                               GLenum srcFormat, GLenum srcType,
                               const GLvoid *source,
                               const struct gl_pixelstore_attrib *srcPacking,
@@ -3517,21 +3517,21 @@ _mesa_unpack_color_span_chan( struct gl_context *ctx,
 
    /* Try simple cases first */
    if (transferOps == 0) {
-      if (srcType == CHAN_TYPE) {
+      if (srcType == GL_UNSIGNED_BYTE) {
          if (dstFormat == GL_RGBA) {
             if (srcFormat == GL_RGBA) {
-               memcpy( dest, source, n * 4 * sizeof(GLchan) );
+               memcpy( dest, source, n * 4 * sizeof(GLubyte) );
                return;
             }
             else if (srcFormat == GL_RGB) {
                GLuint i;
-               const GLchan *src = (const GLchan *) source;
-               GLchan *dst = dest;
+               const GLubyte *src = (const GLubyte *) source;
+               GLubyte *dst = dest;
                for (i = 0; i < n; i++) {
                   dst[0] = src[0];
                   dst[1] = src[1];
                   dst[2] = src[2];
-                  dst[3] = CHAN_MAX;
+                  dst[3] = 255;
                   src += 3;
                   dst += 4;
                }
@@ -3540,13 +3540,13 @@ _mesa_unpack_color_span_chan( struct gl_context *ctx,
          }
          else if (dstFormat == GL_RGB) {
             if (srcFormat == GL_RGB) {
-               memcpy( dest, source, n * 3 * sizeof(GLchan) );
+               memcpy( dest, source, n * 3 * sizeof(GLubyte) );
                return;
             }
             else if (srcFormat == GL_RGBA) {
                GLuint i;
-               const GLchan *src = (const GLchan *) source;
-               GLchan *dst = dest;
+               const GLubyte *src = (const GLubyte *) source;
+               GLubyte *dst = dest;
                for (i = 0; i < n; i++) {
                   dst[0] = src[0];
                   dst[1] = src[1];
@@ -3560,7 +3560,7 @@ _mesa_unpack_color_span_chan( struct gl_context *ctx,
          else if (dstFormat == srcFormat) {
             GLint comps = _mesa_components_in_format(srcFormat);
             assert(comps > 0);
-            memcpy( dest, source, n * comps * sizeof(GLchan) );
+            memcpy( dest, source, n * comps * sizeof(GLubyte) );
             return;
          }
       }
@@ -3573,12 +3573,12 @@ _mesa_unpack_color_span_chan( struct gl_context *ctx,
             if (srcFormat == GL_RGB) {
                GLuint i;
                const GLubyte *src = (const GLubyte *) source;
-               GLchan *dst = dest;
+               GLubyte *dst = dest;
                for (i = 0; i < n; i++) {
-                  dst[0] = UBYTE_TO_CHAN(src[0]);
-                  dst[1] = UBYTE_TO_CHAN(src[1]);
-                  dst[2] = UBYTE_TO_CHAN(src[2]);
-                  dst[3] = CHAN_MAX;
+                  dst[0] = src[0];
+                  dst[1] = src[1];
+                  dst[2] = src[2];
+                  dst[3] = 255;
                   src += 3;
                   dst += 4;
                }
@@ -3587,12 +3587,12 @@ _mesa_unpack_color_span_chan( struct gl_context *ctx,
             else if (srcFormat == GL_RGBA) {
                GLuint i;
                const GLubyte *src = (const GLubyte *) source;
-               GLchan *dst = dest;
+               GLubyte *dst = dest;
                for (i = 0; i < n; i++) {
-                  dst[0] = UBYTE_TO_CHAN(src[0]);
-                  dst[1] = UBYTE_TO_CHAN(src[1]);
-                  dst[2] = UBYTE_TO_CHAN(src[2]);
-                  dst[3] = UBYTE_TO_CHAN(src[3]);
+                  dst[0] = src[0];
+                  dst[1] = src[1];
+                  dst[2] = src[2];
+                  dst[3] = src[3];
                   src += 4;
                   dst += 4;
                }
@@ -3603,11 +3603,11 @@ _mesa_unpack_color_span_chan( struct gl_context *ctx,
             if (srcFormat == GL_RGB) {
                GLuint i;
                const GLubyte *src = (const GLubyte *) source;
-               GLchan *dst = dest;
+               GLubyte *dst = dest;
                for (i = 0; i < n; i++) {
-                  dst[0] = UBYTE_TO_CHAN(src[0]);
-                  dst[1] = UBYTE_TO_CHAN(src[1]);
-                  dst[2] = UBYTE_TO_CHAN(src[2]);
+                  dst[0] = src[0];
+                  dst[1] = src[1];
+                  dst[2] = src[2];
                   src += 3;
                   dst += 3;
                }
@@ -3616,11 +3616,11 @@ _mesa_unpack_color_span_chan( struct gl_context *ctx,
             else if (srcFormat == GL_RGBA) {
                GLuint i;
                const GLubyte *src = (const GLubyte *) source;
-               GLchan *dst = dest;
+               GLubyte *dst = dest;
                for (i = 0; i < n; i++) {
-                  dst[0] = UBYTE_TO_CHAN(src[0]);
-                  dst[1] = UBYTE_TO_CHAN(src[1]);
-                  dst[2] = UBYTE_TO_CHAN(src[2]);
+                  dst[0] = src[0];
+                  dst[1] = src[1];
+                  dst[2] = src[2];
                   src += 4;
                   dst += 3;
                }
@@ -3679,10 +3679,8 @@ _mesa_unpack_color_span_chan( struct gl_context *ctx,
                             srcPacking->SwapBytes);
       }
 
-      /* Need to clamp if returning GLubytes or GLushorts */
-#if CHAN_TYPE != GL_FLOAT
+      /* Need to clamp if returning GLubytes */
       transferOps |= IMAGE_CLAMP_BIT;
-#endif
 
       if (transferOps) {
          _mesa_apply_rgba_transfer_ops(ctx, transferOps, n, rgba);
@@ -3691,61 +3689,61 @@ _mesa_unpack_color_span_chan( struct gl_context *ctx,
       get_component_indexes(dstFormat,
                             &rDst, &gDst, &bDst, &aDst, &lDst, &iDst);
 
-      /* Now return the GLchan data in the requested dstFormat */
+      /* Now return the GLubyte data in the requested dstFormat */
       if (rDst >= 0) {
-         GLchan *dst = dest;
+         GLubyte *dst = dest;
          GLuint i;
          for (i = 0; i < n; i++) {
-            CLAMPED_FLOAT_TO_CHAN(dst[rDst], rgba[i][RCOMP]);
+            CLAMPED_FLOAT_TO_UBYTE(dst[rDst], rgba[i][RCOMP]);
             dst += dstComponents;
          }
       }
 
       if (gDst >= 0) {
-         GLchan *dst = dest;
+         GLubyte *dst = dest;
          GLuint i;
          for (i = 0; i < n; i++) {
-            CLAMPED_FLOAT_TO_CHAN(dst[gDst], rgba[i][GCOMP]);
+            CLAMPED_FLOAT_TO_UBYTE(dst[gDst], rgba[i][GCOMP]);
             dst += dstComponents;
          }
       }
 
       if (bDst >= 0) {
-         GLchan *dst = dest;
+         GLubyte *dst = dest;
          GLuint i;
          for (i = 0; i < n; i++) {
-            CLAMPED_FLOAT_TO_CHAN(dst[bDst], rgba[i][BCOMP]);
+            CLAMPED_FLOAT_TO_UBYTE(dst[bDst], rgba[i][BCOMP]);
             dst += dstComponents;
          }
       }
 
       if (aDst >= 0) {
-         GLchan *dst = dest;
+         GLubyte *dst = dest;
          GLuint i;
          for (i = 0; i < n; i++) {
-            CLAMPED_FLOAT_TO_CHAN(dst[aDst], rgba[i][ACOMP]);
+            CLAMPED_FLOAT_TO_UBYTE(dst[aDst], rgba[i][ACOMP]);
             dst += dstComponents;
          }
       }
 
       if (iDst >= 0) {
-         GLchan *dst = dest;
+         GLubyte *dst = dest;
          GLuint i;
          assert(iDst == 0);
          assert(dstComponents == 1);
          for (i = 0; i < n; i++) {
             /* Intensity comes from red channel */
-            CLAMPED_FLOAT_TO_CHAN(dst[i], rgba[i][RCOMP]);
+            CLAMPED_FLOAT_TO_UBYTE(dst[i], rgba[i][RCOMP]);
          }
       }
 
       if (lDst >= 0) {
-         GLchan *dst = dest;
+         GLubyte *dst = dest;
          GLuint i;
          assert(lDst == 0);
          for (i = 0; i < n; i++) {
             /* Luminance comes from red channel */
-            CLAMPED_FLOAT_TO_CHAN(dst[0], rgba[i][RCOMP]);
+            CLAMPED_FLOAT_TO_UBYTE(dst[0], rgba[i][RCOMP]);
             dst += dstComponents;
          }
       }
@@ -3756,8 +3754,8 @@ _mesa_unpack_color_span_chan( struct gl_context *ctx,
 
 
 /**
- * Same as _mesa_unpack_color_span_chan(), but return GLfloat data
- * instead of GLchan.
+ * Same as _mesa_unpack_color_span_ubyte(), but return GLfloat data
+ * instead of GLubyte.
  */
 void
 _mesa_unpack_color_span_float( struct gl_context *ctx,
@@ -3954,8 +3952,8 @@ _mesa_unpack_color_span_float( struct gl_context *ctx,
 
 
 /**
- * Same as _mesa_unpack_color_span_chan(), but return GLuint data
- * instead of GLchan.
+ * Same as _mesa_unpack_color_span_ubyte(), but return GLuint data
+ * instead of GLubyte.
  * No pixel transfer ops are applied.
  */
 void
diff --git a/mesalib/src/mesa/main/pack.h b/mesalib/src/mesa/main/pack.h
index 00aab409e..7c76baae4 100644
--- a/mesalib/src/mesa/main/pack.h
+++ b/mesalib/src/mesa/main/pack.h
@@ -58,8 +58,8 @@ _mesa_pack_rgba_span_float(struct gl_context *ctx, GLuint n,
 
 
 extern void
-_mesa_unpack_color_span_chan(struct gl_context *ctx,
-                             GLuint n, GLenum dstFormat, GLchan dest[],
+_mesa_unpack_color_span_ubyte(struct gl_context *ctx,
+                             GLuint n, GLenum dstFormat, GLubyte dest[],
                              GLenum srcFormat, GLenum srcType,
                              const GLvoid *source,
                              const struct gl_pixelstore_attrib *srcPacking,
diff --git a/mesalib/src/mesa/main/state.c b/mesalib/src/mesa/main/state.c
index 9d9c952dc..fc25515a0 100644
--- a/mesalib/src/mesa/main/state.c
+++ b/mesalib/src/mesa/main/state.c
@@ -447,7 +447,20 @@ update_clamp_read_color(struct gl_context *ctx)
       ctx->Color._ClampReadColor = ctx->Color.ClampReadColor;
 }
 
-
+/**
+ * Update the ctx->VertexProgram._TwoSideEnabled flag.
+ */
+static void
+update_twoside(struct gl_context *ctx)
+{
+   if (ctx->Shader.CurrentVertexProgram ||
+       ctx->VertexProgram.Current) {
+      ctx->VertexProgram._TwoSideEnabled = ctx->VertexProgram.TwoSideEnabled;
+   } else {
+      ctx->VertexProgram._TwoSideEnabled = (ctx->Light.Enabled &&
+					    ctx->Light.Model.TwoSide);
+   }
+}
 
 
 /*
@@ -603,6 +616,9 @@ _mesa_update_state_locked( struct gl_context *ctx )
    if (new_state & _NEW_LIGHT)
       _mesa_update_lighting( ctx );
 
+   if (new_state & (_NEW_LIGHT | _NEW_PROGRAM))
+      update_twoside( ctx );
+
    if (new_state & (_NEW_LIGHT | _NEW_BUFFERS))
       update_clamp_vertex_color(ctx);
 
diff --git a/mesalib/src/mesa/main/texcompress.c b/mesalib/src/mesa/main/texcompress.c
index b49d1b1ca..03e05d5ef 100644
--- a/mesalib/src/mesa/main/texcompress.c
+++ b/mesalib/src/mesa/main/texcompress.c
@@ -264,21 +264,23 @@ _mesa_get_compressed_formats(struct gl_context *ctx, GLint *formats)
       }
    }
 
-#if FEATURE_ES1 || FEATURE_ES2
-   if (formats) {
-      formats[n++] = GL_PALETTE4_RGB8_OES;
-      formats[n++] = GL_PALETTE4_RGBA8_OES;
-      formats[n++] = GL_PALETTE4_R5_G6_B5_OES;
-      formats[n++] = GL_PALETTE4_RGBA4_OES;
-      formats[n++] = GL_PALETTE4_RGB5_A1_OES;
-      formats[n++] = GL_PALETTE8_RGB8_OES;
-      formats[n++] = GL_PALETTE8_RGBA8_OES;
-      formats[n++] = GL_PALETTE8_R5_G6_B5_OES;
-      formats[n++] = GL_PALETTE8_RGBA4_OES;
-      formats[n++] = GL_PALETTE8_RGB5_A1_OES;
-   }
-   else {
-      n += 10;
+#if FEATURE_ES1
+   if (ctx->API == API_OPENGLES) {
+      if (formats) {
+	 formats[n++] = GL_PALETTE4_RGB8_OES;
+	 formats[n++] = GL_PALETTE4_RGBA8_OES;
+	 formats[n++] = GL_PALETTE4_R5_G6_B5_OES;
+	 formats[n++] = GL_PALETTE4_RGBA4_OES;
+	 formats[n++] = GL_PALETTE4_RGB5_A1_OES;
+	 formats[n++] = GL_PALETTE8_RGB8_OES;
+	 formats[n++] = GL_PALETTE8_RGBA8_OES;
+	 formats[n++] = GL_PALETTE8_R5_G6_B5_OES;
+	 formats[n++] = GL_PALETTE8_RGBA4_OES;
+	 formats[n++] = GL_PALETTE8_RGB5_A1_OES;
+      }
+      else {
+	 n += 10;
+      }
    }
 #endif
 
diff --git a/mesalib/src/mesa/main/texcompress_fxt1.c b/mesalib/src/mesa/main/texcompress_fxt1.c
index a75487ce2..0437cfcc1 100644
--- a/mesalib/src/mesa/main/texcompress_fxt1.c
+++ b/mesalib/src/mesa/main/texcompress_fxt1.c
@@ -52,7 +52,7 @@ fxt1_encode (GLuint width, GLuint height, GLint comps,
 
 void
 fxt1_decode_1 (const void *texture, GLint stride,
-               GLint i, GLint j, GLchan *rgba);
+               GLint i, GLint j, GLubyte *rgba);
 
 
 /**
@@ -61,11 +61,11 @@ fxt1_decode_1 (const void *texture, GLint stride,
 GLboolean
 _mesa_texstore_rgb_fxt1(TEXSTORE_PARAMS)
 {
-   const GLchan *pixels;
+   const GLubyte *pixels;
    GLint srcRowStride;
    GLubyte *dst;
    const GLint texWidth = dstRowStride * 8 / 16; /* a bit of a hack */
-   const GLchan *tempImage = NULL;
+   const GLubyte *tempImage = NULL;
 
    ASSERT(dstFormat == MESA_FORMAT_RGB_FXT1);
    ASSERT(dstXoffset % 8 == 0);
@@ -75,11 +75,11 @@ _mesa_texstore_rgb_fxt1(TEXSTORE_PARAMS)
    (void) dstImageOffsets;
 
    if (srcFormat != GL_RGB ||
-       srcType != CHAN_TYPE ||
+       srcType != GL_UNSIGNED_BYTE ||
        ctx->_ImageTransferState ||
        srcPacking->SwapBytes) {
-      /* convert image to RGB/GLchan */
-      tempImage = _mesa_make_temp_chan_image(ctx, dims,
+      /* convert image to RGB/GLubyte */
+      tempImage = _mesa_make_temp_ubyte_image(ctx, dims,
                                              baseInternalFormat,
                                              _mesa_get_format_base_format(dstFormat),
                                              srcWidth, srcHeight, srcDepth,
@@ -92,9 +92,9 @@ _mesa_texstore_rgb_fxt1(TEXSTORE_PARAMS)
       srcFormat = GL_RGB;
    }
    else {
-      pixels = (const GLchan *) srcAddr;
+      pixels = (const GLubyte *) srcAddr;
       srcRowStride = _mesa_image_row_stride(srcPacking, srcWidth, srcFormat,
-                                            srcType) / sizeof(GLchan);
+                                            srcType) / sizeof(GLubyte);
    }
 
    dst = _mesa_compressed_image_address(dstXoffset, dstYoffset, 0,
@@ -117,11 +117,11 @@ _mesa_texstore_rgb_fxt1(TEXSTORE_PARAMS)
 GLboolean
 _mesa_texstore_rgba_fxt1(TEXSTORE_PARAMS)
 {
-   const GLchan *pixels;
+   const GLubyte *pixels;
    GLint srcRowStride;
    GLubyte *dst;
    GLint texWidth = dstRowStride * 8 / 16; /* a bit of a hack */
-   const GLchan *tempImage = NULL;
+   const GLubyte *tempImage = NULL;
 
    ASSERT(dstFormat == MESA_FORMAT_RGBA_FXT1);
    ASSERT(dstXoffset % 8 == 0);
@@ -131,11 +131,11 @@ _mesa_texstore_rgba_fxt1(TEXSTORE_PARAMS)
    (void) dstImageOffsets;
 
    if (srcFormat != GL_RGBA ||
-       srcType != CHAN_TYPE ||
+       srcType != GL_UNSIGNED_BYTE ||
        ctx->_ImageTransferState ||
        srcPacking->SwapBytes) {
-      /* convert image to RGBA/GLchan */
-      tempImage = _mesa_make_temp_chan_image(ctx, dims,
+      /* convert image to RGBA/GLubyte */
+      tempImage = _mesa_make_temp_ubyte_image(ctx, dims,
                                              baseInternalFormat,
                                              _mesa_get_format_base_format(dstFormat),
                                              srcWidth, srcHeight, srcDepth,
@@ -148,9 +148,9 @@ _mesa_texstore_rgba_fxt1(TEXSTORE_PARAMS)
       srcFormat = GL_RGBA;
    }
    else {
-      pixels = (const GLchan *) srcAddr;
+      pixels = (const GLubyte *) srcAddr;
       srcRowStride = _mesa_image_row_stride(srcPacking, srcWidth, srcFormat,
-                                            srcType) / sizeof(GLchan);
+                                            srcType) / sizeof(GLubyte);
    }
 
    dst = _mesa_compressed_image_address(dstXoffset, dstYoffset, 0,
@@ -171,14 +171,14 @@ void
 _mesa_fetch_texel_2d_f_rgba_fxt1( const struct swrast_texture_image *texImage,
                                   GLint i, GLint j, GLint k, GLfloat *texel )
 {
-   /* just sample as GLchan and convert to float here */
-   GLchan rgba[4];
+   /* just sample as GLubyte and convert to float here */
+   GLubyte rgba[4];
    (void) k;
    fxt1_decode_1(texImage->Base.Data, texImage->Base.RowStride, i, j, rgba);
-   texel[RCOMP] = CHAN_TO_FLOAT(rgba[RCOMP]);
-   texel[GCOMP] = CHAN_TO_FLOAT(rgba[GCOMP]);
-   texel[BCOMP] = CHAN_TO_FLOAT(rgba[BCOMP]);
-   texel[ACOMP] = CHAN_TO_FLOAT(rgba[ACOMP]);
+   texel[RCOMP] = UBYTE_TO_FLOAT(rgba[RCOMP]);
+   texel[GCOMP] = UBYTE_TO_FLOAT(rgba[GCOMP]);
+   texel[BCOMP] = UBYTE_TO_FLOAT(rgba[BCOMP]);
+   texel[ACOMP] = UBYTE_TO_FLOAT(rgba[ACOMP]);
 }
 
 
@@ -186,13 +186,13 @@ void
 _mesa_fetch_texel_2d_f_rgb_fxt1( const struct swrast_texture_image *texImage,
                                  GLint i, GLint j, GLint k, GLfloat *texel )
 {
-   /* just sample as GLchan and convert to float here */
-   GLchan rgba[4];
+   /* just sample as GLubyte and convert to float here */
+   GLubyte rgba[4];
    (void) k;
    fxt1_decode_1(texImage->Base.Data, texImage->Base.RowStride, i, j, rgba);
-   texel[RCOMP] = CHAN_TO_FLOAT(rgba[RCOMP]);
-   texel[GCOMP] = CHAN_TO_FLOAT(rgba[GCOMP]);
-   texel[BCOMP] = CHAN_TO_FLOAT(rgba[BCOMP]);
+   texel[RCOMP] = UBYTE_TO_FLOAT(rgba[RCOMP]);
+   texel[GCOMP] = UBYTE_TO_FLOAT(rgba[GCOMP]);
+   texel[BCOMP] = UBYTE_TO_FLOAT(rgba[BCOMP]);
    texel[ACOMP] = 1.0F;
 }
 
@@ -1289,6 +1289,41 @@ fxt1_quantize (GLuint *cc, const GLubyte *lines[], GLint comps)
 }
 
 
+
+/**
+ * Upscale an image by replication (wrap-around tiling), not stretching.
+ * fxt1_encode() uses this to pad an image whose width is not a multiple
+ * of 8, or whose height is not a multiple of 4, up to those multiples.
+ */
+static void
+upscale_teximage2d(GLsizei inWidth, GLsizei inHeight,
+                   GLsizei outWidth, GLsizei outHeight,
+                   GLint comps, const GLubyte *src, GLint srcRowStride,
+                   GLubyte *dest )
+{
+   GLint i, j, k;
+
+   ASSERT(outWidth >= inWidth);
+   ASSERT(outHeight >= inHeight);
+#if 0
+   ASSERT(inWidth == 1 || inWidth == 2 || inHeight == 1 || inHeight == 2);
+   ASSERT((outWidth & 3) == 0);
+   ASSERT((outHeight & 3) == 0);
+#endif
+
+   for (i = 0; i < outHeight; i++) {
+      const GLint ii = i % inHeight;
+      for (j = 0; j < outWidth; j++) {
+         const GLint jj = j % inWidth;
+         for (k = 0; k < comps; k++) {
+            dest[(i * outWidth + j) * comps + k]
+               = src[ii * srcRowStride + jj * comps + k];
+         }
+      }
+   }
+}
+
+
 static void
 fxt1_encode (GLuint width, GLuint height, GLint comps,
              const void *source, GLint srcRowStride,
@@ -1305,42 +1340,21 @@ fxt1_encode (GLuint width, GLuint height, GLint comps,
    if ((width & 7) | (height & 3)) {
       GLint newWidth = (width + 7) & ~7;
       GLint newHeight = (height + 3) & ~3;
-      newSource = malloc(comps * newWidth * newHeight * sizeof(GLchan));
+      newSource = malloc(comps * newWidth * newHeight * sizeof(GLubyte));
       if (!newSource) {
          GET_CURRENT_CONTEXT(ctx);
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "texture compression");
          goto cleanUp;
       }
-      _mesa_upscale_teximage2d(width, height, newWidth, newHeight,
-                               comps, (const GLchan *) source,
-                               srcRowStride, (GLchan *) newSource);
+      upscale_teximage2d(width, height, newWidth, newHeight,
+                         comps, (const GLubyte *) source,
+                         srcRowStride, (GLubyte *) newSource);
       source = newSource;
       width = newWidth;
       height = newHeight;
       srcRowStride = comps * newWidth;
    }
 
-   /* convert from 16/32-bit channels to GLubyte if needed */
-   if (CHAN_TYPE != GL_UNSIGNED_BYTE) {
-      const GLuint n = width * height * comps;
-      const GLchan *src = (const GLchan *) source;
-      GLubyte *dest = (GLubyte *) malloc(n * sizeof(GLubyte));
-      GLuint i;
-      if (!dest) {
-         GET_CURRENT_CONTEXT(ctx);
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "texture compression");
-         goto cleanUp;
-      }
-      for (i = 0; i < n; i++) {
-         dest[i] = CHAN_TO_UBYTE(src[i]);
-      }
-      if (newSource != NULL) {
-         free(newSource);
-      }
-      newSource = dest;  /* we'll free this buffer before returning */
-      source = dest;  /* the new, GLubyte incoming image */
-   }
-
    data = (const GLubyte *) source;
    destRowStride = (destRowStride - width * 2) / 4;
    for (y = 0; y < height; y += 4) {
@@ -1402,7 +1416,7 @@ static const GLubyte _rgb_scale_6[] = {
 
 
 static void
-fxt1_decode_1HI (const GLubyte *code, GLint t, GLchan *rgba)
+fxt1_decode_1HI (const GLubyte *code, GLint t, GLubyte *rgba)
 {
    const GLuint *cc;
 
@@ -1428,16 +1442,16 @@ fxt1_decode_1HI (const GLubyte *code, GLint t, GLchan *rgba)
          g = LERP(6, t, UP5(CC_SEL(cc, 5)), UP5(CC_SEL(cc, 20)));
          r = LERP(6, t, UP5(CC_SEL(cc, 10)), UP5(CC_SEL(cc, 25)));
       }
-      rgba[RCOMP] = UBYTE_TO_CHAN(r);
-      rgba[GCOMP] = UBYTE_TO_CHAN(g);
-      rgba[BCOMP] = UBYTE_TO_CHAN(b);
-      rgba[ACOMP] = CHAN_MAX;
+      rgba[RCOMP] = r;
+      rgba[GCOMP] = g;
+      rgba[BCOMP] = b;
+      rgba[ACOMP] = 255;
    }
 }
 
 
 static void
-fxt1_decode_1CHROMA (const GLubyte *code, GLint t, GLchan *rgba)
+fxt1_decode_1CHROMA (const GLubyte *code, GLint t, GLubyte *rgba)
 {
    const GLuint *cc;
    GLuint kk;
@@ -1452,15 +1466,15 @@ fxt1_decode_1CHROMA (const GLubyte *code, GLint t, GLchan *rgba)
    t *= 15;
    cc = (const GLuint *)(code + 8 + t / 8);
    kk = cc[0] >> (t & 7);
-   rgba[BCOMP] = UBYTE_TO_CHAN( UP5(kk) );
-   rgba[GCOMP] = UBYTE_TO_CHAN( UP5(kk >> 5) );
-   rgba[RCOMP] = UBYTE_TO_CHAN( UP5(kk >> 10) );
-   rgba[ACOMP] = CHAN_MAX;
+   rgba[BCOMP] = UP5(kk);
+   rgba[GCOMP] = UP5(kk >> 5);
+   rgba[RCOMP] = UP5(kk >> 10);
+   rgba[ACOMP] = 255;
 }
 
 
 static void
-fxt1_decode_1MIXED (const GLubyte *code, GLint t, GLchan *rgba)
+fxt1_decode_1MIXED (const GLubyte *code, GLint t, GLubyte *rgba)
 {
    const GLuint *cc;
    GLuint col[2][3];
@@ -1515,10 +1529,10 @@ fxt1_decode_1MIXED (const GLubyte *code, GLint t, GLchan *rgba)
             g = (UP5(col[0][GCOMP]) + UP6(col[1][GCOMP], glsb)) / 2;
             r = (UP5(col[0][RCOMP]) + UP5(col[1][RCOMP])) / 2;
          }
-         rgba[RCOMP] = UBYTE_TO_CHAN(r);
-         rgba[GCOMP] = UBYTE_TO_CHAN(g);
-         rgba[BCOMP] = UBYTE_TO_CHAN(b);
-         rgba[ACOMP] = CHAN_MAX;
+         rgba[RCOMP] = r;
+         rgba[GCOMP] = g;
+         rgba[BCOMP] = b;
+         rgba[ACOMP] = 255;
       }
    } else {
       /* alpha[0] == 0 */
@@ -1537,16 +1551,16 @@ fxt1_decode_1MIXED (const GLubyte *code, GLint t, GLchan *rgba)
                         UP6(col[1][GCOMP], glsb));
          r = LERP(3, t, UP5(col[0][RCOMP]), UP5(col[1][RCOMP]));
       }
-      rgba[RCOMP] = UBYTE_TO_CHAN(r);
-      rgba[GCOMP] = UBYTE_TO_CHAN(g);
-      rgba[BCOMP] = UBYTE_TO_CHAN(b);
-      rgba[ACOMP] = CHAN_MAX;
+      rgba[RCOMP] = r;
+      rgba[GCOMP] = g;
+      rgba[BCOMP] = b;
+      rgba[ACOMP] = 255;
    }
 }
 
 
 static void
-fxt1_decode_1ALPHA (const GLubyte *code, GLint t, GLchan *rgba)
+fxt1_decode_1ALPHA (const GLubyte *code, GLint t, GLubyte *rgba)
 {
    const GLuint *cc;
    GLubyte r, g, b, a;
@@ -1613,18 +1627,18 @@ fxt1_decode_1ALPHA (const GLubyte *code, GLint t, GLchan *rgba)
          r = UP5(kk >> 10);
       }
    }
-   rgba[RCOMP] = UBYTE_TO_CHAN(r);
-   rgba[GCOMP] = UBYTE_TO_CHAN(g);
-   rgba[BCOMP] = UBYTE_TO_CHAN(b);
-   rgba[ACOMP] = UBYTE_TO_CHAN(a);
+   rgba[RCOMP] = r;
+   rgba[GCOMP] = g;
+   rgba[BCOMP] = b;
+   rgba[ACOMP] = a;
 }
 
 
 void
 fxt1_decode_1 (const void *texture, GLint stride, /* in pixels */
-               GLint i, GLint j, GLchan *rgba)
+               GLint i, GLint j, GLubyte *rgba)
 {
-   static void (*decode_1[]) (const GLubyte *, GLint, GLchan *) = {
+   static void (*decode_1[]) (const GLubyte *, GLint, GLubyte *) = {
       fxt1_decode_1HI,     /* cc-high   = "00?" */
       fxt1_decode_1HI,     /* cc-high   = "00?" */
       fxt1_decode_1CHROMA, /* cc-chroma = "010" */
diff --git a/mesalib/src/mesa/main/texcompress_rgtc.c b/mesalib/src/mesa/main/texcompress_rgtc.c
index 7af3d6762..398f61290 100644
--- a/mesalib/src/mesa/main/texcompress_rgtc.c
+++ b/mesalib/src/mesa/main/texcompress_rgtc.c
@@ -48,9 +48,9 @@
 
 #define RGTC_DEBUG 0
 
-static void unsigned_encode_rgtc_chan(GLubyte *blkaddr, GLubyte srccolors[4][4],
+static void unsigned_encode_rgtc_ubyte(GLubyte *blkaddr, GLubyte srccolors[4][4],
 					GLint numxpixels, GLint numypixels);
-static void signed_encode_rgtc_chan(GLbyte *blkaddr, GLbyte srccolors[4][4],
+static void signed_encode_rgtc_ubyte(GLbyte *blkaddr, GLbyte srccolors[4][4],
 			     GLint numxpixels, GLint numypixels);
 
 static void unsigned_fetch_texel_rgtc(unsigned srcRowStride, const GLubyte *pixdata,
@@ -59,15 +59,15 @@ static void unsigned_fetch_texel_rgtc(unsigned srcRowStride, const GLubyte *pixd
 static void signed_fetch_texel_rgtc(unsigned srcRowStride, const GLbyte *pixdata,
 				      unsigned i, unsigned j, GLbyte *value, unsigned comps);
 
-static void extractsrc_u( GLubyte srcpixels[4][4], const GLchan *srcaddr,
+static void extractsrc_u( GLubyte srcpixels[4][4], const GLubyte *srcaddr,
 			  GLint srcRowStride, GLint numxpixels, GLint numypixels, GLint comps)
 {
    GLubyte i, j;
-   const GLchan *curaddr;
+   const GLubyte *curaddr;
    for (j = 0; j < numypixels; j++) {
       curaddr = srcaddr + j * srcRowStride * comps;
       for (i = 0; i < numxpixels; i++) {
-	 srcpixels[j][i] = *curaddr / (CHAN_MAX / 255);
+	 srcpixels[j][i] = *curaddr;
 	 curaddr += comps;
       }
    }
@@ -93,10 +93,10 @@ _mesa_texstore_red_rgtc1(TEXSTORE_PARAMS)
 {
    GLubyte *dst;
    const GLint texWidth = dstRowStride * 4 / 8; /* a bit of a hack */
-   const GLchan *tempImage = NULL;
+   const GLubyte *tempImage = NULL;
    int i, j;
    int numxpixels, numypixels;
-   const GLchan *srcaddr;
+   const GLubyte *srcaddr;
    GLubyte srcpixels[4][4];
    GLubyte *blkaddr;
    GLint dstRowDiff;
@@ -109,7 +109,7 @@ _mesa_texstore_red_rgtc1(TEXSTORE_PARAMS)
    (void) dstImageOffsets;
 
 
-   tempImage = _mesa_make_temp_chan_image(ctx, dims,
+   tempImage = _mesa_make_temp_ubyte_image(ctx, dims,
 					  baseInternalFormat,
 					  _mesa_get_format_base_format(dstFormat),
 					  srcWidth, srcHeight, srcDepth,
@@ -132,7 +132,7 @@ _mesa_texstore_red_rgtc1(TEXSTORE_PARAMS)
 	 if (srcWidth > i + 3) numxpixels = 4;
 	 else numxpixels = srcWidth - i;
 	 extractsrc_u(srcpixels, srcaddr, srcWidth, numxpixels, numypixels, 1);
-	 unsigned_encode_rgtc_chan(blkaddr, srcpixels, numxpixels, numypixels);
+	 unsigned_encode_rgtc_ubyte(blkaddr, srcpixels, numxpixels, numypixels);
 	 srcaddr += numxpixels;
 	 blkaddr += 8;
       }
@@ -187,7 +187,7 @@ _mesa_texstore_signed_red_rgtc1(TEXSTORE_PARAMS)
 	 if (srcWidth > i + 3) numxpixels = 4;
 	 else numxpixels = srcWidth - i;
 	 extractsrc_s(srcpixels, srcaddr, srcWidth, numxpixels, numypixels, 1);
-	 signed_encode_rgtc_chan(blkaddr, srcpixels, numxpixels, numypixels);
+	 signed_encode_rgtc_ubyte(blkaddr, srcpixels, numxpixels, numypixels);
 	 srcaddr += numxpixels;
 	 blkaddr += 8;
       }
@@ -204,10 +204,10 @@ _mesa_texstore_rg_rgtc2(TEXSTORE_PARAMS)
 {
    GLubyte *dst;
    const GLint texWidth = dstRowStride * 4 / 16; /* a bit of a hack */
-   const GLchan *tempImage = NULL;
+   const GLubyte *tempImage = NULL;
    int i, j;
    int numxpixels, numypixels;
-   const GLchan *srcaddr;
+   const GLubyte *srcaddr;
    GLubyte srcpixels[4][4];
    GLubyte *blkaddr;
    GLint dstRowDiff;
@@ -220,7 +220,7 @@ _mesa_texstore_rg_rgtc2(TEXSTORE_PARAMS)
    (void) dstZoffset;
    (void) dstImageOffsets;
 
-   tempImage = _mesa_make_temp_chan_image(ctx, dims,
+   tempImage = _mesa_make_temp_ubyte_image(ctx, dims,
 					  baseInternalFormat,
 					  _mesa_get_format_base_format(dstFormat),
 					  srcWidth, srcHeight, srcDepth,
@@ -243,11 +243,11 @@ _mesa_texstore_rg_rgtc2(TEXSTORE_PARAMS)
 	 if (srcWidth > i + 3) numxpixels = 4;
 	 else numxpixels = srcWidth - i;
 	 extractsrc_u(srcpixels, srcaddr, srcWidth, numxpixels, numypixels, 2);
-	 unsigned_encode_rgtc_chan(blkaddr, srcpixels, numxpixels, numypixels);
+	 unsigned_encode_rgtc_ubyte(blkaddr, srcpixels, numxpixels, numypixels);
 
 	 blkaddr += 8;
-	 extractsrc_u(srcpixels, (GLchan *)srcaddr + 1, srcWidth, numxpixels, numypixels, 2);
-	 unsigned_encode_rgtc_chan(blkaddr, srcpixels, numxpixels, numypixels);
+	 extractsrc_u(srcpixels, (GLubyte *)srcaddr + 1, srcWidth, numxpixels, numypixels, 2);
+	 unsigned_encode_rgtc_ubyte(blkaddr, srcpixels, numxpixels, numypixels);
 
 	 blkaddr += 8;
 
@@ -306,11 +306,11 @@ _mesa_texstore_signed_rg_rgtc2(TEXSTORE_PARAMS)
 	 else numxpixels = srcWidth - i;
 
 	 extractsrc_s(srcpixels, srcaddr, srcWidth, numxpixels, numypixels, 2);
-	 signed_encode_rgtc_chan(blkaddr, srcpixels, numxpixels, numypixels);
+	 signed_encode_rgtc_ubyte(blkaddr, srcpixels, numxpixels, numypixels);
 	 blkaddr += 8;
 
 	 extractsrc_s(srcpixels, srcaddr + 1, srcWidth, numxpixels, numypixels, 2);
-	 signed_encode_rgtc_chan(blkaddr, srcpixels, numxpixels, numypixels);
+	 signed_encode_rgtc_ubyte(blkaddr, srcpixels, numxpixels, numypixels);
 	 blkaddr += 8;
 
 	 srcaddr += numxpixels * 2;
diff --git a/mesalib/src/mesa/main/texcompress_rgtc_tmp.h b/mesalib/src/mesa/main/texcompress_rgtc_tmp.h
index 48bbd374e..277d69b17 100644
--- a/mesalib/src/mesa/main/texcompress_rgtc_tmp.h
+++ b/mesalib/src/mesa/main/texcompress_rgtc_tmp.h
@@ -73,7 +73,7 @@ static void TAG(write_rgtc_encoded_channel)(TYPE *blkaddr,
    *blkaddr++ = (alphaenc[13] >> 1) | (alphaenc[14] << 2) | (alphaenc[15] << 5);
 }
 
-static void TAG(encode_rgtc_chan)(TYPE *blkaddr, TYPE srccolors[4][4],
+static void TAG(encode_rgtc_ubyte)(TYPE *blkaddr, TYPE srccolors[4][4],
 			     int numxpixels, int numypixels)
 {
    TYPE alphabase[2], alphause[2];
diff --git a/mesalib/src/mesa/main/texcompress_s3tc.c b/mesalib/src/mesa/main/texcompress_s3tc.c
index 36a56447e..04c5b4476 100644
--- a/mesalib/src/mesa/main/texcompress_s3tc.c
+++ b/mesalib/src/mesa/main/texcompress_s3tc.c
@@ -97,7 +97,7 @@ dxtFetchTexelFuncExt fetch_ext_rgba_dxt3 = NULL;
 dxtFetchTexelFuncExt fetch_ext_rgba_dxt5 = NULL;
 
 typedef void (*dxtCompressTexFuncExt)(GLint srccomps, GLint width,
-                                      GLint height, const GLchan *srcPixData,
+                                      GLint height, const GLubyte *srcPixData,
                                       GLenum destformat, GLubyte *dest,
                                       GLint dstRowStride);
 
@@ -163,10 +163,10 @@ _mesa_init_texture_s3tc( struct gl_context *ctx )
 GLboolean
 _mesa_texstore_rgb_dxt1(TEXSTORE_PARAMS)
 {
-   const GLchan *pixels;
+   const GLubyte *pixels;
    GLubyte *dst;
    const GLint texWidth = dstRowStride * 4 / 8; /* a bit of a hack */
-   const GLchan *tempImage = NULL;
+   const GLubyte *tempImage = NULL;
 
    ASSERT(dstFormat == MESA_FORMAT_RGB_DXT1 ||
           dstFormat == MESA_FORMAT_SRGB_DXT1);
@@ -177,11 +177,11 @@ _mesa_texstore_rgb_dxt1(TEXSTORE_PARAMS)
    (void) dstImageOffsets;
 
    if (srcFormat != GL_RGB ||
-       srcType != CHAN_TYPE ||
+       srcType != GL_UNSIGNED_BYTE ||
        ctx->_ImageTransferState ||
        srcPacking->SwapBytes) {
-      /* convert image to RGB/GLchan */
-      tempImage = _mesa_make_temp_chan_image(ctx, dims,
+      /* convert image to RGB/GLubyte */
+      tempImage = _mesa_make_temp_ubyte_image(ctx, dims,
                                              baseInternalFormat,
                                              _mesa_get_format_base_format(dstFormat),
                                              srcWidth, srcHeight, srcDepth,
@@ -193,7 +193,7 @@ _mesa_texstore_rgb_dxt1(TEXSTORE_PARAMS)
       srcFormat = GL_RGB;
    }
    else {
-      pixels = (const GLchan *) srcAddr;
+      pixels = (const GLubyte *) srcAddr;
    }
 
    dst = _mesa_compressed_image_address(dstXoffset, dstYoffset, 0,
@@ -222,10 +222,10 @@ _mesa_texstore_rgb_dxt1(TEXSTORE_PARAMS)
 GLboolean
 _mesa_texstore_rgba_dxt1(TEXSTORE_PARAMS)
 {
-   const GLchan *pixels;
+   const GLubyte *pixels;
    GLubyte *dst;
    const GLint texWidth = dstRowStride * 4 / 8; /* a bit of a hack */
-   const GLchan *tempImage = NULL;
+   const GLubyte *tempImage = NULL;
 
    ASSERT(dstFormat == MESA_FORMAT_RGBA_DXT1 ||
           dstFormat == MESA_FORMAT_SRGBA_DXT1);
@@ -236,11 +236,11 @@ _mesa_texstore_rgba_dxt1(TEXSTORE_PARAMS)
    (void) dstImageOffsets;
 
    if (srcFormat != GL_RGBA ||
-       srcType != CHAN_TYPE ||
+       srcType != GL_UNSIGNED_BYTE ||
        ctx->_ImageTransferState ||
        srcPacking->SwapBytes) {
-      /* convert image to RGBA/GLchan */
-      tempImage = _mesa_make_temp_chan_image(ctx, dims,
+      /* convert image to RGBA/GLubyte */
+      tempImage = _mesa_make_temp_ubyte_image(ctx, dims,
                                              baseInternalFormat,
                                              _mesa_get_format_base_format(dstFormat),
                                              srcWidth, srcHeight, srcDepth,
@@ -252,7 +252,7 @@ _mesa_texstore_rgba_dxt1(TEXSTORE_PARAMS)
       srcFormat = GL_RGBA;
    }
    else {
-      pixels = (const GLchan *) srcAddr;
+      pixels = (const GLubyte *) srcAddr;
    }
 
    dst = _mesa_compressed_image_address(dstXoffset, dstYoffset, 0,
@@ -280,10 +280,10 @@ _mesa_texstore_rgba_dxt1(TEXSTORE_PARAMS)
 GLboolean
 _mesa_texstore_rgba_dxt3(TEXSTORE_PARAMS)
 {
-   const GLchan *pixels;
+   const GLubyte *pixels;
    GLubyte *dst;
    const GLint texWidth = dstRowStride * 4 / 16; /* a bit of a hack */
-   const GLchan *tempImage = NULL;
+   const GLubyte *tempImage = NULL;
 
    ASSERT(dstFormat == MESA_FORMAT_RGBA_DXT3 ||
           dstFormat == MESA_FORMAT_SRGBA_DXT3);
@@ -294,11 +294,11 @@ _mesa_texstore_rgba_dxt3(TEXSTORE_PARAMS)
    (void) dstImageOffsets;
 
    if (srcFormat != GL_RGBA ||
-       srcType != CHAN_TYPE ||
+       srcType != GL_UNSIGNED_BYTE ||
        ctx->_ImageTransferState ||
        srcPacking->SwapBytes) {
-      /* convert image to RGBA/GLchan */
-      tempImage = _mesa_make_temp_chan_image(ctx, dims,
+      /* convert image to RGBA/GLubyte */
+      tempImage = _mesa_make_temp_ubyte_image(ctx, dims,
                                              baseInternalFormat,
                                              _mesa_get_format_base_format(dstFormat),
                                              srcWidth, srcHeight, srcDepth,
@@ -309,7 +309,7 @@ _mesa_texstore_rgba_dxt3(TEXSTORE_PARAMS)
       pixels = tempImage;
    }
    else {
-      pixels = (const GLchan *) srcAddr;
+      pixels = (const GLubyte *) srcAddr;
    }
 
    dst = _mesa_compressed_image_address(dstXoffset, dstYoffset, 0,
@@ -337,10 +337,10 @@ _mesa_texstore_rgba_dxt3(TEXSTORE_PARAMS)
 GLboolean
 _mesa_texstore_rgba_dxt5(TEXSTORE_PARAMS)
 {
-   const GLchan *pixels;
+   const GLubyte *pixels;
    GLubyte *dst;
    const GLint texWidth = dstRowStride * 4 / 16; /* a bit of a hack */
-   const GLchan *tempImage = NULL;
+   const GLubyte *tempImage = NULL;
 
    ASSERT(dstFormat == MESA_FORMAT_RGBA_DXT5 ||
           dstFormat == MESA_FORMAT_SRGBA_DXT5);
@@ -351,11 +351,11 @@ _mesa_texstore_rgba_dxt5(TEXSTORE_PARAMS)
    (void) dstImageOffsets;
 
    if (srcFormat != GL_RGBA ||
-       srcType != CHAN_TYPE ||
+       srcType != GL_UNSIGNED_BYTE ||
        ctx->_ImageTransferState ||
        srcPacking->SwapBytes) {
-      /* convert image to RGBA/GLchan */
-      tempImage = _mesa_make_temp_chan_image(ctx, dims,
+      /* convert image to RGBA/GLubyte */
+      tempImage = _mesa_make_temp_ubyte_image(ctx, dims,
                                              baseInternalFormat,
                                    	     _mesa_get_format_base_format(dstFormat),
                                              srcWidth, srcHeight, srcDepth,
@@ -366,7 +366,7 @@ _mesa_texstore_rgba_dxt5(TEXSTORE_PARAMS)
       pixels = tempImage;
    }
    else {
-      pixels = (const GLchan *) srcAddr;
+      pixels = (const GLubyte *) srcAddr;
    }
 
    dst = _mesa_compressed_image_address(dstXoffset, dstYoffset, 0,
@@ -390,11 +390,10 @@ _mesa_texstore_rgba_dxt5(TEXSTORE_PARAMS)
 
 static void
 fetch_texel_2d_rgb_dxt1( const struct swrast_texture_image *texImage,
-                         GLint i, GLint j, GLint k, GLchan *texel )
+                         GLint i, GLint j, GLint k, GLubyte *texel )
 {
    (void) k;
    if (fetch_ext_rgb_dxt1) {
-      ASSERT (sizeof(GLchan) == sizeof(GLubyte));
       fetch_ext_rgb_dxt1(texImage->Base.RowStride,
                          (GLubyte *)(texImage)->Base.Data, i, j, texel);
    }
@@ -407,19 +406,19 @@ void
 _mesa_fetch_texel_2d_f_rgb_dxt1(const struct swrast_texture_image *texImage,
                                 GLint i, GLint j, GLint k, GLfloat *texel)
 {
-   /* just sample as GLchan and convert to float here */
-   GLchan rgba[4];
+   /* just sample as GLubyte and convert to float here */
+   GLubyte rgba[4];
    fetch_texel_2d_rgb_dxt1(texImage, i, j, k, rgba);
-   texel[RCOMP] = CHAN_TO_FLOAT(rgba[RCOMP]);
-   texel[GCOMP] = CHAN_TO_FLOAT(rgba[GCOMP]);
-   texel[BCOMP] = CHAN_TO_FLOAT(rgba[BCOMP]);
-   texel[ACOMP] = CHAN_TO_FLOAT(rgba[ACOMP]);
+   texel[RCOMP] = UBYTE_TO_FLOAT(rgba[RCOMP]);
+   texel[GCOMP] = UBYTE_TO_FLOAT(rgba[GCOMP]);
+   texel[BCOMP] = UBYTE_TO_FLOAT(rgba[BCOMP]);
+   texel[ACOMP] = UBYTE_TO_FLOAT(rgba[ACOMP]);
 }
 
 
 static void
 fetch_texel_2d_rgba_dxt1( const struct swrast_texture_image *texImage,
-                          GLint i, GLint j, GLint k, GLchan *texel )
+                          GLint i, GLint j, GLint k, GLubyte *texel )
 {
    (void) k;
    if (fetch_ext_rgba_dxt1) {
@@ -435,23 +434,22 @@ void
 _mesa_fetch_texel_2d_f_rgba_dxt1(const struct swrast_texture_image *texImage,
                                  GLint i, GLint j, GLint k, GLfloat *texel)
 {
-   /* just sample as GLchan and convert to float here */
-   GLchan rgba[4];
+   /* just sample as GLubyte and convert to float here */
+   GLubyte rgba[4];
    fetch_texel_2d_rgba_dxt1(texImage, i, j, k, rgba);
-   texel[RCOMP] = CHAN_TO_FLOAT(rgba[RCOMP]);
-   texel[GCOMP] = CHAN_TO_FLOAT(rgba[GCOMP]);
-   texel[BCOMP] = CHAN_TO_FLOAT(rgba[BCOMP]);
-   texel[ACOMP] = CHAN_TO_FLOAT(rgba[ACOMP]);
+   texel[RCOMP] = UBYTE_TO_FLOAT(rgba[RCOMP]);
+   texel[GCOMP] = UBYTE_TO_FLOAT(rgba[GCOMP]);
+   texel[BCOMP] = UBYTE_TO_FLOAT(rgba[BCOMP]);
+   texel[ACOMP] = UBYTE_TO_FLOAT(rgba[ACOMP]);
 }
 
 
 static void
 fetch_texel_2d_rgba_dxt3( const struct swrast_texture_image *texImage,
-                          GLint i, GLint j, GLint k, GLchan *texel )
+                          GLint i, GLint j, GLint k, GLubyte *texel )
 {
    (void) k;
    if (fetch_ext_rgba_dxt3) {
-      ASSERT (sizeof(GLchan) == sizeof(GLubyte));
       fetch_ext_rgba_dxt3(texImage->Base.RowStride,
                           (GLubyte *)(texImage)->Base.Data,
                           i, j, texel);
@@ -465,19 +463,19 @@ void
 _mesa_fetch_texel_2d_f_rgba_dxt3(const struct swrast_texture_image *texImage,
                                  GLint i, GLint j, GLint k, GLfloat *texel)
 {
-   /* just sample as GLchan and convert to float here */
-   GLchan rgba[4];
+   /* just sample as GLubyte and convert to float here */
+   GLubyte rgba[4];
    fetch_texel_2d_rgba_dxt3(texImage, i, j, k, rgba);
-   texel[RCOMP] = CHAN_TO_FLOAT(rgba[RCOMP]);
-   texel[GCOMP] = CHAN_TO_FLOAT(rgba[GCOMP]);
-   texel[BCOMP] = CHAN_TO_FLOAT(rgba[BCOMP]);
-   texel[ACOMP] = CHAN_TO_FLOAT(rgba[ACOMP]);
+   texel[RCOMP] = UBYTE_TO_FLOAT(rgba[RCOMP]);
+   texel[GCOMP] = UBYTE_TO_FLOAT(rgba[GCOMP]);
+   texel[BCOMP] = UBYTE_TO_FLOAT(rgba[BCOMP]);
+   texel[ACOMP] = UBYTE_TO_FLOAT(rgba[ACOMP]);
 }
 
 
 static void
 fetch_texel_2d_rgba_dxt5( const struct swrast_texture_image *texImage,
-                          GLint i, GLint j, GLint k, GLchan *texel )
+                          GLint i, GLint j, GLint k, GLubyte *texel )
 {
    (void) k;
    if (fetch_ext_rgba_dxt5) {
@@ -494,13 +492,13 @@ void
 _mesa_fetch_texel_2d_f_rgba_dxt5(const struct swrast_texture_image *texImage,
                                  GLint i, GLint j, GLint k, GLfloat *texel)
 {
-   /* just sample as GLchan and convert to float here */
-   GLchan rgba[4];
+   /* just sample as GLubyte and convert to float here */
+   GLubyte rgba[4];
    fetch_texel_2d_rgba_dxt5(texImage, i, j, k, rgba);
-   texel[RCOMP] = CHAN_TO_FLOAT(rgba[RCOMP]);
-   texel[GCOMP] = CHAN_TO_FLOAT(rgba[GCOMP]);
-   texel[BCOMP] = CHAN_TO_FLOAT(rgba[BCOMP]);
-   texel[ACOMP] = CHAN_TO_FLOAT(rgba[ACOMP]);
+   texel[RCOMP] = UBYTE_TO_FLOAT(rgba[RCOMP]);
+   texel[GCOMP] = UBYTE_TO_FLOAT(rgba[GCOMP]);
+   texel[BCOMP] = UBYTE_TO_FLOAT(rgba[BCOMP]);
+   texel[ACOMP] = UBYTE_TO_FLOAT(rgba[ACOMP]);
 }
 
 #if FEATURE_EXT_texture_sRGB
@@ -508,52 +506,52 @@ void
 _mesa_fetch_texel_2d_f_srgb_dxt1( const struct swrast_texture_image *texImage,
                                   GLint i, GLint j, GLint k, GLfloat *texel )
 {
-   /* just sample as GLchan and convert to float here */
-   GLchan rgba[4];
+   /* just sample as GLubyte and convert to float here */
+   GLubyte rgba[4];
    fetch_texel_2d_rgb_dxt1(texImage, i, j, k, rgba);
    texel[RCOMP] = nonlinear_to_linear(rgba[RCOMP]);
    texel[GCOMP] = nonlinear_to_linear(rgba[GCOMP]);
    texel[BCOMP] = nonlinear_to_linear(rgba[BCOMP]);
-   texel[ACOMP] = CHAN_TO_FLOAT(rgba[ACOMP]);
+   texel[ACOMP] = UBYTE_TO_FLOAT(rgba[ACOMP]);
 }
 
 void
 _mesa_fetch_texel_2d_f_srgba_dxt1(const struct swrast_texture_image *texImage,
                                   GLint i, GLint j, GLint k, GLfloat *texel)
 {
-   /* just sample as GLchan and convert to float here */
-   GLchan rgba[4];
+   /* just sample as GLubyte and convert to float here */
+   GLubyte rgba[4];
    fetch_texel_2d_rgba_dxt1(texImage, i, j, k, rgba);
    texel[RCOMP] = nonlinear_to_linear(rgba[RCOMP]);
    texel[GCOMP] = nonlinear_to_linear(rgba[GCOMP]);
    texel[BCOMP] = nonlinear_to_linear(rgba[BCOMP]);
-   texel[ACOMP] = CHAN_TO_FLOAT(rgba[ACOMP]);
+   texel[ACOMP] = UBYTE_TO_FLOAT(rgba[ACOMP]);
 }
 
 void
 _mesa_fetch_texel_2d_f_srgba_dxt3(const struct swrast_texture_image *texImage,
                                   GLint i, GLint j, GLint k, GLfloat *texel)
 {
-   /* just sample as GLchan and convert to float here */
-   GLchan rgba[4];
+   /* just sample as GLubyte and convert to float here */
+   GLubyte rgba[4];
    fetch_texel_2d_rgba_dxt3(texImage, i, j, k, rgba);
    texel[RCOMP] = nonlinear_to_linear(rgba[RCOMP]);
    texel[GCOMP] = nonlinear_to_linear(rgba[GCOMP]);
    texel[BCOMP] = nonlinear_to_linear(rgba[BCOMP]);
-   texel[ACOMP] = CHAN_TO_FLOAT(rgba[ACOMP]);
+   texel[ACOMP] = UBYTE_TO_FLOAT(rgba[ACOMP]);
 }
 
 void
 _mesa_fetch_texel_2d_f_srgba_dxt5(const struct swrast_texture_image *texImage,
                                   GLint i, GLint j, GLint k, GLfloat *texel)
 {
-   /* just sample as GLchan and convert to float here */
-   GLchan rgba[4];
+   /* just sample as GLubyte and convert to float here */
+   GLubyte rgba[4];
    fetch_texel_2d_rgba_dxt5(texImage, i, j, k, rgba);
    texel[RCOMP] = nonlinear_to_linear(rgba[RCOMP]);
    texel[GCOMP] = nonlinear_to_linear(rgba[GCOMP]);
    texel[BCOMP] = nonlinear_to_linear(rgba[BCOMP]);
-   texel[ACOMP] = CHAN_TO_FLOAT(rgba[ACOMP]);
+   texel[ACOMP] = UBYTE_TO_FLOAT(rgba[ACOMP]);
 }
 #endif /* FEATURE_EXT_texture_sRGB */
 
diff --git a/mesalib/src/mesa/main/texstore.c b/mesalib/src/mesa/main/texstore.c
index b958615b5..cbed26cd4 100644
--- a/mesalib/src/mesa/main/texstore.c
+++ b/mesalib/src/mesa/main/texstore.c
@@ -531,7 +531,7 @@ make_temp_uint_image(struct gl_context *ctx, GLuint dims,
 
 
 /**
- * Make a temporary (color) texture image with GLchan components.
+ * Make a temporary (color) texture image with GLubyte components.
  * Apply all needed pixel unpacking and pixel transfer operations.
  * Note that there are both logicalBaseFormat and textureBaseFormat parameters.
  * Suppose the user specifies GL_LUMINANCE as the internal texture format
@@ -551,21 +551,21 @@ make_temp_uint_image(struct gl_context *ctx, GLuint dims,
  * \param srcType  source image type
  * \param srcAddr  source image address
  * \param srcPacking  source image pixel packing
- * \return resulting image with format = textureBaseFormat and type = GLchan.
+ * \return resulting image with format = textureBaseFormat and type = GLubyte.
  */
-GLchan *
-_mesa_make_temp_chan_image(struct gl_context *ctx, GLuint dims,
-                           GLenum logicalBaseFormat,
-                           GLenum textureBaseFormat,
-                           GLint srcWidth, GLint srcHeight, GLint srcDepth,
-                           GLenum srcFormat, GLenum srcType,
-                           const GLvoid *srcAddr,
-                           const struct gl_pixelstore_attrib *srcPacking)
+GLubyte *
+_mesa_make_temp_ubyte_image(struct gl_context *ctx, GLuint dims,
+                            GLenum logicalBaseFormat,
+                            GLenum textureBaseFormat,
+                            GLint srcWidth, GLint srcHeight, GLint srcDepth,
+                            GLenum srcFormat, GLenum srcType,
+                            const GLvoid *srcAddr,
+                            const struct gl_pixelstore_attrib *srcPacking)
 {
    GLuint transferOps = ctx->_ImageTransferState;
    const GLint components = _mesa_components_in_format(logicalBaseFormat);
    GLint img, row;
-   GLchan *tempImage, *dst;
+   GLubyte *tempImage, *dst;
 
    ASSERT(dims >= 1 && dims <= 3);
 
@@ -588,8 +588,8 @@ _mesa_make_temp_chan_image(struct gl_context *ctx, GLuint dims,
           textureBaseFormat == GL_INTENSITY);
 
    /* unpack and transfer the source image */
-   tempImage = (GLchan *) malloc(srcWidth * srcHeight * srcDepth
-                                       * components * sizeof(GLchan));
+   tempImage = (GLubyte *) malloc(srcWidth * srcHeight * srcDepth
+                                       * components * sizeof(GLubyte));
    if (!tempImage) {
       return NULL;
    }
@@ -604,9 +604,9 @@ _mesa_make_temp_chan_image(struct gl_context *ctx, GLuint dims,
                                                srcFormat, srcType,
                                                img, 0, 0);
       for (row = 0; row < srcHeight; row++) {
-         _mesa_unpack_color_span_chan(ctx, srcWidth, logicalBaseFormat, dst,
-                                      srcFormat, srcType, src, srcPacking,
-                                      transferOps);
+         _mesa_unpack_color_span_ubyte(ctx, srcWidth, logicalBaseFormat, dst,
+                                       srcFormat, srcType, src, srcPacking,
+                                       transferOps);
          dst += srcWidth * components;
          src += srcStride;
       }
@@ -616,7 +616,7 @@ _mesa_make_temp_chan_image(struct gl_context *ctx, GLuint dims,
       /* one more conversion step */
       GLint texComponents = _mesa_components_in_format(textureBaseFormat);
       GLint logComponents = _mesa_components_in_format(logicalBaseFormat);
-      GLchan *newImage;
+      GLubyte *newImage;
       GLint i, n;
       GLubyte map[6];
 
@@ -629,8 +629,8 @@ _mesa_make_temp_chan_image(struct gl_context *ctx, GLuint dims,
        */
       ASSERT(texComponents >= logComponents);
 
-      newImage = (GLchan *) malloc(srcWidth * srcHeight * srcDepth
-                                         * texComponents * sizeof(GLchan));
+      newImage = (GLubyte *) malloc(srcWidth * srcHeight * srcDepth
+                                         * texComponents * sizeof(GLubyte));
       if (!newImage) {
          free(tempImage);
          return NULL;
@@ -646,7 +646,7 @@ _mesa_make_temp_chan_image(struct gl_context *ctx, GLuint dims,
             if (j == ZERO)
                newImage[i * texComponents + k] = 0;
             else if (j == ONE)
-               newImage[i * texComponents + k] = CHAN_MAX;
+               newImage[i * texComponents + k] = 255;
             else
                newImage[i * texComponents + k] = tempImage[i * logComponents + j];
          }
@@ -1235,13 +1235,13 @@ _mesa_texstore_rgb565(TEXSTORE_PARAMS)
    }
    else {
       /* general path */
-      const GLchan *tempImage = _mesa_make_temp_chan_image(ctx, dims,
+      const GLubyte *tempImage = _mesa_make_temp_ubyte_image(ctx, dims,
                                                  baseInternalFormat,
                                                  baseFormat,
                                                  srcWidth, srcHeight, srcDepth,
                                                  srcFormat, srcType, srcAddr,
                                                  srcPacking);
-      const GLchan *src = tempImage;
+      const GLubyte *src = tempImage;
       GLint img, row, col;
       if (!tempImage)
          return GL_FALSE;
@@ -1255,17 +1255,17 @@ _mesa_texstore_rgb565(TEXSTORE_PARAMS)
             /* check for byteswapped format */
             if (dstFormat == MESA_FORMAT_RGB565) {
                for (col = 0; col < srcWidth; col++) {
-                  dstUS[col] = PACK_COLOR_565( CHAN_TO_UBYTE(src[RCOMP]),
-                                               CHAN_TO_UBYTE(src[GCOMP]),
-                                               CHAN_TO_UBYTE(src[BCOMP]) );
+                  dstUS[col] = PACK_COLOR_565( src[RCOMP],
+                                               src[GCOMP],
+                                               src[BCOMP] );
                   src += 3;
                }
             }
             else {
                for (col = 0; col < srcWidth; col++) {
-                  dstUS[col] = PACK_COLOR_565_REV( CHAN_TO_UBYTE(src[RCOMP]),
-                                                   CHAN_TO_UBYTE(src[GCOMP]),
-                                                   CHAN_TO_UBYTE(src[BCOMP]) );
+                  dstUS[col] = PACK_COLOR_565_REV( src[RCOMP],
+                                                   src[GCOMP],
+                                                   src[BCOMP] );
                   src += 3;
                }
             }
@@ -1361,13 +1361,13 @@ _mesa_texstore_rgba8888(TEXSTORE_PARAMS)
    }
    else {
       /* general path */
-      const GLchan *tempImage = _mesa_make_temp_chan_image(ctx, dims,
+      const GLubyte *tempImage = _mesa_make_temp_ubyte_image(ctx, dims,
                                                  baseInternalFormat,
                                                  baseFormat,
                                                  srcWidth, srcHeight, srcDepth,
                                                  srcFormat, srcType, srcAddr,
                                                  srcPacking);
-      const GLchan *src = tempImage;
+      const GLubyte *src = tempImage;
       GLint img, row, col;
       if (!tempImage)
          return GL_FALSE;
@@ -1380,19 +1380,19 @@ _mesa_texstore_rgba8888(TEXSTORE_PARAMS)
             GLuint *dstUI = (GLuint *) dstRow;
             if (dstFormat == MESA_FORMAT_RGBA8888) {
                for (col = 0; col < srcWidth; col++) {
-                  dstUI[col] = PACK_COLOR_8888( CHAN_TO_UBYTE(src[RCOMP]),
-                                                CHAN_TO_UBYTE(src[GCOMP]),
-                                                CHAN_TO_UBYTE(src[BCOMP]),
-                                                CHAN_TO_UBYTE(src[ACOMP]) );
+                  dstUI[col] = PACK_COLOR_8888( src[RCOMP],
+                                                src[GCOMP],
+                                                src[BCOMP],
+                                                src[ACOMP] );
                   src += 4;
                }
             }
             else {
                for (col = 0; col < srcWidth; col++) {
-                  dstUI[col] = PACK_COLOR_8888_REV( CHAN_TO_UBYTE(src[RCOMP]),
-                                                    CHAN_TO_UBYTE(src[GCOMP]),
-                                                    CHAN_TO_UBYTE(src[BCOMP]),
-                                                    CHAN_TO_UBYTE(src[ACOMP]) );
+                  dstUI[col] = PACK_COLOR_8888_REV( src[RCOMP],
+                                                    src[GCOMP],
+                                                    src[BCOMP],
+                                                    src[ACOMP] );
                   src += 4;
                }
             }
@@ -1561,13 +1561,13 @@ _mesa_texstore_argb8888(TEXSTORE_PARAMS)
    }
    else {
       /* general path */
-      const GLchan *tempImage = _mesa_make_temp_chan_image(ctx, dims,
+      const GLubyte *tempImage = _mesa_make_temp_ubyte_image(ctx, dims,
                                                  baseInternalFormat,
                                                  baseFormat,
                                                  srcWidth, srcHeight, srcDepth,
                                                  srcFormat, srcType, srcAddr,
                                                  srcPacking);
-      const GLchan *src = tempImage;
+      const GLubyte *src = tempImage;
       GLint img, row, col;
       if (!tempImage)
          return GL_FALSE;
@@ -1580,28 +1580,28 @@ _mesa_texstore_argb8888(TEXSTORE_PARAMS)
             GLuint *dstUI = (GLuint *) dstRow;
             if (dstFormat == MESA_FORMAT_ARGB8888) {
                for (col = 0; col < srcWidth; col++) {
-                  dstUI[col] = PACK_COLOR_8888( CHAN_TO_UBYTE(src[ACOMP]),
-                                                CHAN_TO_UBYTE(src[RCOMP]),
-                                                CHAN_TO_UBYTE(src[GCOMP]),
-                                                CHAN_TO_UBYTE(src[BCOMP]) );
+                  dstUI[col] = PACK_COLOR_8888( src[ACOMP],
+                                                src[RCOMP],
+                                                src[GCOMP],
+                                                src[BCOMP] );
                   src += 4;
                }
             }
             else if (dstFormat == MESA_FORMAT_XRGB8888) {
                for (col = 0; col < srcWidth; col++) {
                   dstUI[col] = PACK_COLOR_8888( 0xff,
-                                                CHAN_TO_UBYTE(src[RCOMP]),
-                                                CHAN_TO_UBYTE(src[GCOMP]),
-                                                CHAN_TO_UBYTE(src[BCOMP]) );
+                                                src[RCOMP],
+                                                src[GCOMP],
+                                                src[BCOMP] );
                   src += 4;
                }
             }
             else {
                for (col = 0; col < srcWidth; col++) {
-                  dstUI[col] = PACK_COLOR_8888_REV( CHAN_TO_UBYTE(src[ACOMP]),
-                                                    CHAN_TO_UBYTE(src[RCOMP]),
-                                                    CHAN_TO_UBYTE(src[GCOMP]),
-                                                    CHAN_TO_UBYTE(src[BCOMP]) );
+                  dstUI[col] = PACK_COLOR_8888_REV( src[ACOMP],
+                                                    src[RCOMP],
+                                                    src[GCOMP],
+                                                    src[BCOMP] );
                   src += 4;
                }
             }
@@ -1690,13 +1690,13 @@ _mesa_texstore_rgb888(TEXSTORE_PARAMS)
    }
    else {
       /* general path */
-      const GLchan *tempImage = _mesa_make_temp_chan_image(ctx, dims,
+      const GLubyte *tempImage = _mesa_make_temp_ubyte_image(ctx, dims,
                                                  baseInternalFormat,
                                                  baseFormat,
                                                  srcWidth, srcHeight, srcDepth,
                                                  srcFormat, srcType, srcAddr,
                                                  srcPacking);
-      const GLchan *src = (const GLchan *) tempImage;
+      const GLubyte *src = (const GLubyte *) tempImage;
       GLint img, row, col;
       if (!tempImage)
          return GL_FALSE;
@@ -1709,9 +1709,9 @@ _mesa_texstore_rgb888(TEXSTORE_PARAMS)
 #if 0
             if (littleEndian) {
                for (col = 0; col < srcWidth; col++) {
-                  dstRow[col * 3 + 0] = CHAN_TO_UBYTE(src[RCOMP]);
-                  dstRow[col * 3 + 1] = CHAN_TO_UBYTE(src[GCOMP]);
-                  dstRow[col * 3 + 2] = CHAN_TO_UBYTE(src[BCOMP]);
+                  dstRow[col * 3 + 0] = src[RCOMP];
+                  dstRow[col * 3 + 1] = src[GCOMP];
+                  dstRow[col * 3 + 2] = src[BCOMP];
                   srcUB += 3;
                }
             }
@@ -1725,9 +1725,9 @@ _mesa_texstore_rgb888(TEXSTORE_PARAMS)
             }
 #else
             for (col = 0; col < srcWidth; col++) {
-               dstRow[col * 3 + 0] = CHAN_TO_UBYTE(src[BCOMP]);
-               dstRow[col * 3 + 1] = CHAN_TO_UBYTE(src[GCOMP]);
-               dstRow[col * 3 + 2] = CHAN_TO_UBYTE(src[RCOMP]);
+               dstRow[col * 3 + 0] = src[BCOMP];
+               dstRow[col * 3 + 1] = src[GCOMP];
+               dstRow[col * 3 + 2] = src[RCOMP];
                src += 3;
             }
 #endif
@@ -1816,13 +1816,13 @@ _mesa_texstore_bgr888(TEXSTORE_PARAMS)
    }   
    else {
       /* general path */
-      const GLchan *tempImage = _mesa_make_temp_chan_image(ctx, dims,
+      const GLubyte *tempImage = _mesa_make_temp_ubyte_image(ctx, dims,
                                                  baseInternalFormat,
                                                  baseFormat,
                                                  srcWidth, srcHeight, srcDepth,
                                                  srcFormat, srcType, srcAddr,
                                                  srcPacking);
-      const GLchan *src = (const GLchan *) tempImage;
+      const GLubyte *src = (const GLubyte *) tempImage;
       GLint img, row, col;
       if (!tempImage)
          return GL_FALSE;
@@ -1833,9 +1833,9 @@ _mesa_texstore_bgr888(TEXSTORE_PARAMS)
             + dstXoffset * texelBytes;
          for (row = 0; row < srcHeight; row++) {
             for (col = 0; col < srcWidth; col++) {
-               dstRow[col * 3 + 0] = CHAN_TO_UBYTE(src[RCOMP]);
-               dstRow[col * 3 + 1] = CHAN_TO_UBYTE(src[GCOMP]);
-               dstRow[col * 3 + 2] = CHAN_TO_UBYTE(src[BCOMP]);
+               dstRow[col * 3 + 0] = src[RCOMP];
+               dstRow[col * 3 + 1] = src[GCOMP];
+               dstRow[col * 3 + 2] = src[BCOMP];
                src += 3;
             }
             dstRow += dstRowStride;
@@ -1873,13 +1873,13 @@ _mesa_texstore_argb4444(TEXSTORE_PARAMS)
    }
    else {
       /* general path */
-      const GLchan *tempImage = _mesa_make_temp_chan_image(ctx, dims,
+      const GLubyte *tempImage = _mesa_make_temp_ubyte_image(ctx, dims,
                                                  baseInternalFormat,
                                                  baseFormat,
                                                  srcWidth, srcHeight, srcDepth,
                                                  srcFormat, srcType, srcAddr,
                                                  srcPacking);
-      const GLchan *src = tempImage;
+      const GLubyte *src = tempImage;
       GLint img, row, col;
       if (!tempImage)
          return GL_FALSE;
@@ -1892,19 +1892,19 @@ _mesa_texstore_argb4444(TEXSTORE_PARAMS)
             GLushort *dstUS = (GLushort *) dstRow;
             if (dstFormat == MESA_FORMAT_ARGB4444) {
                for (col = 0; col < srcWidth; col++) {
-                  dstUS[col] = PACK_COLOR_4444( CHAN_TO_UBYTE(src[ACOMP]),
-                                                CHAN_TO_UBYTE(src[RCOMP]),
-                                                CHAN_TO_UBYTE(src[GCOMP]),
-                                                CHAN_TO_UBYTE(src[BCOMP]) );
+                  dstUS[col] = PACK_COLOR_4444( src[ACOMP],
+                                                src[RCOMP],
+                                                src[GCOMP],
+                                                src[BCOMP] );
                   src += 4;
                }
             }
             else {
                for (col = 0; col < srcWidth; col++) {
-                  dstUS[col] = PACK_COLOR_4444_REV( CHAN_TO_UBYTE(src[ACOMP]),
-                                                    CHAN_TO_UBYTE(src[RCOMP]),
-                                                    CHAN_TO_UBYTE(src[GCOMP]),
-                                                    CHAN_TO_UBYTE(src[BCOMP]) );
+                  dstUS[col] = PACK_COLOR_4444_REV( src[ACOMP],
+                                                    src[RCOMP],
+                                                    src[GCOMP],
+                                                    src[BCOMP] );
                   src += 4;
                }
             }
@@ -1941,13 +1941,13 @@ _mesa_texstore_rgba5551(TEXSTORE_PARAMS)
    }
    else {
       /* general path */
-      const GLchan *tempImage = _mesa_make_temp_chan_image(ctx, dims,
+      const GLubyte *tempImage = _mesa_make_temp_ubyte_image(ctx, dims,
                                                  baseInternalFormat,
                                                  baseFormat,
                                                  srcWidth, srcHeight, srcDepth,
                                                  srcFormat, srcType, srcAddr,
                                                  srcPacking);
-      const GLchan *src =tempImage;
+      const GLubyte *src =tempImage;
       GLint img, row, col;
       if (!tempImage)
          return GL_FALSE;
@@ -1959,10 +1959,10 @@ _mesa_texstore_rgba5551(TEXSTORE_PARAMS)
          for (row = 0; row < srcHeight; row++) {
             GLushort *dstUS = (GLushort *) dstRow;
 	    for (col = 0; col < srcWidth; col++) {
-	       dstUS[col] = PACK_COLOR_5551( CHAN_TO_UBYTE(src[RCOMP]),
-					     CHAN_TO_UBYTE(src[GCOMP]),
-					     CHAN_TO_UBYTE(src[BCOMP]),
-					     CHAN_TO_UBYTE(src[ACOMP]) );
+	       dstUS[col] = PACK_COLOR_5551( src[RCOMP],
+					     src[GCOMP],
+					     src[BCOMP],
+					     src[ACOMP] );
 	      src += 4;
 	    }
             dstRow += dstRowStride;
@@ -1999,13 +1999,13 @@ _mesa_texstore_argb1555(TEXSTORE_PARAMS)
    }
    else {
       /* general path */
-      const GLchan *tempImage = _mesa_make_temp_chan_image(ctx, dims,
+      const GLubyte *tempImage = _mesa_make_temp_ubyte_image(ctx, dims,
                                                  baseInternalFormat,
                                                  baseFormat,
                                                  srcWidth, srcHeight, srcDepth,
                                                  srcFormat, srcType, srcAddr,
                                                  srcPacking);
-      const GLchan *src =tempImage;
+      const GLubyte *src =tempImage;
       GLint img, row, col;
       if (!tempImage)
          return GL_FALSE;
@@ -2018,19 +2018,19 @@ _mesa_texstore_argb1555(TEXSTORE_PARAMS)
             GLushort *dstUS = (GLushort *) dstRow;
             if (dstFormat == MESA_FORMAT_ARGB1555) {
                for (col = 0; col < srcWidth; col++) {
-                  dstUS[col] = PACK_COLOR_1555( CHAN_TO_UBYTE(src[ACOMP]),
-                                                CHAN_TO_UBYTE(src[RCOMP]),
-                                                CHAN_TO_UBYTE(src[GCOMP]),
-                                                CHAN_TO_UBYTE(src[BCOMP]) );
+                  dstUS[col] = PACK_COLOR_1555( src[ACOMP],
+                                                src[RCOMP],
+                                                src[GCOMP],
+                                                src[BCOMP] );
                   src += 4;
                }
             }
             else {
                for (col = 0; col < srcWidth; col++) {
-                  dstUS[col] = PACK_COLOR_1555_REV( CHAN_TO_UBYTE(src[ACOMP]),
-                                                    CHAN_TO_UBYTE(src[RCOMP]),
-                                                    CHAN_TO_UBYTE(src[GCOMP]),
-                                                    CHAN_TO_UBYTE(src[BCOMP]) );
+                  dstUS[col] = PACK_COLOR_1555_REV( src[ACOMP],
+                                                    src[RCOMP],
+                                                    src[GCOMP],
+                                                    src[BCOMP] );
                   src += 4;
                }
             }
@@ -2137,13 +2137,13 @@ _mesa_texstore_unorm44(TEXSTORE_PARAMS)
 
    {
       /* general path */
-      const GLchan *tempImage = _mesa_make_temp_chan_image(ctx, dims,
+      const GLubyte *tempImage = _mesa_make_temp_ubyte_image(ctx, dims,
                                                  baseInternalFormat,
                                                  baseFormat,
                                                  srcWidth, srcHeight, srcDepth,
                                                  srcFormat, srcType, srcAddr,
                                                  srcPacking);
-      const GLchan *src = tempImage;
+      const GLubyte *src = tempImage;
       GLint img, row, col;
       if (!tempImage)
          return GL_FALSE;
@@ -2156,8 +2156,8 @@ _mesa_texstore_unorm44(TEXSTORE_PARAMS)
             GLubyte *dstUS = (GLubyte *) dstRow;
             for (col = 0; col < srcWidth; col++) {
                /* src[0] is luminance, src[1] is alpha */
-               dstUS[col] = PACK_COLOR_44( CHAN_TO_UBYTE(src[1]),
-                                           CHAN_TO_UBYTE(src[0]) );
+               dstUS[col] = PACK_COLOR_44( src[1],
+                                           src[0] );
                src += 2;
             }
             dstRow += dstRowStride;
@@ -2248,13 +2248,13 @@ _mesa_texstore_unorm88(TEXSTORE_PARAMS)
    }   
    else {
       /* general path */
-      const GLchan *tempImage = _mesa_make_temp_chan_image(ctx, dims,
+      const GLubyte *tempImage = _mesa_make_temp_ubyte_image(ctx, dims,
                                                  baseInternalFormat,
                                                  baseFormat,
                                                  srcWidth, srcHeight, srcDepth,
                                                  srcFormat, srcType, srcAddr,
                                                  srcPacking);
-      const GLchan *src = tempImage;
+      const GLubyte *src = tempImage;
       GLint img, row, col;
       if (!tempImage)
          return GL_FALSE;
@@ -2269,16 +2269,16 @@ _mesa_texstore_unorm88(TEXSTORE_PARAMS)
 		dstFormat == MESA_FORMAT_RG88) {
                for (col = 0; col < srcWidth; col++) {
                   /* src[0] is luminance, src[1] is alpha */
-                 dstUS[col] = PACK_COLOR_88( CHAN_TO_UBYTE(src[1]),
-                                             CHAN_TO_UBYTE(src[0]) );
+                 dstUS[col] = PACK_COLOR_88( src[1],
+                                             src[0] );
                  src += 2;
                }
             }
             else {
                for (col = 0; col < srcWidth; col++) {
                   /* src[0] is luminance, src[1] is alpha */
-                 dstUS[col] = PACK_COLOR_88_REV( CHAN_TO_UBYTE(src[1]),
-                                                 CHAN_TO_UBYTE(src[0]) );
+                 dstUS[col] = PACK_COLOR_88_REV( src[1],
+                                                 src[0] );
                  src += 2;
                }
             }
@@ -2604,13 +2604,13 @@ _mesa_texstore_rgb332(TEXSTORE_PARAMS)
    }
    else {
       /* general path */
-      const GLchan *tempImage = _mesa_make_temp_chan_image(ctx, dims,
+      const GLubyte *tempImage = _mesa_make_temp_ubyte_image(ctx, dims,
                                                  baseInternalFormat,
                                                  baseFormat,
                                                  srcWidth, srcHeight, srcDepth,
                                                  srcFormat, srcType, srcAddr,
                                                  srcPacking);
-      const GLchan *src = tempImage;
+      const GLubyte *src = tempImage;
       GLint img, row, col;
       if (!tempImage)
          return GL_FALSE;
@@ -2621,9 +2621,9 @@ _mesa_texstore_rgb332(TEXSTORE_PARAMS)
             + dstXoffset * texelBytes;
          for (row = 0; row < srcHeight; row++) {
             for (col = 0; col < srcWidth; col++) {
-               dstRow[col] = PACK_COLOR_332( CHAN_TO_UBYTE(src[RCOMP]),
-                                             CHAN_TO_UBYTE(src[GCOMP]),
-                                             CHAN_TO_UBYTE(src[BCOMP]) );
+               dstRow[col] = PACK_COLOR_332( src[RCOMP],
+                                             src[GCOMP],
+                                             src[BCOMP] );
                src += 3;
             }
             dstRow += dstRowStride;
@@ -2692,13 +2692,13 @@ _mesa_texstore_unorm8(TEXSTORE_PARAMS)
    }   
    else {
       /* general path */
-      const GLchan *tempImage = _mesa_make_temp_chan_image(ctx, dims,
+      const GLubyte *tempImage = _mesa_make_temp_ubyte_image(ctx, dims,
                                                  baseInternalFormat,
                                                  baseFormat,
                                                  srcWidth, srcHeight, srcDepth,
                                                  srcFormat, srcType, srcAddr,
                                                  srcPacking);
-      const GLchan *src = tempImage;
+      const GLubyte *src = tempImage;
       GLint img, row, col;
       if (!tempImage)
          return GL_FALSE;
@@ -2709,7 +2709,7 @@ _mesa_texstore_unorm8(TEXSTORE_PARAMS)
             + dstXoffset * texelBytes;
          for (row = 0; row < srcHeight; row++) {
             for (col = 0; col < srcWidth; col++) {
-               dstRow[col] = CHAN_TO_UBYTE(src[col]);
+               dstRow[col] = src[col];
             }
             dstRow += dstRowStride;
             src += srcWidth;
diff --git a/mesalib/src/mesa/main/texstore.h b/mesalib/src/mesa/main/texstore.h
index d56318709..24a254ac4 100644
--- a/mesalib/src/mesa/main/texstore.h
+++ b/mesalib/src/mesa/main/texstore.h
@@ -72,8 +72,8 @@ extern GLboolean
 _mesa_texstore(TEXSTORE_PARAMS);
 
 
-extern GLchan *
-_mesa_make_temp_chan_image(struct gl_context *ctx, GLuint dims,
+extern GLubyte *
+_mesa_make_temp_ubyte_image(struct gl_context *ctx, GLuint dims,
                            GLenum logicalBaseFormat,
                            GLenum textureBaseFormat,
                            GLint srcWidth, GLint srcHeight, GLint srcDepth,
diff --git a/mesalib/src/mesa/math/m_translate.h b/mesalib/src/mesa/math/m_translate.h
index 580410311..bf7485c8c 100644
--- a/mesalib/src/mesa/math/m_translate.h
+++ b/mesalib/src/mesa/math/m_translate.h
@@ -29,7 +29,7 @@
 #include "main/compiler.h"
 #include "main/glheader.h"
 #include "main/mtypes.h"		/* hack for GLchan */
-
+#include "swrast/s_chan.h"
 
 /**
  * Array translation.
diff --git a/mesalib/src/mesa/state_tracker/st_cb_texture.c b/mesalib/src/mesa/state_tracker/st_cb_texture.c
index 68323a35a..97c1fabd5 100644
--- a/mesalib/src/mesa/state_tracker/st_cb_texture.c
+++ b/mesalib/src/mesa/state_tracker/st_cb_texture.c
@@ -849,6 +849,9 @@ decompress_with_blit(struct gl_context * ctx, GLenum target, GLint level,
       pipe->render_condition(pipe, NULL, 0);
    }
 
+   /* Choose the source mipmap level */
+   src_view->u.tex.first_level = src_view->u.tex.last_level = level;
+
    /* blit/render/decompress */
    util_blit_pixels_tex(st->blit,
                         src_view,      /* pipe_resource (src) */
diff --git a/mesalib/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/mesalib/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 892169822..f68270d0f 100644
--- a/mesalib/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/mesalib/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -1528,15 +1528,45 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
          st_src_reg temp = get_temp(native_integers ?
                glsl_type::get_instance(ir->operands[0]->type->base_type, 4, 1) :
                glsl_type::vec4_type);
-         assert(ir->operands[0]->type->base_type == GLSL_TYPE_FLOAT);
-         emit(ir, TGSI_OPCODE_SNE, st_dst_reg(temp), op[0], op[1]);
-         
-         /* After the dot-product, the value will be an integer on the
-          * range [0,4].  Zero becomes 1.0, and positive values become zero.
-          */
-         emit_dp(ir, result_dst, temp, temp, vector_elements);
          
-         if (result_dst.type == GLSL_TYPE_FLOAT) {
+         if (native_integers) {
+            st_dst_reg temp_dst = st_dst_reg(temp);
+            st_src_reg temp1 = st_src_reg(temp), temp2 = st_src_reg(temp);
+            
+            emit(ir, TGSI_OPCODE_SEQ, st_dst_reg(temp), op[0], op[1]);
+            
+            /* Emit 1-3 AND operations to combine the SEQ results. */
+            switch (ir->operands[0]->type->vector_elements) {
+            case 2:
+               break;
+            case 3:
+               temp_dst.writemask = WRITEMASK_Y;
+               temp1.swizzle = SWIZZLE_YYYY;
+               temp2.swizzle = SWIZZLE_ZZZZ;
+               emit(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
+               break;
+            case 4:
+               temp_dst.writemask = WRITEMASK_X;
+               temp1.swizzle = SWIZZLE_XXXX;
+               temp2.swizzle = SWIZZLE_YYYY;
+               emit(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
+               temp_dst.writemask = WRITEMASK_Y;
+               temp1.swizzle = SWIZZLE_ZZZZ;
+               temp2.swizzle = SWIZZLE_WWWW;
+               emit(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
+            }
+            
+            temp1.swizzle = SWIZZLE_XXXX;
+            temp2.swizzle = SWIZZLE_YYYY;
+            emit(ir, TGSI_OPCODE_AND, result_dst, temp1, temp2);
+         } else {
+            emit(ir, TGSI_OPCODE_SNE, st_dst_reg(temp), op[0], op[1]);
+            
+            /* After the dot-product, the value will be an integer on the
+             * range [0,4].  Zero becomes 1.0, and positive values become zero.
+             */
+            emit_dp(ir, result_dst, temp, temp, vector_elements);
+
             /* Negating the result of the dot-product gives values on the range
              * [-4, 0].  Zero becomes 1.0, and negative values become zero.
              * This is achieved using SGE.
@@ -1544,11 +1574,6 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
             st_src_reg sge_src = result_src;
             sge_src.negate = ~sge_src.negate;
             emit(ir, TGSI_OPCODE_SGE, result_dst, sge_src, st_src_reg_for_float(0.0));
-         } else {
-            /* The TGSI negate flag doesn't work for integers, so use SEQ 0
-             * instead.
-             */
-            emit(ir, TGSI_OPCODE_SEQ, result_dst, result_src, st_src_reg_for_int(0));
          }
       } else {
          emit(ir, TGSI_OPCODE_SEQ, result_dst, op[0], op[1]);
@@ -1561,30 +1586,56 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
          st_src_reg temp = get_temp(native_integers ?
                glsl_type::get_instance(ir->operands[0]->type->base_type, 4, 1) :
                glsl_type::vec4_type);
-         assert(ir->operands[0]->type->base_type == GLSL_TYPE_FLOAT);
          emit(ir, TGSI_OPCODE_SNE, st_dst_reg(temp), op[0], op[1]);
 
-         /* After the dot-product, the value will be an integer on the
-          * range [0,4].  Zero stays zero, and positive values become 1.0.
-          */
-         glsl_to_tgsi_instruction *const dp =
-               emit_dp(ir, result_dst, temp, temp, vector_elements);
-         if (this->prog->Target == GL_FRAGMENT_PROGRAM_ARB &&
-             result_dst.type == GLSL_TYPE_FLOAT) {
-            /* The clamping to [0,1] can be done for free in the fragment
-             * shader with a saturate.
-             */
-            dp->saturate = true;
-         } else if (result_dst.type == GLSL_TYPE_FLOAT) {
-            /* Negating the result of the dot-product gives values on the range
-             * [-4, 0].  Zero stays zero, and negative values become 1.0.  This
-             * achieved using SLT.
-             */
-            st_src_reg slt_src = result_src;
-            slt_src.negate = ~slt_src.negate;
-            emit(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0));
+         if (native_integers) {
+            st_dst_reg temp_dst = st_dst_reg(temp);
+            st_src_reg temp1 = st_src_reg(temp), temp2 = st_src_reg(temp);
+            
+            /* Emit 1-3 OR operations to combine the SNE results. */
+            switch (ir->operands[0]->type->vector_elements) {
+            case 2:
+               break;
+            case 3:
+               temp_dst.writemask = WRITEMASK_Y;
+               temp1.swizzle = SWIZZLE_YYYY;
+               temp2.swizzle = SWIZZLE_ZZZZ;
+               emit(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
+               break;
+            case 4:
+               temp_dst.writemask = WRITEMASK_X;
+               temp1.swizzle = SWIZZLE_XXXX;
+               temp2.swizzle = SWIZZLE_YYYY;
+               emit(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
+               temp_dst.writemask = WRITEMASK_Y;
+               temp1.swizzle = SWIZZLE_ZZZZ;
+               temp2.swizzle = SWIZZLE_WWWW;
+               emit(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
+            }
+            
+            temp1.swizzle = SWIZZLE_XXXX;
+            temp2.swizzle = SWIZZLE_YYYY;
+            emit(ir, TGSI_OPCODE_OR, result_dst, temp1, temp2);
          } else {
-            emit(ir, TGSI_OPCODE_SNE, result_dst, result_src, st_src_reg_for_int(0));
+            /* After the dot-product, the value will be an integer on the
+             * range [0,4].  Zero stays zero, and positive values become 1.0.
+             */
+            glsl_to_tgsi_instruction *const dp =
+                  emit_dp(ir, result_dst, temp, temp, vector_elements);
+            if (this->prog->Target == GL_FRAGMENT_PROGRAM_ARB) {
+               /* The clamping to [0,1] can be done for free in the fragment
+                * shader with a saturate.
+                */
+               dp->saturate = true;
+            } else {
+               /* Negating the result of the dot-product gives values on the range
+                * [-4, 0].  Zero stays zero, and negative values become 1.0.  This
+                * is achieved using SLT.
+                */
+               st_src_reg slt_src = result_src;
+               slt_src.negate = ~slt_src.negate;
+               emit(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0));
+            }
          }
       } else {
          emit(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
diff --git a/mesalib/src/mesa/swrast/s_span.h b/mesalib/src/mesa/swrast/s_span.h
index d3cce304f..382c3d2eb 100644
--- a/mesalib/src/mesa/swrast/s_span.h
+++ b/mesalib/src/mesa/swrast/s_span.h
@@ -1,223 +1,225 @@
-/*
- * Mesa 3-D graphics library
- * Version:  7.5
- *
- * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
- * Copyright (C) 2009  VMware, Inc.  All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-
-#ifndef S_SPAN_H
-#define S_SPAN_H
-
-
-#include "main/config.h"
-#include "main/glheader.h"
-#include "main/mtypes.h"
-
-struct gl_context;
-struct gl_renderbuffer;
-
-
-/**
- * \defgroup SpanFlags
- * Special bitflags to describe span data.
- *
- * In general, the point/line/triangle functions interpolate/emit the
- * attributes specified by swrast->_ActiveAttribs (i.e. FRAT_BIT_* values).
- * Some things don't fit into that, though, so we have these flags.
- */
-/*@{*/
-#define SPAN_RGBA       0x01  /**< interpMask and arrayMask */
-#define SPAN_Z          0x02  /**< interpMask and arrayMask */
-#define SPAN_FLAT       0x04  /**< interpMask: flat shading? */
-#define SPAN_XY         0x08  /**< array.x[], y[] valid? */
-#define SPAN_MASK       0x10  /**< was array.mask[] filled in by caller? */
-#define SPAN_LAMBDA     0x20  /**< array.lambda[] valid? */
-#define SPAN_COVERAGE   0x40  /**< array.coverage[] valid? */
-/*@}*/
-
-
-/**
- * \sw_span_arrays 
- * \brief Arrays of fragment values.
- *
- * These will either be computed from the span x/xStep values or
- * filled in by glDraw/CopyPixels, etc.
- * These arrays are separated out of sw_span to conserve memory.
- */
-typedef struct sw_span_arrays
-{
-   /** Per-fragment attributes (indexed by FRAG_ATTRIB_* tokens) */
-   /* XXX someday look at transposing first two indexes for better memory
-    * access pattern.
-    */
-   GLfloat attribs[FRAG_ATTRIB_MAX][MAX_WIDTH][4];
-
-   /** This mask indicates which fragments are alive or culled */
-   GLubyte mask[MAX_WIDTH];
-
-   GLenum ChanType; /**< Color channel type, GL_UNSIGNED_BYTE, GL_FLOAT */
-
-   /** Attribute arrays that don't fit into attribs[] array above */
-   /*@{*/
-   GLubyte rgba8[MAX_WIDTH][4];
-   GLushort rgba16[MAX_WIDTH][4];
-   GLchan (*rgba)[4];  /** either == rgba8 or rgba16 */
-   GLint   x[MAX_WIDTH];  /**< fragment X coords */
-   GLint   y[MAX_WIDTH];  /**< fragment Y coords */
-   GLuint  z[MAX_WIDTH];  /**< fragment Z coords */
-   GLuint  index[MAX_WIDTH];  /**< Color indexes */
-   GLfloat lambda[MAX_TEXTURE_COORD_UNITS][MAX_WIDTH]; /**< Texture LOD */
-   GLfloat coverage[MAX_WIDTH];  /**< Fragment coverage for AA/smoothing */
-   /*@}*/
-} SWspanarrays;
-
-
-/**
- * The SWspan structure describes the colors, Z, fogcoord, texcoords,
- * etc for either a horizontal run or an array of independent pixels.
- * We can either specify a base/step to indicate interpolated values, or
- * fill in explicit arrays of values.  The interpMask and arrayMask bitfields
- * indicate which attributes are active interpolants or arrays, respectively.
- *
- * It would be interesting to experiment with multiprocessor rasterization
- * with this structure.  The triangle rasterizer could simply emit a
- * stream of these structures which would be consumed by one or more
- * span-processing threads which could run in parallel.
- */
-typedef struct sw_span
-{
-   /** Coord of first fragment in horizontal span/run */
-   GLint x, y;
-
-   /** Number of fragments in the span */
-   GLuint end;
-
-   /** for clipping left edge of spans */
-   GLuint leftClip;
-
-   /** This flag indicates that mask[] array is effectively filled with ones */
-   GLboolean writeAll;
-
-   /** either GL_POLYGON, GL_LINE, GL_POLYGON, GL_BITMAP */
-   GLenum primitive;
-
-   /** 0 = front-facing span, 1 = back-facing span (for two-sided stencil) */
-   GLuint facing;
-
-   /**
-    * This bitmask (of  \link SpanFlags SPAN_* flags\endlink) indicates
-    * which of the attrStart/StepX/StepY variables are relevant.
-    */
-   GLbitfield interpMask;
-
-   /** Fragment attribute interpolants */
-   GLfloat attrStart[FRAG_ATTRIB_MAX][4];   /**< initial value */
-   GLfloat attrStepX[FRAG_ATTRIB_MAX][4];   /**< dvalue/dx */
-   GLfloat attrStepY[FRAG_ATTRIB_MAX][4];   /**< dvalue/dy */
-
-   /* XXX the rest of these will go away eventually... */
-
-   /* For horizontal spans, step is the partial derivative wrt X.
-    * For lines, step is the delta from one fragment to the next.
-    */
-   GLfixed red, redStep;
-   GLfixed green, greenStep;
-   GLfixed blue, blueStep;
-   GLfixed alpha, alphaStep;
-   GLfixed index, indexStep;
-   GLfixed z, zStep;    /**< XXX z should probably be GLuint */
-   GLfixed intTex[2], intTexStep[2];  /**< (s,t) for unit[0] only */
-
-   /**
-    * This bitmask (of \link SpanFlags SPAN_* flags\endlink) indicates
-    * which of the fragment arrays in the span_arrays struct are relevant.
-    */
-   GLbitfield arrayMask;
-
-   GLbitfield arrayAttribs;
-
-   /**
-    * We store the arrays of fragment values in a separate struct so
-    * that we can allocate sw_span structs on the stack without using
-    * a lot of memory.  The span_arrays struct is about 1.4MB while the
-    * sw_span struct is only about 512 bytes.
-    */
-   SWspanarrays *array;
-} SWspan;
-
-
-
-#define INIT_SPAN(S, PRIMITIVE)			\
-do {						\
-   (S).primitive = (PRIMITIVE);			\
-   (S).interpMask = 0x0;			\
-   (S).arrayMask = 0x0;				\
-   (S).arrayAttribs = 0x0;			\
-   (S).end = 0;					\
-   (S).leftClip = 0;				\
-   (S).facing = 0;				\
-   (S).array = SWRAST_CONTEXT(ctx)->SpanArrays;	\
-} while (0)
-
-
-
-extern void
-_swrast_span_default_attribs(struct gl_context *ctx, SWspan *span);
-
-extern void
-_swrast_span_interpolate_z( const struct gl_context *ctx, SWspan *span );
-
-extern GLfloat
-_swrast_compute_lambda(GLfloat dsdx, GLfloat dsdy, GLfloat dtdx, GLfloat dtdy,
-                       GLfloat dqdx, GLfloat dqdy, GLfloat texW, GLfloat texH,
-                       GLfloat s, GLfloat t, GLfloat q, GLfloat invQ);
-
-
-extern void
-_swrast_write_rgba_span( struct gl_context *ctx, SWspan *span);
-
-
-extern void
-_swrast_read_rgba_span(struct gl_context *ctx, struct gl_renderbuffer *rb,
-                       GLuint n, GLint x, GLint y, GLenum type, GLvoid *rgba);
-
-extern void
-_swrast_get_values(struct gl_context *ctx, struct gl_renderbuffer *rb,
-                   GLuint count, const GLint x[], const GLint y[],
-                   void *values, GLuint valueSize);
-
-extern void
-_swrast_put_row(struct gl_context *ctx, struct gl_renderbuffer *rb,
-                GLuint count, GLint x, GLint y,
-                const GLvoid *values, GLuint valueSize);
-
-extern void
-_swrast_get_row(struct gl_context *ctx, struct gl_renderbuffer *rb,
-                GLuint count, GLint x, GLint y,
-                GLvoid *values, GLuint valueSize);
-
-
-extern void *
-_swrast_get_dest_rgba(struct gl_context *ctx, struct gl_renderbuffer *rb,
-                      SWspan *span);
-
-#endif
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.5
+ *
+ * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
+ * Copyright (C) 2009  VMware, Inc.  All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+#ifndef S_SPAN_H
+#define S_SPAN_H
+
+
+#include "main/config.h"
+#include "main/glheader.h"
+#include "main/mtypes.h"
+#include "swrast/s_chan.h"
+
+
+struct gl_context;
+struct gl_renderbuffer;
+
+
+/**
+ * \defgroup SpanFlags
+ * Special bitflags to describe span data.
+ *
+ * In general, the point/line/triangle functions interpolate/emit the
+ * attributes specified by swrast->_ActiveAttribs (i.e. FRAG_BIT_* values).
+ * Some things don't fit into that, though, so we have these flags.
+ */
+/*@{*/
+#define SPAN_RGBA       0x01  /**< interpMask and arrayMask */
+#define SPAN_Z          0x02  /**< interpMask and arrayMask */
+#define SPAN_FLAT       0x04  /**< interpMask: flat shading? */
+#define SPAN_XY         0x08  /**< array.x[], y[] valid? */
+#define SPAN_MASK       0x10  /**< was array.mask[] filled in by caller? */
+#define SPAN_LAMBDA     0x20  /**< array.lambda[] valid? */
+#define SPAN_COVERAGE   0x40  /**< array.coverage[] valid? */
+/*@}*/
+
+
+/**
+ * \struct sw_span_arrays
+ * \brief Arrays of fragment values.
+ *
+ * These will either be computed from the span x/xStep values or
+ * filled in by glDraw/CopyPixels, etc.
+ * These arrays are separated out of sw_span to conserve memory.
+ */
+typedef struct sw_span_arrays
+{
+   /** Per-fragment attributes (indexed by FRAG_ATTRIB_* tokens) */
+   /* XXX someday look at transposing first two indexes for better memory
+    * access pattern.
+    */
+   GLfloat attribs[FRAG_ATTRIB_MAX][MAX_WIDTH][4];
+
+   /** This mask indicates which fragments are alive or culled */
+   GLubyte mask[MAX_WIDTH];
+
+   GLenum ChanType; /**< Color channel type, GL_UNSIGNED_BYTE, GL_FLOAT */
+
+   /** Attribute arrays that don't fit into attribs[] array above */
+   /*@{*/
+   GLubyte rgba8[MAX_WIDTH][4];
+   GLushort rgba16[MAX_WIDTH][4];
+   GLchan (*rgba)[4];  /**< either == rgba8 or rgba16 */
+   GLint   x[MAX_WIDTH];  /**< fragment X coords */
+   GLint   y[MAX_WIDTH];  /**< fragment Y coords */
+   GLuint  z[MAX_WIDTH];  /**< fragment Z coords */
+   GLuint  index[MAX_WIDTH];  /**< Color indexes */
+   GLfloat lambda[MAX_TEXTURE_COORD_UNITS][MAX_WIDTH]; /**< Texture LOD */
+   GLfloat coverage[MAX_WIDTH];  /**< Fragment coverage for AA/smoothing */
+   /*@}*/
+} SWspanarrays;
+
+
+/**
+ * The SWspan structure describes the colors, Z, fogcoord, texcoords,
+ * etc for either a horizontal run or an array of independent pixels.
+ * We can either specify a base/step to indicate interpolated values, or
+ * fill in explicit arrays of values.  The interpMask and arrayMask bitfields
+ * indicate which attributes are active interpolants or arrays, respectively.
+ *
+ * It would be interesting to experiment with multiprocessor rasterization
+ * with this structure.  The triangle rasterizer could simply emit a
+ * stream of these structures which would be consumed by one or more
+ * span-processing threads which could run in parallel.
+ */
+typedef struct sw_span
+{
+   /** Coord of first fragment in horizontal span/run */
+   GLint x, y;
+
+   /** Number of fragments in the span */
+   GLuint end;
+
+   /** for clipping left edge of spans */
+   GLuint leftClip;
+
+   /** This flag indicates that mask[] array is effectively filled with ones */
+   GLboolean writeAll;
+
+   /** either GL_POINT, GL_LINE, GL_POLYGON, GL_BITMAP */
+   GLenum primitive;
+
+   /** 0 = front-facing span, 1 = back-facing span (for two-sided stencil) */
+   GLuint facing;
+
+   /**
+    * This bitmask (of  \link SpanFlags SPAN_* flags\endlink) indicates
+    * which of the attrStart/StepX/StepY variables are relevant.
+    */
+   GLbitfield interpMask;
+
+   /** Fragment attribute interpolants */
+   GLfloat attrStart[FRAG_ATTRIB_MAX][4];   /**< initial value */
+   GLfloat attrStepX[FRAG_ATTRIB_MAX][4];   /**< dvalue/dx */
+   GLfloat attrStepY[FRAG_ATTRIB_MAX][4];   /**< dvalue/dy */
+
+   /* XXX the rest of these will go away eventually... */
+
+   /* For horizontal spans, step is the partial derivative wrt X.
+    * For lines, step is the delta from one fragment to the next.
+    */
+   GLfixed red, redStep;
+   GLfixed green, greenStep;
+   GLfixed blue, blueStep;
+   GLfixed alpha, alphaStep;
+   GLfixed index, indexStep;
+   GLfixed z, zStep;    /**< XXX z should probably be GLuint */
+   GLfixed intTex[2], intTexStep[2];  /**< (s,t) for unit[0] only */
+
+   /**
+    * This bitmask (of \link SpanFlags SPAN_* flags\endlink) indicates
+    * which of the fragment arrays in the span_arrays struct are relevant.
+    */
+   GLbitfield arrayMask;
+
+   GLbitfield arrayAttribs;
+
+   /**
+    * We store the arrays of fragment values in a separate struct so
+    * that we can allocate sw_span structs on the stack without using
+    * a lot of memory.  The span_arrays struct is about 1.4MB while the
+    * sw_span struct is only about 512 bytes.
+    */
+   SWspanarrays *array;
+} SWspan;
+
+
+
+#define INIT_SPAN(S, PRIMITIVE)			\
+do {						\
+   (S).primitive = (PRIMITIVE);			\
+   (S).interpMask = 0x0;			\
+   (S).arrayMask = 0x0;				\
+   (S).arrayAttribs = 0x0;			\
+   (S).end = 0;					\
+   (S).leftClip = 0;				\
+   (S).facing = 0;				\
+   (S).array = SWRAST_CONTEXT(ctx)->SpanArrays;	\
+} while (0)
+
+
+
+extern void
+_swrast_span_default_attribs(struct gl_context *ctx, SWspan *span);
+
+extern void
+_swrast_span_interpolate_z( const struct gl_context *ctx, SWspan *span );
+
+extern GLfloat
+_swrast_compute_lambda(GLfloat dsdx, GLfloat dsdy, GLfloat dtdx, GLfloat dtdy,
+                       GLfloat dqdx, GLfloat dqdy, GLfloat texW, GLfloat texH,
+                       GLfloat s, GLfloat t, GLfloat q, GLfloat invQ);
+
+
+extern void
+_swrast_write_rgba_span( struct gl_context *ctx, SWspan *span);
+
+
+extern void
+_swrast_read_rgba_span(struct gl_context *ctx, struct gl_renderbuffer *rb,
+                       GLuint n, GLint x, GLint y, GLenum type, GLvoid *rgba);
+
+extern void
+_swrast_get_values(struct gl_context *ctx, struct gl_renderbuffer *rb,
+                   GLuint count, const GLint x[], const GLint y[],
+                   void *values, GLuint valueSize);
+
+extern void
+_swrast_put_row(struct gl_context *ctx, struct gl_renderbuffer *rb,
+                GLuint count, GLint x, GLint y,
+                const GLvoid *values, GLuint valueSize);
+
+extern void
+_swrast_get_row(struct gl_context *ctx, struct gl_renderbuffer *rb,
+                GLuint count, GLint x, GLint y,
+                GLvoid *values, GLuint valueSize);
+
+
+extern void *
+_swrast_get_dest_rgba(struct gl_context *ctx, struct gl_renderbuffer *rb,
+                      SWspan *span);
+
+#endif
diff --git a/mesalib/src/mesa/swrast/s_texfetch_tmp.h b/mesalib/src/mesa/swrast/s_texfetch_tmp.h
index 3eebd13d7..c63b2043c 100644
--- a/mesalib/src/mesa/swrast/s_texfetch_tmp.h
+++ b/mesalib/src/mesa/swrast/s_texfetch_tmp.h
@@ -976,7 +976,7 @@ static void FETCH(f_rg88)( const struct swrast_texture_image *texImage,
 static void store_texel_rg88(struct swrast_texture_image *texImage,
                              GLint i, GLint j, GLint k, const void *texel)
 {
-   const GLchan *rgba = (const GLubyte *) texel;
+   const GLchan *rgba = (const GLchan *) texel;
    GLushort *dst = TEXEL_ADDR(GLushort, texImage, i, j, k, 1);
    GLubyte r = CHAN_TO_UBYTE(rgba[RCOMP]);
    GLubyte g = CHAN_TO_UBYTE(rgba[GCOMP]);
diff --git a/mesalib/src/mesa/swrast/swrast.h b/mesalib/src/mesa/swrast/swrast.h
index 390b42264..06cc65158 100644
--- a/mesalib/src/mesa/swrast/swrast.h
+++ b/mesalib/src/mesa/swrast/swrast.h
@@ -33,6 +33,7 @@
 #define SWRAST_H
 
 #include "main/mtypes.h"
+#include "swrast/s_chan.h"
 
 /**
  * \struct SWvertex
diff --git a/mesalib/src/mesa/tnl/t_vertex.c b/mesalib/src/mesa/tnl/t_vertex.c
index 0ca324402..6582949a0 100644
--- a/mesalib/src/mesa/tnl/t_vertex.c
+++ b/mesalib/src/mesa/tnl/t_vertex.c
@@ -1,564 +1,564 @@
-/*
- * Copyright 2003 Tungsten Graphics, inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
- * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Keith Whitwell <keithw@tungstengraphics.com>
- */
-
-#include "main/glheader.h"
-#include "main/context.h"
-#include "main/colormac.h"
-
-#include "t_context.h"
-#include "t_vertex.h"
-
-#define DBG 0
-
-/* Build and manage clipspace/ndc/window vertices.
- */
-
-static GLboolean match_fastpath( struct tnl_clipspace *vtx,
-				 const struct tnl_clipspace_fastpath *fp)
-{
-   GLuint j;
-
-   if (vtx->attr_count != fp->attr_count) 
-      return GL_FALSE;
-
-   for (j = 0; j < vtx->attr_count; j++) 
-      if (vtx->attr[j].format != fp->attr[j].format ||
-	  vtx->attr[j].inputsize != fp->attr[j].size ||
-	  vtx->attr[j].vertoffset != fp->attr[j].offset) 
-	 return GL_FALSE;
-      
-   if (fp->match_strides) {
-      if (vtx->vertex_size != fp->vertex_size)
-	 return GL_FALSE;
-
-      for (j = 0; j < vtx->attr_count; j++) 
-	 if (vtx->attr[j].inputstride != fp->attr[j].stride) 
-	    return GL_FALSE;
-   }
-   
-   return GL_TRUE;
-}
-
-static GLboolean search_fastpath_emit( struct tnl_clipspace *vtx )
-{
-   struct tnl_clipspace_fastpath *fp = vtx->fastpath;
-
-   for ( ; fp ; fp = fp->next) {
-      if (match_fastpath(vtx, fp)) {
-         vtx->emit = fp->func;
-	 return GL_TRUE;
-      }
-   }
-
-   return GL_FALSE;
-}
-
-void _tnl_register_fastpath( struct tnl_clipspace *vtx,
-			     GLboolean match_strides )
-{
-   struct tnl_clipspace_fastpath *fastpath = CALLOC_STRUCT(tnl_clipspace_fastpath);
-   GLuint i;
-
-   fastpath->vertex_size = vtx->vertex_size;
-   fastpath->attr_count = vtx->attr_count;
-   fastpath->match_strides = match_strides;
-   fastpath->func = vtx->emit;
-   fastpath->attr = (struct tnl_attr_type *)
-      malloc(vtx->attr_count * sizeof(fastpath->attr[0]));
-
-   for (i = 0; i < vtx->attr_count; i++) {
-      fastpath->attr[i].format = vtx->attr[i].format;
-      fastpath->attr[i].stride = vtx->attr[i].inputstride;
-      fastpath->attr[i].size = vtx->attr[i].inputsize;
-      fastpath->attr[i].offset = vtx->attr[i].vertoffset;
-   }
-
-   fastpath->next = vtx->fastpath;
-   vtx->fastpath = fastpath;
-}
-
-
-
-/***********************************************************************
- * Build codegen functions or return generic ones:
- */
-static void choose_emit_func( struct gl_context *ctx, GLuint count, GLubyte *dest)
-{
-   struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
-   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
-   struct tnl_clipspace_attr *a = vtx->attr;
-   const GLuint attr_count = vtx->attr_count;
-   GLuint j;
-
-   for (j = 0; j < attr_count; j++) {
-      GLvector4f *vptr = VB->AttribPtr[a[j].attrib];
-      a[j].inputstride = vptr->stride;
-      a[j].inputsize = vptr->size;
-      a[j].emit = a[j].insert[vptr->size - 1]; /* not always used */
-   }
-
-   vtx->emit = NULL;
-   
-   /* Does this match an existing (hardwired, codegen or known-bad)
-    * fastpath?
-    */
-   if (search_fastpath_emit(vtx)) {
-      /* Use this result.  If it is null, then it is already known
-       * that the current state will fail for codegen and there is no
-       * point trying again.
-       */
-   }
-   else if (vtx->codegen_emit) {
-      vtx->codegen_emit(ctx);
-   }
-
-   if (!vtx->emit) {
-      _tnl_generate_hardwired_emit(ctx);
-   }
-
-   /* Otherwise use the generic version:
-    */
-   if (!vtx->emit)
-      vtx->emit = _tnl_generic_emit;
-
-   vtx->emit( ctx, count, dest );
-}
-
-
-
-static void choose_interp_func( struct gl_context *ctx,
-				GLfloat t,
-				GLuint edst, GLuint eout, GLuint ein,
-				GLboolean force_boundary )
-{
-   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
-
-   if (vtx->need_extras && 
-       (ctx->_TriangleCaps & (DD_TRI_LIGHT_TWOSIDE|DD_TRI_UNFILLED))) {
-      vtx->interp = _tnl_generic_interp_extras;
-   } else {
-      vtx->interp = _tnl_generic_interp;
-   }
-
-   vtx->interp( ctx, t, edst, eout, ein, force_boundary );
-}
-
-
-static void choose_copy_pv_func(  struct gl_context *ctx, GLuint edst, GLuint esrc )
-{
-   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
-
-   if (vtx->need_extras && 
-       (ctx->_TriangleCaps & (DD_TRI_LIGHT_TWOSIDE|DD_TRI_UNFILLED))) {
-      vtx->copy_pv = _tnl_generic_copy_pv_extras;
-   } else {
-      vtx->copy_pv = _tnl_generic_copy_pv;
-   }
-
-   vtx->copy_pv( ctx, edst, esrc );
-}
-
-
-/***********************************************************************
- * Public entrypoints, mostly dispatch to the above:
- */
-
-
-/* Interpolate between two vertices to produce a third:
- */
-void _tnl_interp( struct gl_context *ctx,
-		  GLfloat t,
-		  GLuint edst, GLuint eout, GLuint ein,
-		  GLboolean force_boundary )
-{
-   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
-   vtx->interp( ctx, t, edst, eout, ein, force_boundary );
-}
-
-/* Copy colors from one vertex to another:
- */
-void _tnl_copy_pv(  struct gl_context *ctx, GLuint edst, GLuint esrc )
-{
-   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
-   vtx->copy_pv( ctx, edst, esrc );
-}
-
-
-/* Extract a named attribute from a hardware vertex.  Will have to
- * reverse any viewport transformation, swizzling or other conversions
- * which may have been applied:
- */
-void _tnl_get_attr( struct gl_context *ctx, const void *vin,
-			      GLenum attr, GLfloat *dest )
-{
-   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
-   const struct tnl_clipspace_attr *a = vtx->attr;
-   const GLuint attr_count = vtx->attr_count;
-   GLuint j;
-
-   for (j = 0; j < attr_count; j++) {
-      if (a[j].attrib == attr) {
-	 a[j].extract( &a[j], dest, (GLubyte *)vin + a[j].vertoffset );
-	 return;
-      }
-   }
-
-   /* Else return the value from ctx->Current.
-    */
-   if (attr == _TNL_ATTRIB_POINTSIZE) {
-      /* If the hardware vertex doesn't have point size then use size from
-       * struct gl_context.  XXX this will be wrong if drawing attenuated points!
-       */
-      dest[0] = ctx->Point.Size;
-   }
-   else {
-      memcpy( dest, ctx->Current.Attrib[attr], 4*sizeof(GLfloat));
-   }
-}
-
-
-/* Complementary operation to the above.
- */
-void _tnl_set_attr( struct gl_context *ctx, void *vout,
-		    GLenum attr, const GLfloat *src )
-{
-   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
-   const struct tnl_clipspace_attr *a = vtx->attr;
-   const GLuint attr_count = vtx->attr_count;
-   GLuint j;
-
-   for (j = 0; j < attr_count; j++) {
-      if (a[j].attrib == attr) {
-	 a[j].insert[4-1]( &a[j], (GLubyte *)vout + a[j].vertoffset, src );
-	 return;
-      }
-   }
-}
-
-
-void *_tnl_get_vertex( struct gl_context *ctx, GLuint nr )
-{
-   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
-
-   return vtx->vertex_buf + nr * vtx->vertex_size;
-}
-
-void _tnl_invalidate_vertex_state( struct gl_context *ctx, GLuint new_state )
-{
-   if (new_state & (_DD_NEW_TRI_LIGHT_TWOSIDE|_DD_NEW_TRI_UNFILLED) ) {
-      struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
-      vtx->new_inputs = ~0;
-      vtx->interp = choose_interp_func;
-      vtx->copy_pv = choose_copy_pv_func;
-   }
-}
-
-static void invalidate_funcs( struct tnl_clipspace *vtx )
-{
-   vtx->emit = choose_emit_func;
-   vtx->interp = choose_interp_func;
-   vtx->copy_pv = choose_copy_pv_func;
-   vtx->new_inputs = ~0;
-}
-
-GLuint _tnl_install_attrs( struct gl_context *ctx, const struct tnl_attr_map *map,
-			   GLuint nr, const GLfloat *vp, 
-			   GLuint unpacked_size )
-{
-   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
-   GLuint offset = 0;
-   GLuint i, j;
-
-   assert(nr < _TNL_ATTRIB_MAX);
-   assert(nr == 0 || map[0].attrib == VERT_ATTRIB_POS);
-
-   vtx->new_inputs = ~0;
-   vtx->need_viewport = GL_FALSE;
-
-   if (vp) {
-      vtx->need_viewport = GL_TRUE;
-   }
-
-   for (j = 0, i = 0; i < nr; i++) {
-      const GLuint format = map[i].format;
-      if (format == EMIT_PAD) {
-	 if (DBG)
-	    printf("%d: pad %d, offset %d\n", i,  
-		   map[i].offset, offset);  
-
-	 offset += map[i].offset;
-
-      }
-      else {
-	 GLuint tmpoffset;
-
-	 if (unpacked_size) 
-	    tmpoffset = map[i].offset;
-	 else
-	    tmpoffset = offset;
-
-	 if (vtx->attr_count != j ||
-	     vtx->attr[j].attrib != map[i].attrib ||
-	     vtx->attr[j].format != format ||
-	     vtx->attr[j].vertoffset != tmpoffset) {
-	    invalidate_funcs(vtx);
-
-	    vtx->attr[j].attrib = map[i].attrib;
-	    vtx->attr[j].format = format;
-	    vtx->attr[j].vp = vp;
-	    vtx->attr[j].insert = _tnl_format_info[format].insert;
-	    vtx->attr[j].extract = _tnl_format_info[format].extract;
-	    vtx->attr[j].vertattrsize = _tnl_format_info[format].attrsize;
-	    vtx->attr[j].vertoffset = tmpoffset;
-	 }
-
-	 
-	 if (DBG)
-	    printf("%d: %s, vp %p, offset %d\n", i,  
-		   _tnl_format_info[format].name, (void *)vp,
-		   vtx->attr[j].vertoffset);   
-
-	 offset += _tnl_format_info[format].attrsize;
-	 j++;
-      }
-   }
-
-   vtx->attr_count = j;
-
-   if (unpacked_size)
-      vtx->vertex_size = unpacked_size;
-   else
-      vtx->vertex_size = offset;
-
-   assert(vtx->vertex_size <= vtx->max_vertex_size);
-   return vtx->vertex_size;
-}
-
-
-
-void _tnl_invalidate_vertices( struct gl_context *ctx, GLuint newinputs )
-{
-   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
-   vtx->new_inputs |= newinputs;
-}
-
-
-/* This event has broader use beyond this file - will move elsewhere
- * and probably invoke a driver callback.
- */
-void _tnl_notify_pipeline_output_change( struct gl_context *ctx )
-{
-   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
-   invalidate_funcs(vtx);
-}
-
-
-static void adjust_input_ptrs( struct gl_context *ctx, GLint diff)
-{
-   struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
-   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
-   struct tnl_clipspace_attr *a = vtx->attr;
-   const GLuint count = vtx->attr_count;
-   GLuint j;
-
-   diff -= 1;
-   for (j=0; j<count; ++j) {
-           register GLvector4f *vptr = VB->AttribPtr[a->attrib];
-	   (a++)->inputptr += diff*vptr->stride;
-   }
-}
-
-static void update_input_ptrs( struct gl_context *ctx, GLuint start )
-{
-   struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
-   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
-   struct tnl_clipspace_attr *a = vtx->attr;
-   const GLuint count = vtx->attr_count;
-   GLuint j;
-   
-   for (j = 0; j < count; j++) {
-      GLvector4f *vptr = VB->AttribPtr[a[j].attrib];
-
-      if (vtx->emit != choose_emit_func) {
-	 assert(a[j].inputstride == vptr->stride);
-	 assert(a[j].inputsize == vptr->size);
-      }
-
-      a[j].inputptr = ((GLubyte *)vptr->data) + start * vptr->stride;
-   }
-   
-   if (a->vp) {
-      vtx->vp_scale[0] = a->vp[MAT_SX];
-      vtx->vp_scale[1] = a->vp[MAT_SY];
-      vtx->vp_scale[2] = a->vp[MAT_SZ];
-      vtx->vp_scale[3] = 1.0;
-      vtx->vp_xlate[0] = a->vp[MAT_TX];
-      vtx->vp_xlate[1] = a->vp[MAT_TY];
-      vtx->vp_xlate[2] = a->vp[MAT_TZ];
-      vtx->vp_xlate[3] = 0.0;
-   }
-}
-
-
-void _tnl_build_vertices( struct gl_context *ctx,
-			  GLuint start,
-			  GLuint end,
-			  GLuint newinputs )
-{
-   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);  
-   update_input_ptrs( ctx, start );      
-   vtx->emit( ctx, end - start, 
-	      (GLubyte *)(vtx->vertex_buf + 
-			  start * vtx->vertex_size));
-}
-
-/* Emit VB vertices start..end to dest.  Note that VB vertex at
- * postion start will be emitted to dest at position zero.
- */
-void *_tnl_emit_vertices_to_buffer( struct gl_context *ctx,
-				    GLuint start,
-				    GLuint end,
-				    void *dest )
-{
-   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
-
-   update_input_ptrs(ctx, start);
-   /* Note: dest should not be adjusted for non-zero 'start' values:
-    */
-   vtx->emit( ctx, end - start, (GLubyte*) dest );	
-   return (void *)((GLubyte *)dest + vtx->vertex_size * (end - start));
-}
-
-/* Emit indexed VB vertices start..end to dest.  Note that VB vertex at
- * postion start will be emitted to dest at position zero.
- */
-
-void *_tnl_emit_indexed_vertices_to_buffer( struct gl_context *ctx,
-					    const GLuint *elts,
-					    GLuint start,
-					    GLuint end,
-					    void *dest )
-{
-   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
-   GLuint oldIndex;
-   GLubyte *cdest = dest;
-
-   update_input_ptrs(ctx, oldIndex = elts[start++]);
-   vtx->emit( ctx, 1, cdest );
-   cdest += vtx->vertex_size;
-
-   for (; start < end; ++start) {
-      adjust_input_ptrs(ctx, elts[start] - oldIndex);
-      oldIndex = elts[start];
-      vtx->emit( ctx, 1, cdest);
-      cdest += vtx->vertex_size;
-   }
-
-   return (void *) cdest;
-}
-
-
-void _tnl_init_vertices( struct gl_context *ctx, 
-			GLuint vb_size,
-			GLuint max_vertex_size )
-{
-   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);  
-
-   _tnl_install_attrs( ctx, NULL, 0, NULL, 0 );
-
-   vtx->need_extras = GL_TRUE;
-   if (max_vertex_size > vtx->max_vertex_size) {
-      _tnl_free_vertices( ctx );
-      vtx->max_vertex_size = max_vertex_size;
-      vtx->vertex_buf = (GLubyte *)_mesa_align_calloc(vb_size * max_vertex_size, 32 );
-      invalidate_funcs(vtx);
-   }
-
-   switch(CHAN_TYPE) {
-   case GL_UNSIGNED_BYTE:
-      vtx->chan_scale[0] = 255.0;
-      vtx->chan_scale[1] = 255.0;
-      vtx->chan_scale[2] = 255.0;
-      vtx->chan_scale[3] = 255.0;
-      break;
-   case GL_UNSIGNED_SHORT:
-      vtx->chan_scale[0] = 65535.0;
-      vtx->chan_scale[1] = 65535.0;
-      vtx->chan_scale[2] = 65535.0;
-      vtx->chan_scale[3] = 65535.0;
-      break;
-   default:
-      vtx->chan_scale[0] = 1.0;
-      vtx->chan_scale[1] = 1.0;
-      vtx->chan_scale[2] = 1.0;
-      vtx->chan_scale[3] = 1.0;
-      break;
-   }
-
-   vtx->identity[0] = 0.0;
-   vtx->identity[1] = 0.0;
-   vtx->identity[2] = 0.0;
-   vtx->identity[3] = 1.0;
-
-   vtx->codegen_emit = NULL;
-
-#ifdef USE_SSE_ASM
-   if (!_mesa_getenv("MESA_NO_CODEGEN"))
-      vtx->codegen_emit = _tnl_generate_sse_emit;
-#endif
-}
-
-
-void _tnl_free_vertices( struct gl_context *ctx )
-{
-   TNLcontext *tnl = TNL_CONTEXT(ctx);
-   if (tnl) {
-      struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
-      struct tnl_clipspace_fastpath *fp, *tmp;
-
-      if (vtx->vertex_buf) {
-         _mesa_align_free(vtx->vertex_buf);
-         vtx->vertex_buf = NULL;
-      }
-
-      for (fp = vtx->fastpath ; fp ; fp = tmp) {
-         tmp = fp->next;
-         FREE(fp->attr);
-
-         /* KW: At the moment, fp->func is constrained to be allocated by
-          * _mesa_exec_alloc(), as the hardwired fastpaths in
-          * t_vertex_generic.c are handled specially.  It would be nice
-          * to unify them, but this probably won't change until this
-          * module gets another overhaul.
-          */
-         _mesa_exec_free((void *) fp->func);
-         FREE(fp);
-      }
-
-      vtx->fastpath = NULL;
-   }
-}
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+#include "main/glheader.h"
+#include "main/context.h"
+#include "main/colormac.h"
+#include "swrast/s_chan.h"
+#include "t_context.h"
+#include "t_vertex.h"
+
+#define DBG 0
+
+/* Build and manage clipspace/ndc/window vertices.
+ */
+
+static GLboolean match_fastpath( struct tnl_clipspace *vtx,
+				 const struct tnl_clipspace_fastpath *fp)
+{
+   GLuint j;
+
+   if (vtx->attr_count != fp->attr_count) 
+      return GL_FALSE;
+
+   for (j = 0; j < vtx->attr_count; j++) 
+      if (vtx->attr[j].format != fp->attr[j].format ||
+	  vtx->attr[j].inputsize != fp->attr[j].size ||
+	  vtx->attr[j].vertoffset != fp->attr[j].offset) 
+	 return GL_FALSE;
+      
+   if (fp->match_strides) {
+      if (vtx->vertex_size != fp->vertex_size)
+	 return GL_FALSE;
+
+      for (j = 0; j < vtx->attr_count; j++) 
+	 if (vtx->attr[j].inputstride != fp->attr[j].stride) 
+	    return GL_FALSE;
+   }
+   
+   return GL_TRUE;
+}
+
+static GLboolean search_fastpath_emit( struct tnl_clipspace *vtx )
+{
+   struct tnl_clipspace_fastpath *fp = vtx->fastpath;
+
+   for ( ; fp ; fp = fp->next) {
+      if (match_fastpath(vtx, fp)) {
+         vtx->emit = fp->func;
+	 return GL_TRUE;
+      }
+   }
+
+   return GL_FALSE;
+}
+
+void _tnl_register_fastpath( struct tnl_clipspace *vtx,
+			     GLboolean match_strides )
+{
+   struct tnl_clipspace_fastpath *fastpath = CALLOC_STRUCT(tnl_clipspace_fastpath);
+   GLuint i;
+
+   fastpath->vertex_size = vtx->vertex_size;
+   fastpath->attr_count = vtx->attr_count;
+   fastpath->match_strides = match_strides;
+   fastpath->func = vtx->emit;
+   fastpath->attr = (struct tnl_attr_type *)
+      malloc(vtx->attr_count * sizeof(fastpath->attr[0]));
+
+   for (i = 0; i < vtx->attr_count; i++) {
+      fastpath->attr[i].format = vtx->attr[i].format;
+      fastpath->attr[i].stride = vtx->attr[i].inputstride;
+      fastpath->attr[i].size = vtx->attr[i].inputsize;
+      fastpath->attr[i].offset = vtx->attr[i].vertoffset;
+   }
+
+   fastpath->next = vtx->fastpath;
+   vtx->fastpath = fastpath;
+}
+
+
+
+/***********************************************************************
+ * Build codegen functions or return generic ones:
+ */
+static void choose_emit_func( struct gl_context *ctx, GLuint count, GLubyte *dest)
+{
+   struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
+   struct tnl_clipspace_attr *a = vtx->attr;
+   const GLuint attr_count = vtx->attr_count;
+   GLuint j;
+
+   for (j = 0; j < attr_count; j++) {
+      GLvector4f *vptr = VB->AttribPtr[a[j].attrib];
+      a[j].inputstride = vptr->stride;
+      a[j].inputsize = vptr->size;
+      a[j].emit = a[j].insert[vptr->size - 1]; /* not always used */
+   }
+
+   vtx->emit = NULL;
+   
+   /* Does this match an existing (hardwired, codegen or known-bad)
+    * fastpath?
+    */
+   if (search_fastpath_emit(vtx)) {
+      /* Use this result.  If it is null, then it is already known
+       * that the current state will fail for codegen and there is no
+       * point trying again.
+       */
+   }
+   else if (vtx->codegen_emit) {
+      vtx->codegen_emit(ctx);
+   }
+
+   if (!vtx->emit) {
+      _tnl_generate_hardwired_emit(ctx);
+   }
+
+   /* Otherwise use the generic version:
+    */
+   if (!vtx->emit)
+      vtx->emit = _tnl_generic_emit;
+
+   vtx->emit( ctx, count, dest );
+}
+
+
+
+static void choose_interp_func( struct gl_context *ctx,
+				GLfloat t,
+				GLuint edst, GLuint eout, GLuint ein,
+				GLboolean force_boundary )
+{
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
+
+   if (vtx->need_extras && 
+       (ctx->_TriangleCaps & (DD_TRI_LIGHT_TWOSIDE|DD_TRI_UNFILLED))) {
+      vtx->interp = _tnl_generic_interp_extras;
+   } else {
+      vtx->interp = _tnl_generic_interp;
+   }
+
+   vtx->interp( ctx, t, edst, eout, ein, force_boundary );
+}
+
+
+static void choose_copy_pv_func(  struct gl_context *ctx, GLuint edst, GLuint esrc )
+{
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
+
+   if (vtx->need_extras && 
+       (ctx->_TriangleCaps & (DD_TRI_LIGHT_TWOSIDE|DD_TRI_UNFILLED))) {
+      vtx->copy_pv = _tnl_generic_copy_pv_extras;
+   } else {
+      vtx->copy_pv = _tnl_generic_copy_pv;
+   }
+
+   vtx->copy_pv( ctx, edst, esrc );
+}
+
+
+/***********************************************************************
+ * Public entrypoints, mostly dispatch to the above:
+ */
+
+
+/* Interpolate between two vertices to produce a third:
+ */
+void _tnl_interp( struct gl_context *ctx,
+		  GLfloat t,
+		  GLuint edst, GLuint eout, GLuint ein,
+		  GLboolean force_boundary )
+{
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
+   vtx->interp( ctx, t, edst, eout, ein, force_boundary );
+}
+
+/* Copy colors from one vertex to another:
+ */
+void _tnl_copy_pv(  struct gl_context *ctx, GLuint edst, GLuint esrc )
+{
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
+   vtx->copy_pv( ctx, edst, esrc );
+}
+
+
+/* Extract a named attribute from a hardware vertex.  Will have to
+ * reverse any viewport transformation, swizzling or other conversions
+ * which may have been applied:
+ */
+void _tnl_get_attr( struct gl_context *ctx, const void *vin,
+			      GLenum attr, GLfloat *dest )
+{
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
+   const struct tnl_clipspace_attr *a = vtx->attr;
+   const GLuint attr_count = vtx->attr_count;
+   GLuint j;
+
+   for (j = 0; j < attr_count; j++) {
+      if (a[j].attrib == attr) {
+	 a[j].extract( &a[j], dest, (GLubyte *)vin + a[j].vertoffset );
+	 return;
+      }
+   }
+
+   /* Else return the value from ctx->Current.
+    */
+   if (attr == _TNL_ATTRIB_POINTSIZE) {
+      /* If the hardware vertex doesn't have point size then use size from
+       * struct gl_context.  XXX this will be wrong if drawing attenuated points!
+       */
+      dest[0] = ctx->Point.Size;
+   }
+   else {
+      memcpy( dest, ctx->Current.Attrib[attr], 4*sizeof(GLfloat));
+   }
+}
+
+
+/* Complementary operation to the above.
+ */
+void _tnl_set_attr( struct gl_context *ctx, void *vout,
+		    GLenum attr, const GLfloat *src )
+{
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
+   const struct tnl_clipspace_attr *a = vtx->attr;
+   const GLuint attr_count = vtx->attr_count;
+   GLuint j;
+
+   for (j = 0; j < attr_count; j++) {
+      if (a[j].attrib == attr) {
+	 a[j].insert[4-1]( &a[j], (GLubyte *)vout + a[j].vertoffset, src );
+	 return;
+      }
+   }
+}
+
+
+void *_tnl_get_vertex( struct gl_context *ctx, GLuint nr )
+{
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
+
+   return vtx->vertex_buf + nr * vtx->vertex_size;
+}
+
+void _tnl_invalidate_vertex_state( struct gl_context *ctx, GLuint new_state )
+{
+   if (new_state & (_DD_NEW_TRI_LIGHT_TWOSIDE|_DD_NEW_TRI_UNFILLED) ) {
+      struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
+      vtx->new_inputs = ~0;
+      vtx->interp = choose_interp_func;
+      vtx->copy_pv = choose_copy_pv_func;
+   }
+}
+
+static void invalidate_funcs( struct tnl_clipspace *vtx )
+{
+   vtx->emit = choose_emit_func;
+   vtx->interp = choose_interp_func;
+   vtx->copy_pv = choose_copy_pv_func;
+   vtx->new_inputs = ~0;
+}
+
+GLuint _tnl_install_attrs( struct gl_context *ctx, const struct tnl_attr_map *map,
+			   GLuint nr, const GLfloat *vp, 
+			   GLuint unpacked_size )
+{
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
+   GLuint offset = 0;
+   GLuint i, j;
+
+   assert(nr < _TNL_ATTRIB_MAX);
+   assert(nr == 0 || map[0].attrib == VERT_ATTRIB_POS);
+
+   vtx->new_inputs = ~0;
+   vtx->need_viewport = GL_FALSE;
+
+   if (vp) {
+      vtx->need_viewport = GL_TRUE;
+   }
+
+   for (j = 0, i = 0; i < nr; i++) {
+      const GLuint format = map[i].format;
+      if (format == EMIT_PAD) {
+	 if (DBG)
+	    printf("%d: pad %d, offset %d\n", i,  
+		   map[i].offset, offset);  
+
+	 offset += map[i].offset;
+
+      }
+      else {
+	 GLuint tmpoffset;
+
+	 if (unpacked_size) 
+	    tmpoffset = map[i].offset;
+	 else
+	    tmpoffset = offset;
+
+	 if (vtx->attr_count != j ||
+	     vtx->attr[j].attrib != map[i].attrib ||
+	     vtx->attr[j].format != format ||
+	     vtx->attr[j].vertoffset != tmpoffset) {
+	    invalidate_funcs(vtx);
+
+	    vtx->attr[j].attrib = map[i].attrib;
+	    vtx->attr[j].format = format;
+	    vtx->attr[j].vp = vp;
+	    vtx->attr[j].insert = _tnl_format_info[format].insert;
+	    vtx->attr[j].extract = _tnl_format_info[format].extract;
+	    vtx->attr[j].vertattrsize = _tnl_format_info[format].attrsize;
+	    vtx->attr[j].vertoffset = tmpoffset;
+	 }
+
+	 
+	 if (DBG)
+	    printf("%d: %s, vp %p, offset %d\n", i,  
+		   _tnl_format_info[format].name, (void *)vp,
+		   vtx->attr[j].vertoffset);   
+
+	 offset += _tnl_format_info[format].attrsize;
+	 j++;
+      }
+   }
+
+   vtx->attr_count = j;
+
+   if (unpacked_size)
+      vtx->vertex_size = unpacked_size;
+   else
+      vtx->vertex_size = offset;
+
+   assert(vtx->vertex_size <= vtx->max_vertex_size);
+   return vtx->vertex_size;
+}
+
+
+
+void _tnl_invalidate_vertices( struct gl_context *ctx, GLuint newinputs )
+{
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
+   vtx->new_inputs |= newinputs;
+}
+
+
+/* This event has broader use beyond this file - will move elsewhere
+ * and probably invoke a driver callback.
+ */
+void _tnl_notify_pipeline_output_change( struct gl_context *ctx )
+{
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
+   invalidate_funcs(vtx);
+}
+
+
+static void adjust_input_ptrs( struct gl_context *ctx, GLint diff)
+{
+   struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
+   struct tnl_clipspace_attr *a = vtx->attr;
+   const GLuint count = vtx->attr_count;
+   GLuint j;
+
+   diff -= 1;
+   for (j=0; j<count; ++j) {
+           register GLvector4f *vptr = VB->AttribPtr[a->attrib];
+	   (a++)->inputptr += diff*vptr->stride;
+   }
+}
+
+static void update_input_ptrs( struct gl_context *ctx, GLuint start )
+{
+   struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
+   struct tnl_clipspace_attr *a = vtx->attr;
+   const GLuint count = vtx->attr_count;
+   GLuint j;
+   
+   for (j = 0; j < count; j++) {
+      GLvector4f *vptr = VB->AttribPtr[a[j].attrib];
+
+      if (vtx->emit != choose_emit_func) {
+	 assert(a[j].inputstride == vptr->stride);
+	 assert(a[j].inputsize == vptr->size);
+      }
+
+      a[j].inputptr = ((GLubyte *)vptr->data) + start * vptr->stride;
+   }
+   
+   if (a->vp) {
+      vtx->vp_scale[0] = a->vp[MAT_SX];
+      vtx->vp_scale[1] = a->vp[MAT_SY];
+      vtx->vp_scale[2] = a->vp[MAT_SZ];
+      vtx->vp_scale[3] = 1.0;
+      vtx->vp_xlate[0] = a->vp[MAT_TX];
+      vtx->vp_xlate[1] = a->vp[MAT_TY];
+      vtx->vp_xlate[2] = a->vp[MAT_TZ];
+      vtx->vp_xlate[3] = 0.0;
+   }
+}
+
+
+void _tnl_build_vertices( struct gl_context *ctx,
+			  GLuint start,
+			  GLuint end,
+			  GLuint newinputs )
+{
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);  
+   update_input_ptrs( ctx, start );      
+   vtx->emit( ctx, end - start, 
+	      (GLubyte *)(vtx->vertex_buf + 
+			  start * vtx->vertex_size));
+}
+
+/* Emit VB vertices start..end-1 to dest.  Note that VB vertex at
+ * position start will be emitted to dest at position zero.
+ */
+void *_tnl_emit_vertices_to_buffer( struct gl_context *ctx,
+				    GLuint start,
+				    GLuint end,
+				    void *dest )
+{
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
+
+   update_input_ptrs(ctx, start);
+   /* Note: dest should not be adjusted for non-zero 'start' values:
+    */
+   vtx->emit( ctx, end - start, (GLubyte*) dest );	
+   return (void *)((GLubyte *)dest + vtx->vertex_size * (end - start));
+}
+
+/* Emit indexed VB vertices elts[start..end-1] to dest.  Note that VB
+ * vertex elts[start] will be emitted to dest at position zero.
+ */
+
+void *_tnl_emit_indexed_vertices_to_buffer( struct gl_context *ctx,
+					    const GLuint *elts,
+					    GLuint start,
+					    GLuint end,
+					    void *dest )
+{
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
+   GLuint oldIndex;
+   GLubyte *cdest = dest;
+
+   update_input_ptrs(ctx, oldIndex = elts[start++]);
+   vtx->emit( ctx, 1, cdest );
+   cdest += vtx->vertex_size;
+
+   for (; start < end; ++start) {
+      adjust_input_ptrs(ctx, elts[start] - oldIndex);
+      oldIndex = elts[start];
+      vtx->emit( ctx, 1, cdest);
+      cdest += vtx->vertex_size;
+   }
+
+   return (void *) cdest;
+}
+
+
+void _tnl_init_vertices( struct gl_context *ctx, 
+			GLuint vb_size,
+			GLuint max_vertex_size )
+{
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);  
+
+   _tnl_install_attrs( ctx, NULL, 0, NULL, 0 );
+
+   vtx->need_extras = GL_TRUE;
+   if (max_vertex_size > vtx->max_vertex_size) {
+      _tnl_free_vertices( ctx );
+      vtx->max_vertex_size = max_vertex_size;
+      vtx->vertex_buf = (GLubyte *)_mesa_align_calloc(vb_size * max_vertex_size, 32 );
+      invalidate_funcs(vtx);
+   }
+
+   switch(CHAN_TYPE) {
+   case GL_UNSIGNED_BYTE:
+      vtx->chan_scale[0] = 255.0;
+      vtx->chan_scale[1] = 255.0;
+      vtx->chan_scale[2] = 255.0;
+      vtx->chan_scale[3] = 255.0;
+      break;
+   case GL_UNSIGNED_SHORT:
+      vtx->chan_scale[0] = 65535.0;
+      vtx->chan_scale[1] = 65535.0;
+      vtx->chan_scale[2] = 65535.0;
+      vtx->chan_scale[3] = 65535.0;
+      break;
+   default:
+      vtx->chan_scale[0] = 1.0;
+      vtx->chan_scale[1] = 1.0;
+      vtx->chan_scale[2] = 1.0;
+      vtx->chan_scale[3] = 1.0;
+      break;
+   }
+
+   vtx->identity[0] = 0.0;
+   vtx->identity[1] = 0.0;
+   vtx->identity[2] = 0.0;
+   vtx->identity[3] = 1.0;
+
+   vtx->codegen_emit = NULL;
+
+#ifdef USE_SSE_ASM
+   if (!_mesa_getenv("MESA_NO_CODEGEN"))
+      vtx->codegen_emit = _tnl_generate_sse_emit;
+#endif
+}
+
+
+void _tnl_free_vertices( struct gl_context *ctx )
+{
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+   if (tnl) {
+      struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
+      struct tnl_clipspace_fastpath *fp, *tmp;
+
+      if (vtx->vertex_buf) {
+         _mesa_align_free(vtx->vertex_buf);
+         vtx->vertex_buf = NULL;
+      }
+
+      for (fp = vtx->fastpath ; fp ; fp = tmp) {
+         tmp = fp->next;
+         FREE(fp->attr);
+
+         /* KW: At the moment, fp->func is constrained to be allocated by
+          * _mesa_exec_alloc(), as the hardwired fastpaths in
+          * t_vertex_generic.c are handled specially.  It would be nice
+          * to unify them, but this probably won't change until this
+          * module gets another overhaul.
+          */
+         _mesa_exec_free((void *) fp->func);
+         FREE(fp);
+      }
+
+      vtx->fastpath = NULL;
+   }
+}
diff --git a/mesalib/src/mesa/tnl/t_vertex_generic.c b/mesalib/src/mesa/tnl/t_vertex_generic.c
index 144b3669d..9dcecdd57 100644
--- a/mesalib/src/mesa/tnl/t_vertex_generic.c
+++ b/mesalib/src/mesa/tnl/t_vertex_generic.c
@@ -1,1155 +1,1156 @@
-
-/*
- * Copyright 2003 Tungsten Graphics, inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
- * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Keith Whitwell <keithw@tungstengraphics.com>
- */
-
-#include "main/glheader.h"
-#include "main/context.h"
-#include "main/colormac.h"
-#include "main/simple_list.h"
-#include "t_context.h"
-#include "t_vertex.h"
-
-
-#if 0
-#define DEBUG_INSERT printf("%s\n", __FUNCTION__)
-#else
-#define DEBUG_INSERT
-#endif
-
-
-/*
- * These functions take the NDC coordinates pointed to by 'in', apply the
- * NDC->Viewport mapping and store the results at 'v'.
- */
-
-static INLINE void insert_4f_viewport_4( const struct tnl_clipspace_attr *a, GLubyte *v,
-                      const GLfloat *in )
-{
-   GLfloat *out = (GLfloat *)v;
-   const GLfloat * const vp = a->vp;
-   DEBUG_INSERT;
-   out[0] = vp[0] * in[0] + vp[12];
-   out[1] = vp[5] * in[1] + vp[13];
-   out[2] = vp[10] * in[2] + vp[14];
-   out[3] = in[3];
-}
-
-static INLINE void insert_4f_viewport_3( const struct tnl_clipspace_attr *a, GLubyte *v,
-				const GLfloat *in )
-{
-   GLfloat *out = (GLfloat *)v;
-   const GLfloat * const vp = a->vp;
-   DEBUG_INSERT;
-   out[0] = vp[0] * in[0] + vp[12];
-   out[1] = vp[5] * in[1] + vp[13];
-   out[2] = vp[10] * in[2] + vp[14];
-   out[3] = 1;
-}
-
-static INLINE void insert_4f_viewport_2( const struct tnl_clipspace_attr *a, GLubyte *v,
-				const GLfloat *in )
-{
-   GLfloat *out = (GLfloat *)v;
-   const GLfloat * const vp = a->vp;
-   DEBUG_INSERT;
-   out[0] = vp[0] * in[0] + vp[12];
-   out[1] = vp[5] * in[1] + vp[13];
-   out[2] = vp[14];
-   out[3] = 1;
-}
-
-static INLINE void insert_4f_viewport_1( const struct tnl_clipspace_attr *a, GLubyte *v,
-				const GLfloat *in )
-{
-   GLfloat *out = (GLfloat *)v;
-   const GLfloat * const vp = a->vp;
-   DEBUG_INSERT;
-   out[0] = vp[0] * in[0] + vp[12];
-   out[1] = vp[13];
-   out[2] = vp[14];
-   out[3] = 1;
-}
-
-static INLINE void insert_3f_viewport_3( const struct tnl_clipspace_attr *a, GLubyte *v,
-				const GLfloat *in )
-{
-   GLfloat *out = (GLfloat *)v;
-   const GLfloat * const vp = a->vp;
-   DEBUG_INSERT;
-   out[0] = vp[0] * in[0] + vp[12];
-   out[1] = vp[5] * in[1] + vp[13];
-   out[2] = vp[10] * in[2] + vp[14];
-}
-
-static INLINE void insert_3f_viewport_2( const struct tnl_clipspace_attr *a, GLubyte *v,
-				const GLfloat *in )
-{
-   GLfloat *out = (GLfloat *)v;
-   const GLfloat * const vp = a->vp;
-   DEBUG_INSERT;
-   out[0] = vp[0] * in[0] + vp[12];
-   out[1] = vp[5] * in[1] + vp[13];
-   out[2] = vp[14];
-}
-
-static INLINE void insert_3f_viewport_1( const struct tnl_clipspace_attr *a, GLubyte *v,
-				const GLfloat *in )
-{
-   GLfloat *out = (GLfloat *)v;
-   const GLfloat * const vp = a->vp;
-   DEBUG_INSERT;
-   out[0] = vp[0] * in[0] + vp[12];
-   out[1] = vp[13];
-   out[2] = vp[14];
-}
-
-static INLINE void insert_2f_viewport_2( const struct tnl_clipspace_attr *a, GLubyte *v,
-				const GLfloat *in )
-{
-   GLfloat *out = (GLfloat *)v;
-   const GLfloat * const vp = a->vp;
-   DEBUG_INSERT;
-   out[0] = vp[0] * in[0] + vp[12];
-   out[1] = vp[5] * in[1] + vp[13];
-}
-
-static INLINE void insert_2f_viewport_1( const struct tnl_clipspace_attr *a, GLubyte *v,
-				const GLfloat *in )
-{
-   GLfloat *out = (GLfloat *)v;
-   const GLfloat * const vp = a->vp;
-   DEBUG_INSERT;
-   out[0] = vp[0] * in[0] + vp[12];
-   out[1] = vp[13];
-}
-
-
-/*
- * These functions do the same as above, except for the viewport mapping.
- */
-
-static INLINE void insert_4f_4( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
-{
-   GLfloat *out = (GLfloat *)(v);
-   (void) a;
-   DEBUG_INSERT;
-   out[0] = in[0];
-   out[1] = in[1];
-   out[2] = in[2];
-   out[3] = in[3];
-}
-
-static INLINE void insert_4f_3( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
-{
-   GLfloat *out = (GLfloat *)(v);
-   (void) a;
-   DEBUG_INSERT;
-   out[0] = in[0];
-   out[1] = in[1];
-   out[2] = in[2];
-   out[3] = 1;
-}
-
-static INLINE void insert_4f_2( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
-{
-   GLfloat *out = (GLfloat *)(v);
-   (void) a;
-   DEBUG_INSERT;
-   out[0] = in[0];
-   out[1] = in[1];
-   out[2] = 0;
-   out[3] = 1;
-}
-
-static INLINE void insert_4f_1( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
-{
-   GLfloat *out = (GLfloat *)(v);
-   (void) a;
-   DEBUG_INSERT;
-   out[0] = in[0];
-   out[1] = 0;
-   out[2] = 0;
-   out[3] = 1;
-}
-
-static INLINE void insert_3f_xyw_4( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
-{
-   GLfloat *out = (GLfloat *)(v);
-   (void) a;
-   DEBUG_INSERT;
-   out[0] = in[0];
-   out[1] = in[1];
-   out[2] = in[3];
-}
-
-static INLINE void insert_3f_xyw_err( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
-{
-   (void) a; (void) v; (void) in;
-   DEBUG_INSERT;
-   exit(1);
-}
-
-static INLINE void insert_3f_3( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
-{
-   GLfloat *out = (GLfloat *)(v);
-   (void) a;
-   DEBUG_INSERT;
-   out[0] = in[0];
-   out[1] = in[1];
-   out[2] = in[2];
-}
-
-static INLINE void insert_3f_2( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
-{
-   GLfloat *out = (GLfloat *)(v);
-   (void) a;
-   DEBUG_INSERT;
-   out[0] = in[0];
-   out[1] = in[1];
-   out[2] = 0;
-}
-
-static INLINE void insert_3f_1( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
-{
-   GLfloat *out = (GLfloat *)(v);
-   (void) a;
-   DEBUG_INSERT;
-   out[0] = in[0];
-   out[1] = 0;
-   out[2] = 0;
-}
-
-
-static INLINE void insert_2f_2( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
-{
-   GLfloat *out = (GLfloat *)(v);
-   (void) a;
-   DEBUG_INSERT;
-   out[0] = in[0];
-   out[1] = in[1];
-}
-
-static INLINE void insert_2f_1( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
-{
-   GLfloat *out = (GLfloat *)(v);
-   (void) a;
-   DEBUG_INSERT;
-   out[0] = in[0];
-   out[1] = 0;
-}
-
-static INLINE void insert_1f_1( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
-{
-   GLfloat *out = (GLfloat *)(v);
-   (void) a;
-   DEBUG_INSERT;
-   out[0] = in[0];
-}
-
-static INLINE void insert_null( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
-{
-   DEBUG_INSERT;
-   (void) a; (void) v; (void) in;
-}
-
-static INLINE void insert_4chan_4f_rgba_4( const struct tnl_clipspace_attr *a, GLubyte *v, 
-				  const GLfloat *in )
-{
-   GLchan *c = (GLchan *)v;
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
-   UNCLAMPED_FLOAT_TO_CHAN(c[1], in[1]); 
-   UNCLAMPED_FLOAT_TO_CHAN(c[2], in[2]); 
-   UNCLAMPED_FLOAT_TO_CHAN(c[3], in[3]);
-}
-
-static INLINE void insert_4chan_4f_rgba_3( const struct tnl_clipspace_attr *a, GLubyte *v, 
-				  const GLfloat *in )
-{
-   GLchan *c = (GLchan *)v;
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
-   UNCLAMPED_FLOAT_TO_CHAN(c[1], in[1]); 
-   UNCLAMPED_FLOAT_TO_CHAN(c[2], in[2]); 
-   c[3] = CHAN_MAX;
-}
-
-static INLINE void insert_4chan_4f_rgba_2( const struct tnl_clipspace_attr *a, GLubyte *v, 
-				  const GLfloat *in )
-{
-   GLchan *c = (GLchan *)v;
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
-   UNCLAMPED_FLOAT_TO_CHAN(c[1], in[1]); 
-   c[2] = 0;
-   c[3] = CHAN_MAX;
-}
-
-static INLINE void insert_4chan_4f_rgba_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
-				  const GLfloat *in )
-{
-   GLchan *c = (GLchan *)v;
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
-   c[1] = 0;
-   c[2] = 0;
-   c[3] = CHAN_MAX;
-}
-
-static INLINE void insert_4ub_4f_rgba_4( const struct tnl_clipspace_attr *a, GLubyte *v, 
-				const GLfloat *in )
-{
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[3]);
-}
-
-static INLINE void insert_4ub_4f_rgba_3( const struct tnl_clipspace_attr *a, GLubyte *v, 
-				const GLfloat *in )
-{
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
-   v[3] = 0xff;
-}
-
-static INLINE void insert_4ub_4f_rgba_2( const struct tnl_clipspace_attr *a, GLubyte *v, 
-				const GLfloat *in )
-{
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
-   v[2] = 0;
-   v[3] = 0xff;
-}
-
-static INLINE void insert_4ub_4f_rgba_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
-				const GLfloat *in )
-{
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
-   v[1] = 0;
-   v[2] = 0;
-   v[3] = 0xff;
-}
-
-static INLINE void insert_4ub_4f_bgra_4( const struct tnl_clipspace_attr *a, GLubyte *v, 
-				const GLfloat *in )
-{
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[3]);
-}
-
-static INLINE void insert_4ub_4f_bgra_3( const struct tnl_clipspace_attr *a, GLubyte *v, 
-				const GLfloat *in )
-{
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
-   v[3] = 0xff;
-}
-
-static INLINE void insert_4ub_4f_bgra_2( const struct tnl_clipspace_attr *a, GLubyte *v, 
-				const GLfloat *in )
-{
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
-   v[0] = 0;
-   v[3] = 0xff;
-}
-
-static INLINE void insert_4ub_4f_bgra_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
-				const GLfloat *in )
-{
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
-   v[1] = 0;
-   v[0] = 0;
-   v[3] = 0xff;
-}
-
-static INLINE void insert_4ub_4f_argb_4( const struct tnl_clipspace_attr *a, GLubyte *v, 
-				const GLfloat *in )
-{
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[2]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[3]);
-}
-
-static INLINE void insert_4ub_4f_argb_3( const struct tnl_clipspace_attr *a, GLubyte *v, 
-				const GLfloat *in )
-{
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[2]);
-   v[0] = 0xff;
-}
-
-static INLINE void insert_4ub_4f_argb_2( const struct tnl_clipspace_attr *a, GLubyte *v, 
-				const GLfloat *in )
-{
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
-   v[3] = 0x00;
-   v[0] = 0xff;
-}
-
-static INLINE void insert_4ub_4f_argb_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
-				const GLfloat *in )
-{
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
-   v[2] = 0x00;
-   v[3] = 0x00;
-   v[0] = 0xff;
-}
-
-static INLINE void insert_4ub_4f_abgr_4( const struct tnl_clipspace_attr *a, GLubyte *v, 
-				const GLfloat *in )
-{
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[2]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[3]);
-}
-
-static INLINE void insert_4ub_4f_abgr_3( const struct tnl_clipspace_attr *a, GLubyte *v, 
-				const GLfloat *in )
-{
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[2]);
-   v[0] = 0xff;
-}
-
-static INLINE void insert_4ub_4f_abgr_2( const struct tnl_clipspace_attr *a, GLubyte *v, 
-				const GLfloat *in )
-{
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
-   v[1] = 0x00;
-   v[0] = 0xff;
-}
-
-static INLINE void insert_4ub_4f_abgr_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
-				const GLfloat *in )
-{
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
-   v[2] = 0x00;
-   v[1] = 0x00;
-   v[0] = 0xff;
-}
-
-static INLINE void insert_3ub_3f_rgb_3( const struct tnl_clipspace_attr *a, GLubyte *v, 
-			       const GLfloat *in )
-{
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
-}
-
-static INLINE void insert_3ub_3f_rgb_2( const struct tnl_clipspace_attr *a, GLubyte *v, 
-			       const GLfloat *in )
-{
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
-   v[2] = 0;
-}
-
-static INLINE void insert_3ub_3f_rgb_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
-			       const GLfloat *in )
-{
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
-   v[1] = 0;
-   v[2] = 0;
-}
-
-static INLINE void insert_3ub_3f_bgr_3( const struct tnl_clipspace_attr *a, GLubyte *v, 
-				 const GLfloat *in )
-{
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
-}
-
-static INLINE void insert_3ub_3f_bgr_2( const struct tnl_clipspace_attr *a, GLubyte *v, 
-				 const GLfloat *in )
-{
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
-   v[0] = 0;
-}
-
-static INLINE void insert_3ub_3f_bgr_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
-				 const GLfloat *in )
-{
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
-   v[1] = 0;
-   v[0] = 0;
-}
-
-
-static INLINE void insert_1ub_1f_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
-			   const GLfloat *in )
-{
-   DEBUG_INSERT;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
-}
-
-
-/***********************************************************************
- * Functions to perform the reverse operations to the above, for
- * swrast translation and clip-interpolation.
- * 
- * Currently always extracts a full 4 floats.
- */
-
-static void extract_4f_viewport( const struct tnl_clipspace_attr *a, GLfloat *out, 
-				 const GLubyte *v )
-{
-   const GLfloat *in = (const GLfloat *)v;
-   const GLfloat * const vp = a->vp;
-   
-   /* Although included for completeness, the position coordinate is
-    * usually handled differently during clipping.
-    */
-   DEBUG_INSERT;
-   out[0] = (in[0] - vp[12]) / vp[0];
-   out[1] = (in[1] - vp[13]) / vp[5];
-   out[2] = (in[2] - vp[14]) / vp[10];
-   out[3] = in[3];
-}
-
-static void extract_3f_viewport( const struct tnl_clipspace_attr *a, GLfloat *out, 
-				 const GLubyte *v )
-{
-   const GLfloat *in = (const GLfloat *)v;
-   const GLfloat * const vp = a->vp;
-   DEBUG_INSERT;
-   out[0] = (in[0] - vp[12]) / vp[0];
-   out[1] = (in[1] - vp[13]) / vp[5];
-   out[2] = (in[2] - vp[14]) / vp[10];
-   out[3] = 1;
-}
-
-
-static void extract_2f_viewport( const struct tnl_clipspace_attr *a, GLfloat *out, 
-				 const GLubyte *v )
-{
-   const GLfloat *in = (const GLfloat *)v;
-   const GLfloat * const vp = a->vp;
-   DEBUG_INSERT;
-   out[0] = (in[0] - vp[12]) / vp[0];
-   out[1] = (in[1] - vp[13]) / vp[5];
-   out[2] = 0;
-   out[3] = 1;
-}
-
-
-static void extract_4f( const struct tnl_clipspace_attr *a, GLfloat *out, const GLubyte *v  )
-{
-   const GLfloat *in = (const GLfloat *)v;
-   (void) a;
-   
-   out[0] = in[0];
-   out[1] = in[1];
-   out[2] = in[2];
-   out[3] = in[3];
-}
-
-static void extract_3f_xyw( const struct tnl_clipspace_attr *a, GLfloat *out, const GLubyte *v )
-{
-   const GLfloat *in = (const GLfloat *)v;
-   (void) a;
-   
-   out[0] = in[0];
-   out[1] = in[1];
-   out[2] = 0;
-   out[3] = in[2];
-}
-
-
-static void extract_3f( const struct tnl_clipspace_attr *a, GLfloat *out, const GLubyte *v )
-{
-   const GLfloat *in = (const GLfloat *)v;
-   (void) a;
-   
-   out[0] = in[0];
-   out[1] = in[1];
-   out[2] = in[2];
-   out[3] = 1;
-}
-
-
-static void extract_2f( const struct tnl_clipspace_attr *a, GLfloat *out, const GLubyte *v )
-{
-   const GLfloat *in = (const GLfloat *)v;
-   (void) a;
-   
-   out[0] = in[0];
-   out[1] = in[1];
-   out[2] = 0;
-   out[3] = 1;
-}
-
-static void extract_1f( const struct tnl_clipspace_attr *a, GLfloat *out, const GLubyte *v )
-{
-   const GLfloat *in = (const GLfloat *)v;
-   (void) a;
-   
-   out[0] = in[0];
-   out[1] = 0;
-   out[2] = 0;
-   out[3] = 1;
-}
-
-static void extract_4chan_4f_rgba( const struct tnl_clipspace_attr *a, GLfloat *out, 
-				 const GLubyte *v )
-{
-   GLchan *c = (GLchan *)v;
-   (void) a;
-
-   out[0] = CHAN_TO_FLOAT(c[0]);
-   out[1] = CHAN_TO_FLOAT(c[1]);
-   out[2] = CHAN_TO_FLOAT(c[2]);
-   out[3] = CHAN_TO_FLOAT(c[3]);
-}
-
-static void extract_4ub_4f_rgba( const struct tnl_clipspace_attr *a, GLfloat *out, 
-				 const GLubyte *v )
-{
-   (void) a;
-   out[0] = UBYTE_TO_FLOAT(v[0]);
-   out[1] = UBYTE_TO_FLOAT(v[1]);
-   out[2] = UBYTE_TO_FLOAT(v[2]);
-   out[3] = UBYTE_TO_FLOAT(v[3]);
-}
-
-static void extract_4ub_4f_bgra( const struct tnl_clipspace_attr *a, GLfloat *out, 
-				 const GLubyte *v )
-{
-   (void) a;
-   out[2] = UBYTE_TO_FLOAT(v[0]);
-   out[1] = UBYTE_TO_FLOAT(v[1]);
-   out[0] = UBYTE_TO_FLOAT(v[2]);
-   out[3] = UBYTE_TO_FLOAT(v[3]);
-}
-
-static void extract_4ub_4f_argb( const struct tnl_clipspace_attr *a, GLfloat *out, 
-				 const GLubyte *v )
-{
-   (void) a;
-   out[3] = UBYTE_TO_FLOAT(v[0]);
-   out[0] = UBYTE_TO_FLOAT(v[1]);
-   out[1] = UBYTE_TO_FLOAT(v[2]);
-   out[2] = UBYTE_TO_FLOAT(v[3]);
-}
-
-static void extract_4ub_4f_abgr( const struct tnl_clipspace_attr *a, GLfloat *out, 
-				 const GLubyte *v )
-{
-   (void) a;
-   out[3] = UBYTE_TO_FLOAT(v[0]);
-   out[2] = UBYTE_TO_FLOAT(v[1]);
-   out[1] = UBYTE_TO_FLOAT(v[2]);
-   out[0] = UBYTE_TO_FLOAT(v[3]);
-}
-
-static void extract_3ub_3f_rgb( const struct tnl_clipspace_attr *a, GLfloat *out, 
-				const GLubyte *v )
-{
-   (void) a;
-   out[0] = UBYTE_TO_FLOAT(v[0]);
-   out[1] = UBYTE_TO_FLOAT(v[1]);
-   out[2] = UBYTE_TO_FLOAT(v[2]);
-   out[3] = 1;
-}
-
-static void extract_3ub_3f_bgr( const struct tnl_clipspace_attr *a, GLfloat *out, 
-				const GLubyte *v )
-{
-   (void) a;
-   out[2] = UBYTE_TO_FLOAT(v[0]);
-   out[1] = UBYTE_TO_FLOAT(v[1]);
-   out[0] = UBYTE_TO_FLOAT(v[2]);
-   out[3] = 1;
-}
-
-static void extract_1ub_1f( const struct tnl_clipspace_attr *a, GLfloat *out, const GLubyte *v )
-{
-   (void) a;
-   out[0] = UBYTE_TO_FLOAT(v[0]);
-   out[1] = 0;
-   out[2] = 0;
-   out[3] = 1;
-}
-
-
-const struct tnl_format_info _tnl_format_info[EMIT_MAX] = 
-{
-   { "1f",
-     extract_1f,
-     { insert_1f_1, insert_1f_1, insert_1f_1, insert_1f_1 },
-     sizeof(GLfloat) },
-
-   { "2f",
-     extract_2f,
-     { insert_2f_1, insert_2f_2, insert_2f_2, insert_2f_2 },
-     2 * sizeof(GLfloat) },
-
-   { "3f",
-     extract_3f,
-     { insert_3f_1, insert_3f_2, insert_3f_3, insert_3f_3 },
-     3 * sizeof(GLfloat) },
-
-   { "4f",
-     extract_4f,
-     { insert_4f_1, insert_4f_2, insert_4f_3, insert_4f_4 },
-     4 * sizeof(GLfloat) },
-
-   { "2f_viewport",
-     extract_2f_viewport,
-     { insert_2f_viewport_1, insert_2f_viewport_2, insert_2f_viewport_2,
-       insert_2f_viewport_2 },
-     2 * sizeof(GLfloat) },
-
-   { "3f_viewport",
-     extract_3f_viewport,
-     { insert_3f_viewport_1, insert_3f_viewport_2, insert_3f_viewport_3,
-       insert_3f_viewport_3 },
-     3 * sizeof(GLfloat) },
-
-   { "4f_viewport",
-     extract_4f_viewport,
-     { insert_4f_viewport_1, insert_4f_viewport_2, insert_4f_viewport_3,
-       insert_4f_viewport_4 }, 
-     4 * sizeof(GLfloat) },
-
-   { "3f_xyw",
-     extract_3f_xyw,
-     { insert_3f_xyw_err, insert_3f_xyw_err, insert_3f_xyw_err, 
-       insert_3f_xyw_4 },
-     3 * sizeof(GLfloat) },
-
-   { "1ub_1f",
-     extract_1ub_1f,
-     { insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1 },
-     sizeof(GLubyte) },
-
-   { "3ub_3f_rgb",
-     extract_3ub_3f_rgb,
-     { insert_3ub_3f_rgb_1, insert_3ub_3f_rgb_2, insert_3ub_3f_rgb_3,
-       insert_3ub_3f_rgb_3 },
-     3 * sizeof(GLubyte) },
-
-   { "3ub_3f_bgr",
-     extract_3ub_3f_bgr,
-     { insert_3ub_3f_bgr_1, insert_3ub_3f_bgr_2, insert_3ub_3f_bgr_3,
-       insert_3ub_3f_bgr_3 },
-     3 * sizeof(GLubyte) },
-
-   { "4ub_4f_rgba",
-     extract_4ub_4f_rgba,
-     { insert_4ub_4f_rgba_1, insert_4ub_4f_rgba_2, insert_4ub_4f_rgba_3, 
-       insert_4ub_4f_rgba_4 },
-     4 * sizeof(GLubyte) },
-
-   { "4ub_4f_bgra",
-     extract_4ub_4f_bgra,
-     { insert_4ub_4f_bgra_1, insert_4ub_4f_bgra_2, insert_4ub_4f_bgra_3,
-       insert_4ub_4f_bgra_4 },
-     4 * sizeof(GLubyte) },
-
-   { "4ub_4f_argb",
-     extract_4ub_4f_argb,
-     { insert_4ub_4f_argb_1, insert_4ub_4f_argb_2, insert_4ub_4f_argb_3,
-       insert_4ub_4f_argb_4 },
-     4 * sizeof(GLubyte) },
-
-   { "4ub_4f_abgr",
-     extract_4ub_4f_abgr,
-     { insert_4ub_4f_abgr_1, insert_4ub_4f_abgr_2, insert_4ub_4f_abgr_3,
-       insert_4ub_4f_abgr_4 },
-     4 * sizeof(GLubyte) },
-
-   { "4chan_4f_rgba",
-     extract_4chan_4f_rgba,
-     { insert_4chan_4f_rgba_1, insert_4chan_4f_rgba_2, insert_4chan_4f_rgba_3,
-       insert_4chan_4f_rgba_4 },
-     4 * sizeof(GLchan) },
-
-   { "pad",
-     NULL,
-     { NULL, NULL, NULL, NULL },
-     0 }
-
-};
-
-
-
-    
-/***********************************************************************
- * Hardwired fastpaths for emitting whole vertices or groups of
- * vertices
- */
-#define EMIT5(NR, F0, F1, F2, F3, F4, NAME)				\
-static void NAME( struct gl_context *ctx,					\
-		  GLuint count,						\
-		  GLubyte *v )						\
-{									\
-   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);			\
-   struct tnl_clipspace_attr *a = vtx->attr;				\
-   GLuint i;								\
-									\
-   for (i = 0 ; i < count ; i++, v += vtx->vertex_size) {		\
-      if (NR > 0) {							\
-	 F0( &a[0], v + a[0].vertoffset, (GLfloat *)a[0].inputptr );	\
-	 a[0].inputptr += a[0].inputstride;				\
-      }									\
-      									\
-      if (NR > 1) {							\
-	 F1( &a[1], v + a[1].vertoffset, (GLfloat *)a[1].inputptr );	\
-	 a[1].inputptr += a[1].inputstride;				\
-      }									\
-      									\
-      if (NR > 2) {							\
-	 F2( &a[2], v + a[2].vertoffset, (GLfloat *)a[2].inputptr );	\
-	 a[2].inputptr += a[2].inputstride;				\
-      }									\
-      									\
-      if (NR > 3) {							\
-	 F3( &a[3], v + a[3].vertoffset, (GLfloat *)a[3].inputptr );	\
-	 a[3].inputptr += a[3].inputstride;				\
-      }									\
-									\
-      if (NR > 4) {							\
-	 F4( &a[4], v + a[4].vertoffset, (GLfloat *)a[4].inputptr );	\
-	 a[4].inputptr += a[4].inputstride;				\
-      }									\
-   }									\
-}
-
-   
-#define EMIT2(F0, F1, NAME) EMIT5(2, F0, F1, insert_null, \
-				  insert_null, insert_null, NAME)
-
-#define EMIT3(F0, F1, F2, NAME) EMIT5(3, F0, F1, F2, insert_null, \
-				      insert_null, NAME)
-   
-#define EMIT4(F0, F1, F2, F3, NAME) EMIT5(4, F0, F1, F2, F3, \
-				          insert_null, NAME)
-   
-
-EMIT2(insert_3f_viewport_3, insert_4ub_4f_rgba_4, emit_viewport3_rgba4)
-EMIT2(insert_3f_viewport_3, insert_4ub_4f_bgra_4, emit_viewport3_bgra4)
-EMIT2(insert_3f_3, insert_4ub_4f_rgba_4, emit_xyz3_rgba4)
-
-EMIT3(insert_4f_viewport_4, insert_4ub_4f_rgba_4, insert_2f_2, emit_viewport4_rgba4_st2)
-EMIT3(insert_4f_viewport_4, insert_4ub_4f_bgra_4, insert_2f_2,  emit_viewport4_bgra4_st2)
-EMIT3(insert_4f_4, insert_4ub_4f_rgba_4, insert_2f_2, emit_xyzw4_rgba4_st2)
-
-EMIT4(insert_4f_viewport_4, insert_4ub_4f_rgba_4, insert_2f_2, insert_2f_2, emit_viewport4_rgba4_st2_st2)
-EMIT4(insert_4f_viewport_4, insert_4ub_4f_bgra_4, insert_2f_2, insert_2f_2,  emit_viewport4_bgra4_st2_st2)
-EMIT4(insert_4f_4, insert_4ub_4f_rgba_4, insert_2f_2, insert_2f_2, emit_xyzw4_rgba4_st2_st2)
-
-
-/* Use the codegen paths to select one of a number of hardwired
- * fastpaths.
- */
-void _tnl_generate_hardwired_emit( struct gl_context *ctx )
-{
-   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
-   tnl_emit_func func = NULL;
-
-   /* Does it fit a hardwired fastpath?  Help! this is growing out of
-    * control!
-    */
-   switch (vtx->attr_count) {
-   case 2:
-      if (vtx->attr[0].emit == insert_3f_viewport_3) {
-	 if (vtx->attr[1].emit == insert_4ub_4f_bgra_4) 
-	    func = emit_viewport3_bgra4;
-	 else if (vtx->attr[1].emit == insert_4ub_4f_rgba_4) 
-	    func = emit_viewport3_rgba4;
-      }
-      else if (vtx->attr[0].emit == insert_3f_3 &&
-	       vtx->attr[1].emit == insert_4ub_4f_rgba_4) {
- 	 func = emit_xyz3_rgba4; 
-      }
-      break;
-   case 3:
-      if (vtx->attr[2].emit == insert_2f_2) {
-	 if (vtx->attr[1].emit == insert_4ub_4f_rgba_4) {
-	    if (vtx->attr[0].emit == insert_4f_viewport_4)
-	       func = emit_viewport4_rgba4_st2;
-	    else if (vtx->attr[0].emit == insert_4f_4) 
-	       func = emit_xyzw4_rgba4_st2;
-	 }
-	 else if (vtx->attr[1].emit == insert_4ub_4f_bgra_4 &&
-		  vtx->attr[0].emit == insert_4f_viewport_4)
-	    func = emit_viewport4_bgra4_st2;
-      }
-      break;
-   case 4:
-      if (vtx->attr[2].emit == insert_2f_2 &&
-	  vtx->attr[3].emit == insert_2f_2) {
-	 if (vtx->attr[1].emit == insert_4ub_4f_rgba_4) {
-	    if (vtx->attr[0].emit == insert_4f_viewport_4)
-	       func = emit_viewport4_rgba4_st2_st2;
-	    else if (vtx->attr[0].emit == insert_4f_4) 
-	       func = emit_xyzw4_rgba4_st2_st2;
-	 }
-	 else if (vtx->attr[1].emit == insert_4ub_4f_bgra_4 &&
-		  vtx->attr[0].emit == insert_4f_viewport_4)
-	    func = emit_viewport4_bgra4_st2_st2;
-      }
-      break;
-   }
-
-   vtx->emit = func;
-}
-
-/***********************************************************************
- * Generic (non-codegen) functions for whole vertices or groups of
- * vertices
- */
-
-void _tnl_generic_emit( struct gl_context *ctx,
-			GLuint count,
-			GLubyte *v )
-{
-   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
-   struct tnl_clipspace_attr *a = vtx->attr;
-   const GLuint attr_count = vtx->attr_count;
-   const GLuint stride = vtx->vertex_size;
-   GLuint i, j;
-
-   for (i = 0 ; i < count ; i++, v += stride) {
-      for (j = 0; j < attr_count; j++) {
-	 GLfloat *in = (GLfloat *)a[j].inputptr;
-	 a[j].inputptr += a[j].inputstride;
-	 a[j].emit( &a[j], v + a[j].vertoffset, in );
-      }
-   }
-}
-
-
-void _tnl_generic_interp( struct gl_context *ctx,
-			    GLfloat t,
-			    GLuint edst, GLuint eout, GLuint ein,
-			    GLboolean force_boundary )
-{
-   TNLcontext *tnl = TNL_CONTEXT(ctx);
-   struct vertex_buffer *VB = &tnl->vb;
-   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
-   const GLubyte *vin  = vtx->vertex_buf + ein  * vtx->vertex_size;
-   const GLubyte *vout = vtx->vertex_buf + eout * vtx->vertex_size;
-   GLubyte *vdst = vtx->vertex_buf + edst * vtx->vertex_size;
-   const struct tnl_clipspace_attr *a = vtx->attr;
-   const GLuint attr_count = vtx->attr_count;
-   GLuint j;
-   (void) force_boundary;
-
-   if (tnl->NeedNdcCoords) {
-      const GLfloat *dstclip = VB->ClipPtr->data[edst];
-      if (dstclip[3] != 0.0) {
-	 const GLfloat w = 1.0f / dstclip[3];
-	 GLfloat pos[4];
-
-	 pos[0] = dstclip[0] * w;
-	 pos[1] = dstclip[1] * w;
-	 pos[2] = dstclip[2] * w;
-	 pos[3] = w;
-
-	 a[0].insert[4-1]( &a[0], vdst, pos );
-      }
-   }
-   else {
-      a[0].insert[4-1]( &a[0], vdst, VB->ClipPtr->data[edst] );
-   }
-
-
-   for (j = 1; j < attr_count; j++) {
-      GLfloat fin[4], fout[4], fdst[4];
-	 
-      a[j].extract( &a[j], fin, vin + a[j].vertoffset );
-      a[j].extract( &a[j], fout, vout + a[j].vertoffset );
-
-      INTERP_F( t, fdst[3], fout[3], fin[3] );
-      INTERP_F( t, fdst[2], fout[2], fin[2] );
-      INTERP_F( t, fdst[1], fout[1], fin[1] );
-      INTERP_F( t, fdst[0], fout[0], fin[0] );
-
-      a[j].insert[4-1]( &a[j], vdst + a[j].vertoffset, fdst );
-   }
-}
-
-
-/* Extract color attributes from one vertex and insert them into
- * another.  (Shortcircuit extract/insert with memcpy).
- */
-void _tnl_generic_copy_pv( struct gl_context *ctx, GLuint edst, GLuint esrc )
-{
-   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
-   GLubyte *vsrc = vtx->vertex_buf + esrc * vtx->vertex_size;
-   GLubyte *vdst = vtx->vertex_buf + edst * vtx->vertex_size;
-   const struct tnl_clipspace_attr *a = vtx->attr;
-   const GLuint attr_count = vtx->attr_count;
-   GLuint j;
-
-   for (j = 0; j < attr_count; j++) {
-      if (a[j].attrib == VERT_ATTRIB_COLOR0 ||
-	  a[j].attrib == VERT_ATTRIB_COLOR1) {
-
-	 memcpy( vdst + a[j].vertoffset,
-                 vsrc + a[j].vertoffset,
-                 a[j].vertattrsize );
-      }
-   }
-}
-
-
-/* Helper functions for hardware which doesn't put back colors and/or
- * edgeflags into vertices.
- */
-void _tnl_generic_interp_extras( struct gl_context *ctx,
-				   GLfloat t,
-				   GLuint dst, GLuint out, GLuint in,
-				   GLboolean force_boundary )
-{
-   struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
-
-   /* If stride is zero, BackfaceColorPtr is constant across the VB, so
-    * there is no point interpolating between two values as they will
-    * be identical.  In all other cases, this value is generated by
-    * t_vb_lighttmp.h and has a stride of 4 dwords.
-    */
-   if (VB->BackfaceColorPtr && VB->BackfaceColorPtr->stride) {
-      assert(VB->BackfaceColorPtr->stride == 4 * sizeof(GLfloat));
-
-      INTERP_4F( t,
-		 VB->BackfaceColorPtr->data[dst],
-		 VB->BackfaceColorPtr->data[out],
-		 VB->BackfaceColorPtr->data[in] );
-   }
-
-   if (VB->BackfaceSecondaryColorPtr) {
-      assert(VB->BackfaceSecondaryColorPtr->stride == 4 * sizeof(GLfloat));
-      
-      INTERP_3F( t,
-		 VB->BackfaceSecondaryColorPtr->data[dst],
-		 VB->BackfaceSecondaryColorPtr->data[out],
-		 VB->BackfaceSecondaryColorPtr->data[in] );
-   }
-   
-   if (VB->BackfaceIndexPtr) {
-      VB->BackfaceIndexPtr->data[dst][0] = LINTERP( t,
-					       VB->BackfaceIndexPtr->data[out][0],
-					       VB->BackfaceIndexPtr->data[in][0] );
-   }
-
-   if (VB->EdgeFlag) {
-      VB->EdgeFlag[dst] = VB->EdgeFlag[out] || force_boundary;
-   }
-
-   _tnl_generic_interp(ctx, t, dst, out, in, force_boundary);
-}
-
-void _tnl_generic_copy_pv_extras( struct gl_context *ctx, 
-				  GLuint dst, GLuint src )
-{
-   struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
-
-   /* See above comment:
-    */
-   if (VB->BackfaceColorPtr && VB->BackfaceColorPtr->stride) {
-      COPY_4FV( VB->BackfaceColorPtr->data[dst],
-		VB->BackfaceColorPtr->data[src] );
-   }
-
-   if (VB->BackfaceSecondaryColorPtr) {
-      COPY_4FV( VB->BackfaceSecondaryColorPtr->data[dst],
-		VB->BackfaceSecondaryColorPtr->data[src] );
-   }
-
-   if (VB->BackfaceIndexPtr) {
-      VB->BackfaceIndexPtr->data[dst][0] = VB->BackfaceIndexPtr->data[src][0];
-   }
-
-   _tnl_generic_copy_pv(ctx, dst, src);
-}
-
-
+
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+#include "main/glheader.h"
+#include "main/context.h"
+#include "main/colormac.h"
+#include "main/simple_list.h"
+#include "swrast/s_chan.h"
+#include "t_context.h"
+#include "t_vertex.h"
+
+
+#if 0
+#define DEBUG_INSERT printf("%s\n", __FUNCTION__)
+#else
+#define DEBUG_INSERT
+#endif
+
+
+/*
+ * These functions take the NDC coordinates pointed to by 'in', apply the
+ * NDC->Viewport mapping and store the results at 'v'.
+ */
+
+static INLINE void insert_4f_viewport_4( const struct tnl_clipspace_attr *a, GLubyte *v,
+                      const GLfloat *in )
+{
+   GLfloat *out = (GLfloat *)v;
+   const GLfloat * const vp = a->vp;
+   DEBUG_INSERT;
+   out[0] = vp[0] * in[0] + vp[12];
+   out[1] = vp[5] * in[1] + vp[13];
+   out[2] = vp[10] * in[2] + vp[14];
+   out[3] = in[3];
+}
+
+static INLINE void insert_4f_viewport_3( const struct tnl_clipspace_attr *a, GLubyte *v,
+				const GLfloat *in )
+{
+   GLfloat *out = (GLfloat *)v;
+   const GLfloat * const vp = a->vp;
+   DEBUG_INSERT;
+   out[0] = vp[0] * in[0] + vp[12];
+   out[1] = vp[5] * in[1] + vp[13];
+   out[2] = vp[10] * in[2] + vp[14];
+   out[3] = 1;
+}
+
+static INLINE void insert_4f_viewport_2( const struct tnl_clipspace_attr *a, GLubyte *v,
+				const GLfloat *in )
+{
+   GLfloat *out = (GLfloat *)v;
+   const GLfloat * const vp = a->vp;
+   DEBUG_INSERT;
+   out[0] = vp[0] * in[0] + vp[12];
+   out[1] = vp[5] * in[1] + vp[13];
+   out[2] = vp[14];
+   out[3] = 1;
+}
+
+static INLINE void insert_4f_viewport_1( const struct tnl_clipspace_attr *a, GLubyte *v,
+				const GLfloat *in )
+{
+   GLfloat *out = (GLfloat *)v;
+   const GLfloat * const vp = a->vp;
+   DEBUG_INSERT;
+   out[0] = vp[0] * in[0] + vp[12];
+   out[1] = vp[13];
+   out[2] = vp[14];
+   out[3] = 1;
+}
+
+static INLINE void insert_3f_viewport_3( const struct tnl_clipspace_attr *a, GLubyte *v,
+				const GLfloat *in )
+{
+   GLfloat *out = (GLfloat *)v;
+   const GLfloat * const vp = a->vp;
+   DEBUG_INSERT;
+   out[0] = vp[0] * in[0] + vp[12];
+   out[1] = vp[5] * in[1] + vp[13];
+   out[2] = vp[10] * in[2] + vp[14];
+}
+
+static INLINE void insert_3f_viewport_2( const struct tnl_clipspace_attr *a, GLubyte *v,
+				const GLfloat *in )
+{
+   GLfloat *out = (GLfloat *)v;
+   const GLfloat * const vp = a->vp;
+   DEBUG_INSERT;
+   out[0] = vp[0] * in[0] + vp[12];
+   out[1] = vp[5] * in[1] + vp[13];
+   out[2] = vp[14];
+}
+
+static INLINE void insert_3f_viewport_1( const struct tnl_clipspace_attr *a, GLubyte *v,
+				const GLfloat *in )
+{
+   GLfloat *out = (GLfloat *)v;
+   const GLfloat * const vp = a->vp;
+   DEBUG_INSERT;
+   out[0] = vp[0] * in[0] + vp[12];
+   out[1] = vp[13];
+   out[2] = vp[14];
+}
+
+static INLINE void insert_2f_viewport_2( const struct tnl_clipspace_attr *a, GLubyte *v,
+				const GLfloat *in )
+{
+   GLfloat *out = (GLfloat *)v;
+   const GLfloat * const vp = a->vp;
+   DEBUG_INSERT;
+   out[0] = vp[0] * in[0] + vp[12];
+   out[1] = vp[5] * in[1] + vp[13];
+}
+
+static INLINE void insert_2f_viewport_1( const struct tnl_clipspace_attr *a, GLubyte *v,
+				const GLfloat *in )
+{
+   GLfloat *out = (GLfloat *)v;
+   const GLfloat * const vp = a->vp;
+   DEBUG_INSERT;
+   out[0] = vp[0] * in[0] + vp[12];
+   out[1] = vp[13];
+}
+
+
+/*
+ * These functions do the same as above, except for the viewport mapping.
+ */
+
+static INLINE void insert_4f_4( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   GLfloat *out = (GLfloat *)(v);
+   (void) a;
+   DEBUG_INSERT;
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = in[2];
+   out[3] = in[3];
+}
+
+static INLINE void insert_4f_3( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   GLfloat *out = (GLfloat *)(v);
+   (void) a;
+   DEBUG_INSERT;
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = in[2];
+   out[3] = 1;
+}
+
+static INLINE void insert_4f_2( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   GLfloat *out = (GLfloat *)(v);
+   (void) a;
+   DEBUG_INSERT;
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = 0;
+   out[3] = 1;
+}
+
+static INLINE void insert_4f_1( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   GLfloat *out = (GLfloat *)(v);
+   (void) a;
+   DEBUG_INSERT;
+   out[0] = in[0];
+   out[1] = 0;
+   out[2] = 0;
+   out[3] = 1;
+}
+
+static INLINE void insert_3f_xyw_4( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   GLfloat *out = (GLfloat *)(v);
+   (void) a;
+   DEBUG_INSERT;
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = in[3];
+}
+
+static INLINE void insert_3f_xyw_err( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   (void) a; (void) v; (void) in;
+   DEBUG_INSERT;
+   exit(1);
+}
+
+static INLINE void insert_3f_3( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   GLfloat *out = (GLfloat *)(v);
+   (void) a;
+   DEBUG_INSERT;
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = in[2];
+}
+
+static INLINE void insert_3f_2( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   GLfloat *out = (GLfloat *)(v);
+   (void) a;
+   DEBUG_INSERT;
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = 0;
+}
+
+static INLINE void insert_3f_1( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   GLfloat *out = (GLfloat *)(v);
+   (void) a;
+   DEBUG_INSERT;
+   out[0] = in[0];
+   out[1] = 0;
+   out[2] = 0;
+}
+
+
+static INLINE void insert_2f_2( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   GLfloat *out = (GLfloat *)(v);
+   (void) a;
+   DEBUG_INSERT;
+   out[0] = in[0];
+   out[1] = in[1];
+}
+
+static INLINE void insert_2f_1( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   GLfloat *out = (GLfloat *)(v);
+   (void) a;
+   DEBUG_INSERT;
+   out[0] = in[0];
+   out[1] = 0;
+}
+
+static INLINE void insert_1f_1( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   GLfloat *out = (GLfloat *)(v);
+   (void) a;
+   DEBUG_INSERT;
+   out[0] = in[0];
+}
+
+static INLINE void insert_null( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   (void) in; (void) v; (void) a;   /* placeholder: nothing is written to the vertex */
+   DEBUG_INSERT;
+}
+
+/* Convert four floats to four GLchan values: dst[0..3] <- in[0..3]. */
+static INLINE void insert_4chan_4f_rgba_4( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   GLchan *dst = (GLchan *) v;
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_CHAN(dst[0], in[0]);
+   UNCLAMPED_FLOAT_TO_CHAN(dst[1], in[1]);
+   UNCLAMPED_FLOAT_TO_CHAN(dst[2], in[2]);
+   UNCLAMPED_FLOAT_TO_CHAN(dst[3], in[3]);
+}
+
+/* Three floats in, four GLchan values out; the fourth is CHAN_MAX. */
+static INLINE void insert_4chan_4f_rgba_3( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   GLchan *dst = (GLchan *) v;
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_CHAN(dst[0], in[0]);
+   UNCLAMPED_FLOAT_TO_CHAN(dst[1], in[1]);
+   UNCLAMPED_FLOAT_TO_CHAN(dst[2], in[2]);
+   dst[3] = CHAN_MAX;
+}
+
+/* Two floats in, four GLchan values out; third is 0, fourth is CHAN_MAX. */
+static INLINE void insert_4chan_4f_rgba_2( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   GLchan *dst = (GLchan *) v;
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_CHAN(dst[0], in[0]);
+   UNCLAMPED_FLOAT_TO_CHAN(dst[1], in[1]);
+   dst[2] = 0;
+   dst[3] = CHAN_MAX;
+}
+
+/* One float in, four GLchan values out; middle two are 0, last is CHAN_MAX. */
+static INLINE void insert_4chan_4f_rgba_1( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   GLchan *dst = (GLchan *) v;
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_CHAN(dst[0], in[0]);
+   dst[1] = 0;
+   dst[2] = 0;
+   dst[3] = CHAN_MAX;
+}
+
+/* Four floats to four bytes in input order: v[0..3] <- in[0..3]. */
+static INLINE void insert_4ub_4f_rgba_4( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[3]);
+}
+
+/* Three floats to bytes in input order; the fourth byte is forced to 0xff. */
+static INLINE void insert_4ub_4f_rgba_3( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
+   v[3] = 0xff;
+}
+
+/* Two floats to bytes in input order; v[2] is 0 and v[3] is 0xff. */
+static INLINE void insert_4ub_4f_rgba_2( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[2] = 0;
+   v[3] = 0xff;
+}
+
+/* One float to v[0]; v[1] and v[2] are 0 and v[3] is 0xff. */
+static INLINE void insert_4ub_4f_rgba_1( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   v[1] = 0;
+   v[2] = 0;
+   v[3] = 0xff;
+}
+
+/* Four floats to bytes with the first and third swapped: v[2],v[1],v[0],v[3] <- in[0..3]. */
+static INLINE void insert_4ub_4f_bgra_4( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[3]);
+}
+
+/* Three floats to v[2], v[1], v[0]; v[3] is forced to 0xff. */
+static INLINE void insert_4ub_4f_bgra_3( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
+   v[3] = 0xff;
+}
+
+/* Two floats to v[2] and v[1]; v[0] is 0 and v[3] is 0xff. */
+static INLINE void insert_4ub_4f_bgra_2( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[0] = 0;
+   v[3] = 0xff;
+}
+
+/* One float to v[2]; v[1] and v[0] are 0 and v[3] is 0xff. */
+static INLINE void insert_4ub_4f_bgra_1( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   v[1] = 0;
+   v[0] = 0;
+   v[3] = 0xff;
+}
+
+/* Four floats to bytes with the last input first: v[1],v[2],v[3],v[0] <- in[0..3]. */
+static INLINE void insert_4ub_4f_argb_4( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[3]);
+}
+
+/* Three floats to v[1], v[2], v[3]; v[0] is forced to 0xff. */
+static INLINE void insert_4ub_4f_argb_3( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[2]);
+   v[0] = 0xff;
+}
+
+/* Two floats to v[1] and v[2]; v[3] is 0 and v[0] is 0xff. */
+static INLINE void insert_4ub_4f_argb_2( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   v[3] = 0x00;
+   v[0] = 0xff;
+}
+
+/* One float to v[1]; v[2] and v[3] are 0 and v[0] is 0xff. */
+static INLINE void insert_4ub_4f_argb_1( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   v[2] = 0x00;
+   v[3] = 0x00;
+   v[0] = 0xff;
+}
+
+/* Four floats to bytes in fully reversed order: v[3],v[2],v[1],v[0] <- in[0..3]. */
+static INLINE void insert_4ub_4f_abgr_4( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[3]);
+}
+
+/* Three floats to v[3], v[2], v[1]; v[0] is forced to 0xff. */
+static INLINE void insert_4ub_4f_abgr_3( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[2]);
+   v[0] = 0xff;
+}
+
+/* Two floats to v[3] and v[2]; v[1] is 0 and v[0] is 0xff. */
+static INLINE void insert_4ub_4f_abgr_2( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   v[1] = 0x00;
+   v[0] = 0xff;
+}
+
+/* One float to v[3]; v[2] and v[1] are 0 and v[0] is 0xff. */
+static INLINE void insert_4ub_4f_abgr_1( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   v[2] = 0x00;
+   v[1] = 0x00;
+   v[0] = 0xff;
+}
+
+/* Three floats to three bytes in input order: v[0..2] <- in[0..2]. */
+static INLINE void insert_3ub_3f_rgb_3( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
+}
+
+/* Two floats to v[0] and v[1]; v[2] is 0. */
+static INLINE void insert_3ub_3f_rgb_2( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[2] = 0;
+}
+
+/* One float to v[0]; v[1] and v[2] are 0. */
+static INLINE void insert_3ub_3f_rgb_1( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   v[1] = 0;
+   v[2] = 0;
+}
+
+/* Three floats to three bytes in reversed order: v[2],v[1],v[0] <- in[0..2]. */
+static INLINE void insert_3ub_3f_bgr_3( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
+}
+
+/* Two floats to v[2] and v[1]; v[0] is 0. */
+static INLINE void insert_3ub_3f_bgr_2( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[0] = 0;
+}
+
+/* One float to v[2]; v[1] and v[0] are 0. */
+static INLINE void insert_3ub_3f_bgr_1( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   v[1] = 0;
+   v[0] = 0;
+}
+
+
+/* A single float converted to a single byte at v[0]. */
+static INLINE void insert_1ub_1f_1( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
+{
+   (void) a;
+   DEBUG_INSERT;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+}
+
+
+/***********************************************************************
+ * Functions to perform the reverse operations to the above, for
+ * swrast translation and clip-interpolation.
+ * 
+ * Currently always extracts a full 4 floats.
+ */
+
+/* Read four floats, removing the a->vp translate and scale from the first three. */
+static void extract_4f_viewport( const struct tnl_clipspace_attr *a, GLfloat *out, const GLubyte *v )
+{
+   const GLfloat * const m = a->vp;
+   const GLfloat *win = (const GLfloat *) v;
+
+   /* The position attribute normally takes a separate route while
+    * clipping; this routine is provided so the set is complete.
+    */
+   DEBUG_INSERT;
+   out[0] = (win[0] - m[12]) / m[0];
+   out[1] = (win[1] - m[13]) / m[5];
+   out[2] = (win[2] - m[14]) / m[10];
+   out[3] = win[3];
+}
+
+/* Read three floats, removing the a->vp translate and scale; out[3] is 1. */
+static void extract_3f_viewport( const struct tnl_clipspace_attr *a, GLfloat *out, const GLubyte *v )
+{
+   const GLfloat * const m = a->vp;
+   const GLfloat *win = (const GLfloat *) v;
+   DEBUG_INSERT;
+   out[0] = (win[0] - m[12]) / m[0];
+   out[1] = (win[1] - m[13]) / m[5];
+   out[2] = (win[2] - m[14]) / m[10];
+   out[3] = 1;
+}
+
+
+/* Read two floats, removing the a->vp translate and scale; out[2] is 0, out[3] is 1. */
+static void extract_2f_viewport( const struct tnl_clipspace_attr *a, GLfloat *out, const GLubyte *v )
+{
+   const GLfloat * const m = a->vp;
+   const GLfloat *win = (const GLfloat *) v;
+   DEBUG_INSERT;
+   out[0] = (win[0] - m[12]) / m[0];
+   out[1] = (win[1] - m[13]) / m[5];
+   out[2] = 0;
+   out[3] = 1;
+}
+
+
+/* Copy four stored floats out of the vertex unchanged. */
+static void extract_4f( const struct tnl_clipspace_attr *a, GLfloat *out, const GLubyte *v  )
+{
+   const GLfloat *src = (const GLfloat *) v;
+   (void) a;
+   out[0] = src[0];
+   out[1] = src[1];
+   out[2] = src[2];
+   out[3] = src[3];
+}
+
+/* Three stored floats become out[0], out[1] and out[3]; out[2] is 0. */
+static void extract_3f_xyw( const struct tnl_clipspace_attr *a, GLfloat *out, const GLubyte *v )
+{
+   const GLfloat *src = (const GLfloat *) v;
+   (void) a;
+   out[0] = src[0];
+   out[1] = src[1];
+   out[2] = 0;
+   out[3] = src[2];
+}
+
+
+/* Copy three stored floats; out[3] is 1. */
+static void extract_3f( const struct tnl_clipspace_attr *a, GLfloat *out, const GLubyte *v )
+{
+   const GLfloat *src = (const GLfloat *) v;
+   (void) a;
+   out[0] = src[0];
+   out[1] = src[1];
+   out[2] = src[2];
+   out[3] = 1;
+}
+
+
+/* Copy two stored floats; out[2] is 0 and out[3] is 1. */
+static void extract_2f( const struct tnl_clipspace_attr *a, GLfloat *out, const GLubyte *v )
+{
+   const GLfloat *src = (const GLfloat *) v;
+   (void) a;
+   out[0] = src[0];
+   out[1] = src[1];
+   out[2] = 0;
+   out[3] = 1;
+}
+
+/* Copy one stored float; out[1] and out[2] are 0 and out[3] is 1. */
+static void extract_1f( const struct tnl_clipspace_attr *a, GLfloat *out, const GLubyte *v )
+{
+   const GLfloat *src = (const GLfloat *) v;
+   (void) a;
+   out[0] = src[0];
+   out[1] = 0;
+   out[2] = 0;
+   out[3] = 1;
+}
+
+/* Convert four stored GLchan values to floats; the vertex is only read. */
+static void extract_4chan_4f_rgba( const struct tnl_clipspace_attr *a, GLfloat *out, const GLubyte *v )
+{
+   const GLchan *c = (const GLchan *) v;   /* keep the const qualifier of v */
+   (void) a;
+
+   out[0] = CHAN_TO_FLOAT(c[0]);
+   out[1] = CHAN_TO_FLOAT(c[1]);
+   out[2] = CHAN_TO_FLOAT(c[2]);
+   out[3] = CHAN_TO_FLOAT(c[3]);
+}
+
+/* Four bytes to four floats in stored order: out[0..3] <- v[0..3]. */
+static void extract_4ub_4f_rgba( const struct tnl_clipspace_attr *a, GLfloat *out, const GLubyte *v )
+{
+   (void) a;   /* no per-attribute state is needed */
+   out[0] = UBYTE_TO_FLOAT(v[0]);
+   out[1] = UBYTE_TO_FLOAT(v[1]);
+   out[2] = UBYTE_TO_FLOAT(v[2]);
+   out[3] = UBYTE_TO_FLOAT(v[3]);
+}
+
+/* Four bytes to floats with first and third swapped: out[2],out[1],out[0],out[3] <- v[0..3]. */
+static void extract_4ub_4f_bgra( const struct tnl_clipspace_attr *a, GLfloat *out, const GLubyte *v )
+{
+   (void) a;   /* no per-attribute state is needed */
+   out[2] = UBYTE_TO_FLOAT(v[0]);
+   out[1] = UBYTE_TO_FLOAT(v[1]);
+   out[0] = UBYTE_TO_FLOAT(v[2]);
+   out[3] = UBYTE_TO_FLOAT(v[3]);
+}
+
+/* Four bytes to floats with the first byte last: out[3],out[0],out[1],out[2] <- v[0..3]. */
+static void extract_4ub_4f_argb( const struct tnl_clipspace_attr *a, GLfloat *out, const GLubyte *v )
+{
+   (void) a;   /* no per-attribute state is needed */
+   out[3] = UBYTE_TO_FLOAT(v[0]);
+   out[0] = UBYTE_TO_FLOAT(v[1]);
+   out[1] = UBYTE_TO_FLOAT(v[2]);
+   out[2] = UBYTE_TO_FLOAT(v[3]);
+}
+
+/* Four bytes to floats in fully reversed order: out[3],out[2],out[1],out[0] <- v[0..3]. */
+static void extract_4ub_4f_abgr( const struct tnl_clipspace_attr *a, GLfloat *out, const GLubyte *v )
+{
+   (void) a;   /* no per-attribute state is needed */
+   out[3] = UBYTE_TO_FLOAT(v[0]);
+   out[2] = UBYTE_TO_FLOAT(v[1]);
+   out[1] = UBYTE_TO_FLOAT(v[2]);
+   out[0] = UBYTE_TO_FLOAT(v[3]);
+}
+
+/* Three bytes to floats in stored order; out[3] is 1. */
+static void extract_3ub_3f_rgb( const struct tnl_clipspace_attr *a, GLfloat *out, const GLubyte *v )
+{
+   (void) a;   /* no per-attribute state is needed */
+   out[0] = UBYTE_TO_FLOAT(v[0]);
+   out[1] = UBYTE_TO_FLOAT(v[1]);
+   out[2] = UBYTE_TO_FLOAT(v[2]);
+   out[3] = 1;
+}
+
+/* Three bytes to floats in reversed order: out[2],out[1],out[0] <- v[0..2]; out[3] is 1. */
+static void extract_3ub_3f_bgr( const struct tnl_clipspace_attr *a, GLfloat *out, const GLubyte *v )
+{
+   (void) a;   /* no per-attribute state is needed */
+   out[2] = UBYTE_TO_FLOAT(v[0]);
+   out[1] = UBYTE_TO_FLOAT(v[1]);
+   out[0] = UBYTE_TO_FLOAT(v[2]);
+   out[3] = 1;
+}
+
+static void extract_1ub_1f( const struct tnl_clipspace_attr *a, GLfloat *out, const GLubyte *v )
+{
+   (void) a;                        /* no per-attribute state is needed */
+   out[0] = UBYTE_TO_FLOAT(v[0]);   /* the only stored component */
+   out[1] = 0;                      /* remaining outputs get fixed values */
+   out[2] = 0;
+   out[3] = 1;
+}
+
+/* One entry per format: name string, extract routine, four insert routines, size in bytes. */
+const struct tnl_format_info _tnl_format_info[EMIT_MAX] = 
+{
+   { "1f",
+     extract_1f,
+     { insert_1f_1, insert_1f_1, insert_1f_1, insert_1f_1 },
+     sizeof(GLfloat) },
+   /* NOTE(review): the four insert slots appear to be indexed by (input size - 1) - confirm. */
+   { "2f",
+     extract_2f,
+     { insert_2f_1, insert_2f_2, insert_2f_2, insert_2f_2 },
+     2 * sizeof(GLfloat) },
+
+   { "3f",
+     extract_3f,
+     { insert_3f_1, insert_3f_2, insert_3f_3, insert_3f_3 },
+     3 * sizeof(GLfloat) },
+
+   { "4f",
+     extract_4f,
+     { insert_4f_1, insert_4f_2, insert_4f_3, insert_4f_4 },
+     4 * sizeof(GLfloat) },
+
+   { "2f_viewport",
+     extract_2f_viewport,
+     { insert_2f_viewport_1, insert_2f_viewport_2, insert_2f_viewport_2,
+       insert_2f_viewport_2 },
+     2 * sizeof(GLfloat) },
+
+   { "3f_viewport",
+     extract_3f_viewport,
+     { insert_3f_viewport_1, insert_3f_viewport_2, insert_3f_viewport_3,
+       insert_3f_viewport_3 },
+     3 * sizeof(GLfloat) },
+
+   { "4f_viewport",
+     extract_4f_viewport,
+     { insert_4f_viewport_1, insert_4f_viewport_2, insert_4f_viewport_3,
+       insert_4f_viewport_4 }, 
+     4 * sizeof(GLfloat) },
+   /* Only the last insert slot is implemented here; the other three call exit(1). */
+   { "3f_xyw",
+     extract_3f_xyw,
+     { insert_3f_xyw_err, insert_3f_xyw_err, insert_3f_xyw_err, 
+       insert_3f_xyw_4 },
+     3 * sizeof(GLfloat) },
+
+   { "1ub_1f",
+     extract_1ub_1f,
+     { insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1 },
+     sizeof(GLubyte) },
+
+   { "3ub_3f_rgb",
+     extract_3ub_3f_rgb,
+     { insert_3ub_3f_rgb_1, insert_3ub_3f_rgb_2, insert_3ub_3f_rgb_3,
+       insert_3ub_3f_rgb_3 },
+     3 * sizeof(GLubyte) },
+
+   { "3ub_3f_bgr",
+     extract_3ub_3f_bgr,
+     { insert_3ub_3f_bgr_1, insert_3ub_3f_bgr_2, insert_3ub_3f_bgr_3,
+       insert_3ub_3f_bgr_3 },
+     3 * sizeof(GLubyte) },
+
+   { "4ub_4f_rgba",
+     extract_4ub_4f_rgba,
+     { insert_4ub_4f_rgba_1, insert_4ub_4f_rgba_2, insert_4ub_4f_rgba_3, 
+       insert_4ub_4f_rgba_4 },
+     4 * sizeof(GLubyte) },
+
+   { "4ub_4f_bgra",
+     extract_4ub_4f_bgra,
+     { insert_4ub_4f_bgra_1, insert_4ub_4f_bgra_2, insert_4ub_4f_bgra_3,
+       insert_4ub_4f_bgra_4 },
+     4 * sizeof(GLubyte) },
+
+   { "4ub_4f_argb",
+     extract_4ub_4f_argb,
+     { insert_4ub_4f_argb_1, insert_4ub_4f_argb_2, insert_4ub_4f_argb_3,
+       insert_4ub_4f_argb_4 },
+     4 * sizeof(GLubyte) },
+
+   { "4ub_4f_abgr",
+     extract_4ub_4f_abgr,
+     { insert_4ub_4f_abgr_1, insert_4ub_4f_abgr_2, insert_4ub_4f_abgr_3,
+       insert_4ub_4f_abgr_4 },
+     4 * sizeof(GLubyte) },
+
+   { "4chan_4f_rgba",
+     extract_4chan_4f_rgba,
+     { insert_4chan_4f_rgba_1, insert_4chan_4f_rgba_2, insert_4chan_4f_rgba_3,
+       insert_4chan_4f_rgba_4 },
+     4 * sizeof(GLchan) },
+   /* The pad entry has no routines at all and a size of zero. */
+   { "pad",
+     NULL,
+     { NULL, NULL, NULL, NULL },
+     0 }
+
+};
+
+
+
+    
+/***********************************************************************
+ * Hardwired fastpaths for emitting whole vertices or groups of vertices.
+ * EMIT5 defines NAME(ctx, count, v): for each vertex it calls F0..F(NR-1)
+ * on attrs 0..NR-1, stepping each inputptr by its stride and v by vertex_size. */
+#define EMIT5(NR, F0, F1, F2, F3, F4, NAME)				\
+static void NAME( struct gl_context *ctx,					\
+		  GLuint count,						\
+		  GLubyte *v )						\
+{									\
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);			\
+   struct tnl_clipspace_attr *a = vtx->attr;				\
+   GLuint i;								\
+									\
+   for (i = 0 ; i < count ; i++, v += vtx->vertex_size) {		\
+      if (NR > 0) {							\
+	 F0( &a[0], v + a[0].vertoffset, (GLfloat *)a[0].inputptr );	\
+	 a[0].inputptr += a[0].inputstride;				\
+      }									\
+      									\
+      if (NR > 1) {							\
+	 F1( &a[1], v + a[1].vertoffset, (GLfloat *)a[1].inputptr );	\
+	 a[1].inputptr += a[1].inputstride;				\
+      }									\
+      									\
+      if (NR > 2) {							\
+	 F2( &a[2], v + a[2].vertoffset, (GLfloat *)a[2].inputptr );	\
+	 a[2].inputptr += a[2].inputstride;				\
+      }									\
+      									\
+      if (NR > 3) {							\
+	 F3( &a[3], v + a[3].vertoffset, (GLfloat *)a[3].inputptr );	\
+	 a[3].inputptr += a[3].inputstride;				\
+      }									\
+									\
+      if (NR > 4) {							\
+	 F4( &a[4], v + a[4].vertoffset, (GLfloat *)a[4].inputptr );	\
+	 a[4].inputptr += a[4].inputstride;				\
+      }									\
+   }									\
+}
+
+   
+#define EMIT2(F0, F1, NAME) EMIT5(2, F0, F1, insert_null, \
+				  insert_null, insert_null, NAME)
+/* Unused trailing slots are filled with insert_null; the NR argument keeps them from being called. */
+#define EMIT3(F0, F1, F2, NAME) EMIT5(3, F0, F1, F2, insert_null, \
+				      insert_null, NAME)
+   
+#define EMIT4(F0, F1, F2, F3, NAME) EMIT5(4, F0, F1, F2, F3, \
+				          insert_null, NAME)
+   
+/* Instantiate the fastpaths that _tnl_generate_hardwired_emit() chooses between. */
+EMIT2(insert_3f_viewport_3, insert_4ub_4f_rgba_4, emit_viewport3_rgba4)
+EMIT2(insert_3f_viewport_3, insert_4ub_4f_bgra_4, emit_viewport3_bgra4)
+EMIT2(insert_3f_3, insert_4ub_4f_rgba_4, emit_xyz3_rgba4)
+
+EMIT3(insert_4f_viewport_4, insert_4ub_4f_rgba_4, insert_2f_2, emit_viewport4_rgba4_st2)
+EMIT3(insert_4f_viewport_4, insert_4ub_4f_bgra_4, insert_2f_2,  emit_viewport4_bgra4_st2)
+EMIT3(insert_4f_4, insert_4ub_4f_rgba_4, insert_2f_2, emit_xyzw4_rgba4_st2)
+
+EMIT4(insert_4f_viewport_4, insert_4ub_4f_rgba_4, insert_2f_2, insert_2f_2, emit_viewport4_rgba4_st2_st2)
+EMIT4(insert_4f_viewport_4, insert_4ub_4f_bgra_4, insert_2f_2, insert_2f_2,  emit_viewport4_bgra4_st2_st2)
+EMIT4(insert_4f_4, insert_4ub_4f_rgba_4, insert_2f_2, insert_2f_2, emit_xyzw4_rgba4_st2_st2)
+
+
+/* Choose one of the prebuilt emit routines when the current attribute
+ * layout matches it exactly; vtx->emit is set to NULL when nothing fits.
+ */
+void _tnl_generate_hardwired_emit( struct gl_context *ctx )
+{
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
+   const struct tnl_clipspace_attr *at = vtx->attr;
+   tnl_emit_func chosen = NULL;
+
+   /* Matching compares each attribute's emit pointer with an insert routine. */
+   switch (vtx->attr_count) {
+   case 2:
+      if (at[0].emit == insert_3f_viewport_3) {
+         if (at[1].emit == insert_4ub_4f_bgra_4)
+            chosen = emit_viewport3_bgra4;
+         else if (at[1].emit == insert_4ub_4f_rgba_4)
+            chosen = emit_viewport3_rgba4;
+      }
+      else if (at[0].emit == insert_3f_3 &&
+               at[1].emit == insert_4ub_4f_rgba_4) {
+         chosen = emit_xyz3_rgba4;
+      }
+      break;
+   case 3:
+      if (at[2].emit != insert_2f_2)
+         break;                 /* third slot must be the 2-float insert */
+      if (at[1].emit == insert_4ub_4f_rgba_4) {
+         if (at[0].emit == insert_4f_viewport_4)
+            chosen = emit_viewport4_rgba4_st2;
+         else if (at[0].emit == insert_4f_4)
+            chosen = emit_xyzw4_rgba4_st2;
+      }
+      else if (at[1].emit == insert_4ub_4f_bgra_4 &&
+               at[0].emit == insert_4f_viewport_4) {
+         chosen = emit_viewport4_bgra4_st2;
+      }
+      break;
+   case 4:
+      if (at[2].emit != insert_2f_2 || at[3].emit != insert_2f_2)
+         break;                 /* slots three and four must both be the 2-float insert */
+      if (at[1].emit == insert_4ub_4f_rgba_4) {
+         if (at[0].emit == insert_4f_viewport_4)
+            chosen = emit_viewport4_rgba4_st2_st2;
+         else if (at[0].emit == insert_4f_4)
+            chosen = emit_xyzw4_rgba4_st2_st2;
+      }
+      else if (at[1].emit == insert_4ub_4f_bgra_4 &&
+               at[0].emit == insert_4f_viewport_4) {
+         chosen = emit_viewport4_bgra4_st2_st2;
+      }
+      break;
+   }
+
+   vtx->emit = chosen;
+}
+
+/***********************************************************************
+ * Generic (non-codegen) functions for whole vertices or groups of
+ * vertices
+ */
+
+void _tnl_generic_emit( struct gl_context *ctx,
+			GLuint count,
+			GLubyte *v )
+{
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
+   struct tnl_clipspace_attr *attrs = vtx->attr;
+   const GLuint nattr = vtx->attr_count;
+   const GLuint vsize = vtx->vertex_size;
+   GLuint vert, k;
+   for (vert = 0; vert < count; vert++, v += vsize) {
+      for (k = 0; k < nattr; k++) {
+         struct tnl_clipspace_attr *cur = &attrs[k];
+         GLfloat *src = (GLfloat *) cur->inputptr;   /* latch this vertex's input */
+         cur->inputptr += cur->inputstride;          /* step to the next input before the call */
+         cur->emit( cur, v + cur->vertoffset, src );
+      }
+   }
+}
+
+
+void _tnl_generic_interp( struct gl_context *ctx,
+			    GLfloat t,
+			    GLuint edst, GLuint eout, GLuint ein,
+			    GLboolean force_boundary )
+{
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+   struct vertex_buffer *VB = &tnl->vb;
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
+   const struct tnl_clipspace_attr *attrs = vtx->attr;
+   const GLuint nattr = vtx->attr_count;
+   const GLubyte *src_in  = vtx->vertex_buf + ein  * vtx->vertex_size;
+   const GLubyte *src_out = vtx->vertex_buf + eout * vtx->vertex_size;
+   GLubyte *dest = vtx->vertex_buf + edst * vtx->vertex_size;
+   const GLfloat *clip = VB->ClipPtr->data[edst];
+   GLuint k;
+   (void) force_boundary;
+
+   /* Attribute 0 is not interpolated: it is rebuilt from the clip
+    * coordinate of edst, divided through by w when NDC is wanted.
+    * With w == 0 in that case nothing is written for attribute 0.
+    */
+   if (!tnl->NeedNdcCoords) {
+      attrs[0].insert[4-1]( &attrs[0], dest, clip );
+   }
+   else if (clip[3] != 0.0) {
+      const GLfloat oow = 1.0f / clip[3];
+      GLfloat ndc[4];
+
+      ndc[0] = clip[0] * oow;
+      ndc[1] = clip[1] * oow;
+      ndc[2] = clip[2] * oow;
+      ndc[3] = oow;
+      attrs[0].insert[4-1]( &attrs[0], dest, ndc );
+   }
+
+   /* Every other attribute: unpack both ends, blend, repack. */
+   for (k = 1; k < nattr; k++) {
+      const struct tnl_clipspace_attr *cur = &attrs[k];
+      GLfloat fin[4], fout[4], fdst[4];
+
+      cur->extract( cur, fin, src_in + cur->vertoffset );
+      cur->extract( cur, fout, src_out + cur->vertoffset );
+      INTERP_F( t, fdst[3], fout[3], fin[3] );
+      INTERP_F( t, fdst[2], fout[2], fin[2] );
+      INTERP_F( t, fdst[1], fout[1], fin[1] );
+      INTERP_F( t, fdst[0], fout[0], fin[0] );
+      cur->insert[4-1]( cur, dest + cur->vertoffset, fdst );
+   }
+}
+
+
+/* Transfer the colour attributes of vertex esrc onto vertex edst.
+ * The packed bytes are copied directly instead of extract + insert.
+ */
+void _tnl_generic_copy_pv( struct gl_context *ctx, GLuint edst, GLuint esrc )
+{
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
+   const struct tnl_clipspace_attr *attrs = vtx->attr;
+   const GLuint nattr = vtx->attr_count;
+   GLubyte *to   = vtx->vertex_buf + edst * vtx->vertex_size;
+   GLubyte *from = vtx->vertex_buf + esrc * vtx->vertex_size;
+   GLuint k;
+
+   for (k = 0; k < nattr; k++) {
+      const struct tnl_clipspace_attr *cur = &attrs[k];
+      /* Only the COLOR0 and COLOR1 attributes are copied. */
+      if (cur->attrib != VERT_ATTRIB_COLOR0 &&
+          cur->attrib != VERT_ATTRIB_COLOR1)
+         continue;
+
+      memcpy( to + cur->vertoffset, from + cur->vertoffset, cur->vertattrsize );
+   }
+}
+
+
+/* For hardware that does not keep colours and/or edge flags in the
+ * vertex itself: interpolate those side arrays, then the vertex.
+ */
+void _tnl_generic_interp_extras( struct gl_context *ctx,
+				   GLfloat t,
+				   GLuint dst, GLuint out, GLuint in,
+				   GLboolean force_boundary )
+{
+   struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
+
+   /* A zero stride means one back-face colour is shared by the whole
+    * VB, so blending two entries would only reproduce the same value.
+    * Otherwise the data comes from t_vb_lighttmp.h with a stride of
+    * four dwords, which the assert below checks.
+    */
+   if (VB->BackfaceColorPtr != NULL && VB->BackfaceColorPtr->stride != 0) {
+      assert(VB->BackfaceColorPtr->stride == 4 * sizeof(GLfloat));
+
+      INTERP_4F( t,
+                 VB->BackfaceColorPtr->data[dst],
+                 VB->BackfaceColorPtr->data[out],
+                 VB->BackfaceColorPtr->data[in] );
+   }
+
+   if (VB->BackfaceSecondaryColorPtr != NULL) {
+      assert(VB->BackfaceSecondaryColorPtr->stride == 4 * sizeof(GLfloat));
+
+      INTERP_3F( t,
+                 VB->BackfaceSecondaryColorPtr->data[dst],
+                 VB->BackfaceSecondaryColorPtr->data[out],
+                 VB->BackfaceSecondaryColorPtr->data[in] );
+   }
+
+   if (VB->BackfaceIndexPtr != NULL) {
+      VB->BackfaceIndexPtr->data[dst][0] =
+         LINTERP( t,
+                  VB->BackfaceIndexPtr->data[out][0], VB->BackfaceIndexPtr->data[in][0] );
+   }
+
+   if (VB->EdgeFlag != NULL) {
+      VB->EdgeFlag[dst] = (force_boundary || VB->EdgeFlag[out]);
+   }
+   /* Finally interpolate the attributes stored in the vertex proper. */
+   _tnl_generic_interp(ctx, t, dst, out, in, force_boundary);
+}
+
+/* Copy the back-face side arrays from src to dst, then the in-vertex colours. */
+void _tnl_generic_copy_pv_extras( struct gl_context *ctx, GLuint dst, GLuint src )
+{
+   struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
+
+   /* Skipped when the stride is zero: the copy is only done for
+    * arrays that hold a separate entry per vertex. */
+   if (VB->BackfaceColorPtr != NULL && VB->BackfaceColorPtr->stride != 0) {
+      COPY_4FV( VB->BackfaceColorPtr->data[dst],
+                VB->BackfaceColorPtr->data[src] );
+   }
+
+   if (VB->BackfaceSecondaryColorPtr != NULL) {
+      COPY_4FV( VB->BackfaceSecondaryColorPtr->data[dst],
+                VB->BackfaceSecondaryColorPtr->data[src] );
+   }
+
+   if (VB->BackfaceIndexPtr != NULL) {
+      VB->BackfaceIndexPtr->data[dst][0] = VB->BackfaceIndexPtr->data[src][0];
+   }
+
+   _tnl_generic_copy_pv(ctx, dst, src);
+}
+
+
diff --git a/mesalib/src/mesa/tnl/t_vertex_sse.c b/mesalib/src/mesa/tnl/t_vertex_sse.c
index f164cd77e..e0141c36f 100644
--- a/mesalib/src/mesa/tnl/t_vertex_sse.c
+++ b/mesalib/src/mesa/tnl/t_vertex_sse.c
@@ -1,684 +1,685 @@
-/*
- * Copyright 2003 Tungsten Graphics, inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
- * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Keith Whitwell <keithw@tungstengraphics.com>
- */
-
-#include "main/glheader.h"
-#include "main/context.h"
-#include "main/colormac.h"
-#include "main/simple_list.h"
-#include "main/enums.h"
-#include "t_context.h"
-#include "t_vertex.h"
-
-#if defined(USE_SSE_ASM)
-
-#include "x86/rtasm/x86sse.h"
-#include "x86/common_x86_asm.h"
-
-
-/**
- * Number of bytes to allocate for generated SSE functions
- */
-#define MAX_SSE_CODE_SIZE 1024
-
-
-#define X    0
-#define Y    1
-#define Z    2
-#define W    3
-
-
-struct x86_program {
-   struct x86_function func;
-
-   struct gl_context *ctx;
-   GLboolean inputs_safe;
-   GLboolean outputs_safe;
-   GLboolean have_sse2;
-   
-   struct x86_reg identity;
-   struct x86_reg chan0;
-};
-
-
-static struct x86_reg get_identity( struct x86_program *p )
-{
-   return p->identity;
-}
-
-static void emit_load4f_4( struct x86_program *p, 			   
-			   struct x86_reg dest,
-			   struct x86_reg arg0 )
-{
-   sse_movups(&p->func, dest, arg0);
-}
-
-static void emit_load4f_3( struct x86_program *p, 
-			   struct x86_reg dest,
-			   struct x86_reg arg0 )
-{
-   /* Have to jump through some hoops:
-    *
-    * c 0 0 0
-    * c 0 0 1
-    * 0 0 c 1
-    * a b c 1
-    */
-   sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
-   sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
-   sse_shufps(&p->func, dest, dest, SHUF(Y,Z,X,W) );
-   sse_movlps(&p->func, dest, arg0);
-}
-
-static void emit_load4f_2( struct x86_program *p, 
-			   struct x86_reg dest,
-			   struct x86_reg arg0 )
-{
-   /* Initialize from identity, then pull in low two words:
-    */
-   sse_movups(&p->func, dest, get_identity(p));
-   sse_movlps(&p->func, dest, arg0);
-}
-
-static void emit_load4f_1( struct x86_program *p, 
-			   struct x86_reg dest,
-			   struct x86_reg arg0 )
-{
-   /* Pull in low word, then swizzle in identity */
-   sse_movss(&p->func, dest, arg0);
-   sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
-}
-
-
-
-static void emit_load3f_3( struct x86_program *p, 			   
-			   struct x86_reg dest,
-			   struct x86_reg arg0 )
-{
-   /* Over-reads by 1 dword - potential SEGV if input is a vertex
-    * array.
-    */
-   if (p->inputs_safe) {
-      sse_movups(&p->func, dest, arg0);
-   } 
-   else {
-      /* c 0 0 0
-       * c c c c
-       * a b c c 
-       */
-      sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
-      sse_shufps(&p->func, dest, dest, SHUF(X,X,X,X));
-      sse_movlps(&p->func, dest, arg0);
-   }
-}
-
-static void emit_load3f_2( struct x86_program *p, 
-			   struct x86_reg dest,
-			   struct x86_reg arg0 )
-{
-   emit_load4f_2(p, dest, arg0);
-}
-
-static void emit_load3f_1( struct x86_program *p, 
-			   struct x86_reg dest,
-			   struct x86_reg arg0 )
-{
-   /* Loading from memory erases the upper bits. */
-   sse_movss(&p->func, dest, arg0);
-}
-
-static void emit_load2f_2( struct x86_program *p, 
-			   struct x86_reg dest,
-			   struct x86_reg arg0 )
-{
-   sse_movlps(&p->func, dest, arg0);
-}
-
-static void emit_load2f_1( struct x86_program *p, 
-			   struct x86_reg dest,
-			   struct x86_reg arg0 )
-{
-   /* Loading from memory erases the upper bits. */
-   sse_movss(&p->func, dest, arg0);
-}
-
-static void emit_load1f_1( struct x86_program *p, 
-			   struct x86_reg dest,
-			   struct x86_reg arg0 )
-{
-   sse_movss(&p->func, dest, arg0);
-}
-
-static void (*load[4][4])( struct x86_program *p, 
-			   struct x86_reg dest,
-			   struct x86_reg arg0 ) = {
-   { emit_load1f_1, 
-     emit_load1f_1, 
-     emit_load1f_1, 
-     emit_load1f_1 },
-
-   { emit_load2f_1, 
-     emit_load2f_2, 
-     emit_load2f_2, 
-     emit_load2f_2 },
-
-   { emit_load3f_1, 
-     emit_load3f_2, 
-     emit_load3f_3, 
-     emit_load3f_3 },
-
-   { emit_load4f_1, 
-     emit_load4f_2, 
-     emit_load4f_3, 
-     emit_load4f_4 } 
-};
-
-static void emit_load( struct x86_program *p,
-		       struct x86_reg dest,
-		       GLuint sz,
-		       struct x86_reg src,
-		       GLuint src_sz)
-{
-   load[sz-1][src_sz-1](p, dest, src);
-}
-
-static void emit_store4f( struct x86_program *p, 			   
-			  struct x86_reg dest,
-			  struct x86_reg arg0 )
-{
-   sse_movups(&p->func, dest, arg0);
-}
-
-static void emit_store3f( struct x86_program *p, 
-			  struct x86_reg dest,
-			  struct x86_reg arg0 )
-{
-   if (p->outputs_safe) {
-      /* Emit the extra dword anyway.  This may hurt writecombining,
-       * may cause other problems.
-       */
-      sse_movups(&p->func, dest, arg0);
-   }
-   else {
-      /* Alternate strategy - emit two, shuffle, emit one.
-       */
-      sse_movlps(&p->func, dest, arg0);
-      sse_shufps(&p->func, arg0, arg0, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
-      sse_movss(&p->func, x86_make_disp(dest,8), arg0);
-   }
-}
-
-static void emit_store2f( struct x86_program *p, 
-			   struct x86_reg dest,
-			   struct x86_reg arg0 )
-{
-   sse_movlps(&p->func, dest, arg0);
-}
-
-static void emit_store1f( struct x86_program *p, 
-			  struct x86_reg dest,
-			  struct x86_reg arg0 )
-{
-   sse_movss(&p->func, dest, arg0);
-}
-
-
-static void (*store[4])( struct x86_program *p, 
-			 struct x86_reg dest,
-			 struct x86_reg arg0 ) = 
-{
-   emit_store1f, 
-   emit_store2f, 
-   emit_store3f, 
-   emit_store4f 
-};
-
-static void emit_store( struct x86_program *p,
-			struct x86_reg dest,
-			GLuint sz,
-			struct x86_reg temp )
-
-{
-   store[sz-1](p, dest, temp);
-}
-
-static void emit_pack_store_4ub( struct x86_program *p,
-				 struct x86_reg dest,
-				 struct x86_reg temp )
-{
-   /* Scale by 255.0
-    */
-   sse_mulps(&p->func, temp, p->chan0);
-
-   if (p->have_sse2) {
-      sse2_cvtps2dq(&p->func, temp, temp);
-      sse2_packssdw(&p->func, temp, temp);
-      sse2_packuswb(&p->func, temp, temp);
-      sse_movss(&p->func, dest, temp);
-   }
-   else {
-      struct x86_reg mmx0 = x86_make_reg(file_MMX, 0);
-      struct x86_reg mmx1 = x86_make_reg(file_MMX, 1);
-      sse_cvtps2pi(&p->func, mmx0, temp);
-      sse_movhlps(&p->func, temp, temp);
-      sse_cvtps2pi(&p->func, mmx1, temp);
-      mmx_packssdw(&p->func, mmx0, mmx1);
-      mmx_packuswb(&p->func, mmx0, mmx0);
-      mmx_movd(&p->func, dest, mmx0);
-   }
-}
-
-static GLint get_offset( const void *a, const void *b )
-{
-   return (const char *)b - (const char *)a;
-}
-
-/* Not much happens here.  Eventually use this function to try and
- * avoid saving/reloading the source pointers each vertex (if some of
- * them can fit in registers).
- */
-static void get_src_ptr( struct x86_program *p,
-			 struct x86_reg srcREG,
-			 struct x86_reg vtxREG,
-			 struct tnl_clipspace_attr *a )
-{
-   struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx);
-   struct x86_reg ptr_to_src = x86_make_disp(vtxREG, get_offset(vtx, &a->inputptr));
-
-   /* Load current a[j].inputptr
-    */
-   x86_mov(&p->func, srcREG, ptr_to_src);
-}
-
-static void update_src_ptr( struct x86_program *p,
-			 struct x86_reg srcREG,
-			 struct x86_reg vtxREG,
-			 struct tnl_clipspace_attr *a )
-{
-   if (a->inputstride) {
-      struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx);
-      struct x86_reg ptr_to_src = x86_make_disp(vtxREG, get_offset(vtx, &a->inputptr));
-
-      /* add a[j].inputstride (hardcoded value - could just as easily
-       * pull the stride value from memory each time).
-       */
-      x86_lea(&p->func, srcREG, x86_make_disp(srcREG, a->inputstride));
-      
-      /* save new value of a[j].inputptr 
-       */
-      x86_mov(&p->func, ptr_to_src, srcREG);
-   }
-}
-
-
-/* Lots of hardcoding
- *
- * EAX -- pointer to current output vertex
- * ECX -- pointer to current attribute 
- * 
- */
-static GLboolean build_vertex_emit( struct x86_program *p )
-{
-   struct gl_context *ctx = p->ctx;
-   TNLcontext *tnl = TNL_CONTEXT(ctx);
-   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
-   GLuint j = 0;
-
-   struct x86_reg vertexEAX = x86_make_reg(file_REG32, reg_AX);
-   struct x86_reg srcECX = x86_make_reg(file_REG32, reg_CX);
-   struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP);
-   struct x86_reg vtxESI = x86_make_reg(file_REG32, reg_SI);
-   struct x86_reg temp = x86_make_reg(file_XMM, 0);
-   struct x86_reg vp0 = x86_make_reg(file_XMM, 1);
-   struct x86_reg vp1 = x86_make_reg(file_XMM, 2);
-   struct x86_reg temp2 = x86_make_reg(file_XMM, 3);
-   GLubyte *fixup, *label;
-
-   /* Push a few regs?
-    */
-   x86_push(&p->func, countEBP);
-   x86_push(&p->func, vtxESI);
-
-
-   /* Get vertex count, compare to zero
-    */
-   x86_xor(&p->func, srcECX, srcECX);
-   x86_mov(&p->func, countEBP, x86_fn_arg(&p->func, 2));
-   x86_cmp(&p->func, countEBP, srcECX);
-   fixup = x86_jcc_forward(&p->func, cc_E);
-
-   /* Initialize destination register. 
-    */
-   x86_mov(&p->func, vertexEAX, x86_fn_arg(&p->func, 3));
-
-   /* Dereference ctx to get tnl, then vtx:
-    */
-   x86_mov(&p->func, vtxESI, x86_fn_arg(&p->func, 1));
-   x86_mov(&p->func, vtxESI, x86_make_disp(vtxESI, get_offset(ctx, &ctx->swtnl_context)));
-   vtxESI = x86_make_disp(vtxESI, get_offset(tnl, &tnl->clipspace));
-
-   
-   /* Possibly load vp0, vp1 for viewport calcs:
-    */
-   if (vtx->need_viewport) {
-      sse_movups(&p->func, vp0, x86_make_disp(vtxESI, get_offset(vtx, &vtx->vp_scale[0])));
-      sse_movups(&p->func, vp1, x86_make_disp(vtxESI, get_offset(vtx, &vtx->vp_xlate[0])));
-   }
-
-   /* always load, needed or not:
-    */
-   sse_movups(&p->func, p->chan0, x86_make_disp(vtxESI, get_offset(vtx, &vtx->chan_scale[0])));
-   sse_movups(&p->func, p->identity, x86_make_disp(vtxESI, get_offset(vtx, &vtx->identity[0])));
-
-   /* Note address for loop jump */
-   label = x86_get_label(&p->func);
-
-   /* Emit code for each of the attributes.  Currently routes
-    * everything through SSE registers, even when it might be more
-    * efficient to stick with regular old x86.  No optimization or
-    * other tricks - enough new ground to cover here just getting
-    * things working.
-    */
-   while (j < vtx->attr_count) {
-      struct tnl_clipspace_attr *a = &vtx->attr[j];
-      struct x86_reg dest = x86_make_disp(vertexEAX, a->vertoffset);
-
-      /* Now, load an XMM reg from src, perhaps transform, then save.
-       * Could be shortcircuited in specific cases:
-       */
-      switch (a->format) {
-      case EMIT_1F:
-	 get_src_ptr(p, srcECX, vtxESI, a);
-	 emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
-	 emit_store(p, dest, 1, temp);
-	 update_src_ptr(p, srcECX, vtxESI, a);
-	 break;
-      case EMIT_2F:
-	 get_src_ptr(p, srcECX, vtxESI, a);
-	 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
-	 emit_store(p, dest, 2, temp);
-	 update_src_ptr(p, srcECX, vtxESI, a);
-	 break;
-      case EMIT_3F:
-	 /* Potentially the worst case - hardcode 2+1 copying:
-	  */
-	 if (0) {
-	    get_src_ptr(p, srcECX, vtxESI, a);
-	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
-	    emit_store(p, dest, 3, temp);
-	    update_src_ptr(p, srcECX, vtxESI, a);
-	 }
-	 else {
-	    get_src_ptr(p, srcECX, vtxESI, a);
-	    emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
-	    emit_store(p, dest, 2, temp);
-	    if (a->inputsize > 2) {
-	       emit_load(p, temp, 1, x86_make_disp(srcECX, 8), 1);
-	       emit_store(p, x86_make_disp(dest,8), 1, temp);
-	    }
-	    else {
-	       sse_movss(&p->func, x86_make_disp(dest,8), get_identity(p));
-	    }
-	    update_src_ptr(p, srcECX, vtxESI, a);
-	 }
-	 break;
-      case EMIT_4F:
-	 get_src_ptr(p, srcECX, vtxESI, a);
-	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
-	 emit_store(p, dest, 4, temp);
-	 update_src_ptr(p, srcECX, vtxESI, a);
-	 break;
-      case EMIT_2F_VIEWPORT: 
-	 get_src_ptr(p, srcECX, vtxESI, a);
-	 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
-	 sse_mulps(&p->func, temp, vp0);
-	 sse_addps(&p->func, temp, vp1);
-	 emit_store(p, dest, 2, temp);
-	 update_src_ptr(p, srcECX, vtxESI, a);
-	 break;
-      case EMIT_3F_VIEWPORT: 
-	 get_src_ptr(p, srcECX, vtxESI, a);
-	 emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
-	 sse_mulps(&p->func, temp, vp0);
-	 sse_addps(&p->func, temp, vp1);
-	 emit_store(p, dest, 3, temp);
-	 update_src_ptr(p, srcECX, vtxESI, a);
-	 break;
-      case EMIT_4F_VIEWPORT: 
-	 get_src_ptr(p, srcECX, vtxESI, a);
-	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
-	 sse_mulps(&p->func, temp, vp0);
-	 sse_addps(&p->func, temp, vp1);
-	 emit_store(p, dest, 4, temp);
-	 update_src_ptr(p, srcECX, vtxESI, a);
-	 break;
-      case EMIT_3F_XYW:
-	 get_src_ptr(p, srcECX, vtxESI, a);
-	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
-	 sse_shufps(&p->func, temp, temp, SHUF(X,Y,W,Z));
-	 emit_store(p, dest, 3, temp);
-	 update_src_ptr(p, srcECX, vtxESI, a);
-	 break;
-
-      case EMIT_1UB_1F:	 
-	 /* Test for PAD3 + 1UB:
-	  */
-	 if (j > 0 &&
-	     a[-1].vertoffset + a[-1].vertattrsize <= a->vertoffset - 3)
-	 {
-	    get_src_ptr(p, srcECX, vtxESI, a);
-	    emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
-	    sse_shufps(&p->func, temp, temp, SHUF(X,X,X,X));
-	    emit_pack_store_4ub(p, x86_make_disp(dest, -3), temp); /* overkill! */
-	    update_src_ptr(p, srcECX, vtxESI, a);
-	 }
-	 else {
-	    printf("Can't emit 1ub %x %x %d\n", a->vertoffset, a[-1].vertoffset, a[-1].vertattrsize );
-	    return GL_FALSE;
-	 }
-	 break;
-      case EMIT_3UB_3F_RGB:
-      case EMIT_3UB_3F_BGR:
-	 /* Test for 3UB + PAD1:
-	  */
-	 if (j == vtx->attr_count - 1 ||
-	     a[1].vertoffset >= a->vertoffset + 4) {
-	    get_src_ptr(p, srcECX, vtxESI, a);
-	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
-	    if (a->format == EMIT_3UB_3F_BGR)
-	       sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
-	    emit_pack_store_4ub(p, dest, temp);
-	    update_src_ptr(p, srcECX, vtxESI, a);
-	 }
-	 /* Test for 3UB + 1UB:
-	  */
-	 else if (j < vtx->attr_count - 1 &&
-		  a[1].format == EMIT_1UB_1F &&
-		  a[1].vertoffset == a->vertoffset + 3) {
-	    get_src_ptr(p, srcECX, vtxESI, a);
-	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
-	    update_src_ptr(p, srcECX, vtxESI, a);
-
-	    /* Make room for incoming value:
-	     */
-	    sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
-
-	    get_src_ptr(p, srcECX, vtxESI, &a[1]);
-	    emit_load(p, temp2, 1, x86_deref(srcECX), a[1].inputsize);
-	    sse_movss(&p->func, temp, temp2);
-	    update_src_ptr(p, srcECX, vtxESI, &a[1]);
-
-	    /* Rearrange and possibly do BGR conversion:
-	     */
-	    if (a->format == EMIT_3UB_3F_BGR)
-	       sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
-	    else
-	       sse_shufps(&p->func, temp, temp, SHUF(Y,Z,W,X));
-
-	    emit_pack_store_4ub(p, dest, temp);
-	    j++;		/* NOTE: two attrs consumed */
-	 }
-	 else {
-	    printf("Can't emit 3ub\n");
-	    return GL_FALSE;	/* add this later */
-	 }
-	 break;
-
-      case EMIT_4UB_4F_RGBA:
-	 get_src_ptr(p, srcECX, vtxESI, a);
-	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
-	 emit_pack_store_4ub(p, dest, temp);
-	 update_src_ptr(p, srcECX, vtxESI, a);
-	 break;
-      case EMIT_4UB_4F_BGRA:
-	 get_src_ptr(p, srcECX, vtxESI, a);
-	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
-	 sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
-	 emit_pack_store_4ub(p, dest, temp);
-	 update_src_ptr(p, srcECX, vtxESI, a);
-	 break;
-      case EMIT_4UB_4F_ARGB:
-	 get_src_ptr(p, srcECX, vtxESI, a);
-	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
-	 sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
-	 emit_pack_store_4ub(p, dest, temp);
-	 update_src_ptr(p, srcECX, vtxESI, a);
-	 break;
-      case EMIT_4UB_4F_ABGR:
-	 get_src_ptr(p, srcECX, vtxESI, a);
-	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
-	 sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
-	 emit_pack_store_4ub(p, dest, temp);
-	 update_src_ptr(p, srcECX, vtxESI, a);
-	 break;
-      case EMIT_4CHAN_4F_RGBA:
-	 switch (CHAN_TYPE) {
-	 case GL_UNSIGNED_BYTE:
-	    get_src_ptr(p, srcECX, vtxESI, a);
-	    emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
-	    emit_pack_store_4ub(p, dest, temp);
-	    update_src_ptr(p, srcECX, vtxESI, a);
-	    break;
-	 case GL_FLOAT:
-	    get_src_ptr(p, srcECX, vtxESI, a);
-	    emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
-	    emit_store(p, dest, 4, temp);
-	    update_src_ptr(p, srcECX, vtxESI, a);
-	    break;
-	 case GL_UNSIGNED_SHORT:
-	 default:
-	    printf("unknown CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE));
-	    return GL_FALSE;
-	 }
-	 break;
-      default:
-	 printf("unknown a[%d].format %d\n", j, a->format);
-	 return GL_FALSE;	/* catch any new opcodes */
-      }
-      
-      /* Increment j by at least 1 - may have been incremented above also:
-       */
-      j++;
-   }
-
-   /* Next vertex:
-    */
-   x86_lea(&p->func, vertexEAX, x86_make_disp(vertexEAX, vtx->vertex_size));
-
-   /* decr count, loop if not zero
-    */
-   x86_dec(&p->func, countEBP);
-   x86_test(&p->func, countEBP, countEBP); 
-   x86_jcc(&p->func, cc_NZ, label);
-
-   /* Exit mmx state?
-    */
-   if (p->func.need_emms)
-      mmx_emms(&p->func);
-
-   /* Land forward jump here:
-    */
-   x86_fixup_fwd_jump(&p->func, fixup);
-
-   /* Pop regs and return
-    */
-   x86_pop(&p->func, x86_get_base_reg(vtxESI));
-   x86_pop(&p->func, countEBP);
-   x86_ret(&p->func);
-
-   assert(!vtx->emit);
-   vtx->emit = (tnl_emit_func)x86_get_func(&p->func);
-
-   assert( (char *) p->func.csr - (char *) p->func.store <= MAX_SSE_CODE_SIZE );
-   return GL_TRUE;
-}
-
-
-
-void _tnl_generate_sse_emit( struct gl_context *ctx )
-{
-   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
-   struct x86_program p;   
-
-   if (!cpu_has_xmm) {
-      vtx->codegen_emit = NULL;
-      return;
-   }
-
-   memset(&p, 0, sizeof(p));
-
-   p.ctx = ctx;
-   p.inputs_safe = 0;		/* for now */
-   p.outputs_safe = 0;		/* for now */
-   p.have_sse2 = cpu_has_xmm2;
-   p.identity = x86_make_reg(file_XMM, 6);
-   p.chan0 = x86_make_reg(file_XMM, 7);
-
-   if (!x86_init_func_size(&p.func, MAX_SSE_CODE_SIZE)) {
-      vtx->emit = NULL;
-      return;
-   }
-
-   if (build_vertex_emit(&p)) {
-      _tnl_register_fastpath( vtx, GL_TRUE );
-   }
-   else {
-      /* Note the failure so that we don't keep trying to codegen an
-       * impossible state:
-       */
-      _tnl_register_fastpath( vtx, GL_FALSE );
-      x86_release_func(&p.func);
-   }
-}
-
-#else
-
-void _tnl_generate_sse_emit( struct gl_context *ctx )
-{
-   /* Dummy version for when USE_SSE_ASM not defined */
-}
-
-#endif
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+#include "main/glheader.h"
+#include "main/context.h"
+#include "main/colormac.h"
+#include "main/simple_list.h"
+#include "main/enums.h"
+#include "swrast/s_chan.h"
+#include "t_context.h"
+#include "t_vertex.h"
+
+#if defined(USE_SSE_ASM)
+
+#include "x86/rtasm/x86sse.h"
+#include "x86/common_x86_asm.h"
+
+
+/**
+ * Number of bytes to allocate for generated SSE functions
+ */
+#define MAX_SSE_CODE_SIZE 1024
+
+
+#define X    0
+#define Y    1
+#define Z    2
+#define W    3
+
+
+struct x86_program {
+   struct x86_function func;
+
+   struct gl_context *ctx;
+   GLboolean inputs_safe;
+   GLboolean outputs_safe;
+   GLboolean have_sse2;
+   
+   struct x86_reg identity;
+   struct x86_reg chan0;
+};
+
+
+static struct x86_reg get_identity( struct x86_program *p )
+{
+   return p->identity;
+}
+
+static void emit_load4f_4( struct x86_program *p, 			   
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   sse_movups(&p->func, dest, arg0);
+}
+
+static void emit_load4f_3( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   /* Have to jump through some hoops:
+    *
+    * c 0 0 0
+    * c 0 0 1
+    * 0 0 c 1
+    * a b c 1
+    */
+   sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
+   sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
+   sse_shufps(&p->func, dest, dest, SHUF(Y,Z,X,W) );
+   sse_movlps(&p->func, dest, arg0);
+}
+
+static void emit_load4f_2( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   /* Initialize from identity, then pull in low two words:
+    */
+   sse_movups(&p->func, dest, get_identity(p));
+   sse_movlps(&p->func, dest, arg0);
+}
+
+static void emit_load4f_1( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   /* Pull in low word, then swizzle in identity */
+   sse_movss(&p->func, dest, arg0);
+   sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
+}
+
+
+
+static void emit_load3f_3( struct x86_program *p, 			   
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   /* Over-reads by 1 dword - potential SEGV if input is a vertex
+    * array.
+    */
+   if (p->inputs_safe) {
+      sse_movups(&p->func, dest, arg0);
+   } 
+   else {
+      /* c 0 0 0
+       * c c c c
+       * a b c c 
+       */
+      sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
+      sse_shufps(&p->func, dest, dest, SHUF(X,X,X,X));
+      sse_movlps(&p->func, dest, arg0);
+   }
+}
+
+static void emit_load3f_2( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   emit_load4f_2(p, dest, arg0);
+}
+
+static void emit_load3f_1( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   /* Loading from memory erases the upper bits. */
+   sse_movss(&p->func, dest, arg0);
+}
+
+static void emit_load2f_2( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   sse_movlps(&p->func, dest, arg0);
+}
+
+static void emit_load2f_1( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   /* Loading from memory erases the upper bits. */
+   sse_movss(&p->func, dest, arg0);
+}
+
+static void emit_load1f_1( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   sse_movss(&p->func, dest, arg0);
+}
+
+static void (*load[4][4])( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 ) = {
+   { emit_load1f_1, 
+     emit_load1f_1, 
+     emit_load1f_1, 
+     emit_load1f_1 },
+
+   { emit_load2f_1, 
+     emit_load2f_2, 
+     emit_load2f_2, 
+     emit_load2f_2 },
+
+   { emit_load3f_1, 
+     emit_load3f_2, 
+     emit_load3f_3, 
+     emit_load3f_3 },
+
+   { emit_load4f_1, 
+     emit_load4f_2, 
+     emit_load4f_3, 
+     emit_load4f_4 } 
+};
+
+static void emit_load( struct x86_program *p,
+		       struct x86_reg dest,
+		       GLuint sz,
+		       struct x86_reg src,
+		       GLuint src_sz)
+{
+   load[sz-1][src_sz-1](p, dest, src);
+}
+
+static void emit_store4f( struct x86_program *p, 			   
+			  struct x86_reg dest,
+			  struct x86_reg arg0 )
+{
+   sse_movups(&p->func, dest, arg0);
+}
+
+static void emit_store3f( struct x86_program *p, 
+			  struct x86_reg dest,
+			  struct x86_reg arg0 )
+{
+   if (p->outputs_safe) {
+      /* Emit the extra dword anyway.  This may hurt writecombining,
+       * may cause other problems.
+       */
+      sse_movups(&p->func, dest, arg0);
+   }
+   else {
+      /* Alternate strategy - emit two, shuffle, emit one.
+       */
+      sse_movlps(&p->func, dest, arg0);
+      sse_shufps(&p->func, arg0, arg0, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
+      sse_movss(&p->func, x86_make_disp(dest,8), arg0);
+   }
+}
+
+static void emit_store2f( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   sse_movlps(&p->func, dest, arg0);
+}
+
+static void emit_store1f( struct x86_program *p, 
+			  struct x86_reg dest,
+			  struct x86_reg arg0 )
+{
+   sse_movss(&p->func, dest, arg0);
+}
+
+
+static void (*store[4])( struct x86_program *p, 
+			 struct x86_reg dest,
+			 struct x86_reg arg0 ) = 
+{
+   emit_store1f, 
+   emit_store2f, 
+   emit_store3f, 
+   emit_store4f 
+};
+
+static void emit_store( struct x86_program *p,
+			struct x86_reg dest,
+			GLuint sz,
+			struct x86_reg temp )
+
+{
+   store[sz-1](p, dest, temp);
+}
+
+static void emit_pack_store_4ub( struct x86_program *p,
+				 struct x86_reg dest,
+				 struct x86_reg temp )
+{
+   /* Scale by 255.0
+    */
+   sse_mulps(&p->func, temp, p->chan0);
+
+   if (p->have_sse2) {
+      sse2_cvtps2dq(&p->func, temp, temp);
+      sse2_packssdw(&p->func, temp, temp);
+      sse2_packuswb(&p->func, temp, temp);
+      sse_movss(&p->func, dest, temp);
+   }
+   else {
+      struct x86_reg mmx0 = x86_make_reg(file_MMX, 0);
+      struct x86_reg mmx1 = x86_make_reg(file_MMX, 1);
+      sse_cvtps2pi(&p->func, mmx0, temp);
+      sse_movhlps(&p->func, temp, temp);
+      sse_cvtps2pi(&p->func, mmx1, temp);
+      mmx_packssdw(&p->func, mmx0, mmx1);
+      mmx_packuswb(&p->func, mmx0, mmx0);
+      mmx_movd(&p->func, dest, mmx0);
+   }
+}
+
+static GLint get_offset( const void *a, const void *b )
+{
+   return (const char *)b - (const char *)a;
+}
+
+/* Not much happens here.  Eventually use this function to try and
+ * avoid saving/reloading the source pointers each vertex (if some of
+ * them can fit in registers).
+ */
+static void get_src_ptr( struct x86_program *p,
+			 struct x86_reg srcREG,
+			 struct x86_reg vtxREG,
+			 struct tnl_clipspace_attr *a )
+{
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx);
+   struct x86_reg ptr_to_src = x86_make_disp(vtxREG, get_offset(vtx, &a->inputptr));
+
+   /* Load current a[j].inputptr
+    */
+   x86_mov(&p->func, srcREG, ptr_to_src);
+}
+
+static void update_src_ptr( struct x86_program *p,
+			 struct x86_reg srcREG,
+			 struct x86_reg vtxREG,
+			 struct tnl_clipspace_attr *a )
+{
+   if (a->inputstride) {
+      struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx);
+      struct x86_reg ptr_to_src = x86_make_disp(vtxREG, get_offset(vtx, &a->inputptr));
+
+      /* add a[j].inputstride (hardcoded value - could just as easily
+       * pull the stride value from memory each time).
+       */
+      x86_lea(&p->func, srcREG, x86_make_disp(srcREG, a->inputstride));
+      
+      /* save new value of a[j].inputptr 
+       */
+      x86_mov(&p->func, ptr_to_src, srcREG);
+   }
+}
+
+
+/* Lots of hardcoding
+ *
+ * EAX -- pointer to current output vertex
+ * ECX -- pointer to current attribute 
+ * 
+ */
+static GLboolean build_vertex_emit( struct x86_program *p )
+{
+   struct gl_context *ctx = p->ctx;
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
+   GLuint j = 0;
+
+   struct x86_reg vertexEAX = x86_make_reg(file_REG32, reg_AX);
+   struct x86_reg srcECX = x86_make_reg(file_REG32, reg_CX);
+   struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP);
+   struct x86_reg vtxESI = x86_make_reg(file_REG32, reg_SI);
+   struct x86_reg temp = x86_make_reg(file_XMM, 0);
+   struct x86_reg vp0 = x86_make_reg(file_XMM, 1);
+   struct x86_reg vp1 = x86_make_reg(file_XMM, 2);
+   struct x86_reg temp2 = x86_make_reg(file_XMM, 3);
+   GLubyte *fixup, *label;
+
+   /* Push a few regs?
+    */
+   x86_push(&p->func, countEBP);
+   x86_push(&p->func, vtxESI);
+
+
+   /* Get vertex count, compare to zero
+    */
+   x86_xor(&p->func, srcECX, srcECX);
+   x86_mov(&p->func, countEBP, x86_fn_arg(&p->func, 2));
+   x86_cmp(&p->func, countEBP, srcECX);
+   fixup = x86_jcc_forward(&p->func, cc_E);
+
+   /* Initialize destination register. 
+    */
+   x86_mov(&p->func, vertexEAX, x86_fn_arg(&p->func, 3));
+
+   /* Dereference ctx to get tnl, then vtx:
+    */
+   x86_mov(&p->func, vtxESI, x86_fn_arg(&p->func, 1));
+   x86_mov(&p->func, vtxESI, x86_make_disp(vtxESI, get_offset(ctx, &ctx->swtnl_context)));
+   vtxESI = x86_make_disp(vtxESI, get_offset(tnl, &tnl->clipspace));
+
+   
+   /* Possibly load vp0, vp1 for viewport calcs:
+    */
+   if (vtx->need_viewport) {
+      sse_movups(&p->func, vp0, x86_make_disp(vtxESI, get_offset(vtx, &vtx->vp_scale[0])));
+      sse_movups(&p->func, vp1, x86_make_disp(vtxESI, get_offset(vtx, &vtx->vp_xlate[0])));
+   }
+
+   /* always load, needed or not:
+    */
+   sse_movups(&p->func, p->chan0, x86_make_disp(vtxESI, get_offset(vtx, &vtx->chan_scale[0])));
+   sse_movups(&p->func, p->identity, x86_make_disp(vtxESI, get_offset(vtx, &vtx->identity[0])));
+
+   /* Note address for loop jump */
+   label = x86_get_label(&p->func);
+
+   /* Emit code for each of the attributes.  Currently routes
+    * everything through SSE registers, even when it might be more
+    * efficient to stick with regular old x86.  No optimization or
+    * other tricks - enough new ground to cover here just getting
+    * things working.
+    */
+   while (j < vtx->attr_count) {
+      struct tnl_clipspace_attr *a = &vtx->attr[j];
+      struct x86_reg dest = x86_make_disp(vertexEAX, a->vertoffset);
+
+      /* Now, load an XMM reg from src, perhaps transform, then save.
+       * Could be shortcircuited in specific cases:
+       */
+      switch (a->format) {
+      case EMIT_1F:
+	 get_src_ptr(p, srcECX, vtxESI, a);
+	 emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
+	 emit_store(p, dest, 1, temp);
+	 update_src_ptr(p, srcECX, vtxESI, a);
+	 break;
+      case EMIT_2F:
+	 get_src_ptr(p, srcECX, vtxESI, a);
+	 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
+	 emit_store(p, dest, 2, temp);
+	 update_src_ptr(p, srcECX, vtxESI, a);
+	 break;
+      case EMIT_3F:
+	 /* Potentially the worst case - hardcode 2+1 copying:
+	  */
+	 if (0) {
+	    get_src_ptr(p, srcECX, vtxESI, a);
+	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+	    emit_store(p, dest, 3, temp);
+	    update_src_ptr(p, srcECX, vtxESI, a);
+	 }
+	 else {
+	    get_src_ptr(p, srcECX, vtxESI, a);
+	    emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
+	    emit_store(p, dest, 2, temp);
+	    if (a->inputsize > 2) {
+	       emit_load(p, temp, 1, x86_make_disp(srcECX, 8), 1);
+	       emit_store(p, x86_make_disp(dest,8), 1, temp);
+	    }
+	    else {
+	       sse_movss(&p->func, x86_make_disp(dest,8), get_identity(p));
+	    }
+	    update_src_ptr(p, srcECX, vtxESI, a);
+	 }
+	 break;
+      case EMIT_4F:
+	 get_src_ptr(p, srcECX, vtxESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 emit_store(p, dest, 4, temp);
+	 update_src_ptr(p, srcECX, vtxESI, a);
+	 break;
+      case EMIT_2F_VIEWPORT: 
+	 get_src_ptr(p, srcECX, vtxESI, a);
+	 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
+	 sse_mulps(&p->func, temp, vp0);
+	 sse_addps(&p->func, temp, vp1);
+	 emit_store(p, dest, 2, temp);
+	 update_src_ptr(p, srcECX, vtxESI, a);
+	 break;
+      case EMIT_3F_VIEWPORT: 
+	 get_src_ptr(p, srcECX, vtxESI, a);
+	 emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+	 sse_mulps(&p->func, temp, vp0);
+	 sse_addps(&p->func, temp, vp1);
+	 emit_store(p, dest, 3, temp);
+	 update_src_ptr(p, srcECX, vtxESI, a);
+	 break;
+      case EMIT_4F_VIEWPORT: 
+	 get_src_ptr(p, srcECX, vtxESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 sse_mulps(&p->func, temp, vp0);
+	 sse_addps(&p->func, temp, vp1);
+	 emit_store(p, dest, 4, temp);
+	 update_src_ptr(p, srcECX, vtxESI, a);
+	 break;
+      case EMIT_3F_XYW:
+	 get_src_ptr(p, srcECX, vtxESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 sse_shufps(&p->func, temp, temp, SHUF(X,Y,W,Z));
+	 emit_store(p, dest, 3, temp);
+	 update_src_ptr(p, srcECX, vtxESI, a);
+	 break;
+
+      case EMIT_1UB_1F:	 
+	 /* Test for PAD3 + 1UB:
+	  */
+	 if (j > 0 &&
+	     a[-1].vertoffset + a[-1].vertattrsize <= a->vertoffset - 3)
+	 {
+	    get_src_ptr(p, srcECX, vtxESI, a);
+	    emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
+	    sse_shufps(&p->func, temp, temp, SHUF(X,X,X,X));
+	    emit_pack_store_4ub(p, x86_make_disp(dest, -3), temp); /* overkill! */
+	    update_src_ptr(p, srcECX, vtxESI, a);
+	 }
+	 else {
+	    printf("Can't emit 1ub %x %x %d\n", a->vertoffset, a[-1].vertoffset, a[-1].vertattrsize );
+	    return GL_FALSE;
+	 }
+	 break;
+      case EMIT_3UB_3F_RGB:
+      case EMIT_3UB_3F_BGR:
+	 /* Test for 3UB + PAD1:
+	  */
+	 if (j == vtx->attr_count - 1 ||
+	     a[1].vertoffset >= a->vertoffset + 4) {
+	    get_src_ptr(p, srcECX, vtxESI, a);
+	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+	    if (a->format == EMIT_3UB_3F_BGR)
+	       sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
+	    emit_pack_store_4ub(p, dest, temp);
+	    update_src_ptr(p, srcECX, vtxESI, a);
+	 }
+	 /* Test for 3UB + 1UB:
+	  */
+	 else if (j < vtx->attr_count - 1 &&
+		  a[1].format == EMIT_1UB_1F &&
+		  a[1].vertoffset == a->vertoffset + 3) {
+	    get_src_ptr(p, srcECX, vtxESI, a);
+	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+	    update_src_ptr(p, srcECX, vtxESI, a);
+
+	    /* Make room for incoming value:
+	     */
+	    sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
+
+	    get_src_ptr(p, srcECX, vtxESI, &a[1]);
+	    emit_load(p, temp2, 1, x86_deref(srcECX), a[1].inputsize);
+	    sse_movss(&p->func, temp, temp2);
+	    update_src_ptr(p, srcECX, vtxESI, &a[1]);
+
+	    /* Rearrange and possibly do BGR conversion:
+	     */
+	    if (a->format == EMIT_3UB_3F_BGR)
+	       sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
+	    else
+	       sse_shufps(&p->func, temp, temp, SHUF(Y,Z,W,X));
+
+	    emit_pack_store_4ub(p, dest, temp);
+	    j++;		/* NOTE: two attrs consumed */
+	 }
+	 else {
+	    printf("Can't emit 3ub\n");
+	    return GL_FALSE;	/* add this later */
+	 }
+	 break;
+
+      case EMIT_4UB_4F_RGBA:
+	 get_src_ptr(p, srcECX, vtxESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 emit_pack_store_4ub(p, dest, temp);
+	 update_src_ptr(p, srcECX, vtxESI, a);
+	 break;
+      case EMIT_4UB_4F_BGRA:
+	 get_src_ptr(p, srcECX, vtxESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
+	 emit_pack_store_4ub(p, dest, temp);
+	 update_src_ptr(p, srcECX, vtxESI, a);
+	 break;
+      case EMIT_4UB_4F_ARGB:
+	 get_src_ptr(p, srcECX, vtxESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
+	 emit_pack_store_4ub(p, dest, temp);
+	 update_src_ptr(p, srcECX, vtxESI, a);
+	 break;
+      case EMIT_4UB_4F_ABGR:
+	 get_src_ptr(p, srcECX, vtxESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
+	 emit_pack_store_4ub(p, dest, temp);
+	 update_src_ptr(p, srcECX, vtxESI, a);
+	 break;
+      case EMIT_4CHAN_4F_RGBA:
+	 switch (CHAN_TYPE) {
+	 case GL_UNSIGNED_BYTE:
+	    get_src_ptr(p, srcECX, vtxESI, a);
+	    emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	    emit_pack_store_4ub(p, dest, temp);
+	    update_src_ptr(p, srcECX, vtxESI, a);
+	    break;
+	 case GL_FLOAT:
+	    get_src_ptr(p, srcECX, vtxESI, a);
+	    emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	    emit_store(p, dest, 4, temp);
+	    update_src_ptr(p, srcECX, vtxESI, a);
+	    break;
+	 case GL_UNSIGNED_SHORT:
+	 default:
+	    printf("unknown CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE));
+	    return GL_FALSE;
+	 }
+	 break;
+      default:
+	 printf("unknown a[%d].format %d\n", j, a->format);
+	 return GL_FALSE;	/* catch any new opcodes */
+      }
+      
+      /* Increment j by at least 1 - may have been incremented above also:
+       */
+      j++;
+   }
+
+   /* Next vertex:
+    */
+   x86_lea(&p->func, vertexEAX, x86_make_disp(vertexEAX, vtx->vertex_size));
+
+   /* decr count, loop if not zero
+    */
+   x86_dec(&p->func, countEBP);
+   x86_test(&p->func, countEBP, countEBP); 
+   x86_jcc(&p->func, cc_NZ, label);
+
+   /* Exit mmx state?
+    */
+   if (p->func.need_emms)
+      mmx_emms(&p->func);
+
+   /* Land forward jump here:
+    */
+   x86_fixup_fwd_jump(&p->func, fixup);
+
+   /* Pop regs and return
+    */
+   x86_pop(&p->func, x86_get_base_reg(vtxESI));
+   x86_pop(&p->func, countEBP);
+   x86_ret(&p->func);
+
+   assert(!vtx->emit);
+   vtx->emit = (tnl_emit_func)x86_get_func(&p->func);
+
+   assert( (char *) p->func.csr - (char *) p->func.store <= MAX_SSE_CODE_SIZE );
+   return GL_TRUE;
+}
+
+
+
+void _tnl_generate_sse_emit( struct gl_context *ctx )
+{
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
+   struct x86_program p;   
+
+   if (!cpu_has_xmm) {
+      vtx->codegen_emit = NULL;
+      return;
+   }
+
+   memset(&p, 0, sizeof(p));
+
+   p.ctx = ctx;
+   p.inputs_safe = 0;		/* for now */
+   p.outputs_safe = 0;		/* for now */
+   p.have_sse2 = cpu_has_xmm2;
+   p.identity = x86_make_reg(file_XMM, 6);
+   p.chan0 = x86_make_reg(file_XMM, 7);
+
+   if (!x86_init_func_size(&p.func, MAX_SSE_CODE_SIZE)) {
+      vtx->emit = NULL;
+      return;
+   }
+
+   if (build_vertex_emit(&p)) {
+      _tnl_register_fastpath( vtx, GL_TRUE );
+   }
+   else {
+      /* Note the failure so that we don't keep trying to codegen an
+       * impossible state:
+       */
+      _tnl_register_fastpath( vtx, GL_FALSE );
+      x86_release_func(&p.func);
+   }
+}
+
+#else
+
+void _tnl_generate_sse_emit( struct gl_context *ctx )
+{
+   /* Dummy version for when USE_SSE_ASM not defined */
+}
+
+#endif
diff --git a/mesalib/src/mesa/vbo/vbo_attrib_tmp.h b/mesalib/src/mesa/vbo/vbo_attrib_tmp.h
index 65717eb45..e1023834a 100644
--- a/mesalib/src/mesa/vbo/vbo_attrib_tmp.h
+++ b/mesalib/src/mesa/vbo/vbo_attrib_tmp.h
@@ -809,6 +809,12 @@ TAG(Materialfv)(GLenum face, GLenum pname,
                  const GLfloat * params)
 {
    GET_CURRENT_CONTEXT(ctx);
+
+   if (face != GL_FRONT && face != GL_BACK && face != GL_FRONT_AND_BACK) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "glMaterial(invalid face)");
+      return;
+   }
+
    switch (pname) {
    case GL_EMISSION:
       MAT(VBO_ATTRIB_MAT_FRONT_EMISSION, 4, face, params);
@@ -823,7 +829,12 @@ TAG(Materialfv)(GLenum face, GLenum pname,
       MAT(VBO_ATTRIB_MAT_FRONT_SPECULAR, 4, face, params);
       break;
    case GL_SHININESS:
-      MAT(VBO_ATTRIB_MAT_FRONT_SHININESS, 1, face, params);
+      if (*params < 0 || *params > ctx->Const.MaxShininess)
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "glMaterial(invalid shininess: %f out range [0, %f])",
+		     *params, ctx->Const.MaxShininess);
+      else
+         MAT(VBO_ATTRIB_MAT_FRONT_SHININESS, 1, face, params);
       break;
    case GL_COLOR_INDEXES:
       MAT(VBO_ATTRIB_MAT_FRONT_INDEXES, 3, face, params);
diff --git a/mesalib/src/mesa/vbo/vbo_exec_api.c b/mesalib/src/mesa/vbo/vbo_exec_api.c
index cad7c4639..150589bec 100644
--- a/mesalib/src/mesa/vbo/vbo_exec_api.c
+++ b/mesalib/src/mesa/vbo/vbo_exec_api.c
@@ -42,6 +42,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/light.h"
 #include "main/api_arrayelt.h"
 #include "main/api_noop.h"
+#include "main/api_validate.h"
 #include "main/dispatch.h"
 
 #include "vbo_context.h"
@@ -552,6 +553,7 @@ static void GLAPIENTRY vbo_exec_EvalPoint2( GLint i, GLint j )
 #endif /* FEATURE_evaluators */
 
 
+
 /**
  * Called via glBegin.
  */
@@ -563,6 +565,11 @@ static void GLAPIENTRY vbo_exec_Begin( GLenum mode )
       struct vbo_exec_context *exec = &vbo_context(ctx)->exec;
       int i;
 
+      if (!_mesa_valid_prim_mode(ctx, mode)) {
+         _mesa_error(ctx, GL_INVALID_ENUM, "glBegin");
+         return;
+      }
+
       if (ctx->NewState) {
 	 _mesa_update_state( ctx );